# maloja/database.py

from bottle import Bottle, route, get, post, run, template, static_file, request, response, FormsDict
from importlib.machinery import SourceFileLoader
import urllib
import waitress
import os
import datetime
from cleanup import *
from utilities import *
import sys

# Bottle application that all HTTP routes in this module are registered on
dbserver = Bottle()


# In-memory database. Entries reference each other by their index in these lists.
SCROBBLES = []	# Format: tuple(track_ref,timestamp,saved)
ARTISTS = []	# Format: artist
TRACKS = []	# Format: tuple(frozenset(artist_ref,...),title)

# every scrobble timestamp currently in use (createScrobble / readScrobble keep them unique)
timestamps = set()

# agents from the cleanup module: cla cleans incoming artist / title input, coa resolves which artist gets credited
cla = CleanerAgent()
coa = CollectorAgent()
# rows of clients/authenticated_machines.tsv, filled by loadAPIkeys()
clients = []

# timestamp of the last sync() (set to the start time by runserver)
lastsync = 0

# rulestate that the entire current database was built with, or False if the database was built from inconsistent scrobble files
db_rulestate = False


### symmetric keys are fine for now since we hopefully use HTTPS
def loadAPIkeys():
	"""(Re)load the authenticated clients from disk into the module-level `clients` list."""
	global clients
	keyfile = "clients/authenticated_machines.tsv"
	# presumably makes sure the file exists before it is parsed - see utilities.createTSV
	createTSV(keyfile)
	clients = parseTSV(keyfile,"string","string")

def checkAPIkey(k):
	"""Return True if `k` is the key (first column) of any row in `clients`."""
	validkeys = []
	# every row has to consist of exactly two columns: key and description
	for [key,description] in clients:
		validkeys.append(key)
	return (k in validkeys)


####
## Getting dict representations of database objects
####

def getScrobbleObject(o):
	"""Build the dict representation of a scrobble tuple (track_ref,timestamp,saved).

	The track reference is resolved through TRACKS; the saved flag is not part of the result.
	"""
	trackobj = getTrackObject(TRACKS[o[0]])
	return {
		"artists":trackobj["artists"],
		"title":trackobj["title"],
		"time":o[1]
	}

def getArtistObject(o):
	"""Return the representation of an artist entry; artists are returned unchanged."""
	return o

def getTrackObject(o):
	"""Build the dict representation of a track tuple (frozenset(artist_ref,...),title).

	Each artist reference is resolved through ARTISTS.
	"""
	artistobjects = []
	for ref in o[0]:
		artistobjects.append(getArtistObject(ARTISTS[ref]))
	return {"artists":artistobjects,"title":o[1]}


####
## Creating or finding existing database entries
####


def createScrobble(artists,title,time):
	"""Add a new, not yet saved scrobble to the in-memory database.

	If the timestamp is already taken, the next free second is used instead.
	"""
	free = time
	while free in timestamps:
		free = free + 1
	timestamps.add(free)
	trackid = getTrackID(artists,title)
	# saved=False: sync() still has to write this scrobble to disk
	SCROBBLES.append((trackid,free,False))


def readScrobble(artists,title,time):
	"""Add a scrobble that is already saved (flag True) to the in-memory database.

	If the timestamp is already taken, the next free second is used instead.
	"""
	free = time
	while free in timestamps:
		free = free + 1
	timestamps.add(free)
	trackid = getTrackID(artists,title)
	# saved=True: sync() skips this scrobble
	SCROBBLES.append((trackid,free,True))


def getArtistID(name):
	"""Return the index of an artist in ARTISTS, creating the entry if necessary.

	An exact match is preferred; otherwise the first entry that matches
	case-insensitively is used. If neither exists, `name` is appended to
	ARTISTS and its new index is returned.
	"""
	try:
		return ARTISTS.index(name)
	except ValueError:
		# no exact match, try the case-insensitive one
		pass

	namelower = name.lower()
	for idx, existing in enumerate(ARTISTS):
		if existing.lower() == namelower:
			return idx

	ARTISTS.append(name)
	return len(ARTISTS) - 1

def getTrackID(artists,title):
	"""Return the index of a track in TRACKS, creating the entry if necessary.

	`artists` is an iterable of artist names, each resolved (or created) via
	getArtistID. A track matches if its artist set is equal and its title is
	equal; an exact title match is preferred over a case-insensitive one. If
	nothing matches, the track is appended to TRACKS and its new index is
	returned.
	"""
	artistset = frozenset(getArtistID(name=a) for a in artists)
	obj = (artistset,title)

	try:
		return TRACKS.index(obj)
	except ValueError:
		# no exact match, try the case-insensitive one
		pass

	titlelower = title.lower()
	for idx, (trackartists,tracktitle) in enumerate(TRACKS):
		if trackartists == artistset and tracktitle.lower() == titlelower:
			return idx

	TRACKS.append(obj)
	return len(TRACKS) - 1


####
## HTTP requests
####

@dbserver.route("/test")
def test_server():
	"""Health check. Optionally validates the API key passed as query parameter `key`.

	Responds 403 if a key was supplied and is not known, otherwise 204 if the
	database was built with a consistent rulestate and 205 if it was not.
	"""
	response.set_header("Access-Control-Allow-Origin","*")
	apikey = request.query.get("key")

	# a missing key is fine, only a supplied wrong key is rejected
	keyrejected = (apikey is not None) and not checkAPIkey(apikey)
	if keyrejected:
		response.status = 403
		return "Wrong API key"

	response.status = 204 if db_rulestate else 205
	return

@dbserver.route("/scrobbles")
def get_scrobbles():
	"""Return the scrobbles matching the query parameters artist, track, since and to, newest first."""
	keys = FormsDict.decode(request.query)
	filters = {name:keys.get(name) for name in ("artist","track","since","to")}
	scrobbles = db_query(**filters)
	scrobbles.reverse()

	# wrapped in a dict because Bottle only turns dict return values into JSON, not lists
	return {"list":scrobbles}

@dbserver.route("/tracks")
def get_tracks():
	"""Return all tracks, or only those of the artist given as query parameter `artist`.

	The artist name has to match an entry of ARTISTS exactly; an unknown
	artist yields an empty list.
	"""
	keys = FormsDict.decode(request.query)
	artist = keys.get("artist")

	# no filter requested: every track qualifies
	if artist is None:
		return {"list":[getTrackObject(t) for t in TRACKS]}

	try:
		artistid = ARTISTS.index(artist)
	except ValueError:
		# an artist we don't know can't have any tracks
		return {"list":[]}

	# comparing the artist id against the raw track tuples avoids building every track object first
	return {"list":[getTrackObject(t) for t in TRACKS if artistid in t[0]]}

@dbserver.route("/artists")
def get_artists():
	"""Return all known artists."""
	return dict(list=ARTISTS)

@dbserver.route("/charts/artists")
def get_charts_artists():
	"""Return the artist charts for the time range given by the query parameters since and to."""
	timerange = {"since":request.query.get("since"),"to":request.query.get("to")}
	ranking = db_aggregate(by="ARTIST",**timerange)
	return {"list":ranking}

@dbserver.route("/charts/tracks")
def get_charts_tracks():
	"""Return the track charts for the time range given by the query parameters since and to."""
	timerange = {"since":request.query.get("since"),"to":request.query.get("to")}
	ranking = db_aggregate(by="TRACK",**timerange)
	return {"list":ranking}

@dbserver.route("/charts")
def get_charts():
	"""Return the number of scrobbles in the time range given by the query parameters since and to."""
	timerange = {"since":request.query.get("since"),"to":request.query.get("to")}
	amount = db_aggregate(**timerange)
	return {"number":amount}

@dbserver.route("/pulse")
def get_pulse():
	"""Return the number of scrobbles for a series of sliding time ranges.

	Query parameters:
	since, to -- overall time range (see getTimestamps)
	step -- unit the window moves by (year, month, week, day), optionally with a multiplier like "month-3"; default "month"
	trail -- length of each window in steps; default 3

	Responds 400 if the step multiplier is smaller than 1.
	"""
	since = request.query.get("since")
	to = request.query.get("to")
	(ts_start,ts_end) = getTimestamps(since,to)
	step = request.query.get("step","month")
	trail = int(request.query.get("trail",3))

	[step,stepn] = (step.split("-") + [1])[:2]	# makes the multiplier 1 if not assigned
	stepn = int(stepn)
	if stepn < 1:
		# the window would never move forward and the loop below would never end
		response.status = 400
		return "Step multiplier must be at least 1"

	d_start = getStartOf(ts_start,step)
	d_end = getStartOf(ts_end,step)

	d_start = getNext(d_start,step,stepn)			# first range should end right after the first active scrobbling week / month / whatever relevant step
	d_start = getNext(d_start,step,stepn * trail * -1)	# go one range back to begin

	results = []

	d_current = d_start
	while True:
		d_current_end = getNext(d_current,step,stepn * trail)
		res = db_aggregate(since=d_current,to=d_current_end)
		results.append({"from":d_current,"to":d_current_end,"scrobbles":res})
		d_current = getNext(d_current,step,stepn)
		# the range that reaches past the end of the requested time is the last one
		if isPast(d_current_end,d_end):
			break

	return {"list":results}


@dbserver.route("/top/artists")
def get_top_artists():
	"""Return the top artist for a series of sliding time ranges.

	Query parameters:
	since, to -- overall time range (see getTimestamps)
	step -- unit the window moves by (year, month, week, day), optionally with a multiplier like "month-3"; default "month"
	trail -- length of each window in steps; default 3

	Ranges without any scrobbles are reported with artist None and 0 scrobbles.
	Responds 400 if the step multiplier is smaller than 1.
	"""
	since = request.query.get("since")
	to = request.query.get("to")
	(ts_start,ts_end) = getTimestamps(since,to)
	step = request.query.get("step","month")
	trail = int(request.query.get("trail",3))

	[step,stepn] = (step.split("-") + [1])[:2]	# makes the multiplier 1 if not assigned
	stepn = int(stepn)
	if stepn < 1:
		# the window would never move forward and the loop below would never end
		response.status = 400
		return "Step multiplier must be at least 1"

	d_start = getStartOf(ts_start,step)
	d_end = getStartOf(ts_end,step)

	d_start = getNext(d_start,step,stepn)			# first range should end right after the first active scrobbling week / month / whatever relevant step
	d_start = getNext(d_start,step,stepn * trail * -1)	# go one range back to begin

	results = []

	d_current = d_start
	while True:
		d_current_end = getNext(d_current,step,stepn * trail)
		ranking = db_aggregate(since=d_current,to=d_current_end,by="ARTIST")
		if ranking:
			top = ranking[0]
			results.append({"from":d_current,"to":d_current_end,"artist":top["artist"],"scrobbles":top["scrobbles"]})
		else:
			# nothing was scrobbled in this range
			results.append({"from":d_current,"to":d_current_end,"artist":None,"scrobbles":0})
		d_current = getNext(d_current,step,stepn)
		# the range that reaches past the end of the requested time is the last one
		if isPast(d_current_end,d_end):
			break

	return {"list":results}

@dbserver.route("/top/tracks")
def get_top_tracks():
	"""Return the top track for a series of sliding time ranges.

	Query parameters:
	since, to -- overall time range (see getTimestamps)
	step -- unit the window moves by (year, month, week, day), optionally with a multiplier like "month-3"; default "month"
	trail -- length of each window in steps; default 3

	Ranges without any scrobbles are reported with track None and 0 scrobbles.
	Responds 400 if the step multiplier is smaller than 1.
	"""
	since = request.query.get("since")
	to = request.query.get("to")
	(ts_start,ts_end) = getTimestamps(since,to)
	step = request.query.get("step","month")
	trail = int(request.query.get("trail",3))

	[step,stepn] = (step.split("-") + [1])[:2]	# makes the multiplier 1 if not assigned
	stepn = int(stepn)
	if stepn < 1:
		# the window would never move forward and the loop below would never end
		response.status = 400
		return "Step multiplier must be at least 1"

	d_start = getStartOf(ts_start,step)
	d_end = getStartOf(ts_end,step)

	d_start = getNext(d_start,step,stepn)			# first range should end right after the first active scrobbling week / month / whatever relevant step
	d_start = getNext(d_start,step,stepn * trail * -1)	# go one range back to begin

	results = []

	d_current = d_start
	while True:
		d_current_end = getNext(d_current,step,stepn * trail)
		ranking = db_aggregate(since=d_current,to=d_current_end,by="TRACK")
		if ranking:
			top = ranking[0]
			results.append({"from":d_current,"to":d_current_end,"track":top["track"],"scrobbles":top["scrobbles"]})
		else:
			# nothing was scrobbled in this range
			results.append({"from":d_current,"to":d_current_end,"track":None,"scrobbles":0})
		d_current = getNext(d_current,step,stepn)
		# the range that reaches past the end of the requested time is the last one
		if isPast(d_current_end,d_end):
			break

	return {"list":results}


def getStartOf(timestamp,unit):
	"""Return [year,month,day] (UTC) of the first day of the `unit` that contains `timestamp`.

	`unit` is one of "year", "month", "day" and "week"; weeks start on Sunday.
	Any other unit returns None.
	"""
	# timezone-aware replacement for the deprecated utcfromtimestamp, yields the same calendar values
	date = datetime.datetime.fromtimestamp(timestamp,tz=datetime.timezone.utc)
	if unit == "year":
		return [date.year,1,1]
	elif unit == "month":
		return [date.year,date.month,1]
	elif unit == "day":
		return [date.year,date.month,date.day]
	elif unit == "week":
		# weekday() counts from Monday = 0, so this is the number of days since the last Sunday
		change = (date.weekday() + 1) % 7
		newdate = date - datetime.timedelta(days=change)
		return [newdate.year,newdate.month,newdate.day]
	return None

def getNext(time,unit,step=1):
	"""Move the date `time` ([year,month,day]) by `step` units and return the result as a new list.

	`unit` is one of "year", "month", "day" and "week", `step` may be negative.
	Years and months only change their own field, the day is kept as it is.
	Any other unit returns None.
	"""
	if unit == "week":
		return getNext(time,"day",step * 7)
	if unit == "day":
		moved = datetime.datetime(time[0],time[1],time[2]) + datetime.timedelta(days=step)
		return [moved.year,moved.month,moved.day]
	if unit == "month":
		# count months from zero so that floor division carries over into the year in both directions
		(yearshift,monthindex) = divmod(time[1] - 1 + step,12)
		return [time[0] + yearshift,monthindex + 1,time[2]]
	if unit == "year":
		return [time[0] + step,time[1],time[2]]

@dbserver.route("/artistinfo")
def artistInfo():
	"""Return scrobble count and chart position of the artist given as query parameter `artist`.

	If the artist appears in the charts, the artists associated with them are
	included as "associated". If not, the position of the artist that is
	credited instead is returned and that artist is named as "replace".
	"""
	keys = FormsDict.decode(request.query)
	artist = keys.get("artist")

	charts = db_aggregate(by="ARTIST")
	scrobbles = len(db_query(artist=artist)) #we cant take the scrobble number from the charts because that includes all countas scrobbles

	ranked = [e["artist"] for e in charts]
	if artist in ranked:
		others = coa.getAllAssociated(artist)
		return {"scrobbles":scrobbles,"position":ranked.index(artist) + 1,"associated":others}

	# if the artist isnt in the charts, they are not being credited and we need to show information about the credited one
	credited = coa.getCredited(artist)
	return {"replace":credited,"scrobbles":scrobbles,"position":ranked.index(credited) + 1}

def isPast(date,limit):
	"""Return True if `date` lies after `limit`; both are given as [year,month,day]."""
	# year and month decide as soon as they differ
	for pos in (0,1):
		if date[pos] != limit[pos]:
			return date[pos] > limit[pos]
	# same year and month, so the day decides
	return date[2] > limit[2]

@dbserver.get("/newscrobble")
def pseudo_post_scrobble():
	"""Create a scrobble from the query parameters artist, title and (optionally) time.

	A missing or non-numeric time falls back to the current time. Triggers a
	sync if the scrobble is more than an hour newer than the last sync.
	"""
	# NOTE(review): unlike the POST variant of this route, no API key is checked here - confirm this is intended
	keys = FormsDict.decode(request.query) # The Dal★Shabet handler
	artists = keys.get("artist")
	title = keys.get("title")
	try:
		time = int(keys.get("time"))
	except (TypeError,ValueError):
		# TypeError: parameter missing (None), ValueError: not a number
		time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	(artists,title) = cla.fullclean(artists,title)

	## this is necessary for localhost testing
	response.set_header("Access-Control-Allow-Origin","*")

	createScrobble(artists,title,time)

	if (time - lastsync) > 3600:
		sync()

	return ""

@dbserver.post("/newscrobble")
def post_scrobble():
	"""Create a scrobble from the form fields artist, title and (optionally) time.

	Requires a valid API key in the form field key, otherwise responds 403.
	A missing or non-numeric time falls back to the current time. Triggers a
	sync if the scrobble is more than an hour newer than the last sync.
	"""
	keys = FormsDict.decode(request.forms) # The Dal★Shabet handler
	artists = keys.get("artist")
	title = keys.get("title")
	apikey = keys.get("key")
	if not (checkAPIkey(apikey)):
		response.status = 403
		return ""

	try:
		time = int(keys.get("time"))
	except (TypeError,ValueError):
		# TypeError: field missing (None), ValueError: not a number
		time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	(artists,title) = cla.fullclean(artists,title)

	## this is necessary for localhost testing
	response.set_header("Access-Control-Allow-Origin","*")

	createScrobble(artists,title,time)

	if (time - lastsync) > 3600:
		sync()

	return ""

@dbserver.route("/sync")
def abouttoshutdown():
	"""Write all unsaved scrobbles to disk on request (e.g. before the server is shut down)."""
	sync()
	# the server is deliberately not stopped from here (any more)
	#sys.exit()

@dbserver.post("/newrule")
def newrule():
	"""Append a rule submitted as form data to rules/webmade.tsv and mark the database as inconsistent."""
	global db_rulestate
	keys = FormsDict.decode(request.forms)
	# NOTE(review): iterating the form yields its field names, so those (not the values) become the columns - confirm this is what the client sends
	columns = list(keys)
	addEntry("rules/webmade.tsv",columns)
	# the existing database was not built with the new rule
	db_rulestate = False

@dbserver.route("/issues")
def issues():
	"""Heuristically scan ARTISTS for entries that look like database problems.

	Returns a dict with
	duplicates -- pairs of artist names that probably mean the same artist
	combined -- (name,parts) where the name appears to be made up of several other known artists
	newartists -- (rest,name,parts) where `rest` is what remains of the name after removing known artists and might be an artist we don't know yet
	inconsistent -- True if the database was not built with one consistent rulestate
	"""
	combined = []
	duplicates = []
	newartists = []
	inconsistent = not db_rulestate


	# only referenced by the commented-out experiments at the end of this function
	import itertools
	import difflib

	# longest names first; the outer loop walks the same list from the shortest name up
	sortedartists = ARTISTS.copy()
	sortedartists.sort(key=len,reverse=True)
	reversesortedartists = sortedartists.copy()
	reversesortedartists.reverse()
	for a in reversesortedartists:

		# artists confirmed as real by the cleaner are never reported as combined / new
		nochange = cla.confirmedReal(a)

		st = a		# what is left of the name after removing known artists
		lis = []	# known artists found inside the name
		reachedmyself = False
		for ar in sortedartists:
			# skip everything up to and including a itself, so ar only takes the artists sorted after a (names of equal or shorter length)
			if (ar != a) and not reachedmyself:
				continue
			elif not reachedmyself:
				reachedmyself = True
				continue

			# same name apart from capitalization or a leading article
			if (ar.lower() == a.lower()) or ("the " + ar.lower() == a.lower()) or ("a " + ar.lower() == a.lower()):
				duplicates.append((ar,a))
				break

			# ar appears in the remaining string next to a space: take it out
			if (ar + " " in st) or (" " + ar in st):
				lis.append(ar)
				st = st.replace(ar,"").strip()
			# the remaining string is exactly ar: the name consists of known artists only
			elif (ar == st):
				lis.append(ar)
				st = ""
				if not nochange:
					combined.append((a,lis))
				break

			# ar makes up more than half of the remaining string
			elif (ar in st) and len(ar)*2 > len(st):
				duplicates.append((a,ar))

		# remove connecting words before judging the rest
		st = st.replace("&","").replace("and","").replace("with","").strip()
		if st != "" and st != a:
			if len(st) < 5 and len(lis) == 1:
				#check if we havent just randomly found the string in another word
				#if (" " + st + " ") in lis[0] or (lis[0].endswith(" " + st)) or (lis[0].startswith(st + " ")):
				duplicates.append((a,lis[0]))
			elif len(st) < 5 and len(lis) > 1 and not nochange:
				combined.append((a,lis))
			elif len(st) >= 5 and not nochange:
				#check if we havent just randomly found the string in another word
				if (" " + st + " ") in a or (a.endswith(" " + st)) or (a.startswith(st + " ")):
					newartists.append((st,a,lis))

	# earlier, disabled approaches based on comparing combinations of artists:

	#for c in itertools.combinations(ARTISTS,3):
	#	l = list(c)
	#	print(l)
	#	l.sort(key=len,reverse=True)
	#	[full,a1,a2] = l
	#	if (a1 + " " + a2 in full) or (a2 + " " + a1 in full):
	#		combined.append((full,a1,a2))


	#for c in itertools.combinations(ARTISTS,2):
	#	if
	#
	#	if (c[0].lower == c[1].lower):
	#		duplicates.append((c[0],c[1]))


	#	elif (c[0] + " " in c[1]) or (" " + c[0] in c[1]) or (c[1] + " " in c[0]) or (" " + c[1] in c[0]):
	#		if (c[0] in c[1]):
	#			full, part = c[1],c[0]
	#			rest = c[1].replace(c[0],"").strip()
	#		else:
	#			full, part = c[0],c[1]
	#			rest = c[0].replace(c[1],"").strip()
	#		if rest in ARTISTS and full not in [c[0] for c in combined]:
	#			combined.append((full,part,rest))

	#	elif (c[0] in c[1]) or (c[1] in c[0]):
	#		duplicates.append((c[0],c[1]))


	return {"duplicates":duplicates,"combined":combined,"newartists":newartists,"inconsistent":inconsistent}

@dbserver.post("/rebuild")
def rebuild():
	"""Save the database, run fixexisting.py and build the database again with freshly created agents."""
	global db_rulestate, cla, coa

	db_rulestate = False
	# unsaved scrobbles have to be on disk before the external script runs
	sync()
	os.system("python3 fixexisting.py")

	# new agents, presumably so that they load the current rules - TODO confirm
	cla = CleanerAgent()
	coa = CollectorAgent()
	build_db()


####
## Server operation
####


# Starts the server
def runserver(PORT):
	"""Build the database, load the API keys and serve the database on all interfaces on `PORT`."""
	global lastsync

	# nothing is unsaved at startup, so startup counts as the last sync
	now = datetime.datetime.now(tz=datetime.timezone.utc)
	lastsync = int(now.timestamp())

	build_db()
	loadAPIkeys()

	run(dbserver, host='0.0.0.0', port=PORT, server='waitress')


def build_db():
	"""(Re)build the in-memory database from the files in the scrobbles folder.

	Discards SCROBBLES, ARTISTS, TRACKS and the set of used timestamps, reads
	every stored scrobble again, sorts the scrobbles by time, passes the new
	artist list to the collector agent and determines the rulestate the files
	were written with.
	"""
	global SCROBBLES, ARTISTS, TRACKS, db_rulestate

	SCROBBLES = []
	ARTISTS = []
	TRACKS = []
	# without this, a rebuild would find every timestamp already taken and readScrobble would move each scrobble to a later second
	timestamps.clear()

	for sc in parseAllTSV("scrobbles","int","string","string"):
		# columns: time, artists (joined with ␟), title
		(time,artistfield,title) = (sc[0],sc[1],sc[2])
		readScrobble(artistfield.split("␟"),title,time)

	SCROBBLES.sort(key = lambda tup: tup[1])

	coa.updateIDs(ARTISTS)

	db_rulestate = consistentRulestate("scrobbles",cla.checksums)


# Saves all cached entries to disk
def sync():
	"""Write all scrobbles that are not saved yet to their month files and mark them as saved.

	Scrobbles are appended to scrobbles/<year>_<month>.tsv, after which the
	checksums of the current rules are combined into each touched file.
	Updates `lastsync`.
	"""
	global lastsync

	# entries grouped by month file so that every file is only touched once
	bymonth = {}

	for idx, scrobble in enumerate(SCROBBLES):
		if scrobble[2]:
			continue

		t = getScrobbleObject(scrobble)

		# deterministic artist order, so that a diff after applying new rules only shows real changes
		artistss = "␟".join(sorted(t["artists"]))
		entry = [str(t["time"]),artistss,t["title"]]

		# NOTE(review): date.fromtimestamp uses the local timezone while the rest of this module works in UTC
		day = datetime.date.fromtimestamp(t["time"])
		monthcode = str(day.year) + "_" + str(day.month)
		if monthcode not in bymonth:
			bymonth[monthcode] = []
		bymonth[monthcode].append(entry)

		SCROBBLES[idx] = (scrobble[0],scrobble[1],True)

	for monthcode, monthentries in bymonth.items():
		filename = "scrobbles/" + monthcode + ".tsv"
		addEntries(filename,monthentries)
		combineChecksums(filename,cla.checksums)

	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	print("Database saved to disk.")


####
## Database queries
####


# Queries the database
def db_query(artist=None,track=None,since=None,to=None):
	"""Return the scrobble objects that match all given filters, in database order.

	artist -- index into ARTISTS or exact artist name
	track -- index into TRACKS or exact title (the first track with that title is used)
	since, to -- time range as accepted by getTimestamps; `since` is inclusive, `to` exclusive

	A name or title that is not in the database yields an empty list.
	"""
	(since, to) = getTimestamps(since,to)

	# this is not meant as a search function. we *can* query the db with a string, but it only works if it matches exactly (and title string simply picks the first track with that name)
	if isinstance(artist, str):
		try:
			artist = ARTISTS.index(artist)
		except ValueError:
			return []
	if isinstance(track, str):
		# TRACKS holds (artists,title) tuples, so the title has to be compared with the second element
		title = track
		track = next((idx for idx, t in enumerate(TRACKS) if t[1] == title),None)
		if track is None:
			return []

	# the lower bound is inclusive: the default for since is the timestamp of the very first scrobble, which must not be dropped
	# pointless to check for artist when track is checked because every track has a fixed set of artists, but it's more elegant this way
	return [
		getScrobbleObject(s) for s in SCROBBLES
		if (track is None or s[0] == track)
		and (artist is None or artist in TRACKS[s[0]][0])
		and (since <= s[1] < to)
	]
	# pointless to check for artist when track is checked because every track has a fixed set of artists, but it's more elegant this way


	#thingsweneed = ["artists","title","time"]
	#return [{key:t[key] for key in thingsweneed} for t in DATABASE if (artist in t["artists"] or artist==None) and (t["title"]==title or title==None) and (since < t["time"] < to)]


# Queries that... well... aggregate
def db_aggregate(by=None,since=None,to=None):
	"""Count the scrobbles in a time range, optionally grouped.

	by -- "ARTIST" for a list of {"artist","scrobbles"}, "TRACK" for a list of {"track","scrobbles"}, both sorted by scrobbles descending; anything else returns the plain number of scrobbles
	since, to -- time range as accepted by getTimestamps; `since` is inclusive, `to` exclusive

	For "ARTIST", every scrobble counts for each artist returned by the
	collector agent's getCreditedList for the track's artists.
	"""
	(since, to) = getTimestamps(since,to)

	# the lower bound is inclusive: the default for since is the timestamp of the very first scrobble, which must not be dropped
	inrange = [scr for scr in SCROBBLES if since <= scr[1] < to]

	if (by=="ARTIST"):
		charts = {}
		for s in inrange:
			artists = TRACKS[s[0]][0]
			for a in coa.getCreditedList(artists):
				charts[a] = charts.get(a,0) + 1

		ls = [{"artist":getArtistObject(ARTISTS[a]),"scrobbles":charts[a]} for a in charts]
		return sorted(ls,key=lambda k:k["scrobbles"], reverse=True)

	elif (by=="TRACK"):
		charts = {}
		for s in inrange:
			track = s[0]
			charts[track] = charts.get(track,0) + 1

		ls = [{"track":getTrackObject(TRACKS[t]),"scrobbles":charts[t]} for t in charts]
		return sorted(ls,key=lambda k:k["scrobbles"], reverse=True)

	else:
		return len(inrange)


# Search for strings
def db_search(query,type=None):
	"""Return all entries whose name contains `query`, ignoring case.

	type -- "ARTIST" searches ARTISTS and returns artist names, "TRACK" searches
	the titles in TRACKS and returns the track tuples. Any other value returns
	an empty list.
	"""
	needle = query.lower()

	if type=="ARTIST":
		return [a for a in ARTISTS if needle in a.lower()]

	if type=="TRACK":
		return [t for t in TRACKS if needle in t[1].lower()]

	# unknown or missing type: nothing to search in
	return []


####
## Useful functions
####


# Converts a single time input to a UTC timestamp; values that are neither str nor list are returned unchanged.
def _parseTimeInput(value):
	if isinstance(value, str) and value.lower() == "today":
		tod = datetime.date.today()
		value = [tod.year,tod.month,tod.day]

	if isinstance(value, str):
		value = [int(x) for x in value.split("/")]

	# this step is done if either the input is a list or the first step was done (which creates a list)
	if isinstance(value, list):
		# fields that were not given default to the start of the year / month / day
		date = [1970,1,1,0,0]
		date[:len(value)] = value
		value = int(datetime.datetime(date[0],date[1],date[2],date[3],date[4],tzinfo=datetime.timezone.utc).timestamp())

	return value


# Takes user inputs like YYYY/MM and returns the timestamps. Returns timestamp if timestamp was already given.
def getTimestamps(f,t):
	"""Convert the start `f` and end `t` of a time range to a tuple of UTC timestamps.

	Each of them may be a timestamp, a string like "YYYY", "YYYY/MM" or
	"YYYY/MM/DD", the string "today", a list like [year,month,day] or None.
	A start of None becomes the earliest known scrobble timestamp (0 if there
	are no scrobbles), an end of None becomes the current time.
	"""
	f = _parseTimeInput(f)
	t = _parseTimeInput(t)

	if f is None:
		# an empty database must not make every query fail
		f = min(timestamps,default=0)
	if t is None:
		t = datetime.datetime.now(tz=datetime.timezone.utc).timestamp()

	return (f,t)


def getArtistId(nameorid):
	"""Return the index of an existing artist, or -1 if the artist is unknown.

	An int is taken to already be an index and is returned as it is. Unlike
	getArtistID, this never creates an entry and only matches names exactly.
	"""
	if isinstance(nameorid,int):
		return nameorid
	try:
		return ARTISTS.index(nameorid)
	except ValueError:
		return -1