maloja/database.py

from bottle import Bottle, route, get, post, run, template, static_file, request, response, FormsDict
from importlib.machinery import SourceFileLoader
import urllib
import waitress
import os
import datetime
from cleanup import *
from utilities import *
import sys

dbserver = Bottle()


SCROBBLES = []	# Format: tuple(track_ref,timestamp,saved)
ARTISTS = []	# Format: artist
TRACKS = []	# Format: tuple(frozenset(artist_ref,...),title)

timestamps = set()

cla = CleanerAgent()
coa = CollectorAgent()
clients = []

lastsync = 0

# rulestate that the entire current database was built with, or False if the database was built from inconsistent scrobble files
db_rulestate = False


### symmetric keys are fine for now since we hopefully use HTTPS
def loadAPIkeys():
	global clients
	createTSV("clients/authenticated_machines.tsv")
	clients = parseTSV("clients/authenticated_machines.tsv","string","string")

def checkAPIkey(k):
	return (k in [k for [k,d] in clients])


####
## Getting dict representations of database objects
####

def getScrobbleObject(o):
	track = getTrackObject(TRACKS[o[0]])
	return {"artists":track["artists"],"title":track["title"],"time":o[1]}

def getArtistObject(o):
	return o

def getTrackObject(o):
	artists = [getArtistObject(ARTISTS[a]) for a in o[0]]
	return {"artists":artists,"title":o[1]}


####
## Creating or finding existing database entries
####


def createScrobble(artists,title,time):
	while (time in timestamps):
		time += 1
	timestamps.add(time)
	i = getTrackID(artists,title)
	obj = (i,time,False)
	SCROBBLES.append(obj)


def readScrobble(artists,title,time):
	while (time in timestamps):
		time += 1
	timestamps.add(time)
	i = getTrackID(artists,title)
	obj = (i,time,True)
	SCROBBLES.append(obj)


def getArtistID(name):

	obj = name
	objlower = name.lower()

	try:
		return ARTISTS.index(obj)
	except:
		pass
	try:
		return [a.lower() for a in ARTISTS].index(objlower)
	except:
		i = len(ARTISTS)
		ARTISTS.append(obj)
		return i

def getTrackID(artists,title):
	artistset = set()
	for a in artists:
		artistset.add(getArtistID(name=a))
	obj = (frozenset(artistset),title)
	objlower = (frozenset(artistset),title.lower())

	try:
		return TRACKS.index(obj)
	except:
		pass
	try:
		# not the best performance
		return [(t[0],t[1].lower()) for t in TRACKS].index(objlower)
	except:
		i = len(TRACKS)
		TRACKS.append(obj)
		return i


####
## HTTP requests
####

@dbserver.route("/test")
def test_server():
	apikey = request.query.get("key")
	response.set_header("Access-Control-Allow-Origin","*")
	if apikey is not None and not (checkAPIkey(apikey)):
		response.status = 403
		return "Wrong API key"

	elif db_rulestate:
		response.status = 204
		return
	else:
		response.status = 205
		return

@dbserver.route("/scrobbles")
def get_scrobbles():
	keys = FormsDict.decode(request.query)

	r = db_query(artists=keys.getall("artist"),title=keys.get("title"),since=keys.get("since"),to=keys.get("to"),associated=(keys.get("associated")!=None))
	r.reverse()

	return {"list":r} ##json can't be a list apparently???

@dbserver.route("/tracks")
def get_tracks():
	keys = FormsDict.decode(request.query)
	artist = keys.get("artist")

	if artist is not None:
		artistid = ARTISTS.index(artist)

	# Option 1
	ls = [getTrackObject(t) for t in TRACKS if (artistid in t[0]) or (artistid==None)]

	# Option 2 is a bit more elegant but much slower
	#tracklist = [getTrackObject(t) for t in TRACKS]
	#ls = [t for t in tracklist if (artist in t["artists"]) or (artist==None)]

	return {"list":ls}

@dbserver.route("/artists")
def get_artists():

	return {"list":ARTISTS}

@dbserver.route("/charts/artists")
def get_charts_artists():
	since = request.query.get("since")
	to = request.query.get("to")

	return {"list":db_aggregate(by="ARTIST",since=since,to=to)}

@dbserver.route("/charts/tracks")
def get_charts_tracks():
	keys = FormsDict.decode(request.query)
	since = keys.get("since")
	to = keys.get("to")
	artist = keys.get("artist")

	return {"list":db_aggregate(by="TRACK",since=since,to=to,artist=artist)}

@dbserver.route("/charts")
def get_charts():
	since = request.query.get("since")
	to = request.query.get("to")

	return {"number":db_aggregate(since=since,to=to)}

@dbserver.route("/pulse")
def get_pulse():
	since = request.query.get("since")
	to = request.query.get("to")
	(ts_start,ts_end) = getTimestamps(since,to)
	step = request.query.get("step","month")
	trail = int(request.query.get("trail",3))

	[step,stepn] = (step.split("-") + [1])[:2]	# makes the multiplier 1 if not assigned
	stepn = int(stepn)

	d_start = getStartOf(ts_start,step)
	d_end = getStartOf(ts_end,step)

	d_start = getNext(d_start,step,stepn)			# first range should end right after the first active scrobbling week / month / whatever relevant step
	d_start = getNext(d_start,step,stepn * trail * -1)	# go one range back to begin

	results = []

	d_current = d_start
	while True:
		d_current_end = getNext(d_current,step,stepn * trail)
		#print("Checking from " + str(d_current[0]) + "-" + str(d_current[1]) + "-" + str(d_current[2]) + " to " + str(d_current_end[0]) + "-" + str(d_current_end[1]) + "-" + str(d_current_end[2]))
		res = db_aggregate(since=d_current,to=d_current_end)
		results.append({"from":d_current,"to":d_current_end,"scrobbles":res})
		d_current = getNext(d_current,step,stepn)
		if isPast(d_current_end,d_end):
			break

	return {"list":results}


@dbserver.route("/top/artists")
def get_top_artists():
	since = request.query.get("since")
	to = request.query.get("to")
	(ts_start,ts_end) = getTimestamps(since,to)
	step = request.query.get("step","month")
	trail = int(request.query.get("trail",3))

	[step,stepn] = (step.split("-") + [1])[:2]	# makes the multiplier 1 if not assigned
	stepn = int(stepn)

	d_start = getStartOf(ts_start,step)
	d_end = getStartOf(ts_end,step)

	d_start = getNext(d_start,step,stepn)			# first range should end right after the first active scrobbling week / month / whatever relevant step
	d_start = getNext(d_start,step,stepn * trail * -1)	# go one range back to begin

	results = []

	d_current = d_start
	while True:
		d_current_end = getNext(d_current,step,stepn * trail)
		#print("Checking from " + str(d_current[0]) + "-" + str(d_current[1]) + "-" + str(d_current[2]) + " to " + str(d_current_end[0]) + "-" + str(d_current_end[1]) + "-" + str(d_current_end[2]))
		try:
			res = db_aggregate(since=d_current,to=d_current_end,by="ARTIST")[0]
			results.append({"from":d_current,"to":d_current_end,"artist":res["artist"],"scrobbles":res["scrobbles"]})
		except:
			results.append({"from":d_current,"to":d_current_end,"artist":None,"scrobbles":0})
		d_current = getNext(d_current,step,stepn)
		if isPast(d_current_end,d_end):
			break

	return {"list":results}

@dbserver.route("/top/tracks")
def get_top_tracks():
	since = request.query.get("since")
	to = request.query.get("to")
	(ts_start,ts_end) = getTimestamps(since,to)
	step = request.query.get("step","month")
	trail = int(request.query.get("trail",3))

	[step,stepn] = (step.split("-") + [1])[:2]	# makes the multiplier 1 if not assigned
	stepn = int(stepn)

	d_start = getStartOf(ts_start,step)
	d_end = getStartOf(ts_end,step)

	d_start = getNext(d_start,step,stepn)			# first range should end right after the first active scrobbling week / month / whatever relevant step
	d_start = getNext(d_start,step,stepn * trail * -1)	# go one range back to begin

	results = []

	d_current = d_start
	while True:
		d_current_end = getNext(d_current,step,stepn * trail)
		#print("Checking from " + str(d_current[0]) + "-" + str(d_current[1]) + "-" + str(d_current[2]) + " to " + str(d_current_end[0]) + "-" + str(d_current_end[1]) + "-" + str(d_current_end[2]))
		try:
			res = db_aggregate(since=d_current,to=d_current_end,by="TRACK")[0]
			results.append({"from":d_current,"to":d_current_end,"track":res["track"],"scrobbles":res["scrobbles"]})
		except:
			results.append({"from":d_current,"to":d_current_end,"track":None,"scrobbles":0})
		d_current = getNext(d_current,step,stepn)
		if isPast(d_current_end,d_end):
			break

	return {"list":results}


def getStartOf(timestamp,unit):
	date = datetime.datetime.utcfromtimestamp(timestamp)
	if unit == "year":
		return [date.year,1,1]
	elif unit == "month":
		return [date.year,date.month,1]
	elif unit == "day":
		return [date.year,date.month,date.day]
	elif unit == "week":
		change = (date.weekday() + 1) % 7
		d = datetime.timedelta(days=change)
		newdate = date - d
		return [newdate.year,newdate.month,newdate.day]

def getNext(time,unit,step=1):
	if unit == "year":
		return [time[0] + step,time[1],time[2]]
	elif unit == "month":
		result = [time[0],time[1] + step,time[2]]
		while result[1] > 12:
			result[1] -= 12
			result[0] += 1
		while result[1] < 1:
			result[1] += 12
			result[0] -= 1
		return result
	elif unit == "day":
		dt = datetime.datetime(time[0],time[1],time[2])
		d = datetime.timedelta(days=step)
		newdate = dt + d
		return [newdate.year,newdate.month,newdate.day]
		#eugh
	elif unit == "week":
		return getNext(time,"day",step * 7)

@dbserver.route("/artistinfo")
def artistInfo():
	keys = FormsDict.decode(request.query)
	artist = keys.get("artist")

	charts = db_aggregate(by="ARTIST")
	scrobbles = len(db_query(artists=[artist])) #we cant take the scrobble number from the charts because that includes all countas scrobbles
	try:
		c = [e for e in charts if e["artist"] == artist][0]
		others = coa.getAllAssociated(artist)
		return {"scrobbles":scrobbles,"position":charts.index(c) + 1,"associated":others}
	except:
		# if the artist isnt in the charts, they are not being credited and we need to show information about the credited one
		artist = coa.getCredited(artist)
		c = [e for e in charts if e["artist"] == artist][0]
		return {"replace":artist,"scrobbles":scrobbles,"position":charts.index(c) + 1}

def isPast(date,limit):
	if not date[0] == limit[0]:
		return date[0] > limit[0]
	if not date[1] == limit[1]:
		return date[1] > limit[1]
	return (date[2] > limit[2])

@dbserver.get("/newscrobble")
def pseudo_post_scrobble():
	keys = FormsDict.decode(request.query) # The Dal★Shabet handler
	artists = keys.get("artist")
	title = keys.get("title")
	try:
		time = int(keys.get("time"))
	except:
		time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	(artists,title) = cla.fullclean(artists,title)

	## this is necessary for localhost testing
	response.set_header("Access-Control-Allow-Origin","*")

	createScrobble(artists,title,time)

	if (time - lastsync) > 3600:
		sync()

	return ""

@dbserver.post("/newscrobble")
def post_scrobble():
	keys = FormsDict.decode(request.forms) # The Dal★Shabet handler
	artists = keys.get("artist")
	title = keys.get("title")
	apikey = keys.get("key")
	if not (checkAPIkey(apikey)):
		response.status = 403
		return ""

	try:
		time = int(keys.get("time"))
	except:
		time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	(artists,title) = cla.fullclean(artists,title)

	## this is necessary for localhost testing
	response.set_header("Access-Control-Allow-Origin","*")

	createScrobble(artists,title,time)

	if (time - lastsync) > 3600:
		sync()

	return ""

@dbserver.route("/sync")
def abouttoshutdown():
	sync()
	#sys.exit()

@dbserver.post("/newrule")
def newrule():
	keys = FormsDict.decode(request.forms)
	addEntry("rules/webmade.tsv",[k for k in keys])
	global db_rulestate
	db_rulestate = False

@dbserver.route("/issues")
def issues():
	combined = []
	duplicates = []
	newartists = []
	inconsistent = not db_rulestate
	# if the user manually edits files while the server is running this won't show, but too lazy to check the rulestate here

	import itertools
	import difflib

	sortedartists = ARTISTS.copy()
	sortedartists.sort(key=len,reverse=True)
	reversesortedartists = sortedartists.copy()
	reversesortedartists.reverse()
	for a in reversesortedartists:

		nochange = cla.confirmedReal(a)

		st = a
		lis = []
		reachedmyself = False
		for ar in sortedartists:
			if (ar != a) and not reachedmyself:
				continue
			elif not reachedmyself:
				reachedmyself = True
				continue

			if (ar.lower() == a.lower()) or ("the " + ar.lower() == a.lower()) or ("a " + ar.lower() == a.lower()):
				duplicates.append((ar,a))
				break

			if (ar + " " in st) or (" " + ar in st):
				lis.append(ar)
				st = st.replace(ar,"").strip()
			elif (ar == st):
				lis.append(ar)
				st = ""
				if not nochange:
					combined.append((a,lis))
				break

			elif (ar in st) and len(ar)*2 > len(st):
				duplicates.append((a,ar))

		st = st.replace("&","").replace("and","").replace("with","").strip()
		if st != "" and st != a:
			if len(st) < 5 and len(lis) == 1:
				#check if we havent just randomly found the string in another word
				#if (" " + st + " ") in lis[0] or (lis[0].endswith(" " + st)) or (lis[0].startswith(st + " ")):
				duplicates.append((a,lis[0]))
			elif len(st) < 5 and len(lis) > 1 and not nochange:
				combined.append((a,lis))
			elif len(st) >= 5 and not nochange:
				#check if we havent just randomly found the string in another word
				if (" " + st + " ") in a or (a.endswith(" " + st)) or (a.startswith(st + " ")):
					newartists.append((st,a,lis))

	#for c in itertools.combinations(ARTISTS,3):
	#	l = list(c)
	#	print(l)
	#	l.sort(key=len,reverse=True)
	#	[full,a1,a2] = l
	#	if (a1 + " " + a2 in full) or (a2 + " " + a1 in full):
	#		combined.append((full,a1,a2))


	#for c in itertools.combinations(ARTISTS,2):
	#	if
	#
	#	if (c[0].lower == c[1].lower):
	#		duplicates.append((c[0],c[1]))


	#	elif (c[0] + " " in c[1]) or (" " + c[0] in c[1]) or (c[1] + " " in c[0]) or (" " + c[1] in c[0]):
	#		if (c[0] in c[1]):
	#			full, part = c[1],c[0]
	#			rest = c[1].replace(c[0],"").strip()
	#		else:
	#			full, part = c[0],c[1]
	#			rest = c[0].replace(c[1],"").strip()
	#		if rest in ARTISTS and full not in [c[0] for c in combined]:
	#			combined.append((full,part,rest))

	#	elif (c[0] in c[1]) or (c[1] in c[0]):
	#		duplicates.append((c[0],c[1]))


	return {"duplicates":duplicates,"combined":combined,"newartists":newartists,"inconsistent":inconsistent}

@dbserver.post("/rebuild")
def rebuild():
	global db_rulestate
	db_rulestate = False
	sync()
	os.system("python3 fixexisting.py")
	global cla, coa
	cla = CleanerAgent()
	coa = CollectorAgent()
	build_db()


####
## Server operation
####


# Starts the server
def runserver(PORT):
	global lastsync
	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	build_db()


	loadAPIkeys()

	run(dbserver, host='0.0.0.0', port=PORT, server='waitress')


def build_db():


	global SCROBBLES, ARTISTS, TRACKS

	SCROBBLES = []
	ARTISTS = []
	TRACKS = []


	db = parseAllTSV("scrobbles","int","string","string")
	for sc in db:
		artists = sc[1].split("␟")
		title = sc[2]
		time = sc[0]

		readScrobble(artists,title,time)


	SCROBBLES.sort(key = lambda tup: tup[1])

	coa.updateIDs(ARTISTS)

	global db_rulestate
	db_rulestate = consistentRulestate("scrobbles",cla.checksums)


# Saves all cached entries to disk
def sync():

	# all entries by file collected
	# so we don't open the same file for every entry
	entries = {}

	for idx in range(len(SCROBBLES)):
		if not SCROBBLES[idx][2]:

			t = getScrobbleObject(SCROBBLES[idx])

			artistlist = list(t["artists"])
			artistlist.sort() #we want the order of artists to be deterministic so when we update files with new rules a diff can see what has actually been changed
			artistss = "␟".join(artistlist)
			timestamp = datetime.date.fromtimestamp(t["time"])

			entry = [str(t["time"]),artistss,t["title"]]

			monthcode = str(timestamp.year) + "_" + str(timestamp.month)
			entries.setdefault(monthcode,[]).append(entry) #i feckin love the setdefault function

			SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True)

	for e in entries:
		addEntries("scrobbles/" + e + ".tsv",entries[e])
		combineChecksums("scrobbles/" + e + ".tsv",cla.checksums)


	global lastsync
	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	print("Database saved to disk.")


####
## Database queries
####


# Queries the database
def db_query(artists=None,title=None,track=None,since=None,to=None,associated=False):
	(since, to) = getTimestamps(since,to)


	# this is not meant as a search function. we *can* query the db with a string, but it only works if it matches exactly
	# if a title is specified, we assume that a specific track (with the exact artist combination) is requested
	# if not, multiple artists are interpreted as requesting all scrobbles they were all involved in (but possibly other too)
	# eg a track named "Awesome Song" by "TWICE", "AOA" and "f(x)" would count when we specifiy only the artists "AOA" and "f(x)", but not when we add the title (because then we'd be
	# looking for that specific track with only those two artists - which could in fact exist)

	artists = set([(ARTISTS.index(a) if isinstance(a,str) else a) for a in artists])
	#for artist in artists:
	#	if isinstance(artist, str):
	#	artist = ARTISTS.index(artist)
	#if isinstance(title, str):
	#	track = (frozenset(artists),title)
	#	track = TRACKS.index(track)

	# if track is specified (only number works), we ignore title string
	if title!=None and track==None:
		track = TRACKS.index((frozenset(artists),title))


	if artists == []:
		artists = None


	# right now we always request everything by name, maybe we don't actually need the request by number, but i'll leave it in for now

	if associated:
		return [getScrobbleObject(s) for s in SCROBBLES if (s[0] == track or track==None) and (artists==None or artists.issubset(coa.getCreditedList(TRACKS[s[0]][0]))) and (since < s[1] < to)]
	else:
		return [getScrobbleObject(s) for s in SCROBBLES if (s[0] == track or track==None) and (artists==None or artists.issubset(TRACKS[s[0]][0])) and (since < s[1] < to)]
	# pointless to check for artist when track is checked because every track has a fixed set of artists, but it's more elegant this way


# Queries that... well... aggregate
def db_aggregate(by=None,since=None,to=None,artist=None):
	(since, to) = getTimestamps(since,to)

	if isinstance(artist, str):
		artist = ARTISTS.index(artist)

	if (by=="ARTIST"):
		#this is probably a really bad idea
		#for a in ARTISTS:
		#	num = len(db_query(artist=a,since=since,to=to))
		#

		# alright let's try for real
		charts = {}
		for s in [scr for scr in SCROBBLES if since < scr[1] < to]:
			artists = TRACKS[s[0]][0]
			for a in coa.getCreditedList(artists):
				# this either creates the new entry or increments the existing one
				charts[a] = charts.setdefault(a,0) + 1

		ls = [{"artist":getArtistObject(ARTISTS[a]),"scrobbles":charts[a],"counting":coa.getAllAssociated(ARTISTS[a])} for a in charts]
		return sorted(ls,key=lambda k:k["scrobbles"], reverse=True)

	elif (by=="TRACK"):
		charts = {}
		for s in [scr for scr in SCROBBLES if since < scr[1] < to and (artist==None or (artist in TRACKS[scr[0]][0]))]:
			track = s[0]
			# this either creates the new entry or increments the existing one
			charts[track] = charts.setdefault(track,0) + 1

		ls = [{"track":getTrackObject(TRACKS[t]),"scrobbles":charts[t]} for t in charts]
		return sorted(ls,key=lambda k:k["scrobbles"], reverse=True)

	else:
		return len([scr for scr in SCROBBLES if since < scr[1] < to])


# Search for strings
def db_search(query,type=None):
	if type=="ARTIST":
		results = []
		for a in ARTISTS:
			if query.lower() in a.lower():
				results.append(a)

	if type=="TRACK":
		results = []
		for t in TRACKS:
			if query.lower() in t[1].lower():
				results.append(t)

	return results


####
## Useful functions
####


# Takes user inputs like YYYY/MM and returns the timestamps. Returns timestamp if timestamp was already given.
def getTimestamps(f,t):
	#(f,t) = inp
	if isinstance(f, str) and f.lower() == "today":
		tod = datetime.date.today()
		f = [tod.year,tod.month,tod.day]
	if isinstance(t, str) and t.lower() == "today":
		tod = datetime.date.today()
		t = [tod.year,tod.month,tod.day]


	if isinstance(f, str):
		f = [int(x) for x in f.split("/")]

	if isinstance(t, str):
		t = [int(x) for x in t.split("/")]


	# this step is done if either the input is a list or the first step was done (which creates a list)
	if isinstance(f, list):
		date = [1970,1,1,0,0]
		date[:len(f)] = f
		f = int(datetime.datetime(date[0],date[1],date[2],date[3],date[4],tzinfo=datetime.timezone.utc).timestamp())

	if isinstance(t, list):
		date = [1970,1,1,0,0]
		date[:len(t)] = t
		t = int(datetime.datetime(date[0],date[1],date[2],date[3],date[4],tzinfo=datetime.timezone.utc).timestamp())


	if (f==None):
		f = min(timestamps)
	if (t==None):
		t = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).timestamp()

	return (f,t)


def getArtistId(nameorid):
	if isinstance(nameorid,int):
		return nameorid
	else:
		try:
			return ARTISTS.index(nameorid)
		except:
			return -1