Initial work on SQLite

krateng 2022-01-03 02:08:02 +01:00
parent 68a450672e
commit 9eb8dc0b47
3 changed files with 212 additions and 150 deletions

maloja/database.py
View File

@@ -23,6 +23,9 @@ except: pass
import doreah
#db
import sqlalchemy as sql
# technical
import os
@@ -31,7 +34,7 @@ import sys
import unicodedata
from collections import namedtuple
from threading import Lock
import yaml
import yaml, json
import lru
import math
@@ -688,151 +691,154 @@ def get_predefined_rulesets():
## Server operation
####
DB = {}
# Set up the SQLite database
engine = sql.create_engine(f"sqlite:///{data_dir['scrobbles']('malojadb.sqlite')}", echo = False)
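# the connection URL resolves to something like (path illustrative):
# sqlite:////home/user/.local/share/maloja/scrobbles/malojadb.sqlite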
meta = sql.MetaData()
DB['scrobbles'] = sql.Table(
	'scrobbles', meta,
	sql.Column('timestamp',sql.Integer,primary_key=True),
	sql.Column('rawscrobble',sql.String),
	sql.Column('origin',sql.String),
	sql.Column('duration',sql.Integer),
	sql.Column('track_id',sql.Integer)
)
DB['tracks'] = sql.Table(
	'tracks', meta,
	sql.Column('id',sql.Integer,primary_key=True),
	sql.Column('title',sql.String),
	sql.Column('title_normalized',sql.String)
)
DB['artists'] = sql.Table(
	'artists', meta,
	sql.Column('id',sql.Integer,primary_key=True),
	sql.Column('name',sql.String),
	sql.Column('name_normalized',sql.String)
)
# junction table for the many-to-many relation between tracks and artists
DB['trackartists'] = sql.Table(
	'trackartists', meta,
	sql.Column('id',sql.Integer,primary_key=True),
	sql.Column('artist_id',sql.Integer),
	sql.Column('track_id',sql.Integer)
)

meta.create_all(engine)
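# To illustrate how the tables relate (rows made up): each scrobble references
# one track, and 'trackartists' links tracks to their artists many-to-many:
#
#	scrobbles:    (timestamp=1641168000, track_id=1, ...)
#	tracks:       (id=1, title='Example Song', ...)
#	trackartists: (track_id=1, artist_id=1), (track_id=1, artist_id=2)
#	artists:      (id=1, name='Artist A'), (id=2, name='Artist B')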
#### ATTENTION ALL ADVENTURERS
#### THIS IS WHAT A SCROBBLE DICT WILL LOOK LIKE FROM NOW ON
#### THIS IS THE SINGLE CANONICAL SOURCE OF TRUTH
#### STOP MAKING DIFFERENT LITTLE DICTS IN EVERY SINGLE FUNCTION
#### THIS IS THE SCHEMA THAT WILL DEFINITELY 100% STAY LIKE THIS AND NOT
#### RANDOMLY GET CHANGED TWO VERSIONS LATER
#### HERE WE GO
#
# {
# "time":int,
# "track":{
# "artists":list,
# "title":string,
# "album":{
# "name":string,
# "artists":list
# },
# "length":None
# },
# "duration":int,
# "origin":string
# }
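# For instance, one concrete scrobble dict in this shape (values illustrative):
#
# {
# 	"time": 1641168000,
# 	"track": {
# 		"artists": ["Artist A","Artist B"],
# 		"title": "Example Song",
# 		"album": {
# 			"name": "Example Album",
# 			"artists": ["Artist A"]
# 		},
# 		"length": None
# 	},
# 	"duration": 200,
# 	"origin": "native"
# }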
def add_scrobble(scrobbledict):
	add_scrobbles([scrobbledict])

def add_scrobbles(scrobbleslist):

	# build one insert statement per scrobble, then execute them in a single transaction
	ops = [
		DB['scrobbles'].insert().values(
			rawscrobble=json.dumps(s),
			timestamp=s['time'],
			origin=s['origin'],
			duration=s['duration'] or -1,
			track_id=get_track_id(s['track'])
		) for s in scrobbleslist
	]

	with engine.begin() as conn:
		for op in ops:
			conn.execute(op)
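# Usage sketch (dicts shaped like the canonical schema above):
#
#	add_scrobbles([scrobble1, scrobble2])	# batch insert in one transaction
#	add_scrobble(scrobble1)			# single-entry convenience wrapper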
### DB interface functions - these will 'get' the ID of an entity,
### creating it if necessary
def get_track_id(trackdict):
	ntitle = normalize_name(trackdict['title'])
	# resolve the artists first - get_artist_id creates any that don't exist yet
	artist_ids = [get_artist_id(a) for a in trackdict['artists']]

	with engine.begin() as conn:
		op = sql.select(
			DB['tracks'].c.id
		).where(
			DB['tracks'].c.title_normalized==ntitle
		)
		result = conn.execute(op)
		for row in result:
			print("ID for",trackdict['title'],"was",row[0])
			return row[0]

	with engine.begin() as conn:
		op = DB['tracks'].insert().values(
			title=trackdict['title'],
			title_normalized=ntitle
		)
		result = conn.execute(op)

	print("Created",trackdict['title'],result.inserted_primary_key)
	return result.inserted_primary_key[0]
def get_artist_id(artistname):
	nname = normalize_name(artistname)
	print("looking for",nname)

	with engine.begin() as conn:
		op = sql.select(
			DB['artists'].c.id
		).where(
			DB['artists'].c.name_normalized==nname
		)
		result = conn.execute(op)
		for row in result:
			print("ID for",artistname,"was",row[0])
			return row[0]

	with engine.begin() as conn:
		op = DB['artists'].insert().values(
			name=artistname,
			name_normalized=nname
		)
		result = conn.execute(op)

	print("Created",artistname,result.inserted_primary_key)
	return result.inserted_primary_key[0]
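# Illustrative get-or-create behaviour (hypothetical values, and assuming
# normalize_name maps both spellings to the same normalized form):
#
#	get_artist_id("Example Artist")   # no match -> inserts a row, returns e.g. 1
#	get_artist_id("example artist")   # normalized name matches, returns the same 1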
def start_db():
	log("Starting database...")
	global lastsync
	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	build_db()
	#run(dbserver, host='::', port=PORT, server='waitress')
	log("Database reachable!")
def build_db():

	global dbstatus
	dbstatus['healthy'] = False
	dbstatus['complete'] = False
	dbstatus['rebuildinprogress'] = True

	log("Building database...")

	global SCROBBLES, ARTISTS, TRACKS
	global TRACKS_NORMALIZED_SET, TRACKS_NORMALIZED, ARTISTS_NORMALIZED_SET, ARTISTS_NORMALIZED
	global SCROBBLESDICT, STAMPS

	SCROBBLES = []
	ARTISTS = []
	TRACKS = []
	STAMPS = []
	SCROBBLESDICT = {}
	TRACKS_NORMALIZED = []
	ARTISTS_NORMALIZED = []
	ARTISTS_NORMALIZED_SET = set()
	TRACKS_NORMALIZED_SET = set()

	# parse files
	db = tsv.parse_all(data_dir['scrobbles'](),"int","string","string",comments=False)
	scrobblenum = len(db)
	log(f"Found {scrobblenum} scrobbles...")

	usebar = not malojaconfig["CLEAN_OUTPUT"]
	if usebar: pbar = ProgressBar(max=scrobblenum,prefix="Loading scrobbles")
	else:
		n = 0
		m = max(int(scrobblenum / 25),20)

	#db = parseAllTSV("scrobbles","int","string","string",escape=False)
	for sc in db:
		# the TSV format separates multiple artists with the unit separator character
		artists = sc[1].split("␟")
		title = sc[2]
		time = sc[0]

		readScrobble(artists,title,time)
		if usebar: pbar.progress()
		else:
			n += 1
			if n % m == 0: log(f"Loaded {n}/{scrobblenum}...")

	if usebar: pbar.done()

	log("Database loaded, optimizing...")

	# optimize database
	SCROBBLES.sort(key = lambda tup: tup[1])
	#SCROBBLESDICT = {obj[1]:obj for obj in SCROBBLES}
	STAMPS = [t for t in SCROBBLESDICT]
	STAMPS.sort()

	# inform malojatime module about earliest scrobble
	if STAMPS: register_scrobbletime(STAMPS[0])

	# NOT NEEDED BECAUSE WE DO THAT ON ADDING EVERY ARTIST ANYWAY
	# get extra artists with no real scrobbles from countas rules
	#for artist in coa.getAllArtists():
	#for artist in coa.getCreditedList(ARTISTS):
	#	if artist not in ARTISTS:
	#		log(artist + " is added to database because of countas rules",module="debug")
	#		ARTISTS.append(artist)

	#coa.updateIDs(ARTISTS)

	dbstatus['healthy'] = True

	# start regular tasks
	utilities.update_medals()
	utilities.update_weekly()
	utilities.send_stats()

	global ISSUES
	ISSUES = check_issues()

	dbstatus['complete'] = True
	dbstatus['rebuildinprogress'] = False

	log("Database fully built!")

	from . import upgrade
	upgrade.upgrade_db(add_scrobbles)
# Saves all cached entries to disk
def sync():

	# collect entries grouped by target file
	# so we don't open the same file for every entry
	#log("Syncing",module="debug")
	entries = {}

	for idx in range(len(SCROBBLES)):
		if not SCROBBLES[idx].saved:

			t = get_scrobble_dict(SCROBBLES[idx])

			artistlist = list(t["artists"])
			artistlist.sort() # sort artists deterministically so that when files are rewritten with new rules, a diff only shows real changes
			artistss = "␟".join(artistlist)
			timestamp = datetime.date.fromtimestamp(t["time"])

			album = t["album"] or "-"
			duration = t["duration"] or "-"

			entry = [str(t["time"]),artistss,t["title"],album,duration]

			monthcode = str(timestamp.year) + "_" + str(timestamp.month)
			entries.setdefault(monthcode,[]).append(entry) #i feckin love the setdefault function

			# save copy with last tuple entry set to true
			SCROBBLES[idx] = Scrobble(*SCROBBLES[idx][:-1],True)

	#log("Sorted into months",module="debug")

	for e in entries:
		tsv.add_entries(data_dir['scrobbles'](e + ".tsv"),entries[e],comments=False)
		#addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)

	#log("Written files",module="debug")

	global lastsync
	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	#log("Database saved to disk.")

	# save cached images
	#saveCache()

maloja/globalconf.py
View File

@@ -7,6 +7,7 @@ from .__pkginfo__ import VERSION
# if DATA_DIRECTORY is specified, this is the directory to use for EVERYTHING, no matter what
# but with asymmetrical structure, cache and logs in subfolders
# otherwise, each directory is treated separately
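# e.g. (paths illustrative, not prescribed):
# DATA_DIRECTORY=/opt/maloja -> cache in /opt/maloja/cache, logs in /opt/maloja/logs,
# everything else directly in /opt/maloja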
@@ -311,24 +312,19 @@ config(
### API KEYS
### symmetric keys are fine since we hopefully use HTTPS
### symmetric keys are fine for now since we hopefully use HTTPS
apikeystore = KeyStore(file=data_dir['clients']("apikeys.yml"),save_endpoint="/apis/mlj_1/apikeys")
from . import upgrade
upgrade.upgrade_apikeys()
oldfile = pthj(dir_settings['config'],"clients","authenticated_machines.tsv")
if os.path.exists(oldfile):
	try:
		from doreah import tsv
		clients = tsv.parse(oldfile,"string","string")
		for key,identifier in clients:
			apikeystore[identifier] = key
		os.remove(oldfile)
	except:
		pass
# what the fuck did i just write

maloja/upgrade.py Normal file
View File

@@ -0,0 +1,60 @@
# This module should take care of recognizing old install data
# and upgrading it before the actual server deals with it

import os
import re

from doreah.logging import log

from .globalconf import data_dir, dir_settings, apikeystore


def upgrade_apikeys():

	oldfile = os.path.join(dir_settings['config'],"clients","authenticated_machines.tsv")
	if os.path.exists(oldfile):
		try:
			from doreah import tsv
			clients = tsv.parse(oldfile,"string","string")
			for key,identifier in clients:
				apikeystore[identifier] = key
			os.remove(oldfile)
		except:
			# ignore errors - a failed key migration should not stop the server
			pass


def upgrade_db(callback_add_scrobbles):

	oldfolder = os.path.join(dir_settings['state'],"scrobbles")
	if os.path.exists(oldfolder):
		scrobblefiles = os.listdir(oldfolder)
		for sf in scrobblefiles:
			if sf.endswith(".tsv"):
				log(f"Found old tsv scrobble file: {sf}")
				if re.match(r"[0-9]+_[0-9]+\.tsv",sf):
					origin = 'native'
				elif sf == "lastfmimport.tsv":
					origin = 'lastfm-import'
				else:
					origin = 'unknown'

				from doreah import tsv
				scrobbles = tsv.parse(os.path.join(oldfolder,sf),"int","string","string","string","string",comments=False)
				scrobblelist = []
				for scrobble in scrobbles:
					timestamp, artists, title, album, duration = scrobble
					if album in ('-',''): album = None
					if duration in ('-',''): duration = None
					scrobblelist.append({
						"time":int(timestamp),
						"track":{
							"artists":artists.split("␟"),
							"title":title,
							"album":{
								"name":album,
								"artists":None
							},
							"length":None
						},
						"duration":duration,
						"origin":origin
					})
				callback_add_scrobbles(scrobblelist)
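# Note: upgrade_db is invoked from database.build_db with add_scrobbles as the
# callback, so legacy scrobbles are written through the same code path as new ones.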