
Initial work on SQLite

krateng 2022-01-03 02:08:02 +01:00
parent 68a450672e
commit 9eb8dc0b47
3 changed files with 212 additions and 150 deletions


@@ -23,6 +23,9 @@ except: pass
import doreah
#db
import sqlalchemy as sql
# technical
import os
@@ -31,7 +34,7 @@ import sys
import unicodedata
from collections import namedtuple
from threading import Lock
import yaml, json
import lru
import math
@@ -688,151 +691,154 @@ def get_predefined_rulesets():
## Server operation
####
DB = {}
engine = sql.create_engine(f"sqlite:///{data_dir['scrobbles']('malojadb.sqlite')}", echo = False)
meta = sql.MetaData()
DB['scrobbles'] = sql.Table(
	'scrobbles', meta,
	sql.Column('timestamp',sql.Integer,primary_key=True),
	sql.Column('rawscrobble',sql.String),
	sql.Column('origin',sql.String),
	sql.Column('duration',sql.Integer),
	sql.Column('track_id',sql.Integer)
)
DB['tracks'] = sql.Table(
	'tracks', meta,
	sql.Column('id',sql.Integer,primary_key=True),
	sql.Column('title',sql.String),
	sql.Column('title_normalized',sql.String)
)
DB['artists'] = sql.Table(
	'artists', meta,
	sql.Column('id',sql.Integer,primary_key=True),
	sql.Column('name',sql.String),
	sql.Column('name_normalized',sql.String)
)
DB['trackartists'] = sql.Table(
	'trackartists', meta,
	sql.Column('id',sql.Integer,primary_key=True),
	sql.Column('artist_id',sql.Integer),
	sql.Column('track_id',sql.Integer)
)

meta.create_all(engine)
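For orientation (this sketch is not part of the commit): scrobbles reference tracks via track_id, and trackartists is the link table between tracks and artists. Assuming a SQLAlchemy 1.4-style select(), listing scrobbles with their track titles and credited artists against the DB/engine objects above would look roughly like this:

# --- illustrative sketch, not from this commit ---
stmt = sql.select(
	DB['scrobbles'].c.timestamp,
	DB['tracks'].c.title,
	DB['artists'].c.name
).select_from(
	DB['scrobbles']
	.join(DB['tracks'], DB['scrobbles'].c.track_id == DB['tracks'].c.id)
	.join(DB['trackartists'], DB['trackartists'].c.track_id == DB['tracks'].c.id)
	.join(DB['artists'], DB['artists'].c.id == DB['trackartists'].c.artist_id)
).order_by(DB['scrobbles'].c.timestamp)

with engine.connect() as conn:
	# one row per (scrobble, credited artist) pair
	for timestamp, title, artist in conn.execute(stmt):
		print(timestamp, artist, "-", title)
# --- end sketch ---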
#### ATTENTION ALL ADVENTURERS
#### THIS IS WHAT A SCROBBLE DICT WILL LOOK LIKE FROM NOW ON
#### THIS IS THE SINGLE CANONICAL SOURCE OF TRUTH
#### STOP MAKING DIFFERENT LITTLE DICTS IN EVERY SINGLE FUNCTION
#### THIS IS THE SCHEMA THAT WILL DEFINITELY 100% STAY LIKE THIS AND NOT
#### RANDOMLY GET CHANGED TWO VERSIONS LATER
#### HERE WE GO
#
# {
# "time":int,
# "track":{
# "artists":list,
# "title":string,
# "album":{
# "name":string,
# "artists":list
# },
# "length":None
# },
# "duration":int,
# "origin":string
# }
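To make that schema concrete, here is one example instance (all values invented for illustration) and how it would enter the database via the functions below:

# --- illustrative example, not from this commit; values are invented ---
example_scrobble = {
	"time": 1641168000,                       # unix timestamp of the listen
	"track": {
		"artists": ["Example Artist A", "Example Artist B"],
		"title": "Example Title",
		"album": {
			"name": "Example Album",
			"artists": None
		},
		"length": None
	},
	"duration": 210,                          # seconds, or None if unknown
	"origin": "native"
}
# add_scrobble(example_scrobble) wraps it in a list and hands it to
# add_scrobbles(), which stores the raw dict as JSON plus the resolved track_id.
# --- end example ---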
def add_scrobble(scrobbledict):
	add_scrobbles([scrobbledict])

def add_scrobbles(scrobbleslist):

	ops = [
		DB['scrobbles'].insert().values(
			rawscrobble=json.dumps(s),
			timestamp=s['time'],
			origin=s['origin'],
			duration=s['duration'] or -1,
			track_id=get_track_id(s['track'])
		) for s in scrobbleslist
	]

	with engine.begin() as conn:
		for op in ops:
			conn.execute(op)
### DB interface functions - these will 'get' the ID of an entity,
### creating it if necessary
def get_track_id(trackdict):
	ntitle = normalize_name(trackdict['title'])
	artist_ids = [get_artist_id(a) for a in trackdict['artists']]

	with engine.begin() as conn:
		op = DB['tracks'].select(
			DB['tracks'].c.id
		).where(
			DB['tracks'].c.title_normalized==ntitle
		)
		result = conn.execute(op)
		for row in result:
			print("ID for",trackdict['title'],"was",row[0])
			return row[0]

	with engine.begin() as conn:
		op = DB['tracks'].insert().values(
			title=trackdict['title'],
			title_normalized=ntitle
		)
		result = conn.execute(op)
		print("Created",trackdict['title'],result.inserted_primary_key)
		return result.inserted_primary_key[0]
def get_artist_id(artistname):
	nname = normalize_name(artistname)
	print("looking for",nname)

	with engine.begin() as conn:
		op = DB['artists'].select(
			DB['artists'].c.id
		).where(
			DB['artists'].c.name_normalized==nname
		)
		result = conn.execute(op)
		for row in result:
			print("ID for",artistname,"was",row[0])
			return row[0]

	with engine.begin() as conn:
		op = DB['artists'].insert().values(
			name=artistname,
			name_normalized=nname
		)
		result = conn.execute(op)
		print("Created",artistname,result.inserted_primary_key)
		return result.inserted_primary_key[0]
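A quick usage sketch (not from the commit) of the get-or-create behaviour of these helpers: the first lookup inserts a row and returns its primary key, repeat lookups return the same id.

# --- illustrative usage, not from this commit ---
id_first = get_artist_id("Example Artist")   # no match -> inserts a row, returns its id
id_again = get_artist_id("Example Artist")   # finds the existing row, returns the same id
assert id_first == id_again

track_id = get_track_id({"artists": ["Example Artist"], "title": "Example Title"})
# note: at this stage the artist ids are looked up but not yet written to trackartists
# --- end sketch ---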
def start_db():
	from . import upgrade
	upgrade.upgrade_db(add_scrobbles)
	log("Starting database...")
	global lastsync
	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	build_db()
	#run(dbserver, host='::', port=PORT, server='waitress')
	log("Database reachable!")
def build_db():

	global dbstatus
	dbstatus['healthy'] = False
	dbstatus['complete'] = False
	dbstatus['rebuildinprogress'] = True

	log("Building database...")

	global SCROBBLES, ARTISTS, TRACKS
	global TRACKS_NORMALIZED_SET, TRACKS_NORMALIZED, ARTISTS_NORMALIZED_SET, ARTISTS_NORMALIZED
	global SCROBBLESDICT, STAMPS

	SCROBBLES = []
	ARTISTS = []
	TRACKS = []
	STAMPS = []
	SCROBBLESDICT = {}
	TRACKS_NORMALIZED = []
	ARTISTS_NORMALIZED = []
	ARTISTS_NORMALIZED_SET = set()
	TRACKS_NORMALIZED_SET = set()

	# parse files
	db = tsv.parse_all(data_dir['scrobbles'](),"int","string","string",comments=False)
	scrobblenum = len(db)
	log(f"Found {scrobblenum} scrobbles...")
	usebar = not malojaconfig["CLEAN_OUTPUT"]
	if usebar: pbar = ProgressBar(max=scrobblenum,prefix="Loading scrobbles")
	else:
		n = 0
		m = max(int(scrobblenum / 25),20)
	#db = parseAllTSV("scrobbles","int","string","string",escape=False)
	for sc in db:
		artists = sc[1].split("␟")
		title = sc[2]
		time = sc[0]
		readScrobble(artists,title,time)
		if usebar: pbar.progress()
		else:
			n += 1
			if n % m == 0: log(f"Loaded {n}/{scrobblenum}...")
	if usebar: pbar.done()

	log("Database loaded, optimizing...")

	# optimize database
	SCROBBLES.sort(key = lambda tup: tup[1])
	#SCROBBLESDICT = {obj[1]:obj for obj in SCROBBLES}
	STAMPS = [t for t in SCROBBLESDICT]
	STAMPS.sort()

	# inform malojatime module about earliest scrobble
	if STAMPS: register_scrobbletime(STAMPS[0])

	# NOT NEEDED BECAUSE WE DO THAT ON ADDING EVERY ARTIST ANYWAY
	# get extra artists with no real scrobbles from countas rules
	#for artist in coa.getAllArtists():
	#for artist in coa.getCreditedList(ARTISTS):
	#	if artist not in ARTISTS:
	#		log(artist + " is added to database because of countas rules",module="debug")
	#		ARTISTS.append(artist)
	# coa.updateIDs(ARTISTS)

	dbstatus['healthy'] = True

	#start regular tasks
	utilities.update_medals()
	utilities.update_weekly()
	utilities.send_stats()

	global ISSUES
	ISSUES = check_issues()

	dbstatus['complete'] = True
	dbstatus['rebuildinprogress'] = False

	log("Database fully built!")
# Saves all cached entries to disk
def sync():

	# all entries by file collected
	# so we don't open the same file for every entry
	#log("Syncing",module="debug")
	entries = {}

	for idx in range(len(SCROBBLES)):
		if not SCROBBLES[idx].saved:

			t = get_scrobble_dict(SCROBBLES[idx])

			artistlist = list(t["artists"])
			artistlist.sort() #we want the order of artists to be deterministic so when we update files with new rules a diff can see what has actually been changed
			artistss = "␟".join(artistlist)
			timestamp = datetime.date.fromtimestamp(t["time"])
			album = t["album"] or "-"
			duration = t["duration"] or "-"

			entry = [str(t["time"]),artistss,t["title"],album,duration]

			monthcode = str(timestamp.year) + "_" + str(timestamp.month)
			entries.setdefault(monthcode,[]).append(entry) #i feckin love the setdefault function

			SCROBBLES[idx] = Scrobble(*SCROBBLES[idx][:-1],True)
			# save copy with last tuple entry set to true

	#log("Sorted into months",module="debug")

	for e in entries:
		tsv.add_entries(data_dir['scrobbles'](e + ".tsv"),entries[e],comments=False)
		#addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)

	#log("Written files",module="debug")

	global lastsync
	lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
	#log("Database saved to disk.")

	# save cached images
	#saveCache()


@@ -7,6 +7,7 @@ from .__pkginfo__ import VERSION
# if DATA_DIRECTORY is specified, this is the directory to use for EVERYTHING, no matter what
# but with asymmetrical structure, cache and logs in subfolders
# otherwise, each directory is treated separately
@@ -311,24 +312,19 @@
### API KEYS
### symmetric keys are fine for now since we hopefully use HTTPS
apikeystore = KeyStore(file=data_dir['clients']("apikeys.yml"),save_endpoint="/apis/mlj_1/apikeys")
from . import upgrade
upgrade.upgrade_apikeys()
oldfile = pthj(dir_settings['config'],"clients","authenticated_machines.tsv")
if os.path.exists(oldfile):
	try:
		from doreah import tsv
		clients = tsv.parse(oldfile,"string","string")
		for key,identifier in clients:
			apikeystore[identifier] = key
		os.remove(oldfile)
	except:
		pass
# what the fuck did i just write

maloja/upgrade.py (new file, 60 lines)

@@ -0,0 +1,60 @@
# This module should take care of recognizing old install data and upgrading it before the actual server deals with it

import os
import re

from doreah.logging import log

from .globalconf import data_dir, dir_settings, apikeystore


def upgrade_apikeys():

	oldfile = os.path.join(dir_settings['config'],"clients","authenticated_machines.tsv")
	if os.path.exists(oldfile):
		try:
			from doreah import tsv
			clients = tsv.parse(oldfile,"string","string")
			for key,identifier in clients:
				apikeystore[identifier] = key
			os.remove(oldfile)
		except:
			pass
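For reference (example values invented): each line of the old authenticated_machines.tsv holds an API key and a client identifier, which the function above copies into the new keystore before deleting the file.

# --- illustrative example, not from this commit; values are invented ---
# old clients/authenticated_machines.tsv line (tab-separated: key, then identifier):
#
#     s3cr3tapikey	myclient
#
# after upgrade_apikeys() runs, this becomes apikeystore["myclient"] = "s3cr3tapikey"
# and the old file is removed.
# --- end example ---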
def upgrade_db(callback_add_scrobbles):

	oldfolder = os.path.join(dir_settings['state'],"scrobbles")
	if os.path.exists(oldfolder):
		scrobblefiles = os.listdir(oldfolder)
		for sf in scrobblefiles:
			if sf.endswith(".tsv"):
				log(f"Found old tsv scrobble file: {sf}")
				if re.match(r"[0-9]+_[0-9]+\.tsv",sf):
					origin = 'native'
				elif sf == "lastfmimport.tsv":
					origin = 'lastfm-import'
				else:
					origin = 'unknown'
				from doreah import tsv
				scrobbles = tsv.parse(os.path.join(oldfolder,sf),"int","string","string","string","string",comments=False)
				scrobblelist = []
				for scrobble in scrobbles:
					timestamp, artists, title, album, duration = scrobble
					if album in ('-',''): album = None
					if duration in ('-',''): duration = None
					scrobblelist.append({
						"time":int(timestamp),
						"track":{
							"artists":artists.split('␟'),
							"title":title,
							"album":{
								"name":album,
								"artists":None
							},
							"length":None
						},
						"duration":duration,
						"origin":origin
					})
				callback_add_scrobbles(scrobblelist)
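To make the mapping concrete (example values invented): one row of an old scrobble TSV and the dict it is turned into before being handed to the callback.

# --- illustrative example, not from this commit; values are invented ---
# old scrobbles/2021_12.tsv row (tab-separated; artists joined with the ␟ separator):
#
#     1640995200	Example Artist A␟Example Artist B	Example Title	Example Album	183
#
# becomes:
#
#     {
#         "time": 1640995200,
#         "track": {
#             "artists": ["Example Artist A", "Example Artist B"],
#             "title": "Example Title",
#             "album": {"name": "Example Album", "artists": None},
#             "length": None
#         },
#         "duration": "183",     # still a string here, the parser reads it as such
#         "origin": "native"     # because the filename matches the year_month pattern
#     }
# --- end example ---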