From 9eb8dc0b47695096bdad8bd42830f11cbc372f7b Mon Sep 17 00:00:00 2001 From: krateng Date: Mon, 3 Jan 2022 02:08:02 +0100 Subject: [PATCH] Initial work on SQLite --- maloja/database.py | 280 ++++++++++++++++++++++--------------------- maloja/globalconf.py | 22 ++-- maloja/upgrade.py | 60 ++++++++++ 3 files changed, 212 insertions(+), 150 deletions(-) create mode 100644 maloja/upgrade.py diff --git a/maloja/database.py b/maloja/database.py index 484d187..b7a07e7 100644 --- a/maloja/database.py +++ b/maloja/database.py @@ -23,6 +23,9 @@ except: pass import doreah +#db +import sqlalchemy as sql + # technical import os @@ -31,7 +34,7 @@ import sys import unicodedata from collections import namedtuple from threading import Lock -import yaml +import yaml, json import lru import math @@ -688,151 +691,154 @@ def get_predefined_rulesets(): ## Server operation #### +DB = {} -# Starts the server +engine = sql.create_engine(f"sqlite:///{data_dir['scrobbles']('malojadb.sqlite')}", echo = False) +meta = sql.MetaData() + +DB['scrobbles'] = sql.Table( + 'scrobbles', meta, + sql.Column('timestamp',sql.Integer,primary_key=True), + sql.Column('rawscrobble',sql.String), + sql.Column('origin',sql.String), + sql.Column('duration',sql.Integer), + sql.Column('track_id',sql.Integer) +) +DB['tracks'] = sql.Table( + 'tracks', meta, + sql.Column('id',sql.Integer,primary_key=True), + sql.Column('title',sql.String), + sql.Column('title_normalized',sql.String) +) +DB['artists'] = sql.Table( + 'artists', meta, + sql.Column('id',sql.Integer,primary_key=True), + sql.Column('name',sql.String), + sql.Column('name_normalized',sql.String) +) +DB['trackartists'] = sql.Table( + 'trackartists', meta, + sql.Column('id',sql.Integer,primary_key=True), + sql.Column('artist_id',sql.Integer), + sql.Column('track_id',sql.Integer) +) + +meta.create_all(engine) + + + + + + + +#### ATTENTION ALL ADVENTURERS +#### THIS IS WHAT A SCROBBLE DICT WILL LOOK LIKE FROM NOW ON +#### THIS IS THE SINGLE CANONICAL SOURCE OF TRUTH +#### STOP MAKING DIFFERENT LITTLE DICTS IN EVERY SINGLE FUNCTION +#### THIS IS THE SCHEMA THAT WILL DEFINITELY 100% STAY LIKE THIS AND NOT +#### RANDOMLY GET CHANGED TWO VERSIONS LATER +#### HERE WE GO +# +# { +# "time":int, +# "track":{ +# "artists":list, +# "title":string, +# "album":{ +# "name":string, +# "artists":list +# }, +# "length":None +# }, +# "duration":int, +# "origin":string +# } + +def add_scrobble(scrobbledict): + add_scrobbles([scrobbledict]) + +def add_scrobbles(scrobbleslist): + + ops = [ + DB['scrobbles'].insert().values( + rawscrobble=json.dumps(s), + timestamp=s['time'], + origin=s['origin'], + duration=s['duration'] or -1, + track_id=get_track_id(s['track']) + ) for s in scrobbleslist + ] + + with engine.begin() as conn: + for op in ops: + conn.execute(op) + + + +### DB interface functions - these will 'get' the ID of an entity, +### creating it if necessary + + +def get_track_id(trackdict): + ntitle = normalize_name(trackdict['title']) + artist_ids = [get_artist_id(a) for a in trackdict['artists']] + + + + with engine.begin() as conn: + op = DB['tracks'].select( + DB['tracks'].c.id + ).where( + DB['tracks'].c.title_normalized==ntitle + ) + result = conn.execute(op) + for row in result: + print("ID for",trackdict['title'],"was",row[0]) + return row[0] + + with engine.begin() as conn: + op = DB['tracks'].insert().values( + title=trackdict['title'], + title_normalized=ntitle + ) + result = conn.execute(op) + print("Created",trackdict['title'],result.inserted_primary_key) + return result.inserted_primary_key[0] + +def get_artist_id(artistname): + nname = normalize_name(artistname) + print("looking for",nname) + + with engine.begin() as conn: + op = DB['artists'].select( + DB['artists'].c.id + ).where( + DB['artists'].c.name_normalized==nname + ) + result = conn.execute(op) + for row in result: + print("ID for",artistname,"was",row[0]) + return row[0] + + with engine.begin() as conn: + op = DB['artists'].insert().values( + name=artistname, + name_normalized=nname + ) + result = conn.execute(op) + print("Created",artistname,result.inserted_primary_key) + return result.inserted_primary_key[0] + def start_db(): - log("Starting database...") - global lastsync - lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) - build_db() - #run(dbserver, host='::', port=PORT, server='waitress') - log("Database reachable!") - -def build_db(): - - global dbstatus - dbstatus['healthy'] = False - dbstatus['complete'] = False - dbstatus['rebuildinprogress'] = True - - log("Building database...") - - global SCROBBLES, ARTISTS, TRACKS - global TRACKS_NORMALIZED_SET, TRACKS_NORMALIZED, ARTISTS_NORMALIZED_SET, ARTISTS_NORMALIZED - global SCROBBLESDICT, STAMPS - - SCROBBLES = [] - ARTISTS = [] - TRACKS = [] - STAMPS = [] - SCROBBLESDICT = {} - - TRACKS_NORMALIZED = [] - ARTISTS_NORMALIZED = [] - ARTISTS_NORMALIZED_SET = set() - TRACKS_NORMALIZED_SET = set() - - - # parse files - db = tsv.parse_all(data_dir['scrobbles'](),"int","string","string",comments=False) - scrobblenum = len(db) - log(f"Found {scrobblenum} scrobbles...") - - usebar = not malojaconfig["CLEAN_OUTPUT"] - if usebar: pbar = ProgressBar(max=scrobblenum,prefix="Loading scrobbles") - else: - n = 0 - m = max(int(scrobblenum / 25),20) - #db = parseAllTSV("scrobbles","int","string","string",escape=False) - for sc in db: - artists = sc[1].split("␟") - title = sc[2] - time = sc[0] - - readScrobble(artists,title,time) - if usebar: pbar.progress() - else: - n += 1 - if n % m == 0: log(f"Loaded {n}/{scrobblenum}...") - - if usebar: pbar.done() - - - log("Database loaded, optimizing...") - - # optimize database - SCROBBLES.sort(key = lambda tup: tup[1]) - #SCROBBLESDICT = {obj[1]:obj for obj in SCROBBLES} - STAMPS = [t for t in SCROBBLESDICT] - STAMPS.sort() - - # inform malojatime module about earliest scrobble - if STAMPS: register_scrobbletime(STAMPS[0]) - - # NOT NEEDED BECAUSE WE DO THAT ON ADDING EVERY ARTIST ANYWAY - # get extra artists with no real scrobbles from countas rules - #for artist in coa.getAllArtists(): - #for artist in coa.getCreditedList(ARTISTS): - # if artist not in ARTISTS: - # log(artist + " is added to database because of countas rules",module="debug") - # ARTISTS.append(artist) - # coa.updateIDs(ARTISTS) - - dbstatus['healthy'] = True - - - #start regular tasks - utilities.update_medals() - utilities.update_weekly() - utilities.send_stats() - - - global ISSUES - ISSUES = check_issues() - - - dbstatus['complete'] = True - dbstatus['rebuildinprogress'] = False - - log("Database fully built!") + from . import upgrade + upgrade.upgrade_db(add_scrobbles) -# Saves all cached entries to disk -def sync(): - - # all entries by file collected - # so we don't open the same file for every entry - #log("Syncing",module="debug") - entries = {} - - for idx in range(len(SCROBBLES)): - if not SCROBBLES[idx].saved: - - t = get_scrobble_dict(SCROBBLES[idx]) - - artistlist = list(t["artists"]) - artistlist.sort() #we want the order of artists to be deterministic so when we update files with new rules a diff can see what has actually been changed - artistss = "␟".join(artistlist) - timestamp = datetime.date.fromtimestamp(t["time"]) - - album = t["album"] or "-" - duration = t["duration"] or "-" - - entry = [str(t["time"]),artistss,t["title"],album,duration] - - monthcode = str(timestamp.year) + "_" + str(timestamp.month) - entries.setdefault(monthcode,[]).append(entry) #i feckin love the setdefault function - - SCROBBLES[idx] = Scrobble(*SCROBBLES[idx][:-1],True) - # save copy with last tuple entry set to true - - #log("Sorted into months",module="debug") - - for e in entries: - tsv.add_entries(data_dir['scrobbles'](e + ".tsv"),entries[e],comments=False) - #addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False) - - #log("Written files",module="debug") - global lastsync - lastsync = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) - #log("Database saved to disk.") - # save cached images - #saveCache() diff --git a/maloja/globalconf.py b/maloja/globalconf.py index 0859ed9..fba96dc 100644 --- a/maloja/globalconf.py +++ b/maloja/globalconf.py @@ -7,6 +7,7 @@ from .__pkginfo__ import VERSION + # if DATA_DIRECTORY is specified, this is the directory to use for EVERYTHING, no matter what # but with asynnetrical structure, cache and logs in subfolders # otherwise, each directory is treated seperately @@ -311,24 +312,19 @@ config( - ### API KEYS +### symmetric keys are fine since we hopefully use HTTPS - -### symmetric keys are fine for now since we hopefully use HTTPS apikeystore = KeyStore(file=data_dir['clients']("apikeys.yml"),save_endpoint="/apis/mlj_1/apikeys") +from . import upgrade +upgrade.upgrade_apikeys() + + + + + -oldfile = pthj(dir_settings['config'],"clients","authenticated_machines.tsv") -if os.path.exists(oldfile): - try: - from doreah import tsv - clients = tsv.parse(oldfile,"string","string") - for key,identifier in clients: - apikeystore[identifier] = key - os.remove(oldfile) - except: - pass # what the fuck did i just write diff --git a/maloja/upgrade.py b/maloja/upgrade.py new file mode 100644 index 0000000..e892624 --- /dev/null +++ b/maloja/upgrade.py @@ -0,0 +1,60 @@ +# This module should take care of recognizing old install data and upgrading it before the actual server deals with it + +import os +import re + +from doreah.logging import log + +from .globalconf import data_dir, dir_settings, apikeystore + + +def upgrade_apikeys(): + + oldfile = os.path.join(dir_settings['config'],"clients","authenticated_machines.tsv") + if os.path.exists(oldfile): + try: + from doreah import tsv + clients = tsv.parse(oldfile,"string","string") + for key,identifier in clients: + apikeystore[identifier] = key + os.remove(oldfile) + except: + pass + + +def upgrade_db(callback_add_scrobbles): + oldfolder = os.path.join(dir_settings['state'],"scrobbles") + if os.path.exists(oldfolder): + scrobblefiles = os.listdir(oldfolder) + for sf in scrobblefiles: + if sf.endswith(".tsv"): + log(f"Found old tsv scrobble file: {sf}") + if re.match(r"[0-9]+_[0-9]+\.tsv",sf): + origin = 'native' + elif sf == "lastfmimport.tsv": + origin = 'lastfm-import' + else: + origin = 'unknown' + + from doreah import tsv + scrobbles = tsv.parse(os.path.join(oldfolder,sf),"int","string","string","string","string",comments=False) + scrobblelist = [] + for scrobble in scrobbles: + timestamp, artists, title, album, duration = scrobble + if album in ('-',''): album = None + if duration in ('-',''): duration = None + scrobblelist.append({ + "time":int(timestamp), + "track":{ + "artists":artists.split('␟'), + "title":title, + "album":{ + "name":album, + "artists":None + }, + "length":None + }, + "duration":duration, + "origin":origin + }) + callback_add_scrobbles(scrobblelist)