1
0
mirror of https://github.com/krateng/maloja.git synced 2023-08-10 21:12:55 +03:00

Moved tsv handling to doreah

This commit is contained in:
Krateng 2019-03-29 20:23:32 +01:00
parent 5765687f9d
commit 2f30157b04
3 changed files with 132 additions and 123 deletions

View File

@ -1,41 +1,42 @@
import re import re
import utilities import utilities
from doreah import tsv
# need to do this as a class so it can retain loaded settings from file # need to do this as a class so it can retain loaded settings from file
# apparently this is not true # apparently this is not true
# I'm dumb # I'm dumb
class CleanerAgent: class CleanerAgent:
def __init__(self): def __init__(self):
self.updateRules() self.updateRules()
def updateRules(self): def updateRules(self):
raw = utilities.parseAllTSV("rules","string","string","string") raw = tsv.parse_all("rules","string","string","string")
self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"] self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"] self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"} self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"}
self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"} self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"}
# we always need to be able to tell if our current database is made with the current rules # we always need to be able to tell if our current database is made with the current rules
self.checksums = utilities.checksumTSV("rules") self.checksums = utilities.checksumTSV("rules")
def fullclean(self,artist,title): def fullclean(self,artist,title):
artists = self.parseArtists(self.removespecial(artist)) artists = self.parseArtists(self.removespecial(artist))
title = self.parseTitle(self.removespecial(title)) title = self.parseTitle(self.removespecial(title))
(title,moreartists) = self.parseTitleForArtists(title) (title,moreartists) = self.parseTitleForArtists(title)
artists += moreartists artists += moreartists
artists = list(set(artists)) artists = list(set(artists))
artists.sort() artists.sort()
return (artists,title) return (artists,title)
def removespecial(self,s): def removespecial(self,s):
s = s.replace("\t","").replace("␟","").replace("\n","") s = s.replace("\t","").replace("␟","").replace("\n","")
s = re.sub(" +"," ",s) s = re.sub(" +"," ",s)
return s return s
# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that # if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
def confirmedReal(self,a): def confirmedReal(self,a):
@ -51,54 +52,54 @@ class CleanerAgent:
if a.strip() == "": if a.strip() == "":
return [] return []
if a.strip() in self.rules_notanartist: if a.strip() in self.rules_notanartist:
return [] return []
if " performing " in a.lower(): if " performing " in a.lower():
return self.parseArtists(re.split(" [Pp]erforming",a)[0]) return self.parseArtists(re.split(" [Pp]erforming",a)[0])
if a.strip() in self.rules_belongtogether: if a.strip() in self.rules_belongtogether:
return [a.strip()] return [a.strip()]
if a.strip() in self.rules_replaceartist: if a.strip() in self.rules_replaceartist:
return self.rules_replaceartist[a.strip()].split("␟") return self.rules_replaceartist[a.strip()].split("␟")
for d in self.delimiters_feat: for d in self.delimiters_feat:
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None: if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a)) return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
for d in self.delimiters_formal: for d in self.delimiters_formal:
if (d in a): if (d in a):
ls = [] ls = []
for i in a.split(d): for i in a.split(d):
ls += self.parseArtists(i) ls += self.parseArtists(i)
return ls return ls
for d in (self.delimiters_feat + self.delimiters): for d in (self.delimiters_feat + self.delimiters):
if ((" " + d + " ") in a): if ((" " + d + " ") in a):
ls = [] ls = []
for i in a.split(" " + d + " "): for i in a.split(" " + d + " "):
ls += self.parseArtists(i) ls += self.parseArtists(i)
return ls return ls
return [a.strip()] return [a.strip()]
def parseTitle(self,t): def parseTitle(self,t):
if t.strip() in self.rules_replacetitle: if t.strip() in self.rules_replacetitle:
return self.rules_replacetitle[t.strip()] return self.rules_replacetitle[t.strip()]
t = t.replace("[","(").replace("]",")") t = t.replace("[","(").replace("]",")")
t = re.sub(r" \(as made famous by .*?\)","",t) t = re.sub(r" \(as made famous by .*?\)","",t)
t = re.sub(r" \(originally by .*?\)","",t) t = re.sub(r" \(originally by .*?\)","",t)
t = re.sub(r" \(.*?Remaster.*?\)","",t) t = re.sub(r" \(.*?Remaster.*?\)","",t)
return t.strip() return t.strip()
def parseTitleForArtists(self,t): def parseTitleForArtists(self,t):
@ -115,30 +116,30 @@ class CleanerAgent:
(title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t)) (title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t))
artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t)) artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t))
return (title,artists) return (title,artists)
return (t,[])
#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc) return (t,[])
#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)
class CollectorAgent: class CollectorAgent:
def __init__(self): def __init__(self):
self.updateRules() self.updateRules()
def updateRules(self): def updateRules(self):
raw = utilities.parseAllTSV("rules","string","string","string") raw = tsv.parse_all("rules","string","string","string")
self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"} self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"}
self.rules_include = {} #Twice the memory, double the performance! (Yes, we're saving redundant information here, but it's not unelegant if it's within a closed object!) self.rules_include = {} #Twice the memory, double the performance! (Yes, we're saving redundant information here, but it's not unelegant if it's within a closed object!)
for a in self.rules_countas: for a in self.rules_countas:
self.rules_include[self.rules_countas[a]] = self.rules_include.setdefault(self.rules_countas[a],[]) + [a] self.rules_include[self.rules_countas[a]] = self.rules_include.setdefault(self.rules_countas[a],[]) + [a]
# this agent needs to be aware of the current id assignment in the main program. unelegant, but the best way i can think of # this agent needs to be aware of the current id assignment in the main program. unelegant, but the best way i can think of
def updateIDs(self,artistlist): def updateIDs(self,artistlist):
self.rules_countas_id = {artistlist.index(a):artistlist.index(self.rules_countas[a]) for a in self.rules_countas} self.rules_countas_id = {artistlist.index(a):artistlist.index(self.rules_countas[a]) for a in self.rules_countas}
#self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include} #self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include}
#this needs to take lists into account #this needs to take lists into account
def getCredited(self,artist): def getCredited(self,artist):
if artist in self.rules_countas_id: if artist in self.rules_countas_id:
return self.rules_countas_id[artist] return self.rules_countas_id[artist]
@ -146,36 +147,36 @@ class CollectorAgent:
return self.rules_countas[artist] return self.rules_countas[artist]
else: else:
return artist return artist
def getCreditedList(self,artists): def getCreditedList(self,artists):
updatedArtists = [] updatedArtists = []
for artist in artists: for artist in artists:
updatedArtists.append(self.getCredited(artist)) updatedArtists.append(self.getCredited(artist))
return list(set(updatedArtists)) return list(set(updatedArtists))
def getAllAssociated(self,artist): def getAllAssociated(self,artist):
return self.rules_include.get(artist,[]) return self.rules_include.get(artist,[])
# this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when # this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when
# countas rules are declared preemptively # countas rules are declared preemptively
def getAllArtists(self): def getAllArtists(self):
return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas])) return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas]))
def flatten(lis): def flatten(lis):
newlist = [] newlist = []
for l in lis: for l in lis:
if isinstance(l, str): if isinstance(l, str):
newlist.append(l) newlist.append(l)
else: else:
newlist = newlist + l newlist = newlist + l
return list(set(newlist)) return list(set(newlist))

View File

@ -7,6 +7,7 @@ import datetime
from cleanup import * from cleanup import *
from utilities import * from utilities import *
from doreah.logging import log from doreah.logging import log
from doreah import tsv
from malojatime import * from malojatime import *
import sys import sys
import unicodedata import unicodedata
@ -39,8 +40,10 @@ db_rulestate = False
### symmetric keys are fine for now since we hopefully use HTTPS ### symmetric keys are fine for now since we hopefully use HTTPS
def loadAPIkeys(): def loadAPIkeys():
global clients global clients
createTSV("clients/authenticated_machines.tsv") tsv.create("clients/authenticated_machines.tsv")
clients = parseTSV("clients/authenticated_machines.tsv","string","string") #createTSV("clients/authenticated_machines.tsv")
clients = tsv.parse("clients/authenticated_machines.tsv","string","string")
#clients = parseTSV("clients/authenticated_machines.tsv","string","string")
log("Authenticated Machines: " + ", ".join([m[1] for m in clients])) log("Authenticated Machines: " + ", ".join([m[1] for m in clients]))
def checkAPIkey(k): def checkAPIkey(k):
@ -550,7 +553,8 @@ def newrule():
keys = FormsDict.decode(request.forms) keys = FormsDict.decode(request.forms)
apikey = keys.pop("key",None) apikey = keys.pop("key",None)
if (checkAPIkey(apikey)): if (checkAPIkey(apikey)):
addEntry("rules/webmade.tsv",[k for k in keys]) tsv.add_entry("rules/webmade.tsv",[k for k in keys])
#addEntry("rules/webmade.tsv",[k for k in keys])
global db_rulestate global db_rulestate
db_rulestate = False db_rulestate = False
@ -742,7 +746,8 @@ def build_db():
# parse files # parse files
db = parseAllTSV("scrobbles","int","string","string",escape=False) db = tsv.parse_all("scrobbles","int","string","string",comments=False)
#db = parseAllTSV("scrobbles","int","string","string",escape=False)
for sc in db: for sc in db:
artists = sc[1].split("␟") artists = sc[1].split("␟")
title = sc[2] title = sc[2]
@ -803,7 +808,8 @@ def sync():
SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True) SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True)
for e in entries: for e in entries:
addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False) tsv.add_entries("scrobbles/" + e + ".tsv",entries[e],comments=False)
#addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
combineChecksums("scrobbles/" + e + ".tsv",cla.checksums) combineChecksums("scrobbles/" + e + ".tsv",cla.checksums)

View File

@ -6,48 +6,49 @@ import pickle
import urllib import urllib
import datetime import datetime
from doreah import settings from doreah import settings
from doreah.logging import log
### TSV files ### TSV files
def parseTSV(filename,*args,escape=True): #def parseTSV(filename,*args,escape=True):
f = open(filename) # f = open(filename)
#
# result = []
# for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
#
# l = l.replace("\n","")
# if escape:
# l = l.split("#")[0]
# l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
# data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
# entry = [] * len(args)
# for i in range(len(args)):
# if args[i]=="list":
# try:
# entry.append(data[i].split("␟"))
# except:
# entry.append([])
# elif args[i]=="string":
# try:
# entry.append(data[i])
# except:
# entry.append("")
# elif args[i]=="int":
# try:
# entry.append(int(data[i]))
# except:
# entry.append(0)
# elif args[i]=="bool":
# try:
# entry.append((data[i].lower() in ["true","yes","1","y"]))
# except:
# entry.append(False)
#
# result.append(entry)
result = [] # f.close()
for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]: # return result
l = l.replace("\n","")
if escape:
l = l.split("#")[0]
l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
entry = [] * len(args)
for i in range(len(args)):
if args[i]=="list":
try:
entry.append(data[i].split("␟"))
except:
entry.append([])
elif args[i]=="string":
try:
entry.append(data[i])
except:
entry.append("")
elif args[i]=="int":
try:
entry.append(int(data[i]))
except:
entry.append(0)
elif args[i]=="bool":
try:
entry.append((data[i].lower() in ["true","yes","1","y"]))
except:
entry.append(False)
result.append(entry)
f.close()
return result
def checksumTSV(folder): def checksumTSV(folder):
@ -110,40 +111,40 @@ def consistentRulestate(folder,checksums):
return True return True
def parseAllTSV(path,*args,escape=True): #def parseAllTSV(path,*args,escape=True):
#
#
# result = []
# for f in os.listdir(path + "/"):
#
# if (f.endswith(".tsv")):
#
# result += parseTSV(path + "/" + f,*args,escape=escape)
#
# return result
#def createTSV(filename):
#
# if not os.path.exists(filename):
# open(filename,"w").close()
result = [] #def addEntry(filename,a,escape=True):
for f in os.listdir(path + "/"): #
# createTSV(filename)
if (f.endswith(".tsv")): #
# line = "\t".join(a)
result += parseTSV(path + "/" + f,*args,escape=escape) # if escape: line = line.replace("#",r"\num")
# with open(filename,"a") as f:
return result # f.write(line + "\n")
def createTSV(filename):
if not os.path.exists(filename):
open(filename,"w").close()
def addEntry(filename,a,escape=True):
createTSV(filename)
line = "\t".join(a)
if escape: line = line.replace("#",r"\num")
with open(filename,"a") as f:
f.write(line + "\n")
def addEntries(filename,al,escape=True):
with open(filename,"a") as f:
for a in al:
line = "\t".join(a)
if escape: line = line.replace("#",r"\num")
f.write(line + "\n")
#def addEntries(filename,al,escape=True):
#
# with open(filename,"a") as f:
# for a in al:
# line = "\t".join(a)
# if escape: line = line.replace("#",r"\num")
# f.write(line + "\n")
#
### Useful functions ### Useful functions
@ -273,6 +274,7 @@ def cache_track(artists,title,result):
day = datetime.date.today().toordinal() day = datetime.date.today().toordinal()
cachedTracksDays[(frozenset(artists),title)] = day cachedTracksDays[(frozenset(artists),title)] = day
def cache_artist(artist,result): def cache_artist(artist,result):
if result is None: log("Caching None for " + artist,module="debug")
cachedArtists[artist] = result cachedArtists[artist] = result
day = datetime.date.today().toordinal() day = datetime.date.today().toordinal()
cachedArtistsDays[artist] = day cachedArtistsDays[artist] = day