mirror of
https://github.com/krateng/maloja.git
synced 2023-08-10 21:12:55 +03:00
Moved tsv handling to doreah
This commit is contained in:
parent
5765687f9d
commit
2f30157b04
101
cleanup.py
101
cleanup.py
@ -1,41 +1,42 @@
|
|||||||
import re
|
import re
|
||||||
import utilities
|
import utilities
|
||||||
|
from doreah import tsv
|
||||||
|
|
||||||
# need to do this as a class so it can retain loaded settings from file
|
# need to do this as a class so it can retain loaded settings from file
|
||||||
# apparently this is not true
|
# apparently this is not true
|
||||||
# I'm dumb
|
# I'm dumb
|
||||||
class CleanerAgent:
|
class CleanerAgent:
|
||||||
|
|
||||||
def __init__(self):
    """Create the agent and load its rules right away via updateRules()."""
    self.updateRules()
|
|
||||||
def updateRules(self):
    """(Re)load the cleaning rules from the "rules" folder.

    Each parsed row is treated as a (category, first value, second value)
    triple; rows are sorted into one attribute per rule category. Also
    refreshes self.checksums for the same folder.
    """
    rows = tsv.parse_all("rules","string","string","string")

    def values_for(category):
        # single-valued rules: only the first value column matters
        return [value for (kind, value, _unused) in rows if kind == category]

    def mapping_for(category):
        # replacement rules: first value column -> second value column
        return {old: new for (kind, old, new) in rows if kind == category}

    self.rules_belongtogether = values_for("belongtogether")
    self.rules_notanartist = values_for("notanartist")
    self.rules_replacetitle = mapping_for("replacetitle")
    self.rules_replaceartist = mapping_for("replaceartist")

    # we always need to be able to tell if our current database is made with the current rules
    self.checksums = utilities.checksumTSV("rules")
|
|
||||||
|
|
||||||
|
|
||||||
def fullclean(self,artist,title):
    """Clean a raw artist string and a raw title string.

    Both inputs first go through removespecial(). The artist string is
    split by parseArtists(), the title is normalised by parseTitle(), and
    parseTitleForArtists() may strip further artists out of the title.

    Returns a tuple (artists, title) where artists is a sorted list
    without duplicates.
    """
    artists = self.parseArtists(self.removespecial(artist))
    cleaned = self.parseTitle(self.removespecial(title))
    title, extra = self.parseTitleForArtists(cleaned)
    artists += extra

    # a set removes duplicates, sorting gives a stable order
    return (sorted(set(artists)), title)
|
|
||||||
def removespecial(self,s):
    """Strip tabs, the "␟" separator and newlines from s, then collapse
    every run of spaces into a single space."""
    for unwanted in ("\t","␟","\n"):
        s = s.replace(unwanted,"")
    return re.sub(" +"," ",s)
|
|
||||||
|
|
||||||
# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
|
# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
|
||||||
def confirmedReal(self,a):
|
def confirmedReal(self,a):
|
||||||
@ -51,54 +52,54 @@ class CleanerAgent:
|
|||||||
|
|
||||||
if a.strip() == "":
|
if a.strip() == "":
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if a.strip() in self.rules_notanartist:
|
if a.strip() in self.rules_notanartist:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if " performing " in a.lower():
|
if " performing " in a.lower():
|
||||||
return self.parseArtists(re.split(" [Pp]erforming",a)[0])
|
return self.parseArtists(re.split(" [Pp]erforming",a)[0])
|
||||||
|
|
||||||
if a.strip() in self.rules_belongtogether:
|
if a.strip() in self.rules_belongtogether:
|
||||||
return [a.strip()]
|
return [a.strip()]
|
||||||
if a.strip() in self.rules_replaceartist:
|
if a.strip() in self.rules_replaceartist:
|
||||||
return self.rules_replaceartist[a.strip()].split("␟")
|
return self.rules_replaceartist[a.strip()].split("␟")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for d in self.delimiters_feat:
|
for d in self.delimiters_feat:
|
||||||
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
|
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
|
||||||
return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
|
return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
|
||||||
|
|
||||||
for d in self.delimiters_formal:
|
for d in self.delimiters_formal:
|
||||||
if (d in a):
|
if (d in a):
|
||||||
ls = []
|
ls = []
|
||||||
for i in a.split(d):
|
for i in a.split(d):
|
||||||
ls += self.parseArtists(i)
|
ls += self.parseArtists(i)
|
||||||
return ls
|
return ls
|
||||||
|
|
||||||
for d in (self.delimiters_feat + self.delimiters):
|
for d in (self.delimiters_feat + self.delimiters):
|
||||||
if ((" " + d + " ") in a):
|
if ((" " + d + " ") in a):
|
||||||
ls = []
|
ls = []
|
||||||
for i in a.split(" " + d + " "):
|
for i in a.split(" " + d + " "):
|
||||||
ls += self.parseArtists(i)
|
ls += self.parseArtists(i)
|
||||||
return ls
|
return ls
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return [a.strip()]
|
return [a.strip()]
|
||||||
|
|
||||||
def parseTitle(self,t):
    """Normalise a track title.

    If the stripped title has an entry in self.rules_replacetitle, that
    replacement is returned as-is. Otherwise square brackets are turned
    into parentheses and the parentheticals "(as made famous by ...)",
    "(originally by ...)" and any parenthetical containing "Remaster"
    are removed. The result is stripped of surrounding whitespace.
    """
    key = t.strip()
    if key in self.rules_replacetitle:
        return self.rules_replacetitle[key]

    t = t.replace("[","(").replace("]",")")

    t = re.sub(r" \(as made famous by .*?\)","",t)
    t = re.sub(r" \(originally by .*?\)","",t)
    # [^()]* keeps the match inside ONE parenthetical; with .*? the match
    # could begin at an earlier "(" and swallow e.g. "(feat. X)" as well
    t = re.sub(r" \([^()]*Remaster[^()]*\)","",t)

    return t.strip()
|
|
||||||
def parseTitleForArtists(self,t):
|
def parseTitleForArtists(self,t):
|
||||||
@ -115,30 +116,30 @@ class CleanerAgent:
|
|||||||
(title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t))
|
(title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t))
|
||||||
artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t))
|
artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t))
|
||||||
return (title,artists)
|
return (title,artists)
|
||||||
|
|
||||||
return (t,[])
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)
|
return (t,[])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)
|
||||||
class CollectorAgent:
|
class CollectorAgent:
|
||||||
|
|
||||||
def __init__(self):
    """Create the agent and load its rules right away via updateRules()."""
    self.updateRules()
|
|
||||||
def updateRules(self):
    """(Re)load the "countas" rules from the "rules" folder.

    Sets self.rules_countas (artist -> artist it is counted as) and
    self.rules_include, the reverse view (credited artist -> list of
    artists counted as it).
    """
    rows = tsv.parse_all("rules","string","string","string")
    self.rules_countas = {alias: credited for (kind, alias, credited) in rows if kind == "countas"}

    # Redundant with rules_countas, kept so the reverse lookup needs no scan
    # (twice the memory, double the performance).
    reverse = {}
    for alias, credited in self.rules_countas.items():
        reverse.setdefault(credited, []).append(alias)
    self.rules_include = reverse
|
|
||||||
# this agent needs to be aware of the current id assignment in the main program. inelegant, but the best way I can think of
|
# this agent needs to be aware of the current id assignment in the main program. inelegant, but the best way I can think of
|
||||||
def updateIDs(self,artistlist):
    """Translate the name-based countas rules into position-based ones.

    For every rule in self.rules_countas, both artist names are looked up
    with artistlist.index() and the resulting pair of positions is stored
    in self.rules_countas_id. Raises ValueError (from list.index) if a
    name is not in artistlist.
    """
    id_rules = {}
    for alias, credited in self.rules_countas.items():
        alias_id = artistlist.index(alias)
        credited_id = artistlist.index(credited)
        id_rules[alias_id] = credited_id
    self.rules_countas_id = id_rules
    # No id-based counterpart of rules_include is built yet: its values are
    # lists of names, which a plain index() lookup cannot translate.
|
|
||||||
def getCredited(self,artist):
|
def getCredited(self,artist):
|
||||||
if artist in self.rules_countas_id:
|
if artist in self.rules_countas_id:
|
||||||
return self.rules_countas_id[artist]
|
return self.rules_countas_id[artist]
|
||||||
@ -146,36 +147,36 @@ class CollectorAgent:
|
|||||||
return self.rules_countas[artist]
|
return self.rules_countas[artist]
|
||||||
else:
|
else:
|
||||||
return artist
|
return artist
|
||||||
|
|
||||||
|
|
||||||
def getCreditedList(self,artists):
    """Map every artist through getCredited() and return the results as a
    list without duplicates (order is whatever the set yields)."""
    credited = [self.getCredited(artist) for artist in artists]
    return list(set(credited))
|
|
||||||
def getAllAssociated(self,artist):
    """Return the list stored in self.rules_include for artist, or an
    empty list when there is no entry for it."""
    try:
        return self.rules_include[artist]
    except KeyError:
        return []
|
|
||||||
# this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when
|
# this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when
|
||||||
# countas rules are declared preemptively
|
# countas rules are declared preemptively
|
||||||
def getAllArtists(self):
    """Return every artist named in a countas rule, on either side of the
    rule, as a list without duplicates."""
    aliases = list(self.rules_countas)
    credited = [self.rules_countas[alias] for alias in self.rules_countas]
    return list(set(aliases + credited))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def flatten(lis):
    """Flatten one level of nesting and drop duplicates.

    Each element of lis is either a string, which is kept as a single
    item, or an iterable of items, whose members are added individually.
    Returns a list of the unique items; the order is not defined because
    the items pass through a set.
    """
    newlist = []

    for item in lis:
        if isinstance(item, str):
            # a string is one entry, not an iterable of characters
            newlist.append(item)
        else:
            # extend() grows the list in place (no re-copy per iteration)
            # and accepts tuples/sets/generators, not only lists
            newlist.extend(item)

    return list(set(newlist))
|
16
database.py
16
database.py
@ -7,6 +7,7 @@ import datetime
|
|||||||
from cleanup import *
|
from cleanup import *
|
||||||
from utilities import *
|
from utilities import *
|
||||||
from doreah.logging import log
|
from doreah.logging import log
|
||||||
|
from doreah import tsv
|
||||||
from malojatime import *
|
from malojatime import *
|
||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
@ -39,8 +40,10 @@ db_rulestate = False
|
|||||||
### symmetric keys are fine for now since we hopefully use HTTPS
|
### symmetric keys are fine for now since we hopefully use HTTPS
|
||||||
def loadAPIkeys():
    """Load the authenticated client entries into the global `clients`.

    Calls tsv.create() on the key file before parsing it (presumably so
    that a missing file is created first; verify against doreah), then
    logs the second column of every entry.
    """
    global clients
    keyfile = "clients/authenticated_machines.tsv"
    tsv.create(keyfile)
    clients = tsv.parse(keyfile,"string","string")
    machine_names = [entry[1] for entry in clients]
    log("Authenticated Machines: " + ", ".join(machine_names))
|
|
||||||
def checkAPIkey(k):
|
def checkAPIkey(k):
|
||||||
@ -550,7 +553,8 @@ def newrule():
|
|||||||
keys = FormsDict.decode(request.forms)
|
keys = FormsDict.decode(request.forms)
|
||||||
apikey = keys.pop("key",None)
|
apikey = keys.pop("key",None)
|
||||||
if (checkAPIkey(apikey)):
|
if (checkAPIkey(apikey)):
|
||||||
addEntry("rules/webmade.tsv",[k for k in keys])
|
tsv.add_entry("rules/webmade.tsv",[k for k in keys])
|
||||||
|
#addEntry("rules/webmade.tsv",[k for k in keys])
|
||||||
global db_rulestate
|
global db_rulestate
|
||||||
db_rulestate = False
|
db_rulestate = False
|
||||||
|
|
||||||
@ -742,7 +746,8 @@ def build_db():
|
|||||||
|
|
||||||
|
|
||||||
# parse files
|
# parse files
|
||||||
db = parseAllTSV("scrobbles","int","string","string",escape=False)
|
db = tsv.parse_all("scrobbles","int","string","string",comments=False)
|
||||||
|
#db = parseAllTSV("scrobbles","int","string","string",escape=False)
|
||||||
for sc in db:
|
for sc in db:
|
||||||
artists = sc[1].split("␟")
|
artists = sc[1].split("␟")
|
||||||
title = sc[2]
|
title = sc[2]
|
||||||
@ -803,7 +808,8 @@ def sync():
|
|||||||
SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True)
|
SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True)
|
||||||
|
|
||||||
for e in entries:
|
for e in entries:
|
||||||
addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
|
tsv.add_entries("scrobbles/" + e + ".tsv",entries[e],comments=False)
|
||||||
|
#addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
|
||||||
combineChecksums("scrobbles/" + e + ".tsv",cla.checksums)
|
combineChecksums("scrobbles/" + e + ".tsv",cla.checksums)
|
||||||
|
|
||||||
|
|
||||||
|
138
utilities.py
138
utilities.py
@ -6,48 +6,49 @@ import pickle
|
|||||||
import urllib
|
import urllib
|
||||||
import datetime
|
import datetime
|
||||||
from doreah import settings
|
from doreah import settings
|
||||||
|
from doreah.logging import log
|
||||||
|
|
||||||
|
|
||||||
### TSV files
|
### TSV files
|
||||||
|
|
||||||
def parseTSV(filename,*args,escape=True):
|
#def parseTSV(filename,*args,escape=True):
|
||||||
f = open(filename)
|
# f = open(filename)
|
||||||
|
#
|
||||||
|
# result = []
|
||||||
|
# for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
|
||||||
|
#
|
||||||
|
# l = l.replace("\n","")
|
||||||
|
# if escape:
|
||||||
|
# l = l.split("#")[0]
|
||||||
|
# l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
|
||||||
|
# data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
|
||||||
|
# entry = [] * len(args)
|
||||||
|
# for i in range(len(args)):
|
||||||
|
# if args[i]=="list":
|
||||||
|
# try:
|
||||||
|
# entry.append(data[i].split("␟"))
|
||||||
|
# except:
|
||||||
|
# entry.append([])
|
||||||
|
# elif args[i]=="string":
|
||||||
|
# try:
|
||||||
|
# entry.append(data[i])
|
||||||
|
# except:
|
||||||
|
# entry.append("")
|
||||||
|
# elif args[i]=="int":
|
||||||
|
# try:
|
||||||
|
# entry.append(int(data[i]))
|
||||||
|
# except:
|
||||||
|
# entry.append(0)
|
||||||
|
# elif args[i]=="bool":
|
||||||
|
# try:
|
||||||
|
# entry.append((data[i].lower() in ["true","yes","1","y"]))
|
||||||
|
# except:
|
||||||
|
# entry.append(False)
|
||||||
|
#
|
||||||
|
# result.append(entry)
|
||||||
|
|
||||||
result = []
|
# f.close()
|
||||||
for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
|
# return result
|
||||||
|
|
||||||
l = l.replace("\n","")
|
|
||||||
if escape:
|
|
||||||
l = l.split("#")[0]
|
|
||||||
l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
|
|
||||||
data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
|
|
||||||
entry = [] * len(args)
|
|
||||||
for i in range(len(args)):
|
|
||||||
if args[i]=="list":
|
|
||||||
try:
|
|
||||||
entry.append(data[i].split("␟"))
|
|
||||||
except:
|
|
||||||
entry.append([])
|
|
||||||
elif args[i]=="string":
|
|
||||||
try:
|
|
||||||
entry.append(data[i])
|
|
||||||
except:
|
|
||||||
entry.append("")
|
|
||||||
elif args[i]=="int":
|
|
||||||
try:
|
|
||||||
entry.append(int(data[i]))
|
|
||||||
except:
|
|
||||||
entry.append(0)
|
|
||||||
elif args[i]=="bool":
|
|
||||||
try:
|
|
||||||
entry.append((data[i].lower() in ["true","yes","1","y"]))
|
|
||||||
except:
|
|
||||||
entry.append(False)
|
|
||||||
|
|
||||||
result.append(entry)
|
|
||||||
|
|
||||||
f.close()
|
|
||||||
return result
|
|
||||||
|
|
||||||
def checksumTSV(folder):
|
def checksumTSV(folder):
|
||||||
|
|
||||||
@ -110,40 +111,40 @@ def consistentRulestate(folder,checksums):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def parseAllTSV(path,*args,escape=True):
|
#def parseAllTSV(path,*args,escape=True):
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# result = []
|
||||||
|
# for f in os.listdir(path + "/"):
|
||||||
|
#
|
||||||
|
# if (f.endswith(".tsv")):
|
||||||
|
#
|
||||||
|
# result += parseTSV(path + "/" + f,*args,escape=escape)
|
||||||
|
#
|
||||||
|
# return result
|
||||||
|
|
||||||
|
#def createTSV(filename):
|
||||||
|
#
|
||||||
|
# if not os.path.exists(filename):
|
||||||
|
# open(filename,"w").close()
|
||||||
|
|
||||||
result = []
|
#def addEntry(filename,a,escape=True):
|
||||||
for f in os.listdir(path + "/"):
|
#
|
||||||
|
# createTSV(filename)
|
||||||
if (f.endswith(".tsv")):
|
#
|
||||||
|
# line = "\t".join(a)
|
||||||
result += parseTSV(path + "/" + f,*args,escape=escape)
|
# if escape: line = line.replace("#",r"\num")
|
||||||
|
# with open(filename,"a") as f:
|
||||||
return result
|
# f.write(line + "\n")
|
||||||
|
|
||||||
def createTSV(filename):
|
|
||||||
|
|
||||||
if not os.path.exists(filename):
|
|
||||||
open(filename,"w").close()
|
|
||||||
|
|
||||||
def addEntry(filename,a,escape=True):
|
|
||||||
|
|
||||||
createTSV(filename)
|
|
||||||
|
|
||||||
line = "\t".join(a)
|
|
||||||
if escape: line = line.replace("#",r"\num")
|
|
||||||
with open(filename,"a") as f:
|
|
||||||
f.write(line + "\n")
|
|
||||||
|
|
||||||
def addEntries(filename,al,escape=True):
|
|
||||||
|
|
||||||
with open(filename,"a") as f:
|
|
||||||
for a in al:
|
|
||||||
line = "\t".join(a)
|
|
||||||
if escape: line = line.replace("#",r"\num")
|
|
||||||
f.write(line + "\n")
|
|
||||||
|
|
||||||
|
#def addEntries(filename,al,escape=True):
|
||||||
|
#
|
||||||
|
# with open(filename,"a") as f:
|
||||||
|
# for a in al:
|
||||||
|
# line = "\t".join(a)
|
||||||
|
# if escape: line = line.replace("#",r"\num")
|
||||||
|
# f.write(line + "\n")
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
### Useful functions
|
### Useful functions
|
||||||
@ -273,6 +274,7 @@ def cache_track(artists,title,result):
|
|||||||
day = datetime.date.today().toordinal()
|
day = datetime.date.today().toordinal()
|
||||||
cachedTracksDays[(frozenset(artists),title)] = day
|
cachedTracksDays[(frozenset(artists),title)] = day
|
||||||
def cache_artist(artist,result):
|
def cache_artist(artist,result):
|
||||||
|
if result is None: log("Caching None for " + artist,module="debug")
|
||||||
cachedArtists[artist] = result
|
cachedArtists[artist] = result
|
||||||
day = datetime.date.today().toordinal()
|
day = datetime.date.today().toordinal()
|
||||||
cachedArtistsDays[artist] = day
|
cachedArtistsDays[artist] = day
|
||||||
|
Loading…
x
Reference in New Issue
Block a user