diff --git a/cleanup.py b/cleanup.py
index 7330529..1e71e05 100644
--- a/cleanup.py
+++ b/cleanup.py
@@ -27,6 +27,14 @@ class CleanerAgent:
     def removespecial(self,s):
         return s.replace("\t","").replace("␟","").replace("\n","")
 
+    def confirmedReal(self,a):
+        """Return whether artist name a appears in any created rule.
+
+        An artist that appears in a rule is assumed to be meant to exist and
+        to be spelled exactly like that.
+        """
+        confirmed = self.rules_belongtogether + list(self.rules_replaceartist.values())
+        return a in confirmed
 
     delimiters_feat = ["ft.","ft","feat.","feat","featuring","Ft.","Ft","Feat.","Feat","Featuring"] #Delimiters used for extra artists, even when in the title field
     delimiters = ["vs.","vs","&"] #Delimiters in informal artist strings, spaces expected around them
diff --git a/database.py b/database.py
index 8034b3b..7f611b1 100644
--- a/database.py
+++ b/database.py
@@ -23,6 +23,9 @@ clients = []
 
 lastsync = 0
 
+rulescheck = ""
+
+
 ### symmetric keys are fine for now since we hopefully use HTTPS
 def loadAPIkeys():
 
@@ -376,7 +379,77 @@ def post_scrobble():
 def abouttoshutdown():
     sync()
     #sys.exit()
+
+@dbserver.post("/newrule")
+def newrule():
+    """Append the submitted form keys as one line to rules/webmade.tsv."""
+    keys = FormsDict.decode(request.forms)
+    addEntry("rules/webmade.tsv",list(keys))
 
+@dbserver.route("/issues")
+def issues():
+    """Report likely consistency problems among the known artists.
+
+    Every artist name is compared with the entries that follow it in the
+    length-sorted artist list (names of equal or shorter length). Returns a
+    dict with three lists:
+
+    "duplicates" -- pairs of names that are probably the same artist
+    "combined"   -- (name, parts): name looks like several known artists
+    "newartists" -- (rest, name, parts): the unmatched rest of name might
+                    be an artist that is not known yet
+    """
+    combined = []
+    duplicates = []
+    newartists = []
+
+    # longest names first, so longer known artists are cut out of a name
+    # before the shorter ones they may contain
+    sortedartists = sorted(ARTISTS,key=len,reverse=True)
+
+    for a in reversed(sortedartists):
+
+        # artists named in a rule are never reported as combined/newartists
+        nochange = cla.confirmedReal(a)
+
+        st = a      # part of the name not yet matched to another artist
+        lis = []    # known artists found inside the name
+
+        # only look at the entries listed after a itself
+        for ar in sortedartists[sortedartists.index(a) + 1:]:
+
+            if a.lower() in (ar.lower(),"the " + ar.lower(),"a " + ar.lower()):
+                duplicates.append((ar,a))
+                break
+
+            if (ar + " " in st) or (" " + ar in st):
+                lis.append(ar)
+                st = st.replace(ar,"").strip()
+            elif ar == st:
+                # what is left of the name is exactly another known artist
+                lis.append(ar)
+                st = ""
+                if not nochange:
+                    combined.append((a,lis))
+                break
+            elif (ar in st) and len(ar)*2 > len(st):
+                # ar makes up more than half of what is left of the name
+                duplicates.append((a,ar))
+
+        st = st.replace("&","").replace("and","").replace("with","").strip()
+        if st != "" and st != a:
+            if len(st) < 5 and len(lis) == 1:
+                duplicates.append((a,lis[0]))
+            elif len(st) < 5 and len(lis) > 1 and not nochange:
+                combined.append((a,lis))
+            elif len(st) >= 5 and not nochange:
+                # only accept the rest if it is a separate word of the name
+                if (" " + st + " ") in a or a.endswith(" " + st) or a.startswith(st + " "):
+                    newartists.append((st,a,lis))
+
+    return {"duplicates":duplicates,"combined":combined,"newartists":newartists}
+
+
 
 ####
 ## Server operation
@@ -388,8 +461,6 @@ def abouttoshutdown():
 def runserver(PORT):
     global lastsync
     lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
-    #reload()
-    #buildh()
     build_db()
     coa.updateIDs(ARTISTS)
 
@@ -400,6 +471,9 @@ def runserver(PORT):
 
 
 def build_db():
+    global rulescheck
+
+
     global SCROBBLES
 
     SCROBBLESNEW = []
diff --git a/utilities.py b/utilities.py
index 7bae2c3..92f84e2 100644
--- a/utilities.py
+++ b/utilities.py
@@ -54,6 +54,18 @@ def createTSV(filename):
     if not os.path.exists(filename):
         open(filename,"w").close()
 
+
+def addEntry(filename,args):
+    """Append args as one tab-separated line to the TSV file filename.
+
+    The file is created first if it does not exist yet.
+    """
+    createTSV(filename)
+
+    line = "\t".join(args)
+    with open(filename,"a") as f:
+        f.write(line + "\n")
+
 
 ### Logging
 
@@ -148,6 +160,12 @@ def cacheImage(url,path,filename):
     target = path + "/" + filename + "." + response.info().get_content_subtype()
     urllib.request.urlretrieve(url,target)
 
+def artistLink(name):
+    # NOTE(review): urllib is imported but unused and both string literals are
+    # empty -- the link markup was presumably lost here; confirm against the
+    # original file before relying on this
+    import urllib
+    return "" + name + ""
 
 def getTimeDesc(timestamp):
 
diff --git a/website/issues.html b/website/issues.html
new file mode 100644
index 0000000..268e8d3
--- /dev/null
+++ b/website/issues.html
@@ -0,0 +1,45 @@
+
+
+
+
+ +
+ Possible Issues+ with your library + + + Maloja can identify possible problems with consistency or redundancy in your library. + |
+
'" + artistLink(d[0]) + "'" + html += " is a possible duplicate of " + html += "'" + artistLink(d[1]) + "' | " + html += """""" + d[1] + """ is correct | """ + html += """""" + d[0] + """ is correct | """ + html += "
'" + artistLink(c[0]) + "' sounds like the combination of " + str(len(c[1])) + " artists: " + for a in c[1]: + html += "'" + artistLink(a) + "' " + html += " | " + html += """Fix it | """ + html += "|
Is '" + n[0] + "' in '" + artistLink(n[1]) + "' an artist? | " + html += """Yes | """ + html += "