From f5dced4f6ebb614313cd860e7bb287e6416af563 Mon Sep 17 00:00:00 2001 From: Krateng Date: Thu, 20 Dec 2018 17:23:16 +0100 Subject: [PATCH] Added web interface to audit library data --- cleanup.py | 4 ++ database.py | 104 +++++++++++++++++++++++++++++++++++++++++++- utilities.py | 12 +++++ website/issues.html | 45 +++++++++++++++++++ website/issues.py | 39 +++++++++++++++++ website/maloja.css | 9 ++++ 6 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 website/issues.html create mode 100644 website/issues.py diff --git a/cleanup.py b/cleanup.py index 7330529..1e71e05 100644 --- a/cleanup.py +++ b/cleanup.py @@ -27,6 +27,10 @@ class CleanerAgent: def removespecial(self,s): return s.replace("\t","").replace("␟","").replace("\n","") + # if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that + def confirmedReal(self,a): + confirmed = self.rules_belongtogether + [self.rules_replaceartist[r] for r in self.rules_replaceartist] + return (a in confirmed) delimiters_feat = ["ft.","ft","feat.","feat","featuring","Ft.","Ft","Feat.","Feat","Featuring"] #Delimiters used for extra artists, even when in the title field delimiters = ["vs.","vs","&"] #Delimiters in informal artist strings, spaces expected around them diff --git a/database.py b/database.py index 8034b3b..7f611b1 100644 --- a/database.py +++ b/database.py @@ -23,6 +23,9 @@ clients = [] lastsync = 0 +rulescheck = "" + + ### symmetric keys are fine for now since we hopefully use HTTPS def loadAPIkeys(): @@ -376,7 +379,103 @@ def post_scrobble(): def abouttoshutdown(): sync() #sys.exit() + +@dbserver.post("/newrule") +def newrule(): + keys = FormsDict.decode(request.forms) + addEntry("rules/webmade.tsv",[k for k in keys]) +@dbserver.route("/issues") +def issues(): + combined = [] + duplicates = [] + newartists = [] + + + import itertools + import difflib + + sortedartists = ARTISTS.copy() + sortedartists.sort(key=len,reverse=True) + reversesortedartists = sortedartists.copy() + reversesortedartists.reverse() + for a in reversesortedartists: + + nochange = cla.confirmedReal(a) + + st = a + lis = [] + reachedmyself = False + for ar in sortedartists: + if (ar != a) and not reachedmyself: + continue + elif not reachedmyself: + reachedmyself = True + continue + + if (ar.lower() == a.lower()) or ("the " + ar.lower() == a.lower()) or ("a " + ar.lower() == a.lower()): + duplicates.append((ar,a)) + break + + if (ar + " " in st) or (" " + ar in st): + lis.append(ar) + st = st.replace(ar,"").strip() + elif (ar == st): + lis.append(ar) + st = "" + if not nochange: + combined.append((a,lis)) + break + + elif (ar in st) and len(ar)*2 > len(st): + duplicates.append((a,ar)) + + st = st.replace("&","").replace("and","").replace("with","").strip() + if st != "" and st != a: + if len(st) < 5 and len(lis) == 1: + #check if we havent just randomly found the string in another word + #if (" " + st + " ") in lis[0] or (lis[0].endswith(" " + st)) or (lis[0].startswith(st + " ")): + duplicates.append((a,lis[0])) + elif len(st) < 5 and len(lis) > 1 and not nochange: + combined.append((a,lis)) + elif len(st) >= 5 and not nochange: + #check if we havent just randomly found the string in another word + if (" " + st + " ") in a or (a.endswith(" " + st)) or (a.startswith(st + " ")): + newartists.append((st,a,lis)) + + #for c in itertools.combinations(ARTISTS,3): + # l = list(c) + # print(l) + # l.sort(key=len,reverse=True) + # [full,a1,a2] = l + # if (a1 + " " + a2 in full) or (a2 + " " + a1 in full): + # combined.append((full,a1,a2)) + + + #for c in itertools.combinations(ARTISTS,2): + # if + # + # if (c[0].lower == c[1].lower): + # duplicates.append((c[0],c[1])) + + + # elif (c[0] + " " in c[1]) or (" " + c[0] in c[1]) or (c[1] + " " in c[0]) or (" " + c[1] in c[0]): + # if (c[0] in c[1]): + # full, part = c[1],c[0] + # rest = c[1].replace(c[0],"").strip() + # else: + # full, part = c[0],c[1] + # rest = c[0].replace(c[1],"").strip() + # if rest in ARTISTS and full not in [c[0] for c in combined]: + # combined.append((full,part,rest)) + + # elif (c[0] in c[1]) or (c[1] in c[0]): + # duplicates.append((c[0],c[1])) + + + return {"duplicates":duplicates,"combined":combined,"newartists":newartists} + + #### ## Server operation @@ -388,8 +487,6 @@ def abouttoshutdown(): def runserver(PORT): global lastsync lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) - #reload() - #buildh() build_db() coa.updateIDs(ARTISTS) @@ -400,6 +497,9 @@ def runserver(PORT): def build_db(): + global rulescheck + + global SCROBBLES SCROBBLESNEW = [] diff --git a/utilities.py b/utilities.py index 7bae2c3..92f84e2 100644 --- a/utilities.py +++ b/utilities.py @@ -54,6 +54,15 @@ def createTSV(filename): if not os.path.exists(filename): open(filename,"w").close() + +def addEntry(filename,args): + + createTSV(filename) + + line = "\t".join(args) + with open(filename,"a") as f: + f.write(line + "\n") + ### Logging @@ -148,6 +157,9 @@ def cacheImage(url,path,filename): target = path + "/" + filename + "." + response.info().get_content_subtype() urllib.request.urlretrieve(url,target) +def artistLink(name): + import urllib + return "" + name + "" def getTimeDesc(timestamp): diff --git a/website/issues.html b/website/issues.html new file mode 100644 index 0000000..268e8d3 --- /dev/null +++ b/website/issues.html @@ -0,0 +1,45 @@ + + + + + + Maloja - Issues + + + + + + + + + +
+

Possible Issues


+ with your library +

KEY_ISSUES Issues

+ +

Maloja can identify possible problems with consistency or redundancy in your library.

+
+ + KEY_ISSUESLIST + + + + + diff --git a/website/issues.py b/website/issues.py new file mode 100644 index 0000000..655b2dd --- /dev/null +++ b/website/issues.py @@ -0,0 +1,39 @@ +import urllib +import json +from utilities import artistLink + +def replacedict(keys,dbport): + + response = urllib.request.urlopen("http://localhost:" + str(dbport) + "/issues") + db_data = json.loads(response.read()) + i = 0 + + html = "" + for d in db_data["duplicates"]: + html += "" + html += "" + html += """""" + html += """""" + html += "" + i += 1 + for c in db_data["combined"]: + html += "" + html += "" + html += """""" + html += "" + i += 1 + for n in db_data["newartists"]: + html += "" + html += "" + html += """""" + html += "" + i += 1 + + html += "
'" + artistLink(d[0]) + "'" + html += " is a possible duplicate of " + html += "'" + artistLink(d[1]) + "'""" + d[1] + """ is correct""" + d[0] + """ is correct
'" + artistLink(c[0]) + "' sounds like the combination of " + str(len(c[1])) + " artists: " + for a in c[1]: + html += "'" + artistLink(a) + "' " + html += "Fix it
Is '" + n[0] + "' in '" + artistLink(n[1]) + "' an artist?Yes
" + + return {"KEY_ISSUESLIST":html,"KEY_ISSUES":str(i)} diff --git a/website/maloja.css b/website/maloja.css index 6c9b939..8d9ea42 100644 --- a/website/maloja.css +++ b/website/maloja.css @@ -55,3 +55,12 @@ table td.time { table td.artists,td.artist,td.title,td.amount { width:300px; } + +table td.button { + width:150px; + background-color:yellow; + color:#333337; + padding:1px; + border-radius:4px; + cursor:pointer; +}