From 144198f9336097a6c8fec1f9b4a30d9584972f08 Mon Sep 17 00:00:00 2001 From: Krateng Date: Wed, 28 Nov 2018 15:33:30 +0100 Subject: [PATCH] Improved artist / title parsing --- cleanup.py | 186 +++++-------------- database.py | 6 +- lastfmconverter.py | 6 +- rules/.gitignore | 4 +- rules/{examplerules.csv => examplerules.tsv} | 10 +- 5 files changed, 58 insertions(+), 154 deletions(-) rename rules/{examplerules.csv => examplerules.tsv} (86%) diff --git a/cleanup.py b/cleanup.py index 8202806..a7ea8c3 100644 --- a/cleanup.py +++ b/cleanup.py @@ -1,161 +1,65 @@ import re def fullclean(artist,title): - artists = cleanup(removespecial(artist)) - title = cleantitle(removespecial(title)) - (title,moreartists) = findartistsintitle(title) + artists = parseArtists(removespecial(artist)) + title = parseTitle(removespecial(title)) + (title,moreartists) = parseTitleForArtists(title) artists += moreartists - return (artists,title) + return (list(set(artists)),title) def removespecial(s): return s.replace("\t","").replace("␟","").replace("\n","") -def cleanup(artiststr): - if artiststr == "": +delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field +delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them +delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used + + +def parseArtists(a): + + if a.strip() == "": return [] + + for d in delimiters_feat: + if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None: + return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a)) + + for d in (delimiters + delimiters_feat): + if ((" " + d + " ") in a): + ls = [] + for i in a.split(" " + d + " "): + ls += parseArtists(i) + return ls + + for d in delimiters_formal: + if (d in a): + ls = [] + for i in a.split(d): + ls += parseArtists(i) + return ls + + + return [a.strip()] - artists = [artiststr] +def parseTitle(t): + t = t.replace("[","(").replace("]",")") - artistsnew = [] - for a in artists: - artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\1",a)) - artistsnew.append(re.sub(r"(.*) \(ft. (.*)\)",r"\2",a)) + t = re.sub(r" \(as made famous by .*?\)","",t) + t = re.sub(r" \(originally by .*?\)","",t) - artists = artistsnew - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(" vs. ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(" vs ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(" & ")) - - artists = flatten(artistsnew) - artistsnew = [] - - - for a in artists: - artistsnew.append(a.split(" ft. ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(" Ft. ")) - - artists = flatten(artistsnew) - artistsnew = [] - - - for a in artists: - artistsnew.append(a.split(" Feat. ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(" feat. ")) - - artists = flatten(artistsnew) - artistsnew = [] - - - for a in artists: - artistsnew.append(a.split(" featuring ")) - - artists = flatten(artistsnew) - artistsnew = [] - - - for a in artists: - artistsnew.append(a.split(" Featuring ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(" ; ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split("; ")) - - artists = flatten(artistsnew) - artistsnew = [] - - for a in artists: - artistsnew.append(a.split(";")) - - artists = flatten(artistsnew) - artistsnew = [] - - #if not artists[0] == artiststr: - # print(artiststr + " became " + str(artists)) - - return artists - - -def cleantitle(title): - title = title.replace("[","(").replace("]",")") - - title = re.sub(r" \(as made famous by .*?\)","",title) - title = re.sub(r" \(originally by .*?\)","",title) - - return title + return t -def findartistsintitle(title): +def parseTitleForArtists(t): + for d in delimiters_feat: + if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None: + (title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t)) + artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t)) + return (title,artists) - truetitle = title - artists = "" - - newtitle = re.sub(r"(.*) \(ft. (.*?)\)",r"\1",title) - if (title != newtitle): - artists = re.sub(r"(.*) \(ft. (.*?)\).*",r"\2",title) - truetitle = newtitle - - newtitle = re.sub(r"(.*) \(feat. (.*?)\)",r"\1",title) - if (title != newtitle): - artists = re.sub(r"(.*) \(feat. (.*?)\).*",r"\2",title) - truetitle = newtitle - - newtitle = re.sub(r"(.*) \(Feat. (.*?)\)",r"\1",title) - if (title != newtitle): - artists = re.sub(r"(.*) \(Feat. (.*?)\).*",r"\2",title) - truetitle = newtitle - - newtitle = re.sub(r"(.*) \(Ft. (.*?)\)",r"\1",title) - if (title != newtitle): - artists = re.sub(r"(.*) \(Ft. (.*?)\).*",r"\2",title) - truetitle = newtitle - - newtitle = re.sub(r"(.*) \(Featuring (.*?)\)",r"\1",title) - if (title != newtitle): - artists = re.sub(r"(.*) \(Featuring (.*?)\).*",r"\2",title) - truetitle = newtitle - - newtitle = re.sub(r"(.*) \(featuring (.*?)\)",r"\1",title) - if (title != newtitle): - artists = re.sub(r"(.*) \(featuring (.*?)\).*",r"\2",title) - truetitle = newtitle - - - artistlist = cleanup(artists) - - return (truetitle,artistlist) + return (t,[]) def flatten(lis): diff --git a/database.py b/database.py index 43fc3dd..2066d09 100644 --- a/database.py +++ b/database.py @@ -118,8 +118,10 @@ def post_scrobble(): #title = urllib.parse.unquote(keys.get("title")) artists = keys.get("artist") title = keys.get("title") + time = int(keys.get("time")) (artists,title) = cleanup.fullclean(artists,title) - time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) + if time is None: + time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) ## this is necessary for localhost testing response.set_header("Access-Control-Allow-Origin","*") @@ -134,7 +136,6 @@ def post_scrobble(): @route("/sync") def abouttoshutdown(): sync() - print("Database saved to disk.") #sys.exit() # Starts the server @@ -298,6 +299,7 @@ def sync(): global lastsync lastsync = time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) + print("Database saved to disk.") # Queries the database diff --git a/lastfmconverter.py b/lastfmconverter.py index 9a36619..f02c013 100644 --- a/lastfmconverter.py +++ b/lastfmconverter.py @@ -13,10 +13,8 @@ for l in log: title = data[2] time = data[3] - title = cleanup.cleantitle(title) - artists = cleanup.cleanup(artist) - (title,extraartists) = cleanup.findartistsintitle(title) - artists = list(set(artists + extraartists)) + + (artists,title) = cleanup.fullclean(artist,title) artistsstr = "␟".join(artists) diff --git a/rules/.gitignore b/rules/.gitignore index 002ee35..528bf91 100644 --- a/rules/.gitignore +++ b/rules/.gitignore @@ -1,2 +1,2 @@ -*.csv -!examplerules.csv +*.tsv +!examplerules.tsv diff --git a/rules/examplerules.csv b/rules/examplerules.tsv similarity index 86% rename from rules/examplerules.csv rename to rules/examplerules.tsv index 3d3e8fb..61d8b6b 100644 --- a/rules/examplerules.csv +++ b/rules/examplerules.tsv @@ -7,8 +7,8 @@ ### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist ### ### THE RULES IN THIS EXAMPLE FILE ARE IGNORED -notanartist,In Dreams, -belongtogether,Darth & Vader, -replacetitle,첫 사랑니 (Rum Pum Pum Pum),Rum Pum Pum Pum -replaceartist,Dal Shabet,Dal★Shabet -countas,Trouble Maker,HyunA +notanartist In Dreams +belongtogether Darth & Vader +replacetitle 첫 사랑니 (Rum Pum Pum Pum) Rum Pum Pum Pum +replaceartist Dal Shabet Dal★Shabet +countas Trouble Maker HyunA