2018-11-24 18:29:24 +03:00
|
|
|
import re
|
2018-11-28 19:45:52 +03:00
|
|
|
import utilities
|
2019-08-30 16:57:54 +03:00
|
|
|
from doreah import tsv, settings
|
2018-11-24 18:29:24 +03:00
|
|
|
|
2018-11-28 19:45:52 +03:00
|
|
|
# need to do this as a class so it can retain loaded settings from file
|
2019-01-10 01:29:01 +03:00
|
|
|
# apparently this is not true
|
|
|
|
# I'm dumb
|
2018-11-28 19:45:52 +03:00
|
|
|
class CleanerAgent:
    """Normalise raw artist and title strings with the help of user rules.

    The rules are loaded by ``updateRules`` through
    ``tsv.parse_all("rules", ...)``; every entry is a
    ``[type, value, replacement]`` triple. The entry types consumed here are
    ``belongtogether``, ``notanartist``, ``replacetitle``, ``replaceartist``
    and ``ignoreartist``.
    """

    #Delimiters used for extra artists, even when in the title field
    delimiters_feat = ["ft.","ft","feat.","feat","featuring","Ft.","Ft","Feat.","Feat","Featuring"]
    #Delimiters in informal artist strings, spaces expected around them
    delimiters = ["vs.","vs","&"]
    #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
    delimiters_formal = ["; ",";","/"]

    def __init__(self):
        self.updateRules()

    def updateRules(self):
        """(Re)load all cleaning rules and remember the rules checksum."""
        raw = tsv.parse_all("rules","string","string","string")

        self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
        self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
        # keys are stored lowercased so that lookups are case-insensitive
        self.rules_replacetitle = {b.lower():c for [a,b,c] in raw if a=="replacetitle"}
        self.rules_replaceartist = {b.lower():c for [a,b,c] in raw if a=="replaceartist"}
        self.rules_ignoreartist = [b.lower() for [a,b,c] in raw if a=="ignoreartist"]

        # we always need to be able to tell if our current database is made with the current rules
        self.checksums = utilities.checksumTSV("rules")

    def fullclean(self,artist,title):
        """Clean a raw artist string and a raw title string.

        Returns a tuple ``(artists, title)`` where ``artists`` is a sorted
        list without duplicates, including artists found in the title.
        """
        artists = self.parseArtists(self.removespecial(artist))
        title = self.parseTitle(self.removespecial(title))
        (title,moreartists) = self.parseTitleForArtists(title)
        return (sorted(set(artists + moreartists)),title)

    def removespecial(self,s):
        """Drop tabs, newlines and "␟" from ``s`` and collapse runs of spaces."""
        for special in ("\t","␟","\n"):
            s = s.replace(special,"")
        return re.sub(" +"," ",s)

    # if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
    def confirmedReal(self,a):
        """Return True if ``a`` is a belongtogether entry or a replaceartist target."""
        return (a in self.rules_belongtogether) or (a in self.rules_replaceartist.values())

    def parseArtists(self,a):
        """Split a raw artist string into a list of individual artist names.

        Returns an empty list when the string is empty or is configured as
        invalid / ignored / not an artist.
        """
        name = a.strip()

        # strings that do not denote an artist at all
        if name in settings.get_settings("INVALID_ARTISTS"):
            return []
        if name.lower() in self.rules_ignoreartist:
            return []
        if name == "":
            return []
        if name in self.rules_notanartist:
            return []

        # "X performing Y": only X is kept. The split has to be
        # case-insensitive just like the check above, otherwise an input
        # such as "X PERFORMING Y" is never shortened and recurses forever.
        if " performing " in a.lower():
            performer = re.split(" performing",a,flags=re.IGNORECASE | re.ASCII)[0]
            return self.parseArtists(performer)

        # explicit rules win over any delimiter splitting
        if name in self.rules_belongtogether:
            return [name]
        if name.lower() in self.rules_replaceartist:
            return self.rules_replaceartist[name.lower()].split("␟")

        # "Artist (feat. Other)"
        for d in self.delimiters_feat:
            # escape the delimiter so that the dot in "ft." / "feat." is literal
            pattern = r"(.*) \(" + re.escape(d) + r" (.*)\)"
            if re.match(pattern,a) is not None:
                return self.parseArtists(re.sub(pattern,r"\1",a)) + \
                        self.parseArtists(re.sub(pattern,r"\2",a))

        # formal delimiters need no surrounding spaces
        for d in self.delimiters_formal:
            if d in a:
                ls = []
                for part in a.split(d):
                    ls += self.parseArtists(part)
                return ls

        # informal delimiters only count when surrounded by spaces
        for d in (self.delimiters_feat + self.delimiters):
            spaced = " " + d + " "
            if spaced in a:
                ls = []
                for part in a.split(spaced):
                    ls += self.parseArtists(part)
                return ls

        return [name]

    def parseTitle(self,t):
        """Return the cleaned title: apply a replacetitle rule or strip known suffixes."""
        key = t.strip().lower()
        if key in self.rules_replacetitle:
            return self.rules_replacetitle[key]

        # treat square brackets like parentheses from here on
        t = t.replace("[","(").replace("]",")")

        t = re.sub(r" \(as made famous by .*?\)","",t)
        t = re.sub(r" \(originally by .*?\)","",t)
        # NOTE(review): ".*?" may span an earlier parenthesis, so
        # "Song (Live) (2011 Remaster)" loses "(Live)" as well - confirm
        # whether that is intended before tightening the pattern.
        t = re.sub(r" \(.*?Remaster.*?\)","",t)

        return t.strip()

    def parseTitleForArtists(self,t):
        """Extract featured artists from a title.

        Returns a tuple ``(title, artists)``: the title without the
        "feat." part, and the list of artists that were found in it
        (empty when the title mentions none).
        """
        for d in self.delimiters_feat:
            # escaped so that e.g. "feat." does not match the word "feats"
            delim = re.escape(d)
            patterns = (
                r"(.*) \(" + delim + r" (.*?)\)",   # Title (feat. Artist)
                r"(.*) - " + delim + r" (.*)",      # Title - feat. Artist
                r"(.*) " + delim + r" (.*)",        # Title feat. Artist
            )
            for pattern in patterns:
                if re.match(pattern,t) is not None:
                    (title,artists) = self.parseTitleForArtists(re.sub(pattern,r"\1",t))
                    # the trailing ".*" swallows anything after the artist part
                    artists += self.parseArtists(re.sub(pattern + ".*",r"\2",t))
                    return (title,artists)

        return (t,[])
|
2018-12-04 20:43:48 +03:00
|
|
|
|
2019-03-29 22:23:32 +03:00
|
|
|
|
|
|
|
|
|
|
|
# This agent handles all the runtime changes (e.g. counting Trouble Maker as HyunA for charts)
|
2018-12-04 20:43:48 +03:00
|
|
|
class CollectorAgent:
    """Apply "countas" rules at runtime (crediting one artist as another).

    The rules are loaded by ``updateRules`` through
    ``tsv.parse_all("rules", ...)``; only entries of type ``countas`` are
    used by this class.
    """

    def __init__(self):
        self.updateRules()

    # rules_countas dict: real artist -> credited artist
    # rules_countas_id dict: real artist ID -> credited artist ID
    # rules_include dict: credited artist -> all real artists

    def updateRules(self):
        """(Re)load the countas rules and rebuild the reverse mapping.

        The ID mapping is reset to empty; call ``updateIDs`` to refill it.
        """
        raw = tsv.parse_all("rules","string","string","string")
        self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"}
        self.rules_countas_id = {}
        self.rules_include = {} #Twice the memory, double the performance!
        # (Yes, we're saving redundant information here, but it's not unelegant if it's within a closed object!)
        for real,credited in self.rules_countas.items():
            self.rules_include.setdefault(credited,[]).append(real)

    # this agent needs to be aware of the current id assignment in the main program
    # unelegant, but the best way i can think of
    def updateIDs(self,artistlist):
        """Rebuild ``rules_countas_id`` from the positions in ``artistlist``.

        An artist's ID is the index of its first occurrence in
        ``artistlist`` (entries are assumed to be hashable artist names).
        Rules whose real artist is not in the list are skipped.

        Raises:
            ValueError: if a rule's real artist is in the list but its
                credited artist is not.
        """
        # one pass over the list instead of two list.index() scans per rule
        positions = {}
        for index,name in enumerate(artistlist):
            positions.setdefault(name,index)

        ids = {}
        for real,credited in self.rules_countas.items():
            if real not in positions:
                continue
            if credited not in positions:
                raise ValueError("%r is not in list (credited artist for %r)" % (credited,real))
            ids[positions[real]] = positions[credited]
        self.rules_countas_id = ids

        #self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include}
        #this needs to take lists into account

    # get who is credited for this artist
    def getCredited(self,artist):
        """Return the credited artist for ``artist``.

        ``artist`` is looked up first among the real artist names, then
        among the real artist IDs; if neither matches it is returned
        unchanged.
        """
        if artist in self.rules_countas:
            return self.rules_countas[artist]
        if artist in self.rules_countas_id:
            return self.rules_countas_id[artist]
        return artist

    # get all credited artists for the artists given
    def getCreditedList(self,artists):
        """Return the credited artists for ``artists``, without duplicates (order not defined)."""
        return list({self.getCredited(artist) for artist in artists})

    # get artists who the given artist is given credit for
    def getAllAssociated(self,artist):
        """Return the real artists that are counted as ``artist`` (empty list if none)."""
        return self.rules_include.get(artist,[])

    # this function is there to check for artists that we should include in the
    # database even though they never have any scrobble.
    def getAllArtists(self):
        """Return every credited artist named in a countas rule, without duplicates."""
        # artists that count can be nonexisting (counting HyunA as 4Minute even
        # though 4Minute has never been listened to)
        # but artists that are counted as someone else are only relevant if they
        # exist (so we can preemptively declare lots of rules just in case)
        #return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas]))
        return list(set(self.rules_countas.values()))
|
2019-03-29 22:23:32 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2018-11-24 18:29:24 +03:00
|
|
|
def flatten(lis):
    """Flatten one level of nesting and remove duplicates.

    Every element of ``lis`` is either a string, which is kept as a single
    item, or an iterable of items (list, tuple, set, ...), whose members
    are added individually. All resulting items must be hashable.

    Returns a list of the unique items; its order is not defined.
    """
    unique = set()
    for entry in lis:
        if isinstance(entry, str):
            # a string is one item, not an iterable of characters
            unique.add(entry)
        else:
            # update() accepts any iterable, not only lists
            unique.update(entry)
    return list(unique)
|