# maloja/cleanup.py

import re
import utilities
from doreah import tsv

# Implemented as a class so that the rules loaded from file are retained between calls.
# (As it turned out, a class is not strictly necessary for that.)
class CleanerAgent:
	"""Cleans raw artist and title strings using built-in heuristics and the loaded rules.

	The rules are read by updateRules() and stored on the instance:

	rules_belongtogether	list: artist strings that must never be split
	rules_notanartist		list: strings that are discarded as artists
	rules_replacetitle		dict: lowercased title -> replacement title
	rules_replaceartist		dict: lowercased artist -> replacement ("␟"-separated if several artists)
	"""

	#Delimiters used for extra artists, even when in the title field
	delimiters_feat = ["ft.","ft","feat.","feat","featuring","Ft.","Ft","Feat.","Feat","Featuring"]
	#Delimiters in informal artist strings, spaces expected around them
	delimiters = ["vs.","vs","&"]
	#Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
	delimiters_formal = ["; ",";","/"]

	def __init__(self):
		self.updateRules()

	def updateRules(self):
		"""(Re)load all rules from the "rules" TSV data and refresh the stored checksums."""
		raw = tsv.parse_all("rules","string","string","string")
		self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
		self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
		# keys are lowercased so that lookups can ignore case
		self.rules_replacetitle = {b.lower():c for [a,b,c] in raw if a=="replacetitle"}
		self.rules_replaceartist = {b.lower():c for [a,b,c] in raw if a=="replaceartist"}

		# we always need to be able to tell if our current database is made with the current rules
		self.checksums = utilities.checksumTSV("rules")


	def fullclean(self,artist,title):
		"""Clean an artist string and a title string.

		Returns a tuple (artists,title) where artists is a sorted list without
		duplicates, including artists that were found in the title.
		"""
		artists = self.parseArtists(self.removespecial(artist))
		title = self.parseTitle(self.removespecial(title))
		(title,moreartists) = self.parseTitleForArtists(title)

		return (sorted(set(artists + moreartists)),title)

	def removespecial(self,s):
		"""Remove tabs, newlines and "␟" characters from s and collapse runs of spaces."""
		s = s.replace("\t","").replace("␟","").replace("\n","")
		s = re.sub(" +"," ",s)
		return s


	# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
	def confirmedReal(self,a):
		"""Return True if a is a belongtogether entry or the replacement value of a replaceartist rule."""
		confirmed = self.rules_belongtogether + list(self.rules_replaceartist.values())
		return (a in confirmed)

	def _parseEach(self,parts):
		"""Parse every string in parts as an artist string and concatenate the results."""
		ls = []
		for part in parts:
			ls += self.parseArtists(part)
		return ls

	def parseArtists(self,a):
		"""Split the artist string a into a list of individual artist names.

		Rules are applied first (notanartist, belongtogether, replaceartist),
		then the string is split recursively on the known delimiters.
		"""
		name = a.strip()

		if name == "":
			return []

		if name in self.rules_notanartist:
			return []

		# Detection and split must use the same case-insensitive match. If the
		# split did not find what the check found, the unchanged string would be
		# parsed again and again (e.g. for "X PERFORMING Y").
		if re.search(" performing ",a,flags=re.IGNORECASE) is not None:
			return self.parseArtists(re.split(" performing",a,maxsplit=1,flags=re.IGNORECASE)[0])

		if name in self.rules_belongtogether:
			return [name]
		if name.lower() in self.rules_replaceartist:
			return self.rules_replaceartist[name.lower()].split("␟")


		for d in self.delimiters_feat:
			# the delimiter is escaped so that the dot in "ft." / "feat." only matches a literal dot
			pattern = r"(.*) \(" + re.escape(d) + r" (.*)\)"
			if re.match(pattern,a) is not None:
				return self.parseArtists(re.sub(pattern,r"\1",a)) + \
						self.parseArtists(re.sub(pattern,r"\2",a))

		for d in self.delimiters_formal:
			if d in a:
				return self._parseEach(a.split(d))

		for d in (self.delimiters_feat + self.delimiters):
			separator = " " + d + " "
			if separator in a:
				return self._parseEach(a.split(separator))


		return [name]

	def parseTitle(self,t):
		"""Return the cleaned title for t.

		A matching replacetitle rule (case-insensitive) wins. Otherwise square
		brackets are turned into parentheses and cover / remaster annotations
		in parentheses are removed.
		"""
		if t.strip().lower() in self.rules_replacetitle:
			return self.rules_replacetitle[t.strip().lower()]

		t = t.replace("[","(").replace("]",")")

		t = re.sub(r" \(as made famous by .*?\)","",t)
		t = re.sub(r" \(originally by .*?\)","",t)
		# [^()] keeps the match inside a single pair of parentheses, so that an
		# earlier group like "(feat. X)" is not removed along with the remaster note
		t = re.sub(r" \([^()]*?Remaster[^()]*?\)","",t)

		return t.strip()

	def parseTitleForArtists(self,t):
		"""Extract featured artists from the title t.

		Returns a tuple (title,artists): the title without the featuring part
		and the list of artists that were found in it.
		"""
		for d in self.delimiters_feat:
			# escaped so that the dot in "ft." / "feat." only matches a literal dot
			feat = re.escape(d)
			patterns = (
				r"(.*) \(" + feat + r" (.*?)\)",	# Title (feat. Artist)
				r"(.*) - " + feat + r" (.*)",		# Title - feat. Artist
				r"(.*) " + feat + r" (.*)",		# Title feat. Artist
			)
			for pattern in patterns:
				if re.match(pattern,t) is not None:
					(title,artists) = self.parseTitleForArtists(re.sub(pattern,r"\1",t))
					# the trailing .* drops everything behind the artist part
					artists += self.parseArtists(re.sub(pattern + ".*",r"\2",t))
					return (title,artists)

		return (t,[])


#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)
class CollectorAgent:
	"""Resolves which artist gets the credit for another artist, based on the "countas" rules.

	rules_countas			dict: real artist -> credited artist
	rules_countas_id		dict: real artist ID -> credited artist ID
	rules_include			dict: credited artist -> all real artists
	"""

	def __init__(self):
		self.updateRules()

	def updateRules(self):
		"""(Re)load the countas rules and rebuild the reverse lookup; the ID mapping is reset."""
		raw = tsv.parse_all("rules","string","string","string")

		countas = {}
		for kind,real,credited in raw:
			if kind == "countas":
				countas[real] = credited

		# the reverse mapping is redundant information, but saves searching the dict on every lookup
		included = {}
		for real,credited in countas.items():
			included.setdefault(credited,[]).append(real)

		self.rules_countas = countas
		self.rules_countas_id = {}
		self.rules_include = included

	def updateIDs(self,artistlist):
		"""Rebuild rules_countas_id from the positions of the artists in artistlist.

		Rules whose real artist is not in artistlist are skipped.
		"""
		# this agent needs to know the ID assignment of the main program
		ids = {}
		for real,credited in self.rules_countas.items():
			if real in artistlist:
				ids[artistlist.index(real)] = artistlist.index(credited)
		self.rules_countas_id = ids
		# an ID based version of rules_include is not maintained;
		# it would need to take lists into account


	def getCredited(self,artist):
		"""Return who is credited for artist (name or ID); artist itself if no rule applies."""
		for table in (self.rules_countas,self.rules_countas_id):
			if artist in table:
				return table[artist]
		return artist

	def getCreditedList(self,artists):
		"""Return all credited artists for the given artists, without duplicates."""
		return list({self.getCredited(entry) for entry in artists})

	def getAllAssociated(self,artist):
		"""Return the artists that the given artist is given credit for (empty list if none)."""
		if artist in self.rules_include:
			return self.rules_include[artist]
		return []

	def getAllArtists(self):
		"""Return every credited artist of the rules, without duplicates.

		Used to find artists that should be in the database even though they
		never have any scrobble.
		"""
		# Credited artists can be nonexisting (counting HyunA as 4Minute even
		# though 4Minute has never been listened to), so they are all returned.
		# Artists that are counted as someone else are only relevant if they
		# exist (so lots of rules can be declared preemptively), which is why
		# the keys of rules_countas are not included here.
		return list(set(self.rules_countas.values()))


def flatten(lis):
	"""Flatten one level of nesting: strings are kept, lists are merged in.

	Returns a list without duplicates; the order of the result is not defined.
	"""
	collected = []

	for entry in lis:
		# a string counts as a single element, anything else is concatenated as a list
		chunk = [entry] if isinstance(entry, str) else entry
		collected = collected + chunk

	return list(set(collected))
Initial commit 2018-11-24 18:29:24 +03:00			`import re`
Implemented custom rules 2018-11-28 19:45:52 +03:00			`import utilities`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00			`from doreah import tsv`
Initial commit 2018-11-24 18:29:24 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`# need to do this as a class so it can retain loaded settings from file`
Various Fixes 2019-01-10 01:29:01 +03:00			`# apparently this is not true`
			`# I'm dumb`
Implemented custom rules 2018-11-28 19:45:52 +03:00			`class CleanerAgent:`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`def __init__(self):`
			`self.updateRules()`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`def updateRules(self):`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00			`raw = tsv.parse_all("rules","string","string","string")`
Implemented custom rules 2018-11-28 19:45:52 +03:00			`self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]`
			`self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]`
Some improvements to rule handling 2019-04-08 14:38:47 +03:00			`self.rules_replacetitle = {b.lower():c for [a,b,c] in raw if a=="replacetitle"}`
			`self.rules_replaceartist = {b.lower():c for [a,b,c] in raw if a=="replaceartist"}`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Added basic scrobble database consistency system 2018-12-20 20:46:55 +03:00			`# we always need to be able to tell if our current database is made with the current rules`
			`self.checksums = utilities.checksumTSV("rules")`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00


Implemented custom rules 2018-11-28 19:45:52 +03:00			`def fullclean(self,artist,title):`
			`artists = self.parseArtists(self.removespecial(artist))`
			`title = self.parseTitle(self.removespecial(title))`
			`(title,moreartists) = self.parseTitleForArtists(title)`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00			`artists += moreartists`
Simplified some of the webpage building logic 2018-12-22 14:47:49 +03:00			`artists = list(set(artists))`
QoL fixes 2018-12-21 21:13:24 +03:00			`artists.sort()`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Simplified some of the webpage building logic 2018-12-22 14:47:49 +03:00			`return (artists,title)`
Added basic scrobbling function 2018-11-26 18:21:07 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`def removespecial(self,s):`
Case is now ignored when building track titles and artist names 2018-12-21 20:22:58 +03:00			`s = s.replace("\t","").replace("␟","").replace("\n","")`
			`s = re.sub(" +"," ",s)`
			`return s`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Added basic scrobbling function 2018-11-26 18:21:07 +03:00
Added web interface to audit library data 2018-12-20 19:23:16 +03:00			`# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that`
			`def confirmedReal(self,a):`
			`confirmed = self.rules_belongtogether + [self.rules_replaceartist[r] for r in self.rules_replaceartist]`
			`return (a in confirmed)`
Initial commit 2018-11-24 18:29:24 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`#Delimiters used for extra artists, even when in the title field`
			`delimiters_feat = ["ft.","ft","feat.","feat","featuring","Ft.","Ft","Feat.","Feat","Featuring"]`
			`#Delimiters in informal artist strings, spaces expected around them`
			`delimiters = ["vs.","vs","&"]`
			`#Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used`
			`delimiters_formal = ["; ",";","/"]`
Improved artist / title parsing 2018-11-28 17:33:30 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`def parseArtists(self,a):`
Improved artist / title parsing 2018-11-28 17:33:30 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`if a.strip() == "":`
			`return []`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Better metadata grabbing and caching 2018-12-17 17:10:10 +03:00			`if a.strip() in self.rules_notanartist:`
			`return []`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Better metadata grabbing and caching 2018-12-17 17:10:10 +03:00			`if " performing " in a.lower():`
			`return self.parseArtists(re.split(" [Pp]erforming",a)[0])`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`if a.strip() in self.rules_belongtogether:`
			`return [a.strip()]`
Some improvements to rule handling 2019-04-08 14:38:47 +03:00			`if a.strip().lower() in self.rules_replaceartist:`
			`return self.rules_replaceartist[a.strip().lower()].split("␟")`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00


Implemented custom rules 2018-11-28 19:45:52 +03:00			`for d in self.delimiters_feat:`
			`if re.match(r"(.) \(" + d + " (.)\)",a) is not None:`
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`return self.parseArtists(re.sub(r"(.) \(" + d + " (.)\)",r"\1",a)) + \`
			`self.parseArtists(re.sub(r"(.) \(" + d + " (.)\)",r"\2",a))`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Improvement and bugfixing 2018-12-17 01:56:30 +03:00			`for d in self.delimiters_formal:`
			`if (d in a):`
			`ls = []`
			`for i in a.split(d):`
			`ls += self.parseArtists(i)`
			`return ls`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`for d in (self.delimiters_feat + self.delimiters):`
			`if ((" " + d + " ") in a):`
			`ls = []`
			`for i in a.split(" " + d + " "):`
			`ls += self.parseArtists(i)`
			`return ls`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00




Implemented custom rules 2018-11-28 19:45:52 +03:00			`return [a.strip()]`
Initial commit 2018-11-24 18:29:24 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`def parseTitle(self,t):`
Some improvements to rule handling 2019-04-08 14:38:47 +03:00			`if t.strip().lower() in self.rules_replacetitle:`
			`return self.rules_replacetitle[t.strip().lower()]`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`t = t.replace("[","(").replace("]",")")`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`t = re.sub(r" \(as made famous by .*?\)","",t)`
			`t = re.sub(r" \(originally by .*?\)","",t)`
Improvement and bugfixing 2018-12-17 01:56:30 +03:00			`t = re.sub(r" \(.?Remaster.?\)","",t)`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`return t.strip()`
Improved artist / title parsing 2018-11-28 17:33:30 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`def parseTitleForArtists(self,t):`
			`for d in self.delimiters_feat:`
			`if re.match(r"(.) \(" + d + " (.?)\)",t) is not None:`
			`(title,artists) = self.parseTitleForArtists(re.sub(r"(.) \(" + d + " (.?)\)",r"\1",t))`
			`artists += self.parseArtists(re.sub(r"(.) \(" + d + " (.?)\).*",r"\2",t))`
			`return (title,artists)`
Minor fixes and structure 2018-12-12 21:37:59 +03:00			`if re.match(r"(.) - " + d + " (.)",t) is not None:`
			`(title,artists) = self.parseTitleForArtists(re.sub(r"(.) - " + d + " (.)",r"\1",t))`
			`artists += self.parseArtists(re.sub(r"(.) - " + d + " (.).*",r"\2",t))`
			`return (title,artists)`
			`if re.match(r"(.) " + d + " (.)",t) is not None:`
			`(title,artists) = self.parseTitleForArtists(re.sub(r"(.) " + d + " (.)",r"\1",t))`
			`artists += self.parseArtists(re.sub(r"(.) " + d + " (.).*",r"\2",t))`
			`return (title,artists)`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Implemented custom rules 2018-11-28 19:45:52 +03:00			`return (t,[])`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00

			`#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`class CollectorAgent:`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`def __init__(self):`
			`self.updateRules()`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`# rules_countas dict: real artist -> credited artist`
			`# rules_countas_id dict: real artist ID -> credited artist ID`
			`# rules_include dict: credited artist -> all real artists`

Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`def updateRules(self):`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00			`raw = tsv.parse_all("rules","string","string","string")`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"}`
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`self.rules_countas_id = {}`
			`self.rules_include = {} #Twice the memory, double the performance!`
			`# (Yes, we're saving redundant information here, but it's not unelegant if it's within a closed object!)`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`for a in self.rules_countas:`
			`self.rules_include[self.rules_countas[a]] = self.rules_include.setdefault(self.rules_countas[a],[]) + [a]`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`# this agent needs to be aware of the current id assignment in the main program`
			`# unelegant, but the best way i can think of`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`def updateIDs(self,artistlist):`
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`self.rules_countas_id = {artistlist.index(a):artistlist.index(self.rules_countas[a]) for a in self.rules_countas if a in artistlist}`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`#self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include}`
			`#this needs to take lists into account`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00
			`# get who is credited for this artist`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`def getCredited(self,artist):`
			`if artist in self.rules_countas:`
			`return self.rules_countas[artist]`
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`if artist in self.rules_countas_id:`
			`return self.rules_countas_id[artist]`

Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`else:`
			`return artist`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`# get all credited artists for the artists given`
Charts can now bundle artists 2018-12-04 20:43:48 +03:00			`def getCreditedList(self,artists):`
			`updatedArtists = []`
			`for artist in artists:`
			`updatedArtists.append(self.getCredited(artist))`
			`return list(set(updatedArtists))`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`# get artists who the given artist is given credit for`
Improvement and bugfixing 2018-12-17 01:56:30 +03:00			`def getAllAssociated(self,artist):`
			`return self.rules_include.get(artist,[])`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`# this function is there to check for artists that we should include in the`
			`# database even though they never have any scrobble.`
Fixed small bug with associated artists 2019-02-03 01:55:13 +03:00			`def getAllArtists(self):`
Associated artists now only show up if relevant 2019-04-08 18:32:31 +03:00			`return list(set([self.rules_countas[a] for a in self.rules_countas]))`
			`# artists that count can be nonexisting (counting HyunA as 4Minute even`
			`# though 4Minute has never been listened to)`
			`# but artists that are counted as someone else are only relevant if they`
			`# exist (so we can preemptively declare lots of rules just in case)`
			`#return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas]))`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00






Initial commit 2018-11-24 18:29:24 +03:00			`def flatten(lis):`

			`newlist = []`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Initial commit 2018-11-24 18:29:24 +03:00			`for l in lis:`
			`if isinstance(l, str):`
			`newlist.append(l)`
			`else:`
			`newlist = newlist + l`
Moved tsv handling to doreah 2019-03-29 22:23:32 +03:00
Initial commit 2018-11-24 18:29:24 +03:00			`return list(set(newlist))`