Implemented custom rules

2023-08-10 21:12:55 +03:00 · 2018-11-28 17:45:52 +01:00
parent 144198f933
commit 54bffc5642
5 changed files with 163 additions and 67 deletions
--- a/cleanup.py
+++ b/cleanup.py
@@ -1,74 +1,112 @@
 import re
 import utilities
-def fullclean(artist,title):
+# need to do this as a class so it can retain loaded settings from file
-	artists = parseArtists(removespecial(artist))
+class CleanerAgent:
 	title = parseTitle(removespecial(title))
 	(title,moreartists) = parseTitleForArtists(title)
 	artists += moreartists
-	return (list(set(artists)),title)
+	def __init__(self):
-
+		self.updateRules()
 def removespecial(s):
 	return s.replace("\t","").replace("␟","").replace("\n","")
 delimiters_feat = ["ft.","ft","feat.","feat","featuring"]			#Delimiters used for extra artists, even when in the title field
 delimiters = ["vs.","vs","&"]							#Delimiters in informal titles, spaces expected around them
 delimiters_formal = ["; ",";"]							#Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
 def parseArtists(a):
 	if a.strip() == "":
 		return []
-	for d in delimiters_feat:
+	def updateRules(self):
-		if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
+		raw = utilities.parseAllTSV("rules","string","string","string")
-			return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
+		self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
-	
+		self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
-	for d in (delimiters + delimiters_feat):
+		self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"}
-		if ((" " + d + " ") in a):
+		self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"}
 			ls = []
 			for i in a.split(" " + d + " "):
 				ls += parseArtists(i)
 			return ls
-	for d in delimiters_formal:
+	
-		if (d in a):
+	
-			ls = []
+	def fullclean(self,artist,title):
-			for i in a.split(d):
+		artists = self.parseArtists(self.removespecial(artist))
-				ls += parseArtists(i)
+		title = self.parseTitle(self.removespecial(title))
-			return ls
+		(title,moreartists) = self.parseTitleForArtists(title)
 		artists += moreartists
-	
+		return (list(set(artists)),title)
 	def removespecial(self,s):
 		# Drop tabs, newlines and the "␟" symbol from the input; "␟" is used as a list separator elsewhere in this project
 		cleaned = s
 		for unwanted in ("\t","␟","\n"):
 			cleaned = cleaned.replace(unwanted,"")
 		return cleaned
 	# Markers for extra (featured) artists, recognised even when they appear in the title field.
 	# parseArtists and parseTitleForArtists look for them as " (<marker> ...)"; parseArtists also splits on " <marker> ".
 	delimiters_feat = ["ft.","ft","feat.","feat","featuring"]
 	# Delimiters in informal titles; only matched with a space on both sides.
 	delimiters = ["vs.","vs","&"]
 	# Delimiters used specifically to tag multiple artists when only one tag field is available; matched without requiring surrounding spaces.
 	delimiters_formal = ["; ",";"]
 	def parseArtists(self,a):
 		"""Split a raw artist string into a list of individual artist names.
 		Checks are applied in this order, and the first one that matches decides the result:
 		empty input gives []; an exact match in rules_belongtogether is kept as one artist;
 		an exact match in rules_replaceartist is replaced by its mapped name;
 		"X (<feat marker> Y)" is split into X and Y; then " <delimiter> " separators;
 		then the formal delimiters. Every split part is parsed again recursively.
 		If nothing matches, the stripped input is returned as a single artist.
 		"""
 		if a.strip() == "":
 			return []
 		# Custom rules take precedence over any delimiter-based splitting
 		if a.strip() in self.rules_belongtogether:
 			return [a.strip()]
 		if a.strip() in self.rules_replaceartist:
 			return [self.rules_replaceartist[a.strip()]]
-	return [a.strip()]
+		
 		# NOTE(review): d is inserted into the pattern unescaped, so the "." in "ft." / "feat." matches any character
 		for d in self.delimiters_feat:
 			if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
 				return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
 		# These delimiters only count when surrounded by spaces
 		for d in (self.delimiters_feat + self.delimiters):
 			if ((" " + d + " ") in a):
 				ls = []
 				for i in a.split(" " + d + " "):
 					ls += self.parseArtists(i)
 				return ls
 		# Formal delimiters are matched anywhere in the string, no surrounding spaces required
 		for d in self.delimiters_formal:
 			if (d in a):
 				ls = []
 				for i in a.split(d):
 					ls += self.parseArtists(i)
 				return ls
 		return [a.strip()]
-def parseTitle(t):
+	def parseTitle(self,t):
-	t = t.replace("[","(").replace("]",")")
+		
 		if t.strip() in self.rules_replacetitle:
 			return self.rules_replacetitle[t.strip()]
-	t = re.sub(r" \(as made famous by .*?\)","",t)
+		t = t.replace("[","(").replace("]",")")
-	t = re.sub(r" \(originally by .*?\)","",t)
+		
-	
+		t = re.sub(r" \(as made famous by .*?\)","",t)
-	return t
+		t = re.sub(r" \(originally by .*?\)","",t)
 		return t.strip()
-def parseTitleForArtists(t):
+	def parseTitleForArtists(self,t):
-	for d in delimiters_feat:
+		for d in self.delimiters_feat:
-		if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
+			if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
-			(title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
+				(title,artists) = self.parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
-			artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
+				artists += self.parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
-			return (title,artists)
+				return (title,artists)
-	
+		
-	return (t,[])
+		return (t,[])
-	
+		
 def flatten(lis):
 	"""Flatten one level of nesting and remove duplicates.
 	Strings in lis are taken as single items; every other element is concatenated with "+",
 	so it is expected to be a list. The result goes through set(), so ordering is not preserved.
 	"""
 	newlist = []
-	
+		
 	for l in lis:
 		if isinstance(l, str):
 			newlist.append(l)
 		else:
 			# Non-string elements are merged item by item
 			newlist = newlist + l
-			
+				
 	return list(set(newlist))
--- a/database.py
+++ b/database.py
@@ -4,7 +4,7 @@ import urllib
 import waitress
 import os
 import datetime
-import cleanup
+from cleanup import *
 import sys
@@ -12,6 +12,8 @@ SCROBBLES = []	# Format: tuple(track_ref,timestamp,saved)
 ARTISTS = []	# Format: artist
 TRACKS = []	# Format: tuple(frozenset(artist_ref,...),title)
 c = CleanerAgent()
 lastsync = 0
@@ -118,11 +120,12 @@ def post_scrobble():
 	#title = urllib.parse.unquote(keys.get("title"))
 	artists = keys.get("artist")
 	title = keys.get("title")
-	time = int(keys.get("time"))
+	try:
-	(artists,title) = cleanup.fullclean(artists,title)
+		time = int(keys.get("time"))
-	if time is None:
+	except:
 		time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
-	
+	(artists,title) = c.fullclean(artists,title)
 	## this is necessary for localhost testing
 	response.set_header("Access-Control-Allow-Origin","*")
--- a/lastfmconverter.py
+++ b/lastfmconverter.py
@@ -1,9 +1,12 @@
 import sys, os, datetime, re, cleanup
 from cleanup import *
 log = open(sys.argv[1])
 outputlog = open(sys.argv[2],"a")
 c = CleanerAgent()
 for l in log:
 	l = l.replace("\n","")
 	data = l.split(",")
@@ -13,8 +16,8 @@ for l in log:
 	title = data[2]
 	time = data[3]
-
+	
-	(artists,title) = cleanup.fullclean(artist,title)
+	(artists,title) = c.fullclean(artist,title)
 	artistsstr = "␟".join(artists)
--- a/rules/examplerules.tsv
+++ b/rules/examplerules.tsv
@@ -7,8 +7,8 @@
 ###	countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist
 ###
 ### THE RULES IN THIS EXAMPLE FILE ARE IGNORED
-notanartist	In Dreams
+#notanartist	In Dreams
-belongtogether	Darth & Vader
+#belongtogether	Darth & Vader
-replacetitle	첫 사랑니 (Rum Pum Pum Pum)	Rum Pum Pum Pum
+#replacetitle	첫 사랑니 (Rum Pum Pum Pum)	Rum Pum Pum Pum
-replaceartist	Dal Shabet			Dal★Shabet
+#replaceartist	Dal Shabet			Dal★Shabet
-countas		Trouble Maker			HyunA
+#countas		Trouble Maker			HyunA
--- a/utilities.py
+++ b/utilities.py
@@ -0,0 +1,52 @@
 def parseTSV(filename,*args):
 	f = open(filename)
 	result = []
 	for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
 		l = l.replace("\n","").split("#")[0]
 		data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
 		entry = [] * len(args)
 		for i in range(len(args)):
 			if args[i]=="list":
 				try:
 					entry.append(data[i].split("␟"))
 				except:
 					entry.append([])
 			elif args[i]=="string":
 				try:
 					entry.append(data[i])
 				except:
 					entry.append("")
 			elif args[i]=="int":
 				try:
 					entry.append(int(data[i]))
 				except:
 					entry.append(0)
 			elif args[i]=="bool":
 				try:
 					entry.append((data[i].lower() in ["true","yes","1","y"]))
 				except:
 					entry.append(False)
 		result.append(entry)
 	f.close()
 	return result
 def parseAllTSV(path,*args):
 	"""Parse every .tsv file directly inside a directory and concatenate the rows.
 	path: directory to scan (not recursive).
 	*args: column type names, passed through unchanged to parseTSV.
 	Returns the combined list of entries from all matching files.
 	"""
 	import os
 	result = []
 	# Sorted so the order of the combined rows does not depend on the filesystem's listing order
 	for f in sorted(os.listdir(path)):
 		# Match on the extension only, so names like "rules.tsv.bak" are not picked up
 		if f.endswith(".tsv"):
 			result += parseTSV(os.path.join(path,f),*args)
 	return result