mirror of
https://github.com/krateng/maloja.git
synced 2023-08-10 21:12:55 +03:00
Implemented custom rules
This commit is contained in:
148
cleanup.py
148
cleanup.py
@@ -1,74 +1,112 @@
|
||||
import re
|
||||
import utilities
|
||||
|
||||
def fullclean(artist,title):
|
||||
artists = parseArtists(removespecial(artist))
|
||||
title = parseTitle(removespecial(title))
|
||||
(title,moreartists) = parseTitleForArtists(title)
|
||||
artists += moreartists
|
||||
# need to do this as a class so it can retain loaded settings from file
|
||||
class CleanerAgent:
|
||||
|
||||
return (list(set(artists)),title)
|
||||
|
||||
def removespecial(s):
|
||||
return s.replace("\t","").replace("␟","").replace("\n","")
|
||||
|
||||
|
||||
delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field
|
||||
delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them
|
||||
delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
|
||||
|
||||
|
||||
def parseArtists(a):
|
||||
|
||||
if a.strip() == "":
|
||||
return []
|
||||
def __init__(self):
|
||||
self.updateRules()
|
||||
|
||||
for d in delimiters_feat:
|
||||
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
|
||||
return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
|
||||
|
||||
for d in (delimiters + delimiters_feat):
|
||||
if ((" " + d + " ") in a):
|
||||
ls = []
|
||||
for i in a.split(" " + d + " "):
|
||||
ls += parseArtists(i)
|
||||
return ls
|
||||
def updateRules(self):
    """Reload the custom rules and store them on the instance.

    Every entry produced by ``utilities.parseAllTSV("rules", ...)`` is
    unpacked as a three-field row ``(kind, key, value)`` and sorted by its
    first field into one of four attributes:

    * ``"belongtogether"`` -> ``self.rules_belongtogether`` (list of keys)
    * ``"notanartist"``    -> ``self.rules_notanartist`` (list of keys)
    * ``"replacetitle"``   -> ``self.rules_replacetitle`` (key -> value)
    * ``"replaceartist"``  -> ``self.rules_replaceartist`` (key -> value)

    Rows with any other first field are ignored.
    """
    belongtogether = []
    notanartist = []
    replacetitle = {}
    replaceartist = {}

    # Single pass over the parsed rows: this avoids walking the data four
    # times and also works if parseAllTSV hands back a one-shot iterable.
    for kind, key, value in utilities.parseAllTSV("rules", "string", "string", "string"):
        if kind == "belongtogether":
            belongtogether.append(key)
        elif kind == "notanartist":
            notanartist.append(key)
        elif kind == "replacetitle":
            replacetitle[key] = value
        elif kind == "replaceartist":
            replaceartist[key] = value

    self.rules_belongtogether = belongtogether
    self.rules_notanartist = notanartist
    self.rules_replacetitle = replacetitle
    self.rules_replaceartist = replaceartist
|
||||
|
||||
for d in delimiters_formal:
|
||||
if (d in a):
|
||||
ls = []
|
||||
for i in a.split(d):
|
||||
ls += parseArtists(i)
|
||||
return ls
|
||||
|
||||
|
||||
def fullclean(self, artist, title):
    """Clean a raw artist string and a raw title.

    Both inputs first go through ``removespecial``. The artist string is
    then split by ``parseArtists`` and the title is normalised by
    ``parseTitle``; artists credited inside the title are pulled out by
    ``parseTitleForArtists`` and merged with the others.

    Returns a tuple ``(artists, title)`` where ``artists`` is a list
    without duplicates.
    """
    artists = self.parseArtists(self.removespecial(artist))
    cleaned_title = self.parseTitle(self.removespecial(title))
    (cleaned_title, moreartists) = self.parseTitleForArtists(cleaned_title)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result no longer varies between runs the way list(set(...)) does
    # (insertion order is guaranteed from Python 3.7 on; on older
    # interpreters this degrades to the previous arbitrary order).
    unique_artists = list(dict.fromkeys([*artists, *moreartists]))
    return (unique_artists, cleaned_title)
|
||||
|
||||
def removespecial(self, s):
    """Return ``s`` with every tab, "␟" and newline character removed."""
    # One translate pass instead of three chained full-string replace calls;
    # the third maketrans argument lists the characters to delete.
    return s.translate(str.maketrans("", "", "\t␟\n"))
|
||||
|
||||
|
||||
delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field
|
||||
delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them
|
||||
delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
|
||||
|
||||
|
||||
def parseArtists(self,a):
|
||||
|
||||
if a.strip() == "":
|
||||
return []
|
||||
|
||||
if a.strip() in self.rules_belongtogether:
|
||||
return [a.strip()]
|
||||
if a.strip() in self.rules_replaceartist:
|
||||
return [self.rules_replaceartist[a.strip()]]
|
||||
|
||||
|
||||
return [a.strip()]
|
||||
|
||||
for d in self.delimiters_feat:
|
||||
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
|
||||
return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
|
||||
|
||||
for d in (self.delimiters_feat + self.delimiters):
|
||||
if ((" " + d + " ") in a):
|
||||
ls = []
|
||||
for i in a.split(" " + d + " "):
|
||||
ls += self.parseArtists(i)
|
||||
return ls
|
||||
|
||||
for d in self.delimiters_formal:
|
||||
if (d in a):
|
||||
ls = []
|
||||
for i in a.split(d):
|
||||
ls += self.parseArtists(i)
|
||||
return ls
|
||||
|
||||
|
||||
|
||||
return [a.strip()]
|
||||
|
||||
def parseTitle(t):
|
||||
t = t.replace("[","(").replace("]",")")
|
||||
def parseTitle(self,t):
|
||||
|
||||
if t.strip() in self.rules_replacetitle:
|
||||
return self.rules_replacetitle[t.strip()]
|
||||
|
||||
t = re.sub(r" \(as made famous by .*?\)","",t)
|
||||
t = re.sub(r" \(originally by .*?\)","",t)
|
||||
|
||||
return t
|
||||
t = t.replace("[","(").replace("]",")")
|
||||
|
||||
t = re.sub(r" \(as made famous by .*?\)","",t)
|
||||
t = re.sub(r" \(originally by .*?\)","",t)
|
||||
|
||||
return t.strip()
|
||||
|
||||
def parseTitleForArtists(t):
|
||||
for d in delimiters_feat:
|
||||
if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
|
||||
(title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
|
||||
artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
|
||||
return (title,artists)
|
||||
|
||||
return (t,[])
|
||||
|
||||
def parseTitleForArtists(self, t):
    """Extract artists credited in parentheses inside a title.

    For each delimiter in ``self.delimiters_feat`` the title is checked
    for a `` (<delimiter> <names>)`` group. On a match the group is cut
    out of the title (text after the closing parenthesis is kept), the
    shortened title is processed recursively to pick up further credits,
    and ``<names>`` is split with ``self.parseArtists``.

    Returns a tuple ``(title, artists)``; ``artists`` is an empty list
    when no credit is found.
    """
    for d in self.delimiters_feat:
        # re.escape matters: "ft." and "feat." contain a dot, which
        # unescaped would match any character instead of a literal ".".
        pattern = r"(.*) \(" + re.escape(d) + r" (.*?)\)"
        if re.match(pattern, t) is None:
            continue

        (title, artists) = self.parseTitleForArtists(re.sub(pattern, r"\1", t))
        # The trailing ".*" swallows whatever follows the credit so that
        # only the captured names remain.
        artists += self.parseArtists(re.sub(pattern + ".*", r"\2", t))
        return (title, artists)

    return (t, [])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def flatten(lis):
    """Flatten one level of nesting and drop duplicates.

    Each element of ``lis`` is either a string, which is kept as a single
    item, or an iterable of items (list, tuple, ...), whose members are
    spliced into the result. Items must be hashable.

    Returns a list without duplicates, in order of first occurrence.
    """
    flat = []
    for item in lis:
        if isinstance(item, str):
            # A string is one item, not a sequence of characters.
            flat.append(item)
        else:
            # extend is linear and accepts any iterable, unlike the
            # previous "newlist = newlist + l" which copied the whole list
            # on every step and only worked for list operands.
            flat.extend(item)

    # dict.fromkeys removes duplicates but keeps a deterministic order.
    return list(dict.fromkeys(flat))
|
||||
|
||||
Reference in New Issue
Block a user