mirror of
https://github.com/krateng/maloja.git
synced 2023-08-10 21:12:55 +03:00
193 lines
6.0 KiB
Python
193 lines
6.0 KiB
Python
import re
|
|
import os
|
|
import csv
|
|
|
|
from .globalconf import data_dir, malojaconfig
|
|
|
|
# need to do this as a class so it can retain loaded settings from file
|
|
# apparently this is not true
|
|
# I'm dumb
|
|
class CleanerAgent:
    """Cleans up raw artist and title strings using rules and delimiters.

    Rules are loaded by updateRules() from the ``.tsv`` files found in the
    rules directory. The delimiter lists are read from ``malojaconfig`` once,
    when the class is defined. Inside rule columns, several values are
    joined with the "␟" character.
    """

    def __init__(self):
        self.updateRules()

    def updateRules(self):
        """(Re)load every rule from the ``.tsv`` files in the rules directory.

        Each non-comment line is one rule: the first non-empty column is the
        rule type, the following columns are its arguments. Lines starting
        with ``#`` are ignored, and so are lines that do not carry enough
        columns for their rule type, so that a single malformed line cannot
        abort loading of all other rules.
        """
        rawrules = []
        for f in os.listdir(data_dir["rules"]()):
            if f.split('.')[-1].lower() != 'tsv':
                continue
            filepath = data_dir["rules"](f)
            # Rule files contain the non-ASCII "␟" separator, so do not rely
            # on the platform default encoding. newline="" is what the csv
            # module asks for when it is handed a file object.
            with open(filepath, 'r', encoding="utf-8", newline="") as filed:
                reader = csv.reader(filed, delimiter="\t")
                for entry in reader:
                    if len(entry) == 0 or entry[0].startswith('#'):
                        continue
                    rule = [col for col in entry if col]
                    # a line consisting only of tabs leaves nothing behind
                    if rule:
                        rawrules.append(rule)

        def rules_of(kind, columns):
            # Rules of the given type that have at least `columns` columns
            # (type column included).
            return [r for r in rawrules if r[0] == kind and len(r) >= columns]

        self.rules_belongtogether = [r[1] for r in rules_of("belongtogether", 2)]
        self.rules_notanartist = [r[1] for r in rules_of("notanartist", 2)]
        self.rules_replacetitle = {r[1].lower(): r[2] for r in rules_of("replacetitle", 3)}
        self.rules_replaceartist = {r[1].lower(): r[2] for r in rules_of("replaceartist", 3)}
        self.rules_ignoreartist = [r[1].lower() for r in rules_of("ignoreartist", 2)]
        # keyed by lowercased title -> (required artists, artists to add)
        self.rules_addartists = {r[2].lower(): (r[1].lower(), r[3]) for r in rules_of("addartists", 4)}
        # keyed by lowercased title -> full artist list
        self.rules_fixartists = {r[2].lower(): r[1] for r in rules_of("fixartists", 3)}
        self.rules_artistintitle = {r[1].lower(): r[2] for r in rules_of("artistintitle", 3)}

    def fullclean(self, artist, title):
        """Clean an artist (string or list of strings) and a title.

        Returns a tuple ``(artists, title)`` where ``artists`` is a sorted
        list without duplicates and ``title`` is the cleaned title string.
        """
        artists = self.parseArtists(self.removespecial(artist))
        title = self.parseTitle(self.removespecial(title))
        (title, moreartists) = self.parseTitleForArtists(title)
        artists += moreartists

        titlekey = title.lower()
        if titlekey in self.rules_addartists:
            reqartists, allartists = self.rules_addartists[titlekey]
            reqartists = reqartists.split("␟")
            allartists = allartists.split("␟")
            # only add the extra artists if all required ones are present
            if set(reqartists).issubset({a.lower() for a in artists}):
                artists += allartists
        elif titlekey in self.rules_fixartists:
            allartists = self.rules_fixartists[titlekey].split("␟")
            # replace the artist list if it shares at least one artist
            if {a.lower() for a in allartists} & {a.lower() for a in artists}:
                artists = allartists

        artists = sorted(set(artists))
        return (artists, title)

    def removespecial(self, s):
        """Remove tabs, "␟" and newlines from s and collapse runs of spaces.

        Lists are processed element by element.
        """
        if isinstance(s, list):
            return [self.removespecial(se) for se in s]
        s = s.replace("\t", "").replace("␟", "").replace("\n", "")
        s = re.sub(" +", " ", s)
        return s

    # if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
    def confirmedReal(self, a):
        """Return True if a appears in a belongtogether rule or is the
        replacement value of a replaceartist rule."""
        confirmed = self.rules_belongtogether + list(self.rules_replaceartist.values())
        return (a in confirmed)

    # Delimiters used for extra artists, even when in the title field
    # e.g. ["ft.","ft","feat.","feat","featuring","Ft.","Ft","Feat.","Feat","Featuring"]
    delimiters_feat = malojaconfig["DELIMITERS_FEAT"]
    # Delimiters in informal artist strings, spaces expected around them
    # e.g. ["vs.","vs","&"]
    delimiters = malojaconfig["DELIMITERS_INFORMAL"]
    # Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
    # e.g. ["; ",";","/"]
    delimiters_formal = malojaconfig["DELIMITERS_FORMAL"]

    def parseArtists(self, a):
        """Split an artist string (or list of strings) into a list of artists.

        Rules are applied first (invalid / ignored / not-an-artist names
        yield an empty list, belongtogether and replaceartist short-circuit),
        then the string is split recursively on the configured delimiters.
        """
        if isinstance(a, list):
            res = [self.parseArtists(art) for art in a]
            return [artist for group in res for artist in group]

        stripped = a.strip()

        if stripped in malojaconfig["INVALID_ARTISTS"]:
            return []

        if stripped.lower() in self.rules_ignoreartist:
            return []

        if stripped == "":
            return []

        if stripped in self.rules_notanartist:
            return []

        if " performing " in a.lower():
            # Split case-insensitively, matching the case-insensitive check
            # above; otherwise e.g. "X PERFORMING Y" would never get shorter
            # and recurse forever.
            return self.parseArtists(re.split(" performing", a, flags=re.IGNORECASE)[0])

        if stripped in self.rules_belongtogether:
            return [stripped]
        if stripped.lower() in self.rules_replaceartist:
            return self.rules_replaceartist[stripped.lower()].split("␟")

        for d in self.delimiters_feat:
            # Delimiters are literal text ("ft." must not match "ftx").
            pattern = re.compile(r"(.*) \(" + re.escape(d) + r" (.*)\)")
            if pattern.match(a) is not None:
                return self.parseArtists(pattern.sub(r"\1", a)) + \
                    self.parseArtists(pattern.sub(r"\2", a))

        for d in (self.delimiters_feat + self.delimiters):
            spaced = " " + d + " "
            if spaced in a:
                ls = []
                for i in a.split(spaced):
                    ls += self.parseArtists(i)
                return ls

        for d in self.delimiters_formal:
            if d in a:
                ls = []
                for i in a.split(d):
                    ls += self.parseArtists(i)
                return ls

        return [stripped]

    def parseTitle(self, t):
        """Return the cleaned title.

        A matching replacetitle rule wins outright; otherwise square
        brackets become parentheses, some parenthesised notes are removed,
        every string in the REMOVE_FROM_TITLE setting is removed, and the
        result is stripped.
        """
        if t.strip().lower() in self.rules_replacetitle:
            return self.rules_replacetitle[t.strip().lower()]

        t = t.replace("[", "(").replace("]", ")")

        t = re.sub(r" \(as made famous by .*?\)", "", t)
        t = re.sub(r" \(originally by .*?\)", "", t)
        t = re.sub(r" \(.*?Remaster.*?\)", "", t)

        for s in malojaconfig["REMOVE_FROM_TITLE"]:
            if s in t:
                t = t.replace(s, "")

        t = t.strip()
        return t

    def parseTitleForArtists(self, t):
        """Extract featured artists that are mentioned in the title.

        Returns ``(title, artists)``: the title with the recognised
        "feat"-style parts removed, and the list of artists found in them
        plus those added by artistintitle rules.
        """
        for d in self.delimiters_feat:
            # Delimiters are literal text, not regular expressions.
            delim = re.escape(d)

            # "Title (feat. Artist)"
            if re.match(r"(.*) \(" + delim + r" (.*?)\)", t) is not None:
                (title, artists) = self.parseTitleForArtists(re.sub(r"(.*) \(" + delim + r" (.*?)\)", r"\1", t))
                artists += self.parseArtists(re.sub(r"(.*) \(" + delim + r" (.*?)\).*", r"\2", t))
                return (title, artists)
            # "Title - feat. Artist"
            if re.match(r"(.*) - " + delim + r" (.*)", t) is not None:
                (title, artists) = self.parseTitleForArtists(re.sub(r"(.*) - " + delim + r" (.*)", r"\1", t))
                artists += self.parseArtists(re.sub(r"(.*) - " + delim + r" (.*).*", r"\2", t))
                return (title, artists)
            # "Title feat. Artist"
            if re.match(r"(.*) " + delim + r" (.*)", t) is not None:
                (title, artists) = self.parseTitleForArtists(re.sub(r"(.*) " + delim + r" (.*)", r"\1", t))
                artists += self.parseArtists(re.sub(r"(.*) " + delim + r" (.*).*", r"\2", t))
                return (title, artists)

        artists = []
        lowered = t.lower()
        for st in self.rules_artistintitle:
            if st in lowered:
                artists += self.rules_artistintitle[st].split("␟")
        return (t, artists)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def flatten(lis):
    """Flatten one level of nesting and drop duplicates.

    Strings in lis are taken as single items; every other element is
    treated as a collection whose members are added individually. The
    result is a list of the unique items, in no particular order.
    """
    collected = []

    for item in lis:
        if isinstance(item, str):
            collected.append(item)
            continue
        collected.extend(item)

    return list(set(collected))
|