maloja/maloja/cleanup.py

207 lines
6.4 KiB
Python
Raw Permalink Normal View History

2018-11-24 18:29:24 +03:00
import re
import os
import csv
from .pkg_global.conf import data_dir, malojaconfig
2018-11-24 18:29:24 +03:00
2018-11-28 19:45:52 +03:00
# Implemented as a class so the agent can retain the rule settings loaded from file.
# (Author's later note: it turned out that a class is not actually required for this.)
2018-11-28 19:45:52 +03:00
class CleanerAgent:
    """Clean up raw artist and title strings using heuristics and rule files.

    Rules are read from every ``*.tsv`` file in the rules directory (see
    ``updateRules``) and kept on the instance as ``rules_*`` attributes.
    """

    # NOTE(review): in the reviewed text every multi-value separator literal was
    # the empty string, which makes ``str.split`` raise ValueError("empty
    # separator"). The character was evidently lost in transit; it is restored
    # here as U+241F (SYMBOL FOR UNIT SEPARATOR) - confirm against the rule files.
    _MULTI_VALUE_SEPARATOR = "\u241f"

    # Delimiters used for extra artists, even when in the title field
    # (formerly hard-coded, e.g. "ft.", "feat.", "featuring").
    delimiters_feat = malojaconfig["DELIMITERS_FEAT"]
    # Delimiters in informal artist strings, spaces expected around them
    # (formerly hard-coded as "vs.", "vs", "&").
    delimiters = malojaconfig["DELIMITERS_INFORMAL"]
    # Delimiters used specifically to tag multiple artists when only one tag
    # field is available, no spaces used (formerly hard-coded as "; ", ";", "/").
    delimiters_formal = malojaconfig["DELIMITERS_FORMAL"]

    def __init__(self):
        """Create the agent and load all rules right away."""
        self.updateRules()

    def updateRules(self):
        """(Re)load all rules from the ``.tsv`` files in the rules directory.

        Each row is tab separated; rows whose first column starts with ``#``
        are skipped and empty columns are dropped. The first remaining column
        names the rule type, the following ones are its arguments. Rows that
        end up empty or that lack the columns their rule type needs are
        ignored instead of aborting the whole load with an IndexError.
        """
        rawrules = []
        for f in os.listdir(data_dir["rules"]()):
            if f.split('.')[-1].lower() != 'tsv':
                continue
            filepath = data_dir["rules"](f)
            # Rule files may contain non-ASCII characters (the multi-value
            # separator), so do not depend on the locale's default encoding.
            # newline="" is what the csv module expects for file objects.
            with open(filepath, 'r', encoding="utf-8", newline="") as filed:
                reader = csv.reader(filed, delimiter="\t")
                for entry in reader:
                    if not entry or entry[0].startswith('#'):
                        continue
                    rule = [col for col in entry if col]
                    if rule:
                        rawrules.append(rule)

        def of_type(name, columns):
            # All rules of one type that carry at least `columns` columns.
            return [r for r in rawrules if r[0] == name and len(r) >= columns]

        self.rules_belongtogether = [r[1] for r in of_type("belongtogether", 2)]
        self.rules_notanartist = [r[1] for r in of_type("notanartist", 2)]
        self.rules_replacetitle = {r[1].lower(): r[2] for r in of_type("replacetitle", 3)}
        self.rules_replaceartist = {r[1].lower(): r[2] for r in of_type("replaceartist", 3)}
        self.rules_ignoreartist = [r[1].lower() for r in of_type("ignoreartist", 2)]
        self.rules_addartists = {r[2].lower(): (r[1].lower(), r[3]) for r in of_type("addartists", 4)}
        self.rules_fixartists = {r[2].lower(): r[1] for r in of_type("fixartists", 3)}
        self.rules_artistintitle = {r[1].lower(): r[2] for r in of_type("artistintitle", 3)}

    def fullclean(self, artist, title):
        """Clean an artist string (or list of them) together with a title.

        Returns a tuple ``(artists, title)`` where ``artists`` is a sorted list
        without duplicates and ``title`` is the cleaned, stripped title.
        """
        separator = self._MULTI_VALUE_SEPARATOR

        artists = self.parseArtists(self.removespecial(artist))
        title = self.parseTitle(self.removespecial(title))
        title, moreartists = self.parseTitleForArtists(title)
        artists += moreartists

        title_key = title.lower()
        known = {a.lower() for a in artists}
        if title_key in self.rules_addartists:
            # Add the listed artists only if all required ones are present.
            reqartists, allartists = self.rules_addartists[title_key]
            if set(reqartists.split(separator)).issubset(known):
                artists += allartists.split(separator)
        elif title_key in self.rules_fixartists:
            # Replace the artists entirely if at least one of them matches.
            allartists = self.rules_fixartists[title_key].split(separator)
            if {a.lower() for a in allartists} & known:
                artists = allartists

        return (sorted(set(artists)), title.strip())

    def removespecial(self, s):
        """Strip tabs, newlines and the multi-value separator from ``s``.

        Runs of spaces are collapsed to a single space. Lists and tuples are
        processed element-wise and returned as a list.
        """
        if isinstance(s, (list, tuple)):
            return [self.removespecial(se) for se in s]
        for char in ("\t", self._MULTI_VALUE_SEPARATOR, "\n"):
            s = s.replace(char, "")
        return re.sub(" +", " ", s)

    # if an artist appears in any created rule, we can assume that artist is
    # meant to exist and be spelled like that
    def confirmedReal(self, a):
        """Return True if ``a`` is named by a belongtogether rule or is the
        replacement value of a replaceartist rule."""
        return a in self.rules_belongtogether or a in self.rules_replaceartist.values()

    def parseArtists(self, a):
        """Split an artist string (or list/tuple of them) into artist names.

        Returns a list of names; it is empty when the input is blank, invalid
        or ignored by a rule.
        """
        if isinstance(a, (list, tuple)):
            return [artist for entry in a for artist in self.parseArtists(entry)]

        stripped = a.strip()
        lowered = stripped.lower()

        if stripped == "" or stripped in malojaconfig["INVALID_ARTISTS"]:
            return []
        if lowered in self.rules_ignoreartist or stripped in self.rules_notanartist:
            return []

        if " performing " in a.lower():
            # Split case-insensitively, like the check above. Splitting only on
            # "[Pp]erforming" left e.g. "X PERFORMING Y" unchanged and recursed
            # on the same string forever.
            return self.parseArtists(
                re.split(r" performing", a, maxsplit=1, flags=re.IGNORECASE)[0]
            )

        if stripped in self.rules_belongtogether:
            return [stripped]
        if lowered in self.rules_replaceartist:
            return self.rules_replaceartist[lowered].split(self._MULTI_VALUE_SEPARATOR)

        # "Artist (feat. Other)": parse both sides of the bracketed delimiter.
        for d in self.delimiters_feat:
            # The delimiter is literal text (e.g. "ft."), so escape it.
            bracketed = re.compile(
                r"(.*) [\(\[]" + re.escape(d) + r" (.*)[\)\]]", flags=re.IGNORECASE
            )
            if bracketed.match(a) is not None:
                return self.parseArtists(bracketed.sub(r"\1", a)) + \
                    self.parseArtists(bracketed.sub(r"\2", a))

        for d in (self.delimiters_feat + self.delimiters):
            spaced = " " + d + " "
            if spaced in a:
                return [artist for part in a.split(spaced) for artist in self.parseArtists(part)]

        for d in self.delimiters_formal:
            if d in a:
                return [artist for part in a.split(d) for artist in self.parseArtists(part)]

        return [stripped]

    def parseTitle(self, t):
        """Return the cleaned title.

        A matching replacetitle rule wins outright. Otherwise square brackets
        become round ones, "as made famous by" / "originally by" / remaster
        annotations are removed, and every configured REMOVE_FROM_TITLE
        substring is dropped.
        """
        key = t.strip().lower()
        if key in self.rules_replacetitle:
            return self.rules_replacetitle[key]

        t = t.replace("[", "(").replace("]", ")")

        # we'll leave these matching all bracket types so future changes
        # won't require readaption
        t = re.sub(r" [\(\[]as made famous by .*?[\)\]]", "", t)
        t = re.sub(r" [\(\[]originally by .*?[\)\]]", "", t)
        t = re.sub(r" [\(\[].*?Remaster.*?[\)\]]", "", t)

        for s in malojaconfig["REMOVE_FROM_TITLE"]:
            t = t.replace(s, "")

        return t.strip()

    def parseTitleForArtists(self, title):
        """Extract artists that are mentioned inside the title.

        Featured-artist annotations are removed from the title; remix credits
        (only when PARSE_REMIX_ARTISTS is enabled) and artistintitle rules add
        artists without changing it. Returns ``(title, artists)``.
        """
        artists = []

        for delimiter in malojaconfig["DELIMITERS_FEAT"]:
            escaped = re.escape(delimiter)
            for pattern in (
                r" [\(\[]" + escaped + r" (.*?)[\)\]]",
                r" - " + escaped + r" (.*)",
                r" " + escaped + r" (.*)",
            ):
                regex = re.compile(pattern, flags=re.IGNORECASE)
                found = list(regex.finditer(title))
                if found:
                    for match in found:
                        artists += self.parseArtists(match.group(1))  # Parse matched artist string
                    title = regex.sub('', title)  # Remove matched part

        if malojaconfig["PARSE_REMIX_ARTISTS"]:
            for remix_filter in malojaconfig["FILTERS_REMIX"]:
                escaped = re.escape(remix_filter)
                for pattern in (
                    r" [\(\[](.*)" + escaped + r"[\)\]]",  # match remix in brackets
                    r" - (.*)" + escaped,  # match remix split with "-"
                ):
                    match = re.search(pattern, title, flags=re.IGNORECASE)
                    if match:
                        # title stays the same
                        artists += self.parseArtists(match.group(1))

        lowered_title = title.lower()
        for st, rule_artists in self.rules_artistintitle.items():
            if st in lowered_title:
                artists += rule_artists.split(self._MULTI_VALUE_SEPARATOR)

        return (title, artists)
2018-12-04 20:43:48 +03:00
2019-03-29 22:23:32 +03:00
2018-11-24 18:29:24 +03:00
def flatten(lis):
    """Return the distinct values of ``lis`` with nested iterables expanded.

    Strings are taken as single values; every other element is expected to be
    an iterable whose items are added individually (one level only). The result
    is a list without duplicates; because it is built from a set, its order is
    not specified.
    """
    newlist = []

    for item in lis:
        if isinstance(item, str):
            newlist.append(item)
        else:
            newlist.extend(item)

    return list(set(newlist))