1
0
mirror of https://github.com/krateng/maloja.git synced 2023-08-10 21:12:55 +03:00
maloja/cleanup.py

75 lines
1.9 KiB
Python
Raw Normal View History

2018-11-24 18:29:24 +03:00
import re
2018-11-26 18:21:07 +03:00
def fullclean(artist,title):
    """Clean up a raw artist string and a raw title string.

    Both inputs first have special characters removed. The artist string is
    split into individual artists, the title is normalised, and any artists
    credited inside the title are moved over to the artist list.

    Returns a tuple ``(artists, title)`` where ``artists`` is a list without
    duplicates (in no particular order) and ``title`` is the cleaned title.
    """
    credited = parseArtists(removespecial(artist))
    normalised = parseTitle(removespecial(title))

    # Featuring notes inside the title contribute further artists
    (cleantitle,featured) = parseTitleForArtists(normalised)
    credited.extend(featured)

    unique_artists = list(set(credited))
    return (unique_artists,cleantitle)
2018-11-26 18:21:07 +03:00
def removespecial(s):
    """Return *s* with tab and newline characters removed."""
    # NOTE(review): the second entry is an empty search string, which leaves
    # the text unchanged. Possibly a separator character was lost here - confirm.
    cleaned = s
    for unwanted in ("\t","","\n"):
        cleaned = cleaned.replace(unwanted,"")
    return cleaned
2018-11-24 18:29:24 +03:00
2018-11-28 17:33:30 +03:00
# Markers that introduce extra (featured) artists; these are also looked for in the title field
delimiters_feat = [
    "ft.",
    "ft",
    "feat.",
    "feat",
    "featuring",
]

# Separators between artists in informal credits; only matched with a space on each side
delimiters = [
    "vs.",
    "vs",
    "&",
]

# Separators used to tag several artists when only one tag field is available; matched without surrounding spaces
delimiters_formal = [
    "; ",
    ";",
]
2018-11-24 18:29:24 +03:00
2018-11-28 17:33:30 +03:00
def parseArtists(a):
    """Split a raw artist credit into a list of individual artist names.

    Three kinds of separators are handled, in this order:

    1. a parenthesised featuring note such as ``Artist (feat. Guest)``,
    2. an informal or featuring delimiter with a space on each side
       (``A & B``, ``A ft. B``),
    3. a formal delimiter without surrounding spaces (``A;B``).

    Every resulting piece is parsed again, so combinations are resolved
    recursively. Names are stripped of surrounding whitespace. An empty or
    whitespace-only input yields an empty list. Duplicates are not removed.
    """
    if not a.strip():
        return []

    for d in delimiters_feat:
        # re.escape keeps the "." of "ft." / "feat." literal; unescaped it
        # would match any character, so e.g. "A (ftx B)" was split as well.
        pattern = re.compile(r"(.*) \(" + re.escape(d) + r" (.*)\)")
        if pattern.match(a) is not None:
            # Substituting (instead of reading the groups) keeps any text that
            # follows the closing parenthesis attached to both halves.
            main_part = pattern.sub(r"\1",a)
            featured_part = pattern.sub(r"\2",a)
            return parseArtists(main_part) + parseArtists(featured_part)

    for d in (delimiters + delimiters_feat):
        separator = " " + d + " "
        if separator in a:
            ls = []
            for part in a.split(separator):
                ls += parseArtists(part)
            return ls

    for d in delimiters_formal:
        if d in a:
            ls = []
            for part in a.split(d):
                ls += parseArtists(part)
            return ls

    # No separator left: this is a single artist
    return [a.strip()]
2018-11-24 18:29:24 +03:00
2018-11-28 17:33:30 +03:00
def parseTitle(t):
    """Normalise a track title.

    Square brackets are turned into round ones, and the cover notes
    `` (as made famous by ...)`` and `` (originally by ...)`` are removed.
    """
    normalised = t.translate(str.maketrans("[]","()"))

    # Notes are removed one after the other, each up to its first closing parenthesis
    for note in (r" \(as made famous by .*?\)", r" \(originally by .*?\)"):
        normalised = re.sub(note,"",normalised)

    return normalised
def parseTitleForArtists(t):
    """Pull artists credited in a parenthesised featuring note out of a title.

    Returns a tuple ``(title, artists)``: the title with every
    `` (<featuring delimiter> ...)`` note removed, and the list of artists
    named in those notes, split up by parseArtists. When the title holds no
    such note, the result is ``(t, [])``.
    """
    for d in delimiters_feat:
        # re.escape keeps the "." of "ft." / "feat." literal; unescaped it
        # would match any character and treat e.g. "(ftw mix)" as a note.
        note = r"(.*) \(" + re.escape(d) + r" (.*?)\)"
        if re.match(note,t) is not None:
            # The remaining title may hold further notes, so it is parsed again
            (title,artists) = parseTitleForArtists(re.sub(note,r"\1",t))
            # The trailing ".*" drops whatever follows the note, leaving only its content
            artists += parseArtists(re.sub(note + r".*",r"\2",t))
            return (title,artists)

    return (t,[])
2018-11-24 18:29:24 +03:00
def flatten(lis):
    """Flatten one level of nesting and remove duplicates.

    Each entry of *lis* is either a string, which is taken as a single item,
    or an iterable of items, whose members are taken individually. The
    result is a list of the distinct items in no particular order.
    """
    newlist = []
    for entry in lis:
        if isinstance(entry, str):
            newlist.append(entry)
        else:
            # extend() grows the list in place instead of copying it for every entry
            newlist.extend(entry)
    return list(set(newlist))