1
0
mirror of https://github.com/krateng/maloja.git synced 2023-08-10 21:12:55 +03:00

Implemented custom rules

This commit is contained in:
Krateng 2018-11-28 17:45:52 +01:00
parent 144198f933
commit 54bffc5642
5 changed files with 163 additions and 67 deletions

View File

@ -1,74 +1,112 @@
import re import re
import utilities
def fullclean(artist,title): # need to do this as a class so it can retain loaded settings from file
artists = parseArtists(removespecial(artist)) class CleanerAgent:
title = parseTitle(removespecial(title))
(title,moreartists) = parseTitleForArtists(title)
artists += moreartists
return (list(set(artists)),title) def __init__(self):
self.updateRules()
def removespecial(s):
return s.replace("\t","").replace("","").replace("\n","")
delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field
delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them
delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
def parseArtists(a):
if a.strip() == "":
return []
for d in delimiters_feat: def updateRules(self):
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None: raw = utilities.parseAllTSV("rules","string","string","string")
return parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a)) self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
for d in (delimiters + delimiters_feat): self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"}
if ((" " + d + " ") in a): self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"}
ls = []
for i in a.split(" " + d + " "):
ls += parseArtists(i)
return ls
for d in delimiters_formal:
if (d in a):
ls = [] def fullclean(self,artist,title):
for i in a.split(d): artists = self.parseArtists(self.removespecial(artist))
ls += parseArtists(i) title = self.parseTitle(self.removespecial(title))
return ls (title,moreartists) = self.parseTitleForArtists(title)
artists += moreartists
return (list(set(artists)),title)
def removespecial(self,s):
return s.replace("\t","").replace("","").replace("\n","")
delimiters_feat = ["ft.","ft","feat.","feat","featuring"] #Delimiters used for extra artists, even when in the title field
delimiters = ["vs.","vs","&"] #Delimiters in informal titles, spaces expected around them
delimiters_formal = ["; ",";"] #Delimiters used specifically to tag multiple artists when only one tag field is available, no spaces used
def parseArtists(self,a):
if a.strip() == "":
return []
if a.strip() in self.rules_belongtogether:
return [a.strip()]
if a.strip() in self.rules_replaceartist:
return [self.rules_replaceartist[a.strip()]]
return [a.strip()]
for d in self.delimiters_feat:
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
for d in (self.delimiters_feat + self.delimiters):
if ((" " + d + " ") in a):
ls = []
for i in a.split(" " + d + " "):
ls += self.parseArtists(i)
return ls
for d in self.delimiters_formal:
if (d in a):
ls = []
for i in a.split(d):
ls += self.parseArtists(i)
return ls
return [a.strip()]
def parseTitle(t): def parseTitle(self,t):
t = t.replace("[","(").replace("]",")")
if t.strip() in self.rules_replacetitle:
return self.rules_replacetitle[t.strip()]
t = re.sub(r" \(as made famous by .*?\)","",t) t = t.replace("[","(").replace("]",")")
t = re.sub(r" \(originally by .*?\)","",t)
t = re.sub(r" \(as made famous by .*?\)","",t)
return t t = re.sub(r" \(originally by .*?\)","",t)
return t.strip()
def parseTitleForArtists(t): def parseTitleForArtists(self,t):
for d in delimiters_feat: for d in self.delimiters_feat:
if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None: if re.match(r"(.*) \(" + d + " (.*?)\)",t) is not None:
(title,artists) = parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t)) (title,artists) = self.parseTitleForArtists(re.sub(r"(.*) \(" + d + " (.*?)\)",r"\1",t))
artists += parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t)) artists += self.parseArtists(re.sub(r"(.*) \(" + d + " (.*?)\).*",r"\2",t))
return (title,artists) return (title,artists)
return (t,[]) return (t,[])
def flatten(lis): def flatten(lis):
newlist = [] newlist = []
for l in lis: for l in lis:
if isinstance(l, str): if isinstance(l, str):
newlist.append(l) newlist.append(l)
else: else:
newlist = newlist + l newlist = newlist + l
return list(set(newlist)) return list(set(newlist))

View File

@ -4,7 +4,7 @@ import urllib
import waitress import waitress
import os import os
import datetime import datetime
import cleanup from cleanup import *
import sys import sys
@ -12,6 +12,8 @@ SCROBBLES = [] # Format: tuple(track_ref,timestamp,saved)
ARTISTS = [] # Format: artist ARTISTS = [] # Format: artist
TRACKS = [] # Format: tuple(frozenset(artist_ref,...),title) TRACKS = [] # Format: tuple(frozenset(artist_ref,...),title)
c = CleanerAgent()
lastsync = 0 lastsync = 0
@ -118,11 +120,12 @@ def post_scrobble():
#title = urllib.parse.unquote(keys.get("title")) #title = urllib.parse.unquote(keys.get("title"))
artists = keys.get("artist") artists = keys.get("artist")
title = keys.get("title") title = keys.get("title")
time = int(keys.get("time")) try:
(artists,title) = cleanup.fullclean(artists,title) time = int(keys.get("time"))
if time is None: except:
time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp())
(artists,title) = c.fullclean(artists,title)
## this is necessary for localhost testing ## this is necessary for localhost testing
response.set_header("Access-Control-Allow-Origin","*") response.set_header("Access-Control-Allow-Origin","*")

View File

@ -1,9 +1,12 @@
import sys, os, datetime, re, cleanup import sys, os, datetime, re, cleanup
from cleanup import *
log = open(sys.argv[1]) log = open(sys.argv[1])
outputlog = open(sys.argv[2],"a") outputlog = open(sys.argv[2],"a")
c = CleanerAgent()
for l in log: for l in log:
l = l.replace("\n","") l = l.replace("\n","")
data = l.split(",") data = l.split(",")
@ -13,8 +16,8 @@ for l in log:
title = data[2] title = data[2]
time = data[3] time = data[3]
(artists,title) = cleanup.fullclean(artist,title) (artists,title) = c.fullclean(artist,title)
artistsstr = "".join(artists) artistsstr = "".join(artists)

View File

@ -7,8 +7,8 @@
### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist ### countas: defines an artist that should be counted together with another artist for chart statistics etc. This will not change the separation in the database and all effects of this rule will disappear as soon as it is no longer active. Second column is the artist, third column the replacement artist
### ###
### THE RULES IN THIS EXAMPLE FILE ARE IGNORED ### THE RULES IN THIS EXAMPLE FILE ARE IGNORED
notanartist In Dreams #notanartist In Dreams
belongtogether Darth & Vader #belongtogether Darth & Vader
replacetitle 첫 사랑니 (Rum Pum Pum Pum) Rum Pum Pum Pum #replacetitle 첫 사랑니 (Rum Pum Pum Pum) Rum Pum Pum Pum
replaceartist Dal Shabet Dal★Shabet #replaceartist Dal Shabet Dal★Shabet
countas Trouble Maker HyunA #countas Trouble Maker HyunA

Can't render this file because it contains an unexpected character in line 3 and column 58.

52
utilities.py Normal file
View File

@ -0,0 +1,52 @@
def parseTSV(filename,*args,list_separator="\u241f"):
    """Read a tab-separated file and return its rows as typed lists.

    Lines that start with ``#`` or are blank are skipped. On every other
    line, everything from the first ``#`` onward is dropped as an inline
    comment. Consecutive tabs count as a single separator, so an empty
    field can only be expressed by leaving trailing columns out.

    Args:
        filename: Path of the file to read.
        *args: One type name per column, in column order. Recognised
            names are "list", "string", "int" and "bool"; any other name
            contributes no value to the row.
        list_separator: String that splits a "list" column into items.
            NOTE(review): the original code split on an empty string,
            which always raises ValueError and therefore always produced
            an empty list. The default here assumes the intended
            separator was the unit-separator symbol (U+241F) - confirm
            against the data files.

    Returns:
        A list of rows. Each row is a list holding one converted value
        per recognised column type. A missing column yields the type's
        default ([], "", 0 or False); an unparsable "int" yields 0.
    """
    truthy = ("true","yes","1","y")
    result = []

    # Explicit encoding: the rule files can contain non-ASCII names, so the
    # result must not depend on the platform's default encoding.
    with open(filename,encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or line.strip() == "":
                continue

            line = line.replace("\n","").split("#")[0]
            # Multiple tabs are okay, we don't accept empty fields unless trailing
            data = [field for field in line.split("\t") if field]

            entry = []
            for i, kind in enumerate(args):
                raw = data[i] if i < len(data) else None  # None marks a missing column
                if kind == "list":
                    entry.append([] if raw is None else raw.split(list_separator))
                elif kind == "string":
                    entry.append("" if raw is None else raw)
                elif kind == "int":
                    try:
                        entry.append(int(raw))
                    except (TypeError, ValueError):
                        # TypeError: column missing; ValueError: not a number
                        entry.append(0)
                elif kind == "bool":
                    entry.append(raw is not None and raw.lower() in truthy)

            result.append(entry)

    return result
def parseAllTSV(path,*args):
    """Parse every ``.tsv`` file in a directory and concatenate the rows.

    Args:
        path: Directory to scan (not recursive).
        *args: Column type names, passed through unchanged to parseTSV.

    Returns:
        The rows of all matching files as one list, files taken in sorted
        name order.
    """
    import os

    result = []
    # os.listdir returns names in arbitrary order; sorting makes the merged
    # row order (and thus which duplicate wins downstream) reproducible.
    for name in sorted(os.listdir(path)):
        full = os.path.join(path,name)
        # Match on the real extension and on regular files only, so names
        # like "rules.tsv.bak" or a directory called "old.tsv" are ignored.
        if name.endswith(".tsv") and os.path.isfile(full):
            result += parseTSV(full,*args)
    return result