2018-11-24 18:29:24 +03:00
import re
2018-11-28 19:45:52 +03:00
import utilities
2019-03-29 22:23:32 +03:00
from doreah import tsv
2018-11-24 18:29:24 +03:00
2018-11-28 19:45:52 +03:00
# NOTE: the cleaner was written as a class on the assumption that only an
# instance could retain the settings loaded from file. That assumption turned
# out to be wrong, but the class-based structure has been kept.
2018-11-28 19:45:52 +03:00
class CleanerAgent:
    """Clean up raw artist and title strings according to the user's rules.

    The rules are read from the "rules" TSV data in updateRules() and kept on
    the instance as rules_belongtogether, rules_notanartist,
    rules_replacetitle and rules_replaceartist.

    NOTE(review): this class was reconstructed from a copy whose whitespace
    was mangled, including inside string literals. The whitespace within the
    literals below (delimiters, regex patterns, replacement strings) is a
    best-effort reconstruction - confirm against version control.
    """

    # Delimiters used for extra artists, even when in the title field
    delimiters_feat = ["ft.", "ft", "feat.", "feat", "featuring",
                       "Ft.", "Ft", "Feat.", "Feat", "Featuring"]
    # Delimiters in informal artist strings, spaces expected around them
    delimiters = ["vs.", "vs", "&"]
    # Delimiters used specifically to tag multiple artists when only one tag
    # field is available, no spaces used
    delimiters_formal = ["; ", ";", "/"]

    def __init__(self):
        self.updateRules()

    def updateRules(self):
        """(Re)load all cleaning rules and refresh the rules checksum."""
        raw = tsv.parse_all("rules", "string", "string", "string")
        self.rules_belongtogether = [b for [a, b, c] in raw if a == "belongtogether"]
        self.rules_notanartist = [b for [a, b, c] in raw if a == "notanartist"]
        # replacement rules are looked up case-insensitively, so keys are lowercased
        self.rules_replacetitle = {b.lower(): c for [a, b, c] in raw if a == "replacetitle"}
        self.rules_replaceartist = {b.lower(): c for [a, b, c] in raw if a == "replaceartist"}

        # we always need to be able to tell if our current database is made with the current rules
        self.checksums = utilities.checksumTSV("rules")

    def fullclean(self, artist, title):
        """Clean an artist string and a title string together.

        Returns a tuple (artists, title) where artists is a sorted list
        without duplicates, including artists that were found in the title.
        """
        artists = self.parseArtists(self.removespecial(artist))
        title = self.parseTitle(self.removespecial(title))
        title, moreartists = self.parseTitleForArtists(title)

        artists = sorted(set(artists + moreartists))
        return (artists, title)

    def removespecial(self, s):
        """Strip tabs, unit separators and newlines and collapse runs of spaces."""
        s = s.replace("\t", "").replace("␟", "").replace("\n", "")
        s = re.sub(" +", " ", s)
        return s

    # if an artist appears in any created rule, we can assume that artist is
    # meant to exist and be spelled like that
    def confirmedReal(self, a):
        """Return True if a is named by a belongtogether or replaceartist rule."""
        confirmed = self.rules_belongtogether + list(self.rules_replaceartist.values())
        return a in confirmed

    def _parseEach(self, parts):
        """Parse every string in parts and concatenate the resulting artist lists."""
        found = []
        for part in parts:
            found += self.parseArtists(part)
        return found

    def parseArtists(self, a):
        """Split a raw artist string into a list of individual artist names."""
        name = a.strip()

        if name == "":
            return []

        if name in self.rules_notanartist:
            return []

        if " performing " in a.lower():
            # The split must be as case-insensitive as the check above.
            # Otherwise e.g. "X PERFORMING Y" passes the check, is returned
            # unsplit and recurses forever.
            head = re.split(" performing", a, flags=re.IGNORECASE)[0]
            return self.parseArtists(head)

        if name in self.rules_belongtogether:
            return [name]
        key = name.lower()
        if key in self.rules_replaceartist:
            return self.rules_replaceartist[key].split("␟")

        for d in self.delimiters_feat:
            # re.escape: the "." in "ft." / "feat." is literal, not a wildcard
            pattern = re.compile(r"(.*) \(" + re.escape(d) + r" (.*)\)")
            if pattern.match(a) is not None:
                return (self.parseArtists(pattern.sub(r"\1", a))
                        + self.parseArtists(pattern.sub(r"\2", a)))

        for d in self.delimiters_formal:
            if d in a:
                return self._parseEach(a.split(d))

        for d in self.delimiters_feat + self.delimiters:
            spaced = " " + d + " "
            if spaced in a:
                return self._parseEach(a.split(spaced))

        return [name]

    def parseTitle(self, t):
        """Apply title replacement rules and strip cover/remaster annotations."""
        key = t.strip().lower()
        if key in self.rules_replacetitle:
            return self.rules_replacetitle[key]

        # treat square brackets like parentheses from here on
        t = t.replace("[", "(").replace("]", ")")

        t = re.sub(r" \(as made famous by .*?\)", "", t)
        t = re.sub(r" \(originally by .*?\)", "", t)
        # [^()]* keeps the match inside one parenthetical. With ".*?" the
        # match could start at an earlier "(" and swallow e.g. "(feat. X)".
        t = re.sub(r" \([^()]*Remaster[^()]*\)", "", t)

        return t.strip()

    def parseTitleForArtists(self, t):
        """Extract featured artists from a title.

        Returns a tuple (title, artists): the title with the first matched
        "feat" section replaced by what preceded it, and the list of artists
        found in such sections.
        """
        for d in self.delimiters_feat:
            escaped = re.escape(d)
            # Tried in this order for every delimiter:
            # "Title (feat X)", "Title - feat X", "Title feat X"
            patterns = (
                r"(.*) \(" + escaped + r" (.*?)\)",
                r"(.*) - " + escaped + r" (.*)",
                r"(.*) " + escaped + r" (.*)",
            )
            for pattern in patterns:
                if re.match(pattern, t) is not None:
                    title, artists = self.parseTitleForArtists(re.sub(pattern, r"\1", t))
                    # the trailing ".*" drops anything after the matched
                    # section so only the artist part is left
                    artists += self.parseArtists(re.sub(pattern + ".*", r"\2", t))
                    return (title, artists)

        return (t, [])
2018-12-04 20:43:48 +03:00
2019-03-29 22:23:32 +03:00
# CollectorAgent handles all the changes applied at runtime (e.g. counting Trouble Maker as HyunA for charts)
2018-12-04 20:43:48 +03:00
class CollectorAgent:
    """Apply "countas" rules at runtime (crediting one artist as another).

    rules_countas maps an artist to the artist they are counted as.
    rules_include is the reverse mapping: credited artist -> list of artists
    counted towards them. rules_countas_id is the same mapping as
    rules_countas, expressed as positions in the artist list given to
    updateIDs().
    """

    # Default so getCredited() works before updateIDs() has been called.
    # updateIDs() rebinds this on the instance; the shared dict is never mutated.
    rules_countas_id = {}

    def __init__(self):
        self.updateRules()

    def updateRules(self):
        """(Re)load the countas rules and rebuild the reverse mapping."""
        raw = tsv.parse_all("rules", "string", "string", "string")
        self.rules_countas = {b: c for [a, b, c] in raw if a == "countas"}
        # Redundant with rules_countas, but saves scanning it on every
        # getAllAssociated() call.
        self.rules_include = {}
        for artist, credited in self.rules_countas.items():
            self.rules_include.setdefault(credited, []).append(artist)

    # this agent needs to be aware of the current id assignment in the main
    # program. unelegant, but the best way i can think of
    def updateIDs(self, artistlist):
        """Rebuild the id-based countas mapping from positions in artistlist.

        Raises ValueError (from list.index) if an artist named in a countas
        rule is not present in artistlist.
        """
        self.rules_countas_id = {
            artistlist.index(artist): artistlist.index(credited)
            for artist, credited in self.rules_countas.items()
        }
        # TODO: an id-based version of rules_include would need to take the
        # lists into account

    def getCredited(self, artist):
        """Return the artist (id or name) that artist is counted as.

        Ids known from updateIDs() are resolved first, then names; anything
        without a countas rule is returned unchanged.
        """
        if artist in self.rules_countas_id:
            return self.rules_countas_id[artist]
        return self.rules_countas.get(artist, artist)

    def getCreditedList(self, artists):
        """Return the credited artists for artists, without duplicates (unordered)."""
        return list({self.getCredited(artist) for artist in artists})

    def getAllAssociated(self, artist):
        """Return the artists that are counted towards artist ([] if none)."""
        return self.rules_include.get(artist, [])

    # this function is there to check for artists that we should include in
    # the database even though they never have any scrobble. important to
    # avoid bugs when countas rules are declared preemptively
    def getAllArtists(self):
        """Return every artist named on either side of a countas rule (unordered)."""
        return list(set(self.rules_countas) | set(self.rules_countas.values()))
2019-03-29 22:23:32 +03:00
2018-11-24 18:29:24 +03:00
def flatten(lis):
    """Flatten one level of nesting and drop duplicates.

    Each element of lis is either a string, which is kept as a single item,
    or an iterable of items, which are added individually. The order of the
    returned list is unspecified because duplicates are removed through a set.
    """
    collected = []

    for item in lis:
        if isinstance(item, str):
            collected.append(item)
        else:
            # extend() grows the list in place (no quadratic re-copying) and
            # accepts any iterable, not just lists
            collected.extend(item)

    return list(set(collected))