From f9672a918d5536ea25c4f9850b3f21c9046c2a17 Mon Sep 17 00:00:00 2001 From: Krateng Date: Sun, 22 Dec 2019 15:13:51 +0100 Subject: [PATCH] Improved and refactored name normalization --- maloja/database.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/maloja/database.py b/maloja/database.py index 0448cde..56034a0 100644 --- a/maloja/database.py +++ b/maloja/database.py @@ -49,10 +49,10 @@ Scrobble = namedtuple("Scrobble",["track","timestamp","saved"]) SCROBBLESDICT = {} # timestamps to scrobble mapping STAMPS = [] # sorted #STAMPS_SET = set() # as set for easier check if exists # we use the scrobbles dict for that now -TRACKS_LOWER = [] -ARTISTS_LOWER = [] -ARTIST_SET = set() -TRACK_SET = set() +TRACKS_NORMALIZED = [] +ARTISTS_NORMALIZED = [] +ARTISTS_NORMALIZED_SET = set() +TRACKS_NORMALIZED_SET = set() MEDALS = {} #literally only changes once per year, no need to calculate that on the fly MEDALS_TRACKS = {} @@ -173,16 +173,16 @@ def readScrobble(artists,title,time): def getArtistID(name): obj = name - objlower = name.lower().replace("'","") + obj_normalized = normalize_name(name) - if objlower in ARTIST_SET: - return ARTISTS_LOWER.index(objlower) + if obj_normalized in ARTISTS_NORMALIZED_SET: + return ARTISTS_NORMALIZED.index(obj_normalized) else: i = len(ARTISTS) ARTISTS.append(obj) - ARTIST_SET.add(objlower) - ARTISTS_LOWER.append(objlower) + ARTISTS_NORMALIZED_SET.add(obj_normalized) + ARTISTS_NORMALIZED.append(obj_normalized) # with a new artist added, we might also get new artists that they are credited as cr = coa.getCredited(name) @@ -197,20 +197,22 @@ def getTrackID(artists,title): for a in artists: artistset.add(getArtistID(name=a)) obj = Track(artists=frozenset(artistset),title=title) - objlower = Track(artists=frozenset(artistset),title=title.lower().replace("'","")) + obj_normalized = Track(artists=frozenset(artistset),title=normalize_name(title)) - if objlower in TRACK_SET: - return TRACKS_LOWER.index(objlower) + if obj_normalized in TRACKS_NORMALIZED_SET: + return TRACKS_NORMALIZED.index(obj_normalized) else: i = len(TRACKS) TRACKS.append(obj) - TRACK_SET.add(objlower) - TRACKS_LOWER.append(objlower) + TRACKS_NORMALIZED_SET.add(obj_normalized) + TRACKS_NORMALIZED.append(obj_normalized) return i - - +# function to turn the name into a representation that can be easily compared, ignoring minor differences +remove_symbols = ["'","`","’"] +def normalize_name(name): + return "".join(char for char in name.lower() if char not in remove_symbols)