Implemented heuristics for Spotify import with inaccurate timestamps, GH-104

This commit is contained in:
krateng 2022-04-01 17:53:36 +02:00
parent 3389d6c5f5
commit d8821efeeb
1 changed files with 31 additions and 12 deletions

View File

@ -18,17 +18,19 @@ def err(msg):
def import_scrobbles(inputf):
if re.match(".*\.csv",inputf):
filename = os.path.basename(inputf)
if re.match(".*\.csv",filename):
type = "Last.fm"
outputf = data_dir['scrobbles']("lastfmimport.tsv")
importfunc = parse_lastfm
elif re.match("endsong_[0-9]+\.json",inputf):
elif re.match("endsong_[0-9]+\.json",filename):
type = "Spotify"
outputf = data_dir['scrobbles']("spotifyimport.tsv")
importfunc = parse_spotify_full
elif re.match("StreamingHistory[0-9]+\.json",inputf):
elif re.match("StreamingHistory[0-9]+\.json",filename):
type = "Spotify"
outputf = data_dir['scrobbles']("spotifyimport.tsv")
importfunc = parse_spotify_lite
@ -165,28 +167,45 @@ def parse_spotify_full(inputf):
continue
timestamps.setdefault(timestamp,[]).append((artist,title))
# if it's 0, we use ts instead, but identify duplicates much more
# liberally (cause the ts is not accurate)
# if it's 0, we use ts instead, but identify duplicates differently
# (cause the ts is not accurate)
else:
status = 'WARN'
warn(f"{entry} might have an inaccurate timestamp.")
timestamp = int(
datetime.datetime.strptime(entry['ts'].replace('Z','+0000',),"%Y-%m-%dT%H:%M:%S%z").timestamp()
)
# TODO HEURISTICS
ts_group = int(timestamp/10)
relevant_ts_groups = [ts_group-2,ts_group-1,ts_group,ts_group+1,ts_group+2]
similar_scrobbles = [scrob for tsg in relevant_ts_groups for scrob in inaccurate_timestamps.get(tsg,[])]
scrobble_describe = (timestamp,entry['spotify_track_uri'],entry['ms_played'])
found_similar = False
for scr in similar_scrobbles:
# scrobbles count as duplicate if:
# - less than 30 seconds apart
# - exact same track uri
# - exact same ms_played
if (abs(scr[0] - timestamp) < 30) and scr[1:] == scrobble_describe[1:]:
warn(f"{entry} has been identified as potential duplicate, skipping...")
yield ('SKIP',None)
found_similar = True
break
else:
# no duplicates, assume proper scrobble but warn
status = 'WARN'
warn(f"{entry} might have an inaccurate timestamp.")
inaccurate_timestamps.setdefault(ts_group,[]).append(scrobble_describe)
if found_similar:
continue
yield (status,{
'title':title,
'artiststr': artist,
'album': album,
# 'timestamp': int(datetime.datetime.strptime(
# entry['ts'].replace('Z','+0000',),
# "%Y-%m-%dT%H:%M:%S%z"
# ).timestamp()),
'timestamp': timestamp,
'duration':played
})