Implemented heuristics for Spotify import with inaccurate timestamps, GH-104

2023-08-10 21:12:55 +03:00 · 2022-04-01 17:53:36 +02:00 · 2022-04-01 17:53:36 +02:00 · d8821efeeb
commit d8821efeeb
parent 3389d6c5f5
1 changed file with 31 additions and 12 deletions
--- a/maloja/proccontrol/tasks/importer.py
+++ b/maloja/proccontrol/tasks/importer.py
@@ -18,17 +18,19 @@ def err(msg):
 def import_scrobbles(inputf):
-	if re.match(".*\.csv",inputf):
+	filename = os.path.basename(inputf)
+	if re.match(".*\.csv",filename):
 		type = "Last.fm"
 		outputf = data_dir['scrobbles']("lastfmimport.tsv")
 		importfunc = parse_lastfm
-	elif re.match("endsong_[0-9]+\.json",inputf):
+	elif re.match("endsong_[0-9]+\.json",filename):
 		type = "Spotify"
 		outputf = data_dir['scrobbles']("spotifyimport.tsv")
 		importfunc = parse_spotify_full
-	elif re.match("StreamingHistory[0-9]+\.json",inputf):
+	elif re.match("StreamingHistory[0-9]+\.json",filename):
 		type = "Spotify"
 		outputf = data_dir['scrobbles']("spotifyimport.tsv")
 		importfunc = parse_spotify_lite
@@ -165,28 +167,45 @@ def parse_spotify_full(inputf):
 						continue
 					timestamps.setdefault(timestamp,[]).append((artist,title))
-				# if it's 0, we use ts instead, but identify duplicates much more
+				# if it's 0, we use ts instead, but identify duplicates differently
-				# liberally (cause the ts is not accurate)
+				# (cause the ts is not accurate)
 				else:
-					status = 'WARN'
+
 					warn(f"{entry} might have an inaccurate timestamp.")
 					timestamp = int(
 						datetime.datetime.strptime(entry['ts'].replace('Z','+0000',),"%Y-%m-%dT%H:%M:%S%z").timestamp()
 					)
 					# TODO HEURISTICS
 					ts_group = int(timestamp/10)
 					relevant_ts_groups = [ts_group-2,ts_group-1,ts_group,ts_group+1,ts_group+2]
 					similar_scrobbles = [scrob for tsg in relevant_ts_groups for scrob in inaccurate_timestamps.get(tsg,[])]
 					scrobble_describe = (timestamp,entry['spotify_track_uri'],entry['ms_played'])
 					found_similar = False
 					for scr in similar_scrobbles:
 						# scrobbles count as duplicate if:
 						# - less than 30 seconds apart
 						# - exact same track uri
 						# - exact same ms_played
 						if (abs(scr[0] - timestamp) < 30) and scr[1:] == scrobble_describe[1:]:
 							warn(f"{entry} has been identified as potential duplicate, skipping...")
 							yield ('SKIP',None)
 							found_similar = True
 							break
 					else:
 						# no duplicates, assume proper scrobble but warn
 						status = 'WARN'
 						warn(f"{entry} might have an inaccurate timestamp.")
 						inaccurate_timestamps.setdefault(ts_group,[]).append(scrobble_describe)
 					if found_similar:
 						continue
 				yield (status,{
 					'title':title,
 					'artiststr': artist,
 					'album': album,
 				#	'timestamp': int(datetime.datetime.strptime(
 				#		entry['ts'].replace('Z','+0000',),
 				#		"%Y-%m-%dT%H:%M:%S%z"
 				#	).timestamp()),
 					'timestamp': timestamp,
 					'duration':played
 				})