From 3389d6c5f5b06be9710cabf031450d11b3aac69a Mon Sep 17 00:00:00 2001
From: krateng <git.noreply@krateng.ch>
Date: Fri, 1 Apr 2022 17:16:50 +0200
Subject: [PATCH] Reworked import

---
 maloja/proccontrol/tasks/__init__.py |   4 +-
 maloja/proccontrol/tasks/importer.py | 192 ++++++++++++++++-----------
 2 files changed, 117 insertions(+), 79 deletions(-)

diff --git a/maloja/proccontrol/tasks/__init__.py b/maloja/proccontrol/tasks/__init__.py
index 83b1e51..744534a 100644
--- a/maloja/proccontrol/tasks/__init__.py
+++ b/maloja/proccontrol/tasks/__init__.py
@@ -12,10 +12,12 @@ def loadexternal(filename):
 		return
 
 	from .importer import import_scrobbles
-	imported,failed,warning = import_scrobbles(filename)
+	imported,warning,skipped,failed = import_scrobbles(filename)
 	print("Successfully imported",imported,"scrobbles!")
 	if warning > 0:
 		print(col['orange'](f"{warning} Warning{'s' if warning != 1 else ''}!"))
+	if skipped > 0:
+		print(col['orange'](f"{skipped} Skipped!"))
 	if failed > 0:
 		print(col['red'](f"{failed} Error{'s' if failed != 1 else ''}!"))
 
diff --git a/maloja/proccontrol/tasks/importer.py b/maloja/proccontrol/tasks/importer.py
index ee5f66d..7a332de 100644
--- a/maloja/proccontrol/tasks/importer.py
+++ b/maloja/proccontrol/tasks/importer.py
@@ -18,19 +18,24 @@ def err(msg):
 
 def import_scrobbles(inputf):
 
-	ext = inputf.split('.')[-1].lower()
-
-	if ext == 'csv':
+	if re.match(".*\.csv",inputf):
 		type = "Last.fm"
 		outputf = data_dir['scrobbles']("lastfmimport.tsv")
 		importfunc = parse_lastfm
 
-
-	elif ext == 'json' or os.path.isdir(inputf):
+	elif re.match("endsong_[0-9]+\.json",inputf):
 		type = "Spotify"
 		outputf = data_dir['scrobbles']("spotifyimport.tsv")
-		importfunc = parse_spotify
-		if os.path.isfile(inputf): inputf = os.path.dirname(inputf)
+		importfunc = parse_spotify_full
+
+	elif re.match("StreamingHistory[0-9]+\.json",inputf):
+		type = "Spotify"
+		outputf = data_dir['scrobbles']("spotifyimport.tsv")
+		importfunc = parse_spotify_lite
+
+	else:
+		print("File",inputf,"could not be identified as a valid import source.")
+		return 0,0,0,0
 
 
 	print(f"Parsing {col['yellow'](inputf)} as {col['cyan'](type)} export")
@@ -40,7 +45,7 @@ def import_scrobbles(inputf):
 		while True:
 			action = prompt(f"Already imported {type} data. [O]verwrite, [A]ppend or [C]ancel?",default='c').lower()[0]
 			if action == 'c':
-				return 0,0,0
+				return 0,0,0,0
 			elif action == 'a':
 				mode = 'a'
 				break
@@ -52,20 +57,20 @@ def import_scrobbles(inputf):
 	else:
 		mode = 'w'
 
-		
+
 	with open(outputf,mode) as outputfd:
-		success = 0
-		failed = 0
-		warning = 0
+		success, warning, skipped, failed = 0, 0, 0, 0
 		timestamps = set()
 
-		for scrobble in importfunc(inputf):
-			if scrobble is None:
+		for status,scrobble in importfunc(inputf):
+			if status == 'FAIL':
 				failed += 1
-			elif scrobble is False:
-				warning += 1
+			elif status == 'SKIP':
+				skipped += 1
 			else:
 				success += 1
+				if status == 'WARN':
+					warning += 1
 
 				while scrobble['timestamp'] in timestamps:
 					scrobble['timestamp'] += 1
@@ -89,77 +94,108 @@ def import_scrobbles(inputf):
 				if success % 100 == 0:
 					print(f"Imported {success} scrobbles...")
 
-	return success,failed,warning
+	return success, warning, skipped, failed
 
+def parse_spotify_lite(inputf):
+	inputfolder = os.path.dirname(inputf)
+	filenames = re.compile(r'StreamingHistory[0-9]+\.json')
+	inputfiles = [os.path.join(inputfolder,f) for f in os.listdir(inputfolder) if filenames.match(f)]
 
-def parse_spotify(inputf):
+	if inputfiles != [inputf]:
+		print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
+		if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?",default=True):
+			inputfiles = [inputf]
 
+	# TODO
+
+def parse_spotify_full(inputf):
+
+	inputfolder = os.path.dirname(inputf)
 	filenames = re.compile(r'endsong_[0-9]+\.json')
+	inputfiles = [os.path.join(inputfolder,f) for f in os.listdir(inputfolder) if filenames.match(f)]
 
-	inputfiles = [os.path.join(inputf,f) for f in os.listdir(inputf) if filenames.match(f)]
+	if inputfiles != [inputf]:
+		print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
+		if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?",default=True):
+			inputfiles = [inputf]
 
-	if len(inputfiles) == 0:
-		print("No files found!")
-	elif ask("Importing the following files: " + ", ".join(col['yellow'](i) for i in inputfiles) + ". Confirm?", default=False):
+	# we keep timestamps here as well to remove duplicates because spotify's export
+	# is messy - this is specific to this import type and should not be mixed with
+	# the outer function timestamp check (which is there to fix duplicate timestamps
+	# that are assumed to correspond to actually distinct plays)
+	timestamps = {}
+	inaccurate_timestamps = {}
 
-		# we keep timestamps here as well to remove duplicates because spotify's export
-		# is messy - this is specific to this import type and should not be mixed with
-		# the outer function timestamp check (which is there to fix duplicate timestamps
-		# that are assumed to correspond to actually distinct plays)
-		timestamps = {}
+	for inputf in inputfiles:
 
-		for inputf in inputfiles:
+		print("Importing",col['yellow'](inputf),"...")
+		with open(inputf,'r') as inputfd:
+			data = json.load(inputfd)
 
-			print("Importing",col['yellow'](inputf),"...")
-			with open(inputf,'r') as inputfd:
-				data = json.load(inputfd)
+		for entry in data:
 
-			for entry in data:
-
-				try:
-					sec = int(entry['ms_played'] / 1000)
-					timestamp = int(entry['offline_timestamp'] / 1000)
-					artist = entry['master_metadata_album_artist_name']
-					title = entry['master_metadata_track_name']
-					album = entry['master_metadata_album_album_name']
+			try:
+				played = int(entry['ms_played'] / 1000)
+				timestamp = int(entry['offline_timestamp'] / 1000)
+				artist = entry['master_metadata_album_artist_name']
+				title = entry['master_metadata_track_name']
+				album = entry['master_metadata_album_album_name']
 
 
-					if title is None:
-						warn(f"{entry} has no title, skipping...")
-						yield False
-						continue
-					if artist is None:
-						warn(f"{entry} has no artist, skipping...")
-						yield False
-						continue
-					if sec < 30:
-						warn(f"{entry} is shorter than 30 seconds, skipping...")
-						yield False
-						continue
-					if timestamp in timestamps and (artist,title) in timestamps[timestamp]:
-						warn(f"{entry} seems to be a duplicate, skipping...")
-						yield False
-						continue
-
-					timestamps.setdefault(timestamp,[]).append((artist,title))
-
-					yield {
-						'title':title,
-						'artiststr': artist,
-						'album': album,
-					#	'timestamp': int(datetime.datetime.strptime(
-					#		entry['ts'].replace('Z','+0000',),
-					#		"%Y-%m-%dT%H:%M:%S%z"
-					#	).timestamp()),
-						'timestamp': timestamp,
-						'duration':sec
-					}
-				except Exception as e:
-					err(f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
-					yield None
+				if title is None:
+					warn(f"{entry} has no title, skipping...")
+					yield ('SKIP',None)
+					continue
+				if artist is None:
+					warn(f"{entry} has no artist, skipping...")
+					yield ('SKIP',None)
+					continue
+				if played < 30:
+					warn(f"{entry} is shorter than 30 seconds, skipping...")
+					yield ('SKIP',None)
 					continue
 
-			print()
+				# if offline_timestamp is a proper number, we treat it as
+				# accurate and check duplicates by that exact timestamp
+				if timestamp != 0:
+					status = 'SUCCESS'
+					if timestamp in timestamps and (artist,title) in timestamps[timestamp]:
+						warn(f"{entry} seems to be a duplicate, skipping...")
+						yield ('SKIP',None)
+						continue
+					timestamps.setdefault(timestamp,[]).append((artist,title))
+
+				# if it's 0, we use ts instead, but identify duplicates much more
+				# liberally (cause the ts is not accurate)
+				else:
+					status = 'WARN'
+					warn(f"{entry} might have an inaccurate timestamp.")
+					timestamp = int(
+						datetime.datetime.strptime(entry['ts'].replace('Z','+0000',),"%Y-%m-%dT%H:%M:%S%z").timestamp()
+					)
+					# TODO HEURISTICS
+
+
+
+
+
+				yield (status,{
+					'title':title,
+					'artiststr': artist,
+					'album': album,
+				#	'timestamp': int(datetime.datetime.strptime(
+				#		entry['ts'].replace('Z','+0000',),
+				#		"%Y-%m-%dT%H:%M:%S%z"
+				#	).timestamp()),
+					'timestamp': timestamp,
+					'duration':played
+				})
+			except Exception as e:
+				err(f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
+				yield ('FAIL',None)
+				continue
+
+		print()
 
 def parse_lastfm(inputf):
 
@@ -170,12 +206,12 @@ def parse_lastfm(inputf):
 			try:
 				artist,album,title,time = row
 			except ValueError:
-				warn(f"{row} does not look like a valid entry. Scrobble not imported.")
-				yield None
+				err(f"{row} does not look like a valid entry. Scrobble not imported.")
+				yield ('FAIL',None)
 				continue
 
 			try:
-				yield {
+				yield ('SUCCESS',{
 					'title': title,
 					'artiststr': artist,
 					'album': album,
@@ -184,8 +220,8 @@ def parse_lastfm(inputf):
 						"%d %b %Y %H:%M%z"
 					).timestamp()),
 					'duration':None
-				}
+				})
 			except Exception as e:
 				err(f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
-				yield None
+				yield ('FAIL',None)
 				continue