# maloja/proccontrol/tasks/import_scrobbles.py
import os, datetime, re
import json, csv
from doreah.io import col, ask, prompt
from ...cleanup import *
from ...pkg_global.conf import data_dir

# Shared cleaner used to normalize artist/title strings before insertion.
c = CleanerAgent()

# Console reporting per parse-result status: confident results stay quiet,
# uncertain ones are highlighted in color, hard failures are printed red.
outputs = {
	"CONFIDENT_IMPORT": lambda msg: None,
	"UNCERTAIN_IMPORT": lambda msg: print(col['orange'](msg)),
	#"CONFIDENT_SKIP": lambda msg: print(col['ffcba4'](msg)),
	"CONFIDENT_SKIP": lambda msg: None,
	"UNCERTAIN_SKIP": lambda msg: print(col['indianred'](msg)),
	"FAIL": lambda msg: print(col['red'](msg)),
}
def import_scrobbles(inputf):
	"""Import scrobbles from an export file into the database.

	The import source type is identified from the file name. Returns a dict
	mapping each result status (CONFIDENT_IMPORT, UNCERTAIN_IMPORT,
	CONFIDENT_SKIP, UNCERTAIN_SKIP, FAIL) to its count.
	"""
	from ...database.sqldb import add_scrobbles

	result = {
		"CONFIDENT_IMPORT": 0,
		"UNCERTAIN_IMPORT": 0,
		"CONFIDENT_SKIP": 0,
		"UNCERTAIN_SKIP": 0,
		"FAIL": 0
	}

	filename = os.path.basename(inputf)

	# pick the parser by file name pattern (raw strings: '\.' is an invalid
	# escape in a normal string literal)
	if re.match(r".*\.csv",filename):
		typeid,typedesc = "lastfm","Last.fm"
		importfunc = parse_lastfm
	elif re.match(r"endsong_[0-9]+\.json",filename):
		typeid,typedesc = "spotify","Spotify"
		importfunc = parse_spotify_full
	elif re.match(r"StreamingHistory[0-9]+\.json",filename):
		typeid,typedesc = "spotify","Spotify"
		importfunc = parse_spotify_lite
	elif re.match(r"maloja_export_[0-9]+\.json",filename):
		typeid,typedesc = "maloja","Maloja"
		importfunc = parse_maloja
	else:
		print("File",inputf,"could not be identified as a valid import source.")
		return result

	print(f"Parsing {col['yellow'](inputf)} as {col['cyan'](typedesc)} export")
	print("This could take a while...")

	timestamps = set()
	scrobblebuffer = []

	for status,scrobble,msg in importfunc(inputf):
		result[status] += 1
		outputs[status](msg)
		if status in ['CONFIDENT_IMPORT','UNCERTAIN_IMPORT']:

			# prevent duplicate timestamps within this import
			while scrobble['scrobble_time'] in timestamps:
				scrobble['scrobble_time'] += 1
			timestamps.add(scrobble['scrobble_time'])

			# clean up
			(scrobble['track_artists'],scrobble['track_title']) = c.fullclean(scrobble['track_artists'],scrobble['track_title'])

			# extra info
			extrainfo = {}
			if scrobble.get('album_name'): extrainfo['album_name'] = scrobble['album_name']
			# saving this in the scrobble instead of the track because for now it's not meant
			# to be authorative information, just payload of the scrobble

			scrobblebuffer.append({
				"time":scrobble['scrobble_time'],
				"track":{
					"artists":scrobble['track_artists'],
					"title":scrobble['track_title'],
					"length":None
				},
				"duration":scrobble['scrobble_duration'],
				"origin":"import:" + typeid,
				"extra":extrainfo
			})

			# flush to the database in batches of 1000
			if (result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']) % 1000 == 0:
				print(f"Imported {result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']} scrobbles...")
				add_scrobbles(scrobblebuffer)
				scrobblebuffer = []

	add_scrobbles(scrobblebuffer)

	msg = f"Successfully imported {result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']} scrobbles"
	if result['UNCERTAIN_IMPORT'] > 0:
		warningmsg = col['orange'](f"{result['UNCERTAIN_IMPORT']} Warning{'s' if result['UNCERTAIN_IMPORT'] != 1 else ''}!")
		msg += f" ({warningmsg})"
	print(msg)

	msg = f"Skipped {result['CONFIDENT_SKIP'] + result['UNCERTAIN_SKIP']} scrobbles"
	if result['UNCERTAIN_SKIP'] > 0:
		warningmsg = col['indianred'](f"{result['UNCERTAIN_SKIP']} Warning{'s' if result['UNCERTAIN_SKIP'] != 1 else ''}!")
		msg += f" ({warningmsg})"
	print(msg)

	if result['FAIL'] > 0:
		print(col['red'](f"{result['FAIL']} Error{'s' if result['FAIL'] != 1 else ''}!"))

	return result
def parse_spotify_lite(inputf):
	"""Parse Spotify 'StreamingHistoryN.json' (lite) exports.

	Yields (status, scrobble_dict_or_None, message) tuples. Sibling files in
	the same folder are offered for joint import so duplicates can be
	detected across the whole dataset.
	"""
	inputfolder = os.path.dirname(inputf)
	filenames = re.compile(r'StreamingHistory[0-9]+\.json')
	inputfiles = [os.path.join(inputfolder,f) for f in os.listdir(inputfolder) if filenames.match(f)]

	if inputfiles != [inputf]:
		print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
		if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?",default=True):
			inputfiles = [inputf]

	for inputf in inputfiles:
		print("Importing",col['yellow'](inputf),"...")
		with open(inputf,'r') as inputfd:
			data = json.load(inputfd)

		for entry in data:
			try:
				played = int(entry['msPlayed'] / 1000)
				timestamp = int(
					datetime.datetime.strptime(entry['endTime'],"%Y-%m-%d %H:%M").timestamp()
				)
				artist = entry['artistName']
				title = entry['trackName']

				# plays under 30 seconds don't count as scrobbles
				if played < 30:
					yield ('CONFIDENT_SKIP',None,f"{entry} is shorter than 30 seconds, skipping...")
					continue

				yield ("CONFIDENT_IMPORT",{
					'track_title':title,
					'track_artists': artist,
					'scrobble_time': timestamp,
					'scrobble_duration':played,
					'album_name': None
				},'')
			except Exception as e:
				yield ('FAIL',None,f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
				continue

	print()
def parse_spotify_full(inputf):
	"""Parse Spotify extended 'endsong_N.json' exports.

	Yields (status, scrobble_dict_or_None, message) tuples. Sibling files in
	the same folder are offered for joint import. Duplicate detection works
	two ways: exact-timestamp matching when offline_timestamp is usable, or
	fuzzy matching (time bucket + track uri + ms_played) when only the
	inaccurate 'ts' field is available.
	"""
	inputfolder = os.path.dirname(inputf)
	filenames = re.compile(r'endsong_[0-9]+\.json')
	inputfiles = [os.path.join(inputfolder,f) for f in os.listdir(inputfolder) if filenames.match(f)]

	if inputfiles != [inputf]:
		print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
		if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?",default=True):
			inputfiles = [inputf]

	# we keep timestamps here as well to remove duplicates because spotify's export
	# is messy - this is specific to this import type and should not be mixed with
	# the outer function timestamp check (which is there to fix duplicate timestamps
	# that are assumed to correspond to actually distinct plays)
	timestamps = {}
	inaccurate_timestamps = {}

	for inputf in inputfiles:

		print("Importing",col['yellow'](inputf),"...")
		with open(inputf,'r') as inputfd:
			data = json.load(inputfd)

		for entry in data:

			try:
				played = int(entry['ms_played'] / 1000)
				timestamp = int(entry['offline_timestamp'] / 1000)
				artist = entry['master_metadata_album_artist_name']
				title = entry['master_metadata_track_name']
				album = entry['master_metadata_album_album_name']

				if title is None:
					yield ('CONFIDENT_SKIP',None,f"{entry} has no title, skipping...")
					continue
				if artist is None:
					yield ('CONFIDENT_SKIP',None,f"{entry} has no artist, skipping...")
					continue
				if played < 30:
					yield ('CONFIDENT_SKIP',None,f"{entry} is shorter than 30 seconds, skipping...")
					continue

				# if offline_timestamp is a proper number, we treat it as
				# accurate and check duplicates by that exact timestamp
				if timestamp != 0:

					if timestamp in timestamps and (artist,title) in timestamps[timestamp]:
						yield ('CONFIDENT_SKIP',None,f"{entry} seems to be a duplicate, skipping...")
						continue
					else:
						status = 'CONFIDENT_IMPORT'
						msg = ''
					timestamps.setdefault(timestamp,[]).append((artist,title))

				# if it's 0, we use ts instead, but identify duplicates differently
				# (cause the ts is not accurate)
				else:

					timestamp = int(
						datetime.datetime.strptime(entry['ts'].replace('Z','+0000'),"%Y-%m-%dT%H:%M:%S%z").timestamp()
					)

					# bucket timestamps into 10s groups, look 3 groups either way
					ts_group = int(timestamp/10)
					relevant_ts_groups = [ts_group-3,ts_group-2,ts_group-1,ts_group,ts_group+1,ts_group+2,ts_group+3]
					similar_scrobbles = [scrob for tsg in relevant_ts_groups for scrob in inaccurate_timestamps.get(tsg,[])]

					scrobble_describe = (timestamp,entry['spotify_track_uri'],entry['ms_played'])
					found_similar = False
					for scr in similar_scrobbles:
						# scrobbles count as duplicate if:
						# - less than 30 seconds apart
						# - exact same track uri
						# - exact same ms_played
						if (abs(scr[0] - timestamp) < 30) and scr[1:] == scrobble_describe[1:]:
							yield ('UNCERTAIN_SKIP',None,f"{entry} might be a duplicate, skipping...")
							found_similar = True
							break
					else:
						# no duplicates, assume proper scrobble but warn
						status = 'UNCERTAIN_IMPORT'
						msg = f"{entry} might have an inaccurate timestamp."
						inaccurate_timestamps.setdefault(ts_group,[]).append(scrobble_describe)

					if found_similar:
						continue

				yield (status,{
					'track_title':title,
					'track_artists': artist,
					'album_name': album,
					'scrobble_time': timestamp,
					'scrobble_duration':played
				},msg)

			except Exception as e:
				yield ('FAIL',None,f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
				continue

	print()
def parse_lastfm(inputf):
	"""Parse a Last.fm CSV export (artist, album, title, time per row).

	Yields (status, scrobble_dict_or_None, message) tuples. Rows that do not
	have exactly four fields, or whose timestamp cannot be parsed, are
	reported as FAIL instead of aborting the import.
	"""
	with open(inputf,'r',newline='') as inputfd:
		reader = csv.reader(inputfd)

		for row in reader:
			try:
				artist,album,title,time = row
			except ValueError:
				yield ('FAIL',None,f"{row} does not look like a valid entry. Scrobble not imported.")
				continue

			try:
				yield ('CONFIDENT_IMPORT',{
					'track_title': title,
					'track_artists': artist,
					'album_name': album,
					# Last.fm times are UTC without an offset; append one so
					# the parse is timezone-aware
					'scrobble_time': int(datetime.datetime.strptime(
						time + '+0000',
						"%d %b %Y %H:%M%z"
					).timestamp()),
					'scrobble_duration':None
				},'')
			except Exception as e:
				# bug fix: this previously referenced the undefined name
				# 'entry', raising NameError on any parse failure
				yield ('FAIL',None,f"{row} could not be parsed. Scrobble not imported. ({repr(e)})")
				continue
def parse_maloja(inputf):
	"""Parse a native Maloja JSON export ('maloja_export_N.json').

	Yields (status, scrobble_dict_or_None, message) tuples; entries with an
	unexpected shape are reported as FAIL instead of aborting the import.
	"""
	with open(inputf,'r') as inputfd:
		data = json.load(inputfd)

	scrobbles = data['scrobbles']

	for s in scrobbles:
		try:
			yield ('CONFIDENT_IMPORT',{
				'track_title': s['track']['title'],
				'track_artists': s['track']['artists'],
				# album is optional in the export; default to empty name
				'album_name': s['track'].get('album',{}).get('name',''),
				'scrobble_time': s['time'],
				'scrobble_duration': s['duration']
			},'')
		except Exception as e:
			yield ('FAIL',None,f"{s} could not be parsed. Scrobble not imported. ({repr(e)})")
			continue