# maloja/proccontrol/tasks/import_scrobbles.py
import os, datetime, re
import json, csv
from doreah.io import col, ask, prompt
from ...cleanup import *
from ...pkg_global.conf import data_dir

# Shared cleaner used to normalize artist/title strings before insertion.
c = CleanerAgent()

# Console reporting per parse-result status: confident results stay quiet,
# uncertain ones are highlighted in color, hard failures are printed red.
outputs = {
	"CONFIDENT_IMPORT": lambda msg: None,
	"UNCERTAIN_IMPORT": lambda msg: print(col['orange'](msg)),
	#"CONFIDENT_SKIP": lambda msg: print(col['ffcba4'](msg)),
	"CONFIDENT_SKIP": lambda msg: None,
	"UNCERTAIN_SKIP": lambda msg: print(col['indianred'](msg)),
	"FAIL": lambda msg: print(col['red'](msg)),
}
def import_scrobbles(inputf):
	"""Import scrobbles from an export file into the database.

	The import source type is identified from the file name. Returns a dict
	mapping each result status (CONFIDENT_IMPORT, UNCERTAIN_IMPORT,
	CONFIDENT_SKIP, UNCERTAIN_SKIP, FAIL) to its count.
	"""
	from ...database.sqldb import add_scrobbles

	result = {
		"CONFIDENT_IMPORT": 0,
		"UNCERTAIN_IMPORT": 0,
		"CONFIDENT_SKIP": 0,
		"UNCERTAIN_SKIP": 0,
		"FAIL": 0
	}

	filename = os.path.basename(inputf)

	# pick the parser by file name pattern (raw strings: '\.' is an invalid
	# escape in a normal string literal)
	if re.match(r".*\.csv",filename):
		typeid,typedesc = "lastfm","Last.fm"
		importfunc = parse_lastfm
	elif re.match(r"endsong_[0-9]+\.json",filename):
		typeid,typedesc = "spotify","Spotify"
		importfunc = parse_spotify_full
	elif re.match(r"StreamingHistory[0-9]+\.json",filename):
		typeid,typedesc = "spotify","Spotify"
		importfunc = parse_spotify_lite
	elif re.match(r"maloja_export_[0-9]+\.json",filename):
		typeid,typedesc = "maloja","Maloja"
		importfunc = parse_maloja
	else:
		print("File",inputf,"could not be identified as a valid import source.")
		return result

	print(f"Parsing {col['yellow'](inputf)} as {col['cyan'](typedesc)} export")
	print("This could take a while...")

	timestamps = set()
	scrobblebuffer = []

	for status,scrobble,msg in importfunc(inputf):
		result[status] += 1
		outputs[status](msg)
		if status in ['CONFIDENT_IMPORT','UNCERTAIN_IMPORT']:

			# prevent duplicate timestamps within this import
			while scrobble['scrobble_time'] in timestamps:
				scrobble['scrobble_time'] += 1
			timestamps.add(scrobble['scrobble_time'])

			# clean up
			(scrobble['track_artists'],scrobble['track_title']) = c.fullclean(scrobble['track_artists'],scrobble['track_title'])

			# extra info
			extrainfo = {}
			if scrobble.get('album_name'): extrainfo['album_name'] = scrobble['album_name']
			# saving this in the scrobble instead of the track because for now it's not meant
			# to be authorative information, just payload of the scrobble

			scrobblebuffer.append({
				"time":scrobble['scrobble_time'],
				"track":{
					"artists":scrobble['track_artists'],
					"title":scrobble['track_title'],
					"length":None
				},
				"duration":scrobble['scrobble_duration'],
				"origin":"import:" + typeid,
				"extra":extrainfo
			})

			# flush to the database in batches of 1000
			if (result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']) % 1000 == 0:
				print(f"Imported {result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']} scrobbles...")
				add_scrobbles(scrobblebuffer)
				scrobblebuffer = []

	add_scrobbles(scrobblebuffer)

	msg = f"Successfully imported {result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']} scrobbles"
	if result['UNCERTAIN_IMPORT'] > 0:
		warningmsg = col['orange'](f"{result['UNCERTAIN_IMPORT']} Warning{'s' if result['UNCERTAIN_IMPORT'] != 1 else ''}!")
		msg += f" ({warningmsg})"
	print(msg)

	msg = f"Skipped {result['CONFIDENT_SKIP'] + result['UNCERTAIN_SKIP']} scrobbles"
	if result['UNCERTAIN_SKIP'] > 0:
		warningmsg = col['indianred'](f"{result['UNCERTAIN_SKIP']} Warning{'s' if result['UNCERTAIN_SKIP'] != 1 else ''}!")
		msg += f" ({warningmsg})"
	print(msg)

	if result['FAIL'] > 0:
		print(col['red'](f"{result['FAIL']} Error{'s' if result['FAIL'] != 1 else ''}!"))

	return result
def parse_spotify_lite(inputf):
	"""Parse Spotify 'StreamingHistoryN.json' (lite) exports.

	Yields (status, scrobble_dict_or_None, message) tuples. Sibling files in
	the same folder are offered for joint import so duplicates can be
	detected across the whole dataset.
	"""
	inputfolder = os.path.dirname(inputf)
	filenames = re.compile(r'StreamingHistory[0-9]+\.json')
	inputfiles = [os.path.join(inputfolder,f) for f in os.listdir(inputfolder) if filenames.match(f)]

	if inputfiles != [inputf]:
		print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
		if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?",default=True):
			inputfiles = [inputf]

	for inputf in inputfiles:
		print("Importing",col['yellow'](inputf),"...")
		with open(inputf,'r') as inputfd:
			data = json.load(inputfd)

		for entry in data:
			try:
				played = int(entry['msPlayed'] / 1000)
				timestamp = int(
					datetime.datetime.strptime(entry['endTime'],"%Y-%m-%d %H:%M").timestamp()
				)
				artist = entry['artistName']
				title = entry['trackName']

				# plays under 30 seconds don't count as scrobbles
				if played < 30:
					yield ('CONFIDENT_SKIP',None,f"{entry} is shorter than 30 seconds, skipping...")
					continue

				yield ("CONFIDENT_IMPORT",{
					'track_title':title,
					'track_artists': artist,
					'scrobble_time': timestamp,
					'scrobble_duration':played,
					'album_name': None
				},'')
			except Exception as e:
				yield ('FAIL',None,f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
				continue

	print()
def parse_spotify_full(inputf):
	"""Parse Spotify extended 'endsong_N.json' exports.

	Yields (status, scrobble_dict_or_None, message) tuples. Sibling files in
	the same folder are offered for joint import. Duplicate detection works
	two ways: exact-timestamp matching when offline_timestamp is usable, or
	fuzzy matching (time bucket + track uri + ms_played) when only the
	inaccurate 'ts' field is available.
	"""
	inputfolder = os.path.dirname(inputf)
	filenames = re.compile(r'endsong_[0-9]+\.json')
	inputfiles = [os.path.join(inputfolder,f) for f in os.listdir(inputfolder) if filenames.match(f)]

	if inputfiles != [inputf]:
		print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
		if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?",default=True):
			inputfiles = [inputf]

	# we keep timestamps here as well to remove duplicates because spotify's export
	# is messy - this is specific to this import type and should not be mixed with
	# the outer function timestamp check (which is there to fix duplicate timestamps
	# that are assumed to correspond to actually distinct plays)
	timestamps = {}
	inaccurate_timestamps = {}

	for inputf in inputfiles:

		print("Importing",col['yellow'](inputf),"...")
		with open(inputf,'r') as inputfd:
			data = json.load(inputfd)

		for entry in data:

			try:
				played = int(entry['ms_played'] / 1000)
				timestamp = int(entry['offline_timestamp'] / 1000)
				artist = entry['master_metadata_album_artist_name']
				title = entry['master_metadata_track_name']
				album = entry['master_metadata_album_album_name']

				if title is None:
					yield ('CONFIDENT_SKIP',None,f"{entry} has no title, skipping...")
					continue
				if artist is None:
					yield ('CONFIDENT_SKIP',None,f"{entry} has no artist, skipping...")
					continue
				if played < 30:
					yield ('CONFIDENT_SKIP',None,f"{entry} is shorter than 30 seconds, skipping...")
					continue

				# if offline_timestamp is a proper number, we treat it as
				# accurate and check duplicates by that exact timestamp
				if timestamp != 0:

					if timestamp in timestamps and (artist,title) in timestamps[timestamp]:
						yield ('CONFIDENT_SKIP',None,f"{entry} seems to be a duplicate, skipping...")
						continue
					else:
						status = 'CONFIDENT_IMPORT'
						msg = ''
					timestamps.setdefault(timestamp,[]).append((artist,title))

				# if it's 0, we use ts instead, but identify duplicates differently
				# (cause the ts is not accurate)
				else:

					timestamp = int(
						datetime.datetime.strptime(entry['ts'].replace('Z','+0000'),"%Y-%m-%dT%H:%M:%S%z").timestamp()
					)

					# bucket timestamps into 10s groups, look 3 groups either way
					ts_group = int(timestamp/10)
					relevant_ts_groups = [ts_group-3,ts_group-2,ts_group-1,ts_group,ts_group+1,ts_group+2,ts_group+3]
					similar_scrobbles = [scrob for tsg in relevant_ts_groups for scrob in inaccurate_timestamps.get(tsg,[])]

					scrobble_describe = (timestamp,entry['spotify_track_uri'],entry['ms_played'])
					found_similar = False
					for scr in similar_scrobbles:
						# scrobbles count as duplicate if:
						# - less than 30 seconds apart
						# - exact same track uri
						# - exact same ms_played
						if (abs(scr[0] - timestamp) < 30) and scr[1:] == scrobble_describe[1:]:
							yield ('UNCERTAIN_SKIP',None,f"{entry} might be a duplicate, skipping...")
							found_similar = True
							break
					else:
						# no duplicates, assume proper scrobble but warn
						status = 'UNCERTAIN_IMPORT'
						msg = f"{entry} might have an inaccurate timestamp."
						inaccurate_timestamps.setdefault(ts_group,[]).append(scrobble_describe)

					if found_similar:
						continue

				yield (status,{
					'track_title':title,
					'track_artists': artist,
					'album_name': album,
					'scrobble_time': timestamp,
					'scrobble_duration':played
				},msg)

			except Exception as e:
				yield ('FAIL',None,f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
				continue

	print()
def parse_lastfm(inputf):
	"""Parse a Last.fm CSV export (artist, album, title, time per row).

	Yields (status, scrobble_dict_or_None, message) tuples. Rows that do not
	have exactly four fields, or whose timestamp cannot be parsed, are
	reported as FAIL instead of aborting the import.
	"""
	with open(inputf,'r',newline='') as inputfd:
		reader = csv.reader(inputfd)

		for row in reader:
			try:
				artist,album,title,time = row
			except ValueError:
				yield ('FAIL',None,f"{row} does not look like a valid entry. Scrobble not imported.")
				continue

			try:
				yield ('CONFIDENT_IMPORT',{
					'track_title': title,
					'track_artists': artist,
					'album_name': album,
					# Last.fm times are UTC without an offset; append one so
					# the parse is timezone-aware
					'scrobble_time': int(datetime.datetime.strptime(
						time + '+0000',
						"%d %b %Y %H:%M%z"
					).timestamp()),
					'scrobble_duration':None
				},'')
			except Exception as e:
				# bug fix: this previously referenced the undefined name
				# 'entry', raising NameError on any parse failure
				yield ('FAIL',None,f"{row} could not be parsed. Scrobble not imported. ({repr(e)})")
				continue
def parse_maloja(inputf):
	"""Parse a native Maloja JSON export ('maloja_export_N.json').

	Yields (status, scrobble_dict_or_None, message) tuples; entries with an
	unexpected shape are reported as FAIL instead of aborting the import.
	"""
	with open(inputf,'r') as inputfd:
		data = json.load(inputfd)

	scrobbles = data['scrobbles']

	for s in scrobbles:
		try:
			yield ('CONFIDENT_IMPORT',{
				'track_title': s['track']['title'],
				'track_artists': s['track']['artists'],
				# album is optional in the export; default to empty name
				'album_name': s['track'].get('album',{}).get('name',''),
				'scrobble_time': s['time'],
				'scrobble_duration': s['duration']
			},'')
		except Exception as e:
			yield ('FAIL',None,f"{s} could not be parsed. Scrobble not imported. ({repr(e)})")
			continue