# maloja/maloja/proccontrol/tasks/import_scrobbles.py

import os, datetime, re
import json, csv

from doreah.io import col, ask, prompt

from ...cleanup import *
from ...pkg_global.conf import data_dir


c = CleanerAgent()

# How each import outcome is reported to the user. Confident imports and
# confident skips stay quiet; uncertain results and failures print in color.
outputs = {
    "CONFIDENT_IMPORT": lambda msg: None,
    "UNCERTAIN_IMPORT": lambda msg: print(col['orange'](msg)),
    #"CONFIDENT_SKIP": lambda msg: print(col['ffcba4'](msg)),
    "CONFIDENT_SKIP": lambda msg: None,
    "UNCERTAIN_SKIP": lambda msg: print(col['indianred'](msg)),
    "FAIL": lambda msg: print(col['red'](msg)),
}
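

# Every parse_* function below is a generator yielding
# (status, scrobble, msg) tuples: `status` is one of the keys of `outputs`,
# `scrobble` is a dict of track and scrobble fields (None for skips and
# failures), and `msg` is passed to the matching output handler.
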
def import_scrobbles(inputf):
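    """Import scrobbles from a file exported by another service.

    The source format is detected from the file name. Returns a dict of
    counts per outcome (CONFIDENT_IMPORT, UNCERTAIN_IMPORT, CONFIDENT_SKIP,
    UNCERTAIN_SKIP, FAIL).
    """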
    from ...database.sqldb import add_scrobbles

    result = {
        "CONFIDENT_IMPORT": 0,
        "UNCERTAIN_IMPORT": 0,
        "CONFIDENT_SKIP": 0,
        "UNCERTAIN_SKIP": 0,
        "FAIL": 0
    }

    filename = os.path.basename(inputf)

    if re.match(r".*\.csv", filename):
        typeid, typedesc = "lastfm", "Last.fm"
        importfunc = parse_lastfm
    elif re.match(r"endsong_[0-9]+\.json", filename):
        typeid, typedesc = "spotify", "Spotify"
        importfunc = parse_spotify_full
    elif re.match(r"StreamingHistory[0-9]+\.json", filename):
        typeid, typedesc = "spotify", "Spotify"
        importfunc = parse_spotify_lite
    elif re.match(r"maloja_export_[0-9]+\.json", filename):
        typeid, typedesc = "maloja", "Maloja"
        importfunc = parse_maloja
    # username_lb-YYYY-MM-DD.json
    elif re.match(r".*_lb-[0-9-]+\.json", filename):
        typeid, typedesc = "listenbrainz", "ListenBrainz"
        importfunc = parse_listenbrainz
    else:
        print("File", inputf, "could not be identified as a valid import source.")
        return result

    print(f"Parsing {col['yellow'](inputf)} as {col['cyan'](typedesc)} export")
    print("This could take a while...")

    timestamps = set()
    scrobblebuffer = []

    for status, scrobble, msg in importfunc(inputf):
        result[status] += 1
        outputs[status](msg)
        if status in ['CONFIDENT_IMPORT', 'UNCERTAIN_IMPORT']:

            # prevent duplicate timestamps within this import
            while scrobble['scrobble_time'] in timestamps:
                scrobble['scrobble_time'] += 1
            timestamps.add(scrobble['scrobble_time'])

            # clean up artist and title strings
            (scrobble['track_artists'], scrobble['track_title']) = c.fullclean(scrobble['track_artists'], scrobble['track_title'])

            # extra info
            extrainfo = {}
            if scrobble.get('album_name'): extrainfo['album_name'] = scrobble['album_name']
            # saving this in the scrobble instead of the track because for now it's not meant
            # to be authoritative information, just payload of the scrobble

            scrobblebuffer.append({
                "time": scrobble['scrobble_time'],
                "track": {
                    "artists": scrobble['track_artists'],
                    "title": scrobble['track_title'],
                    "length": scrobble['track_length'],
                },
                "duration": scrobble['scrobble_duration'],
                "origin": "import:" + typeid,
                "extra": extrainfo
            })

            # flush the buffer to the database every 1000 imported scrobbles
            if (result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']) % 1000 == 0:
                print(f"Imported {result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']} scrobbles...")
                add_scrobbles(scrobblebuffer)
                scrobblebuffer = []

    add_scrobbles(scrobblebuffer)

    msg = f"Successfully imported {result['CONFIDENT_IMPORT'] + result['UNCERTAIN_IMPORT']} scrobbles"
    if result['UNCERTAIN_IMPORT'] > 0:
        warningmsg = col['orange'](f"{result['UNCERTAIN_IMPORT']} Warning{'s' if result['UNCERTAIN_IMPORT'] != 1 else ''}!")
        msg += f" ({warningmsg})"
    print(msg)

    msg = f"Skipped {result['CONFIDENT_SKIP'] + result['UNCERTAIN_SKIP']} scrobbles"
    if result['UNCERTAIN_SKIP'] > 0:
        warningmsg = col['indianred'](f"{result['UNCERTAIN_SKIP']} Warning{'s' if result['UNCERTAIN_SKIP'] != 1 else ''}!")
        msg += f" ({warningmsg})"
    print(msg)

    if result['FAIL'] > 0:
        print(col['red'](f"{result['FAIL']} Error{'s' if result['FAIL'] != 1 else ''}!"))

    return result

def parse_spotify_lite(inputf):
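    """Parse a Spotify 'lite' export (StreamingHistory<n>.json).

    Offers to import all StreamingHistory files found in the same folder,
    since duplicates can only be identified across the whole dataset.
    Plays shorter than 30 seconds are skipped.
    """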
    pth = os.path
    inputfolder = pth.relpath(pth.dirname(pth.abspath(inputf)))
    filenames = re.compile(r'StreamingHistory[0-9]+\.json')
    inputfiles = [os.path.join(inputfolder, f) for f in os.listdir(inputfolder) if filenames.match(f)]

    if len(inputfiles) == 0:
        print("No files found!")
        return

    if inputfiles != [inputf]:
        print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
        if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?", default=True):
            inputfiles = [inputf]

    for inputf in inputfiles:

        print("Importing", col['yellow'](inputf), "...")
        with open(inputf, 'r') as inputfd:
            data = json.load(inputfd)

        for entry in data:

            try:
                played = int(entry['msPlayed'] / 1000)
                timestamp = int(
                    datetime.datetime.strptime(entry['endTime'], "%Y-%m-%d %H:%M").timestamp()
                )
                artist = entry['artistName']
                title = entry['trackName']

                if played < 30:
                    yield ('CONFIDENT_SKIP', None, f"{entry} is shorter than 30 seconds, skipping...")
                    continue

                yield ("CONFIDENT_IMPORT", {
                    'track_title': title,
                    'track_artists': artist,
                    'track_length': None,
                    'scrobble_time': timestamp,
                    'scrobble_duration': played,
                    'album_name': None
                }, '')
            except Exception as e:
                yield ('FAIL', None, f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
                continue

        print()

def parse_spotify_full(inputf):
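    """Parse a full Spotify export (endsong_<n>.json).

    Deduplicates plays across all endsong files. Entries whose
    offline_timestamp is 0 fall back to the less accurate 'ts' field and
    are imported with a warning.
    """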
    pth = os.path
    inputfolder = pth.relpath(pth.dirname(pth.abspath(inputf)))
    filenames = re.compile(r'endsong_[0-9]+\.json')
    inputfiles = [os.path.join(inputfolder, f) for f in os.listdir(inputfolder) if filenames.match(f)]

    if len(inputfiles) == 0:
        print("No files found!")
        return

    if inputfiles != [inputf]:
        print("Spotify files should all be imported together to identify duplicates across the whole dataset.")
        if not ask("Import " + ", ".join(col['yellow'](i) for i in inputfiles) + "?", default=True):
            inputfiles = [inputf]

    # we keep timestamps here as well to remove duplicates because spotify's export
    # is messy - this is specific to this import type and should not be mixed with
    # the outer function timestamp check (which is there to fix duplicate timestamps
    # that are assumed to correspond to actually distinct plays)
    timestamps = {}
    inaccurate_timestamps = {}

    for inputf in inputfiles:

        print("Importing", col['yellow'](inputf), "...")
        with open(inputf, 'r') as inputfd:
            data = json.load(inputfd)

        for entry in data:

            try:
                played = int(entry['ms_played'] / 1000)
                timestamp = int(entry['offline_timestamp'] / 1000)
                artist = entry['master_metadata_album_artist_name']
                title = entry['master_metadata_track_name']
                album = entry['master_metadata_album_album_name']

                if title is None:
                    yield ('CONFIDENT_SKIP', None, f"{entry} has no title, skipping...")
                    continue
                if artist is None:
                    yield ('CONFIDENT_SKIP', None, f"{entry} has no artist, skipping...")
                    continue
                if played < 30:
                    yield ('CONFIDENT_SKIP', None, f"{entry} is shorter than 30 seconds, skipping...")
                    continue

                # if offline_timestamp is a proper number, we treat it as
                # accurate and check duplicates by that exact timestamp
                if timestamp != 0:
                    if timestamp in timestamps and (artist, title) in timestamps[timestamp]:
                        yield ('CONFIDENT_SKIP', None, f"{entry} seems to be a duplicate, skipping...")
                        continue
                    else:
                        status = 'CONFIDENT_IMPORT'
                        msg = ''
                        timestamps.setdefault(timestamp, []).append((artist, title))

                # if it's 0, we use ts instead, but identify duplicates differently
                # (because that timestamp is not accurate)
                else:
                    timestamp = int(
                        datetime.datetime.strptime(entry['ts'].replace('Z', '+0000'), "%Y-%m-%dT%H:%M:%S%z").timestamp()
                    )

                    # bucket timestamps into 10-second groups so only nearby
                    # scrobbles need to be compared
                    ts_group = int(timestamp / 10)
                    relevant_ts_groups = [ts_group-3, ts_group-2, ts_group-1, ts_group, ts_group+1, ts_group+2, ts_group+3]
                    similar_scrobbles = [scrob for tsg in relevant_ts_groups for scrob in inaccurate_timestamps.get(tsg, [])]

                    scrobble_describe = (timestamp, entry['spotify_track_uri'], entry['ms_played'])
                    found_similar = False
                    for scr in similar_scrobbles:
                        # scrobbles count as duplicate if:
                        # - less than 30 seconds apart
                        # - exact same track uri
                        # - exact same ms_played
                        if (abs(scr[0] - timestamp) < 30) and scr[1:] == scrobble_describe[1:]:
                            yield ('UNCERTAIN_SKIP', None, f"{entry} might be a duplicate, skipping...")
                            found_similar = True
                            break
                    else:
                        # no duplicates, assume proper scrobble but warn
                        status = 'UNCERTAIN_IMPORT'
                        msg = f"{entry} might have an inaccurate timestamp."
                        inaccurate_timestamps.setdefault(ts_group, []).append(scrobble_describe)

                    if found_similar:
                        continue

                yield (status, {
                    'track_title': title,
                    'track_artists': artist,
                    'track_length': None,
                    'album_name': album,
                    'scrobble_time': timestamp,
                    'scrobble_duration': played
                }, msg)
            except Exception as e:
                yield ('FAIL', None, f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
                continue

        print()

def parse_lastfm(inputf):
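    """Parse a Last.fm CSV export with rows of artist, album, title, time."""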
    with open(inputf, 'r', newline='') as inputfd:
        reader = csv.reader(inputfd)

        line = 0
        for row in reader:
            line += 1

            try:
                artist, album, title, time = row
            except ValueError:
                yield ('FAIL', None, f"{row} (Line {line}) does not look like a valid entry. Scrobble not imported.")
                continue

            if time == '':
                yield ('FAIL', None, f"{row} (Line {line}) is missing a timestamp.")
                continue

            try:
                yield ('CONFIDENT_IMPORT', {
                    'track_title': title,
                    'track_artists': artist,
                    'track_length': None,
                    'album_name': album,
                    # export timestamps carry no timezone; treat them as UTC
                    'scrobble_time': int(datetime.datetime.strptime(
                        time + '+0000',
                        "%d %b %Y %H:%M%z"
                    ).timestamp()),
                    'scrobble_duration': None
                }, '')
            except Exception as e:
                yield ('FAIL', None, f"{row} (Line {line}) could not be parsed. Scrobble not imported. ({repr(e)})")
                continue

def parse_listenbrainz(inputf):
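    """Parse a ListenBrainz JSON export (<username>_lb-YYYY-MM-DD.json)."""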
    with open(inputf, 'r') as inputfd:
        data = json.load(inputfd)

    for entry in data:

        try:
            track_metadata = entry['track_metadata']
            additional_info = track_metadata.get('additional_info', {})

            yield ("CONFIDENT_IMPORT", {
                'track_title': track_metadata['track_name'],
                'track_artists': additional_info.get('artist_names') or track_metadata['artist_name'],
                'track_length': int(additional_info.get('duration_ms', 0) / 1000) or additional_info.get('duration'),
                'album_name': track_metadata.get('release_name'),
                'scrobble_time': entry['listened_at'],
                'scrobble_duration': None,
            }, '')
        except Exception as e:
            yield ('FAIL', None, f"{entry} could not be parsed. Scrobble not imported. ({repr(e)})")
            continue

def parse_maloja(inputf):
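    """Parse a native Maloja export (maloja_export_<n>.json)."""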
    with open(inputf, 'r') as inputfd:
        data = json.load(inputfd)

    scrobbles = data['scrobbles']

    for s in scrobbles:

        try:
            yield ('CONFIDENT_IMPORT', {
                'track_title': s['track']['title'],
                'track_artists': s['track']['artists'],
                'track_length': s['track']['length'],
                # album may be missing or null in the export
                'album_name': (s['track'].get('album') or {}).get('name', ''),
                'scrobble_time': s['time'],
                'scrobble_duration': s['duration']
            }, '')
        except Exception as e:
            yield ('FAIL', None, f"{s} could not be parsed. Scrobble not imported. ({repr(e)})")
            continue