Moved tsv handling to doreah

2023-08-10 21:12:55 +03:00 · 2019-03-29 20:23:32 +01:00 · 2019-03-29 20:23:32 +01:00 · 2f30157b04
commit 2f30157b04
parent 5765687f9d
3 changed files with 132 additions and 123 deletions
--- a/cleanup.py
+++ b/cleanup.py
@ -1,41 +1,42 @@
 import re
 import utilities
 from doreah import tsv
 # need to do this as a class so it can retain loaded settings from file
 # apparently this is not true
 # I'm dumb
 class CleanerAgent:
-	
+
 	def __init__(self):
 		self.updateRules()
-	
+
 	def updateRules(self):
-		raw = utilities.parseAllTSV("rules","string","string","string")
+		raw = tsv.parse_all("rules","string","string","string")
 		self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
 		self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
 		self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"}
 		self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"}
-		
+
 		# we always need to be able to tell if our current database is made with the current rules
 		self.checksums = utilities.checksumTSV("rules")
-			
+
-	
+
-	
+
 	def fullclean(self,artist,title):
 		artists = self.parseArtists(self.removespecial(artist))
 		title = self.parseTitle(self.removespecial(title))
 		(title,moreartists) = self.parseTitleForArtists(title)
-		artists += moreartists	
+		artists += moreartists
 		artists = list(set(artists))
 		artists.sort()
-		
+
 		return (artists,title)
 	def removespecial(self,s):
 		s = s.replace("\t","").replace("␟","").replace("\n","")
 		s = re.sub(" +"," ",s)
 		return s
-		
+
 	# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
 	def confirmedReal(self,a):
@ -51,54 +52,54 @@ class CleanerAgent:
 		if a.strip() == "":
 			return []
-			
+
 		if a.strip() in self.rules_notanartist:
 			return []
-			
+
 		if " performing " in a.lower():
 			return self.parseArtists(re.split(" [Pp]erforming",a)[0])
-			
+
 		if a.strip() in self.rules_belongtogether:
 			return [a.strip()]
 		if a.strip() in self.rules_replaceartist:
 			return self.rules_replaceartist[a.strip()].split("␟")
-			
+
-		
+
-		
+
 		for d in self.delimiters_feat:
 			if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
 				return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
-		
+
 		for d in self.delimiters_formal:
 			if (d in a):
 				ls = []
 				for i in a.split(d):
 					ls += self.parseArtists(i)
 				return ls
-		
+
 		for d in (self.delimiters_feat + self.delimiters):
 			if ((" " + d + " ") in a):
 				ls = []
 				for i in a.split(" " + d + " "):
 					ls += self.parseArtists(i)
 				return ls
-				
+
-		
+
-			
+
-		
+
-			
+
 		return [a.strip()]
 	def parseTitle(self,t):
 		if t.strip() in self.rules_replacetitle:
 			return self.rules_replacetitle[t.strip()]
-	
+
 		t = t.replace("[","(").replace("]",")")
-		
+
 		t = re.sub(r" \(as made famous by .*?\)","",t)
 		t = re.sub(r" \(originally by .*?\)","",t)
 		t = re.sub(r" \(.*?Remaster.*?\)","",t)
-		
+
 		return t.strip()
 	def parseTitleForArtists(self,t):
@ -115,30 +116,30 @@ class CleanerAgent:
 				(title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t))
 				artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t))
 				return (title,artists)
 		return (t,[])
-#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)		
+		return (t,[])
 #this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)
 class CollectorAgent:
-	
+
 	def __init__(self):
 		self.updateRules()
-	
+
 	def updateRules(self):
-		raw = utilities.parseAllTSV("rules","string","string","string")
+		raw = tsv.parse_all("rules","string","string","string")
 		self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"}
 		self.rules_include = {} #Twice the memory, double the performance! (Yes, we're saving redundant information here, but it's not unelegant if it's within a closed object!)
 		for a in self.rules_countas:
 			self.rules_include[self.rules_countas[a]] = self.rules_include.setdefault(self.rules_countas[a],[]) + [a]
-	
+
-	# this agent needs to be aware of the current id assignment in the main program. unelegant, but the best way i can think of	
+	# this agent needs to be aware of the current id assignment in the main program. unelegant, but the best way i can think of
 	def updateIDs(self,artistlist):
 		self.rules_countas_id = {artistlist.index(a):artistlist.index(self.rules_countas[a]) for a in self.rules_countas}
 		#self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include}
 		#this needs to take lists into account
-		
+
 	def getCredited(self,artist):
 		if artist in self.rules_countas_id:
 			return self.rules_countas_id[artist]
@ -146,36 +147,36 @@ class CollectorAgent:
 			return self.rules_countas[artist]
 		else:
 			return artist
-	
+
-		
+
 	def getCreditedList(self,artists):
 		updatedArtists = []
 		for artist in artists:
 			updatedArtists.append(self.getCredited(artist))
 		return list(set(updatedArtists))
-		
+
 	def getAllAssociated(self,artist):
 		return self.rules_include.get(artist,[])
-		
+
 	# this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when
 	# countas rules are declared preemptively
 	def getAllArtists(self):
 		return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas]))
-		
+
-		
+
-		
+
-		
+
-		
+
-		
+
-		
+
 def flatten(lis):
 	newlist = []
-		
+
 	for l in lis:
 		if isinstance(l, str):
 			newlist.append(l)
 		else:
 			newlist = newlist + l
-				
+
 	return list(set(newlist))
--- a/database.py
+++ b/database.py
@ -7,6 +7,7 @@ import datetime
 from cleanup import *
 from utilities import *
 from doreah.logging import log
 from doreah import tsv
 from malojatime import *
 import sys
 import unicodedata
@ -39,8 +40,10 @@ db_rulestate = False
 ### symmetric keys are fine for now since we hopefully use HTTPS
 def loadAPIkeys():
 	global clients
-	createTSV("clients/authenticated_machines.tsv")
+	tsv.create("clients/authenticated_machines.tsv")
-	clients = parseTSV("clients/authenticated_machines.tsv","string","string")
+	#createTSV("clients/authenticated_machines.tsv")
 	clients = tsv.parse("clients/authenticated_machines.tsv","string","string")
 	#clients = parseTSV("clients/authenticated_machines.tsv","string","string")
 	log("Authenticated Machines: " + ", ".join([m[1] for m in clients]))
 def checkAPIkey(k):
@ -550,7 +553,8 @@ def newrule():
 	keys = FormsDict.decode(request.forms)
 	apikey = keys.pop("key",None)
 	if (checkAPIkey(apikey)):
-		addEntry("rules/webmade.tsv",[k for k in keys])
+		tsv.add_entry("rules/webmade.tsv",[k for k in keys])
 		#addEntry("rules/webmade.tsv",[k for k in keys])
 		global db_rulestate
 		db_rulestate = False
@ -742,7 +746,8 @@ def build_db():
 	# parse files
-	db = parseAllTSV("scrobbles","int","string","string",escape=False)
+	db = tsv.parse_all("scrobbles","int","string","string",comments=False)
 	#db = parseAllTSV("scrobbles","int","string","string",escape=False)
 	for sc in db:
 		artists = sc[1].split("␟")
 		title = sc[2]
@ -803,7 +808,8 @@ def sync():
 			SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True)
 	for e in entries:
-		addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
+		tsv.add_entries("scrobbles/" + e + ".tsv",entries[e],comments=False)
 		#addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
 		combineChecksums("scrobbles/" + e + ".tsv",cla.checksums)
--- a/utilities.py
+++ b/utilities.py
@ -6,48 +6,49 @@ import pickle
 import urllib
 import datetime
 from doreah import settings
 from doreah.logging import log
 ### TSV files
-def parseTSV(filename,*args,escape=True):
+#def parseTSV(filename,*args,escape=True):
-	f = open(filename)
+#	f = open(filename)
 #
 #	result = []
 #	for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
 #
 #		l = l.replace("\n","")
 #		if escape:
 #			l = l.split("#")[0]
 #		l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
 #		data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
 #		entry = [] * len(args)
 #		for i in range(len(args)):
 #			if args[i]=="list":
 #				try:
 #					entry.append(data[i].split("␟"))
 #				except:
 #					entry.append([])
 #			elif args[i]=="string":
 #				try:
 #					entry.append(data[i])
 #				except:
 #					entry.append("")
 #			elif args[i]=="int":
 #				try:
 #					entry.append(int(data[i]))
 #				except:
 #					entry.append(0)
 #			elif args[i]=="bool":
 #				try:
 #					entry.append((data[i].lower() in ["true","yes","1","y"]))
 #				except:
 #					entry.append(False)
 #
 #		result.append(entry)
-	result = []
+#	f.close()
-	for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
+#	return result
 		l = l.replace("\n","")
 		if escape:
 			l = l.split("#")[0]
 		l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
 		data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
 		entry = [] * len(args)
 		for i in range(len(args)):
 			if args[i]=="list":
 				try:
 					entry.append(data[i].split("␟"))
 				except:
 					entry.append([])
 			elif args[i]=="string":
 				try:
 					entry.append(data[i])
 				except:
 					entry.append("")
 			elif args[i]=="int":
 				try:
 					entry.append(int(data[i]))
 				except:
 					entry.append(0)
 			elif args[i]=="bool":
 				try:
 					entry.append((data[i].lower() in ["true","yes","1","y"]))
 				except:
 					entry.append(False)
 		result.append(entry)
 	f.close()
 	return result
 def checksumTSV(folder):
@ -110,40 +111,40 @@ def consistentRulestate(folder,checksums):
 	return True
-def parseAllTSV(path,*args,escape=True):
+#def parseAllTSV(path,*args,escape=True):
 #
 #
 #	result = []
 #	for f in os.listdir(path + "/"):
 #
 #		if (f.endswith(".tsv")):
 #
 #			result += parseTSV(path + "/" + f,*args,escape=escape)
 #
 #	return result
 #def createTSV(filename):
 #
 #	if not os.path.exists(filename):
 #		open(filename,"w").close()
-	result = []
+#def addEntry(filename,a,escape=True):
-	for f in os.listdir(path + "/"):
+#
-
+#	createTSV(filename)
-		if (f.endswith(".tsv")):
+#
-
+#	line = "\t".join(a)
-			result += parseTSV(path + "/" + f,*args,escape=escape)
+#	if escape: line = line.replace("#",r"\num")
-
+#	with open(filename,"a") as f:
-	return result
+#		f.write(line + "\n")
 def createTSV(filename):
 	if not os.path.exists(filename):
 		open(filename,"w").close()
 def addEntry(filename,a,escape=True):
 	createTSV(filename)
 	line = "\t".join(a)
 	if escape: line = line.replace("#",r"\num")
 	with open(filename,"a") as f:
 		f.write(line + "\n")
 def addEntries(filename,al,escape=True):
 	with open(filename,"a") as f:
 		for a in al:
 			line = "\t".join(a)
 			if escape: line = line.replace("#",r"\num")
 			f.write(line + "\n")
 #def addEntries(filename,al,escape=True):
 #
 #	with open(filename,"a") as f:
 #		for a in al:
 #			line = "\t".join(a)
 #			if escape: line = line.replace("#",r"\num")
 #			f.write(line + "\n")
 #
 ### Useful functions
@ -273,6 +274,7 @@ def cache_track(artists,title,result):
 	day = datetime.date.today().toordinal()
 	cachedTracksDays[(frozenset(artists),title)] = day
 def cache_artist(artist,result):
 	if result is None: log("Caching None for " + artist,module="debug")
 	cachedArtists[artist] = result
 	day = datetime.date.today().toordinal()
 	cachedArtistsDays[artist] = day