hashmap: new and fast hashmap with dynamic size

2023-08-10 21:13:21 +03:00 · 2020-01-24 20:13:17 +01:00
parent 219239eadc
commit 6fd175d9be
2 changed files with 229 additions and 108 deletions
--- a/vlib/builtin/hashmap/hashmap.v
+++ b/vlib/builtin/hashmap/hashmap.v
@@ -2,120 +2,242 @@
 // Use of this source code is governed by an MIT license
 // that can be found in the LICENSE file.
 module hashmap
-/*
-	This is work in progress.
-	A very early test version of the Hashmap with a fixed size.
-	Only works with string keys and int values for now.
-
-	I added this to improve performance of the V compiler,
-	which uses lots of O(log n) map get's. Turned out with N < 10 000
-	the performance gains are basically non-existent.
-*/
-
-
-struct Hashmap {
-	cap           int
-	keys          []string
-	table         []Hashmapentry
-	elm_size      int
-pub mut:
-	nr_collisions int
-}
-
-struct Hashmapentry {
-mut:
-	key  string
-	val  int
-	next &Hashmapentry // linked list for collisions
-}

 const (
-	min_cap = 2<<10
-	max_cap = 2<<20
-)
-
-const(
-	fnv64_prime        = 1099511628211
+	initial_size = 2<<4 // 32 slots, the table size allocated by new_hashmap()
+	initial_cap = initial_size - 1 // stored in Hashmap.cap and used as the index bit mask
+	load_factor = 0.5 // NOTE(review): not referenced by the code in this diff; set() hard-codes the same ratio with a shift
+	probe_offset = u16(256) // added to a slot's info for every probe step, i.e. +1 in the high byte
+	fnv64_prime = 1099511628211 // multiplier used by fnv1a64()
 	fnv64_offset_basis = 14695981039346656037
+	fnv32_offset_basis = u32(2166136261) // NOTE(review): appears unused now that fnv1a32() is removed
+	fnv32_prime = u32(16777619) // NOTE(review): appears unused now that fnv1a32() is removed
 )

-const(
-    fnv32_offset_basis = u32(2166136261)
-    fnv32_prime        = u32(16777619)
-)
-
-pub fn new_hashmap(planned_nr_items int) Hashmap {
-	mut cap := planned_nr_items * 5
-	if cap < min_cap {
-		cap = min_cap
-	}
-	if cap > max_cap {
-		cap = max_cap
-	}
-	return Hashmap{
-		cap: cap
-		elm_size: 4
-		table: make(cap, cap, sizeof(Hashmapentry))
-	}
+pub struct Hashmap {
+mut:
+	info       &u16 // per-slot metadata: 0 = empty slot, otherwise probe count (high byte) | top hash byte (low byte)
+	key_values &KeyValue // entries, indexed in parallel with info
+	cap        int // number of slots minus one; used as the index mask (hash & cap)
+pub mut:
+	size       int // number of stored keys; incremented by set() on insert, decremented by delete()
 }

-pub fn (m mut Hashmap) set(key string, val int) {
-	// mut hash := int(b_fabs(key.hash()))
-	// idx := hash % m.cap
-	idx := int(fnv1a32(key) % m.cap)
-	if m.table[idx].key.len != 0 {
-		// println('\nset() idx=$idx key="$key" hash="$hash" val=$val')
-		m.nr_collisions++
-		// println('collision:' + m.table[idx].key)
-		mut e := &m.table[idx]
-		for e.next != 0 {
-			e = e.next
-		}
-		e.next = &Hashmapentry{
-			key,val,0}
-	}
-	else {
-		m.table[idx] = Hashmapentry{
-			key,val,0}
-	}
-}
-
-pub fn (m &Hashmap) get(key string) int {
-	// mut hash := int(b_fabs(key.hash()))
-	// idx := hash % m.cap
-	idx := int(fnv1a32(key) % m.cap)
-	mut e := &m.table[idx]
-	for e.next != 0 {
-		// todo unsafe {
-		if e.key == key {
-			return e.val
-		}
-		e = e.next
-	}
-	return e.val
-}
-
-[inline]
-fn b_fabs(v int) f64 {
-	return if v < 0 { -v } else { v }
-}
-
-// inline functions here for speed
-// rather than full impl in vlib
-[inline]
-fn fnv1a32(data string) u32 {
-    mut hash := fnv32_offset_basis
-    for i := 0; i < data.len; i++ {
-        hash = (hash ^ u32(data[i])) * fnv32_prime
-    }
-    return hash
+struct KeyValue {
+	key   string // key of the entry; compared with == during lookups
+mut:
+	value int // value stored for key; overwritten in place when set() hits an existing key
 }

 [inline]
 fn fnv1a64(data string) u64 {
-    mut hash := fnv64_offset_basis
-    for i := 0; i < data.len; i++ {
-        hash = (hash ^ u64(data[i])) * fnv64_prime
-    }
-    return hash
+	mut hash := fnv64_offset_basis // an empty string hashes to the offset basis itself
+	for i := 0; i < data.len; i++ {
+		hash = (hash ^ u64(data[i])) * fnv64_prime // FNV-1a step: XOR in one byte, then multiply by the prime
+	}
+	return hash
+}
+
+// new_hashmap returns an empty Hashmap backed by two freshly
+// calloc'ed tables of `initial_size` slots each: one for the per-slot
+// info words and one for the key/value entries.
+pub fn new_hashmap() Hashmap {
+	info_table := &u16(calloc(sizeof(u16) * initial_size))
+	entry_table := &KeyValue(calloc(sizeof(KeyValue) * initial_size))
+	return Hashmap{
+		info: info_table
+		key_values: entry_table
+		cap: initial_cap
+		size: 0
+	}
+}
+
+// set stores `value` under `key`. If `key` is already present its value
+// is overwritten in place and `size` is unchanged; otherwise the entry is
+// inserted with Robin Hood probing and `size` grows by one.
+// The table is grown via rehash() when it is half full, or when a probe
+// sequence would exhaust the 8-bit probe counter.
+pub fn (h mut Hashmap) set(key string, value int) {
+	// The load factor is 0.5; it is meant to become adjustable later, with a
+	// higher default to lower memory usage.
+	// `>=` rather than `==`: `size` is `pub mut`, so an outside write could
+	// step over the exact threshold and the table would never grow again.
+	if (h.size<<1) >= (h.cap - 1) {
+		h.rehash()
+	}
+	// The hash function is planned to be swapped for wyhash.
+	hash := fnv1a64(key)
+	// info = probe count (high byte, starts at 1) | top byte of the hash (low byte).
+	mut info := u16((hash>>56) | probe_offset)
+	mut index := hash & h.cap
+	// Walk past slots whose resident info is larger than ours.
+	for info < h.info[index] {
+		index = (index + 1) & h.cap
+		info += probe_offset
+	}
+	// Equal info means the slot may hold the same key.
+	for info == h.info[index] {
+		if key == h.key_values[index].key {
+			h.key_values[index].value = value
+			return
+		}
+		index = (index + 1) & h.cap
+		info += probe_offset
+	}
+	// A match is not possible anymore.
+	// Probe until an empty slot is found, swapping with the resident entry
+	// whenever ours has the larger info (Robin Hood).
+	// The loop also stops as soon as the probe counter is saturated (0xFF):
+	// one more `info += probe_offset` would wrap the u16 and produce an info
+	// with probe count 0, which could even be stored as 0 ("empty").
+	mut current_key := key
+	mut current_value := value
+	for h.info[index] != 0 && (info & 0xFF00) != 0xFF00 {
+		if info > h.info[index] {
+			tmp_kv := h.key_values[index]
+			tmp_info := h.info[index]
+			h.key_values[index] = KeyValue{
+				current_key,current_value}
+			h.info[index] = info
+			current_key = tmp_kv.key
+			current_value = tmp_kv.value
+			info = tmp_info
+		}
+		index = (index + 1) & h.cap
+		info += probe_offset
+	}
+	// Should almost never happen: the probe counter is exhausted.
+	// Grow the table and insert the entry that is still in hand. If a swap
+	// happened above, the new key is already stored and the entry in hand is
+	// the displaced one; the recursive call accounts for the size increment
+	// either way.
+	if (info & 0xFF00) == 0xFF00 {
+		h.rehash()
+		h.set(current_key, current_value)
+		return
+	}
+	h.info[index] = info
+	h.key_values[index] = KeyValue{
+		current_key,current_value}
+	h.size++
+}
+
+// rehash doubles the number of slots and re-inserts every stored entry
+// into freshly allocated tables. `size` is not changed.
+//
+// The map's fields (`cap`, `info`, `key_values`) are only updated once a
+// complete new table has been built, so the map is never left with a
+// capacity that does not match its tables. If an entry cannot be placed
+// because its probe counter would be exhausted, the attempt is dropped
+// and retried with a table twice as large again.
+// NOTE(review): neither the previous tables nor a dropped attempt are
+// released here (the code before this change did not release them
+// either) - confirm whether they should be freed.
+fn (h mut Hashmap) rehash() {
+	old_cap := h.cap
+	mut new_cap := old_cap
+	for {
+		new_cap = ((new_cap + 1)<<1) - 1
+		mut new_key_values := &KeyValue(calloc(sizeof(KeyValue) * (new_cap + 1)))
+		mut new_info := &u16(calloc(sizeof(u16) * (new_cap + 1)))
+		mut overflow := false
+		for i in 0 .. (old_cap + 1) {
+			if h.info[i] == 0 {
+				// Empty slot in the old table.
+				continue
+			}
+			mut current_key := h.key_values[i].key
+			mut current_value := h.key_values[i].value
+			hash := fnv1a64(current_key)
+			mut info := u16((hash>>56) | probe_offset)
+			mut index := hash & new_cap
+			// All keys in the old table are distinct, so no key comparison is
+			// needed: slots with a larger or equal info are simply skipped,
+			// and slots with a smaller info are swapped (Robin Hood).
+			// Stop when the probe counter is saturated so that the u16 can
+			// never wrap around.
+			for new_info[index] != 0 && (info & 0xFF00) != 0xFF00 {
+				if info > new_info[index] {
+					tmp_kv := new_key_values[index]
+					tmp_info := new_info[index]
+					new_key_values[index] = KeyValue{
+						current_key,current_value}
+					new_info[index] = info
+					current_key = tmp_kv.key
+					current_value = tmp_kv.value
+					info = tmp_info
+				}
+				index = (index + 1) & new_cap
+				info += probe_offset
+			}
+			// Should almost never happen: probe counter exhausted.
+			if (info & 0xFF00) == 0xFF00 {
+				overflow = true
+				break
+			}
+			new_info[index] = info
+			new_key_values[index] = KeyValue{
+				current_key,current_value}
+		}
+		if !overflow {
+			h.cap = new_cap
+			h.key_values = new_key_values
+			h.info = new_info
+			return
+		}
+		// Otherwise loop and try again with a larger table; the old tables
+		// in `h` are still intact and are scanned from the start.
+	}
+}
+
+// delete removes `key` from the map and decrements `size`.
+// Nothing happens when `key` is not present.
+// The gap left behind is closed by shifting the following entries one
+// slot back until an empty slot or an entry with probe count 1 is reached.
+pub fn (h mut Hashmap) delete(key string) {
+	hash := fnv1a64(key)
+	mut meta := u16((hash>>56) | probe_offset)
+	mut slot := hash & h.cap
+	// Skip slots whose resident info is larger than ours.
+	for meta < h.info[slot] {
+		slot = (slot + 1) & h.cap
+		meta += probe_offset
+	}
+	// Only slots with an identical info can hold the key.
+	for meta == h.info[slot] {
+		if key != h.key_values[slot].key {
+			slot = (slot + 1) & h.cap
+			meta += probe_offset
+			continue
+		}
+		// Found it: shift the successors backwards over the hole.
+		mut hole := slot
+		mut next := (slot + 1) & h.cap
+		for (h.info[next]>>8) > 1 {
+			// The moved entry ends up one step closer to its home slot.
+			h.info[hole] = h.info[next] - probe_offset
+			h.key_values[hole] = h.key_values[next]
+			hole = next
+			next = (next + 1) & h.cap
+		}
+		h.info[hole] = 0
+		h.size--
+		return
+	}
+}
+
+// get returns the value stored under `key`, or 0 when `key` is not in
+// the map (use exists() to tell a stored 0 from a missing key).
+pub fn (h Hashmap) get(key string) int {
+	hash := fnv1a64(key)
+	mut meta := u16((hash>>56) | probe_offset)
+	mut slot := hash & h.cap
+	// Skip slots whose resident info is larger than ours.
+	for meta < h.info[slot] {
+		meta += probe_offset
+		slot = (slot + 1) & h.cap
+	}
+	// Only slots with an identical info can hold the key.
+	for meta == h.info[slot] {
+		if key == h.key_values[slot].key {
+			return h.key_values[slot].value
+		}
+		meta += probe_offset
+		slot = (slot + 1) & h.cap
+	}
+	return 0
+}
+
+// exists reports whether `key` is stored in the map.
+pub fn (h Hashmap) exists(key string) bool {
+	hash := fnv1a64(key)
+	mut meta := u16((hash>>56) | probe_offset)
+	mut slot := hash & h.cap
+	// Skip slots whose resident info is larger than ours.
+	for meta < h.info[slot] {
+		meta += probe_offset
+		slot = (slot + 1) & h.cap
+	}
+	// Only slots with an identical info can hold the key.
+	for meta == h.info[slot] {
+		if key != h.key_values[slot].key {
+			meta += probe_offset
+			slot = (slot + 1) & h.cap
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// keys returns the keys of all occupied slots, in table order.
+// The result array is pre-sized to `size` elements.
+pub fn (h Hashmap) keys() []string {
+	mut result := [''].repeat(h.size)
+	mut filled := 0
+	slot_count := h.cap + 1
+	for slot in 0 .. slot_count {
+		if h.info[slot] == 0 {
+			// Empty slot, nothing to collect.
+			continue
+		}
+		result[filled] = h.key_values[slot].key
+		filled++
+	}
+	return result
 }
--- a/vlib/builtin/hashmap/hashmap_test.v
+++ b/vlib/builtin/hashmap/hashmap_test.v
@@ -3,7 +3,7 @@ module hashmap
 import rand

 fn test_random_strings() {
-	mut m := new_hashmap(1000)
+	mut m := new_hashmap()
 	for i in 0..1000 {
 		mut buf := []byte
 		for j in 0..10 {
@@ -21,12 +21,11 @@ fn test_random_strings() {

 fn test_large_hashmap() {
 	N := 300 * 1000
-	mut nums := new_hashmap(N)
+	mut nums := new_hashmap()
 	for i := 0; i < N; i++ {
 	        key := i.str()
 	        nums.set(key, i)
 	}
-	println('nr collisions: $nums.nr_collisions')
 	for i := 0; i < N; i++ {
 		key := i.str()
 		assert nums.get(key) == i