v/vlib/builtin/map.v

// Copyright (c) 2019-2020 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.

module builtin

import (
	strings
	hash.wyhash
)

/*
This is a very fast hashmap implementation. It has several properties
that in combination makes it very fast. Here is a short explanation of
each property. After reading this you should have a basic understanding
of how it works:

1.	Hash-function (Wyhash). Wyhash is the fastest hash-function
	passing SMHasher, so it was an easy choice.

2.	Open addressing (Robin Hood Hashing). With this method a hash
	collision is resolved by probing. As opposed to linear probing,
	Robin Hood hashing has simple but clever twist: As new keys are
	inserted, old keys are shifted around in a way such that all keys
	stay reasonably close to the slot they originally hash to.

3.	Memory layout. Key-value pairs are stored in a "DenseArray",
	with an average of rougly 6.25% unused memory as opposed to
	most other dynamic array implementation with a growth factor
	of 1.5 or 2. The key-values keep their index in the array -
	they and are not probed. Instead, this implementation uses another
	array "metas" storing "metas" (meta-data). Each Key-value has
	a corresponding meta. A meta stores a reference to its key-value, and
	its index in "metas" is determined by the hash of the key and probing.
	A meta also stores bits from the hash (for faster rehashing etc.)
	and how far away it is from the index it was originally hashed to
	(probe count).

	meta (64 bit) = probe_count (8 bits) | hashbits (24 bits) | kv_index (32 bit)
	metas = [meta, 0, meta, 0, meta, meta, meta, 0, ...]
	key_values = [kv, kv, kv, kv, kv, ...]

4.	Power of two. TODO: explain
5.	Extra metas. TODO: explain
6.	Cached rehashing TODO: explain
7.	Load-factor. TODO: explain
8.	Deletion. TODO: explain
*/

const (
	// Number of bits from the hash stored for each entry
	hashbits = 24
	// Number of bits from the hash stored for rehasing
	cached_hashbits = 16
	// Initial log-number of buckets in the hashtable
	init_log_capicity = 5
	// Initial number of buckets in the hashtable
	init_capicity = 1 << init_log_capicity
	// Initial max load-factor
	init_max_load_factor = 0.8
	// Minimum Load-factor.
	// Number is picked to make delete O(1) amortized
	min_load_factor = 0.3
	// Initial range cap
	init_cap = init_capicity - 2
	// Used for incrementing `extra_metas` when max
	// probe count is too high, to avoid overflow
	extra_metas_inc = 4
	// Bitmask to select all the hashbits
	hash_mask = u32(0x00FFFFFF)
	// Used for incrementing the probe-count
	probe_inc = u32(0x01000000)
	// Bitmask for maximum probe count
	max_probe = u32(0xFF000000)
)

struct KeyValue {
	key   string
mut:
	value voidptr
}

// Dynamic array with very low growth factor
struct DenseArray {
mut:
	data &KeyValue
	cap  u32
	size u32
}

[inline]
fn new_dense_array() DenseArray {
	unsafe {
		return DenseArray {
			data: &KeyValue(malloc(8 * sizeof(KeyValue)))
			cap: 8
			size: 0
		}
	}
}

// Push element to array and return index
// The growth-factor is roughly 12.5 `(x + (x >> 3))`
[inline]
fn (d mut DenseArray) push(kv KeyValue) u32 {
	if d.cap == d.size {
		d.cap += d.cap >> 3
		d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
	}
	push_index := d.size
	d.data[push_index] = kv
	d.size++
	return push_index
}

// Move all zeros to the end of the array
// and resize array
fn (d mut DenseArray) zeros_to_end() {
	mut count := u32(0)
	for i in 0..d.size {
		if d.data[i].key.str != 0 {
			tmp := d.data[count]
			d.data[count] = d.data[i]
			d.data[i] = tmp
			count++
		}
	}
	d.size = count
	d.cap = if count < 8 {8} else {count}
	d.data = &KeyValue(C.realloc(d.data, sizeof(KeyValue) * d.cap))
}

pub struct map {
	// Byte size of value
	value_bytes     int
mut:
	// Index of the highest index in the hashtable
	cap             u32
	// Number of cached hashbits left for rehasing
	window          byte
	// Used for right-shifting out used hashbits
	shift           byte
	// Pointer to Key-value memory
	key_values      DenseArray
	// Pointer to meta-data
	metas           &u32
	// Measure that decides when to increase the capacity
	max_load_factor f32
	// Extra metas that allows for no ranging when incrementing
	// index in the hashmap
	extra_metas     u32
pub mut:
	// Number of key-values currently in the hashmap
	size            int
}

fn new_map(n, value_bytes int) map {
	return map{
		value_bytes: value_bytes
		cap: init_cap
		window: cached_hashbits
		shift: init_log_capicity
		key_values: new_dense_array()
		metas: &u32(vcalloc(sizeof(u32) * (init_capicity + extra_metas_inc)))
		max_load_factor: init_max_load_factor
		extra_metas: extra_metas_inc
		size: 0
	}
}

fn new_map_init(n, value_bytes int, keys &string, values voidptr) map {
	mut out := new_map(n, value_bytes)
	for i in 0 .. n {
		out.set(keys[i], values + i * value_bytes)
	}
	return out
}

fn (m mut map) set(key string, value voidptr) {
	if (f32(m.size << 1) / f32(m.cap)) > m.max_load_factor {
		m.expand()
	}
	hash := wyhash.wyhash_c(key.str, u64(key.len), 0)
	mut meta := u32(((hash >> m.shift) & hash_mask) | probe_inc)
	mut index := hash & m.cap
	// While probe count is less
	for meta < m.metas[index] {
		index += 2
		meta += probe_inc
	}
	// While we might have a match
	for meta == m.metas[index] {
		kv_index := m.metas[index + 1]
		if key == m.key_values.data[kv_index].key {
			C.memcpy(m.key_values.data[kv_index].value, value, m.value_bytes)
			return
		}
		index += 2
		meta += probe_inc
	}
	// Match is not possible anymore.
	// Probe until an empty index is found.
	// Swap when probe count is higher/richer (Robin Hood).
	kv := KeyValue{
		key: key
		value: malloc(m.value_bytes)
	}
	C.memcpy(kv.value, value, m.value_bytes)
	mut kv_index := m.key_values.push(kv)
	for m.metas[index] != 0 {
		if meta > m.metas[index] {
			tmp_meta := m.metas[index]
			m.metas[index] = meta
			meta = tmp_meta
			tmp_index := m.metas[index + 1]
			m.metas[index + 1] = kv_index
			kv_index = tmp_index
		}
		index += 2
		meta += probe_inc
	}
	probe_count := (meta >> hashbits) - 1
	if (probe_count << 1) == m.extra_metas {
		// Should almost never happen
		if (meta & max_probe) == max_probe {
			m.expand()
			m.set(kv.key, kv.value)
			return
		}
		m.extra_metas += extra_metas_inc
		mem_size := (m.cap + 2 + m.extra_metas)
		m.metas = &u32(C.realloc(m.metas, sizeof(u32) * mem_size))
		C.memset(m.metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
	}
	m.metas[index] = meta
	m.metas[index + 1] = kv_index
	m.size++
}

// Doubles the size of the hashmap
fn (m mut map) expand() {
	old_cap := m.cap
	m.cap = ((m.cap + 2)<<1) - 2
	// Check if any hashbits are left
	if m.window == 0 {
		m.shift += cached_hashbits
		m.rehash()
		m.window = cached_hashbits
	}
	else {
		m.cached_rehash(old_cap)
	}
	m.window--
}

// Halves the size of the hashmap
fn (m mut map) shrink() {
	m.key_values.zeros_to_end()
	m.cap = ((m.cap + 2)>>1) - 2
	if m.window == 16 {
		m.shift -= cached_hashbits
		m.window = 0
	}
	m.rehash()
	m.window++
}

fn (m mut map) rehash() {
	meta_bytes := sizeof(u32) * (m.cap + 2 + m.extra_metas)
	m.metas = &u32(C.realloc(m.metas, meta_bytes))
	C.memset(m.metas, 0, meta_bytes)
	for i := u32(0); i < m.key_values.size; i++ {
		if m.key_values.data[i].key.str == 0 {
			continue
		}
		kv := m.key_values.data[i]
		hash := wyhash.wyhash_c(kv.key.str, u64(kv.key.len), 0)
		mut meta := u32(((hash>>m.shift) & hash_mask) | probe_inc)
		mut index := hash & m.cap
		// While probe count is less
		for meta < m.metas[index] {
			index += 2
			meta += probe_inc
		}
		// Match is not possible anymore.
		// Probe until an empty index is found.
		// Swap when probe count is higher/richer (Robin Hood).
		mut kv_index := i
		for m.metas[index] != 0 {
			if meta > m.metas[index] {
				tmp_meta := m.metas[index]
				m.metas[index] = meta
				meta = tmp_meta
				tmp_index := m.metas[index + 1]
				m.metas[index + 1] = kv_index
				kv_index = tmp_index
			}
			index += 2
			meta += probe_inc
		}
		probe_count := (meta >> hashbits) - 1
		if (probe_count << 1) == m.extra_metas {
			// Should almost never happen
			if (meta & max_probe) == max_probe {
				m.expand()
				return
			}
			m.extra_metas += extra_metas_inc
			mem_size := (m.cap + 2 + m.extra_metas)
			m.metas = &u32(C.realloc(m.metas, sizeof(u32) * mem_size))
			C.memset(m.metas + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
		}
		m.metas[index] = meta
		m.metas[index + 1] = kv_index
	}
}

fn (m mut map) cached_rehash(old_cap u32) {
	mut new_meta := &u32(vcalloc(sizeof(u32) * (m.cap + 2 + m.extra_metas)))
	old_extra_metas := m.extra_metas
	for i := 0; i <= old_cap + old_extra_metas; i += 2 {
		if m.metas[i] == 0 {
			continue
		}
		old_meta := m.metas[i]
		old_probe_count := u64((old_meta>>hashbits) - 1) << 1
		old_index := (i - old_probe_count) & (m.cap >> 1)
		mut index := u64(old_index) | (old_meta << m.shift) & m.cap
		mut meta := (old_meta & hash_mask) | probe_inc
		// While probe count is less
		for meta < new_meta[index] {
			index += 2
			meta += probe_inc
		}
		// Match is not possible anymore.
		// Probe until an empty index is found.
		// Swap when probe count is higher/richer (Robin Hood).
		mut kv_index := m.metas[i + 1]
		for new_meta[index] != 0 {
			if meta > new_meta[index] {
				tmp_meta := new_meta[index]
				new_meta[index] = meta
				meta = tmp_meta
				tmp_index := new_meta[index + 1]
				new_meta[index + 1] = kv_index
				kv_index = tmp_index
			}
			index += 2
			meta += probe_inc
		}
		probe_count := (meta >> hashbits) - 1
		if (probe_count << 1) == m.extra_metas {
			// Should almost never happen
			if (meta & max_probe) == max_probe {
				free(new_meta)
				m.expand()
				return
			}
			m.extra_metas += extra_metas_inc
			mem_size := (m.cap + 2 + m.extra_metas)
			new_meta = &u32(C.realloc(new_meta, sizeof(u32) * mem_size))
			C.memset(new_meta + mem_size - extra_metas_inc, 0, sizeof(u32) * extra_metas_inc)
		}
		new_meta[index] = meta
		new_meta[index + 1] = kv_index
	}
	unsafe{
		free(m.metas)
	}
	m.metas = new_meta
}

[inline]
fn (m map) get(key string, out voidptr) bool {
	hash := wyhash.wyhash_c(key.str, u64(key.len), 0)
	mut index := hash & m.cap
	mut meta := u32(((hash>>m.shift) & hash_mask) | probe_inc)
	for meta < m.metas[index] {
		index += 2
		meta += probe_inc
	}
	for meta == m.metas[index] {
		kv_index := m.metas[index + 1]
		if key == m.key_values.data[kv_index].key {
			C.memcpy(out, m.key_values.data[kv_index].value, m.value_bytes)
			return true
		}
		index += 2
		meta += probe_inc
	}
	return false
}

[inline]
fn (m map) exists(key string) bool {
	if m.value_bytes == 0 {
		return false
	}
	hash := wyhash.wyhash_c(key.str, u64(key.len), 0)
	mut index := hash & m.cap
	mut meta := u32(((hash>>m.shift) & hash_mask) | probe_inc)
	for meta < m.metas[index] {
		index += 2
		meta += probe_inc
	}
	for meta == m.metas[index] {
		kv_index := m.metas[index + 1]
		if key == m.key_values.data[kv_index].key {
			return true
		}
		index += 2
		meta += probe_inc
	}
	return false
}

pub fn (m mut map) delete(key string) {
	hash := wyhash.wyhash_c(key.str, u64(key.len), 0)
	mut index := hash & m.cap
	mut meta := u32(((hash>>m.shift) & hash_mask) | probe_inc)
	for meta < m.metas[index] {
		index += 2
		meta += probe_inc
	}
	// Perform backwards shifting
	for meta == m.metas[index] {
		kv_index := m.metas[index + 1]
		if key == m.key_values.data[kv_index].key {
			C.memset(&m.key_values.data[kv_index], 0, sizeof(KeyValue))
			mut old_index := index
			index += 2
			mut cur_meta := m.metas[index]
			mut cur_index := m.metas[index + 1]
			for (cur_meta >> hashbits) > 1 {
				m.metas[old_index] = cur_meta - probe_inc
				m.metas[old_index + 1] = cur_index
				old_index = index
				index += 2
				cur_meta = m.metas[index]
				cur_index = m.metas[index + 1]
			}
			m.metas[old_index] = 0
			m.size--
			if m.cap == 30 {return}
			if (f32(m.size << 1) / f32(m.cap)) < min_load_factor {
				m.shrink()
			}
			return
		}
		index += 2
		meta += probe_inc
	}
}

pub fn (m &map) keys() []string {
	mut keys := [''].repeat(m.size)
	if m.value_bytes == 0 {
		return keys
	}
	mut j := 0
	for i := u32(0); i < m.key_values.size; i++ {
		if m.key_values.data[i].key.str == 0 {
			continue
		}
		keys[j] = m.key_values.data[i].key
		j++
	}
	return keys
}

pub fn (m map) free() {
	unsafe {
		free(m.metas)
		free(m.key_values.data)
	}
}

pub fn (m map) print() {
	println('TODO')
}

pub fn (m map_string) str() string {
	if m.size == 0 {
		return '{}'
	}
	mut sb := strings.new_builder(50)
	sb.writeln('{')
	for key, val  in m {
		sb.writeln('  "$key" => "$val"')
	}
	sb.writeln('}')
	return sb.str()
}