datatypes: add Bloom filter (#18327)

2023-08-10 21:13:21 +03:00 · 2023-06-02 15:56:22 +08:00
parent 9764342dbe
commit 0fc33c6fa3
3 changed files with 212 additions and 1 deletions
--- a/vlib/datatypes/README.md
+++ b/vlib/datatypes/README.md
@@ -6,7 +6,7 @@ data types.
 V's `builtin` module is imported implicitly, and has implementations for arrays,
 maps and strings. These are good for many applications, but there are a plethora
 of other useful data structures/containers, like linked lists, priority queues,
-tries, etc, that allow for algorithms with different time complexities, which may
+trees, etc, that allow for algorithms with different time complexities, which may
 be more suitable for your specific application.

 It is implemented using generics, that you have to specialise for the type of
@@ -28,4 +28,5 @@ println(stack)
 - [x] Min heap (priority queue)
 - [x] Set
 - [x] Quadtree
+- [x] Bloom filter
 - [ ] ...
--- a/vlib/datatypes/bloom_filter.v
+++ b/vlib/datatypes/bloom_filter.v
@@ -0,0 +1,125 @@
+module datatypes
+
+// A Bloom filter is used to test whether a given element is part of a set. Lookups will occasionally generate false positives, but never false negatives.
+
+// BloomFilter is a generic Bloom filter over elements of type T, backed by a packed bit table.
+[heap]
+struct BloomFilter[T] {
+	hash_func     fn (T) u32 // user-supplied hash function: takes an element of type T, returns a u32
+	table_size    int        // number of one-bit entries; the bits are packed into `table`
+	num_functions int        // number of salted sub-hashes computed per element, 1~16
+mut:
+	table []u8 // packed bits: entry `i` is bit `i % 8` of byte `i / 8`
+}
+
+const (
+	// Salt values (fixed, random-looking constants). Each salt is XORed with the output of the
+	// hash function, so a single hash value yields several different sub-hashes (one per salt used).
+	salts = [
+		// vfmt off
+		u32(0xefd8c55b),0xa1c57493,0x174c3763,0xc26e60d4,
+		0x9ec387fe,0xdcdc9e97,0xfc495ddc,0x6a1fa748,
+		0x8d82a03b,0x38dc692a,0x97d0f42d,0x048a2be3,
+		0x9b5d83aa,0x2380d32f,0x2437552f,0xcc622295,
+		// vfmt on
+	]
+)
+
+// free calls the builtin `free` on the filter's bit table.
+// NOTE(review): the array value itself is passed to `free` inside `unsafe`; confirm that this
+// is what is intended (as opposed to `b.table.free()`), and that the filter is not used afterwards.
+fn (b &BloomFilter[T]) free() {
+	unsafe {
+		free(b.table)
+	}
+}
+
+// new_bloom_filter_fast creates a new bloom filter with preset parameters:
+// `table_size` is 16384 bits and `num_functions` is 4.
+pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
+	bits := 16384
+	return &BloomFilter[T]{
+		hash_func: hash_func
+		table_size: bits
+		num_functions: 4
+		table: []u8{len: (bits + 7) / 8}
+	}
+}
+
+// new_bloom_filter creates a new bloom filter that hashes elements with `hash_func`.
+// `table_size` is the number of bits in the filter and must be greater than 0.
+// `num_functions` is the number of salted sub-hashes per element and must be between 1 and
+// the number of available salts (16). An error is returned when either value is out of range.
+pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
+	if table_size <= 0 {
+		return error('table_size should be greater than 0')
+	}
+	if num_functions < 1 || num_functions > datatypes.salts.len {
+		return error('num_functions should be between 1~${datatypes.salts.len}')
+	}
+
+	// Number of bytes needed to hold `table_size` bits, rounded up. Since table_size >= 1 here,
+	// this equals (table_size + 7) / 8 but cannot overflow for values close to the `int` maximum.
+	table_bytes := (table_size - 1) / 8 + 1
+	return &BloomFilter[T]{
+		hash_func: hash_func
+		table_size: table_size
+		num_functions: num_functions
+		table: []u8{len: table_bytes}
+	}
+}
+
+// add inserts `element` into the bloom filter: the element is hashed once, the hash is XORed
+// with each of the first `num_functions` salts, and the bit selected by each result is set.
+pub fn (mut b BloomFilter[T]) add(element T) {
+	base := b.hash_func(element)
+	bits := u32(b.table_size)
+
+	for k in 0 .. b.num_functions {
+		// bit position inside the table for the k-th salted hash
+		pos := int((base ^ datatypes.salts[k]) % bits)
+		b.table[pos / 8] |= u8(1 << (pos % 8))
+	}
+}
+
+// exists reports whether every bit selected by `element`'s salted hashes is set.
+// `false` means at least one of those bits is clear; `true` means all of them are set.
+pub fn (b &BloomFilter[T]) exists(element T) bool {
+	base := b.hash_func(element)
+	bits := u32(b.table_size)
+	for k in 0 .. b.num_functions {
+		pos := int((base ^ datatypes.salts[k]) % bits)
+		mask := 1 << (pos % 8)
+		if b.table[pos / 8] & mask == 0 {
+			// one clear bit is enough to rule the element out
+			return false
+		}
+	}
+
+	return true
+}
+
+// @union returns a new bloom filter whose table is the bitwise OR of the tables of `l` and `r`.
+// An error is returned unless both filters have the same `table_size`, `num_functions` and `hash_func`.
+pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
+	if l.table_size != r.table_size {
+		return error('Both filters must be created with the same values.')
+	}
+	if l.num_functions != r.num_functions {
+		return error('Both filters must be created with the same values.')
+	}
+	if l.hash_func != r.hash_func {
+		return error('Both filters must be created with the same values.')
+	}
+
+	mut merged := &BloomFilter[T]{
+		hash_func: l.hash_func
+		table_size: l.table_size
+		num_functions: l.num_functions
+		table: []u8{len: (l.table_size + 7) / 8}
+	}
+	for idx, left_byte in l.table {
+		merged.table[idx] = left_byte | r.table[idx]
+	}
+
+	return merged
+}
+
+// intersection returns a new bloom filter whose table is the bitwise AND of the tables of `l` and `r`.
+// An error is returned unless both filters have the same `table_size`, `num_functions` and `hash_func`.
+pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
+	if l.table_size != r.table_size {
+		return error('Both filters must be created with the same values.')
+	}
+	if l.num_functions != r.num_functions {
+		return error('Both filters must be created with the same values.')
+	}
+	if l.hash_func != r.hash_func {
+		return error('Both filters must be created with the same values.')
+	}
+
+	mut common := &BloomFilter[T]{
+		hash_func: l.hash_func
+		table_size: l.table_size
+		num_functions: l.num_functions
+		table: []u8{len: (l.table_size + 7) / 8}
+	}
+	for idx, left_byte in l.table {
+		common.table[idx] = left_byte & r.table[idx]
+	}
+
+	return common
+}
--- a/vlib/datatypes/bloom_filter_test.v
+++ b/vlib/datatypes/bloom_filter_test.v
@@ -0,0 +1,85 @@
+module datatypes
+
+import hash
+
+// hash_func is the hash used by the tests: the result of `hash.sum64_string` for `s`
+// with seed 0x12345678, converted to u32.
+fn hash_func(s string) u32 {
+	return u32(hash.sum64_string(s, 0x12345678))
+}
+
+// The preset constructor: every added string is found, a string never added is not.
+fn test_bloom_filter_fast() {
+	mut filter := new_bloom_filter_fast[string](hash_func)
+	added := ['hello world', 'v is awsome', 'power by v']
+	for item in added {
+		filter.add(item)
+	}
+	for item in added {
+		assert filter.exists(item) == true
+	}
+	assert filter.exists('my world') == false
+}
+
+// The parameterised constructor with a 65536-bit table and all 16 sub-hashes.
+fn test_bloom_filter_fast_normal() {
+	mut filter := new_bloom_filter[string](hash_func, 65536, 16) or { panic(err) }
+	added := ['hello world', 'v is awsome', 'power by v']
+	for item in added {
+		filter.add(item)
+	}
+	for item in added {
+		assert filter.exists(item) == true
+	}
+	assert filter.exists('my world') == false
+}
+
+// A deliberately tiny filter: the table has only 16 bits and every `add` sets up to 8 of them,
+// so it fills up very quickly and a string that was never added is reported as present.
+fn test_bloom_filter_false_positive() {
+	mut filter := new_bloom_filter[string](hash_func, 16, 8) or { panic(err) }
+	added := ['hello world', 'v is awsome', 'power by v']
+	for item in added {
+		filter.add(item)
+	}
+	for item in added {
+		assert filter.exists(item) == true
+	}
+	assert filter.exists('my world') == true // false positive
+}
+
+// Builds two filters that share exactly one element ('power by v'), then checks the
+// membership answers of each filter, of their union, and of their intersection.
+fn test_bloom_filter_fast_union_intersection() {
+	mut left := new_bloom_filter_fast[string](hash_func)
+	mut right := new_bloom_filter_fast[string](hash_func)
+
+	left_items := ['power by v', 'silly c', 'super rust']
+	right_items := ['hello world', 'v is awsome', 'power by v']
+	for item in left_items {
+		left.add(item)
+	}
+	for item in right_items {
+		right.add(item)
+	}
+
+	// each filter on its own
+	for item in left_items {
+		assert left.exists(item) == true
+	}
+	assert left.exists('power c++') == false
+
+	for item in right_items {
+		assert right.exists(item) == true
+	}
+	assert right.exists('my world') == false
+
+	// left || right test
+	mut merged := left.@union(right) or { panic(err) }
+	for item in ['silly c', 'super rust', 'hello world', 'v is awsome', 'power by v'] {
+		assert merged.exists(item) == true
+	}
+	for item in ['power c++', 'my world'] {
+		assert merged.exists(item) == false
+	}
+
+	// left && right test
+	mut common := left.intersection(right) or { panic(err) }
+	for item in ['silly c', 'super rust', 'power c++', 'hello world', 'v is awsome', 'my world'] {
+		assert common.exists(item) == false
+	}
+	assert common.exists('power by v') == true
+}