diff --git a/vlib/datatypes/README.md b/vlib/datatypes/README.md
index 5c1251e568..2c9fa524c8 100644
--- a/vlib/datatypes/README.md
+++ b/vlib/datatypes/README.md
@@ -6,7 +6,7 @@ data types.
 V's `builtin` module is imported implicitly, and has implementations for arrays,
 maps and strings. These are good for many applications, but there are a plethora
 of other useful data structures/containers, like linked lists, priority queues,
-tries, etc, that allow for algorithms with different time complexities, which may
+trees, etc, that allow for algorithms with different time complexities, which may
 be more suitable for your specific application.
 
 It is implemented using generics, that you have to specialise for the type of
@@ -28,4 +28,5 @@ println(stack)
 - [x] Min heap (priority queue)
 - [x] Set
 - [x] Quadtree
+- [x] Bloom filter
 - [ ] ...
diff --git a/vlib/datatypes/bloom_filter.v b/vlib/datatypes/bloom_filter.v
new file mode 100644
index 0000000000..34ee924da1
--- /dev/null
+++ b/vlib/datatypes/bloom_filter.v
@@ -0,0 +1,132 @@
+module datatypes
+
+// BloomFilter is used to test whether a given element is part of a set.
+// Lookups will occasionally generate false positives, but never false negatives.
+[heap]
+pub struct BloomFilter[T] {
+	hash_func     fn (T) u32 // hash function, input [T] , output u32
+	table_size    int // every entry is one-bit, packed into `table`
+	num_functions int // 1~16
+mut:
+	table []u8
+}
+
+const (
+	// Salt values(random values). These salts are XORed with the output of the hash function to give multiple unique hashes.
+	salts = [
+		// vfmt off
+		u32(0xefd8c55b),0xa1c57493,0x174c3763,0xc26e60d4,
+		0x9ec387fe,0xdcdc9e97,0xfc495ddc,0x6a1fa748,
+		0x8d82a03b,0x38dc692a,0x97d0f42d,0x048a2be3,
+		0x9b5d83aa,0x2380d32f,0x2437552f,0xcc622295,
+		// vfmt on
+	]
+)
+
+// free passes the bit table of the filter to `free`.
+fn (b &BloomFilter[T]) free() {
+	unsafe {
+		free(b.table)
+	}
+}
+
+// new_bloom_filter_fast creates a new bloom filter with fixed parameters:
+// `table_size` is 16384, and `num_functions` is 4.
+pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
+	return &BloomFilter[T]{
+		hash_func: hash_func
+		table_size: 16384
+		num_functions: 4
+		table: []u8{len: (16384 + 7) / 8}
+	}
+}
+
+// new_bloom_filter creates a new bloom filter. `table_size` should be greater than 0,
+// and `num_functions` should be between 1 and 16. An error is returned otherwise.
+pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
+	if table_size <= 0 {
+		return error('table_size should be greater than 0')
+	}
+	if num_functions < 1 || num_functions > datatypes.salts.len {
+		return error('num_functions should be between 1~${datatypes.salts.len}')
+	}
+
+	return &BloomFilter[T]{
+		hash_func: hash_func
+		table_size: table_size
+		num_functions: num_functions
+		table: []u8{len: (table_size + 7) / 8}
+	}
+}
+
+// add adds the element to the bloom filter.
+pub fn (mut b BloomFilter[T]) add(element T) {
+	hash := b.hash_func(element)
+
+	for i in 0 .. b.num_functions {
+		// each salt derives a different bit position from the single base hash
+		subhash := hash ^ datatypes.salts[i]
+		index := int(subhash % u32(b.table_size))
+		bb := u8((1 << (index % 8)))
+		b.table[index / 8] |= bb
+	}
+}
+
+// exists checks whether the element may be present in the bloom filter.
+// A `false` result is definitive; a `true` result may be a false positive.
+pub fn (b &BloomFilter[T]) exists(element T) bool {
+	hash := b.hash_func(element)
+	for i in 0 .. b.num_functions {
+		subhash := hash ^ datatypes.salts[i]
+		index := int(subhash % u32(b.table_size))
+		bb := b.table[index / 8]
+		bit := 1 << (index % 8)
+		if bb & bit == 0 {
+			return false
+		}
+	}
+
+	return true
+}
+
+// @union returns the union of the two bloom filters.
+// Both filters must have the same `table_size`, `num_functions` and hash function.
+pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
+	if l.table_size != r.table_size || l.num_functions != r.num_functions
+		|| l.hash_func != r.hash_func {
+		return error('Both filters must be created with the same values.')
+	}
+
+	mut new_f := BloomFilter[T]{
+		hash_func: l.hash_func
+		table_size: l.table_size
+		num_functions: l.num_functions
+		table: []u8{len: (l.table_size + 7) / 8}
+	}
+	for i in 0 .. l.table.len {
+		new_f.table[i] = l.table[i] | r.table[i]
+	}
+
+	return &new_f
+}
+
+// intersection returns the intersection of the two bloom filters.
+// Both filters must have the same `table_size`, `num_functions` and hash function.
+pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
+	if l.table_size != r.table_size || l.num_functions != r.num_functions
+		|| l.hash_func != r.hash_func {
+		return error('Both filters must be created with the same values.')
+	}
+
+	mut new_f := BloomFilter[T]{
+		hash_func: l.hash_func
+		table_size: l.table_size
+		num_functions: l.num_functions
+		table: []u8{len: (l.table_size + 7) / 8}
+	}
+	for i in 0 .. l.table.len {
+		new_f.table[i] = l.table[i] & r.table[i]
+	}
+
+	return &new_f
+}
diff --git a/vlib/datatypes/bloom_filter_test.v b/vlib/datatypes/bloom_filter_test.v
new file mode 100644
index 0000000000..50e1d9c123
--- /dev/null
+++ b/vlib/datatypes/bloom_filter_test.v
@@ -0,0 +1,85 @@
+module datatypes
+
+import hash
+
+fn hash_func(s string) u32 {
+	val64 := hash.sum64_string(s, 0x12345678)
+	return u32(val64)
+}
+
+fn test_bloom_filter_fast() {
+	mut b := new_bloom_filter_fast[string](hash_func)
+	b.add('hello world')
+	b.add('v is awsome')
+	b.add('power by v')
+	assert b.exists('hello world') == true
+	assert b.exists('v is awsome') == true
+	assert b.exists('power by v') == true
+	assert b.exists('my world') == false
+}
+
+fn test_bloom_filter_fast_normal() {
+	mut b := new_bloom_filter[string](hash_func, 65536, 16) or { panic(err) }
+	b.add('hello world')
+	b.add('v is awsome')
+	b.add('power by v')
+	assert b.exists('hello world') == true
+	assert b.exists('v is awsome') == true
+	assert b.exists('power by v') == true
+	assert b.exists('my world') == false
+}
+
+fn test_bloom_filter_false_positive() {
+	// every `add` will set 8 bits in the table(total length = 16), so overflow very quickly
+	mut b := new_bloom_filter[string](hash_func, 16, 8) or { panic(err) }
+	b.add('hello world')
+	b.add('v is awsome')
+	b.add('power by v')
+	assert b.exists('hello world') == true
+	assert b.exists('v is awsome') == true
+	assert b.exists('power by v') == true
+	assert b.exists('my world') == true // false positive
+}
+
+fn test_bloom_filter_fast_union_intersection() {
+	mut a := new_bloom_filter_fast[string](hash_func)
+	mut b := new_bloom_filter_fast[string](hash_func)
+
+	a.add('power by v')
+	a.add('silly c')
+	a.add('super rust')
+
+	b.add('hello world')
+	b.add('v is awsome')
+	b.add('power by v')
+
+	assert a.exists('power by v') == true
+	assert a.exists('silly c') == true
+	assert a.exists('super rust') == true
+	assert a.exists('power c++') == false
+
+	assert b.exists('hello world') == true
+	assert b.exists('v is awsome') == true
+	assert b.exists('power by v') == true
+	assert b.exists('my world') == false
+
+	// a || b test
+	mut c := a.@union(b) or { panic(err) }
+	assert c.exists('silly c') == true
+	assert c.exists('super rust') == true
+	assert c.exists('power c++') == false
+	assert c.exists('hello world') == true
+	assert c.exists('v is awsome') == true
+	assert c.exists('power by v') == true
+	assert c.exists('my world') == false
+
+	// a && b test
+	mut d := a.intersection(b) or { panic(err) }
+	assert d.exists('silly c') == false
+	assert d.exists('super rust') == false
+	assert d.exists('power c++') == false
+	assert d.exists('hello world') == false
+	assert d.exists('v is awsome') == false
+	assert d.exists('power by v') == true
+	assert d.exists('my world') == false
+}