mirror of
https://github.com/vlang/v.git
synced 2023-08-10 21:13:21 +03:00
datatypes: add Bloom filter (#18327)
This commit is contained in:
parent
9764342dbe
commit
0fc33c6fa3
@ -6,7 +6,7 @@ data types.
|
||||
V's `builtin` module is imported implicitly, and has implementations for arrays,
|
||||
maps and strings. These are good for many applications, but there are a plethora
|
||||
of other useful data structures/containers, like linked lists, priority queues,
|
||||
tries, etc, that allow for algorithms with different time complexities, which may
|
||||
trees, etc, that allow for algorithms with different time complexities, which may
|
||||
be more suitable for your specific application.
|
||||
|
||||
It is implemented using generics, that you have to specialise for the type of
|
||||
@ -28,4 +28,5 @@ println(stack)
|
||||
- [x] Min heap (priority queue)
|
||||
- [x] Set
|
||||
- [x] Quadtree
|
||||
- [x] Bloom filter
|
||||
- [ ] ...
|
||||
|
125
vlib/datatypes/bloom_filter.v
Normal file
125
vlib/datatypes/bloom_filter.v
Normal file
@ -0,0 +1,125 @@
|
||||
module datatypes
|
||||
|
||||
// Bloom filter is used to test whether a given element is part of a set. Lookups will occasionally generate false positives, but never false negatives.
|
||||
|
||||
[heap]
|
||||
struct BloomFilter[T] {
|
||||
hash_func fn (T) u32 // user-supplied hash function: takes a T, returns a u32
|
||||
table_size int // number of one-bit entries; the bits are packed eight per byte into `table`
|
||||
num_functions int // number of salted hashes set/tested per element, 1~16 (one salt each)
|
||||
mut:
|
||||
table []u8 // packed bit table: entry `i` is bit `i % 8` of byte `i / 8`
|
||||
}
|
||||
|
||||
const (
|
||||
// Salt values (random values). Each salt is XORed with the output of the hash function,
// so that a single hash yields several distinct sub-hashes. The filter uses the first
// `num_functions` salts, so the number of salts here (16) is the upper bound for `num_functions`.
|
||||
salts = [
|
||||
// vfmt off
|
||||
u32(0xefd8c55b),0xa1c57493,0x174c3763,0xc26e60d4,
|
||||
0x9ec387fe,0xdcdc9e97,0xfc495ddc,0x6a1fa748,
|
||||
0x8d82a03b,0x38dc692a,0x97d0f42d,0x048a2be3,
|
||||
0x9b5d83aa,0x2380d32f,0x2437552f,0xcc622295,
|
||||
// vfmt on
|
||||
]
|
||||
)
|
||||
|
||||
// free passes the filter's bit table to `free` inside an `unsafe` block.
// Presumably the filter must not be used after this call - TODO confirm.
// NOTE(review): `b.table` is a `[]u8` value, not a raw pointer; verify that the
// unqualified `free` call resolves as intended (vs. e.g. `b.table.free()`).
fn (b &BloomFilter[T]) free() {
|
||||
unsafe {
|
||||
free(b.table)
|
||||
}
|
||||
}
|
||||
|
||||
// new_bloom_filter_fast creates a new bloom filter with preset parameters:
// a table of 16384 one-bit entries and 4 hash functions per element.
pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
	bits := 16384
	// one bit per entry, rounded up to a whole number of bytes
	bytes := (bits + 7) / 8
	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: bits
		num_functions: 4
		table: []u8{len: bytes}
	}
}
|
||||
|
||||
// new_bloom_filter creates a new bloom filter with `table_size` one-bit entries,
// which sets/tests `num_functions` salted variants of `hash_func` per element.
// It returns an error if `table_size` is not greater than 0, or if `num_functions`
// is not within 1~16 (the number of available salts).
pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
	if table_size <= 0 {
		return error('table_size should be greater than 0')
	}
	if num_functions < 1 || num_functions > datatypes.salts.len {
		return error('num_functions should be between 1~${datatypes.salts.len}')
	}

	// one bit per entry, rounded up to a whole number of bytes
	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: table_size
		num_functions: num_functions
		table: []u8{len: (table_size + 7) / 8}
	}
}
|
||||
|
||||
// add inserts `element` into the bloom filter, by setting one table bit
// for each of the filter's `num_functions` salted hashes.
pub fn (mut b BloomFilter[T]) add(element T) {
	base := b.hash_func(element)

	for k in 0 .. b.num_functions {
		// derive the k-th hash from the base hash, then map it to a bit position
		pos := int((base ^ datatypes.salts[k]) % u32(b.table_size))
		mask := u8(1 << (pos % 8))
		b.table[pos / 8] |= mask
	}
}
|
||||
|
||||
// exists reports whether `element` may have been added to the bloom filter.
// It returns false as soon as one of the element's salted hash bits is not set,
// and true only when all `num_functions` bits are set.
pub fn (b &BloomFilter[T]) exists(element T) bool {
	base := b.hash_func(element)
	for k in 0 .. b.num_functions {
		pos := int((base ^ datatypes.salts[k]) % u32(b.table_size))
		cell := b.table[pos / 8]
		mask := 1 << (pos % 8)
		if cell & mask == 0 {
			return false
		}
	}

	return true
}
|
||||
|
||||
// @union returns a new bloom filter, whose table is the bitwise OR of the tables of `l` and `r`.
// It returns an error, unless both filters have the same table size,
// the same number of hash functions and the same hash function.
pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
	same_shape := l.table_size == r.table_size && l.num_functions == r.num_functions
	if !same_shape || l.hash_func != r.hash_func {
		return error('Both filters must be created with the same values.')
	}

	mut merged := &BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for pos, left_byte in l.table {
		merged.table[pos] = left_byte | r.table[pos]
	}

	return merged
}
|
||||
|
||||
// intersection returns a new bloom filter, whose table is the bitwise AND of the tables of `l` and `r`.
// It returns an error, unless both filters have the same table size,
// the same number of hash functions and the same hash function.
pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
	same_shape := l.table_size == r.table_size && l.num_functions == r.num_functions
	if !same_shape || l.hash_func != r.hash_func {
		return error('Both filters must be created with the same values.')
	}

	mut common := &BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for pos, left_byte in l.table {
		common.table[pos] = left_byte & r.table[pos]
	}

	return common
}
|
85
vlib/datatypes/bloom_filter_test.v
Normal file
85
vlib/datatypes/bloom_filter_test.v
Normal file
@ -0,0 +1,85 @@
|
||||
module datatypes
|
||||
|
||||
import hash
|
||||
|
||||
// hash_func hashes `s` with `hash.sum64_string` (seed 0x12345678),
// and keeps only the low 32 bits of the 64-bit result.
fn hash_func(s string) u32 {
	return u32(hash.sum64_string(s, 0x12345678))
}
|
||||
|
||||
// test_bloom_filter_fast checks that a filter with the preset parameters
// finds every added string, and does not report a string that was never added.
fn test_bloom_filter_fast() {
	mut filter := new_bloom_filter_fast[string](hash_func)
	words := ['hello world', 'v is awsome', 'power by v']
	for word in words {
		filter.add(word)
	}
	for word in words {
		assert filter.exists(word)
	}
	assert !filter.exists('my world')
}
|
||||
|
||||
// test_bloom_filter_fast_normal checks a filter created with explicit parameters
// (65536 bits, 16 hash functions): added strings are found, an unknown one is not.
fn test_bloom_filter_fast_normal() {
	mut filter := new_bloom_filter[string](hash_func, 65536, 16) or { panic(err) }
	words := ['hello world', 'v is awsome', 'power by v']
	for word in words {
		filter.add(word)
	}
	for word in words {
		assert filter.exists(word)
	}
	assert !filter.exists('my world')
}
|
||||
|
||||
// test_bloom_filter_false_positive checks that a saturated filter reports
// a string that was never added.
fn test_bloom_filter_false_positive() {
	// each `add` sets up to 8 bits of a table that has only 16 bits in total,
	// so the table fills up very quickly
	mut filter := new_bloom_filter[string](hash_func, 16, 8) or { panic(err) }
	words := ['hello world', 'v is awsome', 'power by v']
	for word in words {
		filter.add(word)
	}
	for word in words {
		assert filter.exists(word)
	}
	assert filter.exists('my world') // false positive
}
|
||||
|
||||
// test_bloom_filter_fast_union_intersection fills two filters that share exactly
// one string ('power by v'), then checks the membership results of the filters
// themselves, of their union, and of their intersection.
fn test_bloom_filter_fast_union_intersection() {
	mut left := new_bloom_filter_fast[string](hash_func)
	mut right := new_bloom_filter_fast[string](hash_func)

	left_words := ['power by v', 'silly c', 'super rust']
	right_words := ['hello world', 'v is awsome', 'power by v']
	for word in left_words {
		left.add(word)
	}
	for word in right_words {
		right.add(word)
	}

	for word in left_words {
		assert left.exists(word)
	}
	assert !left.exists('power c++')

	for word in right_words {
		assert right.exists(word)
	}
	assert !right.exists('my world')

	// left || right: every string added to either filter is found
	merged := left.@union(right) or { panic(err) }
	for word in ['silly c', 'super rust'] {
		assert merged.exists(word)
	}
	assert !merged.exists('power c++')
	for word in right_words {
		assert merged.exists(word)
	}
	assert !merged.exists('my world')

	// left && right: only the string added to both filters is found
	common := left.intersection(right) or { panic(err) }
	for word in ['silly c', 'super rust', 'power c++', 'hello world', 'v is awsome'] {
		assert !common.exists(word)
	}
	assert common.exists('power by v')
	assert !common.exists('my world')
}
|
Loading…
Reference in New Issue
Block a user