1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00

datatypes: add Bloom filter (#18327)

This commit is contained in:
kbkpbot 2023-06-02 15:56:22 +08:00 committed by GitHub
parent 9764342dbe
commit 0fc33c6fa3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 212 additions and 1 deletions

View File

@ -6,7 +6,7 @@ data types.
V's `builtin` module is imported implicitly, and has implementations for arrays,
maps and strings. These are good for many applications, but there are a plethora
of other useful data structures/containers, like linked lists, priority queues,
tries, etc, that allow for algorithms with different time complexities, which may
trees, etc, that allow for algorithms with different time complexities, which may
be more suitable for your specific application.
It is implemented using generics, that you have to specialise for the type of
@ -28,4 +28,5 @@ println(stack)
- [x] Min heap (priority queue)
- [x] Set
- [x] Quadtree
- [x] Bloom filter
- [ ] ...

View File

@ -0,0 +1,125 @@
module datatypes
// BloomFilter is a probabilistic membership test for a set of `T` values.
// A lookup can occasionally report a false positive, but never a false negative.
[heap]
struct BloomFilter[T] {
	// hash_func maps an element of type `T` to a `u32` hash value.
	hash_func fn (T) u32
	// table_size is the number of one-bit entries, packed into `table`.
	table_size int
	// num_functions is the number of salted hashes used per element (1~16).
	num_functions int
mut:
	// table stores the bits, eight entries per byte.
	table []u8
}
// salts holds 16 fixed random values. Each salt is XORed with the output of the
// hash function, so that a single hash yields multiple distinct hashes.
const salts = [
	// vfmt off
	u32(0xefd8c55b), 0xa1c57493, 0x174c3763, 0xc26e60d4,
	0x9ec387fe, 0xdcdc9e97, 0xfc495ddc, 0x6a1fa748,
	0x8d82a03b, 0x38dc692a, 0x97d0f42d, 0x048a2be3,
	0x9b5d83aa, 0x2380d32f, 0x2437552f, 0xcc622295,
	// vfmt on
]
// free passes the filter's bit `table` to `free`. The call is wrapped in an
// `unsafe` block.
fn (b &BloomFilter[T]) free() {
	unsafe { free(b.table) }
}
// new_bloom_filter_fast creates a bloom filter with preset parameters:
// `table_size` is 16384 bits and `num_functions` is 4.
// `hash_func` is the function used to hash every element.
pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
	bits := 16384
	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: bits
		num_functions: 4
		table: []u8{len: (bits + 7) / 8}
	}
}
// new_bloom_filter creates a new bloom filter that hashes elements with `hash_func`.
// `table_size` is the number of bits in the filter and must be greater than 0.
// `num_functions` is the number of salted hashes per element and must be between
// 1 and the number of entries in `salts` (16).
// An error is returned when either value is out of range.
pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
	if table_size <= 0 {
		return error('table_size should great that 0')
	}
	if num_functions < 1 || num_functions > datatypes.salts.len {
		return error('num_functions should between 1~${datatypes.salts.len}')
	}
	// Number of bytes needed to hold `table_size` bits, rounded up.
	// Written as `(n - 1) / 8 + 1` instead of `(n + 7) / 8`, because `n + 7`
	// overflows `int` when `table_size` is within 7 of the maximum value.
	// `table_size >= 1` is guaranteed by the check above.
	table_bytes := (table_size - 1) / 8 + 1
	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: table_size
		num_functions: num_functions
		table: []u8{len: table_bytes}
	}
}
// add inserts `element` into the filter by setting one bit per hash function.
pub fn (mut b BloomFilter[T]) add(element T) {
	base := b.hash_func(element)
	size := u32(b.table_size)
	for k in 0 .. b.num_functions {
		// salt the base hash, then map it onto a bit position in the table
		pos := int((base ^ datatypes.salts[k]) % size)
		b.table[pos / 8] |= u8(1 << (pos % 8))
	}
}
// exists reports whether `element` may have been added to the filter.
// It returns false as soon as one of the element's bits is not set,
// and true when all of them are set.
pub fn (b &BloomFilter[T]) exists(element T) bool {
	base := b.hash_func(element)
	size := u32(b.table_size)
	for k in 0 .. b.num_functions {
		pos := int((base ^ datatypes.salts[k]) % size)
		mask := 1 << (pos % 8)
		if b.table[pos / 8] & mask == 0 {
			return false
		}
	}
	return true
}
// @union returns a new filter whose table is the bitwise OR of the tables of `l` and `r`.
// It returns an error unless both filters have the same `table_size`,
// `num_functions` and `hash_func`.
pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
	same_shape := l.table_size == r.table_size && l.num_functions == r.num_functions
	if !same_shape || l.hash_func != r.hash_func {
		return error('Both filters must be created with the same values.')
	}
	mut merged := BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for i, left_byte in l.table {
		merged.table[i] = left_byte | r.table[i]
	}
	return &merged
}
// intersection returns a new filter whose table is the bitwise AND of the tables of `l` and `r`.
// It returns an error unless both filters have the same `table_size`,
// `num_functions` and `hash_func`.
pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
	same_shape := l.table_size == r.table_size && l.num_functions == r.num_functions
	if !same_shape || l.hash_func != r.hash_func {
		return error('Both filters must be created with the same values.')
	}
	mut common := BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for i, left_byte in l.table {
		common.table[i] = left_byte & r.table[i]
	}
	return &common
}

View File

@ -0,0 +1,85 @@
module datatypes
import hash
// hash_func hashes `s` with `hash.sum64_string` (seed 0x12345678) and
// converts the 64-bit result to `u32`.
fn hash_func(s string) u32 {
	return u32(hash.sum64_string(s, 0x12345678))
}
// Added strings must be reported as present by the preset ("fast") filter,
// and a string that was never added must be reported as absent.
fn test_bloom_filter_fast() {
	mut b := new_bloom_filter_fast[string](hash_func)
	added := ['hello world', 'v is awsome', 'power by v']
	for s in added {
		b.add(s)
	}
	for s in added {
		assert b.exists(s)
	}
	assert !b.exists('my world')
}
// Same check as the fast variant, on a filter built with explicit parameters:
// 65536 bits and 16 hash functions.
fn test_bloom_filter_fast_normal() {
	mut b := new_bloom_filter[string](hash_func, 65536, 16) or { panic(err) }
	added := ['hello world', 'v is awsome', 'power by v']
	for s in added {
		b.add(s)
	}
	for s in added {
		assert b.exists(s)
	}
	assert !b.exists('my world')
}
// A tiny table produces a false positive for a string that was never added.
fn test_bloom_filter_false_positive() {
	// with 8 hash functions and a table of only 16 bits, each `add` can set up
	// to 8 bits, so the table fills up very quickly
	mut b := new_bloom_filter[string](hash_func, 16, 8) or { panic(err) }
	added := ['hello world', 'v is awsome', 'power by v']
	for s in added {
		b.add(s)
	}
	for s in added {
		assert b.exists(s)
	}
	assert b.exists('my world') // false positive
}
// Builds two filters that share exactly one string ('power by v') and checks
// membership in each of them, in their union, and in their intersection.
fn test_bloom_filter_fast_union_intersection() {
	mut a := new_bloom_filter_fast[string](hash_func)
	mut b := new_bloom_filter_fast[string](hash_func)
	in_a := ['power by v', 'silly c', 'super rust']
	in_b := ['hello world', 'v is awsome', 'power by v']
	for s in in_a {
		a.add(s)
	}
	for s in in_b {
		b.add(s)
	}
	for s in in_a {
		assert a.exists(s)
	}
	assert !a.exists('power c++')
	for s in in_b {
		assert b.exists(s)
	}
	assert !b.exists('my world')

	// a || b test
	mut c := a.@union(b) or { panic(err) }
	for s in ['silly c', 'super rust', 'hello world', 'v is awsome', 'power by v'] {
		assert c.exists(s)
	}
	for s in ['power c++', 'my world'] {
		assert !c.exists(s)
	}

	// a && b test
	mut d := a.intersection(b) or { panic(err) }
	for s in ['silly c', 'super rust', 'power c++', 'hello world', 'v is awsome', 'my world'] {
		assert !d.exists(s)
	}
	assert d.exists('power by v')
}