v/vlib/datatypes/bloom_filter.v

module datatypes

// A Bloom filter is used to test whether a given element is a member of a set. Lookups will occasionally generate false positives, but never false negatives.

// BloomFilter holds the configuration and the packed bit table of a Bloom filter
// over elements of type `T`.
[heap]
struct BloomFilter[T] {
	hash_func     fn (T) u32 // user-supplied hash function: takes a `T`, returns a u32
	table_size    int        // number of one-bit entries; the bits are packed into `table`
	num_functions int        // how many salted hashes are used per element (1~16)
mut:
	// bit storage, 8 entries per byte; entry `i` is bit `i % 8` of `table[i / 8]`
	table []u8
}

const (
	// Salt values (fixed, random-looking constants). Each salt is XORed with the
	// output of the hash function to derive several distinct hashes from one call.
	// There are 16 entries, which is the upper bound on `num_functions`.
	salts = [
		// vfmt off
		u32(0xefd8c55b),0xa1c57493,0x174c3763,0xc26e60d4,
		0x9ec387fe,0xdcdc9e97,0xfc495ddc,0x6a1fa748,
		0x8d82a03b,0x38dc692a,0x97d0f42d,0x048a2be3,
		0x9b5d83aa,0x2380d32f,0x2437552f,0xcc622295,
		// vfmt on
	]
)

// free releases the buffer that backs the filter's bit table.
fn (b &BloomFilter[T]) free() {
	unsafe {
		// Use the array's own `free()` method: the builtin `free()` expects a raw
		// pointer, whereas `b.table` is an array value.
		b.table.free()
	}
}

// new_bloom_filter_fast creates a new bloom filter with preset parameters:
// `table_size` is 16384 and `num_functions` is 4.
pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
	default_bits := 16384
	default_hashes := 4
	// one byte stores 8 entries, so round the byte count up
	table_bytes := (default_bits + 7) / 8

	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: default_bits
		num_functions: default_hashes
		table: []u8{len: table_bytes}
	}
}

// new_bloom_filter creates a new bloom filter.
// `hash_func` maps an element to a u32 hash.
// `table_size` is the number of one-bit entries and must be greater than 0.
// `num_functions` is the number of salted hashes per element and must be
// between 1 and the number of available salts (16).
// An error is returned when `table_size` or `num_functions` is out of range.
pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
	if table_size <= 0 {
		return error('table_size should great that 0')
	}
	if num_functions < 1 || num_functions > datatypes.salts.len {
		return error('num_functions should between 1~${datatypes.salts.len}')
	}

	// Round up to whole bytes. `table_size >= 1` is guaranteed above, so this
	// equals `(table_size + 7) / 8` but cannot overflow `int` when `table_size`
	// is close to the maximum int value.
	table_bytes := (table_size - 1) / 8 + 1

	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: table_size
		num_functions: num_functions
		table: []u8{len: table_bytes}
	}
}

// add inserts `element` into the bloom filter by setting one bit in the
// table for each of the `num_functions` salted hashes.
pub fn (mut b BloomFilter[T]) add(element T) {
	base_hash := b.hash_func(element)
	slots := u32(b.table_size)

	for salt_idx := 0; salt_idx < b.num_functions; salt_idx++ {
		// XOR with a salt to derive another hash, then map it to a bit position
		bit_pos := int((base_hash ^ datatypes.salts[salt_idx]) % slots)
		b.table[bit_pos / 8] |= u8(1 << (bit_pos % 8))
	}
}

// exists reports whether `element` may be in the filter. It returns false as
// soon as one of the bits selected by the salted hashes is not set.
pub fn (b &BloomFilter[T]) exists(element T) bool {
	base_hash := b.hash_func(element)
	slots := u32(b.table_size)

	for salt_idx := 0; salt_idx < b.num_functions; salt_idx++ {
		bit_pos := int((base_hash ^ datatypes.salts[salt_idx]) % slots)
		mask := 1 << (bit_pos % 8)
		if b.table[bit_pos / 8] & mask != 0 {
			continue
		}
		// a cleared bit means the element was never added
		return false
	}

	return true
}

// @union returns a new bloom filter whose table is the bitwise OR of the
// tables of `l` and `r`. Both filters must share the same `table_size`,
// `num_functions` and `hash_func`, otherwise an error is returned.
pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
	same_config := l.table_size == r.table_size && l.num_functions == r.num_functions
		&& l.hash_func == r.hash_func
	if !same_config {
		return error('Both filters must be created with the same values.')
	}

	table_bytes := (l.table_size + 7) / 8
	mut merged := &BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: table_bytes}
	}
	// a bit is set in the result when it is set in either input
	for pos, left_byte in l.table {
		merged.table[pos] = left_byte | r.table[pos]
	}

	return merged
}

// intersection returns a new bloom filter whose table is the bitwise AND of
// the tables of `l` and `r`. Both filters must share the same `table_size`,
// `num_functions` and `hash_func`, otherwise an error is returned.
pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
	same_config := l.table_size == r.table_size && l.num_functions == r.num_functions
		&& l.hash_func == r.hash_func
	if !same_config {
		return error('Both filters must be created with the same values.')
	}

	table_bytes := (l.table_size + 7) / 8
	mut common := &BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: table_bytes}
	}
	// a bit is set in the result only when it is set in both inputs
	for pos, left_byte in l.table {
		common.table[pos] = left_byte & r.table[pos]
	}

	return common
}