2024-01-25 12:07:53 +08:00
|
|
|
// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.
|
|
|
|
|
|
2024-01-28 14:08:08 +08:00
|
|
|
use anyhow::{bail, Result};
|
|
|
|
|
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
2024-01-25 12:07:53 +08:00
|
|
|
|
|
|
|
|
/// Implements a bloom filter
|
|
|
|
|
pub struct Bloom {
|
|
|
|
|
/// data of filter in bits
|
|
|
|
|
pub(crate) filter: Bytes,
|
|
|
|
|
/// number of hash functions
|
|
|
|
|
pub(crate) k: u8,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Read-only access to a container one bit at a time.
pub trait BitSlice {
    /// Returns `true` when the bit at position `idx` is set.
    fn get_bit(&self, idx: usize) -> bool;

    /// Returns how many bits the container holds.
    fn bit_len(&self) -> usize;
}
|
|
|
|
|
|
|
|
|
|
/// Write access to a container one bit at a time.
pub trait BitSliceMut {
    /// Sets the bit at position `idx` when `val` is `true`, clears it otherwise.
    fn set_bit(&mut self, idx: usize, val: bool);
}
|
|
|
|
|
|
|
|
|
|
impl<T: AsRef<[u8]>> BitSlice for T {
|
|
|
|
|
fn get_bit(&self, idx: usize) -> bool {
|
|
|
|
|
let pos = idx / 8;
|
|
|
|
|
let offset = idx % 8;
|
|
|
|
|
(self.as_ref()[pos] & (1 << offset)) != 0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn bit_len(&self) -> usize {
|
|
|
|
|
self.as_ref().len() * 8
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl<T: AsMut<[u8]>> BitSliceMut for T {
|
|
|
|
|
fn set_bit(&mut self, idx: usize, val: bool) {
|
|
|
|
|
let pos = idx / 8;
|
|
|
|
|
let offset = idx % 8;
|
|
|
|
|
if val {
|
|
|
|
|
self.as_mut()[pos] |= 1 << offset;
|
|
|
|
|
} else {
|
|
|
|
|
self.as_mut()[pos] &= !(1 << offset);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Bloom {
|
|
|
|
|
/// Decode a bloom filter
|
2024-01-28 14:08:08 +08:00
|
|
|
pub fn decode(buf: &[u8]) -> Result<Self> {
|
|
|
|
|
let checksum = (&buf[buf.len() - 4..buf.len()]).get_u32();
|
|
|
|
|
if checksum != crc32fast::hash(&buf[..buf.len() - 4]) {
|
|
|
|
|
bail!("checksum mismatched for bloom filters");
|
|
|
|
|
}
|
|
|
|
|
let filter = &buf[..buf.len() - 5];
|
|
|
|
|
let k = buf[buf.len() - 5];
|
|
|
|
|
Ok(Self {
|
2024-01-25 12:07:53 +08:00
|
|
|
filter: filter.to_vec().into(),
|
|
|
|
|
k,
|
2024-01-28 14:08:08 +08:00
|
|
|
})
|
2024-01-25 12:07:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Encode a bloom filter
|
|
|
|
|
pub fn encode(&self, buf: &mut Vec<u8>) {
|
2024-01-28 14:08:08 +08:00
|
|
|
let offset = buf.len();
|
2024-01-25 12:07:53 +08:00
|
|
|
buf.extend(&self.filter);
|
|
|
|
|
buf.put_u8(self.k);
|
2024-01-28 14:08:08 +08:00
|
|
|
let checksum = crc32fast::hash(&buf[offset..]);
|
|
|
|
|
buf.put_u32(checksum);
|
2024-01-25 12:07:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Get bloom filter bits per key from entries count and FPR
|
|
|
|
|
pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize {
|
|
|
|
|
let size =
|
|
|
|
|
-1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2);
|
|
|
|
|
let locs = (size / (entries as f64)).ceil();
|
|
|
|
|
locs as usize
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Build bloom filter from key hashes
|
|
|
|
|
pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self {
|
|
|
|
|
let k = (bits_per_key as f64 * 0.69) as u32;
|
|
|
|
|
let k = k.min(30).max(1);
|
|
|
|
|
let nbits = (keys.len() * bits_per_key).max(64);
|
|
|
|
|
let nbytes = (nbits + 7) / 8;
|
|
|
|
|
let nbits = nbytes * 8;
|
|
|
|
|
let mut filter = BytesMut::with_capacity(nbytes);
|
|
|
|
|
filter.resize(nbytes, 0);
|
|
|
|
|
for h in keys {
|
|
|
|
|
let mut h = *h;
|
|
|
|
|
let delta = (h >> 17) | (h << 15);
|
|
|
|
|
for _ in 0..k {
|
|
|
|
|
let bit_pos = (h as usize) % nbits;
|
|
|
|
|
filter.set_bit(bit_pos, true);
|
|
|
|
|
h = h.wrapping_add(delta);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Self {
|
|
|
|
|
filter: filter.freeze(),
|
|
|
|
|
k: k as u8,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Check if a bloom filter may contain some data
|
|
|
|
|
pub fn may_contain(&self, mut h: u32) -> bool {
|
|
|
|
|
if self.k > 30 {
|
|
|
|
|
// potential new encoding for short bloom filters
|
|
|
|
|
true
|
|
|
|
|
} else {
|
|
|
|
|
let nbits = self.filter.bit_len();
|
|
|
|
|
let delta = (h >> 17) | (h << 15);
|
|
|
|
|
for _ in 0..self.k {
|
|
|
|
|
let bit_pos = h % (nbits as u32);
|
|
|
|
|
if !self.filter.get_bit(bit_pos as usize) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
h = h.wrapping_add(delta);
|
|
|
|
|
}
|
|
|
|
|
true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|