// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use anyhow::{bail, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; /// Implements a bloom filter pub struct Bloom { /// data of filter in bits pub(crate) filter: Bytes, /// number of hash functions pub(crate) k: u8, } pub trait BitSlice { fn get_bit(&self, idx: usize) -> bool; fn bit_len(&self) -> usize; } pub trait BitSliceMut { fn set_bit(&mut self, idx: usize, val: bool); } impl> BitSlice for T { fn get_bit(&self, idx: usize) -> bool { let pos = idx / 8; let offset = idx % 8; (self.as_ref()[pos] & (1 << offset)) != 0 } fn bit_len(&self) -> usize { self.as_ref().len() * 8 } } impl> BitSliceMut for T { fn set_bit(&mut self, idx: usize, val: bool) { let pos = idx / 8; let offset = idx % 8; if val { self.as_mut()[pos] |= 1 << offset; } else { self.as_mut()[pos] &= !(1 << offset); } } } impl Bloom { /// Decode a bloom filter pub fn decode(buf: &[u8]) -> Result { let checksum = (&buf[buf.len() - 4..buf.len()]).get_u32(); if checksum != crc32fast::hash(&buf[..buf.len() - 4]) { bail!("checksum mismatched for bloom filters"); } let filter = &buf[..buf.len() - 5]; let k = buf[buf.len() - 5]; Ok(Self { filter: filter.to_vec().into(), k, }) } /// Encode a bloom filter pub fn encode(&self, buf: &mut Vec) { let offset = buf.len(); buf.extend(&self.filter); buf.put_u8(self.k); let checksum = crc32fast::hash(&buf[offset..]); buf.put_u32(checksum); } /// Get bloom filter bits per key from entries count and FPR pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { let size = -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); let locs = (size / (entries as f64)).ceil(); locs as usize } /// Build bloom filter from key hashes pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self { let k = (bits_per_key as f64 * 0.69) as u32; let k = k.min(30).max(1); let nbits = (keys.len() * bits_per_key).max(64); let nbytes = (nbits + 7) / 8; let nbits = nbytes * 8; let mut filter = BytesMut::with_capacity(nbytes); filter.resize(nbytes, 0); for h in keys { let mut h = *h; let delta = (h >> 17) | (h << 15); for _ in 0..k { let bit_pos = (h as usize) % nbits; filter.set_bit(bit_pos, true); h = h.wrapping_add(delta); } } Self { filter: filter.freeze(), k: k as u8, } } /// Check if a bloom filter may contain some data pub fn may_contain(&self, mut h: u32) -> bool { if self.k > 30 { // potential new encoding for short bloom filters true } else { let nbits = self.filter.bit_len(); let delta = (h >> 17) | (h << 15); for _ in 0..self.k { let bit_pos = h % (nbits as u32); if !self.filter.get_bit(bit_pos as usize) { return false; } h = h.wrapping_add(delta); } true } } }