checkin initial MVCC codebase
Signed-off-by: Alex Chi <iskyzh@gmail.com>
This commit is contained in:
113
mini-lsm-mvcc/src/table/bloom.rs
Normal file
113
mini-lsm-mvcc/src/table/bloom.rs
Normal file
@@ -0,0 +1,113 @@
|
||||
// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.
|
||||
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// Implements a bloom filter
|
||||
pub struct Bloom {
|
||||
/// data of filter in bits
|
||||
pub(crate) filter: Bytes,
|
||||
/// number of hash functions
|
||||
pub(crate) k: u8,
|
||||
}
|
||||
|
||||
pub trait BitSlice {
|
||||
fn get_bit(&self, idx: usize) -> bool;
|
||||
fn bit_len(&self) -> usize;
|
||||
}
|
||||
|
||||
pub trait BitSliceMut {
|
||||
fn set_bit(&mut self, idx: usize, val: bool);
|
||||
}
|
||||
|
||||
impl<T: AsRef<[u8]>> BitSlice for T {
|
||||
fn get_bit(&self, idx: usize) -> bool {
|
||||
let pos = idx / 8;
|
||||
let offset = idx % 8;
|
||||
(self.as_ref()[pos] & (1 << offset)) != 0
|
||||
}
|
||||
|
||||
fn bit_len(&self) -> usize {
|
||||
self.as_ref().len() * 8
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: AsMut<[u8]>> BitSliceMut for T {
|
||||
fn set_bit(&mut self, idx: usize, val: bool) {
|
||||
let pos = idx / 8;
|
||||
let offset = idx % 8;
|
||||
if val {
|
||||
self.as_mut()[pos] |= 1 << offset;
|
||||
} else {
|
||||
self.as_mut()[pos] &= !(1 << offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Bloom {
|
||||
/// Decode a bloom filter
|
||||
pub fn decode(buf: &[u8]) -> Self {
|
||||
let filter = &buf[..buf.len() - 1];
|
||||
let k = buf[buf.len() - 1];
|
||||
Self {
|
||||
filter: filter.to_vec().into(),
|
||||
k,
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode a bloom filter
|
||||
pub fn encode(&self, buf: &mut Vec<u8>) {
|
||||
buf.extend(&self.filter);
|
||||
buf.put_u8(self.k);
|
||||
}
|
||||
|
||||
/// Get bloom filter bits per key from entries count and FPR
|
||||
pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize {
|
||||
let size =
|
||||
-1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2);
|
||||
let locs = (size / (entries as f64)).ceil();
|
||||
locs as usize
|
||||
}
|
||||
|
||||
/// Build bloom filter from key hashes
|
||||
pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self {
|
||||
let k = (bits_per_key as f64 * 0.69) as u32;
|
||||
let k = k.min(30).max(1);
|
||||
let nbits = (keys.len() * bits_per_key).max(64);
|
||||
let nbytes = (nbits + 7) / 8;
|
||||
let nbits = nbytes * 8;
|
||||
let mut filter = BytesMut::with_capacity(nbytes);
|
||||
filter.resize(nbytes, 0);
|
||||
for h in keys {
|
||||
let mut h = *h;
|
||||
let delta = (h >> 17) | (h << 15);
|
||||
for _ in 0..k {
|
||||
let bit_pos = (h as usize) % nbits;
|
||||
filter.set_bit(bit_pos, true);
|
||||
h = h.wrapping_add(delta);
|
||||
}
|
||||
}
|
||||
Self {
|
||||
filter: filter.freeze(),
|
||||
k: k as u8,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a bloom filter may contain some data
|
||||
pub fn may_contain(&self, mut h: u32) -> bool {
|
||||
if self.k > 30 {
|
||||
// potential new encoding for short bloom filters
|
||||
true
|
||||
} else {
|
||||
let nbits = self.filter.bit_len();
|
||||
let delta = (h >> 17) | (h << 15);
|
||||
for _ in 0..self.k {
|
||||
let bit_pos = h % (nbits as u32);
|
||||
if !self.filter.get_bit(bit_pos as usize) {
|
||||
return false;
|
||||
}
|
||||
h = h.wrapping_add(delta);
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user