finish week 1 day 7

Signed-off-by: Alex Chi <iskyzh@gmail.com>
This commit is contained in:
Alex Chi
2024-01-21 19:33:05 +08:00
parent 4e2f516b15
commit e8601433d6
8 changed files with 299 additions and 2 deletions

View File

@@ -18,6 +18,7 @@ rand = "0.8.5"
crossbeam-channel = "0.5.11"
serde_json = { version = "1.0" }
serde = { version = "1.0", features = ["derive"] }
farmhash = "1"
[dev-dependencies]
tempfile = "3"

View File

@@ -1,6 +1,7 @@
#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod
#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod
pub(crate) mod bloom;
mod builder;
mod iterator;
@@ -16,6 +17,8 @@ pub use iterator::SsTableIterator;
use crate::block::Block;
use crate::lsm_storage::BlockCache;
use self::bloom::Bloom;
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BlockMeta {
/// Offset of this data block.
@@ -91,6 +94,7 @@ pub struct SsTable {
block_cache: Option<Arc<BlockCache>>,
first_key: Bytes,
last_key: Bytes,
pub(crate) bloom: Option<Bloom>,
}
impl SsTable {
@@ -114,6 +118,7 @@ impl SsTable {
block_cache: None,
first_key,
last_key,
bloom: None,
}
}

View File

@@ -0,0 +1,103 @@
// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.
use bytes::{BufMut, Bytes, BytesMut};
/// A bloom filter: a compact, probabilistic set of key hashes.
///
/// The filter is a bit array stored in `filter` plus the number of probes `k`
/// performed per key. Its serialized form (see `encode` / `decode`) is the raw
/// filter bytes followed by a single trailing byte holding `k`.
pub struct Bloom {
/// Bit array backing the filter. Bit `i` lives in byte `i / 8` at bit
/// position `i % 8` (least-significant bit first), as defined by `BitSlice`.
pub(crate) filter: Bytes,
/// Number of hash functions (probes per key). `may_contain` treats any
/// value above 30 as an unknown encoding and answers `true`.
pub(crate) k: u8,
}
/// Read-only, bit-granular view over a byte container.
pub trait BitSlice {
/// Returns `true` when the bit at position `idx` is set.
fn get_bit(&self, idx: usize) -> bool;
/// Returns the total number of addressable bits.
fn bit_len(&self) -> usize;
}
/// Mutable, bit-granular view over a byte container.
pub trait BitSliceMut {
/// Sets the bit at position `idx` when `val` is `true`, clears it otherwise.
fn set_bit(&mut self, idx: usize, val: bool);
}
/// Bit-level reads for anything that can be viewed as a byte slice.
///
/// Bits are numbered least-significant first within each byte: bit `idx`
/// lives in byte `idx / 8` at bit position `idx % 8`.
impl<T: AsRef<[u8]>> BitSlice for T {
    /// Returns whether bit `idx` is set.
    ///
    /// # Panics
    ///
    /// Panics if `idx / 8` is past the end of the underlying bytes.
    fn get_bit(&self, idx: usize) -> bool {
        let bytes: &[u8] = self.as_ref();
        let (byte_idx, bit_idx) = (idx / 8, idx % 8);
        (bytes[byte_idx] >> bit_idx) & 1 == 1
    }

    /// Returns the number of bits available, i.e. eight per underlying byte.
    fn bit_len(&self) -> usize {
        let bytes: &[u8] = self.as_ref();
        bytes.len() * 8
    }
}
/// Bit-level writes for anything that can be viewed as a mutable byte slice.
///
/// Bits are numbered least-significant first within each byte: bit `idx`
/// lives in byte `idx / 8` at bit position `idx % 8`.
impl<T: AsMut<[u8]>> BitSliceMut for T {
    /// Sets bit `idx` when `val` is `true` and clears it otherwise.
    ///
    /// # Panics
    ///
    /// Panics if `idx / 8` is past the end of the underlying bytes.
    fn set_bit(&mut self, idx: usize, val: bool) {
        let mask = 1u8 << (idx % 8);
        let byte = &mut self.as_mut()[idx / 8];
        if val {
            *byte |= mask;
        } else {
            *byte &= !mask;
        }
    }
}
impl Bloom {
/// Decode a bloom filter
pub fn decode(buf: &[u8]) -> Self {
let filter = &buf[..buf.len() - 1];
let k = buf[buf.len() - 1];
Self {
filter: filter.to_vec().into(),
k,
}
}
/// Encode a bloom filter
pub fn encode(&self, buf: &mut Vec<u8>) {
buf.extend(&self.filter);
buf.put_u8(self.k);
}
/// Get bloom filter bits per key from entries count and FPR
pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize {
let size =
-1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2);
let locs = (size / (entries as f64)).ceil();
locs as usize
}
/// Build bloom filter from key hashes
pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self {
let k = (bits_per_key as f64 * 0.69) as u32;
let k = k.min(30).max(1);
let nbits = (keys.len() * bits_per_key).max(64);
let nbytes = (nbits + 7) / 8;
let nbits = nbytes * 8;
let mut filter = BytesMut::with_capacity(nbytes);
filter.resize(nbytes, 0);
// TODO: build the bloom filter
Self {
filter: filter.freeze(),
k: k as u8,
}
}
/// Check if a bloom filter may contain some data
pub fn may_contain(&self, h: u32) -> bool {
if self.k > 30 {
// potential new encoding for short bloom filters
true
} else {
let nbits = self.filter.bit_len();
let delta = (h >> 17) | (h << 15);
// TODO: probe the bloom filter
true
}
}
}