2022-12-24 10:11:06 -05:00
|
|
|
use std::path::Path;
|
2022-12-24 18:07:18 -05:00
|
|
|
use std::sync::Arc;
|
2022-12-24 10:11:06 -05:00
|
|
|
|
2022-12-23 21:14:11 -05:00
|
|
|
use anyhow::Result;
|
|
|
|
|
use bytes::BufMut;
|
|
|
|
|
|
2024-01-24 14:32:13 +08:00
|
|
|
use super::bloom::Bloom;
|
2022-12-23 21:14:11 -05:00
|
|
|
use super::{BlockMeta, FileObject, SsTable};
|
|
|
|
|
use crate::block::BlockBuilder;
|
2024-01-25 10:59:08 +08:00
|
|
|
use crate::key::{KeySlice, KeyVec};
|
2022-12-24 18:07:18 -05:00
|
|
|
use crate::lsm_storage::BlockCache;
|
2022-12-23 21:14:11 -05:00
|
|
|
|
2022-12-23 22:32:30 -05:00
|
|
|
/// Builds an SSTable from key-value pairs.
|
2022-12-23 21:14:11 -05:00
|
|
|
pub struct SsTableBuilder {
|
|
|
|
|
builder: BlockBuilder,
|
2024-01-25 10:59:08 +08:00
|
|
|
first_key: KeyVec,
|
|
|
|
|
last_key: KeyVec,
|
2022-12-23 21:14:11 -05:00
|
|
|
data: Vec<u8>,
|
2024-01-24 14:32:13 +08:00
|
|
|
pub(crate) meta: Vec<BlockMeta>,
|
2022-12-23 21:14:11 -05:00
|
|
|
block_size: usize,
|
2024-01-24 14:32:13 +08:00
|
|
|
key_hashes: Vec<u32>,
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl SsTableBuilder {
|
2022-12-24 15:34:34 -05:00
|
|
|
/// Create a builder based on target block size.
|
2022-12-24 10:11:06 -05:00
|
|
|
pub fn new(block_size: usize) -> Self {
|
2022-12-23 21:14:11 -05:00
|
|
|
Self {
|
|
|
|
|
data: Vec::new(),
|
|
|
|
|
meta: Vec::new(),
|
2024-01-25 10:59:08 +08:00
|
|
|
first_key: KeyVec::new(),
|
|
|
|
|
last_key: KeyVec::new(),
|
2022-12-23 21:14:11 -05:00
|
|
|
block_size,
|
|
|
|
|
builder: BlockBuilder::new(block_size),
|
2024-01-24 14:32:13 +08:00
|
|
|
key_hashes: Vec::new(),
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-24 10:11:06 -05:00
|
|
|
/// Adds a key-value pair to SSTable
|
2024-01-25 10:59:08 +08:00
|
|
|
pub fn add(&mut self, key: KeySlice, value: &[u8]) {
|
2022-12-23 21:14:11 -05:00
|
|
|
if self.first_key.is_empty() {
|
2024-01-25 10:59:08 +08:00
|
|
|
self.first_key.set_from_slice(key);
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|
|
|
|
|
|
2024-01-25 10:59:08 +08:00
|
|
|
self.key_hashes.push(farmhash::fingerprint32(key.raw_ref()));
|
2024-01-24 14:32:13 +08:00
|
|
|
|
2022-12-23 21:14:11 -05:00
|
|
|
if self.builder.add(key, value) {
|
2024-01-25 10:59:08 +08:00
|
|
|
self.last_key.set_from_slice(key);
|
2022-12-24 10:11:06 -05:00
|
|
|
return;
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|
2024-01-17 16:49:12 +08:00
|
|
|
|
2022-12-23 21:14:11 -05:00
|
|
|
// create a new block builder and append block data
|
|
|
|
|
self.finish_block();
|
|
|
|
|
|
|
|
|
|
// add the key-value pair to the next block
|
|
|
|
|
assert!(self.builder.add(key, value));
|
2024-01-25 10:59:08 +08:00
|
|
|
self.first_key.set_from_slice(key);
|
|
|
|
|
self.last_key.set_from_slice(key);
|
2022-12-24 10:11:06 -05:00
|
|
|
}
|
2022-12-23 21:14:11 -05:00
|
|
|
|
2022-12-24 10:11:06 -05:00
|
|
|
/// Get the estimated size of the SSTable.
|
|
|
|
|
pub fn estimated_size(&self) -> usize {
|
|
|
|
|
self.data.len()
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn finish_block(&mut self) {
|
|
|
|
|
let builder = std::mem::replace(&mut self.builder, BlockBuilder::new(self.block_size));
|
|
|
|
|
let encoded_block = builder.build().encode();
|
|
|
|
|
self.meta.push(BlockMeta {
|
|
|
|
|
offset: self.data.len(),
|
2024-01-25 10:59:08 +08:00
|
|
|
first_key: std::mem::take(&mut self.first_key).into_key_bytes(),
|
|
|
|
|
last_key: std::mem::take(&mut self.last_key).into_key_bytes(),
|
2022-12-23 21:14:11 -05:00
|
|
|
});
|
2024-01-25 21:53:47 +08:00
|
|
|
let checksum = crc32fast::hash(&encoded_block);
|
2022-12-23 21:14:11 -05:00
|
|
|
self.data.extend(encoded_block);
|
2024-01-25 21:53:47 +08:00
|
|
|
self.data.put_u32(checksum);
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|
|
|
|
|
|
2024-01-24 14:32:13 +08:00
|
|
|
/// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects.
|
2022-12-24 18:07:18 -05:00
|
|
|
pub fn build(
|
|
|
|
|
mut self,
|
|
|
|
|
id: usize,
|
|
|
|
|
block_cache: Option<Arc<BlockCache>>,
|
|
|
|
|
path: impl AsRef<Path>,
|
|
|
|
|
) -> Result<SsTable> {
|
2022-12-23 21:14:11 -05:00
|
|
|
self.finish_block();
|
|
|
|
|
let mut buf = self.data;
|
|
|
|
|
let meta_offset = buf.len();
|
|
|
|
|
BlockMeta::encode_block_meta(&self.meta, &mut buf);
|
|
|
|
|
buf.put_u32(meta_offset as u32);
|
2024-01-24 14:32:13 +08:00
|
|
|
let bloom = Bloom::build_from_key_hashes(
|
|
|
|
|
&self.key_hashes,
|
|
|
|
|
Bloom::bloom_bits_per_key(self.key_hashes.len(), 0.01),
|
|
|
|
|
);
|
|
|
|
|
let bloom_offset = buf.len();
|
|
|
|
|
bloom.encode(&mut buf);
|
|
|
|
|
buf.put_u32(bloom_offset as u32);
|
2022-12-23 21:14:11 -05:00
|
|
|
let file = FileObject::create(path.as_ref(), buf)?;
|
|
|
|
|
Ok(SsTable {
|
2022-12-24 18:07:18 -05:00
|
|
|
id,
|
2022-12-23 21:14:11 -05:00
|
|
|
file,
|
2024-01-17 16:49:12 +08:00
|
|
|
first_key: self.meta.first().unwrap().first_key.clone(),
|
|
|
|
|
last_key: self.meta.last().unwrap().last_key.clone(),
|
2024-01-21 14:21:09 +08:00
|
|
|
block_meta: self.meta,
|
2022-12-23 21:14:11 -05:00
|
|
|
block_meta_offset: meta_offset,
|
2022-12-24 18:07:18 -05:00
|
|
|
block_cache,
|
2024-01-24 14:32:13 +08:00
|
|
|
bloom: Some(bloom),
|
2022-12-23 21:14:11 -05:00
|
|
|
})
|
|
|
|
|
}
|
2022-12-24 18:07:18 -05:00
|
|
|
|
|
|
|
|
/// Test-only shortcut for `build`: uses table id `0` and no block cache.
#[cfg(test)]
pub(crate) fn build_for_test(self, path: impl AsRef<Path>) -> Result<SsTable> {
    Self::build(self, 0, None, path)
}
|
2022-12-23 21:14:11 -05:00
|
|
|
}
|