Cargo.lock (generated):

@@ -2,12 +2,6 @@
 # It is not intended for manual editing.
 version = 3
 
-[[package]]
-name = "Inflector"
-version = "0.11.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3"
-
 [[package]]
 name = "aliasable"
 version = "0.1.3"
@@ -420,8 +414,9 @@ dependencies = [
 "crossbeam-channel",
 "crossbeam-epoch",
 "crossbeam-skiplist",
+"farmhash",
 "moka",
-"ouroboros 0.15.5",
+"ouroboros",
 "parking_lot",
 "rand",
 "serde",
@@ -442,7 +437,7 @@ dependencies = [
 "crossbeam-skiplist",
 "farmhash",
 "moka",
-"ouroboros 0.18.2",
+"ouroboros",
 "parking_lot",
 "rand",
 "serde",
@@ -509,16 +504,6 @@ dependencies = [
 "windows-sys 0.42.0",
 ]
 
-[[package]]
-name = "ouroboros"
-version = "0.15.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfbb50b356159620db6ac971c6d5c9ab788c9cc38a6f49619fca2a27acb062ca"
-dependencies = [
-"aliasable",
-"ouroboros_macro 0.15.5",
-]
-
 [[package]]
 name = "ouroboros"
 version = "0.18.2"
@@ -526,23 +511,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a50b637ffd883b2733a8483599fb6136b9dcedaa1850f7ac08b9b6f9f2061208"
 dependencies = [
 "aliasable",
-"ouroboros_macro 0.18.2",
+"ouroboros_macro",
 "static_assertions",
 ]
 
-[[package]]
-name = "ouroboros_macro"
-version = "0.15.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a0d9d1a6191c4f391f87219d1ea42b23f09ee84d64763cd05ee6ea88d9f384d"
-dependencies = [
-"Inflector",
-"proc-macro-error",
-"proc-macro2",
-"quote",
-"syn 1.0.107",
-]
-
 [[package]]
 name = "ouroboros_macro"
 version = "0.18.2"
@@ -586,30 +558,6 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
-[[package]]
-name = "proc-macro-error"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
-dependencies = [
-"proc-macro-error-attr",
-"proc-macro2",
-"quote",
-"syn 1.0.107",
-"version_check",
-]
-
-[[package]]
-name = "proc-macro-error-attr"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
-dependencies = [
-"proc-macro2",
-"quote",
-"version_check",
-]
-
 [[package]]
 name = "proc-macro2"
 version = "1.0.76"

@@ -59,8 +59,8 @@ We are working on a new version of the mini-lsm tutorial that is split into 3 we
 | 2.2 | Compaction Strategy - Simple | ✅ | ✅ | ✅ |
 | 2.3 | Compaction Strategy - Tiered | ✅ | ✅ | ✅ |
 | 2.4 | Compaction Strategy - Leveled | ✅ | ✅ | ✅ |
-| 2.5 | Manifest | ✅ | 🚧 | 🚧 |
-| 2.6 | Write-Ahead Log | ✅ | 🚧 | 🚧 |
+| 2.5 | Manifest | ✅ | ✅ | 🚧 |
+| 2.6 | Write-Ahead Log | ✅ | ✅ | 🚧 |
 | 2.7 | Batch Write + Checksum | | | |
 | 3.1 | Timestamp Key Encoding + New Block Format | | | |
 | 3.2 | Prefix Bloom Filter | | | |

@@ -343,6 +343,7 @@ fn main() {
 } else {
 storage.dump_original_id(false, false);
 }
+println!("--- Compaction Task ---");
 let mut num_compactions = 0;
 while let Some(task) = {
 println!("--- Compaction Task ---");

@@ -107,6 +107,19 @@ fn main() -> Result<()> {
 } else {
 println!("{} not exist", key);
 }
+} else if line == "scan" {
+let mut iter = lsm.scan(std::ops::Bound::Unbounded, std::ops::Bound::Unbounded)?;
+let mut cnt = 0;
+while iter.is_valid() {
+println!(
+"{:?}={:?}",
+Bytes::copy_from_slice(iter.key()),
+Bytes::copy_from_slice(iter.value()),
+);
+iter.next()?;
+cnt += 1;
+}
+println!("{} keys scanned", cnt);
 } else if line.starts_with("scan ") {
 let Some((_, rest)) = line.split_once(' ') else {
 println!("invalid command");
@@ -137,7 +150,7 @@ fn main() -> Result<()> {
 lsm.force_flush()?;
 } else if line == "full_compaction" {
 lsm.force_full_compaction()?;
-} else if line == "quit" {
+} else if line == "quit" || line == "close" {
 lsm.close()?;
 break;
 } else {

@@ -166,10 +166,16 @@ impl MiniLsm {
 self.inner.scan(lower, upper)
 }
 
+/// Only call this in test cases due to race conditions
 pub fn force_flush(&self) -> Result<()> {
-self.inner
-.force_freeze_memtable(&self.inner.state_lock.lock())?;
-self.inner.force_flush_next_imm_memtable()
+if !self.inner.state.read().memtable.is_empty() {
+self.inner
+.force_freeze_memtable(&self.inner.state_lock.lock())?;
+}
+if !self.inner.state.read().imm_memtables.is_empty() {
+self.inner.force_flush_next_imm_memtable()?;
+}
+Ok(())
 }
 
 pub fn force_full_compaction(&self) -> Result<()> {
@@ -247,7 +253,7 @@ impl LsmStorageInner {
 Self::path_of_wal_static(&self.path, id)
 }
 
-fn sync_dir(&self) -> Result<()> {
+pub(super) fn sync_dir(&self) -> Result<()> {
 unimplemented!()
 }
 

@@ -88,6 +88,11 @@ impl MemTable {
 self.approximate_size
 .load(std::sync::atomic::Ordering::Relaxed)
 }
+
+/// Only use this function when closing the database
+pub fn is_empty(&self) -> bool {
+self.map.is_empty()
+}
 }
 
 type SkipMapRangeIter<'a> =

@@ -16,13 +16,14 @@ bytes = "1"
 crossbeam-epoch = "0.9"
 crossbeam-skiplist = "0.1"
 parking_lot = "0.12"
-ouroboros = "0.15"
+ouroboros = "0.18"
 moka = "0.9"
 clap = { version = "4.4.17", features = ["derive"] }
 rand = "0.8.5"
 crossbeam-channel = "0.5.11"
 serde_json = { version = "1.0" }
 serde = { version = "1.0", features = ["derive"] }
+farmhash = "1"
 
 [dev-dependencies]
 tempfile = "3"

@@ -10,8 +10,8 @@ pub(crate) const SIZEOF_U16: usize = std::mem::size_of::<u16>();
 /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted
 /// key-value pairs.
 pub struct Block {
-data: Vec<u8>,
-offsets: Vec<u16>,
+pub(crate) data: Vec<u8>,
+pub(crate) offsets: Vec<u16>,
 }
 
 impl Block {
@@ -41,6 +41,3 @@ impl Block {
 Self { data, offsets }
 }
 }
-
-#[cfg(test)]
-mod tests;

@@ -10,6 +10,22 @@ pub struct BlockBuilder {
 data: Vec<u8>,
 /// The expected block size.
 block_size: usize,
+/// The first key in the block
+first_key: Vec<u8>,
+}
+
+fn compute_overlap(first_key: &[u8], key: &[u8]) -> usize {
+let mut i = 0;
+loop {
+if i >= first_key.len() || i >= key.len() {
+break;
+}
+if first_key[i] != key[i] {
+break;
+}
+i += 1;
+}
+i
 }
 
 impl BlockBuilder {
@@ -19,6 +35,7 @@ impl BlockBuilder {
 offsets: Vec::new(),
 data: Vec::new(),
 block_size,
+first_key: Vec::new(),
 }
 }
 
@@ -38,14 +55,22 @@ impl BlockBuilder {
 }
 // Add the offset of the data into the offset array.
 self.offsets.push(self.data.len() as u16);
+let overlap = compute_overlap(&self.first_key, key);
+// Encode key overlap.
+self.data.put_u16(overlap as u16);
 // Encode key length.
-self.data.put_u16(key.len() as u16);
+self.data.put_u16((key.len() - overlap) as u16);
 // Encode key content.
-self.data.put(key);
+self.data.put(&key[overlap..]);
 // Encode value length.
 self.data.put_u16(value.len() as u16);
 // Encode value content.
 self.data.put(value);
+
+if self.first_key.is_empty() {
+self.first_key = key.to_vec();
+}
+
 true
 }
 

@@ -2,6 +2,8 @@ use std::sync::Arc;
 
 use bytes::Buf;
 
+use crate::block::SIZEOF_U16;
+
 use super::Block;
 
 /// Iterates on a block.
@@ -10,18 +12,31 @@ pub struct BlockIterator {
 block: Arc<Block>,
 /// the current key at the iterator position
 key: Vec<u8>,
-/// the current value at the iterator position
-value: Vec<u8>,
+/// the value range from the block
+value_range: (usize, usize),
 /// the current index at the iterator position
 idx: usize,
+/// the first key in the block
+first_key: Vec<u8>,
+}
+
+impl Block {
+fn get_first_key(&self) -> Vec<u8> {
+let mut buf = &self.data[..];
+buf.get_u16();
+let key_len = buf.get_u16();
+let key = &buf[..key_len as usize];
+key.to_vec()
+}
 }
 
 impl BlockIterator {
 fn new(block: Arc<Block>) -> Self {
 Self {
+first_key: block.get_first_key(),
 block,
 key: Vec::new(),
-value: Vec::new(),
+value_range: (0, 0),
 idx: 0,
 }
 }
@@ -49,7 +64,7 @@ impl BlockIterator {
 /// Returns the value of the current entry.
 pub fn value(&self) -> &[u8] {
 debug_assert!(!self.key.is_empty(), "invalid iterator");
-&self.value
+&self.block.data[self.value_range.0..self.value_range.1]
 }
 
 /// Returns true if the iterator is valid.
@@ -66,7 +81,7 @@ impl BlockIterator {
 fn seek_to(&mut self, idx: usize) {
 if idx >= self.block.offsets.len() {
 self.key.clear();
-self.value.clear();
+self.value_range = (0, 0);
 return;
 }
 let offset = self.block.offsets[idx] as usize;
@@ -86,16 +101,18 @@ impl BlockIterator {
 let mut entry = &self.block.data[offset..];
 // Since `get_u16()` will automatically move the ptr 2 bytes ahead here,
 // we don't need to manually advance it
+let overlap_len = entry.get_u16() as usize;
 let key_len = entry.get_u16() as usize;
 let key = entry[..key_len].to_vec();
 entry.advance(key_len);
 self.key.clear();
+self.key.extend(&self.first_key[..overlap_len]);
 self.key.extend(key);
 let value_len = entry.get_u16() as usize;
-let value = entry[..value_len].to_vec();
+let value_offset_begin = offset + SIZEOF_U16 + SIZEOF_U16 + key_len + SIZEOF_U16;
+let value_offset_end = value_offset_begin + value_len;
+self.value_range = (value_offset_begin, value_offset_end);
 entry.advance(value_len);
-self.value.clear();
-self.value.extend(value);
 }
 
 /// Seek to the first key that is >= `key`.
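
Taken together, the builder and iterator changes above switch each block entry to the layout key_overlap_len (u16) | rest_key_len (u16) | rest_key | value_len (u16) | value, with the overlap computed against the first key of the block, and the iterator now returns values as a range into the shared block data instead of copying them. A standalone sketch of that encode/decode round trip (not the crate's API; the helper names below are made up, and only the `bytes` crate from Cargo.toml is assumed):

    // Sketch of the prefix-compressed entry encoding introduced above.
    use bytes::{Buf, BufMut};

    // Length of the shared prefix between the block's first key and this key.
    fn overlap(first_key: &[u8], key: &[u8]) -> usize {
        first_key.iter().zip(key).take_while(|(a, b)| a == b).count()
    }

    fn encode_entry(buf: &mut Vec<u8>, first_key: &[u8], key: &[u8], value: &[u8]) {
        let o = overlap(first_key, key);
        buf.put_u16(o as u16);               // key_overlap_len
        buf.put_u16((key.len() - o) as u16); // rest_key_len
        buf.put(&key[o..]);                  // rest of the key
        buf.put_u16(value.len() as u16);     // value_len
        buf.put(value);                      // value
    }

    fn decode_entry(mut buf: &[u8], first_key: &[u8]) -> (Vec<u8>, Vec<u8>) {
        let o = buf.get_u16() as usize;
        let rest_len = buf.get_u16() as usize;
        // Rebuild the full key: shared prefix from the first key + stored suffix.
        let mut key = first_key[..o].to_vec();
        key.extend(&buf[..rest_len]);
        buf.advance(rest_len);
        let value_len = buf.get_u16() as usize;
        (key, buf[..value_len].to_vec())
    }

    fn main() {
        let first_key = b"key_0001";
        let mut buf = Vec::new();
        encode_entry(&mut buf, first_key, b"key_0042", b"v42");
        let (key, value) = decode_entry(&buf, first_key);
        assert_eq!(key, b"key_0042");
        assert_eq!(value, b"v42");
    }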

@@ -1,7 +1,10 @@
+#![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality
+
 mod leveled;
 mod simple_leveled;
 mod tiered;
 
+use std::collections::HashSet;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -13,7 +16,9 @@ pub use simple_leveled::{
 };
 pub use tiered::{TieredCompactionController, TieredCompactionOptions, TieredCompactionTask};
 
+use crate::iterators::concat_iterator::SstConcatIterator;
 use crate::iterators::merge_iterator::MergeIterator;
+use crate::iterators::two_merge_iterator::TwoMergeIterator;
 use crate::iterators::StorageIterator;
 use crate::lsm_storage::{LsmStorageInner, LsmStorageState};
 use crate::manifest::ManifestRecord;
@@ -24,13 +29,16 @@ pub enum CompactionTask {
 Leveled(LeveledCompactionTask),
 Tiered(TieredCompactionTask),
 Simple(SimpleLeveledCompactionTask),
-ForceFullCompaction(Vec<usize>),
+ForceFullCompaction {
+l0_sstables: Vec<usize>,
+l1_sstables: Vec<usize>,
+},
 }
 
 impl CompactionTask {
 fn compact_to_bottom_level(&self) -> bool {
 match self {
-CompactionTask::ForceFullCompaction(_) => true,
+CompactionTask::ForceFullCompaction { .. } => true,
 CompactionTask::Leveled(task) => task.is_lower_level_bottom_level,
 CompactionTask::Simple(task) => task.is_lower_level_bottom_level,
 CompactionTask::Tiered(task) => task.bottom_tier_included,
@@ -105,50 +113,13 @@ pub enum CompactionOptions {
 }
 
 impl LsmStorageInner {
-fn compact(&self, task: &CompactionTask) -> Result<Vec<Arc<SsTable>>> {
-let table_ids = match task {
-CompactionTask::Leveled(task) => task
-.lower_level_sst_ids
-.iter()
-.copied()
-.chain(task.upper_level_sst_ids.iter().copied())
-.collect::<Vec<_>>(),
-CompactionTask::Simple(task) => task
-.lower_level_sst_ids
-.iter()
-.copied()
-.chain(task.upper_level_sst_ids.iter().copied())
-.collect::<Vec<_>>(),
-CompactionTask::Tiered(task) => task
-.tiers
-.iter()
-.map(|(_, files)| files)
-.flatten()
-.copied()
-.collect::<Vec<_>>(),
-CompactionTask::ForceFullCompaction(l0_ssts) => l0_ssts.clone(),
-};
-let tables: Vec<Arc<SsTable>> = {
-let state = self.state.read();
-table_ids
-.iter()
-.map(|id| state.sstables.get(id).unwrap().clone())
-.collect::<Vec<_>>()
-};
-
-let mut iters = Vec::new();
-iters.reserve(tables.len());
-for table in tables.iter() {
-iters.push(Box::new(SsTableIterator::create_and_seek_to_first(
-table.clone(),
-)?));
-}
-let mut iter = MergeIterator::create(iters);
-
+fn compact_generate_sst_from_iter(
+&self,
+mut iter: impl StorageIterator,
+compact_to_bottom_level: bool,
+) -> Result<Vec<Arc<SsTable>>> {
 let mut builder = None;
-let mut new_sst = vec![];
+let mut new_sst = Vec::new();
 
-let compact_to_bottom_level = task.compact_to_bottom_level();
-
 while iter.is_valid() {
 if builder.is_none() {
@@ -165,7 +136,7 @@ impl LsmStorageInner {
 iter.next()?;
 
 if builder_inner.estimated_size() >= self.options.target_sst_size {
-let sst_id = self.next_sst_id(); // lock dropped here
+let sst_id = self.next_sst_id();
 let builder = builder.take().unwrap();
 let sst = Arc::new(builder.build(
 sst_id,
@@ -187,6 +158,98 @@ impl LsmStorageInner {
 Ok(new_sst)
 }
 
+fn compact(&self, task: &CompactionTask) -> Result<Vec<Arc<SsTable>>> {
+let snapshot = {
+let state = self.state.read();
+state.clone()
+};
+match task {
+CompactionTask::ForceFullCompaction {
+l0_sstables,
+l1_sstables,
+} => {
+let mut l0_iters = Vec::with_capacity(l0_sstables.len());
+for id in l0_sstables.iter() {
+l0_iters.push(Box::new(SsTableIterator::create_and_seek_to_first(
+snapshot.sstables.get(id).unwrap().clone(),
+)?));
+}
+let mut l1_iters = Vec::with_capacity(l1_sstables.len());
+for id in l1_sstables.iter() {
+l1_iters.push(snapshot.sstables.get(id).unwrap().clone());
+}
+let iter = TwoMergeIterator::create(
+MergeIterator::create(l0_iters),
+SstConcatIterator::create_and_seek_to_first(l1_iters)?,
+)?;
+self.compact_generate_sst_from_iter(iter, task.compact_to_bottom_level())
+}
+CompactionTask::Simple(SimpleLeveledCompactionTask {
+upper_level,
+upper_level_sst_ids,
+lower_level: _,
+lower_level_sst_ids,
+..
+})
+| CompactionTask::Leveled(LeveledCompactionTask {
+upper_level,
+upper_level_sst_ids,
+lower_level: _,
+lower_level_sst_ids,
+..
+}) => match upper_level {
+Some(_) => {
+let mut upper_ssts = Vec::with_capacity(upper_level_sst_ids.len());
+for id in upper_level_sst_ids.iter() {
+upper_ssts.push(snapshot.sstables.get(id).unwrap().clone());
+}
+let upper_iter = SstConcatIterator::create_and_seek_to_first(upper_ssts)?;
+let mut lower_ssts = Vec::with_capacity(upper_level_sst_ids.len());
+for id in lower_level_sst_ids.iter() {
+lower_ssts.push(snapshot.sstables.get(id).unwrap().clone());
+}
+let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?;
+self.compact_generate_sst_from_iter(
+TwoMergeIterator::create(upper_iter, lower_iter)?,
+task.compact_to_bottom_level(),
+)
+}
+None => {
+let mut upper_iters = Vec::with_capacity(upper_level_sst_ids.len());
+for id in upper_level_sst_ids.iter() {
+upper_iters.push(Box::new(SsTableIterator::create_and_seek_to_first(
+snapshot.sstables.get(id).unwrap().clone(),
+)?));
+}
+let upper_iter = MergeIterator::create(upper_iters);
+let mut lower_ssts = Vec::with_capacity(upper_level_sst_ids.len());
+for id in lower_level_sst_ids.iter() {
+lower_ssts.push(snapshot.sstables.get(id).unwrap().clone());
+}
+let lower_iter = SstConcatIterator::create_and_seek_to_first(lower_ssts)?;
+self.compact_generate_sst_from_iter(
+TwoMergeIterator::create(upper_iter, lower_iter)?,
+task.compact_to_bottom_level(),
+)
+}
+},
+CompactionTask::Tiered(TieredCompactionTask { tiers, .. }) => {
+let mut iters = Vec::with_capacity(tiers.len());
+for (_, tier_sst_ids) in tiers {
+let mut ssts = Vec::with_capacity(tier_sst_ids.len());
+for id in tier_sst_ids.iter() {
+ssts.push(snapshot.sstables.get(id).unwrap().clone());
+}
+iters.push(Box::new(SstConcatIterator::create_and_seek_to_first(ssts)?));
+}
+self.compact_generate_sst_from_iter(
+MergeIterator::create(iters),
+task.compact_to_bottom_level(),
+)
+}
+}
+}
+
 pub fn force_full_compaction(&self) -> Result<()> {
 let CompactionOptions::NoCompaction = self.options.compaction_options else {
 panic!("full compaction can only be called with compaction is not enabled")
@@ -195,15 +258,19 @@ impl LsmStorageInner {
 let state = self.state.read();
 state.clone()
 };
-let mut original_sstables = snapshot.l0_sstables.clone();
-original_sstables.reverse(); // is this correct?
-let sstables = self.compact(&CompactionTask::ForceFullCompaction(
-original_sstables.clone(),
-))?;
+let l0_sstables = snapshot.l0_sstables.clone();
+let l1_sstables = snapshot.levels[0].1.clone();
+let compaction_task = CompactionTask::ForceFullCompaction {
+l0_sstables: l0_sstables.clone(),
+l1_sstables: l1_sstables.clone(),
+};
+let sstables = self.compact(&compaction_task)?;
+
 {
 let _state_lock = self.state_lock.lock();
 let mut state = self.state.read().as_ref().clone();
-for sst in original_sstables.iter() {
+for sst in l0_sstables.iter().chain(l1_sstables.iter()) {
 let result = state.sstables.remove(sst);
 assert!(result.is_some());
 }
@@ -213,11 +280,20 @@ impl LsmStorageInner {
 let result = state.sstables.insert(new_sst.sst_id(), new_sst);
 assert!(result.is_none());
 }
-state.l0_sstables = ids;
+assert_eq!(l1_sstables, state.levels[0].1);
+state.levels[0].1 = ids;
+let mut l0_sstables_map = l0_sstables.iter().copied().collect::<HashSet<_>>();
+state.l0_sstables = state
+.l0_sstables
+.iter()
+.filter(|x| !l0_sstables_map.remove(x))
+.copied()
+.collect::<Vec<_>>();
+assert!(l0_sstables_map.is_empty());
 *self.state.write() = Arc::new(state);
 }
-for sst in original_sstables {
-std::fs::remove_file(self.path_of_sst(sst))?;
+for sst in l0_sstables.iter().chain(l1_sstables.iter()) {
+std::fs::remove_file(self.path_of_sst(*sst))?;
 }
 Ok(())
 }
@@ -235,6 +311,7 @@ impl LsmStorageInner {
 };
 println!("running compaction task: {:?}", task);
 let sstables = self.compact(&task)?;
+let files_added = sstables.len();
 let output = sstables.iter().map(|x| x.sst_id()).collect::<Vec<_>>();
 let ssts_to_remove = {
 let state_lock = self.state_lock.lock();
@@ -244,7 +321,7 @@ impl LsmStorageInner {
 let mut ssts_to_remove = Vec::with_capacity(files_to_remove.len());
 for file_to_remove in &files_to_remove {
 let result = snapshot.sstables.remove(file_to_remove);
-assert!(result.is_some());
+assert!(result.is_some(), "cannot remove {}.sst", file_to_remove);
 ssts_to_remove.push(result.unwrap());
 }
 let mut new_sst_ids = Vec::new();
@@ -255,13 +332,24 @@ impl LsmStorageInner {
 }
 let mut state = self.state.write();
 *state = Arc::new(snapshot);
+drop(state);
+self.sync_dir()?;
 self.manifest
+.as_ref()
+.unwrap()
 .add_record(&state_lock, ManifestRecord::Compaction(task, new_sst_ids))?;
 ssts_to_remove
 };
+println!(
+"compaction finished: {} files removed, {} files added",
+ssts_to_remove.len(),
+files_added
+);
 for sst in ssts_to_remove {
 std::fs::remove_file(self.path_of_sst(sst.sst_id()))?;
 }
+self.sync_dir()?;
+
 Ok(())
 }
 
@@ -289,4 +377,34 @@ impl LsmStorageInner {
 }
 Ok(None)
 }
+
+fn trigger_flush(&self) -> Result<()> {
+if {
+let state = self.state.read();
+state.imm_memtables.len() >= self.options.num_memtable_limit
+} {
+self.force_flush_next_imm_memtable()?;
+}
+
+Ok(())
+}
+
+pub(crate) fn spawn_flush_thread(
+self: &Arc<Self>,
+rx: crossbeam_channel::Receiver<()>,
+) -> Result<Option<std::thread::JoinHandle<()>>> {
+let this = self.clone();
+let handle = std::thread::spawn(move || {
+let ticker = crossbeam_channel::tick(Duration::from_millis(50));
+loop {
+crossbeam_channel::select! {
+recv(ticker) -> _ => if let Err(e) = this.trigger_flush() {
+eprintln!("flush failed: {}", e);
+},
+recv(rx) -> _ => return
+}
+}
+});
+return Ok(Some(handle));
+}
 }
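
The `spawn_flush_thread` added above follows the same shape as the existing compaction thread: a 50 ms `crossbeam_channel::tick` timer and a shutdown receiver multiplexed with `select!`, where `trigger_flush` flushes the oldest immutable memtable once `num_memtable_limit` is exceeded. A self-contained sketch of just that loop shape (the `trigger_flush` stub below is a placeholder rather than the engine's method; only the `crossbeam-channel` crate from Cargo.toml is assumed):

    use std::time::Duration;

    // Placeholder for the real flush check; the engine's version consults
    // imm_memtables.len() against num_memtable_limit.
    fn trigger_flush() -> Result<(), String> {
        Ok(())
    }

    fn main() {
        // Channel used only to tell the worker to stop (mirrors `flush_notifier`).
        let (stop_tx, stop_rx) = crossbeam_channel::unbounded::<()>();

        let handle = std::thread::spawn(move || {
            let ticker = crossbeam_channel::tick(Duration::from_millis(50));
            loop {
                crossbeam_channel::select! {
                    // Periodic wake-up: check whether a flush is needed.
                    recv(ticker) -> _ => if let Err(e) = trigger_flush() {
                        eprintln!("flush failed: {}", e);
                    },
                    // Shutdown notification: leave the loop so the thread can be joined.
                    recv(stop_rx) -> _ => return,
                }
            }
        });

        // ... run the engine ...
        stop_tx.send(()).ok();
        handle.join().unwrap();
    }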

@@ -28,6 +28,9 @@ impl SimpleLeveledCompactionController {
 Self { options }
 }
 
+/// Generates a compaction task.
+///
+/// Returns `None` if no compaction needs to be scheduled. The order of SSTs in the compaction task id vector matters.
 pub fn generate_compaction_task(
 &self,
 snapshot: &LsmStorageState,
@@ -68,6 +71,13 @@ impl SimpleLeveledCompactionController {
 None
 }
 
+/// Apply the compaction result.
+///
+/// The compactor will call this function with the compaction task and the list of SST ids generated. This function applies the
+/// result and generates a new LSM state. The functions should only change `l0_sstables` and `levels` without changing memtables
+/// and `sstables` hash map. Though there should only be one thread running compaction jobs, you should think about the case
+/// where an L0 SST gets flushed while the compactor generates new SSTs, and with that in mind, you should do some sanity checks
+/// in your implementation.
 pub fn apply_compaction_result(
 &self,
 snapshot: &LsmStorageState,

@@ -1,3 +1,4 @@
+pub mod concat_iterator;
 pub mod merge_iterator;
 pub mod two_merge_iterator;
 
@@ -13,7 +14,9 @@ pub trait StorageIterator {
 
 /// Move to the next position.
 fn next(&mut self) -> anyhow::Result<()>;
-}
-
-#[cfg(test)]
-mod tests;
+
+/// Number of underlying active iterators for this iterator.
+fn num_active_iterators(&self) -> usize {
+1
+}
+}

mini-lsm/src/iterators/concat_iterator.rs (new file):

@@ -0,0 +1,122 @@
+use std::sync::Arc;
+
+use anyhow::Result;
+
+use crate::table::{SsTable, SsTableIterator};
+
+use super::StorageIterator;
+
+/// Concat multiple iterators ordered in key order and their key ranges do not overlap. We do not want to create the
+/// iterators when initializing this iterator to reduce the overhead of seeking.
+pub struct SstConcatIterator {
+current: Option<SsTableIterator>,
+next_sst_idx: usize,
+sstables: Vec<Arc<SsTable>>,
+}
+
+impl SstConcatIterator {
+fn check_sst_valid(sstables: &[Arc<SsTable>]) {
+for sst in sstables {
+assert!(sst.first_key() <= sst.last_key());
+}
+if !sstables.is_empty() {
+for i in 0..(sstables.len() - 1) {
+assert!(sstables[i].last_key() < sstables[i + 1].first_key());
+}
+}
+}
+
+pub fn create_and_seek_to_first(sstables: Vec<Arc<SsTable>>) -> Result<Self> {
+Self::check_sst_valid(&sstables);
+if sstables.is_empty() {
+return Ok(Self {
+current: None,
+next_sst_idx: 0,
+sstables,
+});
+}
+let mut iter = Self {
+current: Some(SsTableIterator::create_and_seek_to_first(
+sstables[0].clone(),
+)?),
+next_sst_idx: 1,
+sstables,
+};
+iter.move_until_valid()?;
+Ok(iter)
+}
+
+pub fn create_and_seek_to_key(sstables: Vec<Arc<SsTable>>, key: &[u8]) -> Result<Self> {
+Self::check_sst_valid(&sstables);
+let idx: usize = sstables
+.partition_point(|table| table.first_key() <= key)
+.saturating_sub(1);
+if idx >= sstables.len() {
+return Ok(Self {
+current: None,
+next_sst_idx: sstables.len(),
+sstables,
+});
+}
+let mut iter = Self {
+current: Some(SsTableIterator::create_and_seek_to_key(
+sstables[idx].clone(),
+key,
+)?),
+next_sst_idx: idx + 1,
+sstables,
+};
+iter.move_until_valid()?;
+Ok(iter)
+}
+
+fn move_until_valid(&mut self) -> Result<()> {
+loop {
+if let Some(iter) = self.current.as_mut() {
+if iter.is_valid() {
+break;
+}
+if self.next_sst_idx >= self.sstables.len() {
+self.current = None;
+} else {
+self.current = Some(SsTableIterator::create_and_seek_to_first(
+self.sstables[self.next_sst_idx].clone(),
+)?);
+self.next_sst_idx += 1;
+}
+} else {
+break;
+}
+}
+Ok(())
+}
+}
+
+impl StorageIterator for SstConcatIterator {
+fn key(&self) -> &[u8] {
+self.current.as_ref().unwrap().key()
+}
+
+fn value(&self) -> &[u8] {
+self.current.as_ref().unwrap().value()
+}
+
+fn is_valid(&self) -> bool {
+if let Some(current) = &self.current {
+assert!(current.is_valid());
+true
+} else {
+false
+}
+}
+
+fn next(&mut self) -> Result<()> {
+self.current.as_mut().unwrap().next()?;
+self.move_until_valid()?;
+Ok(())
+}
+
+fn num_active_iterators(&self) -> usize {
+1
+}
+}
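
`create_and_seek_to_key` relies on the SSTs being sorted by key and non-overlapping: `partition_point` counts how many tables start at or before the key, and `saturating_sub(1)` backs up to the single table that can contain it. The index arithmetic can be checked in isolation on plain byte slices (the sample first keys below are made up):

    fn main() {
        // Pretend these are the first keys of three non-overlapping SSTs.
        let first_keys: Vec<&[u8]> = vec![&b"a"[..], &b"k"[..], &b"t"[..]];

        // Same expression as in SstConcatIterator::create_and_seek_to_key:
        // last table whose first key <= key (or table 0 if key sorts before all).
        let seek = |key: &[u8]| {
            first_keys
                .partition_point(|first| *first <= key)
                .saturating_sub(1)
        };

        assert_eq!(seek(b"b"), 0); // falls inside the first table
        assert_eq!(seek(b"k"), 1); // exactly at a boundary
        assert_eq!(seek(b"z"), 2); // past the last first key -> last table
        assert_eq!(seek(b"0"), 0); // before everything: saturating_sub keeps index 0
        println!("seek index checks passed");
    }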
@@ -77,13 +77,11 @@ impl<I: StorageIterator> MergeIterator<I> {
 
 impl<I: StorageIterator> StorageIterator for MergeIterator<I> {
 fn key(&self) -> &[u8] {
-unsafe { self.current.as_ref().unwrap_unchecked() }.1.key()
+self.current.as_ref().unwrap().1.key()
 }
 
 fn value(&self) -> &[u8] {
-unsafe { self.current.as_ref().unwrap_unchecked() }
-.1
-.value()
+self.current.as_ref().unwrap().1.value()
 }
 
 fn is_valid(&self) -> bool {
@@ -94,7 +92,7 @@ impl<I: StorageIterator> StorageIterator for MergeIterator<I> {
 }
 
 fn next(&mut self) -> Result<()> {
-let current = unsafe { self.current.as_mut().unwrap_unchecked() };
+let current = self.current.as_mut().unwrap();
 // Pop the item out of the heap if they have the same value.
 while let Some(mut inner_iter) = self.iters.peek_mut() {
 debug_assert!(
@@ -136,4 +134,16 @@ impl<I: StorageIterator> StorageIterator for MergeIterator<I> {
 
 Ok(())
 }
+
+fn num_active_iterators(&self) -> usize {
+self.iters
+.iter()
+.map(|x| x.1.num_active_iterators())
+.sum::<usize>()
++ self
+.current
+.as_ref()
+.map(|x| x.1.num_active_iterators())
+.unwrap_or(0)
+}
 }
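
`num_active_iterators` exists so tests can check how many underlying iterators a composite iterator still drives: leaf iterators report 1 through the trait default, while `MergeIterator` sums its heap plus the current iterator and `TwoMergeIterator` adds both sides. The counting rule in isolation, over a stripped-down stand-in trait (not the crate's `StorageIterator`):

    // Stand-in trait: only the counting method.
    trait ActiveCount {
        fn num_active_iterators(&self) -> usize {
            1 // a leaf drives exactly one underlying cursor
        }
    }

    struct Leaf;
    impl ActiveCount for Leaf {}

    // A composite like MergeIterator: children in a heap plus an optional current.
    struct Merge {
        heap: Vec<Box<dyn ActiveCount>>,
        current: Option<Box<dyn ActiveCount>>,
    }

    impl ActiveCount for Merge {
        fn num_active_iterators(&self) -> usize {
            self.heap.iter().map(|x| x.num_active_iterators()).sum::<usize>()
                + self.current.as_ref().map(|x| x.num_active_iterators()).unwrap_or(0)
        }
    }

    fn main() {
        let mut heap: Vec<Box<dyn ActiveCount>> = Vec::new();
        heap.push(Box::new(Leaf));
        heap.push(Box::new(Leaf));
        let merge = Merge { heap, current: Some(Box::new(Leaf)) };
        // Two leaves in the heap plus the current one.
        assert_eq!(merge.num_active_iterators(), 3);
    }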

@@ -77,4 +77,8 @@ impl<A: StorageIterator, B: StorageIterator> StorageIterator for TwoMergeIterato
 self.choose_a = Self::choose_a(&self.a, &self.b);
 Ok(())
 }
+
+fn num_active_iterators(&self) -> usize {
+self.a.num_active_iterators() + self.b.num_active_iterators()
+}
 }

@@ -1,19 +1,23 @@
 use std::ops::Bound;
 
-use anyhow::Result;
+use anyhow::{bail, Result};
 use bytes::Bytes;
 
+use crate::iterators::concat_iterator::SstConcatIterator;
 use crate::iterators::merge_iterator::MergeIterator;
 use crate::iterators::two_merge_iterator::TwoMergeIterator;
 use crate::iterators::StorageIterator;
 use crate::mem_table::MemTableIterator;
 use crate::table::SsTableIterator;
 
-type LsmIteratorInner =
-TwoMergeIterator<MergeIterator<MemTableIterator>, MergeIterator<SsTableIterator>>;
+/// Represents the internal type for an LSM iterator. This type will be changed across the tutorial for multiple times.
+type LsmIteratorInner = TwoMergeIterator<
+TwoMergeIterator<MergeIterator<MemTableIterator>, MergeIterator<SsTableIterator>>,
+MergeIterator<SstConcatIterator>,
+>;
 
 pub struct LsmIterator {
-iter: LsmIteratorInner,
+inner: LsmIteratorInner,
 end_bound: Bound<Bytes>,
 is_valid: bool,
 }
@@ -22,7 +26,7 @@ impl LsmIterator {
 pub(crate) fn new(iter: LsmIteratorInner, end_bound: Bound<Bytes>) -> Result<Self> {
 let mut iter = Self {
 is_valid: iter.is_valid(),
-iter,
+inner: iter,
 end_bound,
 };
 iter.move_to_non_delete()?;
@@ -30,21 +34,21 @@ impl LsmIterator {
 }
 
 fn next_inner(&mut self) -> Result<()> {
-self.iter.next()?;
-if !self.iter.is_valid() {
+self.inner.next()?;
+if !self.inner.is_valid() {
 self.is_valid = false;
 return Ok(());
 }
 match self.end_bound.as_ref() {
 Bound::Unbounded => {}
-Bound::Included(key) => self.is_valid = self.iter.key() <= key.as_ref(),
-Bound::Excluded(key) => self.is_valid = self.iter.key() < key.as_ref(),
+Bound::Included(key) => self.is_valid = self.inner.key() <= key.as_ref(),
+Bound::Excluded(key) => self.is_valid = self.inner.key() < key.as_ref(),
 }
 Ok(())
 }
 
 fn move_to_non_delete(&mut self) -> Result<()> {
-while self.is_valid() && self.iter.value().is_empty() {
+while self.is_valid() && self.inner.value().is_empty() {
 self.next_inner()?;
 }
 Ok(())
@@ -57,11 +61,11 @@ impl StorageIterator for LsmIterator {
 }
 
 fn key(&self) -> &[u8] {
-self.iter.key()
+self.inner.key()
 }
 
 fn value(&self) -> &[u8] {
-self.iter.value()
+self.inner.value()
 }
 
 fn next(&mut self) -> Result<()> {
@@ -69,38 +73,63 @@ impl StorageIterator for LsmIterator {
 self.move_to_non_delete()?;
 Ok(())
 }
+
+fn num_active_iterators(&self) -> usize {
+self.inner.num_active_iterators()
+}
 }
 
 /// A wrapper around existing iterator, will prevent users from calling `next` when the iterator is
-/// invalid.
+/// invalid. If an iterator is already invalid, `next` does not do anything. If `next` returns an error,
+/// `is_valid` should return false, and `next` should always return an error.
 pub struct FusedIterator<I: StorageIterator> {
 iter: I,
+has_errored: bool,
 }
 
 impl<I: StorageIterator> FusedIterator<I> {
 pub fn new(iter: I) -> Self {
-Self { iter }
+Self {
+iter,
+has_errored: false,
+}
 }
 }
 
 impl<I: StorageIterator> StorageIterator for FusedIterator<I> {
 fn is_valid(&self) -> bool {
-self.iter.is_valid()
+!self.has_errored && self.iter.is_valid()
 }
 
 fn key(&self) -> &[u8] {
+if self.has_errored || !self.iter.is_valid() {
+panic!("invalid access to the underlying iterator");
+}
 self.iter.key()
 }
 
 fn value(&self) -> &[u8] {
+if self.has_errored || !self.iter.is_valid() {
+panic!("invalid access to the underlying iterator");
+}
 self.iter.value()
 }
 
 fn next(&mut self) -> Result<()> {
-// only move when the iterator is valid
+// only move when the iterator is valid and not errored
+if self.has_errored {
+bail!("the iterator is tainted");
+}
 if self.iter.is_valid() {
-self.iter.next()?;
+if let Err(e) = self.iter.next() {
+self.has_errored = true;
+return Err(e);
+}
 }
 Ok(())
 }
+
+fn num_active_iterators(&self) -> usize {
+self.iter.num_active_iterators()
+}
 }
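
The `FusedIterator` change gives `next` a sticky error contract: after one failure, `is_valid` reports false and every later `next` fails instead of touching the broken inner iterator again. A standalone sketch of that contract over a toy iterator trait (simplified, not the crate's `StorageIterator`; only `anyhow` is assumed):

    use anyhow::{bail, Result};

    trait Iterish {
        fn is_valid(&self) -> bool;
        fn next(&mut self) -> Result<()>;
    }

    // Toy inner iterator that fails on its second `next` call.
    struct Flaky { calls: usize }
    impl Iterish for Flaky {
        fn is_valid(&self) -> bool { true }
        fn next(&mut self) -> Result<()> {
            self.calls += 1;
            if self.calls >= 2 { bail!("I/O error") } else { Ok(()) }
        }
    }

    struct Fused<I: Iterish> { iter: I, has_errored: bool }
    impl<I: Iterish> Iterish for Fused<I> {
        fn is_valid(&self) -> bool {
            !self.has_errored && self.iter.is_valid()
        }
        fn next(&mut self) -> Result<()> {
            if self.has_errored {
                bail!("the iterator is tainted"); // never retry a broken iterator
            }
            if self.iter.is_valid() {
                if let Err(e) = self.iter.next() {
                    self.has_errored = true;
                    return Err(e);
                }
            }
            Ok(())
        }
    }

    fn main() {
        let mut it = Fused { iter: Flaky { calls: 0 }, has_errored: false };
        assert!(it.next().is_ok());  // first call succeeds
        assert!(it.next().is_err()); // inner error surfaces once...
        assert!(!it.is_valid());     // ...then the wrapper reports invalid
        assert!(it.next().is_err()); // and stays in the error state
    }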

@@ -7,13 +7,14 @@ use std::sync::Arc;
 
 use anyhow::{Context, Result};
 use bytes::Bytes;
-use parking_lot::{Mutex, RwLock};
+use parking_lot::{Mutex, MutexGuard, RwLock};
 
 use crate::block::Block;
 use crate::compact::{
 CompactionController, CompactionOptions, LeveledCompactionController, LeveledCompactionOptions,
 SimpleLeveledCompactionController, SimpleLeveledCompactionOptions, TieredCompactionController,
 };
+use crate::iterators::concat_iterator::SstConcatIterator;
 use crate::iterators::merge_iterator::MergeIterator;
 use crate::iterators::two_merge_iterator::TwoMergeIterator;
 use crate::iterators::StorageIterator;
@@ -24,13 +25,14 @@ use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator};
 
 pub type BlockCache = moka::sync::Cache<(usize, usize), Arc<Block>>;
 
+/// Represents the state of the storage engine.
 #[derive(Clone)]
 pub struct LsmStorageState {
 /// The current memtable.
 pub memtable: Arc<MemTable>,
-/// Immutable memtables, from earliest to latest.
+/// Immutable memtables, from latest to earliest.
 pub imm_memtables: Vec<Arc<MemTable>>,
-/// L0 SSTs, from earliest to latest.
+/// L0 SSTs, from latest to earliest.
 pub l0_sstables: Vec<usize>,
 /// SsTables sorted by key range; L1 - L_max for leveled compaction, or tiers for tiered
 /// compaction.
@@ -47,7 +49,8 @@ impl LsmStorageState {
 ..=*max_levels)
 .map(|level| (level, Vec::new()))
 .collect::<Vec<_>>(),
-CompactionOptions::Tiered(_) | CompactionOptions::NoCompaction => Vec::new(),
+CompactionOptions::Tiered(_) => Vec::new(),
+CompactionOptions::NoCompaction => vec![(1, Vec::new())],
 };
 Self {
 memtable: Arc::new(MemTable::create(0)),
@@ -60,8 +63,11 @@ impl LsmStorageState {
 }
 
 pub struct LsmStorageOptions {
+// Block size in bytes
 pub block_size: usize,
+// SST size in bytes, also the approximate memtable capacity limit
 pub target_sst_size: usize,
+// Maximum number of memtables in memory, flush to L0 when exceeding this limit
 pub num_memtable_limit: usize,
 pub compaction_options: CompactionOptions,
 pub enable_wal: bool,
@@ -74,9 +80,50 @@ impl LsmStorageOptions {
 target_sst_size: 2 << 20,
 compaction_options: CompactionOptions::NoCompaction,
 enable_wal: false,
-num_memtable_limit: 3,
+num_memtable_limit: 50,
 }
 }
+
+pub fn default_for_week1_day6_test() -> Self {
+Self {
+block_size: 4096,
+target_sst_size: 2 << 20,
+compaction_options: CompactionOptions::NoCompaction,
+enable_wal: false,
+num_memtable_limit: 2,
+}
+}
+}
+
+fn range_overlap(
+user_begin: Bound<&[u8]>,
+user_end: Bound<&[u8]>,
+table_begin: &[u8],
+table_end: &[u8],
+) -> bool {
+match user_end {
+Bound::Excluded(key) if key <= table_begin => {
+return false;
+}
+Bound::Included(key) if key < table_begin => {
+return false;
+}
+_ => {}
+}
+match user_begin {
+Bound::Excluded(key) if key >= table_end => {
+return false;
+}
+Bound::Included(key) if key > table_end => {
+return false;
+}
+_ => {}
+}
+true
+}
+
+fn key_within(user_key: &[u8], table_begin: &[u8], table_end: &[u8]) -> bool {
+table_begin <= user_key && user_key <= table_end
 }
 
 /// The storage interface of the LSM tree.
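
The `range_overlap` helper added above lets scans skip any SST whose [first_key, last_key] range cannot intersect the user's bounds, and `key_within` plays the same role for point lookups. Its behaviour can be spot-checked on a toy table range (the keys below are arbitrary):

    use std::ops::Bound;

    // Same decision procedure as the `range_overlap` helper in the diff above.
    fn range_overlap(
        user_begin: Bound<&[u8]>,
        user_end: Bound<&[u8]>,
        table_begin: &[u8],
        table_end: &[u8],
    ) -> bool {
        match user_end {
            Bound::Excluded(key) if key <= table_begin => return false,
            Bound::Included(key) if key < table_begin => return false,
            _ => {}
        }
        match user_begin {
            Bound::Excluded(key) if key >= table_end => return false,
            Bound::Included(key) if key > table_end => return false,
            _ => {}
        }
        true
    }

    fn main() {
        // An SST covering keys "b" ..= "m".
        let (lo, hi) = (b"b".as_slice(), b"m".as_slice());

        assert!(range_overlap(Bound::Included(b"a".as_slice()), Bound::Included(b"c".as_slice()), lo, hi));
        assert!(range_overlap(Bound::Unbounded, Bound::Unbounded, lo, hi));
        // Range ends strictly before the table starts: no overlap.
        assert!(!range_overlap(Bound::Unbounded, Bound::Excluded(b"b".as_slice()), lo, hi));
        // Range starts after the table ends: no overlap.
        assert!(!range_overlap(Bound::Excluded(b"m".as_slice()), Bound::Unbounded, lo, hi));
        println!("range_overlap checks passed");
    }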
@@ -88,18 +135,26 @@ pub(crate) struct LsmStorageInner {
|
|||||||
next_sst_id: AtomicUsize,
|
next_sst_id: AtomicUsize,
|
||||||
pub(crate) options: Arc<LsmStorageOptions>,
|
pub(crate) options: Arc<LsmStorageOptions>,
|
||||||
pub(crate) compaction_controller: CompactionController,
|
pub(crate) compaction_controller: CompactionController,
|
||||||
pub(crate) manifest: Manifest,
|
pub(crate) manifest: Option<Manifest>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A thin wrapper for `LsmStorageInner` and the user interface for MiniLSM.
|
||||||
pub struct MiniLsm {
|
pub struct MiniLsm {
|
||||||
pub(crate) inner: Arc<LsmStorageInner>,
|
pub(crate) inner: Arc<LsmStorageInner>,
|
||||||
|
/// Notifies the L0 flush thread to stop working. (In week 1 day 6)
|
||||||
|
flush_notifier: crossbeam_channel::Sender<()>,
|
||||||
|
/// The handle for the compaction thread. (In week 1 day 6)
|
||||||
|
flush_thread: Mutex<Option<std::thread::JoinHandle<()>>>,
|
||||||
|
/// Notifies the compaction thread to stop working. (In week 2)
|
||||||
compaction_notifier: crossbeam_channel::Sender<()>,
|
compaction_notifier: crossbeam_channel::Sender<()>,
|
||||||
|
/// The handle for the compaction thread. (In week 2)
|
||||||
compaction_thread: Mutex<Option<std::thread::JoinHandle<()>>>,
|
compaction_thread: Mutex<Option<std::thread::JoinHandle<()>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for MiniLsm {
|
impl Drop for MiniLsm {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
self.compaction_notifier.send(()).ok();
|
self.compaction_notifier.send(()).ok();
|
||||||
|
self.flush_notifier.send(()).ok();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,22 +162,59 @@ impl MiniLsm {
|
|||||||
pub fn close(&self) -> Result<()> {
|
pub fn close(&self) -> Result<()> {
|
||||||
self.inner.sync_dir()?;
|
self.inner.sync_dir()?;
|
||||||
self.compaction_notifier.send(()).ok();
|
self.compaction_notifier.send(()).ok();
|
||||||
|
self.flush_notifier.send(()).ok();
|
||||||
|
|
||||||
|
if self.inner.options.enable_wal {
|
||||||
|
self.inner.sync()?;
|
||||||
|
self.inner.sync_dir()?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
let mut compaction_thread = self.compaction_thread.lock();
|
let mut compaction_thread = self.compaction_thread.lock();
|
||||||
if let Some(compaction_thread) = compaction_thread.take() {
|
if let Some(compaction_thread) = compaction_thread.take() {
|
||||||
compaction_thread
|
compaction_thread
|
||||||
.join()
|
.join()
|
||||||
.map_err(|e| anyhow::anyhow!("{:?}", e))?;
|
.map_err(|e| anyhow::anyhow!("{:?}", e))?;
|
||||||
}
|
}
|
||||||
|
let mut flush_thread = self.flush_thread.lock();
|
||||||
|
if let Some(flush_thread) = flush_thread.take() {
|
||||||
|
flush_thread
|
||||||
|
.join()
|
||||||
|
.map_err(|e| anyhow::anyhow!("{:?}", e))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create memtable and skip updating manifest
|
||||||
|
if !self.inner.state.read().memtable.is_empty() {
|
||||||
|
self.inner
|
||||||
|
.freeze_memtable_with_memtable(Arc::new(MemTable::create(
|
||||||
|
self.inner.next_sst_id(),
|
||||||
|
)))?;
|
||||||
|
}
|
||||||
|
|
||||||
|
while {
|
||||||
|
let snapshot = self.inner.state.read();
|
||||||
|
!snapshot.imm_memtables.is_empty()
|
||||||
|
} {
|
||||||
|
self.inner.force_flush_next_imm_memtable()?;
|
||||||
|
}
|
||||||
|
self.inner.sync_dir()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start the storage engine by either loading an existing directory or creating a new one if the directory does
|
||||||
|
/// not exist.
|
||||||
pub fn open(path: impl AsRef<Path>, options: LsmStorageOptions) -> Result<Arc<Self>> {
|
pub fn open(path: impl AsRef<Path>, options: LsmStorageOptions) -> Result<Arc<Self>> {
|
||||||
let inner = Arc::new(LsmStorageInner::open(path, options)?);
|
let inner = Arc::new(LsmStorageInner::open(path, options)?);
|
||||||
let (tx, rx) = crossbeam_channel::unbounded();
|
let (tx1, rx) = crossbeam_channel::unbounded();
|
||||||
let compaction_thread = inner.spawn_compaction_thread(rx)?;
|
let compaction_thread = inner.spawn_compaction_thread(rx)?;
|
||||||
|
let (tx2, rx) = crossbeam_channel::unbounded();
|
||||||
|
let flush_thread = inner.spawn_flush_thread(rx)?;
|
||||||
Ok(Arc::new(Self {
|
Ok(Arc::new(Self {
|
||||||
inner,
|
inner,
|
||||||
compaction_notifier: tx,
|
flush_notifier: tx2,
|
||||||
|
flush_thread: Mutex::new(flush_thread),
|
||||||
|
compaction_notifier: tx1,
|
||||||
compaction_thread: Mutex::new(compaction_thread),
|
compaction_thread: Mutex::new(compaction_thread),
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
@@ -139,6 +231,10 @@ impl MiniLsm {
|
|||||||
self.inner.delete(key)
|
self.inner.delete(key)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn sync(&self) -> Result<()> {
|
||||||
|
self.inner.sync()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn scan(
|
pub fn scan(
|
||||||
&self,
|
&self,
|
||||||
lower: Bound<&[u8]>,
|
lower: Bound<&[u8]>,
|
||||||
@@ -147,9 +243,16 @@ impl MiniLsm {
         self.inner.scan(lower, upper)
     }

+    /// Only call this in test cases due to race conditions
     pub fn force_flush(&self) -> Result<()> {
-        self.inner.force_freeze_memtable()?;
-        self.inner.force_flush_next_imm_memtable()
+        if !self.inner.state.read().memtable.is_empty() {
+            self.inner
+                .force_freeze_memtable(&self.inner.state_lock.lock())?;
+        }
+        if !self.inner.state.read().imm_memtables.is_empty() {
+            self.inner.force_flush_next_imm_memtable()?;
+        }
+        Ok(())
     }

     pub fn force_full_compaction(&self) -> Result<()> {
@@ -163,6 +266,8 @@ impl LsmStorageInner {
             .fetch_add(1, std::sync::atomic::Ordering::SeqCst)
     }

+    /// Start the storage engine by either loading an existing directory or creating a new one if the directory does
+    /// not exist.
     pub(crate) fn open(path: impl AsRef<Path>, options: LsmStorageOptions) -> Result<Self> {
         let mut state = LsmStorageState::create(&options);
         let path = path.as_ref();
@@ -204,10 +309,15 @@ impl LsmStorageInner {
                     ManifestRecord::Flush(sst_id) => {
                         let res = memtables.remove(&sst_id);
                         assert!(res, "memtable not exist?");
-                        state.l0_sstables.insert(0, sst_id);
+                        if compaction_controller.flush_to_l0() {
+                            state.l0_sstables.insert(0, sst_id);
+                        } else {
+                            state.levels.insert(0, (sst_id, vec![sst_id]));
+                        }
+                        next_sst_id = next_sst_id.max(sst_id);
                     }
                     ManifestRecord::NewMemtable(x) => {
-                        next_sst_id = x + 1;
+                        next_sst_id = next_sst_id.max(x);
                         memtables.insert(x);
                     }
                     ManifestRecord::Compaction(task, output) => {
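The replay loop above matches on three manifest record kinds. A minimal sketch of that shape, with assumed field meanings (the real ManifestRecord is defined in the manifest module, not in this hunk):

enum ManifestRecordSketch<Task> {
    Flush(usize),                 // id of the memtable/SST that was flushed
    NewMemtable(usize),           // id of a freshly created (WAL-backed) memtable
    Compaction(Task, Vec<usize>), // compaction task plus the ids of the SSTs it produced
}

Tracking next_sst_id as a running max over all replayed ids (instead of x + 1 from the latest NewMemtable record) keeps the counter correct regardless of the order in which records were written.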
@@ -215,9 +325,13 @@ impl LsmStorageInner {
                         compaction_controller.apply_compaction_result(&state, &task, &output);
                         // TODO: apply remove again
                         state = new_state;
+                        next_sst_id =
+                            next_sst_id.max(output.iter().max().copied().unwrap_or_default());
                     }
                 }
             }

+        let mut sst_cnt = 0;
         // recover SSTs
         for table_id in state
             .l0_sstables
@@ -232,15 +346,24 @@ impl LsmStorageInner {
                     .context("failed to open SST")?,
             )?;
             state.sstables.insert(table_id, Arc::new(sst));
+            sst_cnt += 1;
         }
+        println!("{} SSTs opened", sst_cnt);
+
+        next_sst_id += 1;
+
         // recover memtables
         if options.enable_wal {
+            let mut wal_cnt = 0;
             for id in memtables.iter() {
                 let memtable =
                     MemTable::recover_from_wal(*id, Self::path_of_wal_static(path, *id))?;
-                state.imm_memtables.insert(0, Arc::new(memtable));
-                next_sst_id = *id + 1;
+                if !memtable.is_empty() {
+                    state.imm_memtables.insert(0, Arc::new(memtable));
+                    wal_cnt += 1;
+                }
             }
+            println!("{} WALs recovered", wal_cnt);
             state.memtable = Arc::new(MemTable::create_with_wal(
                 next_sst_id,
                 Self::path_of_wal_static(path, next_sst_id),
@@ -260,7 +383,7 @@ impl LsmStorageInner {
             block_cache,
             next_sst_id: AtomicUsize::new(next_sst_id),
             compaction_controller,
-            manifest,
+            manifest: Some(manifest),
             options: options.into(),
         };
         storage.sync_dir()?;
@@ -268,6 +391,10 @@ impl LsmStorageInner {
         Ok(storage)
     }

+    pub fn sync(&self) -> Result<()> {
+        self.state.read().memtable.sync_wal()
+    }
+
     /// Get a key from the storage. In day 7, this can be further optimized by using a bloom filter.
     pub fn get(&self, key: &[u8]) -> Result<Option<Bytes>> {
         let snapshot = {
@@ -294,19 +421,47 @@ impl LsmStorageInner {
                 return Ok(Some(value));
             }
         }
-        let mut iters = Vec::with_capacity(snapshot.l0_sstables.len());
-        for table in snapshot
-            .l0_sstables
-            .iter()
-            .chain(snapshot.levels.iter().map(|(_, files)| files).flatten())
-        {
-            iters.push(Box::new(SsTableIterator::create_and_seek_to_key(
-                snapshot.sstables[table].clone(),
-                key,
-            )?));
+
+        let mut l0_iters = Vec::with_capacity(snapshot.l0_sstables.len());
+
+        let keep_table = |key: &[u8], table: &SsTable| {
+            if key_within(key, table.first_key(), table.last_key()) {
+                if let Some(bloom) = &table.bloom {
+                    if bloom.may_contain(farmhash::fingerprint32(key)) {
+                        return true;
+                    }
+                } else {
+                    return true;
+                }
+            }
+            false
+        };
+
+        for table in snapshot.l0_sstables.iter() {
+            let table = snapshot.sstables[table].clone();
+            if keep_table(key, &table) {
+                l0_iters.push(Box::new(SsTableIterator::create_and_seek_to_key(
+                    table, key,
+                )?));
+            }
         }
-        let iter = MergeIterator::create(iters);
-        if iter.is_valid() && iter.key() == key {
+        let l0_iter = MergeIterator::create(l0_iters);
+        let mut level_iters = Vec::with_capacity(snapshot.levels.len());
+        for (_, level_sst_ids) in &snapshot.levels {
+            let mut level_ssts = Vec::with_capacity(snapshot.levels[0].1.len());
+            for table in level_sst_ids {
+                let table = snapshot.sstables[table].clone();
+                if keep_table(key, &table) {
+                    level_ssts.push(table);
+                }
+            }
+            let level_iter = SstConcatIterator::create_and_seek_to_key(level_ssts, key)?;
+            level_iters.push(Box::new(level_iter));
+        }
+
+        let iter = TwoMergeIterator::create(l0_iter, MergeIterator::create(level_iters))?;
+
+        if iter.is_valid() && iter.key() == key && !iter.value().is_empty() {
             return Ok(Some(Bytes::copy_from_slice(iter.value())));
         }
         Ok(None)
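The keep_table closure above first checks the table's key range and only then consults the bloom filter. The key_within helper it calls is not part of this hunk; a minimal sketch of such a check, assuming inclusive first/last keys:

fn key_within(key: &[u8], first_key: &[u8], last_key: &[u8]) -> bool {
    // lexicographic comparison on byte slices, both bounds inclusive
    first_key <= key && key <= last_key
}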
@@ -317,8 +472,14 @@ impl LsmStorageInner {
         assert!(!value.is_empty(), "value cannot be empty");
         assert!(!key.is_empty(), "key cannot be empty");

-        let guard = self.state.read();
-        guard.memtable.put(key, value)?;
+        let size;
+        {
+            let guard = self.state.read();
+            guard.memtable.put(key, value)?;
+            size = guard.memtable.approximate_size();
+        }
+
+        self.try_freeze(size)?;
+
         Ok(())
     }
@@ -327,9 +488,28 @@ impl LsmStorageInner {
     pub fn delete(&self, key: &[u8]) -> Result<()> {
         assert!(!key.is_empty(), "key cannot be empty");

-        let guard = self.state.read();
-        guard.memtable.put(key, b"")?;
+        let size;
+        {
+            let guard = self.state.read();
+            guard.memtable.put(key, b"")?;
+            size = guard.memtable.approximate_size();
+        }
+
+        self.try_freeze(size)?;
+
+        Ok(())
+    }
+
+    fn try_freeze(&self, estimated_size: usize) -> Result<()> {
+        if estimated_size >= self.options.target_sst_size {
+            let state_lock = self.state_lock.lock();
+            let guard = self.state.read();
+            // the memtable could have already been frozen, check again to ensure we really need to freeze
+            if guard.memtable.approximate_size() >= self.options.target_sst_size {
+                drop(guard);
+                self.force_freeze_memtable(&state_lock)?;
+            }
+        }
         Ok(())
     }

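put() and delete() now share the shape shown above: the size estimate is sampled while the read guard is held, and the guard is dropped before try_freeze(), which may need to take the state lock and the write lock on state. A condensed sketch of that flow (crate-internal only, since it touches private fields; not a drop-in function from this diff):

fn write_path_sketch(inner: &LsmStorageInner, key: &[u8], value: &[u8]) -> anyhow::Result<()> {
    let size = {
        let guard = inner.state.read();
        guard.memtable.put(key, value)?;
        guard.memtable.approximate_size()
    }; // the read guard is dropped here, before any lock needed for a freeze is taken
    inner.try_freeze(size)
}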
@@ -349,39 +529,46 @@ impl LsmStorageInner {
         Self::path_of_wal_static(&self.path, id)
     }

-    fn sync_dir(&self) -> Result<()> {
+    pub(super) fn sync_dir(&self) -> Result<()> {
         File::open(&self.path)?.sync_all()?;
         Ok(())
     }

-    /// Force freeze the current memetable to an immutable memtable
-    pub fn force_freeze_memtable(&self) -> Result<()> {
-        let state_lock = self.state_lock.lock();
-
-        let memtable_id = self.next_sst_id();
-        let memtable = Arc::new(if self.options.enable_wal {
-            let mt = MemTable::create_with_wal(memtable_id, self.path_of_wal(memtable_id))?;
-            self.sync_dir()?;
-            mt
-        } else {
-            MemTable::create(memtable_id)
-        });
-
-        let old_memtable;
-        {
-            let mut guard = self.state.write();
-            // Swap the current memtable with a new one.
-            let mut snapshot = guard.as_ref().clone();
-            old_memtable = std::mem::replace(&mut snapshot.memtable, memtable);
-            // Add the memtable to the immutable memtables.
-            snapshot.imm_memtables.insert(0, old_memtable.clone());
-            // Update the snapshot.
-            *guard = Arc::new(snapshot);
-        }
+    fn freeze_memtable_with_memtable(&self, memtable: Arc<MemTable>) -> Result<()> {
+        let mut guard = self.state.write();
+        // Swap the current memtable with a new one.
+        let mut snapshot = guard.as_ref().clone();
+        let old_memtable = std::mem::replace(&mut snapshot.memtable, memtable);
+        // Add the memtable to the immutable memtables.
+        snapshot.imm_memtables.insert(0, old_memtable.clone());
+        // Update the snapshot.
+        *guard = Arc::new(snapshot);
+
+        drop(guard);
         old_memtable.sync_wal()?;

-        self.manifest
-            .add_record(&state_lock, ManifestRecord::NewMemtable(memtable_id))?;
+        Ok(())
+    }
+
+    /// Force freeze the current memtable to an immutable memtable
+    pub fn force_freeze_memtable(&self, state_lock_observer: &MutexGuard<'_, ()>) -> Result<()> {
+        let memtable_id = self.next_sst_id();
+        let memtable = if self.options.enable_wal {
+            Arc::new(MemTable::create_with_wal(
+                memtable_id,
+                self.path_of_wal(memtable_id),
+            )?)
+        } else {
+            Arc::new(MemTable::create(memtable_id))
+        };
+
+        self.freeze_memtable_with_memtable(memtable)?;
+
+        self.manifest.as_ref().unwrap().add_record(
+            state_lock_observer,
+            ManifestRecord::NewMemtable(memtable_id),
+        )?;
+        self.sync_dir()?;

         Ok(())
     }
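Callers of force_freeze_memtable must now hold the state lock; the test harness later in this diff does exactly that. A condensed, crate-internal sketch of the calling convention:

fn freeze_and_flush(storage: &LsmStorageInner) -> anyhow::Result<()> {
    storage.force_freeze_memtable(&storage.state_lock.lock())?;
    storage.force_flush_next_imm_memtable()?;
    Ok(())
}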
@@ -436,6 +623,8 @@ impl LsmStorageInner {
         }

         self.manifest
+            .as_ref()
+            .unwrap()
             .add_record(&state_lock, ManifestRecord::Flush(sst_id))?;

         self.sync_dir()?;
@@ -462,30 +651,52 @@ impl LsmStorageInner {
         let memtable_iter = MergeIterator::create(memtable_iters);

         let mut table_iters = Vec::with_capacity(snapshot.l0_sstables.len());
-        for table_id in snapshot
-            .l0_sstables
-            .iter()
-            .chain(snapshot.levels.iter().map(|(_, files)| files).flatten())
-        {
+        for table_id in snapshot.l0_sstables.iter() {
             let table = snapshot.sstables[table_id].clone();
-            let iter = match lower {
-                Bound::Included(key) => SsTableIterator::create_and_seek_to_key(table, key)?,
+            if range_overlap(lower, upper, table.first_key(), table.last_key()) {
+                let iter = match lower {
+                    Bound::Included(key) => SsTableIterator::create_and_seek_to_key(table, key)?,
+                    Bound::Excluded(key) => {
+                        let mut iter = SsTableIterator::create_and_seek_to_key(table, key)?;
+                        if iter.is_valid() && iter.key() == key {
+                            iter.next()?;
+                        }
+                        iter
+                    }
+                    Bound::Unbounded => SsTableIterator::create_and_seek_to_first(table)?,
+                };
+
+                table_iters.push(Box::new(iter));
+            }
+        }
+
+        let l0_iter = MergeIterator::create(table_iters);
+        let mut level_iters = Vec::with_capacity(snapshot.levels.len());
+        for (_, level_sst_ids) in &snapshot.levels {
+            let mut level_ssts = Vec::with_capacity(level_sst_ids.len());
+            for table in level_sst_ids {
+                let table = snapshot.sstables[table].clone();
+                if range_overlap(lower, upper, table.first_key(), table.last_key()) {
+                    level_ssts.push(table);
+                }
+            }
+
+            let level_iter = match lower {
+                Bound::Included(key) => SstConcatIterator::create_and_seek_to_key(level_ssts, key)?,
                 Bound::Excluded(key) => {
-                    let mut iter = SsTableIterator::create_and_seek_to_key(table, key)?;
+                    let mut iter = SstConcatIterator::create_and_seek_to_key(level_ssts, key)?;
                     if iter.is_valid() && iter.key() == key {
                         iter.next()?;
                     }
                     iter
                 }
-                Bound::Unbounded => SsTableIterator::create_and_seek_to_first(table)?,
+                Bound::Unbounded => SstConcatIterator::create_and_seek_to_first(level_ssts)?,
             };
-            table_iters.push(Box::new(iter));
+            level_iters.push(Box::new(level_iter));
         }

-        let table_iter = MergeIterator::create(table_iters);
-
-        let iter = TwoMergeIterator::create(memtable_iter, table_iter)?;
+        let iter = TwoMergeIterator::create(memtable_iter, l0_iter)?;
+        let iter = TwoMergeIterator::create(iter, MergeIterator::create(level_iters))?;

         Ok(FusedIterator::new(LsmIterator::new(
             iter,
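Like the read path, the new scan() prunes tables with a range_overlap helper that is not part of this hunk. A minimal sketch of what such a check might look like, assuming inclusive table key ranges:

use std::ops::Bound;

fn range_overlap(
    user_begin: Bound<&[u8]>,
    user_end: Bound<&[u8]>,
    table_begin: &[u8],
    table_end: &[u8],
) -> bool {
    // the user range ends before the table starts
    match user_end {
        Bound::Excluded(key) if key <= table_begin => return false,
        Bound::Included(key) if key < table_begin => return false,
        _ => {}
    }
    // the user range starts after the table ends
    match user_begin {
        Bound::Excluded(key) if key >= table_end => return false,
        Bound::Included(key) if key > table_end => return false,
        _ => {}
    }
    true
}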
@@ -1,5 +1,8 @@
+#![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality
+
 use std::ops::Bound;
 use std::path::Path;
+use std::sync::atomic::AtomicUsize;
 use std::sync::Arc;

 use anyhow::Result;
@@ -12,13 +15,18 @@ use crate::iterators::StorageIterator;
 use crate::table::SsTableBuilder;
 use crate::wal::Wal;

-/// A basic mem-table based on crossbeam-skiplist
+/// A basic mem-table based on crossbeam-skiplist.
+///
+/// An initial implementation of memtable is part of week 1, day 1. It will be incrementally implemented in other
+/// chapters of week 1 and week 2.
 pub struct MemTable {
     map: Arc<SkipMap<Bytes, Bytes>>,
     wal: Option<Wal>,
     id: usize,
+    approximate_size: Arc<AtomicUsize>,
 }

+/// Create a bound of `Bytes` from a bound of `&[u8]`.
 pub(crate) fn map_bound(bound: Bound<&[u8]>) -> Bound<Bytes> {
     match bound {
         Bound::Included(x) => Bound::Included(Bytes::copy_from_slice(x)),
@@ -34,6 +42,7 @@ impl MemTable {
             id,
             map: Arc::new(SkipMap::new()),
             wal: None,
+            approximate_size: Arc::new(AtomicUsize::new(0)),
         }
     }

@@ -43,6 +52,7 @@ impl MemTable {
             id,
             map: Arc::new(SkipMap::new()),
             wal: Some(Wal::create(path.as_ref())?),
+            approximate_size: Arc::new(AtomicUsize::new(0)),
         })
     }

@@ -53,6 +63,7 @@ impl MemTable {
             id,
             wal: Some(Wal::recover(path.as_ref(), &map)?),
             map,
+            approximate_size: Arc::new(AtomicUsize::new(0)),
         })
     }

@@ -62,9 +73,15 @@ impl MemTable {
     }

     /// Put a key-value pair into the mem-table.
+    ///
+    /// In week 1, day 1, simply put the key-value pair into the skipmap.
+    /// In week 2, day 6, also flush the data to WAL.
     pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> {
+        let estimated_size = key.len() + value.len();
         self.map
             .insert(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value));
+        self.approximate_size
+            .fetch_add(estimated_size, std::sync::atomic::Ordering::Relaxed);
         if let Some(ref wal) = self.wal {
             wal.put(key, value)?;
         }
@@ -84,7 +101,7 @@ impl MemTable {
         let mut iter = MemTableIteratorBuilder {
             map: self.map.clone(),
             iter_builder: |map| map.range((lower, upper)),
-            item: (Bytes::from_static(&[]), Bytes::from_static(&[])),
+            item: (Bytes::new(), Bytes::new()),
         }
         .build();
         let entry = iter.with_iter_mut(|iter| MemTableIterator::entry_to_item(iter.next()));
@@ -92,7 +109,7 @@ impl MemTable {
         iter
     }

-    /// Flush the mem-table to SSTable.
+    /// Flush the mem-table to SSTable. Implement in week 1 day 6.
     pub fn flush(&self, builder: &mut SsTableBuilder) -> Result<()> {
         for entry in self.map.iter() {
             builder.add(&entry.key()[..], &entry.value()[..]);
@@ -103,18 +120,34 @@ impl MemTable {
     pub fn id(&self) -> usize {
         self.id
     }

+    pub fn approximate_size(&self) -> usize {
+        self.approximate_size
+            .load(std::sync::atomic::Ordering::Relaxed)
+    }
+
+    /// Only use this function when closing the database
+    pub fn is_empty(&self) -> bool {
+        self.map.is_empty()
+    }
 }

 type SkipMapRangeIter<'a> =
     crossbeam_skiplist::map::Range<'a, Bytes, (Bound<Bytes>, Bound<Bytes>), Bytes, Bytes>;

-/// An iterator over a range of `SkipMap`.
+/// An iterator over a range of `SkipMap`. This is a self-referential structure and please refer to week 1, day 2
+/// chapter for more information.
+///
+/// This is part of week 1, day 2.
 #[self_referencing]
 pub struct MemTableIterator {
+    /// Stores a reference to the skipmap.
     map: Arc<SkipMap<Bytes, Bytes>>,
+    /// Stores a skipmap iterator that refers to the lifetime of `MemTableIterator` itself.
     #[borrows(map)]
     #[not_covariant]
     iter: SkipMapRangeIter<'this>,
+    /// Stores the current key-value pair.
     item: (Bytes, Bytes),
 }

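approximate_size() is a heuristic, not exact accounting: put() adds key.len() + value.len() unconditionally, so overwriting a key grows the estimate even though only one entry remains in the skipmap, and Relaxed ordering is enough because the value only gates a freeze decision. A small illustration using the constructors shown in these hunks:

fn approximate_size_sketch() -> anyhow::Result<()> {
    let memtable = MemTable::create(0);
    memtable.put(b"key", b"v1")?; // +5 bytes (3-byte key + 2-byte value)
    memtable.put(b"key", b"v2")?; // +5 bytes again, although the old value is replaced
    assert_eq!(memtable.approximate_size(), 10);
    Ok(())
}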
@@ -145,6 +178,3 @@ impl StorageIterator for MemTableIterator {
         Ok(())
     }
 }
-
-#[cfg(test)]
-mod tests;
@@ -1,3 +1,4 @@
+pub(crate) mod bloom;
 mod builder;
 mod iterator;

@@ -13,6 +14,8 @@ pub use iterator::SsTableIterator;
 use crate::block::Block;
 use crate::lsm_storage::BlockCache;

+use self::bloom::Bloom;
+
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct BlockMeta {
     /// Offset of this data block.
@@ -107,16 +110,20 @@ impl FileObject {
     }
 }

+/// An SSTable.
 pub struct SsTable {
-    file: FileObject,
-    block_meta: Vec<BlockMeta>,
-    block_meta_offset: usize,
+    /// The actual storage unit of SsTable, the format is as above.
+    pub(crate) file: FileObject,
+    /// The meta blocks that hold info for data blocks.
+    pub(crate) block_meta: Vec<BlockMeta>,
+    /// The offset that indicates the start point of meta blocks in `file`.
+    pub(crate) block_meta_offset: usize,
     id: usize,
     block_cache: Option<Arc<BlockCache>>,
     first_key: Bytes,
     last_key: Bytes,
+    pub(crate) bloom: Option<Bloom>,
 }

 impl SsTable {
     #[cfg(test)]
     pub(crate) fn open_for_test(file: FileObject) -> Result<Self> {
@@ -126,9 +133,13 @@ impl SsTable {
     /// Open SSTable from a file.
     pub fn open(id: usize, block_cache: Option<Arc<BlockCache>>, file: FileObject) -> Result<Self> {
         let len = file.size();
-        let raw_meta_offset = file.read(len - 4, 4)?;
+        let raw_bloom_offset = file.read(len - 4, 4)?;
+        let bloom_offset = (&raw_bloom_offset[..]).get_u32() as u64;
+        let raw_bloom = file.read(bloom_offset, len - 4 - bloom_offset)?;
+        let bloom_filter = Bloom::decode(&raw_bloom);
+        let raw_meta_offset = file.read(bloom_offset - 4, 4)?;
         let block_meta_offset = (&raw_meta_offset[..]).get_u32() as u64;
-        let raw_meta = file.read(block_meta_offset, len - 4 - block_meta_offset)?;
+        let raw_meta = file.read(block_meta_offset, bloom_offset - 4 - block_meta_offset)?;
         let block_meta = BlockMeta::decode_block_meta(&raw_meta[..]);
         Ok(Self {
             file,
@@ -138,6 +149,7 @@ impl SsTable {
             block_meta_offset: block_meta_offset as usize,
             id,
             block_cache,
+            bloom: Some(bloom_filter),
         })
     }

@@ -151,6 +163,7 @@ impl SsTable {
             block_cache: None,
             first_key,
             last_key,
+            bloom: None,
         }
     }

@@ -207,6 +220,3 @@ impl SsTable {
         self.id
     }
 }
-
-#[cfg(test)]
-mod tests;
mini-lsm/src/table/bloom.rs (new file, 113 lines)
@@ -0,0 +1,113 @@
+// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0.
+
+use bytes::{BufMut, Bytes, BytesMut};
+
+/// Implements a bloom filter
+pub struct Bloom {
+    /// data of filter in bits
+    pub(crate) filter: Bytes,
+    /// number of hash functions
+    pub(crate) k: u8,
+}
+
+pub trait BitSlice {
+    fn get_bit(&self, idx: usize) -> bool;
+    fn bit_len(&self) -> usize;
+}
+
+pub trait BitSliceMut {
+    fn set_bit(&mut self, idx: usize, val: bool);
+}
+
+impl<T: AsRef<[u8]>> BitSlice for T {
+    fn get_bit(&self, idx: usize) -> bool {
+        let pos = idx / 8;
+        let offset = idx % 8;
+        (self.as_ref()[pos] & (1 << offset)) != 0
+    }
+
+    fn bit_len(&self) -> usize {
+        self.as_ref().len() * 8
+    }
+}
+
+impl<T: AsMut<[u8]>> BitSliceMut for T {
+    fn set_bit(&mut self, idx: usize, val: bool) {
+        let pos = idx / 8;
+        let offset = idx % 8;
+        if val {
+            self.as_mut()[pos] |= 1 << offset;
+        } else {
+            self.as_mut()[pos] &= !(1 << offset);
+        }
+    }
+}
+
+impl Bloom {
+    /// Decode a bloom filter
+    pub fn decode(buf: &[u8]) -> Self {
+        let filter = &buf[..buf.len() - 1];
+        let k = buf[buf.len() - 1];
+        Self {
+            filter: filter.to_vec().into(),
+            k,
+        }
+    }
+
+    /// Encode a bloom filter
+    pub fn encode(&self, buf: &mut Vec<u8>) {
+        buf.extend(&self.filter);
+        buf.put_u8(self.k);
+    }
+
+    /// Get bloom filter bits per key from entries count and FPR
+    pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize {
+        let size =
+            -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2);
+        let locs = (size / (entries as f64)).ceil();
+        locs as usize
+    }
+
+    /// Build bloom filter from key hashes
+    pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self {
+        let k = (bits_per_key as f64 * 0.69) as u32;
+        let k = k.min(30).max(1);
+        let nbits = (keys.len() * bits_per_key).max(64);
+        let nbytes = (nbits + 7) / 8;
+        let nbits = nbytes * 8;
+        let mut filter = BytesMut::with_capacity(nbytes);
+        filter.resize(nbytes, 0);
+        for h in keys {
+            let mut h = *h;
+            let delta = (h >> 17) | (h << 15);
+            for _ in 0..k {
+                let bit_pos = (h as usize) % nbits;
+                filter.set_bit(bit_pos, true);
+                h = h.wrapping_add(delta);
+            }
+        }
+        Self {
+            filter: filter.freeze(),
+            k: k as u8,
+        }
+    }
+
+    /// Check if a bloom filter may contain some data
+    pub fn may_contain(&self, mut h: u32) -> bool {
+        if self.k > 30 {
+            // potential new encoding for short bloom filters
+            true
+        } else {
+            let nbits = self.filter.bit_len();
+            let delta = (h >> 17) | (h << 15);
+            for _ in 0..self.k {
+                let bit_pos = h % (nbits as u32);
+                if !self.filter.get_bit(bit_pos as usize) {
+                    return false;
+                }
+                h = h.wrapping_add(delta);
+            }
+            true
+        }
+    }
+}
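For the 1% false-positive target used by the SST builder below, bloom_bits_per_key works out to ceil(-ln(0.01) / ln(2)^2) ≈ 10 bits per key, and build_from_key_hashes then picks k = 0.69 × 10 ≈ 6 probes. A small round-trip sketch of this module's API, using the same farmhash fingerprints as the builder and read path:

fn bloom_roundtrip_sketch() {
    let keys: Vec<u32> = ["alpha", "beta", "gamma"]
        .iter()
        .map(|k| farmhash::fingerprint32(k.as_bytes()))
        .collect();
    let bloom = Bloom::build_from_key_hashes(&keys, Bloom::bloom_bits_per_key(keys.len(), 0.01));

    let mut buf = Vec::new();
    bloom.encode(&mut buf);
    let decoded = Bloom::decode(&buf);

    // inserted keys always report "maybe present"
    assert!(decoded.may_contain(farmhash::fingerprint32(b"alpha")));
    // absent keys are usually rejected, but false positives are possible by design
    let _maybe = decoded.may_contain(farmhash::fingerprint32(b"delta"));
}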
@@ -4,6 +4,7 @@ use std::sync::Arc;
 use anyhow::Result;
 use bytes::BufMut;

+use super::bloom::Bloom;
 use super::{BlockMeta, FileObject, SsTable};
 use crate::block::BlockBuilder;
 use crate::lsm_storage::BlockCache;
@@ -14,8 +15,9 @@ pub struct SsTableBuilder {
     first_key: Vec<u8>,
     last_key: Vec<u8>,
     data: Vec<u8>,
-    pub(super) meta: Vec<BlockMeta>,
+    pub(crate) meta: Vec<BlockMeta>,
     block_size: usize,
+    key_hashes: Vec<u32>,
 }

 impl SsTableBuilder {
@@ -28,6 +30,7 @@ impl SsTableBuilder {
             last_key: Vec::new(),
             block_size,
             builder: BlockBuilder::new(block_size),
+            key_hashes: Vec::new(),
         }
     }

@@ -38,6 +41,8 @@ impl SsTableBuilder {
             self.first_key.extend(key);
         }

+        self.key_hashes.push(farmhash::fingerprint32(key));
+
         if self.builder.add(key, value) {
             self.last_key.clear();
             self.last_key.extend(key);
@@ -71,7 +76,7 @@ impl SsTableBuilder {
         self.data.extend(encoded_block);
     }

-    /// Builds the SSTable and writes it to the given path.
+    /// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects.
     pub fn build(
         mut self,
         id: usize,
@@ -83,6 +88,13 @@ impl SsTableBuilder {
         let meta_offset = buf.len();
         BlockMeta::encode_block_meta(&self.meta, &mut buf);
         buf.put_u32(meta_offset as u32);
+        let bloom = Bloom::build_from_key_hashes(
+            &self.key_hashes,
+            Bloom::bloom_bits_per_key(self.key_hashes.len(), 0.01),
+        );
+        let bloom_offset = buf.len();
+        bloom.encode(&mut buf);
+        buf.put_u32(bloom_offset as u32);
         let file = FileObject::create(path.as_ref(), buf)?;
         Ok(SsTable {
             id,
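Taken together with SsTable::open above, build() now produces a file laid out as | data blocks | block meta | meta_offset: u32 | bloom filter | bloom_offset: u32 |, with both offsets measured from the start of the file. A sketch of how a reader locates the two sections from the trailing integers (illustrative only; the real decoding is the FileObject-based code in open()):

use bytes::Buf;

fn decode_footer_sketch(file: &[u8]) -> (usize, usize) {
    let len = file.len();
    let bloom_offset = (&file[len - 4..]).get_u32() as usize;
    let meta_offset = (&file[bloom_offset - 4..bloom_offset]).get_u32() as usize;
    // meta section: [meta_offset, bloom_offset - 4); bloom section: [bloom_offset, len - 4)
    (meta_offset, bloom_offset)
}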
@@ -92,6 +104,7 @@ impl SsTableBuilder {
             block_meta: self.meta,
             block_meta_offset: meta_offset,
             block_cache,
+            bloom: Some(bloom),
         })
     }

@@ -1 +1,9 @@
+mod harness;
+mod week1_day1;
+mod week1_day2;
+mod week1_day3;
+mod week1_day4;
+mod week1_day5;
+mod week1_day6;
+mod week1_day7;
+mod week2_day1;
@@ -124,10 +124,9 @@ pub fn generate_sst(
     builder.build(id, block_cache, path.as_ref()).unwrap()
 }

-
 pub fn sync(storage: &LsmStorageInner) {
     storage
         .force_freeze_memtable(&storage.state_lock.lock())
         .unwrap();
     storage.force_flush_next_imm_memtable().unwrap();
 }
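Each call to the sync helper above freezes the current memtable and flushes it, so every call adds one more L0 SST; the week 2 tests below rely on this to build up multiple files. A hedged, crate-internal usage sketch:

fn two_l0_ssts(storage: &LsmStorageInner) -> anyhow::Result<()> {
    storage.put(b"a", b"1")?;
    sync(storage); // freeze + flush -> first L0 SST
    storage.put(b"b", b"2")?;
    sync(storage); // second L0 SST
    Ok(())
}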
@@ -11,7 +11,6 @@ use crate::{
     lsm_storage::{LsmStorageInner, LsmStorageOptions, MiniLsm},
 };

-
 #[test]
 fn test_task1_storage_scan() {
     let dir = tempdir().unwrap();