diff --git a/Cargo.lock b/Cargo.lock index 26bc7ff..4876ce4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -178,6 +178,13 @@ dependencies = [ "bytes", ] +[[package]] +name = "mini-lsm-starter" +version = "0.1.0" +dependencies = [ + "bytes", +] + [[package]] name = "mini-lsm-xtask" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 15207d2..8cf7351 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "mini-lsm", + "mini-lsm-starter", "xtask" ] diff --git a/mini-lsm-book/src/00-overview.md b/mini-lsm-book/src/00-overview.md index 56a407f..4f86f47 100644 --- a/mini-lsm-book/src/00-overview.md +++ b/mini-lsm-book/src/00-overview.md @@ -4,4 +4,91 @@ In this tutorial, you will learn how to build a simple LSM-Tree storage engine i ## What is LSM, and Why LSM? -## Roadmap +A log-structured merge tree (LSM tree) is a data structure for maintaining key-value pairs. This data structure is widely used in +distributed database systems like [TiDB](https://www.pingcap.com) and [CockroachDB](https://www.cockroachlabs.com) as +their underlying storage engine. [RocksDB](http://rocksdb.org), based on [LevelDB](https://github.com/google/leveldb), +is an implementation of an LSM-Tree storage engine. It provides a wide range of key-value access functionalities and is +used in a lot of production systems. + +Generally speaking, an LSM Tree is an append-friendly data structure. It is more intuitive to compare LSM to other +key-value data structures like RB-Tree and B-Tree. For RB-Tree and B-Tree, all data operations are in-place. That is to +say, when you update the value corresponding to the key, the value will be overwritten at its original memory or disk +space. But in an LSM Tree, all write operations, i.e., insertions, updates, deletions, are performed somewhere else. +These operations will be batched into SST (sorted string table) files and be written to the disk. Once written to the +disk, the file will not be changed. 
These operations are applied lazily on disk with a special task called compaction. +The compaction job will merge multiple SST files and remove unused data. + +This architectural design makes LSM trees easy to work with. + +1. Data are immutable on persistent storage, which means that it is easier to offload the background tasks (compaction) + to remote servers. It is also feasible to directly store and serve data from cloud-native storage systems like S3. +2. An LSM tree can balance between read, write and space amplification by changing the compaction algorithm. The data + structure itself is super versatile and can be optimized for different workloads. + +In this tutorial, we will learn how to build an LSM-Tree-based storage engine in the Rust programming language. + +## Overview of LSM + +An LSM storage engine generally contains 3 parts: + +1. Write-ahead log to persist temporary data for recovery. +2. SSTs on the disk for maintaining a tree structure. +3. Mem-tables in memory for batching small writes. + +The storage engine generally provides the following interfaces: + +* `Put(key, value)`: store a key-value pair in the LSM tree. +* `Delete(key)`: remove a key and its corresponding value. +* `Get(key)`: get the value corresponding to a key. + +To ensure persistence, the engine also provides: + +* `Sync()`: ensure all the operations before `sync` are persisted to the disk. + +Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch +of key-value pairs. + +In this tutorial, we assume the LSM tree uses the leveled compaction algorithm, which is commonly used in real-world +systems. + +## Write Flow + +![Write Flow](figures/lsm-tutorial/00-lsm-write-flow.svg) + +The write flow of LSM contains 4 steps: + +1. Write the key-value pair to the write-ahead log, so that it can be recovered after the storage engine crashes. +2. Write the key-value pair to the memtable. 
After (1) and (2) complete, we can notify the user that the write operation + is completed. +3. When a memtable is full, we will flush it to the disk as an SST file in the background. +4. We will compact some files in some level into lower levels to maintain a good shape for the LSM tree, so that read + amplification is low. + +## Read Flow + +![Read Flow](figures/lsm-tutorial/00-lsm-read-flow.svg) + +When we want to read a key, + +1. We will first probe all the memtables from latest to oldest. +2. If the key is not found, we will then search the entire LSM tree containing SSTs to find the data. + +## Tutorial Overview + +![Tutorial Overview](figures/lsm-tutorial/00-lsm-tutorial-overview.svg) + +In this tutorial, we will build the LSM tree structure in 7 days: + +* Day 1: Block encoding. SSTs are composed of multiple data blocks. We will implement the block encoding. +* Day 2: SST encoding. +* Day 3: Engine. On this day, we will get a functional (but not persistent) key-value engine with `get`, `put`, `delete` + API. +* Day 4: Block cache. To reduce disk I/O and maximize performance, we will use moka-rs to build a block cache for the + LSM tree. +* Day 5: Compaction. Now it's time to maintain a leveled structure for SSTs. +* Day 6: Recovery. We will implement WAL and manifest so that the engine can recover after restart. +* Day 7: Bloom filter and key compression. They are widely-used optimizations in LSM tree structures. + +We provide starter code (see the `mini-lsm-starter` crate), where we simply replace all function bodies with +`unimplemented!()`. You can start your project based on this starter code. We provide test cases, but they are very +simple. We recommend that you think carefully about your implementation and write your own test cases. 
diff --git a/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-read-flow.svg b/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-read-flow.svg new file mode 100644 index 0000000..f15c1fe --- /dev/null +++ b/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-read-flow.svg @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + 00-lsm-read-flow + + + Layer 1 + + + + + + + + + SST + + + + + L0 + + + + + + + SST + + + + + + + SST + + + + + L1 + + + + + + + SST + + + + + + + SST + + + + + Ln + + + + + + + SST + + + + + + + SST + + + + + + + + On Disk + + + + + In Memory + + + + + + + Mem + Table + + + + + + + Mem + Table + + + + + + + Mem + Table + + + + + + + WAL + + + + + + + key + value + + + + + + + + + + + + + + + + + + + Manifest + + + + + + + + + + + 1. Find the key-value pair in all memtables (new to old) + + + + + + + + 2. Find the key-value pair in SSTs (top layer to bottom layer) + + + + + diff --git a/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-tutorial-overview.svg b/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-tutorial-overview.svg new file mode 100644 index 0000000..1c30397 --- /dev/null +++ b/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-tutorial-overview.svg @@ -0,0 +1,370 @@ + + + + + + + + + + + + + + + + + 00-lsm-tutorial-overview + + + Layer 1 + + + + + + + + + + + + + + + + + + + Day 2: SST Encoding + + + + + + + + + + + SST + + + + + L0 + + + + + + + SST + + + + + + + SST + + + + + L1 + + + + + + + SST + + + + + + + SST + + + + + Ln + + + + + + + SST + + + + + + + SST + + + + + + + + On Disk + + + + + In Memory + + + + + + + Mem + Table + + + + + + + Mem + Table + + + + + + + Mem + Table + + + + + + + WAL + + + + + + + key + value + + + + + + + + + + + + + + 1. + Write to WAL + + + + + 2. + Write to MemTable + + + + + 3. + Flush to Disk (Background) + + + + + + + + 4. 
Compaction (Background) + + + + + + + Data + Block + + + + + + + Data + Block + + + + + + + Data + Block + + + + + + + Data + Block + + + + + + + Data + Block + + + + + + + Index + + + + + + + + + + + + + Key + + + + + + + Value + + + + + + + Key + + + + + + + Value + + + + + + + Key + + + + + + + Value + + + + + + + Key + + + + + + + Value + + + + + + + Key + + + + + + + Value + + + + + + + + + + + Day 1: Block Encoding + + + + + Day 3 & 4: Engine + + + + + Day 5: Compaction + + + + + Day 6: Recovery + + + + + + + Manifest + + + + + Day 7.1: Bloom Filter + + + + + + + + Day 7.2: Key Compression + + + + + + + + diff --git a/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-write-flow.svg b/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-write-flow.svg new file mode 100644 index 0000000..04d459e --- /dev/null +++ b/mini-lsm-book/src/figures/lsm-tutorial/00-lsm-write-flow.svg @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + 00-lsm-write-flow + + + Layer 1 + + + + + + + + + SST + + + + + L0 + + + + + + + SST + + + + + + + SST + + + + + L1 + + + + + + + SST + + + + + + + SST + + + + + Ln + + + + + + + SST + + + + + + + SST + + + + + + + + On Disk + + + + + In Memory + + + + + + + Mem + Table + + + + + + + Mem + Table + + + + + + + Mem + Table + + + + + + + WAL + + + + + + + key + value + + + + + + + + + + + + + + 1. + Write to WAL + + + + + 2. + Write to MemTable + + + + + 3. + Flush to Disk (Background) + + + + + + + + 4. 
Compaction (Background) + + + + + + + Manifest + + + + + + + + diff --git a/mini-lsm-starter/Cargo.toml b/mini-lsm-starter/Cargo.toml new file mode 100644 index 0000000..5900389 --- /dev/null +++ b/mini-lsm-starter/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "mini-lsm-starter" +version = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +repository = { workspace = true } +publish = false + +[dependencies] +bytes = "1" diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs new file mode 100644 index 0000000..0fd4feb --- /dev/null +++ b/mini-lsm-starter/src/block.rs @@ -0,0 +1,26 @@ +#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod +#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod + +mod builder; +mod iterator; + +use bytes::Bytes; + +pub use builder::BlockBuilder; +pub use iterator::BlockIterator; + +/// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. 
+pub struct Block {} + +impl Block { + pub fn encode(&self) -> Bytes { + unimplemented!() + } + + pub fn decode(data: &[u8]) -> Self { + unimplemented!() + } +} + +#[cfg(test)] +mod tests; diff --git a/mini-lsm-starter/src/block/builder.rs b/mini-lsm-starter/src/block/builder.rs new file mode 100644 index 0000000..3c2570e --- /dev/null +++ b/mini-lsm-starter/src/block/builder.rs @@ -0,0 +1,29 @@ +#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod +#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod + +use super::Block; + +/// Builds a block +pub struct BlockBuilder {} + +impl BlockBuilder { + /// Creates a new block builder + pub fn new(target_size: usize) -> Self { + unimplemented!() + } + + /// Adds a key-value pair to the block + #[must_use] + pub fn add(&mut self, key: &[u8], value: &[u8]) -> bool { + unimplemented!() + } + + pub fn is_empty(&self) -> bool { + unimplemented!() + } + + /// Builds a block + pub fn build(self) -> Block { + unimplemented!() + } +} diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs new file mode 100644 index 0000000..a099708 --- /dev/null +++ b/mini-lsm-starter/src/block/iterator.rs @@ -0,0 +1,62 @@ +#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod +#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod + +use std::sync::Arc; + +use super::Block; + +pub struct BlockIterator {} + +impl BlockIterator { + fn new(block: Arc) -> Self { + unimplemented!() + } + + pub fn create_and_seek_to_first(block: Arc) -> Self { + unimplemented!() + } + + pub fn create_and_seek_to_key(block: Arc, key: &[u8]) -> Self { + unimplemented!() + } + + pub fn key(&self) -> &[u8] { + unimplemented!() + } + + pub fn value(&self) -> &[u8] { + unimplemented!() + } + + pub fn is_valid(&self) -> bool { + unimplemented!() + } + + pub fn seek_to_first(&mut self) { + unimplemented!() + } + + pub fn 
seek_to_last(&mut self) { + unimplemented!() + } + + pub fn len(&self) -> usize { + unimplemented!() + } + + pub fn is_empty(&self) -> bool { + unimplemented!() + } + + pub fn seek_to(&mut self, idx: usize) { + unimplemented!() + } + + pub fn next(&mut self) { + unimplemented!() + } + + pub fn seek_to_key(&mut self, key: &[u8]) { + unimplemented!() + } +} diff --git a/mini-lsm-starter/src/block/tests.rs b/mini-lsm-starter/src/block/tests.rs new file mode 100644 index 0000000..a6eb840 --- /dev/null +++ b/mini-lsm-starter/src/block/tests.rs @@ -0,0 +1 @@ +//! Please copy `mini-lsm/src/block/tests.rs` here so that you can run tests. diff --git a/mini-lsm-starter/src/lib.rs b/mini-lsm-starter/src/lib.rs new file mode 100644 index 0000000..a863eaa --- /dev/null +++ b/mini-lsm-starter/src/lib.rs @@ -0,0 +1 @@ +pub mod block; diff --git a/mini-lsm/src/block.rs b/mini-lsm/src/block.rs index 7d78a8b..b472932 100644 --- a/mini-lsm/src/block.rs +++ b/mini-lsm/src/block.rs @@ -39,94 +39,4 @@ impl Block { } #[cfg(test)] -mod tests { - use std::sync::Arc; - - use super::{builder::BlockBuilder, iterator::BlockIterator, *}; - - #[test] - fn test_block_build_single_key() { - let mut builder = BlockBuilder::new(16); - assert!(builder.add(b"233", b"233333")); - builder.build(); - } - - #[test] - fn test_block_build_full() { - let mut builder = BlockBuilder::new(16); - assert!(builder.add(b"11", b"11")); - assert!(!builder.add(b"22", b"22")); - builder.build(); - } - - fn key_of(idx: usize) -> Vec { - format!("key_{:03}", idx).into_bytes() - } - - fn value_of(idx: usize) -> Vec { - format!("value_{:010}", idx).into_bytes() - } - - fn num_of_keys() -> usize { - 100 - } - - fn generate_block() -> Block { - let mut builder = BlockBuilder::new(10000); - for idx in 0..num_of_keys() { - let key = key_of(idx); - let value = value_of(idx); - assert!(builder.add(&key[..], &value[..])); - } - builder.build() - } - - #[test] - fn test_block_build_all() { - generate_block(); - } - - #[test] - fn 
test_block_encode() { - let block = generate_block(); - block.encode(); - } - - #[test] - fn test_block_decode() { - let block = generate_block(); - let encoded = block.encode(); - let decoded_block = Block::decode(&encoded); - assert_eq!(block.offsets, decoded_block.offsets); - assert_eq!(block.data, decoded_block.data); - } - - fn as_bytes(x: &[u8]) -> Bytes { - Bytes::copy_from_slice(x) - } - - #[test] - fn test_block_iterator() { - let block = Arc::new(generate_block()); - let mut iter = BlockIterator::create_and_seek_to_first(block); - for i in 0..num_of_keys() { - let key = iter.key(); - let value = iter.value(); - assert_eq!( - key, - key_of(i), - "expected key: {:?}, actual key: {:?}", - as_bytes(&key_of(i)), - as_bytes(key) - ); - assert_eq!( - value, - value_of(i), - "expected value: {:?}, actual value: {:?}", - as_bytes(&value_of(i)), - as_bytes(value) - ); - iter.next(); - } - } -} +mod tests; diff --git a/mini-lsm/src/block/tests.rs b/mini-lsm/src/block/tests.rs new file mode 100644 index 0000000..51edfeb --- /dev/null +++ b/mini-lsm/src/block/tests.rs @@ -0,0 +1,89 @@ +use std::sync::Arc; + +use super::{builder::BlockBuilder, iterator::BlockIterator, *}; + +#[test] +fn test_block_build_single_key() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(b"233", b"233333")); + builder.build(); +} + +#[test] +fn test_block_build_full() { + let mut builder = BlockBuilder::new(16); + assert!(builder.add(b"11", b"11")); + assert!(!builder.add(b"22", b"22")); + builder.build(); +} + +fn key_of(idx: usize) -> Vec { + format!("key_{:03}", idx).into_bytes() +} + +fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() +} + +fn num_of_keys() -> usize { + 100 +} + +fn generate_block() -> Block { + let mut builder = BlockBuilder::new(10000); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + assert!(builder.add(&key[..], &value[..])); + } + builder.build() +} + +#[test] +fn 
test_block_build_all() { + generate_block(); +} + +#[test] +fn test_block_encode() { + let block = generate_block(); + block.encode(); +} + +#[test] +fn test_block_decode() { + let block = generate_block(); + let encoded = block.encode(); + let decoded_block = Block::decode(&encoded); + assert_eq!(block.offsets, decoded_block.offsets); + assert_eq!(block.data, decoded_block.data); +} + +fn as_bytes(x: &[u8]) -> Bytes { + Bytes::copy_from_slice(x) +} + +#[test] +fn test_block_iterator() { + let block = Arc::new(generate_block()); + let mut iter = BlockIterator::create_and_seek_to_first(block); + for i in 0..num_of_keys() { + let key = iter.key(); + let value = iter.value(); + assert_eq!( + key, + key_of(i), + "expected key: {:?}, actual key: {:?}", + as_bytes(&key_of(i)), + as_bytes(key) + ); + assert_eq!( + value, + value_of(i), + "expected value: {:?}, actual value: {:?}", + as_bytes(&value_of(i)), + as_bytes(value) + ); + iter.next(); + } +} diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 5dc5483..1524562 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -6,8 +6,7 @@ homepage = { workspace = true } keywords = { workspace = true } license = { workspace = true } repository = { workspace = true } - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +publish = false [dependencies] anyhow = "1"