From a03cb765ffbf53171d8cd96129d8c2dbec1fc3ba Mon Sep 17 00:00:00 2001 From: Alex Chi Date: Fri, 23 Dec 2022 23:45:09 -0500 Subject: [PATCH] feat(docs): finish part 1 Signed-off-by: Alex Chi --- Cargo.toml | 4 ++ mini-lsm-book/src/00-overview.md | 23 ++++++++- mini-lsm-book/src/01-block.md | 89 ++++++++++++++++++++++++++++++++ mini-lsm-starter/src/block.rs | 5 +- mini-lsm-starter/src/table.rs | 2 +- mini-lsm/src/block/builder.rs | 1 - xtask/src/main.rs | 26 ++++++++++ 7 files changed, 146 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1814864..352036a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,7 @@ homepage = "https://github.com/skyzh/mini-lsm" keywords = ["storage", "database", "tutorial"] license = "Apache-2.0" repository = "https://github.com/skyzh/mini-lsm" + +[workspace.dependencies] +anyhow = "1" +bytes = "1" diff --git a/mini-lsm-book/src/00-overview.md b/mini-lsm-book/src/00-overview.md index d61a3e9..49fc629 100644 --- a/mini-lsm-book/src/00-overview.md +++ b/mini-lsm-book/src/00-overview.md @@ -95,4 +95,25 @@ We provide you starter code (see `mini-lsm-starter` crate), where we simply repl `unimplemented!()`. You can start your project based on this starter code. We provide test cases, but they are very simple. We recommend you to think carefully about your implementation and write test cases by yourself. -You can use `cargo x scheck` to run all test cases and do style check in your codebase. +* You can use `cargo x scheck` to run all test cases and do style check in your codebase. +* You can use `cargo x copy-test dayX` to copy test cases to the starter code. + +## About the Author + +As of writing (at the end of 2022), Chi is a first-year master's student in Carnegie Mellon University. He has 5 years' +experience with the Rust programming language since 2018. He has been working on a variety of database systems including +[TiKV][db1], [AgateDB][db2], [TerarkDB][db3], [RisingLight][db4], and [RisingWave][db5]. 
In his first semester at CMU, +he worked as a teaching assistant for CMU's [15-445/645 Intro to Database Systems][15445-course] course, where +he built a new SQL processing layer for the BusTub educational database system, added more query optimization stuff into +the course, and made the course [more challenging than ever before][tweet]. Chi is interested in exploring how the Rust +programming language can fit in the database world. Check out his [previous tutorial][type-exercise] on building a +vectorized expression framework if you are also interested in that topic. + +[db1]: https://github.com/tikv/tikv +[db2]: https://github.com/tikv/agatedb +[db3]: https://github.com/bytedance/terarkdb +[db4]: https://github.com/risinglightdb/risinglight +[db5]: https://github.com/risingwavelabs/risingwave +[15445-course]: https://15445.courses.cs.cmu.edu/fall2022/ +[tweet]: https://twitter.com/andy_pavlo/status/1598137241016360961 +[type-exercise]: https://github.com/skyzh/type-exercise-in-rust diff --git a/mini-lsm-book/src/01-block.md b/mini-lsm-book/src/01-block.md index a629280..68daa82 100644 --- a/mini-lsm-book/src/01-block.md +++ b/mini-lsm-book/src/01-block.md @@ -1,2 +1,91 @@ # Block Builder and Block Iterator +In this part, you will need to modify: + +* `src/block/builder.rs` +* `src/block/iterator.rs` +* `src/block.rs` + +You can use `cargo x copy-test day1` to copy our provided test cases to the starter code directory. After you have +finished this part, use `cargo x scheck` to check the style and run all test cases. If you want to write your own +test cases, write a new module `#[cfg(test)] mod user_tests { /* your test cases */ }` in `block.rs`. Remember to remove +`#![allow(...)]` at the top of the modules you modified so that cargo clippy can actually check the styles. + +## Task 1 - Block Builder + +Block is the minimum read unit in LSM. It is of 4KB size in general, similar to database pages. In each block, we will +store a sequence of sorted key value pairs. 
+ +You will need to modify `BlockBuilder` to build the encoded data and the offset array. The block contains two parts: +data and offsets. + +``` +| data | offsets | +|entry|entry|entry|entry|offset|offset|offset|offset|num_of_elements| +``` + +When the user adds a key-value pair to a block (which is an entry), we will need to serialize it into the following format: + +``` +| entry1 | +| key_len (2B) | key (varlen) | value_len (2B) | value (varlen) | ... | +``` + +Key length and value length are 2B, which means their maximum length is 65535. + +We assume that keys will never be empty, and values can be empty. An empty value means that the corresponding key has +been deleted in the view of other parts of the system. For the block builder and iterator, we just treat empty value +as-is. + +At the end of the block, we will store the offsets of each entry and the total number of entries. For example, if +the first entry is at 0th position of the block, and the second is at 12th position, + +``` +|offset|offset|num_of_elements| +| 0 | 12 | 2 | +``` + +The footer of the block will be as above. Each of the numbers is stored as `u16`. + +The block has a size limit, which is `target_size`. Unless the first key-value pair exceeds the target block size, you +should ensure that the encoded block size is always less than or equal to `target_size`. + +The `BlockBuilder` will produce the data part and unencoded entry offsets when `build` is called. The information will +be stored in the `Block` struct. As key-value entries are stored in the raw format and offsets are stored in a separate +vector, this reduces unnecessary memory allocations and processing overhead when decoding data -- what you need to do +is to simply copy the raw block data to the `data` vector and decode the entry offsets every 2 bytes, *instead of* +creating something like `Vec<(Vec<u8>, Vec<u8>)>` to store all the key value pairs in one block in memory. This compact +memory layout is very efficient. 
`Block::encode` and `Block::decode` will encode to / decode from the data layout +illustrated in the above figures. + +## Task 2 - Block Iterator + +Given a block object, we will need to extract the key-value pairs. To do this, we create an iterator over a block and +find the information we want. + +`BlockIterator` can be created with an `Arc<Block>`. If `create_and_seek_to_first` is called, it will be positioned at +the first key in the block. If `create_and_seek_to_key` is called, the iterator will be positioned at the first key which +is `>=` the provided key. For example, if `1, 3, 5` are in a block, + +```rust +let mut iter = BlockIterator::create_and_seek_to_key(block, b"2"); +assert_eq!(iter.key(), b"3"); +``` + +`seek 2` will position the iterator at the first available key at or after `2`, which is `3`. + +The iterator should copy `key` and `value` from the block and store them inside the iterator, so that users can access +the key and the value without any extra copy with `fn key(&self) -> &[u8]`, which directly returns the reference of the +locally-stored key and value. + +When `next` is called, the iterator will move to the next position. If we reach the end of the block, we can set `key` +to empty and return `false` from `is_valid`, so that the caller can switch to another block if possible. + +After implementing this part, you should be able to pass all tests in `block/tests.rs`. + +## Extra Tasks + +*Note: Some test cases might not pass after implementing this part. You might need to write your own test cases.* + +* Implement block checksum. Verify checksum when decoding the block. +* Compress / uncompress block. Compress on `build` and uncompress on decoding. 
diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs index 0fd4feb..d672036 100644 --- a/mini-lsm-starter/src/block.rs +++ b/mini-lsm-starter/src/block.rs @@ -10,7 +10,10 @@ pub use builder::BlockBuilder; pub use iterator::BlockIterator; /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. -pub struct Block {} +pub struct Block { + data: Vec, + offsets: Vec, +} impl Block { pub fn encode(&self) -> Bytes { diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index cf3bf04..55ef302 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -25,7 +25,7 @@ impl BlockMeta { /// Encode block meta to a buffer. pub fn encode_block_meta( block_meta: &[BlockMeta], - #[allow(clippy::ptr_arg)] buf: &mut Vec, + #[allow(clippy::ptr_arg)] /* remove this allow after you finish */ buf: &mut Vec, ) { unimplemented!() } diff --git a/mini-lsm/src/block/builder.rs b/mini-lsm/src/block/builder.rs index ff50ad7..0765e72 100644 --- a/mini-lsm/src/block/builder.rs +++ b/mini-lsm/src/block/builder.rs @@ -30,7 +30,6 @@ impl BlockBuilder { #[must_use] pub fn add(&mut self, key: &[u8], value: &[u8]) -> bool { assert!(!key.is_empty(), "key must not be empty"); - assert!(!value.is_empty(), "value must not be empty"); if self.estimated_size() + key.len() + value.len() + SIZEOF_U16 * 3 > self.block_size && !self.is_empty() { diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 1416db2..0c2b26a 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -5,6 +5,11 @@ use clap::Parser; use console::style; use duct::cmd; +#[derive(clap::Subcommand, Debug)] +enum CopyTestAction { + Day1, +} + #[derive(clap::Subcommand, Debug)] enum Action { /// Check. 
@@ -21,6 +26,9 @@ enum Action { Sync, /// Check starter code Scheck, + /// Copy test cases + #[command(subcommand)] + CopyTest(CopyTestAction), } /// Simple program to greet a person @@ -114,6 +122,20 @@ fn sync() -> Result<()> { Ok(()) } +fn copy_test_case(test: CopyTestAction) -> Result<()> { + match test { + CopyTestAction::Day1 => { + cmd!( + "cp", + "mini-lsm/src/block/tests.rs", + "mini-lsm-starter/src/block/tests.rs" + ) + .run()?; + } + } + Ok(()) +} + fn main() -> Result<()> { let args = Args::parse(); @@ -160,6 +182,10 @@ fn main() -> Result<()> { switch_to_workspace_root()?; sync()?; } + Action::CopyTest(test) => { + switch_to_workspace_root()?; + copy_test_case(test)?; + } } Ok(())