From f7b6d9a8471ed9279553e32ad9ffbd6ad619546f Mon Sep 17 00:00:00 2001 From: Alex Chi Date: Sat, 24 Dec 2022 15:34:34 -0500 Subject: [PATCH] feat(docs): finish part 2 Signed-off-by: Alex Chi --- Cargo.lock | 457 +++++++++++++++++++++++++ README.md | 10 +- mini-lsm-book/src/00-overview.md | 8 +- mini-lsm-book/src/01-block.md | 2 + mini-lsm-book/src/02-sst.md | 73 ++++ mini-lsm-book/src/03-engine.md | 1 - mini-lsm-book/src/03-memtable.md | 28 ++ mini-lsm-book/src/04-block-cache.md | 1 - mini-lsm-book/src/04-engine.md | 1 + mini-lsm-book/src/SUMMARY.md | 4 +- mini-lsm-starter/Cargo.toml | 4 + mini-lsm-starter/src/lsm_iterator.rs | 6 +- mini-lsm-starter/src/table/builder.rs | 2 +- mini-lsm-starter/src/table/iterator.rs | 16 +- mini-lsm/Cargo.toml | 6 +- mini-lsm/src/lsm_iterator.rs | 3 +- mini-lsm/src/table/builder.rs | 2 +- xtask/src/main.rs | 6 + 18 files changed, 601 insertions(+), 29 deletions(-) delete mode 100644 mini-lsm-book/src/03-engine.md create mode 100644 mini-lsm-book/src/03-memtable.md delete mode 100644 mini-lsm-book/src/04-block-cache.md create mode 100644 mini-lsm-book/src/04-engine.md diff --git a/Cargo.lock b/Cargo.lock index 020c72c..cdc0729 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -38,12 +38,55 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bumpalo" +version = "3.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" + +[[package]] +name = "bytecount" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" + [[package]] name = "bytes" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c" +[[package]] 
+name = "camino" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ad0e1e3e88dd237a156ab9f571021b8a158caa0ae44b1968a241efb5144c1e" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo-platform" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbdb825da8a5df079a43676dbe042702f1707b1109f713a01420fbb4cc71fa27" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "cc" version = "1.0.78" @@ -107,6 +150,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-epoch" version = "0.9.13" @@ -180,6 +233,41 @@ dependencies = [ "libc", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "glob" +version 
= "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "heck" version = "0.4.0" @@ -195,6 +283,15 @@ dependencies = [ "libc", ] +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + [[package]] name = "io-lifetimes" version = "1.0.3" @@ -217,6 +314,21 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "itoa" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" + +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -245,6 +357,30 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "mach" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" +dependencies = [ + "libc", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + [[package]] name = "memoffset" version = "0.7.1" @@ -263,8 +399,10 @@ dependencies = [ "bytes", "crossbeam-epoch", "crossbeam-skiplist", + "moka", "ouroboros", "parking_lot", + "tempfile", ] [[package]] @@ -276,8 +414,10 @@ 
dependencies = [ "bytes", "crossbeam-epoch", "crossbeam-skiplist", + "moka", "ouroboros", "parking_lot", + "tempfile", ] [[package]] @@ -290,6 +430,39 @@ dependencies = [ "duct", ] +[[package]] +name = "moka" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b49a05f67020456541f4f29cbaa812016a266a86ec76f96d3873d459c68fe5e" +dependencies = [ + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "num_cpus", + "once_cell", + "parking_lot", + "quanta", + "rustc_version", + "scheduled-thread-pool", + "skeptic", + "smallvec", + "tagptr", + "thiserror", + "triomphe", + "uuid", +] + +[[package]] +name = "num_cpus" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "once_cell" version = "1.16.0" @@ -391,6 +564,33 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulldown-cmark" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d9cc634bc78768157b5cbfe988ffcd1dcba95cd2b2f03a88316c08c6d00ed63" +dependencies = [ + "bitflags", + "memchr", + "unicase", +] + +[[package]] +name = "quanta" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e31331286705f455e56cca62e0e717158474ff02b7936c1fa596d983f4ae27" +dependencies = [ + "crossbeam-utils", + "libc", + "mach", + "once_cell", + "raw-cpuid", + "wasi 0.10.2+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quote" version = "1.0.23" @@ -400,6 +600,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "raw-cpuid" +version = "10.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6823ea29436221176fe662da99998ad3b4db2c7f31e7b6f5fe43adccd6320bb" +dependencies = [ + "bitflags", +] + [[package]] name = "redox_syscall" version = 
"0.2.16" @@ -409,6 +618,24 @@ dependencies = [ "bitflags", ] +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.36.5" @@ -423,12 +650,76 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "ryu" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scheduled-thread-pool" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "977a7519bff143a44f842fd07e80ad1329295bd71686457f18e496736f4bf9bf" +dependencies = [ + "parking_lot", +] + [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "semver" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" +dependencies = [ + "serde", +] + +[[package]] +name = "serde" +version = "1.0.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fed41fc1a24994d044e6db6935e69511a1153b52c15eb42493b26fa87feba0" +dependencies = [ + "serde_derive", +] + +[[package]] +name = 
"serde_derive" +version = "1.0.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "255abe9a125a985c05190d687b320c12f9b1f0b99445e608c21ba0782c719ad8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +dependencies = [ + "itoa", + "ryu", + "serde", +] + [[package]] name = "shared_child" version = "1.0.0" @@ -439,6 +730,21 @@ dependencies = [ "winapi", ] +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "smallvec" version = "1.10.0" @@ -462,6 +768,26 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + [[package]] name = "termcolor" version = "1.1.3" @@ -481,6 +807,41 @@ dependencies = [ "winapi", ] +[[package]] +name = "thiserror" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "triomphe" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1ee9bd9239c339d714d657fac840c6d2a4f9c45f4f9ec7b0975113458be78db" + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-ident" version = "1.0.6" @@ -493,12 +854,108 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +[[package]] +name = "uuid" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "422ee0de9031b5b948b97a8fc04e3aa35230001a722ddd27943e0be31564ce4c" +dependencies = [ + "getrandom", +] + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + +[[package]] +name = "web-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" diff --git a/README.md b/README.md index 261f8d1..1086988 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,12 @@ The tutorial has 8 parts (which can be finished in 7 days): * Day 1: Block encoding. SSTs are composed of multiple data blocks. We will implement the block encoding. * Day 2: SST encoding. -* Day 3: Engine. 
In this day we will get a functional (but not persistent) key-value engine with `get`, `put`, `delete` - API. -* Day 4: Block cache. To reduce disk I/O and maximize performance, we will use moka-rs to build a block cache for the - LSM tree. +* Day 3: MemTable and Merge Iterators. +* Day 4: Block cache and Engine. To reduce disk I/O and maximize performance, we will use moka-rs to build a block cache + for the LSM tree. In this day we will get a functional (but not persistent) key-value engine with `get`, `put`, `scan`, + `delete` API. * Day 5: Compaction. Now it's time to maintain a leveled structure for SSTs. * Day 6: Recovery. We will implement WAL and manifest so that the engine can recover after restart. * Day 7: Bloom filter and key compression. They are widely-used optimizations in LSM tree structures. -We have reference solution up to day 3 and tutorial up to day 1 for now. +We have a reference solution up to day 3 and a tutorial up to day 2 for now. diff --git a/mini-lsm-book/src/00-overview.md b/mini-lsm-book/src/00-overview.md index 2738ab8..b5ce185 100644 --- a/mini-lsm-book/src/00-overview.md +++ b/mini-lsm-book/src/00-overview.md @@ -84,10 +84,10 @@ In this tutorial, we will build the LSM tree structure in 7 days: * Day 1: Block encoding. SSTs are composed of multiple data blocks. We will implement the block encoding. * Day 2: SST encoding. -* Day 3: Engine. In this day we will get a functional (but not persistent) key-value engine with `get`, `put`, `delete` - API. -* Day 4: Block cache. To reduce disk I/O and maximize performance, we will use moka-rs to build a block cache for the - LSM tree. +* Day 3: MemTable and Merge Iterators. +* Day 4: Block cache and Engine. To reduce disk I/O and maximize performance, we will use moka-rs to build a block cache + for the LSM tree. In this day we will get a functional (but not persistent) key-value engine with `get`, `put`, `scan`, + `delete` API. * Day 5: Compaction.
Now it's time to maintain a leveled structure for SSTs. * Day 6: Recovery. We will implement WAL and manifest so that the engine can recover after restart. * Day 7: Bloom filter and key compression. They are widely-used optimizations in LSM tree structures. diff --git a/mini-lsm-book/src/01-block.md b/mini-lsm-book/src/01-block.md index 6e0ebd1..50793b2 100644 --- a/mini-lsm-book/src/01-block.md +++ b/mini-lsm-book/src/01-block.md @@ -87,6 +87,8 @@ After implementing this part, you should be able to pass all tests in `block/tes ## Extra Tasks +Here is a list of extra tasks you can do to make the block encoding more robust and efficient. + *Note: Some test cases might not pass after implementing this part. You might need to write your own test cases.* * Implement block checksum. Verify checksum when decoding the block. diff --git a/mini-lsm-book/src/02-sst.md b/mini-lsm-book/src/02-sst.md index 6d6e1de..c8f329e 100644 --- a/mini-lsm-book/src/02-sst.md +++ b/mini-lsm-book/src/02-sst.md @@ -1 +1,74 @@ # SST Builder and SST Iterator + + + +In this part, you will need to modify: + +* `src/table/builder.rs` +* `src/table/iterator.rs` +* `src/table.rs` + +You can use `cargo x copy-test day2` to copy our provided test cases to the starter code directory. After you have +finished this part, use `cargo x scheck` to check the style and run all test cases. If you want to write your own +test cases, write a new module `#[cfg(test)] mod user_tests { /* your test cases */ }` in `table.rs`. Remember to remove +`#![allow(...)]` at the top of the modules you modified so that cargo clippy can actually check the styles. + +## Task 1 - SST Builder + +SST is composed of data blocks and index blocks stored on the disk. Usually, data blocks are lazily loaded -- they will +not be loaded into the memory until a user requests it. Index blocks can also be loaded on-demand, but in this tutorial, +we make simple assumptions that all SST index blocks (meta blocks) can fit in memory. 
Generally, an SST file is of 256MB +size. + +The SST builder is similar to block builder -- users will call `add` on the builder. You should maintain a `BlockBuilder` +inside SST builder and split block when necessary. Also, you will need to maintain block metadata `BlockMeta`, which +includes the first key in each block and the offset of each block. The `build` function will encode the SST, write +everything to disk using `FileObject::create`, and return an `SsTable` object. Note that in part 2, you don't need to +actually write the data to the disk. Just store everything in memory as a vector until we implement a block cache. + +The encoding of SST is like: + +``` +| data block | data block | data block | data block | meta block | meta block offset (u32) | +``` + +You also need to implement `estimated_size` function of `SsTableBuilder`, so that the caller can know when it can start +a new SST to write data. The function doesn't need to be very accurate. Given the assumption that data blocks contain much +more data than meta block, we can simply return the size of data blocks for `estimated_size`. + +You can also align blocks to 4KB boundary so as to make it possible to do direct I/O in the future. This is an optional +optimization. + +## Task 2 - SST Iterator + +Like `BlockIterator`, you will need to implement an iterator over an SST. Note that you should load data on demand. For +example, if your iterator is at block 1, it should not hold any other block content in memory until it reaches the next +block. + +`SsTableIterator` should implement the `StorageIterator` trait, so that it can be composed with other iterators in the +future. + +One thing to note is `seek_to_key` function. Basically, you will need to do binary search on block metadata to find +which block might possibly contain the key. It is possible that the key doesn't exist in the LSM tree so that the +block iterator will be invalid immediately after a seek.
For example, + +``` +| block 1 | block 2 | block meta | +| a, b, c | e, f, g | 1: a, 2: e | +``` + +If we do `seek(b)` in this SST, it is quite simple -- using binary search, we can know block 1 contains keys `a <= keys +< e`. Therefore, we load block 1 and seek the block iterator to the corresponding position. + +But if we do `seek(d)`, we will position to block 1, but seeking `d` in block 1 will reach the end of the block. +Therefore, we should check if the iterator is invalid after seek, and switch to the next block if necessary. + +## Extra Tasks + +Here is a list of extra tasks you can do to make the SST encoding more robust and efficient. + +*Note: Some test cases might not pass after implementing this part. You might need to write your own test cases.* + +* Implement index checksum. Verify checksum when decoding. +* Explore different SST encoding and layout. For example, in the [Lethe](https://disc-projects.bu.edu/lethe/) paper, + the author adds secondary key support to SST. diff --git a/mini-lsm-book/src/03-engine.md b/mini-lsm-book/src/03-engine.md deleted file mode 100644 index ad43513..0000000 --- a/mini-lsm-book/src/03-engine.md +++ /dev/null @@ -1 +0,0 @@ -# Mem Table and Storage Engine diff --git a/mini-lsm-book/src/03-memtable.md b/mini-lsm-book/src/03-memtable.md new file mode 100644 index 0000000..f24382f --- /dev/null +++ b/mini-lsm-book/src/03-memtable.md @@ -0,0 +1,28 @@ +# Mem Table and Merge Iterators + + + +In this part, you will need to modify: + +* `src/iterators/merge_iterator.rs` +* `src/iterators/two_merge_iterator.rs` +* `src/mem_table.rs` + +You can use `cargo x copy-test day3` to copy our provided test cases to the starter code directory. After you have +finished this part, use `cargo x scheck` to check the style and run all test cases. If you want to write your own +test cases, write a new module `#[cfg(test)] mod user_tests { /* your test cases */ }` in `mem_table.rs`.
Remember to remove +`#![allow(...)]` at the top of the modules you modified so that cargo clippy can actually check the styles. + +This is the last part for the basic building blocks of an LSM tree. After implementing the merge iterators, we can +easily merge data from different parts of the data structure (mem table + SST) and get an iterator over all data. And +in part 4, we will compose all these things together to make a real storage engine. + +## Task 1 - Mem Table + +## Task 2 - Mem Table Iterator + +## Task 3 - Two-Merge Iterator + +## Task 4 - Merge Iterator + +## Extra Tasks diff --git a/mini-lsm-book/src/04-block-cache.md b/mini-lsm-book/src/04-block-cache.md deleted file mode 100644 index a47c9bf..0000000 --- a/mini-lsm-book/src/04-block-cache.md +++ /dev/null @@ -1 +0,0 @@ -# Block Cache diff --git a/mini-lsm-book/src/04-engine.md b/mini-lsm-book/src/04-engine.md new file mode 100644 index 0000000..83a2f96 --- /dev/null +++ b/mini-lsm-book/src/04-engine.md @@ -0,0 +1 @@ +# Storage Engine and Block Cache diff --git a/mini-lsm-book/src/SUMMARY.md b/mini-lsm-book/src/SUMMARY.md index 73e25be..1e5568a 100644 --- a/mini-lsm-book/src/SUMMARY.md +++ b/mini-lsm-book/src/SUMMARY.md @@ -6,8 +6,8 @@ - [Store key-value pairs in little blocks](./01-block.md) - [And make them into an SST](./02-sst.md) -- [Now it's time for a storage engine](./03-engine.md) -- [Block cache, the good way](./04-block-cache.md) +- [Now it's time for merging everything](./03-memtable.md) +- [Block cache, the good way](./04-engine.md) - [Let's do something in the background](./05-compaction.md) - [Be careful when the system crashes](./06-recovery.md) - [A good bloom filter makes life easier](./07-bloom-filter.md) diff --git a/mini-lsm-starter/Cargo.toml b/mini-lsm-starter/Cargo.toml index 20621fd..987c440 100644 --- a/mini-lsm-starter/Cargo.toml +++ b/mini-lsm-starter/Cargo.toml @@ -12,3 +12,7 @@ crossbeam-epoch = "0.9" crossbeam-skiplist = "0.1" parking_lot = "0.12" ouroboros = "0.15"
+moka = "0.9" + +[dev-dependencies] +tempfile = "3" diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 2f90a79..fb5aef0 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,9 +1,10 @@ #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod -use crate::iterators::impls::StorageIterator; use anyhow::Result; +use crate::iterators::impls::StorageIterator; + pub struct LsmIterator {} impl StorageIterator for LsmIterator { @@ -24,7 +25,8 @@ impl StorageIterator for LsmIterator { } } -/// A wrapper around existing iterator, will prevent users from calling `next` when the iterator is invalid. +/// A wrapper around existing iterator, will prevent users from calling `next` when the iterator is +/// invalid. pub struct FusedIterator { iter: I, } diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index 19d3fe2..2104ea5 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -14,7 +14,7 @@ pub struct SsTableBuilder { } impl SsTableBuilder { - /// Create a builder based on target SST size and target block size. + /// Create a builder based on target block size. pub fn new(block_size: usize) -> Self { unimplemented!() } diff --git a/mini-lsm-starter/src/table/iterator.rs b/mini-lsm-starter/src/table/iterator.rs index 302c2fd..6bc52b9 100644 --- a/mini-lsm-starter/src/table/iterator.rs +++ b/mini-lsm-starter/src/table/iterator.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use anyhow::Result; use super::SsTable; +use crate::iterators::impls::StorageIterator; /// An iterator over the contents of an SSTable. pub struct SsTableIterator {} @@ -30,25 +31,22 @@ impl SsTableIterator { pub fn seek_to_key(&mut self, key: &[u8]) -> Result<()> { unimplemented!() } +} - /// Get the current key. 
- pub fn key(&self) -> &[u8] { +impl StorageIterator for SsTableIterator { + fn value(&self) -> &[u8] { unimplemented!() } - /// Get the current value. - pub fn value(&self) -> &[u8] { + fn key(&self) -> &[u8] { unimplemented!() } - /// Check if the iterator is valid. - pub fn is_valid(&self) -> bool { + fn is_valid(&self) -> bool { unimplemented!() } - /// Move to the next key-value pair. - #[allow(clippy::should_implement_trait)] - pub fn next(&mut self) -> Result<()> { + fn next(&mut self) -> Result<()> { unimplemented!() } } diff --git a/mini-lsm/Cargo.toml b/mini-lsm/Cargo.toml index 15d7354..9916224 100644 --- a/mini-lsm/Cargo.toml +++ b/mini-lsm/Cargo.toml @@ -8,8 +8,6 @@ license = { workspace = true } repository = { workspace = true } description = "A tutorial for building an LSM tree storage engine in a week." -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] anyhow = "1" arc-swap = "1" @@ -18,3 +16,7 @@ crossbeam-epoch = "0.9" crossbeam-skiplist = "0.1" parking_lot = "0.12" ouroboros = "0.15" +moka = "0.9" + +[dev-dependencies] +tempfile = "3" diff --git a/mini-lsm/src/lsm_iterator.rs b/mini-lsm/src/lsm_iterator.rs index 2b74b5f..3dd89ae 100644 --- a/mini-lsm/src/lsm_iterator.rs +++ b/mini-lsm/src/lsm_iterator.rs @@ -71,7 +71,8 @@ impl StorageIterator for LsmIterator { } } -/// A wrapper around existing iterator, will prevent users from calling `next` when the iterator is invalid. +/// A wrapper around existing iterator, will prevent users from calling `next` when the iterator is +/// invalid. pub struct FusedIterator { iter: I, } diff --git a/mini-lsm/src/table/builder.rs b/mini-lsm/src/table/builder.rs index 9fb33e6..db7f476 100644 --- a/mini-lsm/src/table/builder.rs +++ b/mini-lsm/src/table/builder.rs @@ -16,7 +16,7 @@ pub struct SsTableBuilder { } impl SsTableBuilder { - /// Create a builder based on target SST size and target block size. 
+ /// Create a builder based on target block size. pub fn new(block_size: usize) -> Self { Self { data: Vec::new(), diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 46e1bbc..3f1b29b 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -161,6 +161,12 @@ fn copy_test_case(test: CopyTestAction) -> Result<()> { "mini-lsm-starter/src/iterators/tests/two_merge_iterator_test.rs" ) .run()?; + cmd!( + "cp", + "mini-lsm/src/iterators/tests.rs", + "mini-lsm-starter/src/iterators/tests.rs" + ) + .run()?; } } Ok(())