From fd4bb0162ad66b8f5da3ef0156dfa74ce68438f8 Mon Sep 17 00:00:00 2001
From: Alex Chi
Date: Sat, 24 Dec 2022 23:45:53 -0500
Subject: [PATCH] feat(docs): finish part 4

Signed-off-by: Alex Chi
---
 README.md                           |  2 +-
 mini-lsm-book/src/03-memtable.md    |  4 ++
 mini-lsm-book/src/04-engine.md      | 60 +++++++++++++++++++++++++++++
 mini-lsm-book/src/SUMMARY.md        |  2 +-
 mini-lsm-starter/src/lsm_storage.rs | 24 ++++++++----
 mini-lsm/src/tests/day4_tests.rs    |  1 +
 6 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 2ac6023..1810c8a 100644
--- a/README.md
+++ b/README.md
@@ -34,4 +34,4 @@ The tutorial has 8 parts (which can be finished in 7 days):
 * Day 6: Recovery. We will implement WAL and manifest so that the engine can recover after restart.
 * Day 7: Bloom filter and key compression. They are widely-used optimizations in LSM tree structures.
 
-We have reference solution up to day 4 and tutorial up to day 3 for now.
+We have the reference solution up to day 4 and the tutorial up to day 4 for now.
diff --git a/mini-lsm-book/src/03-memtable.md b/mini-lsm-book/src/03-memtable.md
index 4c70953..42cc99d 100644
--- a/mini-lsm-book/src/03-memtable.md
+++ b/mini-lsm-book/src/03-memtable.md
@@ -84,6 +84,10 @@ the inner iter to the next position.
 
+In this design, you might have noticed that as long as we have the iterator object, the mem-table cannot be freed
+from memory. In this tutorial, we assume user operations are short, so this will not cause big problems. See the
+extra task for possible improvements.
+
 ## Task 3 - Merge Iterator
 
 Now that you have a lot of mem-tables and SSTs, you might want to merge them to get the latest occurrence of a key.
diff --git a/mini-lsm-book/src/04-engine.md b/mini-lsm-book/src/04-engine.md
index 093e8fd..3df4328 100644
--- a/mini-lsm-book/src/04-engine.md
+++ b/mini-lsm-book/src/04-engine.md
@@ -7,6 +7,7 @@ In this part, you will need to modify:
 
 * `src/lsm_iterator.rs`
 * `src/lsm_storage.rs`
+* `src/table.rs`
 * Other parts that use `SsTable::read_block`
 
 You can use `cargo x copy-test day4` to copy our provided test cases to the starter code directory. After you have
@@ -16,10 +17,69 @@ test cases, write a new module `#[cfg(test)] mod user_tests { /* your test cases */ }`.
 
 ## Task 1 - Put and Delete
 
+Before implementing put and delete, let's revisit how an LSM tree works. The LSM structure includes:
+
+* Mem-tables: one active mutable mem-table and multiple immutable mem-tables.
+* Write-ahead log: each mem-table corresponds to a WAL.
+* SSTs: a mem-table can be flushed to disk in SST format. SSTs are organized in multiple levels.
+
+In this part, we only need to take the lock and write the entry (or a tombstone) into the active mem-table. You can
+modify `lsm_storage.rs`.
+
 ## Task 2 - Get
 
+To get a value from the LSM tree, we can simply probe the active mem-table, the immutable mem-tables (from latest to
+earliest), and all the SSTs. To shorten the critical section, we can hold the read lock just long enough to copy all
+the pointers to mem-tables and SSTs out of the `LsmStorageInner` structure, and then create iterators outside of the
+critical section. Be careful about the order when creating iterators and probing.
+
 ## Task 3 - Scan
 
+To create a scan iterator `LsmIterator`, you will need to use `TwoMergeIterator` to merge a `MergeIterator` over the
+mem-tables and a `MergeIterator` over the SSTs. You can implement this in `lsm_iterator.rs`. Optionally, you can
+implement `FusedIterator`, so that if a user accidentally calls `next` after the iterator becomes invalid, the
+underlying iterator won't panic.
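+
+One possible shape for the inner iterator of `LsmIterator`, sketched with the iterator types from the previous
+chapters (the alias name `LsmIteratorInner` is just a placeholder; the exact composition is up to you):
+
+```rust
+type LsmIteratorInner =
+    TwoMergeIterator<MergeIterator<MemTableIterator>, MergeIterator<SsTableIterator>>;
+```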
+
+The sequence of key-value pairs produced by `TwoMergeIterator` may contain empty values, which indicate that the
+corresponding keys have been deleted. `LsmIterator` should filter out these empty values. It also needs to correctly
+handle the start and end bounds.
+
 ## Task 4 - Sync
 
+In this part, we will implement immutable mem-tables and flushing to L0 SSTs in `lsm_storage.rs`. As in task 1, write
+operations go directly into the active mutable mem-table. Once `sync` is called, we flush the mem-table to an L0 SST
+in three steps:
+
+* First, move the current mutable mem-table into the immutable mem-table list, so that no future requests go into it,
+  and create a new mem-table. All of this should happen in one single critical section, stalling all reads.
+* Then, we can flush the mem-table to disk as an SST file without holding any lock.
+* Finally, in one critical section, remove the flushed mem-table from the immutable list and put the SST into
+  `l0_sstables`.
+
+Only one thread can sync at a time, so you should use a mutex to enforce this requirement.
+
 ## Task 5 - Block Cache
 
+Now that we have implemented the LSM structure, we can start writing something to the disk! Previously, in `table.rs`,
+we implemented a `FileObject` struct without actually writing anything to disk. In this task, we will change the
+implementation so that:
+
+* `read` reads from the disk without any caching, using `read_exact_at` from `std::os::unix::fs::FileExt`.
+* The size of the file is stored inside the struct, and the `size` function directly returns it.
+* `create` writes the file to the disk. Generally, you should call `fsync` on that file, but this would slow down
+  unit tests a lot; therefore, we don't do fsync until day 6 recovery.
+* `open` remains unimplemented until day 6 recovery.
+
+After that, we can implement a new `read_block_cached` function on `SsTable` so that we can leverage the block cache
+to serve read requests (see the sketch at the end of this chapter). Upon initializing the `LsmStorage` struct, you
+should create a 4 GB block cache using `moka-rs`. Blocks are cached by SST id + block id. Use `try_get_with` to get
+the block from the cache, populating the cache on a miss. If multiple requests read the same block and it is not in
+the cache, `try_get_with` will only issue a single read request to the disk and broadcast the result to all requests.
+
+Remember to change `SsTableIterator` to use the block cache.
+
+## Extra Tasks
+
+* As you might have seen, each time we do a put or a deletion, we need to take the write lock protecting the LSM
+  structure. This can cause a lot of problems. Some lock implementations are fair, which means that as long as there
+  is a writer waiting on the lock, no reader can take the lock. In that case, the writer will wait until the slowest
+  reader finishes its operation before it can actually do some work. One possible optimization is to implement
+  `WriteBatch`: instead of immediately writing each user request into the mem-table + WAL, we can allow users to
+  submit a batch of writes.
+* Align blocks to 4K and use direct I/O.
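+
+To make Task 5 concrete, here is a minimal sketch of `read_block_cached`. It assumes `SsTable` keeps its `id` and an
+optional `block_cache: Option<Arc<BlockCache>>` field; treat these names as placeholders for whatever your struct
+actually stores:
+
+```rust
+use anyhow::anyhow;
+
+impl SsTable {
+    pub fn read_block_cached(&self, block_idx: usize) -> Result<Arc<Block>> {
+        if let Some(ref block_cache) = self.block_cache {
+            // `try_get_with` deduplicates concurrent loads of the same key: only one
+            // caller reads from disk; the rest wait for and share the result.
+            let block = block_cache
+                .try_get_with((self.id, block_idx), || self.read_block(block_idx))
+                .map_err(|e| anyhow!("{}", e))?;
+            Ok(block)
+        } else {
+            self.read_block(block_idx)
+        }
+    }
+}
+```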
diff --git a/mini-lsm-book/src/SUMMARY.md b/mini-lsm-book/src/SUMMARY.md
index 5375285..8aaf4fe 100644
--- a/mini-lsm-book/src/SUMMARY.md
+++ b/mini-lsm-book/src/SUMMARY.md
@@ -7,7 +7,7 @@
 - [Store key-value pairs in little blocks](./01-block.md)
 - [And make them into an SST](./02-sst.md)
 - [Now it's time to merge everything](./03-memtable.md)
-- [The engine on fire](./04-engine.md)
+- [The engine is on fire](./04-engine.md)
 - [Let's do something in the background](./05-compaction.md)
 - [Be careful when the system crashes](./06-recovery.md)
 - [A good bloom filter makes life easier](./07-bloom-filter.md)
diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs
index 2322c3b..076b900 100644
--- a/mini-lsm-starter/src/lsm_storage.rs
+++ b/mini-lsm-starter/src/lsm_storage.rs
@@ -6,8 +6,8 @@ use std::path::Path;
 use std::sync::Arc;
 
 use anyhow::Result;
-use arc_swap::ArcSwap;
 use bytes::Bytes;
+use parking_lot::RwLock;
 
 use crate::block::Block;
 use crate::lsm_iterator::{FusedIterator, LsmIterator};
@@ -18,30 +18,40 @@ pub type BlockCache = moka::sync::Cache<(usize, usize), Arc<Block>>;
 
 #[derive(Clone)]
 pub struct LsmStorageInner {
-    /// MemTables, from oldest to earliest.
-    memtables: Vec<Arc<MemTable>>,
-    /// L0 SsTables, from oldest to earliest.
+    /// The current memtable.
+    memtable: Arc<MemTable>,
+    /// Immutable memtables, from earliest to latest.
+    imm_memtables: Vec<Arc<MemTable>>,
+    /// L0 SsTables, from earliest to latest.
     l0_sstables: Vec<Arc<SsTable>>,
+    /// L1 - L6 SsTables, sorted by key range.
+    #[allow(dead_code)]
+    levels: Vec<Vec<Arc<SsTable>>>,
+    /// The next SSTable ID.
+    next_sst_id: usize,
 }
 
 impl LsmStorageInner {
     fn create() -> Self {
         Self {
-            memtables: vec![Arc::new(MemTable::create())],
+            memtable: Arc::new(MemTable::create()),
+            imm_memtables: vec![],
             l0_sstables: vec![],
+            levels: vec![],
+            next_sst_id: 1,
         }
     }
 }
 
 /// The storage interface of the LSM tree.
 pub struct LsmStorage {
-    inner: ArcSwap<LsmStorageInner>,
+    inner: Arc<RwLock<Arc<LsmStorageInner>>>,
 }
 
 impl LsmStorage {
     pub fn open(path: impl AsRef<Path>) -> Result<Self> {
         Ok(Self {
-            inner: ArcSwap::from_pointee(LsmStorageInner::create()),
+            inner: Arc::new(RwLock::new(Arc::new(LsmStorageInner::create()))),
         })
     }
diff --git a/mini-lsm/src/tests/day4_tests.rs b/mini-lsm/src/tests/day4_tests.rs
index d6c4d86..9bfee56 100644
--- a/mini-lsm/src/tests/day4_tests.rs
+++ b/mini-lsm/src/tests/day4_tests.rs
@@ -161,6 +161,7 @@ fn test_storage_scan_memtable_2_after_sync() {
     let storage = LsmStorage::open(&dir).unwrap();
     storage.put(b"1", b"233").unwrap();
     storage.put(b"2", b"2333").unwrap();
+    storage.sync().unwrap();
     storage.put(b"3", b"23333").unwrap();
     storage.sync().unwrap();
     storage.delete(b"1").unwrap();
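
For reference, the snapshot-read pattern that Task 2 describes, on top of the new
`inner: Arc<RwLock<Arc<LsmStorageInner>>>` field introduced by this patch, might look like the sketch below. It
assumes `MemTable::get` returns `Option<Bytes>` as in the earlier chapters, and it elides the SST probing:

```rust
impl LsmStorage {
    pub fn get(&self, key: &[u8]) -> Result<Option<Bytes>> {
        // Hold the read lock only long enough to clone the Arc snapshot;
        // all probing then happens outside the critical section.
        let snapshot = {
            let guard = self.inner.read();
            Arc::clone(&guard)
        };

        // Probe the active mem-table first.
        if let Some(value) = snapshot.memtable.get(key) {
            // An empty value is a tombstone: the key was deleted.
            return Ok(if value.is_empty() { None } else { Some(value) });
        }
        // Then the immutable mem-tables, from latest to earliest.
        for memtable in snapshot.imm_memtables.iter().rev() {
            if let Some(value) = memtable.get(key) {
                return Ok(if value.is_empty() { None } else { Some(value) });
            }
        }
        // Finally the SSTs (elided: create SsTableIterators and probe them).
        Ok(None)
    }
}
```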