From de7f2ec2635f876e1d007ee66f9d1af2e1c7dc66 Mon Sep 17 00:00:00 2001 From: Xu Date: Tue, 11 Jul 2023 12:05:34 +0800 Subject: [PATCH] docs: add comments & hints for day one starter and reference code (#18) * feat(docs): Improve/Add comments & some hints for day one starter code * feat(docs): Add comments for day one solution code * feat(docs): Add figure for block storage format in starter code (block.rs) --- mini-lsm-starter/src/block.rs | 15 +++++++++++++-- mini-lsm-starter/src/block/iterator.rs | 6 ++++++ mini-lsm/src/block.rs | 1 + mini-lsm/src/block/builder.rs | 2 ++ mini-lsm/src/block/iterator.rs | 6 +++++- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs index 98aedfd..d87513f 100644 --- a/mini-lsm-starter/src/block.rs +++ b/mini-lsm-starter/src/block.rs @@ -5,21 +5,32 @@ mod builder; mod iterator; pub use builder::BlockBuilder; +/// You may want to check `bytes::BufMut` out when manipulating contiguous chunks of memory use bytes::Bytes; pub use iterator::BlockIterator; -/// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted -/// key-value pairs. +/// A block is the smallest unit of read and caching in LSM tree. +/// It is a collection of sorted key-value pairs. +/// The `actual` storage format is as follows (after `Block::encode`): +/// +/// ---------------------------------------------------------------------------------------------------- +/// | Data Section | Offset Section | Extra | +/// ---------------------------------------------------------------------------------------------------- +/// | Entry #1 | Entry #2 | ... | Entry #N | Offset #1 | Offset #2 | ... 
| Offset #N | num_of_elements | +/// ---------------------------------------------------------------------------------------------------- pub struct Block { data: Vec, offsets: Vec, } impl Block { + /// Encode the internal data to the data layout illustrated in the tutorial + /// Note: You may want to double-check that none of the expected fields is missing from your output pub fn encode(&self) -> Bytes { unimplemented!() } + /// Decode from the data layout, transforming the input `data` into a single `Block` pub fn decode(data: &[u8]) -> Self { unimplemented!() } diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index cef570d..77a832f 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -7,9 +7,13 @@ use super::Block; /// Iterates on a block. pub struct BlockIterator { + /// The internal `Block`, wrapped by an `Arc` block: Arc, + /// The current key; an empty key means the iterator is invalid key: Vec, + /// The corresponding value; may be empty value: Vec, + /// Current index of the key-value pair; should be in the range [0, num_of_elements) idx: usize, } @@ -44,6 +48,7 @@ impl BlockIterator { } /// Returns true if the iterator is valid. + /// Note: You may want to make use of `key` pub fn is_valid(&self) -> bool { unimplemented!() } @@ -59,6 +64,7 @@ impl BlockIterator { } /// Seek to the first key that >= `key`. + /// Note: You may assume that callers add the key-value pairs to the block in sorted order. 
pub fn seek_to_key(&mut self, key: &[u8]) { unimplemented!() } diff --git a/mini-lsm/src/block.rs b/mini-lsm/src/block.rs index e60415a..23c13b8 100644 --- a/mini-lsm/src/block.rs +++ b/mini-lsm/src/block.rs @@ -21,6 +21,7 @@ impl Block { for offset in &self.offsets { buf.put_u16(*offset); } + // Append the number of elements at the end of the block buf.put_u16(offsets_len as u16); buf.into() } diff --git a/mini-lsm/src/block/builder.rs b/mini-lsm/src/block/builder.rs index 0765e72..3d51001 100644 --- a/mini-lsm/src/block/builder.rs +++ b/mini-lsm/src/block/builder.rs @@ -30,11 +30,13 @@ impl BlockBuilder { #[must_use] pub fn add(&mut self, key: &[u8], value: &[u8]) -> bool { assert!(!key.is_empty(), "key must not be empty"); + // The overhead here is `key_len` + `val_len` + `offset`, each of type `u16` if self.estimated_size() + key.len() + value.len() + SIZEOF_U16 * 3 > self.block_size && !self.is_empty() { return false; } + // Record the offset first, before `data` grows, so that it points at the start of this entry self.offsets.push(self.data.len() as u16); self.data.put_u16(key.len() as u16); self.data.put(key); diff --git a/mini-lsm/src/block/iterator.rs b/mini-lsm/src/block/iterator.rs index ae527a6..c044b8e 100644 --- a/mini-lsm/src/block/iterator.rs +++ b/mini-lsm/src/block/iterator.rs @@ -76,8 +76,12 @@ impl BlockIterator { self.seek_to(self.idx); } + /// Seek to the specified position and update the current `key` and `value` + /// Updating the index is handled by the caller fn seek_to_offset(&mut self, offset: usize) { let mut entry = &self.block.data[offset..]; + // Since `get_u16()` automatically advances the cursor by 2 bytes here, + // we don't need to advance it manually let key_len = entry.get_u16() as usize; let key = entry[..key_len].to_vec(); entry.advance(key_len); @@ -90,7 +94,7 @@ impl BlockIterator { self.value.extend(value); } - /// Seek to the first key that >= `key`. + /// Seek to the first key that is >= `key`. 
pub fn seek_to_key(&mut self, key: &[u8]) { let mut low = 0; let mut high = self.block.offsets.len();