12
README.md
12
README.md
@@ -71,18 +71,6 @@ We are working on chapter 3 and more test cases for all existing contents.
|
|||||||
| 3.5 | Transactions and Optimistic Concurrency Control | | | |
|
| 3.5 | Transactions and Optimistic Concurrency Control | | | |
|
||||||
| 3.6 | Serializable Snapshot Isolation | | | |
|
| 3.6 | Serializable Snapshot Isolation | | | |
|
||||||
| 3.7 | TTL (Time-to-Live) Entries | | | |
|
| 3.7 | TTL (Time-to-Live) Entries | | | |
|
||||||
| 4.1 | Benchmarking | | | |
|
|
||||||
| 4.2 | Block Compression | | | |
|
|
||||||
| 4.3 | Trivial Move and Parallel Compaction | | | |
|
|
||||||
| 4.4 | Alternative Block Encodings | | | |
|
|
||||||
| 4.5 | Rate Limiter and I/O Optimizations | | | |
|
|
||||||
| 4.6 | Build Your Own Block Cache | | | |
|
|
||||||
| 4.7 | Build Your Own SkipList | | | |
|
|
||||||
| 4.8 | Async Engine | | | |
|
|
||||||
| 4.9 | Key-Value Separation | | | |
|
|
||||||
| 4.10 | Column Families | | | |
|
|
||||||
| 4.11 | Sharding | | | |
|
|
||||||
| 4.12 | SQL over Mini-LSM | | | |
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
<!--  -->
|
<!--  -->
|
||||||
|
|
||||||
at the end of each week, we will have some easy, not important, while interesting things
|
In the previous chapter, you already built a full LSM-based storage engine with. At the end of this week, we will implement some easy but important optimizations of the storage engine. Welcome to Mini-LSM's week w snack time!
|
||||||
|
|
||||||
In this chapter, you will:
|
In this chapter, you will:
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,8 @@ This is an advanced part that deep dives into optimizations and applications of
|
|||||||
| 4.7 | Build Your Own SkipList | | | |
|
| 4.7 | Build Your Own SkipList | | | |
|
||||||
| 4.8 | Async Engine | | | |
|
| 4.8 | Async Engine | | | |
|
||||||
| 4.9 | IO-uring-based I/O engine | | | |
|
| 4.9 | IO-uring-based I/O engine | | | |
|
||||||
| 4.10 | Key-Value Separation | | | |
|
| 4.10 | Prefetching | | | |
|
||||||
| 4.11 | Column Families | | | |
|
| 4.11 | Key-Value Separation | | | |
|
||||||
| 4.12 | Sharding | | | |
|
| 4.12 | Column Families | | | |
|
||||||
| 4.13 | SQL over Mini-LSM | | | |
|
| 4.13 | Sharding | | | |
|
||||||
|
| 4.14 | SQL over Mini-LSM | | | |
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ impl LsmStorageInner {
|
|||||||
let builder_inner = builder.as_mut().unwrap();
|
let builder_inner = builder.as_mut().unwrap();
|
||||||
builder_inner.add(iter.key(), iter.value());
|
builder_inner.add(iter.key(), iter.value());
|
||||||
|
|
||||||
let same_as_last_key = iter.key().key_ref() == &last_key;
|
let same_as_last_key = iter.key().key_ref() == last_key;
|
||||||
|
|
||||||
if builder_inner.estimated_size() >= self.options.target_sst_size && !same_as_last_key {
|
if builder_inner.estimated_size() >= self.options.target_sst_size && !same_as_last_key {
|
||||||
let sst_id = self.next_sst_id();
|
let sst_id = self.next_sst_id();
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ impl LsmStorageState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
pub struct LsmStorageOptions {
|
pub struct LsmStorageOptions {
|
||||||
// Block size in bytes
|
// Block size in bytes
|
||||||
pub block_size: usize,
|
pub block_size: usize,
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ impl LsmStorageState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
pub struct LsmStorageOptions {
|
pub struct LsmStorageOptions {
|
||||||
// Block size in bytes
|
// Block size in bytes
|
||||||
pub block_size: usize,
|
pub block_size: usize,
|
||||||
|
|||||||
@@ -253,6 +253,7 @@ impl LsmStorageInner {
|
|||||||
let CompactionOptions::NoCompaction = self.options.compaction_options else {
|
let CompactionOptions::NoCompaction = self.options.compaction_options else {
|
||||||
panic!("full compaction can only be called with compaction is not enabled")
|
panic!("full compaction can only be called with compaction is not enabled")
|
||||||
};
|
};
|
||||||
|
|
||||||
let snapshot = {
|
let snapshot = {
|
||||||
let state = self.state.read();
|
let state = self.state.read();
|
||||||
state.clone()
|
state.clone()
|
||||||
@@ -264,23 +265,26 @@ impl LsmStorageInner {
|
|||||||
l0_sstables: l0_sstables.clone(),
|
l0_sstables: l0_sstables.clone(),
|
||||||
l1_sstables: l1_sstables.clone(),
|
l1_sstables: l1_sstables.clone(),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
println!("force full compaction: {:?}", compaction_task);
|
||||||
|
|
||||||
let sstables = self.compact(&compaction_task)?;
|
let sstables = self.compact(&compaction_task)?;
|
||||||
|
let mut ids = Vec::with_capacity(sstables.len());
|
||||||
|
|
||||||
{
|
{
|
||||||
let _state_lock = self.state_lock.lock();
|
let state_lock = self.state_lock.lock();
|
||||||
let mut state = self.state.read().as_ref().clone();
|
let mut state = self.state.read().as_ref().clone();
|
||||||
for sst in l0_sstables.iter().chain(l1_sstables.iter()) {
|
for sst in l0_sstables.iter().chain(l1_sstables.iter()) {
|
||||||
let result = state.sstables.remove(sst);
|
let result = state.sstables.remove(sst);
|
||||||
assert!(result.is_some());
|
assert!(result.is_some());
|
||||||
}
|
}
|
||||||
let mut ids = Vec::with_capacity(sstables.len());
|
|
||||||
for new_sst in sstables {
|
for new_sst in sstables {
|
||||||
ids.push(new_sst.sst_id());
|
ids.push(new_sst.sst_id());
|
||||||
let result = state.sstables.insert(new_sst.sst_id(), new_sst);
|
let result = state.sstables.insert(new_sst.sst_id(), new_sst);
|
||||||
assert!(result.is_none());
|
assert!(result.is_none());
|
||||||
}
|
}
|
||||||
assert_eq!(l1_sstables, state.levels[0].1);
|
assert_eq!(l1_sstables, state.levels[0].1);
|
||||||
state.levels[0].1 = ids;
|
state.levels[0].1 = ids.clone();
|
||||||
let mut l0_sstables_map = l0_sstables.iter().copied().collect::<HashSet<_>>();
|
let mut l0_sstables_map = l0_sstables.iter().copied().collect::<HashSet<_>>();
|
||||||
state.l0_sstables = state
|
state.l0_sstables = state
|
||||||
.l0_sstables
|
.l0_sstables
|
||||||
@@ -290,10 +294,18 @@ impl LsmStorageInner {
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
assert!(l0_sstables_map.is_empty());
|
assert!(l0_sstables_map.is_empty());
|
||||||
*self.state.write() = Arc::new(state);
|
*self.state.write() = Arc::new(state);
|
||||||
|
self.sync_dir()?;
|
||||||
|
self.manifest.as_ref().unwrap().add_record(
|
||||||
|
&state_lock,
|
||||||
|
ManifestRecord::Compaction(compaction_task, ids.clone()),
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
for sst in l0_sstables.iter().chain(l1_sstables.iter()) {
|
for sst in l0_sstables.iter().chain(l1_sstables.iter()) {
|
||||||
std::fs::remove_file(self.path_of_sst(*sst))?;
|
std::fs::remove_file(self.path_of_sst(*sst))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
println!("force full compaction done, new SSTs: {:?}", ids);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -308,6 +320,7 @@ impl LsmStorageInner {
|
|||||||
let Some(task) = task else {
|
let Some(task) = task else {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
};
|
};
|
||||||
|
self.dump_structure();
|
||||||
println!("running compaction task: {:?}", task);
|
println!("running compaction task: {:?}", task);
|
||||||
let sstables = self.compact(&task)?;
|
let sstables = self.compact(&task)?;
|
||||||
let output = sstables.iter().map(|x| x.sst_id()).collect::<Vec<_>>();
|
let output = sstables.iter().map(|x| x.sst_id()).collect::<Vec<_>>();
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ impl LsmStorageState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
pub struct LsmStorageOptions {
|
pub struct LsmStorageOptions {
|
||||||
// Block size in bytes
|
// Block size in bytes
|
||||||
pub block_size: usize,
|
pub block_size: usize,
|
||||||
@@ -363,7 +364,7 @@ impl LsmStorageInner {
|
|||||||
table_id,
|
table_id,
|
||||||
Some(block_cache.clone()),
|
Some(block_cache.clone()),
|
||||||
FileObject::open(&Self::path_of_sst_static(path, table_id))
|
FileObject::open(&Self::path_of_sst_static(path, table_id))
|
||||||
.context("failed to open SST")?,
|
.with_context(|| format!("failed to open SST: {}", table_id))?,
|
||||||
)?;
|
)?;
|
||||||
state.sstables.insert(table_id, Arc::new(sst));
|
state.sstables.insert(table_id, Arc::new(sst));
|
||||||
sst_cnt += 1;
|
sst_cnt += 1;
|
||||||
|
|||||||
@@ -10,3 +10,5 @@ mod week2_day1;
|
|||||||
mod week2_day2;
|
mod week2_day2;
|
||||||
mod week2_day3;
|
mod week2_day3;
|
||||||
mod week2_day4;
|
mod week2_day4;
|
||||||
|
mod week2_day5;
|
||||||
|
mod week2_day6;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use std::{collections::BTreeMap, path::Path, sync::Arc, time::Duration};
|
use std::{collections::BTreeMap, ops::Bound, path::Path, sync::Arc, time::Duration};
|
||||||
|
|
||||||
use anyhow::{bail, Result};
|
use anyhow::{bail, Result};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
@@ -9,7 +9,7 @@ use crate::{
|
|||||||
TieredCompactionOptions,
|
TieredCompactionOptions,
|
||||||
},
|
},
|
||||||
iterators::StorageIterator,
|
iterators::StorageIterator,
|
||||||
key::KeySlice,
|
key::{KeySlice, TS_ENABLED},
|
||||||
lsm_storage::{BlockCache, LsmStorageInner, MiniLsm},
|
lsm_storage::{BlockCache, LsmStorageInner, MiniLsm},
|
||||||
table::{SsTable, SsTableBuilder},
|
table::{SsTable, SsTableBuilder},
|
||||||
};
|
};
|
||||||
@@ -171,11 +171,12 @@ pub fn compaction_bench(storage: Arc<MiniLsm>) {
|
|||||||
let gen_key = |i| format!("{:010}", i); // 10B
|
let gen_key = |i| format!("{:010}", i); // 10B
|
||||||
let gen_value = |i| format!("{:0110}", i); // 110B
|
let gen_value = |i| format!("{:0110}", i); // 110B
|
||||||
let mut max_key = 0;
|
let mut max_key = 0;
|
||||||
|
let overlaps = if TS_ENABLED { 10000 } else { 20000 };
|
||||||
for iter in 0..10 {
|
for iter in 0..10 {
|
||||||
let range_begin = iter * 5000;
|
let range_begin = iter * 5000;
|
||||||
for i in range_begin..(range_begin + 10000) {
|
for i in range_begin..(range_begin + overlaps) {
|
||||||
// 120B per key, 4MB data populated
|
// 120B per key, 4MB data populated
|
||||||
let key = gen_key(i);
|
let key: String = gen_key(i);
|
||||||
let version = key_map.get(&i).copied().unwrap_or_default() + 1;
|
let version = key_map.get(&i).copied().unwrap_or_default() + 1;
|
||||||
let value = gen_value(version);
|
let value = gen_value(version);
|
||||||
key_map.insert(i, version);
|
key_map.insert(i, version);
|
||||||
@@ -184,17 +185,24 @@ pub fn compaction_bench(storage: Arc<MiniLsm>) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut expected_key_value_pairs = Vec::new();
|
||||||
for i in 0..(max_key + 40000) {
|
for i in 0..(max_key + 40000) {
|
||||||
let key = gen_key(i);
|
let key = gen_key(i);
|
||||||
let value = storage.get(key.as_bytes()).unwrap();
|
let value = storage.get(key.as_bytes()).unwrap();
|
||||||
if let Some(val) = key_map.get(&i) {
|
if let Some(val) = key_map.get(&i) {
|
||||||
let expected_value = gen_value(*val);
|
let expected_value = gen_value(*val);
|
||||||
assert_eq!(value, Some(Bytes::from(expected_value)));
|
assert_eq!(value, Some(Bytes::from(expected_value.clone())));
|
||||||
|
expected_key_value_pairs.push((Bytes::from(key), Bytes::from(expected_value)));
|
||||||
} else {
|
} else {
|
||||||
assert!(value.is_none());
|
assert!(value.is_none());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
check_lsm_iter_result_by_key(
|
||||||
|
&mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
|
||||||
|
expected_key_value_pairs,
|
||||||
|
);
|
||||||
|
|
||||||
while {
|
while {
|
||||||
let snapshot = storage.inner.state.read();
|
let snapshot = storage.inner.state.read();
|
||||||
!snapshot.imm_memtables.is_empty()
|
!snapshot.imm_memtables.is_empty()
|
||||||
@@ -324,3 +332,9 @@ pub fn check_compaction_ratio(storage: Arc<MiniLsm>) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn dump_files_in_dir(path: impl AsRef<Path>) {
|
||||||
|
for f in path.as_ref().read_dir().unwrap() {
|
||||||
|
println!("{}", f.unwrap().path().display())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
81
mini-lsm/src/tests/week2_day5.rs
Normal file
81
mini-lsm/src/tests/week2_day5.rs
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
compact::{
|
||||||
|
CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions,
|
||||||
|
TieredCompactionOptions,
|
||||||
|
},
|
||||||
|
lsm_storage::{LsmStorageOptions, MiniLsm},
|
||||||
|
tests::harness::dump_files_in_dir,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_integration_leveled() {
|
||||||
|
test_integration(CompactionOptions::Leveled(LeveledCompactionOptions {
|
||||||
|
level_size_multiplier: 2,
|
||||||
|
level0_file_num_compaction_trigger: 2,
|
||||||
|
max_levels: 3,
|
||||||
|
base_level_size_mb: 1,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_integration_tiered() {
|
||||||
|
test_integration(CompactionOptions::Tiered(TieredCompactionOptions {
|
||||||
|
num_tiers: 3,
|
||||||
|
max_size_amplification_percent: 200,
|
||||||
|
size_ratio: 1,
|
||||||
|
min_merge_width: 3,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_integration_simple() {
|
||||||
|
test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions {
|
||||||
|
size_ratio_percent: 200,
|
||||||
|
level0_file_num_compaction_trigger: 2,
|
||||||
|
max_levels: 3,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_integration(compaction_options: CompactionOptions) {
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let storage = MiniLsm::open(
|
||||||
|
&dir,
|
||||||
|
LsmStorageOptions::default_for_week2_test(compaction_options.clone()),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
for i in 0..=20 {
|
||||||
|
storage.put(b"0", format!("v{}", i).as_bytes()).unwrap();
|
||||||
|
if i % 2 == 0 {
|
||||||
|
storage.put(b"1", format!("v{}", i).as_bytes()).unwrap();
|
||||||
|
} else {
|
||||||
|
storage.delete(b"1").unwrap();
|
||||||
|
}
|
||||||
|
if i % 2 == 1 {
|
||||||
|
storage.put(b"2", format!("v{}", i).as_bytes()).unwrap();
|
||||||
|
} else {
|
||||||
|
storage.delete(b"2").unwrap();
|
||||||
|
}
|
||||||
|
storage
|
||||||
|
.inner
|
||||||
|
.force_freeze_memtable(&storage.inner.state_lock.lock())
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
storage.close().unwrap();
|
||||||
|
// ensure all SSTs are flushed
|
||||||
|
assert!(storage.inner.state.read().memtable.is_empty());
|
||||||
|
assert!(storage.inner.state.read().imm_memtables.is_empty());
|
||||||
|
storage.dump_structure();
|
||||||
|
drop(storage);
|
||||||
|
dump_files_in_dir(&dir);
|
||||||
|
|
||||||
|
let storage = MiniLsm::open(
|
||||||
|
&dir,
|
||||||
|
LsmStorageOptions::default_for_week2_test(compaction_options.clone()),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice());
|
||||||
|
assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice());
|
||||||
|
assert_eq!(storage.get(b"2").unwrap(), None);
|
||||||
|
}
|
||||||
77
mini-lsm/src/tests/week2_day6.rs
Normal file
77
mini-lsm/src/tests/week2_day6.rs
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
compact::{
|
||||||
|
CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions,
|
||||||
|
TieredCompactionOptions,
|
||||||
|
},
|
||||||
|
lsm_storage::{LsmStorageOptions, MiniLsm},
|
||||||
|
tests::harness::dump_files_in_dir,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_integration_leveled() {
|
||||||
|
test_integration(CompactionOptions::Leveled(LeveledCompactionOptions {
|
||||||
|
level_size_multiplier: 2,
|
||||||
|
level0_file_num_compaction_trigger: 2,
|
||||||
|
max_levels: 3,
|
||||||
|
base_level_size_mb: 1,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_integration_tiered() {
|
||||||
|
test_integration(CompactionOptions::Tiered(TieredCompactionOptions {
|
||||||
|
num_tiers: 3,
|
||||||
|
max_size_amplification_percent: 200,
|
||||||
|
size_ratio: 1,
|
||||||
|
min_merge_width: 3,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_integration_simple() {
|
||||||
|
test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions {
|
||||||
|
size_ratio_percent: 200,
|
||||||
|
level0_file_num_compaction_trigger: 2,
|
||||||
|
max_levels: 3,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_integration(compaction_options: CompactionOptions) {
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let mut options = LsmStorageOptions::default_for_week2_test(compaction_options);
|
||||||
|
options.enable_wal = true;
|
||||||
|
let storage = MiniLsm::open(&dir, options.clone()).unwrap();
|
||||||
|
for i in 0..=20 {
|
||||||
|
storage.put(b"0", format!("v{}", i).as_bytes()).unwrap();
|
||||||
|
if i % 2 == 0 {
|
||||||
|
storage.put(b"1", format!("v{}", i).as_bytes()).unwrap();
|
||||||
|
} else {
|
||||||
|
storage.delete(b"1").unwrap();
|
||||||
|
}
|
||||||
|
if i % 2 == 1 {
|
||||||
|
storage.put(b"2", format!("v{}", i).as_bytes()).unwrap();
|
||||||
|
} else {
|
||||||
|
storage.delete(b"2").unwrap();
|
||||||
|
}
|
||||||
|
storage
|
||||||
|
.inner
|
||||||
|
.force_freeze_memtable(&storage.inner.state_lock.lock())
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
storage.close().unwrap();
|
||||||
|
// ensure some SSTs are not flushed
|
||||||
|
assert!(
|
||||||
|
!storage.inner.state.read().memtable.is_empty()
|
||||||
|
|| !storage.inner.state.read().imm_memtables.is_empty()
|
||||||
|
);
|
||||||
|
storage.dump_structure();
|
||||||
|
drop(storage);
|
||||||
|
dump_files_in_dir(&dir);
|
||||||
|
|
||||||
|
let storage = MiniLsm::open(&dir, options).unwrap();
|
||||||
|
assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice());
|
||||||
|
assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice());
|
||||||
|
assert_eq!(storage.get(b"2").unwrap(), None);
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user