diff --git a/README.md b/README.md index ac9f8dd..47dbfde 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ Run on Apple M4, `-benchtime=5s`. Numbers are per-operation. | bloom=false, lock=true | 70,526 | 35,228 | | bloom=false, lock=false | 80,792 | 39,588 | -### Sequential reads — hits (10k keys pre-loaded) +### Sequential reads, hits (10k keys pre-loaded) | Config | ns/op | |--------|------:| @@ -99,7 +99,7 @@ Run on Apple M4, `-benchtime=5s`. Numbers are per-operation. | bloom=false, lock=true | 5,351 | | bloom=false, lock=false | 5,333 | -### Sequential reads — misses (10k keys pre-loaded, reading non-existent keys) +### Sequential reads, misses (10k keys pre-loaded, reading non-existent keys) | Config | ns/op | |--------|------:| @@ -108,7 +108,7 @@ Run on Apple M4, `-benchtime=5s`. Numbers are per-operation. | bloom=false, lock=true | 67.5 | | bloom=false, lock=false | 67.4 | -### Reads — large dataset (100k keys, hits vs misses) +### Reads, large dataset (100k keys, hits vs misses) | Config | ns/op | |--------|------:| @@ -133,17 +133,17 @@ Run on Apple M4, `-benchtime=5s`. Numbers are per-operation. ### What the numbers tell us -**Bloom filter only helps misses.** On a hit, the bloom filter says "maybe" and you still have to read the index — so you pay the hash cost for nothing. On a miss, bloom can reject a key outright without touching the index at all, which is where the speedup comes from. +**Bloom filter only helps misses.** On a hit, bloom says "maybe" and you still have to read the index, so you pay the hash cost for nothing. On a miss it can reject the key without touching the index at all, which is where the speedup comes from. -**The bloom advantage scales with index size.** With 10k keys (small SSTables, index fits in cache), bloom saves ~3% on misses (65ns vs 67ns). With 100k keys (larger indices after compaction), the saving grows to ~12% (97ns vs 110ns). The index maps are too large for CPU cache at that point, so each lookup becomes a RAM access; bloom's compact bitset stays cache-hot and wins. +**The advantage scales with index size.** With 10k keys the SSTable index maps fit in CPU cache, so a plain map lookup beats running 3 hashes and bloom only saves about 3% on misses (65ns vs 67ns). With 100k keys the indices spill out of cache and each lookup costs a RAM round-trip. At that point bloom's compact bitset stays cache-hot and the saving grows to about 12% (97ns vs 110ns). -**Bloom never helps read hits.** The hit numbers across small and large datasets are nearly identical with and without bloom (~5.3–5.4µs). The SSTable data file read dominates, and bloom adds a small hash overhead before it. +**Bloom never helps hits.** Hit latency is nearly identical with and without bloom across both dataset sizes (~5.3-5.4µs). The SSTable data file read dominates and bloom just adds a small hash overhead before it. -**An uncontended lock is basically free.** For sequential writes, `lock=true` is actually faster than `lock=false` (58µs vs 73µs). There's no contention so the mutex costs nothing, and the two code paths end up with different allocation patterns — `lock=false` allocates about 30% more memory per op, which is where the slowdown comes from. +**An uncontended lock is basically free.** Sequential writes with `lock=true` are actually faster than `lock=false` (58µs vs 73µs). There's no contention so the mutex costs nothing, and the two code paths end up with different allocation patterns. `lock=false` allocates about 30% more per op, which is where the difference comes from. -**Concurrent writes scale to about 3x, not 10x.** Sequential puts cost ~58µs; parallel drops to ~18µs across 10 goroutines. The ceiling is the memtable write lock — WAL appends and B-tree inserts both serialize, so adding more goroutines doesn't help past a point. +**Concurrent writes scale to about 3x, not 10x.** Sequential puts cost ~58µs; parallel drops to ~18µs across 10 goroutines. WAL appends and B-tree inserts both serialize on the write lock, so more goroutines can't help past that point. -**Concurrent reads barely move the needle** (~5.4µs → 4.7µs). Reads take a shared `RLock` so they can technically run in parallel, but the bottleneck is SSTable file I/O which doesn't fan out well on a single drive. +**Concurrent reads barely move.** ~5.4µs drops to ~4.7µs. Reads take a shared `RLock` so they can run in parallel, but the bottleneck is SSTable file I/O which doesn't fan out well on a single drive. ## Running diff --git a/db/db.go b/db/db.go index d3d0888..baaa47c 100644 --- a/db/db.go +++ b/db/db.go @@ -1,144 +1,67 @@ package db import ( - "fmt" - compaction "lorem-lsm/compactor" - "lorem-lsm/memtable" - "lorem-lsm/sstable" - "lorem-lsm/wal" - "os" + "lorem-lsm/shard" "sync" - "time" ) type LoremDB struct { - wal *wal.Wal - memTable *memtable.MemTable - ssTables []*sstable.SSTable - ssTablePath string - useBloom bool - useLock bool - ssTableLimit int - lock sync.RWMutex + shardCount int + dbList []*shard.LoremDBShard + supportConcurrency bool + useBloom bool + lock sync.Mutex } -func NewLoremDB(useBloom bool, supportConcurrency bool) *LoremDB { - - os.MkdirAll("sstable", 0755) - writeAheadLog, _ := wal.NewWal("wal.log") - memTable := memtable.NewMemTable() - - writeAheadLog.Recover(func(walRow *wal.WalRow) { - if !walRow.IsDeleted { - memTable.Put(walRow.Key, walRow.Value) - } else { - memTable.Delete(walRow.Key) - } - }) - - ssTables := []*sstable.SSTable{} - ssTablePath := "sstable" +func NewLoremDB(shardCount int, useBloom bool, supportConcurrency bool) *LoremDB { return &LoremDB{ - wal: writeAheadLog, - memTable: memTable, - ssTables: ssTables, - ssTablePath: ssTablePath, - useBloom: useBloom, - ssTableLimit: 5, - useLock: supportConcurrency, + shardCount: shardCount, + dbList: make([]*shard.LoremDBShard, shardCount), + useBloom: useBloom, + supportConcurrency: supportConcurrency, } } func (db *LoremDB) Put(key string, value string) error { - // write wal first to maximize data recovery chances - if db.useLock { - db.lock.Lock() - defer db.lock.Unlock() - } - - db.wal.Append(wal.WalRow{ - Key: key, - Value: value, - IsDeleted: false, - }) - - isMemTableFull := db.memTable.Put(key, value) - - if isMemTableFull { - path := fmt.Sprintf("%s/%d", db.ssTablePath, time.Now().UnixNano()) - table, err := sstable.CreateSSTable(path, db.useBloom) - - if err != nil { - return err - } - - table.FlushMemTable(db.memTable) - db.ssTables = append(db.ssTables, table) - - if len(db.ssTables) > db.ssTableLimit { - compactor := compaction.NewCompactor(db.ssTables) - db.ssTables = []*sstable.SSTable{compactor.Compact()} + shard := db.GetShard(&key) + return shard.Put(key, value) +} - } - db.memTable = memtable.NewMemTable() - db.wal.Clear() - } - return nil +func (db *LoremDB) Get(key string) (string, bool) { + shard := db.GetShard(&key) + return shard.Get(key) } func (db *LoremDB) Delete(key string) error { + shard := db.GetShard(&key) + return shard.Delete(key) +} - if db.useLock { +func (db *LoremDB) GetShard(key *string) *shard.LoremDBShard { + id := db.GetShardId(key) + selectedShard := db.dbList[id] + if selectedShard == nil { + // lazy loadingl db.lock.Lock() defer db.lock.Unlock() - } - - // write wal first to maximize data recovery chances - db.wal.Append(wal.WalRow{ - Key: key, - Value: "", - IsDeleted: true, - }) - - isMemTableFull := db.memTable.Delete(key) - - if isMemTableFull { - path := fmt.Sprintf("%s/%d", db.ssTablePath, time.Now().UnixNano()) - table, err := sstable.CreateSSTable(path, db.useBloom) - - if err != nil { - - return err + selectedShard = db.dbList[id] + if selectedShard == nil { + selectedShard = shard.NewLoremDBShard(id, db.useBloom, db.supportConcurrency) + db.dbList[id] = selectedShard } - - table.FlushMemTable(db.memTable) - db.ssTables = append(db.ssTables, table) - db.memTable = memtable.NewMemTable() - fmt.Println("reset, new size:", db.memTable.Size()) } - return nil + return selectedShard } -func (db *LoremDB) Get(key string) (string, bool) { +func (db *LoremDB) GetShardId(key *string) int { - if db.useLock { - db.lock.RLock() - defer db.lock.RUnlock() - } + // get value of first 5 characters. + sum := 0 - // check in memtable - val, ok := db.memTable.Get(key) - if ok { - return val, true - } - // fallback to sstable - for i := len(db.ssTables) - 1; i >= 0; i-- { - value, _ := db.ssTables[i].Get(key) - - if value != "" { - return value, true - } + for i := 0; i < min(len(*key), 5); i++ { + sum += int((*key)[i] - 'a') } - return "", false + shardHash := sum % (db.shardCount) + return shardHash } diff --git a/db/db_bench_test.go b/db/db_bench_test.go index 344fd7d..f3f28e5 100644 --- a/db/db_bench_test.go +++ b/db/db_bench_test.go @@ -6,27 +6,30 @@ import ( "testing" ) -func setupDB(b *testing.B, useBloom bool, useLock bool) *LoremDB { +func setupDB(b *testing.B, shardCount int, useBloom bool, useLock bool) *LoremDB { os.RemoveAll("sstable") os.Remove("wal.log") os.MkdirAll("sstable", 0755) - return NewLoremDB(useBloom, useLock) + return NewLoremDB(shardCount, useBloom, useLock) } func BenchmarkPut(b *testing.B) { configs := []struct { - useBloom bool - useLock bool + useBloom bool + useLock bool + shardCount int }{ - {true, true}, - {true, false}, - {false, true}, - {false, false}, + {true, true, 1}, + {true, true, 2}, + {true, false, 1}, + {false, true, 1}, + {false, true, 2}, + {false, false, 1}, } for _, cfg := range configs { - name := fmt.Sprintf("bloom=%v,lock=%v", cfg.useBloom, cfg.useLock) + name := fmt.Sprintf("bloom=%v,lock=%v,shardCount=%d", cfg.useBloom, cfg.useLock, cfg.shardCount) b.Run(name, func(b *testing.B) { - store := setupDB(b, cfg.useBloom, cfg.useLock) + store := setupDB(b, cfg.shardCount, cfg.useBloom, cfg.useLock) b.ResetTimer() for i := 0; i < b.N; i++ { store.Put(fmt.Sprintf("key-%d", i), fmt.Sprintf("value-%d", i)) @@ -37,18 +40,21 @@ func BenchmarkPut(b *testing.B) { func BenchmarkGet(b *testing.B) { configs := []struct { - useBloom bool - useLock bool + useBloom bool + useLock bool + shardCount int }{ - {true, true}, - {true, false}, - {false, true}, - {false, false}, + {true, true, 1}, + {true, true, 2}, + {true, false, 1}, + {false, true, 1}, + {false, true, 2}, + {false, false, 1}, } for _, cfg := range configs { - name := fmt.Sprintf("bloom=%v,lock=%v", cfg.useBloom, cfg.useLock) + name := fmt.Sprintf("bloom=%v,lock=%v,shardCount=%d", cfg.useBloom, cfg.useLock, cfg.shardCount) b.Run(name, func(b *testing.B) { - store := setupDB(b, cfg.useBloom, cfg.useLock) + store := setupDB(b, cfg.shardCount, cfg.useBloom, cfg.useLock) for i := 0; i < 10000; i++ { store.Put(fmt.Sprintf("key-%d", i), fmt.Sprintf("value-%d", i)) } @@ -64,18 +70,21 @@ func BenchmarkGet(b *testing.B) { // every SSTable lookup (no index access needed), so the gap vs no-bloom is largest here. func BenchmarkGetMiss(b *testing.B) { configs := []struct { - useBloom bool - useLock bool + useBloom bool + useLock bool + shardCount int }{ - {true, true}, - {true, false}, - {false, true}, - {false, false}, + {true, true, 1}, + {true, true, 2}, + {true, false, 1}, + {false, true, 1}, + {false, true, 2}, + {false, false, 1}, } for _, cfg := range configs { - name := fmt.Sprintf("bloom=%v,lock=%v", cfg.useBloom, cfg.useLock) + name := fmt.Sprintf("bloom=%v,lock=%v,shardCount=%d", cfg.useBloom, cfg.useLock, cfg.shardCount) b.Run(name, func(b *testing.B) { - store := setupDB(b, cfg.useBloom, cfg.useLock) + store := setupDB(b, cfg.shardCount, cfg.useBloom, cfg.useLock) for i := 0; i < 10000; i++ { store.Put(fmt.Sprintf("key-%d", i), fmt.Sprintf("value-%d", i)) } @@ -107,7 +116,7 @@ func BenchmarkGetLargeDataset(b *testing.B) { hit, useBloom := hit, useBloom label := fmt.Sprintf("hit=%v,bloom=%v", hit, useBloom) b.Run(label, func(b *testing.B) { - store := setupDB(b, useBloom, false) + store := setupDB(b, 1, useBloom, false) for i := 0; i < preload; i++ { store.Put(fmt.Sprintf("key-%d", i), fmt.Sprintf("value-%d", i)) } @@ -126,16 +135,19 @@ func BenchmarkGetLargeDataset(b *testing.B) { func BenchmarkPutConcurrent(b *testing.B) { configs := []struct { - useBloom bool - useLock bool + useBloom bool + useLock bool + shardCount int }{ - {true, true}, - {false, true}, + {true, true, 1}, + {false, true, 1}, + {true, true, 2}, + {false, true, 2}, } for _, cfg := range configs { - name := fmt.Sprintf("bloom=%v,lock=%v", cfg.useBloom, cfg.useLock) + name := fmt.Sprintf("bloom=%v,lock=%v,shardCount=%d", cfg.useBloom, cfg.useLock, cfg.shardCount) b.Run(name, func(b *testing.B) { - store := setupDB(b, cfg.useBloom, cfg.useLock) + store := setupDB(b, cfg.shardCount, cfg.useBloom, cfg.useLock) b.ResetTimer() b.RunParallel(func(pb *testing.PB) { i := 0 @@ -150,16 +162,19 @@ func BenchmarkPutConcurrent(b *testing.B) { func BenchmarkGetConcurrent(b *testing.B) { configs := []struct { - useBloom bool - useLock bool + useBloom bool + useLock bool + shardCount int }{ - {true, true}, - {false, true}, + {true, true, 1}, + {false, true, 1}, + {true, true, 5}, + {false, true, 5}, } for _, cfg := range configs { - name := fmt.Sprintf("bloom=%v,lock=%v", cfg.useBloom, cfg.useLock) + name := fmt.Sprintf("bloom=%v,lock=%v,shardCount=%d", cfg.useBloom, cfg.useLock, cfg.shardCount) b.Run(name, func(b *testing.B) { - store := setupDB(b, cfg.useBloom, cfg.useLock) + store := setupDB(b, cfg.shardCount, cfg.useBloom, cfg.useLock) for i := 0; i < 10000; i++ { store.Put(fmt.Sprintf("key-%d", i), fmt.Sprintf("value-%d", i)) } diff --git a/main.go b/main.go index cb460af..e4cc842 100644 --- a/main.go +++ b/main.go @@ -26,7 +26,7 @@ func main() { } func run_benchmark(useBloom bool) BenchmarkResult { - store := db.NewLoremDB(useBloom) + store := db.NewLoremDB(1, useBloom, true) start := time.Now() for i := 0; i < 500001; i++ { diff --git a/shard/shard.go b/shard/shard.go new file mode 100644 index 0000000..ecfcae3 --- /dev/null +++ b/shard/shard.go @@ -0,0 +1,145 @@ +package shard + +import ( + "fmt" + compaction "lorem-lsm/compactor" + "lorem-lsm/memtable" + "lorem-lsm/sstable" + "lorem-lsm/wal" + "os" + "sync" + "time" +) + +type LoremDBShard struct { + wal *wal.Wal + memTable *memtable.MemTable + ssTables []*sstable.SSTable + ssTablePath string + useBloom bool + useLock bool + ssTableLimit int + lock sync.RWMutex + shardId int +} + +func NewLoremDBShard(id int, useBloomShard bool, supportConcurrency bool) *LoremDBShard { + + os.MkdirAll("sstable", 0755) + writeAheadLog, _ := wal.NewWal(fmt.Sprintf("wal%d.log", id)) + memTable := memtable.NewMemTable() + + writeAheadLog.Recover(func(walRow *wal.WalRow) { + if !walRow.IsDeleted { + memTable.Put(walRow.Key, walRow.Value) + } else { + memTable.Delete(walRow.Key) + } + }) + + ssTables := []*sstable.SSTable{} + ssTablePath := "sstable" + + return &LoremDBShard{ + wal: writeAheadLog, + memTable: memTable, + ssTables: ssTables, + ssTablePath: ssTablePath, + useBloom: useBloomShard, + ssTableLimit: 5, + useLock: supportConcurrency, + } +} + +func (shard *LoremDBShard) Put(key string, value string) error { + // write wal first to maximize data recovery chances + if shard.useLock { + shard.lock.Lock() + defer shard.lock.Unlock() + } + + shard.wal.Append(wal.WalRow{ + Key: key, + Value: value, + IsDeleted: false, + }) + + isMemTableFull := shard.memTable.Put(key, value) + + if isMemTableFull { + path := fmt.Sprintf("%s/%d", shard.ssTablePath, time.Now().UnixNano()) + table, err := sstable.CreateSSTable(path, shard.useBloom) + + if err != nil { + return err + } + + table.FlushMemTable(shard.memTable) + shard.ssTables = append(shard.ssTables, table) + + if len(shard.ssTables) > shard.ssTableLimit { + compactor := compaction.NewCompactor(shard.ssTables) + shard.ssTables = []*sstable.SSTable{compactor.Compact()} + + } + shard.memTable = memtable.NewMemTable() + shard.wal.Clear() + } + return nil +} + +func (shard *LoremDBShard) Delete(key string) error { + + if shard.useLock { + shard.lock.Lock() + defer shard.lock.Unlock() + } + + // write wal first to maximize data recovery chances + shard.wal.Append(wal.WalRow{ + Key: key, + Value: "", + IsDeleted: true, + }) + + isMemTableFull := shard.memTable.Delete(key) + + if isMemTableFull { + path := fmt.Sprintf("%s/%d", shard.ssTablePath, time.Now().UnixNano()) + table, err := sstable.CreateSSTable(path, shard.useBloom) + + if err != nil { + + return err + } + + table.FlushMemTable(shard.memTable) + shard.ssTables = append(shard.ssTables, table) + shard.memTable = memtable.NewMemTable() + fmt.Println("reset, new size:", shard.memTable.Size()) + } + return nil +} + +func (shard *LoremDBShard) Get(key string) (string, bool) { + + if shard.useLock { + shard.lock.RLock() + defer shard.lock.RUnlock() + } + + // check in memtable + val, ok := shard.memTable.Get(key) + if ok { + return val, true + } + // fallback to sstable + for i := len(shard.ssTables) - 1; i >= 0; i-- { + value, _ := shard.ssTables[i].Get(key) + + if value != "" { + return value, true + } + } + return "", false +} diff --git a/db/db_test.go b/shard/shard_test.go similarity index 84% rename from db/db_test.go rename to shard/shard_test.go index e6abc10..5abe0dd 100644 --- a/db/db_test.go +++ b/shard/shard_test.go @@ -1,4 +1,4 @@ -package db +package shard import ( "os" @@ -11,12 +11,12 @@ func TestCrashRecovery(t *testing.T) { os.MkdirAll("sstable", 0755) // write some keys - store := NewLoremDB(true, true) + store := NewLoremDBShard(1, true, true) store.Put("name", "sakar") store.Put("lang", "go") // simulate crash — don't flush, just create a new instance - store2 := NewLoremDB(true, true) + store2 := NewLoremDBShard(1, true, true) val, ok := store2.Get("name") if !ok || val != "sakar" {