Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@ versioning follows [Semantic Versioning](https://semver.org/).
For design background see [ARCHITECTURE.md](ARCHITECTURE.md);
fine-grained per-commit history is in `git log`.

## [0.7.2] — 2026-06-18

### Fixed

- Fixed nightly DB/crash soak failures where stale cross-blob routes could reach
a delete-fenced blob. `DB` no longer runs GUID-only background auto-merge;
DB-wide merge stays rooted in live trees through explicit compaction. The
route cache is now restricted to root-child crossings, and walkers restart
from the root when they encounter a delete-fenced child instead of treating it
as `NotFound`.
- `DB::view` now uses the same fenced snapshot capture path as `Tree::view`,
so multi-tree views cannot capture parent/child topology from mixed write
generations.
- Merge eligibility now rejects snapshot-shared child blobs, preventing
maintenance from deleting a blob still referenced by a live snapshot.

## [0.7.1] — 2026-06-12

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[package]
name = "holt"
version = "0.7.1"
version = "0.7.2"
edition = "2021"
rust-version = "1.82"
autobenches = false
description = "An adaptive-radix-tree metadata storage engine for path-shaped keys, with per-blob concurrency and crash-safe persistence."
license = "MIT"
authors = ["the holt contributors"]
readme = "README.md"
repository = "https://github.com/feichai0017/holt"
homepage = "https://github.com/feichai0017/holt"
repository = "https://github.com/NoKV-Lab/holt"
homepage = "https://github.com/NoKV-Lab/holt"
documentation = "https://docs.rs/holt"
keywords = ["art", "radix-tree", "metadata", "storage", "embedded"]
categories = ["database-implementations", "data-structures", "filesystem"]
Expand Down
2 changes: 1 addition & 1 deletion benches/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions src/api/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,15 @@ impl std::fmt::Debug for DB {

impl DB {
/// Open a multi-tree database using the supplied configuration.
pub fn open(cfg: TreeConfig) -> Result<Self> {
pub fn open(mut cfg: TreeConfig) -> Result<Self> {
// The background merge queue is keyed only by blob GUID. In a
// multi-tree DB, a queued parent may become unreachable from all
// live roots while still sharing children with a live tree or a
// snapshot. DB-wide merge therefore runs through `DB::compact`,
// which walks from live roots; the background checkpointer only
// drains dirty bytes and pending deletes.
cfg.checkpoint.auto_merge = false;

let bm = Tree::open_buffer_manager(&cfg)?;
let mut open_stats = OpenStats::default();

Expand Down Expand Up @@ -332,7 +340,7 @@ impl DB {
.collect::<Vec<_>>();
let mut trees = HashMap::with_capacity(scoped.len());
for (_, name, prefix, tree) in scoped {
trees.insert(name, tree.snapshot_unlocked_unfenced(prefix)?);
trees.insert(name, tree.snapshot_unlocked(prefix)?);
}
DBView { trees }
};
Expand Down
30 changes: 24 additions & 6 deletions src/engine/route_cache.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
//! Small parent-validated route cache for path-shaped metadata keys.
//! Small root-validated route cache for path-shaped metadata keys.
//!
//! A hit is only a candidate. Callers must pin the cached parent,
//! hold its shared latch, verify the parent content version, and
//! then pin the child before using the shortcut. That keeps the
//! cached parent->child edge stable while still allowing deeper
//! prefix anchors than the root-only path.
//! then pin the child before using the shortcut. The cache only keeps
//! root-child crossings: deeper parent edges can remain internally
//! stable even after the parent becomes unreachable from the live root.

use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
Expand Down Expand Up @@ -118,6 +118,9 @@ impl RouteCache {
for &len in &entries.lengths {
if let Some(prefix) = key.user_prefix(len) {
if let Some(entry) = entries.map.get(prefix) {
if entry.parent_depth != 0 {
continue;
}
self.hits.fetch_add(1, Ordering::Relaxed);
return Some(RouteHit {
parent_guid: entry.parent_guid,
Expand Down Expand Up @@ -157,6 +160,17 @@ impl RouteCache {
rebuild_prefix_lengths(&mut entries);
}

/// Empty the route cache in one shot.
///
/// The walker restart loops call this after an attempt fails with a
/// stale blob crossing, so the retry begins without any previously
/// learned route candidates. Every call is counted as exactly one
/// invalidation, no matter how many routes were cached.
pub(crate) fn clear(&self) {
    self.invalidations.fetch_add(1, Ordering::Relaxed);
    // All three collections are emptied under the same write guard.
    let mut table = self.entries.write().unwrap();
    table.lengths.clear();
    table.order.clear();
    table.map.clear();
}

/// Refresh the parent version after the caller revalidated that
/// the cached parent edge still points at the same child.
pub(crate) fn refresh_parent_version(
Expand Down Expand Up @@ -195,6 +209,9 @@ impl RouteCache {
child_guid: BlobGuid,
child_depth: usize,
) {
if parent_depth != 0 {
return;
}
let Some(prefix) = key.user_prefix(child_depth) else {
return;
};
Expand Down Expand Up @@ -591,7 +608,7 @@ mod tests {
}

#[test]
fn different_parent_depth_does_not_prune_longer_route() {
fn non_root_parent_routes_are_not_cached() {
let cache = RouteCache::new();
cache.learn(
SearchKey::user(b"bucket-00/path/deeper/file"),
Expand All @@ -611,6 +628,7 @@ mod tests {
);

let stats = cache.stats();
assert_eq!(stats.entries, 2);
assert_eq!(stats.entries, 1);
assert_eq!(stats.learns, 1);
}
}
41 changes: 40 additions & 1 deletion src/engine/walker/erase.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use super::readers::{
read_prefix,
};
use super::route::{pin_route_parent, validate_route_edge};
use super::types::{is_stale_blob_crossing, stale_blob_crossing};
use super::types::{EraseCondition, EraseOutcome, EraseReturn, EraseSignal, LookupResult};
use super::writers::{
finish_inner_with_sorted, inner_find_child, inner_update_child, set_prefix_child,
Expand Down Expand Up @@ -72,6 +73,35 @@ pub fn erase_multi_conditional(
key: SearchKey<'_>,
seq: u64,
condition: EraseCondition,
) -> Result<EraseOutcome> {
let mut restarts = 0u32;
loop {
match erase_multi_conditional_once(bm, root_pin, route_cache, key, seq, condition) {
Err(e) if is_stale_blob_crossing(&e) => {
if let Some(cache) = route_cache {
cache.clear();
}
bm.note_optimistic_restart();
restarts = restarts.saturating_add(1);
if restarts >= 128 {
return Err(e);
}
if restarts % 16 == 0 {
std::thread::yield_now();
}
}
out => return out,
}
}
}

fn erase_multi_conditional_once(
bm: &BufferManager,
root_pin: &Arc<CachedBlob>,
route_cache: Option<&RouteCache>,
key: SearchKey<'_>,
seq: u64,
condition: EraseCondition,
) -> Result<EraseOutcome> {
// The caller (typically `Tree`) keeps `root_pin` alive across
// every op so we skip `BufferManager`'s pin-Mutex on the hot
Expand Down Expand Up @@ -343,7 +373,16 @@ fn lock_coupled_erase_in_blob(
let r = match step {
EraseStep::Done(r) => r,
EraseStep::Crossing(crossing) => {
let child_pin = bm.pin(crossing.child_guid)?;
let child_pin = match bm.pin(crossing.child_guid) {
Ok(pin) => pin,
Err(e)
if is_blob_store_not_found(&e) && bm.has_delete_fence(crossing.child_guid) =>
{
drop(guard);
return Err(stale_blob_crossing("stale blob crossing: erase deep child"));
}
Err(e) => return Err(e.with_blob_guid(crossing.child_guid)),
};
child_pin.prefetch_header();
let child_guard = child_pin.write();

Expand Down
97 changes: 96 additions & 1 deletion src/engine/walker/insert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use super::migrate::blob_needs_compaction;
use super::readers::{child_offset, ntype_of, read_leaf_key_ref, read_prefix};
use super::route::{pin_route_parent, validate_route_edge};
use super::spillover::{compact_blob, spillover_blob};
use super::types::{is_stale_blob_crossing, stale_blob_crossing};
use super::types::{InsertCondition, InsertOutcome, InsertReturn, LookupResult};
use super::writers::{
inner_add_child, inner_find_child, inner_update_child, set_prefix_child, write_leaf,
Expand Down Expand Up @@ -107,7 +108,37 @@ pub fn insert_multi_conditional(
if value.len() > u16::MAX as usize {
return Err(Error::ValueTooLong { len: value.len() });
}
let mut restarts = 0u32;
loop {
match insert_multi_conditional_once(bm, root_pin, route_cache, key, value, seq, condition) {
Err(e) if is_stale_blob_crossing(&e) => {
if let Some(cache) = route_cache {
cache.clear();
}
bm.note_optimistic_restart();
restarts = restarts.saturating_add(1);
if restarts >= 128 {
return Err(e);
}
if restarts % 16 == 0 {
std::thread::yield_now();
}
}
out => return out,
}
}
}

#[allow(clippy::too_many_arguments)]
fn insert_multi_conditional_once(
bm: &BufferManager,
root_pin: &Arc<CachedBlob>,
route_cache: Option<&RouteCache>,
key: SearchKey<'_>,
value: &[u8],
seq: u64,
condition: InsertCondition,
) -> Result<InsertOutcome> {
let mut blob_hops = 0u64;
let mut max_cross_blob_depth = 0usize;

Expand Down Expand Up @@ -179,6 +210,12 @@ fn try_insert_from_root_router(
};
let child_pin = match bm.pin(crossing.child_guid) {
Ok(pin) => pin,
Err(e) if is_blob_store_not_found(&e) && bm.has_delete_fence(crossing.child_guid) => {
drop(root_read);
return Err(stale_blob_crossing(
"stale blob crossing: insert root router",
));
}
Err(e) if is_blob_store_not_found(&e) => return Ok(None),
Err(e) => return Err(e),
};
Expand Down Expand Up @@ -302,6 +339,11 @@ fn try_insert_from_optimistic_route(
}
let child_pin = match bm.pin(route.child_guid) {
Ok(pin) => pin,
Err(e) if is_blob_store_not_found(&e) && bm.has_delete_fence(route.child_guid) => {
drop(parent_guard);
cache.invalidate(key, route);
return Err(stale_blob_crossing("stale blob crossing: insert route"));
}
Err(e) if is_blob_store_not_found(&e) => {
drop(parent_guard);
cache.invalidate(key, route);
Expand Down Expand Up @@ -402,7 +444,33 @@ pub(crate) fn insert_multi_batch_conditional(
});
}
}
let mut restarts = 0u32;
loop {
match insert_multi_batch_conditional_once(bm, root_pin, route_cache, items) {
Err(e) if is_stale_blob_crossing(&e) => {
if let Some(cache) = route_cache {
cache.clear();
}
bm.note_optimistic_restart();
restarts = restarts.saturating_add(1);
if restarts >= 128 {
return Err(e);
}
if restarts % 16 == 0 {
std::thread::yield_now();
}
}
out => return out,
}
}
}

pub(crate) fn insert_multi_batch_conditional_once(
bm: &BufferManager,
root_pin: &Arc<CachedBlob>,
route_cache: Option<&RouteCache>,
items: &[InsertBatchItem<'_>],
) -> Result<InsertBatchOutcome> {
let batched = try_insert_batch_from_first_blob(bm, root_pin, route_cache, items)?;
if batched.applied != 0 {
return Ok(batched);
Expand Down Expand Up @@ -470,6 +538,14 @@ fn try_insert_batch_from_first_blob(
let run_len = same_child_prefix_run_len(items, crossing.child_depth);
let child_pin = match bm.pin(crossing.child_guid) {
Ok(pin) => pin,
Err(e)
if is_blob_store_not_found(&e) && bm.has_delete_fence(crossing.child_guid) =>
{
drop(root_read);
return Err(stale_blob_crossing(
"stale blob crossing: insert batch root router",
));
}
Err(e) if is_blob_store_not_found(&e) => {
drop(root_read);
return insert_batch_from_root(bm, root_pin, items);
Expand Down Expand Up @@ -544,6 +620,13 @@ fn try_insert_batch_from_route(
}
let child_pin = match bm.pin(route.child_guid) {
Ok(pin) => pin,
Err(e) if is_blob_store_not_found(&e) && bm.has_delete_fence(route.child_guid) => {
drop(parent_guard);
cache.invalidate(first_key, route);
return Err(stale_blob_crossing(
"stale blob crossing: insert batch route",
));
}
Err(e) if is_blob_store_not_found(&e) => {
drop(parent_guard);
cache.invalidate(first_key, route);
Expand Down Expand Up @@ -803,7 +886,19 @@ fn cross_and_insert(
blob_hops: &mut u64,
max_cross_blob_depth: &mut usize,
) -> Result<InsertOutcome> {
let child_pin = bm.pin(crossing.child_guid)?;
let child_pin = match bm.pin(crossing.child_guid) {
Ok(pin) => pin,
Err(e) if is_blob_store_not_found(&e) && bm.has_delete_fence(crossing.child_guid) => {
drop(parent_guard);
if parent_dirty {
bm.mark_dirty_cached(current_guid, seq, current_entry);
}
return Err(stale_blob_crossing(
"stale blob crossing: insert deep child",
));
}
Err(e) => return Err(e.with_blob_guid(crossing.child_guid)),
};
child_pin.prefetch_header();
let child_guard = child_pin.write();

Expand Down
Loading