diff --git a/Cargo.lock b/Cargo.lock index e296c3aebdb..2108bed8265 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1617,8 +1617,10 @@ dependencies = [ name = "dash-async" version = "4.0.0-rc.2" dependencies = [ + "futures", "thiserror 2.0.18", "tokio", + "tokio-util", "tracing", ] @@ -5141,6 +5143,7 @@ dependencies = [ "async-trait", "bimap", "bs58", + "dash-async", "dash-sdk", "dash-spv", "dashcore", diff --git a/packages/rs-dash-async/Cargo.toml b/packages/rs-dash-async/Cargo.toml index 26e2c8fdeb9..a567cc60ae5 100644 --- a/packages/rs-dash-async/Cargo.toml +++ b/packages/rs-dash-async/Cargo.toml @@ -7,12 +7,20 @@ authors = ["Dash Core Team"] license = "MIT" description = "Async-sync bridging utilities for Dash Platform" +[features] +# Exposes cross-crate test seams (e.g. `ThreadRegistry::park_orphan_for_test`) +# so downstream crates can drive registry regression tests without shipping +# the seam in their production builds. +test-util = [] + [dependencies] thiserror = "2.0" tracing = "0.1.41" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.40", features = ["rt", "rt-multi-thread", "time", "net"] } +tokio-util = { version = "0.7.12" } +futures = { version = "0.3.30" } [dev-dependencies] -tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync"] } +tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync", "time"] } diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs new file mode 100644 index 00000000000..ecdab75acba --- /dev/null +++ b/packages/rs-dash-async/src/atomic.rs @@ -0,0 +1,64 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + +/// RAII guard that clears an [`AtomicBool`] flag to `false` on drop. +/// +/// Callers set the flag to `true` before constructing the guard (typically +/// via a `compare_exchange`); the guard resets it on every exit path, +/// including panics, so a panicked holder can never leave the flag wedged. +/// +/// **Panic-strategy caveat:** the clear-on-panic guarantee relies on +/// destructors running while the stack unwinds, so it holds under +/// `panic = "unwind"` (the default). Under `panic = "abort"` — e.g. the +/// iOS release profiles — a panic aborts the process immediately and no +/// `Drop` runs; there is simply no "after" left for the flag to gate. +#[must_use = "AtomicFlagGuard clears the flag on drop; binding to `_` or using as a statement drops it immediately"] +pub struct AtomicFlagGuard<'a>(&'a AtomicBool); + +impl<'a> AtomicFlagGuard<'a> { + /// Wrap `flag`. Does **not** set it to `true` — the caller is + /// responsible for doing that before constructing the guard. + pub fn new(flag: &'a AtomicBool) -> Self { + Self(flag) + } +} + +impl Drop for AtomicFlagGuard<'_> { + fn drop(&mut self) { + self.0.store(false, Ordering::Release); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::panic::{catch_unwind, AssertUnwindSafe}; + + /// A guard constructed over a `true` flag holds it while in scope and + /// clears it to `false` on a normal scope exit. + #[test] + fn clears_flag_on_normal_drop() { + let flag = AtomicBool::new(true); + { + let _guard = AtomicFlagGuard::new(&flag); + assert!(flag.load(Ordering::Acquire), "flag stays set while held"); + } + assert!(!flag.load(Ordering::Acquire), "flag cleared on drop"); + } + + /// The clear also runs while unwinding a panic — the load-bearing + /// property the sync coordinators lean on so a panicked pass can't + /// leave `is_syncing` latched and wedge `quiesce()`'s drain. + #[test] + fn clears_flag_while_unwinding_panic() { + let flag = AtomicBool::new(true); + let result = catch_unwind(AssertUnwindSafe(|| { + let _guard = AtomicFlagGuard::new(&flag); + panic!("boom while holding the guard"); + })); + assert!(result.is_err(), "the panic propagated out of catch_unwind"); + assert!( + !flag.load(Ordering::Acquire), + "Drop ran during unwinding and cleared the flag" + ); + } +} diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs index 0ef7785253b..1ce08203599 100644 --- a/packages/rs-dash-async/src/lib.rs +++ b/packages/rs-dash-async/src/lib.rs @@ -2,7 +2,20 @@ //! //! Provides [`block_on`] -- a function that bridges async futures into sync code, //! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread). +//! +//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets, +//! and [`ThreadRegistry`] — a shared lifecycle engine for background OS-thread / tokio-task +//! workers (start, cancel, weight-ordered quiesce + join, orphan reap). +mod atomic; mod block_on; +#[cfg(not(target_arch = "wasm32"))] +mod registry; +pub use atomic::AtomicFlagGuard; pub use block_on::{block_on, AsyncError}; +#[cfg(not(target_arch = "wasm32"))] +pub use registry::{ + DrainHook, RegistryKey, ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig, + WorkerStatus, DEFAULT_JOIN_BUDGET, DEFAULT_REAP_BACKSTOP, +}; diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs new file mode 100644 index 00000000000..01795a5e73f --- /dev/null +++ b/packages/rs-dash-async/src/registry.rs @@ -0,0 +1,1882 @@ +//! Shared lifecycle engine for background workers (`ThreadRegistry`). +//! +//! Centralizes the dangerous, previously-triplicated 80% of a background +//! worker's lifecycle — the generation-match exit epilogue, the +//! reap-or-park of a restarted worker's prior thread, and the orphan +//! drain — into one tested place, while deliberately leaving the +//! domain-specific 20% (the "is a pass in flight?" drain barrier) to the +//! consumer as a [`DrainHook`]. +//! +//! Two worker kinds are supported: +//! - [`start_thread`](ThreadRegistry::start_thread) — a dedicated OS +//! thread, for loops that `block_on` `!Send` futures internally (the +//! `!Send` value never crosses the spawn boundary; the body itself is +//! `Send`). +//! - [`start_task`](ThreadRegistry::start_task) — a tokio task, for +//! `Send` futures. +//! +//! # Safety invariants +//! +//! - **A timed-out or dropped quiesce never detaches a live thread.** +//! Every join path takes `&self`; the live join handle stays owned by +//! the slot and is never moved into a cancellable future's frame. A +//! dropped/timed-out [`quiesce`](ThreadRegistry::quiesce) therefore +//! cannot drop-and-detach the handle — on timeout (or on an external +//! drop) the handle is deterministically re-parked into the orphan +//! list, and the slot reports [`WorkerStatus::Timeout`], never a clean +//! `NotRunning`. +//! - **A store wipe cannot race a parked prior-generation thread.** +//! Orphans live in the registry and +//! [`any_alive_for`](ThreadRegistry::any_alive_for) is the key-scoped +//! liveness gate spanning a key's live slot **and** its parked orphans +//! (with [`any_alive`](ThreadRegistry::any_alive) the registry-wide +//! variant). A store-wiping path scoped to one worker consults the +//! key-scoped gate, so a parked still-live thread blocks the wipe of its +//! own worker's store without an unrelated worker blocking it. + +use std::collections::BTreeMap; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use futures::future::FutureExt; +use tokio::runtime::RuntimeFlavor; +use tokio_util::sync::CancellationToken; + +// --------------------------------------------------------------------- +// Key & weight +// --------------------------------------------------------------------- + +/// Worker identity. A wallet supplies a fixed enum; rs-dapi a generated +/// id. Blanket-implemented — consumers just derive the listed bounds on +/// their own key type. +pub trait RegistryKey: Copy + Ord + Eq + std::fmt::Debug + Send + Sync + 'static {} +impl RegistryKey for T {} + +/// Teardown order. Lower weights drain first; equal weights drain +/// concurrently within a tier. Default `0`. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default)] +pub struct ShutdownWeight(pub i32); + +// --------------------------------------------------------------------- +// Status +// --------------------------------------------------------------------- + +/// Terminal status of one worker. Its variant set and payloads correspond +/// 1:1 to the wallet's `CoordinatorThreadStatus`, which is built from this +/// via an exhaustive by-name `From` so the FFI surface stays stable. The +/// two enums keep their own declaration order and carry no `#[repr]`, so +/// the mapping is a match, never a layout-compatible cast. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum WorkerStatus { + /// The loop exited and its thread/task joined cleanly. + Ok, + /// A tokio task ended for a non-panic, non-clean reason (cancelled / + /// aborted at the runtime level). Carries a reason when available. + /// Only the `Task` kind can produce this; an OS thread never does. + Stopped(Option), + /// The thread/task panicked; carries the best-effort panic message. + Panicked(String), + /// The managed join exceeded this worker's `join_budget`. The live + /// handle was re-parked into the orphan list — UAF-safe, non-clean. + Timeout, + /// A parked orphan was still alive after the reap grace — UAF-safe, + /// non-clean. + Detached, + /// No thread/task was running to join — never started, or already + /// joined by a prior teardown. + NotRunning, + /// Infrastructural join failure that is neither a timeout nor a + /// panic (unreachable in normal operation). + Error(String), +} + +impl WorkerStatus { + /// `true` only for a fully clean outcome: joined normally (`Ok`) or + /// never ran (`NotRunning`). + pub fn is_clean(&self) -> bool { + matches!(self, Self::Ok | Self::NotRunning) + } +} + +/// Aggregate result of [`ThreadRegistry::shutdown`]. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShutdownReport { + /// Per-worker terminal status, keyed by worker id. + pub per_worker: BTreeMap, + /// Number of parked orphans still alive at the reap deadline. + pub detached: usize, +} + +impl ShutdownReport { + /// `true` only when every per-worker status is clean and no orphan + /// survived the reap. + pub fn all_clean(&self) -> bool { + self.detached == 0 && self.per_worker.values().all(WorkerStatus::is_clean) + } +} + +// --------------------------------------------------------------------- +// Per-worker registration options +// --------------------------------------------------------------------- + +/// Async drain hook the registry awaits **before** cancelling a worker, +/// in weight order. The domain barrier (raise a `quiescing` gate, wait +/// out an in-flight pass) lives here, supplied by the consumer — the +/// registry never owns domain semantics. +/// +/// The captured state must be `Send + Sync`; a `!Send` capture does not +/// compile as a `DrainHook`. The fence is anchored to `E0277` (unsatisfied +/// `Send` bound) so the test cannot pass vacuously on some unrelated +/// compile error: +/// +/// ```compile_fail,E0277 +/// use std::rc::Rc; +/// use std::sync::Arc; +/// use dash_async::DrainHook; +/// let rc = Rc::new(42u32); // !Send +/// let _hook: DrainHook = +/// Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) }); +/// ``` +pub type DrainHook = Arc Pin + Send>> + Send + Sync>; + +/// Default managed-join budget when a [`WorkerConfig`] does not override +/// it. Pinned so an accidental change surfaces in tests. +pub const DEFAULT_JOIN_BUDGET: Duration = Duration::from_secs(30); + +/// Default orphan reap backstop (start-time reap and shutdown grace). +pub const DEFAULT_REAP_BACKSTOP: Duration = Duration::from_secs(1); + +/// Per-worker registration options. +pub struct WorkerConfig { + /// Teardown tier; lower drains first, equal weights concurrently. + pub weight: ShutdownWeight, + /// Optional drain barrier awaited before cancellation. + pub drain: Option, + /// Managed-join timeout for this worker. + pub join_budget: Duration, +} + +impl Default for WorkerConfig { + fn default() -> Self { + Self { + weight: ShutdownWeight::default(), + drain: None, + join_budget: DEFAULT_JOIN_BUDGET, + } + } +} + +impl std::fmt::Debug for WorkerConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // `drain` is a boxed closure with no useful `Debug`; render its + // presence instead. + f.debug_struct("WorkerConfig") + .field("weight", &self.weight) + .field("drain", &self.drain.is_some()) + .field("join_budget", &self.join_budget) + .finish() + } +} + +// --------------------------------------------------------------------- +// Internal handle + slot state +// --------------------------------------------------------------------- + +/// A live worker's join handle. Kept owned by its slot so a cancellable +/// caller can never move it into a future frame and detach it on drop. +enum WorkerHandle { + OsThread(std::thread::JoinHandle<()>), + Task(tokio::task::JoinHandle<()>), +} + +impl WorkerHandle { + fn is_finished(&self) -> bool { + match self { + WorkerHandle::OsThread(h) => h.is_finished(), + WorkerHandle::Task(h) => h.is_finished(), + } + } + + /// Classify a **finished** handle. Kind-dispatched (R3): an OS thread + /// yields only `Ok` / `Panicked`; a task can also yield `Stopped` + /// (cancelled / aborted at the runtime level). + fn classify(self) -> WorkerStatus { + match self { + WorkerHandle::OsThread(j) => match j.join() { + Ok(()) => WorkerStatus::Ok, + Err(payload) => WorkerStatus::Panicked(panic_message(payload)), + }, + WorkerHandle::Task(j) => match j.now_or_never() { + Some(Ok(())) => WorkerStatus::Ok, + Some(Err(e)) if e.is_panic() => { + WorkerStatus::Panicked(panic_message(e.into_panic())) + } + Some(Err(e)) => WorkerStatus::Stopped(Some(e.to_string())), + // Only ever called on a finished handle, so a finished + // task is always ready; this arm is defensive. + None => WorkerStatus::Error("task handle not ready at join".to_string()), + }, + } + } +} + +/// Best-effort extraction of a panic message (`&str` / `String` cases). +fn panic_message(payload: Box) -> String { + if let Some(s) = payload.downcast_ref::<&str>() { + (*s).to_string() + } else if let Some(s) = payload.downcast_ref::() { + s.clone() + } else { + "".to_string() + } +} + +/// One key's slot. The entry is created on first start and never removed, +/// so `generation` stays monotonic across the key's whole lifetime — a +/// parked prior-generation thread can therefore always tell that its +/// generation is stale. `cancel.is_some()` is the running indicator; +/// `handle` is the join handle, reaped by the next start or by quiesce. +struct SlotState { + generation: u64, + cancel: Option, + handle: Option, + weight: ShutdownWeight, + drain: Option, + join_budget: Duration, +} + +impl SlotState { + fn dormant() -> Self { + Self { + generation: 0, + cancel: None, + handle: None, + weight: ShutdownWeight::default(), + drain: None, + join_budget: DEFAULT_JOIN_BUDGET, + } + } +} + +// --------------------------------------------------------------------- +// The registry +// --------------------------------------------------------------------- + +/// Shared lifecycle engine for background workers. See the module docs. +/// +/// Parked orphans carry their originating key so a store-wiping path for +/// one worker can gate on [`any_alive_for`](Self::any_alive_for) without +/// being blocked by an unrelated worker still legitimately running. +pub struct ThreadRegistry { + slots: Mutex>, + orphans: Mutex>, + reap_backstop: Duration, + /// One-way teardown latch. [`shutdown`](Self::shutdown) sets it under + /// the slot lock before snapshotting tiers; `start_thread`/`start_task` + /// honour it under the same lock and refuse to register a new worker + /// once teardown has begun, so a start racing shutdown can never leave + /// an un-joined worker behind. + closing: AtomicBool, + /// Test seam: when set, the next OS-thread spawn returns an injected + /// `io::Error` instead of really spawning, so the spawn-failure + /// rollback path can be exercised deterministically. + #[cfg(test)] + force_spawn_failure: AtomicBool, +} + +impl std::fmt::Debug for ThreadRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ThreadRegistry") + .field("live_slots", &self.lock_slots().len()) + .field("orphans", &self.lock_orphans().len()) + .field("reap_backstop", &self.reap_backstop) + .field("closing", &self.closing.load(Ordering::Acquire)) + .finish() + } +} + +impl ThreadRegistry { + /// New registry with the default reap backstop ([`DEFAULT_REAP_BACKSTOP`]). + pub fn new() -> Arc { + Self::with_reap_backstop(DEFAULT_REAP_BACKSTOP) + } + + /// New registry with an explicit orphan reap backstop (the wallet + /// uses 1s — the same grace separates "finishing" from "wedged"). + pub fn with_reap_backstop(backstop: Duration) -> Arc { + Arc::new(Self { + slots: Mutex::new(BTreeMap::new()), + orphans: Mutex::new(Vec::new()), + reap_backstop: backstop, + closing: AtomicBool::new(false), + #[cfg(test)] + force_spawn_failure: AtomicBool::new(false), + }) + } + + /// Start an OS-thread worker for `!Send` loops. `body` runs on a + /// fresh `std::thread` and may build and `block_on` `!Send` futures + /// internally — the `!Send` value never crosses the spawn boundary + /// (`body` itself is `Send`). Starting a key that already has a live + /// worker is a no-op; a key whose prior thread has not been reaped is + /// reaped-or-parked first (the restart-reap path). After + /// [`shutdown`](Self::shutdown) has begun the call is also a no-op (the + /// one-way closing latch). + /// + /// **Requires a multi-thread runtime**: the worker drives its loop + /// via `Handle::block_on` and needs the shared timer/IO driver. + /// + /// # Panics + /// + /// Panics if called outside a multi-thread Tokio runtime (see + /// [`shutdown`](Self::shutdown)). It does **not** panic on thread-spawn + /// failure: a failed spawn (e.g. the OS thread-count limit) is rolled + /// back — the prior handle is re-installed rather than detached and the + /// slot returns to not-running — and the call simply does not start a + /// worker. + pub fn start_thread(self: &Arc, key: K, cfg: WorkerConfig, body: F) + where + F: FnOnce(CancellationToken) + Send + 'static, + { + Self::assert_multi_thread("start_thread"); + let prior_tid = { + let mut slots = self.lock_slots(); + // One-way teardown latch: refuse new workers once shutdown has + // begun, under the same lock shutdown snapshots tiers with. + if self.closing.load(Ordering::Acquire) { + return; + } + let slot = slots.entry(key).or_insert_with(SlotState::dormant); + if slot.cancel.is_some() { + return; + } + // Take the prior handle to reap below; bump generation and + // install the new token under this one lock so a prior + // thread's epilogue observes the post-swap generation. + let prior = slot.handle.take(); + // Snapshot the slot's pre-start config so a spawn failure can roll + // the slot back to exactly its prior state: a re-installed prior + // worker must keep its OWN teardown config, not inherit the failed + // start's weight/drain/join_budget. Generation is rolled back too — + // the bump is only ever observed under this lock and a failed start + // spawns no thread to reference it, so the rollback is net-zero and + // the externally-visible generation stays monotonic. + let prev_generation = slot.generation; + let prev_weight = slot.weight; + let prev_join_budget = slot.join_budget; + let prev_drain = slot.drain.take(); + let token = CancellationToken::new(); + slot.cancel = Some(token.clone()); + slot.generation += 1; + let my_gen = slot.generation; + slot.weight = cfg.weight; + slot.drain = cfg.drain; + slot.join_budget = cfg.join_budget; + + let reg = Arc::clone(self); + let body_token = token; + // Build the epilogue drop-guard INSIDE the worker closure, not + // here: on a spawn failure the closure is dropped while we still + // hold the slot lock, and a guard constructed out here would run + // `run_epilogue` (which re-locks `slots`) on that drop and + // deadlock. Constructing it inside means it only exists once the + // thread is actually running. A panicking `body` then still + // clears this generation's running flag via the guard's Drop + // (under `panic = "unwind"`), and the panic keeps unwinding so + // the join handle still classifies as `Panicked`. + match self.spawn_os_thread(key, move || { + let _epilogue = EpilogueGuard { reg, key, my_gen }; + body(body_token); + }) { + Ok(join) => { + // Store the new handle, then park the prior into orphans — + // both still under THIS slot lock, so `shutdown`'s + // under-lock tier snapshot can never see the new slot + // without also seeing the prior accounted (R1: store handle + // -> park prior -> drop guard -> THEN bounded reap below). + // See `park_prior_locked` for the lock-order rationale; the + // bounded join stays out of the lock in `reap_parked_prior`. + slot.handle = Some(WorkerHandle::OsThread(join)); + self.park_prior_locked(key, prior) + } + Err(e) => { + // Spawn failed (e.g. EAGAIN at the OS thread ceiling). Roll + // the slot back to exactly its pre-start state: clear the + // running flag, re-install the prior handle (never + // detached), and restore the prior teardown config + + // generation so nothing of the failed start lingers. The + // re-installed prior keeps its own weight/drain/join_budget + // for a later quiesce/shutdown, and generation returns to + // its pre-bump value (the bump was never observed outside + // this lock and spawned no thread). Nothing was parked, so + // there is no prior to reap below. + tracing::error!( + ?key, + error = %e, + "failed to spawn registry worker thread; rolling back \ + start (prior handle re-installed, not detached)" + ); + slot.cancel = None; + slot.handle = prior; + slot.generation = prev_generation; + slot.weight = prev_weight; + slot.drain = prev_drain; + slot.join_budget = prev_join_budget; + None + } + } + }; + + // The prior thread was cancellation-signalled by a preceding + // cancel(); with the slot lock released its epilogue completes + // promptly and the join lands in milliseconds — `reap_parked_prior` + // then removes it from orphans and joins it. The backstop fires only + // on a genuine wedge, in which case the still-live handle is left + // parked (not dropped) so teardown can account for it. + self.reap_parked_prior(key, prior_tid); + } + + /// Start a tokio-task worker for `Send` futures. Same restart-reap + /// semantics as [`start_thread`](Self::start_thread); does not require + /// a multi-thread runtime. + /// + // TODO(rs-dapi-adoption): runtime-flavor assert is asymmetric. + // `start_thread` and `shutdown` assert a multi-thread runtime (the + // OS-thread `block_on` needs the shared reactor), but `start_task` does + // not — a task only needs a runtime handle. So a TASK-ONLY consumer + // (rs-dapi, no `start_thread`) can register and run workers on a + // `current_thread` runtime, then panic LATE when it finally calls + // `shutdown()`. The wallet (which always uses `start_thread`) trips the + // assert at start, so it is unaffected. Fix when rs-dapi adopts the + // registry: either drop the assert from `shutdown` for all-task + // registries (track whether any OS-thread worker was ever started) or + // assert in `start_task` too and require multi-thread everywhere. + /// + /// # Panics + /// + /// Panics if called outside a Tokio runtime context (`tokio::spawn`'s + /// own precondition). After [`shutdown`](Self::shutdown) has begun the + /// call is a no-op (the one-way closing latch). + pub fn start_task(self: &Arc, key: K, cfg: WorkerConfig, body: F) + where + F: FnOnce(CancellationToken) -> Fut + Send + 'static, + Fut: Future + Send + 'static, + { + { + let mut slots = self.lock_slots(); + // One-way teardown latch — see `start_thread`. + if self.closing.load(Ordering::Acquire) { + return; + } + let slot = slots.entry(key).or_insert_with(SlotState::dormant); + if slot.cancel.is_some() { + return; + } + let prior = slot.handle.take(); + let token = CancellationToken::new(); + slot.cancel = Some(token.clone()); + slot.generation += 1; + let my_gen = slot.generation; + slot.weight = cfg.weight; + slot.drain = cfg.drain; + slot.join_budget = cfg.join_budget; + + let reg = Arc::clone(self); + let body_token = token; + // Drop-guard epilogue, same rationale as `start_thread`: a task + // whose future panics still clears its running flag via the + // guard's Drop during unwind. + let join = tokio::spawn(async move { + let _epilogue = EpilogueGuard { reg, key, my_gen }; + body(body_token).await; + }); + slot.handle = Some(WorkerHandle::Task(join)); + // Park the prior UNDER this slot lock, same rationale as + // `start_thread`: it keeps `shutdown`'s under-lock tier snapshot + // from ever missing the prior. A task cannot be joined + // synchronously, so there is no bounded reap here — a live prior + // is parked for the async orphan reap (`reap_orphans` / + // `shutdown`) and a finished one is dropped. The returned thread + // id is unused: a task prior has none, and a (mixed-usage) + // OS-thread prior is likewise left to the async reap rather than + // spun on synchronously from this (possibly async) caller. + let _ = self.park_prior_locked(key, prior); + } + } + + /// Whether a worker is currently registered and running for `key`. + pub fn is_running(&self, key: K) -> bool { + self.lock_slots() + .get(&key) + .map(|s| s.cancel.is_some()) + .unwrap_or(false) + } + + /// Signal-only cancellation of one worker. + pub fn cancel(&self, key: K) { + if let Some(slot) = self.lock_slots().get_mut(&key) { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } + } + } + + /// Signal-only cancellation of every registered worker. + pub fn cancel_all(&self) { + for slot in self.lock_slots().values_mut() { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } + } + } + + /// Await this worker's drain hook, cancel it, then join within its + /// budget. The live handle is owned by the slot and is **never** moved + /// into this future's frame, so a dropped/timed-out call cannot detach + /// it; on the managed timeout — or if this future is dropped + /// mid-poll — the handle is re-parked into the orphan list. + pub async fn quiesce(&self, key: K) -> WorkerStatus { + // Snapshot the drain hook + budget + generation, and bail early if + // nothing is registered for this key. The generation is the anchor + // for the supersede guard below. + let (drain, budget, my_gen) = { + let slots = self.lock_slots(); + match slots.get(&key) { + Some(s) if s.cancel.is_some() || s.handle.is_some() => { + (s.drain.clone(), s.join_budget, s.generation) + } + _ => return WorkerStatus::NotRunning, + } + }; + + // R2: gate-before-cancel — fully await the drain hook before the + // cancel signal is observed. + if let Some(drain) = drain { + drain().await; + } + + // Signal-only cancel — but only if this is still the generation we + // snapshotted. A concurrent restart (which can proceed the instant + // we take `cancel` below) bumps the generation; taking the new + // token here would silently un-track the fresh worker. + if let Some(slot) = self.lock_slots().get_mut(&key) { + if slot.generation == my_gen { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } + } + } + + // Poll-join within budget. The re-park guard moves the slot's + // still-live handle into orphans if this future is dropped before + // the loop finishes — the handle is never owned by this frame. Both + // the guard and the loop are generation-scoped, so a concurrent + // same-key restart's live handle is never parked or classified by + // the quiesce that cancelled the *prior* generation. + let _repark = Repark { + reg: self, + key, + my_gen, + }; + let deadline = Instant::now() + budget; + loop { + enum Step { + Classify(WorkerHandle), + Park(WorkerHandle), + NotRunning, + Superseded, + Wait, + } + let step = { + let mut slots = self.lock_slots(); + match slots.get_mut(&key) { + None => Step::NotRunning, + // A restart replaced the generation we were draining: + // the handle now in the slot belongs to a newer, live + // worker the restart owns. Leave it untouched. + Some(slot) if slot.generation != my_gen => Step::Superseded, + Some(slot) => match slot.handle.take_if(|h| h.is_finished()) { + Some(h) => Step::Classify(h), + None if slot.handle.is_none() => Step::NotRunning, + None if Instant::now() >= deadline => { + Step::Park(slot.handle.take().expect("handle present")) + } + None => Step::Wait, + }, + } + }; + match step { + Step::Classify(h) => return h.classify(), + Step::Park(h) => { + self.lock_orphans().push((key, h)); + return WorkerStatus::Timeout; + } + Step::NotRunning | Step::Superseded => return WorkerStatus::NotRunning, + Step::Wait => tokio::time::sleep(Duration::from_millis(5)).await, + } + } + } + + /// Is any registered worker **or** parked orphan still alive across + /// the whole registry? + pub fn any_alive(&self) -> bool { + { + let slots = self.lock_slots(); + for slot in slots.values() { + if slot_alive(slot) { + return true; + } + } + } + self.lock_orphans().iter().any(|(_, h)| !h.is_finished()) + } + + /// Is the worker for `key` — its live slot **or** any orphan parked + /// under that key — still alive? A store-wiping path scoped to one + /// worker must gate on this (rather than the registry-wide + /// [`any_alive`](Self::any_alive)) so an unrelated worker that is + /// legitimately running does not block the wipe. + pub fn any_alive_for(&self, key: K) -> bool { + if let Some(slot) = self.lock_slots().get(&key) { + if slot_alive(slot) { + return true; + } + } + self.lock_orphans() + .iter() + .any(|(k, h)| *k == key && !h.is_finished()) + } + + /// Reap parked orphans with a short grace; survivors are re-parked and + /// reported as [`WorkerStatus::Detached`] (idempotent retry). + pub async fn reap_orphans(&self, grace: Duration) -> WorkerStatus { + self.reap_orphans_impl(grace).await.0 + } + + /// Weight-ordered teardown: ascending tier by tier, each worker's + /// (drain-hook -> cancel -> join) run concurrently within a tier; + /// orphan reap runs last. **Requires a multi-thread runtime.** + /// + /// Latches the registry closed first (under the slot lock, before the + /// tier snapshot), so any `start_thread`/`start_task` racing teardown is + /// either already in the snapshot or refused outright — shutdown is a + /// one-way door and never leaves a worker un-joined. Idempotent. + /// + /// # Panics + /// + /// Panics if called outside a multi-thread Tokio runtime: an OS-thread + /// worker drives its loop via `Handle::block_on` and needs the shared + /// timer/IO driver, so a `current_thread` runtime would deadlock the + /// join. + pub async fn shutdown(&self) -> ShutdownReport { + // TODO(rs-dapi-adoption): see `start_task` — this assert is the late + // panic point for a task-only consumer on a current_thread runtime. + Self::assert_multi_thread("shutdown"); + + // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in + // ascending weight order, giving the lower-first drain. Latch the + // registry closed under the same lock and before the snapshot so a + // racing start is serialized: it either landed before this lock (and + // is in the snapshot) or sees `closing` and bails. + let tiers: BTreeMap> = { + let slots = self.lock_slots(); + self.closing.store(true, Ordering::Release); + let mut tiers: BTreeMap> = BTreeMap::new(); + for (key, slot) in slots.iter() { + tiers.entry(slot.weight).or_default().push(*key); + } + tiers + }; + + let mut per_worker = BTreeMap::new(); + for (_weight, keys) in tiers { + // Drain every worker in this tier concurrently: each + // quiesce() drives its own drain-hook -> cancel -> join, and + // `join_all` polls them on one task so their drain hooks + // interleave (equal-weight concurrency). + let drained = keys + .into_iter() + .map(|key| async move { (key, self.quiesce(key).await) }); + for (key, status) in futures::future::join_all(drained).await { + per_worker.insert(key, status); + } + } + + // Account for parked orphans last. + let (_status, detached) = self.reap_orphans_impl(self.reap_backstop).await; + ShutdownReport { + per_worker, + detached, + } + } + + // ----------------------------------------------------------------- + // Internal helpers + // ----------------------------------------------------------------- + + fn lock_slots(&self) -> std::sync::MutexGuard<'_, BTreeMap> { + self.slots.lock().unwrap_or_else(|e| e.into_inner()) + } + + fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<(K, WorkerHandle)>> { + self.orphans.lock().unwrap_or_else(|e| e.into_inner()) + } + + fn assert_multi_thread(ctx: &str) { + assert!( + matches!( + tokio::runtime::Handle::current().runtime_flavor(), + RuntimeFlavor::MultiThread + ), + "ThreadRegistry::{ctx}() requires a multi-thread Tokio runtime: an \ + OS-thread worker drives its loop via Handle::block_on and needs the \ + runtime's timer/IO driver, but a current_thread runtime can only \ + drive one block_on at a time" + ); + } + + /// Gen-gated exit epilogue, run on the worker after its body returns + /// (or unwinds): clear this slot's running flag only if a newer start + /// has not since installed a replacement. + fn run_epilogue(&self, key: K, my_gen: u64) { + if let Some(slot) = self.lock_slots().get_mut(&key) { + if slot.generation == my_gen { + slot.cancel = None; + } + } + } + + /// Spawn the named OS worker thread, surfacing a spawn failure as + /// `io::Result` instead of panicking so the caller can roll back. The + /// `#[cfg(test)]` seam forces a synthetic failure to exercise that path. + fn spawn_os_thread(&self, key: K, closure: C) -> std::io::Result> + where + C: FnOnce() + Send + 'static, + { + #[cfg(test)] + if self.force_spawn_failure.load(Ordering::Acquire) { + return Err(std::io::Error::other("forced spawn failure (test seam)")); + } + std::thread::Builder::new() + .name(format!("tr-worker-{key:?}")) + .spawn(closure) + } + + /// Park a restarted key's prior handle into orphans. **Must be called + /// while the slot lock is held** — the resulting `slots`->`orphans` + /// nesting is the only such nesting in this module and is deadlock-free + /// (no path ever acquires `slots` while holding `orphans`, so there is no + /// cycle). Parking the prior here, rather than after the slot lock is + /// released, is what lets `shutdown`'s under-lock tier snapshot never + /// miss it: the take-prior and the park-prior are then atomic from + /// `shutdown`'s view. A finished task is dropped (detaching a finished + /// task is a no-op); a live task and any OS thread are parked. Returns + /// the parked OS thread's id so [`reap_parked_prior`](Self::reap_parked_prior) + /// can find and bounded-join it; tasks (reaped asynchronously) return + /// `None`. + fn park_prior_locked( + &self, + key: K, + prior: Option, + ) -> Option { + match prior { + Some(WorkerHandle::OsThread(h)) => { + let tid = h.thread().id(); + self.lock_orphans().push((key, WorkerHandle::OsThread(h))); + Some(tid) + } + Some(task) => { + if !task.is_finished() { + self.lock_orphans().push((key, task)); + } + None + } + None => None, + } + } + + /// Bounded reap of an OS-thread prior that [`park_prior_locked`](Self::park_prior_locked) + /// parked under `key` at restart. Must be called with no registry lock + /// held (it spins synchronously). The instant the parked thread finishes + /// it is removed from orphans and joined — the join itself stays OUT of + /// any lock (only the bookkeeping is taken under the orphans lock). A + /// genuine wedge past the reap backstop is left parked, so teardown can + /// still account for it. No-op when no OS thread was parked (`None`), or + /// when the orphan was already taken by a concurrent reaper / `shutdown` + /// (which then owns the join). + fn reap_parked_prior(&self, key: K, prior_tid: Option) { + let Some(tid) = prior_tid else { + return; + }; + let deadline = Instant::now() + self.reap_backstop; + loop { + // Bookkeeping under the orphans lock only: locate our parked + // prior by thread id and, once it has finished, take it out to + // join after the lock is released. Never hold the lock across the + // join. + let taken = { + let mut orphans = self.lock_orphans(); + let pos = orphans.iter().position(|(k, h)| { + *k == key && matches!(h, WorkerHandle::OsThread(t) if t.thread().id() == tid) + }); + match pos { + // Already taken by a concurrent reaper / shutdown: it owns + // the join now. + None => return, + Some(i) if orphans[i].1.is_finished() => Some(orphans.remove(i).1), + Some(_) if Instant::now() >= deadline => { + tracing::warn!( + ?key, + backstop = ?self.reap_backstop, + "prior worker thread did not finish within the reap \ + backstop after cancellation; leaving it parked as an \ + orphan for teardown to join rather than detaching it" + ); + return; + } + Some(_) => None, + } + }; + if let Some(WorkerHandle::OsThread(h)) = taken { + let _ = h.join(); + return; + } + std::thread::sleep(Duration::from_millis(5)); + } + } + + /// Drain the orphan list, polling until `grace`. Returns the terminal + /// status and the number of survivors re-parked for an idempotent + /// retry. + async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) { + let mut pending: Vec<(K, WorkerHandle)> = { + let mut guard = self.lock_orphans(); + std::mem::take(&mut *guard) + }; + if pending.is_empty() { + return (WorkerStatus::Ok, 0); + } + + let deadline = Instant::now() + grace; + // Keep the first non-clean terminal status; a live survivor still + // takes precedence at the deadline. + let mut non_clean: Option = None; + loop { + let mut still_live = Vec::with_capacity(pending.len()); + for (key, handle) in pending.drain(..) { + if handle.is_finished() { + let status = handle.classify(); + if !status.is_clean() { + non_clean.get_or_insert(status); + } + } else { + still_live.push((key, handle)); + } + } + pending = still_live; + + if pending.is_empty() { + return (non_clean.unwrap_or(WorkerStatus::Ok), 0); + } + if Instant::now() >= deadline { + let survivors = pending.len(); + self.lock_orphans().extend(pending); + return (WorkerStatus::Detached, survivors); + } + tokio::time::sleep(Duration::from_millis(5)).await; + } + } + + /// Test-only seam: park a raw thread handle as an orphan under `key`. + /// Used by cross-crate regression tests (e.g. the wallet's F2 gate) + /// that must inject a wedged prior-generation thread without driving + /// the full restart-reap path. Feature-gated behind `test-util` so it + /// never ships in a production build of a downstream consumer. + #[cfg(any(test, feature = "test-util"))] + #[doc(hidden)] + pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) { + self.lock_orphans() + .push((key, WorkerHandle::OsThread(handle))); + } +} + +/// `true` if a slot is running or holds an unfinished handle. +fn slot_alive(slot: &SlotState) -> bool { + slot.cancel.is_some() || slot.handle.as_ref().is_some_and(|h| !h.is_finished()) +} + +/// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future +/// is dropped before it finishes (e.g. an outer timeout fires), this moves +/// the slot's still-live handle into the orphan list instead of letting it +/// be dropped-and-detached. On normal completion the handle has already +/// been taken from the slot, so this is a no-op. +/// +/// Generation-scoped: it only re-parks the handle if the slot still holds +/// the generation `quiesce` was draining. A concurrent same-key restart +/// bumps the generation and installs its own live handle; this guard leaves +/// that fresh handle alone. +struct Repark<'a, K: RegistryKey> { + reg: &'a ThreadRegistry, + key: K, + my_gen: u64, +} + +impl Drop for Repark<'_, K> { + fn drop(&mut self) { + // Take the handle under the slot lock, release it, then push to + // orphans. This path holds only one lock at a time; the single + // sanctioned nesting in the module is `slots`->`orphans` in + // `park_prior_locked`, and nothing ever takes `slots` while holding + // `orphans`, so the ordering stays acyclic. Skip if a restart + // superseded our generation (the handle is the new worker's, not + // ours). + let handle = self + .reg + .lock_slots() + .get_mut(&self.key) + .filter(|slot| slot.generation == self.my_gen) + .and_then(|slot| slot.handle.take()); + if let Some(handle) = handle { + self.reg.lock_orphans().push((self.key, handle)); + } + } +} + +/// Worker-side exit guard. Runs the generation-gated [`run_epilogue`] +/// from its `Drop`, so a worker whose `body` returns normally **or** +/// unwinds on panic still clears its running flag — `is_running()` then +/// reflects reality and `start()` can relaunch a crashed loop. +/// +/// Panic-strategy caveat (same as `AtomicFlagGuard`): the clear-on-panic +/// half relies on `Drop` running while the stack unwinds, so it holds under +/// `panic = "unwind"`. Under `panic = "abort"` a worker panic aborts the +/// process and there is no "after" to gate. +struct EpilogueGuard { + reg: Arc>, + key: K, + my_gen: u64, +} + +impl Drop for EpilogueGuard { + fn drop(&mut self) { + self.reg.run_epilogue(self.key, self.my_gen); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::panic::{catch_unwind, AssertUnwindSafe}; + use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::mpsc; + use tokio::runtime::{Builder, Handle}; + use tokio::sync::Barrier; + + type Reg = Arc>; + + /// Start an OS-thread worker that exits cleanly when cancelled. The + /// runtime handle is captured from the caller's context (the worker + /// thread is not itself a tokio worker, so it can't fetch its own). + fn start_clean(reg: &Reg, key: &'static str, cfg: WorkerConfig) { + let handle = Handle::current(); + reg.start_thread(key, cfg, move |cancel| { + handle.block_on(async move { cancel.cancelled().await }); + }); + } + + /// Body for a worker wedged in a non-yielding section: blocks on a + /// channel and ignores its cancellation token (stands in for a thread + /// stuck in a `Drop` that never observes cancel). + fn wedged_body(rx: mpsc::Receiver<()>) -> impl FnOnce(CancellationToken) + Send + 'static { + move |_cancel| { + let _ = rx.recv(); + } + } + + fn orphan_len(reg: &Reg) -> usize { + reg.lock_orphans().len() + } + + // ----- Group 1: F1 regression ------------------------------------- + + /// TC-001 — a `quiesce` whose outer future is dropped (a tiny enclosing + /// timeout) must re-park the live handle, never drop-and-detach it. The + /// slot is cleared (`is_running == false`) but the handle lives in + /// orphans and `any_alive()` stays true. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc001_quiesce_drop_reparks_handle_not_detach() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + reg.start_thread("alpha", WorkerConfig::default(), wedged_body(release_rx)); + assert!(reg.is_running("alpha")); + + // The wedged worker never observes cancel, so the internal 30s + // budget can't fire here; the tiny outer timeout drops the quiesce + // future mid-poll. A naive by-value-into-future impl would detach + // the handle (orphans empty, any_alive false); the fix re-parks it. + let result = tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await; + assert!( + result.is_err(), + "outer timeout must fire on the wedged worker" + ); + + assert!(reg.any_alive(), "re-parked handle keeps any_alive true"); + assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)"); + assert_eq!(orphan_len(®), 1, "handle was re-parked, not detached"); + assert!(!WorkerStatus::Timeout.is_clean()); + + // Release + reap: the orphan joins cleanly and liveness clears. + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert!(!reg.any_alive()); + } + + /// TC-001b — internal-budget variant: a wedged worker with a tiny + /// `join_budget` makes `quiesce` itself time out, re-park, and return + /// `Timeout` (no outer drop involved). + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc001b_quiesce_internal_budget_timeout_reparks() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let cfg = WorkerConfig { + join_budget: Duration::from_millis(50), + ..WorkerConfig::default() + }; + reg.start_thread("alpha", cfg, wedged_body(release_rx)); + + let status = reg.quiesce("alpha").await; + assert_eq!(status, WorkerStatus::Timeout); + assert_eq!(orphan_len(®), 1); + assert!(reg.any_alive()); + assert!(!reg.is_running("alpha")); + + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert!(!reg.any_alive()); + } + + /// GAP-006 — the F1 scenario via the `shutdown()` path: a wedged worker + /// with a tiny budget surfaces as `Timeout` in the report, its handle + /// is re-parked (`detached == 1`, `any_alive`), and the result is + /// non-clean — never a clean detach. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn gap006_shutdown_path_reparks_wedged_worker() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let cfg = WorkerConfig { + join_budget: Duration::from_millis(50), + ..WorkerConfig::default() + }; + reg.start_thread("alpha", cfg, wedged_body(release_rx)); + + let report = tokio::time::timeout(Duration::from_secs(10), reg.shutdown()) + .await + .expect("shutdown must complete within bound"); + assert_eq!(report.per_worker.get("alpha"), Some(&WorkerStatus::Timeout)); + assert_eq!(report.detached, 1, "wedged handle re-parked, survived reap"); + assert!(!report.all_clean()); + assert!(reg.any_alive()); + + // Cleanup. + release_tx.send(()).unwrap(); + let _ = reg.reap_orphans(Duration::from_secs(5)).await; + assert!(!reg.any_alive()); + } + + // ----- Group 3: registry unit suite ------------------------------- + + /// TC-003 — a slow prior-generation thread's epilogue must NOT clear a + /// newer generation's token. Restarting reaps the prior generation + /// fully (its epilogue runs); the new generation stays tracked. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc003_generation_match_epilogue_preserves_new_token() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "beta", WorkerConfig::default()); // gen 1 + assert!(reg.is_running("beta")); + + // Cancel gen 1, then restart. start_thread's reap joins gen 1 + // (running its gen-gated epilogue) before returning, so this is + // deterministic: if the epilogue ignored generation it would have + // cleared gen 2's token during that join. + reg.cancel("beta"); + start_clean(®, "beta", WorkerConfig::default()); // gen 2 + + assert!( + reg.is_running("beta"), + "gen-2 token must survive gen-1's epilogue" + ); + assert_eq!(reg.quiesce("beta").await, WorkerStatus::Ok); + } + + /// TC-004 — a naturally-finished prior thread is joined cleanly on + /// restart, with no parking. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc004_restart_reaps_finished_prior_without_parking() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "gamma", WorkerConfig::default()); + // Cancel so the prior exits, then restart: the reap must join it, + // not park it. + reg.cancel("gamma"); + start_clean(®, "gamma", WorkerConfig::default()); + assert_eq!(orphan_len(®), 0, "finished prior was joined, not parked"); + assert!(reg.is_running("gamma")); + assert_eq!(reg.quiesce("gamma").await, WorkerStatus::Ok); + } + + /// TC-005 — a prior thread wedged past the reap backstop is parked in + /// orphans (not dropped), then drained after release. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc005_restart_parks_wedged_prior() { + let reg = ThreadRegistry::with_reap_backstop(Duration::from_millis(100)); + let (release_tx, release_rx) = mpsc::channel::<()>(); + + // gen 1: wedged (ignores cancel). + reg.start_thread("delta", WorkerConfig::default(), wedged_body(release_rx)); + reg.cancel("delta"); + + // gen 2: clean. The restart reaps gen 1 — wedged past the 100ms + // backstop, so it is parked. Run off the runtime workers since the + // reap spins synchronously. + let reg_for_start = Arc::clone(®); + let parent = Handle::current(); + tokio::task::spawn_blocking(move || { + let handle = parent.clone(); + reg_for_start.start_thread("delta", WorkerConfig::default(), move |cancel| { + handle.block_on(async move { cancel.cancelled().await }); + }); + }) + .await + .unwrap(); + + assert_eq!(orphan_len(®), 1, "wedged prior parked, not dropped"); + assert!(reg.any_alive()); + assert!(reg.is_running("delta"), "gen-2 loop started"); + + // Release the wedged prior; reap drains it. + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert_eq!(orphan_len(®), 0); + + // Cleanup gen 2. + assert_eq!(reg.quiesce("delta").await, WorkerStatus::Ok); + } + + /// TC-006 — orphan drain: a survivor at the grace deadline is reported + /// `Detached` and re-parked; once released it reaps `Ok`. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc006_orphan_drain_detached_then_ok() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let wedged = std::thread::spawn(move || { + let _ = release_rx.recv(); + }); + reg.park_orphan_for_test("orphan", wedged); + + assert_eq!( + reg.reap_orphans(Duration::from_millis(50)).await, + WorkerStatus::Detached + ); + assert_eq!(orphan_len(®), 1, "survivor re-parked for retry"); + assert!(reg.any_alive()); + + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert_eq!(orphan_len(®), 0); + assert!(!reg.any_alive()); + } + + /// TC-007 — weight-ordered shutdown drains a lower tier before a higher + /// one. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc007_weight_ordered_shutdown_drains_low_first() { + let reg = ThreadRegistry::<&str>::new(); + let log = Arc::new(Mutex::new(Vec::<&'static str>::new())); + + let mk_hook = |tag: &'static str, log: Arc>>| -> DrainHook { + Arc::new(move || { + let log = Arc::clone(&log); + Box::pin(async move { + log.lock().unwrap().push(tag); + }) + }) + }; + + start_clean( + ®, + "w0", + WorkerConfig { + weight: ShutdownWeight(0), + drain: Some(mk_hook("w0", Arc::clone(&log))), + ..WorkerConfig::default() + }, + ); + start_clean( + ®, + "w5", + WorkerConfig { + weight: ShutdownWeight(5), + drain: Some(mk_hook("w5", Arc::clone(&log))), + ..WorkerConfig::default() + }, + ); + start_clean( + ®, + "w10", + WorkerConfig { + weight: ShutdownWeight(10), + drain: Some(mk_hook("w10", Arc::clone(&log))), + ..WorkerConfig::default() + }, + ); + + let report = reg.shutdown().await; + assert!(report.all_clean()); + + let log = log.lock().unwrap(); + let pos = |tag| log.iter().position(|t| *t == tag).unwrap(); + assert!(pos("w0") < pos("w5")); + assert!(pos("w5") < pos("w10")); + } + + /// TC-008 — equal-weight workers drain concurrently. A shared + /// `Barrier(2)` in both drain hooks would deadlock under sequential + /// draining (caught by the enclosing timeout); the event log proves + /// both arrived before either passed. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc008_equal_weight_drains_concurrently() { + let reg = ThreadRegistry::<&str>::new(); + let log = Arc::new(Mutex::new(Vec::<&'static str>::new())); + let barrier = Arc::new(Barrier::new(2)); + + let mk_hook = |arrived: &'static str, + passed: &'static str, + log: Arc>>, + barrier: Arc| + -> DrainHook { + Arc::new(move || { + let log = Arc::clone(&log); + let barrier = Arc::clone(&barrier); + Box::pin(async move { + log.lock().unwrap().push(arrived); + barrier.wait().await; + log.lock().unwrap().push(passed); + }) + }) + }; + + start_clean( + ®, + "a", + WorkerConfig { + weight: ShutdownWeight(0), + drain: Some(mk_hook( + "a_arrived", + "a_passed", + Arc::clone(&log), + Arc::clone(&barrier), + )), + ..WorkerConfig::default() + }, + ); + start_clean( + ®, + "b", + WorkerConfig { + weight: ShutdownWeight(0), + drain: Some(mk_hook( + "b_arrived", + "b_passed", + Arc::clone(&log), + Arc::clone(&barrier), + )), + ..WorkerConfig::default() + }, + ); + + let report = tokio::time::timeout(Duration::from_secs(5), reg.shutdown()) + .await + .expect("equal-weight drain must not deadlock (proves concurrency)"); + assert!(report.all_clean()); + + let log = log.lock().unwrap(); + let pos = |tag| log.iter().position(|t| *t == tag).unwrap(); + let last_arrived = pos("a_arrived").max(pos("b_arrived")); + let first_passed = pos("a_passed").min(pos("b_passed")); + assert!( + last_arrived < first_passed, + "both hooks must reach the barrier before either passes: {log:?}" + ); + } + + /// TC-009 — `any_alive()` accounts for both live slots and orphans. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc009_any_alive_spans_slots_and_orphans() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "alpha", WorkerConfig::default()); + assert!(reg.any_alive()); + + let (release_tx, release_rx) = mpsc::channel::<()>(); + let wedged = std::thread::spawn(move || { + let _ = release_rx.recv(); + }); + reg.park_orphan_for_test("orphan", wedged); + assert!(reg.any_alive()); + + assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok); + assert!( + reg.any_alive(), + "orphan still contributes after slot drains" + ); + assert!(!reg.is_running("alpha")); + + release_tx.send(()).unwrap(); + let _ = reg.reap_orphans(Duration::from_secs(2)).await; + assert!(!reg.any_alive()); + } + + /// `any_alive_for(key)` is scoped: an orphan parked under one key does + /// not make a different key look alive (the F2 gate must not be + /// blocked by unrelated workers). + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn any_alive_for_is_key_scoped() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let wedged = std::thread::spawn(move || { + let _ = release_rx.recv(); + }); + reg.park_orphan_for_test("shielded", wedged); + + // A live, unrelated worker. + start_clean(®, "identity", WorkerConfig::default()); + + assert!(reg.any_alive(), "registry-wide liveness sees both"); + assert!(reg.any_alive_for("shielded"), "shielded orphan is alive"); + assert!( + !reg.any_alive_for("address"), + "an unrelated key with no slot/orphan is not alive" + ); + + // The running 'identity' worker must not make 'shielded' look alive + // beyond its own orphan, and vice versa. + assert!(reg.any_alive_for("identity"), "running identity is alive"); + + release_tx.send(()).unwrap(); + let _ = reg.reap_orphans(Duration::from_secs(2)).await; + assert!( + !reg.any_alive_for("shielded"), + "shielded clear once its orphan is reaped" + ); + assert_eq!(reg.quiesce("identity").await, WorkerStatus::Ok); + } + + /// TC-010 — `shutdown()` panics with a documented message on a + /// current-thread runtime (R4, variant B). + #[test] + fn tc010_shutdown_asserts_multi_thread_runtime() { + let rt = Builder::new_current_thread().enable_all().build().unwrap(); + let reg = ThreadRegistry::<&str>::new(); + let result = catch_unwind(AssertUnwindSafe(|| { + rt.block_on(async { reg.shutdown().await }); + })); + let payload = result.expect_err("shutdown must panic on current_thread"); + let msg = payload + .downcast_ref::() + .map(String::as_str) + .or_else(|| payload.downcast_ref::<&str>().copied()) + .unwrap_or(""); + assert!( + msg.contains("multi-thread"), + "panic must name the runtime constraint, got: {msg}" + ); + } + + // ----- Group 4: DrainHook ordering -------------------------------- + + /// TC-011 — the drain hook is fully awaited before the cancel signal is + /// observed by the worker. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc011_drain_hook_completes_before_cancel() { + let reg = ThreadRegistry::<&str>::new(); + let log = Arc::new(Mutex::new(Vec::<&'static str>::new())); + + let log_hook = Arc::clone(&log); + let drain: DrainHook = Arc::new(move || { + let log = Arc::clone(&log_hook); + Box::pin(async move { + log.lock().unwrap().push("drain_hook_start"); + tokio::time::sleep(Duration::from_millis(10)).await; + log.lock().unwrap().push("drain_hook_complete"); + }) + }); + + let log_worker = Arc::clone(&log); + let handle = Handle::current(); + reg.start_thread( + "epsilon", + WorkerConfig { + drain: Some(drain), + ..WorkerConfig::default() + }, + move |cancel| { + handle.block_on(async move { + cancel.cancelled().await; + log_worker.lock().unwrap().push("cancel_observed"); + }); + }, + ); + + assert_eq!(reg.quiesce("epsilon").await, WorkerStatus::Ok); + assert!(!reg.is_running("epsilon")); + + let log = log.lock().unwrap(); + let pos = |tag| log.iter().position(|t| *t == tag).unwrap(); + assert!(pos("drain_hook_start") < pos("drain_hook_complete")); + assert!(pos("drain_hook_complete") < pos("cancel_observed")); + } + + /// TC-012 — a `quiesce` blocks in the drain hook until an `is_syncing` + /// barrier the hook polls falls, and only then cancels + joins. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc012_drain_hook_observes_barrier_before_join() { + let reg = ThreadRegistry::<&str>::new(); + let is_syncing = Arc::new(AtomicBool::new(true)); + + let gate = Arc::clone(&is_syncing); + let drain: DrainHook = Arc::new(move || { + let gate = Arc::clone(&gate); + Box::pin(async move { + while gate.load(Ordering::Acquire) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + }) + }); + start_clean( + ®, + "zeta", + WorkerConfig { + drain: Some(drain), + ..WorkerConfig::default() + }, + ); + + let quiesce_completed = Arc::new(AtomicBool::new(false)); + let reg_q = Arc::clone(®); + let done = Arc::clone(&quiesce_completed); + let quiesce_task = tokio::spawn(async move { + let status = reg_q.quiesce("zeta").await; + done.store(true, Ordering::Release); + status + }); + + // While the barrier is held, quiesce must stay pending. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !quiesce_completed.load(Ordering::Acquire), + "quiesce must block while is_syncing is held" + ); + + // Release the barrier; quiesce drains, cancels, joins. + is_syncing.store(false, Ordering::Release); + let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task) + .await + .expect("quiesce must complete once the barrier falls") + .unwrap(); + assert_eq!(status, WorkerStatus::Ok); + assert!(quiesce_completed.load(Ordering::Acquire)); + } + + // ----- Group 5: status classification ----------------------------- + + /// TC-013 — only the `Task` kind can classify as `Stopped` (from a + /// runtime-level cancel/abort JoinError); a cooperatively token- + /// cancelled task exits normally as `Ok`. Verifies the kind-dispatch + /// at the classification boundary. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc013_task_kind_classifies_stopped_and_ok() { + // Stopped: an aborted task yields a cancelled JoinError. + let aborted = tokio::spawn(std::future::pending::<()>()); + aborted.abort(); + while !aborted.is_finished() { + tokio::time::sleep(Duration::from_millis(1)).await; + } + let status = WorkerHandle::Task(aborted).classify(); + assert!(matches!(status, WorkerStatus::Stopped(_)), "got {status:?}"); + assert!(!status.is_clean()); + + // Ok: a cooperatively token-cancelled task returns normally. + let reg = ThreadRegistry::<&str>::new(); + reg.start_task("task_a", WorkerConfig::default(), |cancel| async move { + cancel.cancelled().await; + }); + assert_eq!(reg.quiesce("task_a").await, WorkerStatus::Ok); + assert!(!reg.is_running("task_a")); + } + + /// TC-014 — an `OsThread` worker yields `Ok` (clean) or `Panicked` + /// (`&str` and `String` payloads), never `Stopped`. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc014_os_thread_ok_and_panicked_never_stopped() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "os_clean", WorkerConfig::default()); + let ok = reg.quiesce("os_clean").await; + assert_eq!(ok, WorkerStatus::Ok); + assert!(ok.is_clean()); + + // &str panic payload. + reg.start_thread("os_panic_str", WorkerConfig::default(), |_cancel| { + panic!("deliberate test panic"); + }); + match reg.quiesce("os_panic_str").await { + WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate test panic")), + other => panic!("expected Panicked, got {other:?}"), + } + + // String panic payload. + reg.start_thread("os_panic_string", WorkerConfig::default(), |_cancel| { + std::panic::panic_any(String::from("deliberate string panic")); + }); + match reg.quiesce("os_panic_string").await { + WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate string panic")), + other => panic!("expected Panicked, got {other:?}"), + } + } + + // ----- Gaps ------------------------------------------------------- + + /// GAP-003 — `shutdown()` is idempotent: a second call finds every slot + /// already joined and reports `NotRunning`, still clean. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn gap003_shutdown_is_idempotent() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "alpha", WorkerConfig::default()); + + let first = reg.shutdown().await; + assert_eq!(first.per_worker.get("alpha"), Some(&WorkerStatus::Ok)); + assert!(first.all_clean()); + + let second = reg.shutdown().await; + assert_eq!( + second.per_worker.get("alpha"), + Some(&WorkerStatus::NotRunning) + ); + assert!(second.all_clean()); + } + + /// GAP-004 — `cancel(key)` is selective: cancelling A does not touch B. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn gap004_cancel_is_selective() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "a", WorkerConfig::default()); + start_clean(®, "b", WorkerConfig::default()); + + reg.cancel("a"); + assert!(reg.is_running("b"), "cancel(a) must not cancel b"); + assert_eq!(reg.quiesce("a").await, WorkerStatus::Ok); + assert!(reg.is_running("b"), "b still running after a drains"); + assert_eq!(reg.quiesce("b").await, WorkerStatus::Ok); + } + + /// GAP-005 — `WorkerConfig::default()` values are pinned. + #[test] + fn gap005_worker_config_defaults_pinned() { + let cfg = WorkerConfig::default(); + assert_eq!(cfg.weight, ShutdownWeight(0)); + assert!(cfg.drain.is_none()); + assert_eq!(cfg.join_budget, DEFAULT_JOIN_BUDGET); + } + + // ----- Group 6: concurrency-hazard regressions -------------------- + + /// `quiesce` is generation-guarded. A same-key restart that lands after + /// quiesce takes the prior's cancel must not have its fresh, live handle + /// parked or reported `Timeout`: the superseded quiesce returns + /// `NotRunning` and the new generation survives. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn quiesce_generation_guard_spares_concurrent_restart() { + let reg = ThreadRegistry::<&str>::new(); + // gen-1: a task that ignores cancellation (pending forever), with a + // tiny join budget so a non-guarded quiesce would Timeout quickly. + reg.start_task( + "k", + WorkerConfig { + join_budget: Duration::from_millis(150), + ..WorkerConfig::default() + }, + |_cancel| async move { std::future::pending::<()>().await }, + ); + + // Drive quiesce concurrently; it snapshots gen=1, cancels (ignored), + // and enters the poll loop with cancel already taken. + let reg_q = Arc::clone(®); + let q = tokio::spawn(async move { reg_q.quiesce("k").await }); + + // Let quiesce pass cancel.take() so a restart can proceed. + tokio::time::sleep(Duration::from_millis(40)).await; + + // Restart: cancel is now None, so this proceeds — it takes gen-1's + // live handle as its prior (parked) and installs gen-2. + reg.start_task("k", WorkerConfig::default(), |cancel| async move { + cancel.cancelled().await; + }); + + // The superseded quiesce must NOT park gen-2 / report Timeout. + let status = q.await.unwrap(); + assert_eq!( + status, + WorkerStatus::NotRunning, + "superseded quiesce returns NotRunning, never a spurious Timeout" + ); + assert!(reg.is_running("k"), "gen-2 survives the racing quiesce"); + + // gen-2 quiesces cleanly. + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + } + + /// A thread-spawn failure must neither panic nor detach the live prior + /// handle: it rolls back (prior re-installed, running flag cleared) and + /// the slot stays usable / reapable. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn spawn_failure_reparks_live_prior_without_panic() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + // gen-1: wedged (ignores cancel), stays live until released. + reg.start_thread("k", WorkerConfig::default(), wedged_body(release_rx)); + // cancel() takes the token (slot.cancel = None) but the wedged thread + // keeps running — the slot now holds a LIVE prior handle with cancel + // cleared, the exact shape a racing restart would take as its prior. + reg.cancel("k"); + assert!(!reg.is_running("k")); + + // Force the restart's spawn to fail; it must not panic. + reg.force_spawn_failure.store(true, Ordering::Release); + reg.start_thread("k", WorkerConfig::default(), |_cancel| {}); + assert!( + !reg.is_running("k"), + "failed spawn clears the running flag, never leaves it wedged" + ); + assert!(reg.any_alive(), "live prior re-installed, never detached"); + + // Recover: release the prior; quiesce reaps the now-finished handle + // cleanly, proving it was owned (not leaked/detached) and the slot is + // not wedged. + reg.force_spawn_failure.store(false, Ordering::Release); + release_tx.send(()).unwrap(); + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + assert!(!reg.any_alive()); + } + + /// A thread-spawn failure must roll the slot back to its PRIOR config, not + /// leave the failed start's weight / drain / join_budget / generation + /// behind: the re-installed prior worker keeps its own teardown config for + /// a later quiesce/shutdown. + /// + /// Non-vacuous: against a partial rollback (only cancel/handle restored), + /// the slot would carry the failed start's weight/budget, a `None` drain, + /// and the bumped generation. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn spawn_failure_restores_prior_slot_config() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + + // gen-1 with a DISTINCTIVE config (drain hook + non-default weight and + // join budget). Wedged so it stays the live prior after cancel. + let hook: DrainHook = Arc::new(|| Box::pin(async {})); + let cfg1 = WorkerConfig { + weight: ShutdownWeight(7), + join_budget: Duration::from_secs(11), + drain: Some(hook), + }; + reg.start_thread("k", cfg1, wedged_body(release_rx)); + reg.cancel("k"); + let gen_after_gen1 = reg.lock_slots().get("k").unwrap().generation; + + // Failed restart with a DIFFERENT config; the rollback must discard it. + reg.force_spawn_failure.store(true, Ordering::Release); + let cfg2 = WorkerConfig { + weight: ShutdownWeight(99), + join_budget: Duration::from_secs(99), + drain: None, + }; + reg.start_thread("k", cfg2, |_cancel| {}); + reg.force_spawn_failure.store(false, Ordering::Release); + + { + let slots = reg.lock_slots(); + let slot = slots.get("k").expect("slot present"); + assert_eq!(slot.weight, ShutdownWeight(7), "weight restored to prior"); + assert_eq!( + slot.join_budget, + Duration::from_secs(11), + "join_budget restored to prior" + ); + assert!( + slot.drain.is_some(), + "prior drain hook restored, not the failed start's None" + ); + assert_eq!( + slot.generation, gen_after_gen1, + "generation rolled back to its pre-bump value" + ); + assert!( + slot.cancel.is_none(), + "running flag cleared after failed spawn" + ); + assert!( + slot.handle.is_some(), + "prior handle re-installed (alive), not detached" + ); + } + assert!(reg.any_alive(), "live prior still accounted for"); + + // Recover: release + quiesce reaps the prior cleanly. + release_tx.send(()).unwrap(); + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + assert!(!reg.any_alive()); + } + + /// A panicking worker body still runs its epilogue (via the drop-guard), + /// so `is_running()` reflects the crash and `start()` can relaunch the + /// loop instead of silently no-op'ing. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn panicked_worker_clears_running_and_allows_restart() { + let reg = ThreadRegistry::<&str>::new(); + // A worker whose body panics immediately. + reg.start_thread("k", WorkerConfig::default(), |_cancel| { + panic!("deliberate worker-body panic"); + }); + + // The drop-guard epilogue clears the running flag despite the panic. + let mut waited = Duration::ZERO; + while reg.is_running("k") && waited < Duration::from_secs(2) { + tokio::time::sleep(Duration::from_millis(5)).await; + waited += Duration::from_millis(5); + } + assert!( + !reg.is_running("k"), + "panicked worker clears its running flag via the epilogue guard" + ); + + // start() can relaunch a crashed loop (no longer a silent no-op). + let ran = Arc::new(AtomicBool::new(false)); + let ran_w = Arc::clone(&ran); + let handle = Handle::current(); + reg.start_thread("k", WorkerConfig::default(), move |cancel| { + ran_w.store(true, Ordering::Release); + handle.block_on(async move { cancel.cancelled().await }); + }); + assert!( + reg.is_running("k"), + "start() relaunches a previously-panicked worker" + ); + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + assert!( + ran.load(Ordering::Acquire), + "restarted worker body executed" + ); + } + + /// `shutdown()` latches the registry closed: a start racing (or + /// following) teardown is refused, so no worker is left un-joined. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_latches_closed_refusing_new_workers() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "live", WorkerConfig::default()); + let report = reg.shutdown().await; + assert!(report.all_clean()); + + // One-way door: both worker kinds are refused after shutdown. + start_clean(®, "late_thread", WorkerConfig::default()); + assert!( + !reg.is_running("late_thread"), + "start_thread after shutdown is refused" + ); + reg.start_task("late_task", WorkerConfig::default(), |cancel| async move { + cancel.cancelled().await; + }); + assert!( + !reg.is_running("late_task"), + "start_task after shutdown is refused" + ); + assert!(!reg.any_alive(), "nothing started post-shutdown"); + } + + /// `start_thread` must park a restarted key's still-wedged prior into the + /// orphan list UNDER the slot lock — at the START of the restart, not only + /// after the out-of-lock reap backstop elapses. + /// Otherwise a `shutdown()` that snapshots tiers in the window between + /// "prior taken out of the slot" and "prior parked" sees neither the + /// prior (already moved out of the slot) nor an orphan, and reports + /// clean while the wedged prior is still live and un-joined. + /// + /// Deterministic via a long backstop: with the fix the prior is + /// observable in orphans well before the backstop could elapse; the + /// pre-fix code parks it only at the end of the out-of-lock spin, so the + /// early assertion fails. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn start_thread_parks_wedged_prior_under_slot_lock_at_restart() { + // Long backstop so the under-lock parking is observable well before + // it could possibly elapse. + let reg = ThreadRegistry::with_reap_backstop(Duration::from_secs(10)); + let (release_tx, release_rx) = mpsc::channel::<()>(); + + // gen-1: wedged (ignores cancel), stays live until released. + reg.start_thread("k", WorkerConfig::default(), wedged_body(release_rx)); + reg.cancel("k"); + + // gen-2 restart on a blocking thread: its bounded reap of the wedged + // gen-1 spins the (long) backstop, so start_thread does not return + // promptly. The fix parks gen-1 under the slot lock at the start of + // this call, before that spin. + let reg2 = Arc::clone(®); + let parent = Handle::current(); + let restart = tokio::task::spawn_blocking(move || { + let handle = parent.clone(); + reg2.start_thread("k", WorkerConfig::default(), move |cancel| { + handle.block_on(async move { cancel.cancelled().await }); + }); + }); + + // The wedged prior must appear in orphans far sooner than the 10s + // backstop — it was parked under the slot lock at restart. + let mut waited = Duration::ZERO; + while orphan_len(®) == 0 && waited < Duration::from_secs(2) { + tokio::time::sleep(Duration::from_millis(10)).await; + waited += Duration::from_millis(10); + } + assert_eq!( + orphan_len(®), + 1, + "wedged prior must be parked under the slot lock at restart, not \ + only after the backstop spin" + ); + assert!(reg.is_running("k"), "gen-2 installed under the same lock"); + + // Release the wedged prior: the restart's bounded reap then finds it + // finished, removes it from orphans, and joins it. + release_tx.send(()).unwrap(); + restart.await.unwrap(); + assert_eq!( + orphan_len(®), + 0, + "finished prior removed from orphans by the bounded reap" + ); + + // gen-2 quiesces cleanly. + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + } +} diff --git a/packages/rs-platform-wallet-ffi/Cargo.toml b/packages/rs-platform-wallet-ffi/Cargo.toml index 8a2bd4ef2b4..7e60b05d69b 100644 --- a/packages/rs-platform-wallet-ffi/Cargo.toml +++ b/packages/rs-platform-wallet-ffi/Cargo.toml @@ -22,7 +22,7 @@ rs-sdk-ffi = { path = "../rs-sdk-ffi" } once_cell = "1.19" parking_lot = { version = "0.12", features = ["send_guard"] } lazy_static = "1.4" -tokio = { version = "1", features = ["rt-multi-thread"] } +tokio = { version = "1", features = ["rt-multi-thread", "time"] } tokio-metrics = { workspace = true, optional = true } # Core dependencies (for Network type) diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs index de1a6cb9441..e5b5184a82a 100644 --- a/packages/rs-platform-wallet-ffi/src/error.rs +++ b/packages/rs-platform-wallet-ffi/src/error.rs @@ -125,6 +125,33 @@ pub enum PlatformWalletFFIResultCode { /// and could double-send if the original spend landed. ErrorShieldedSpendUnconfirmed = 18, + /// A background coordinator drain did not complete cleanly within the + /// join deadline — one or more `!Send` sync threads may still be alive + /// and still hold a reference to the host-owned callback context, so they + /// could fire one final callback through it. On this code the host **must + /// not** free the callback context immediately: either keep it alive for a + /// further grace period, or accept the (statistically tiny) race. + /// + /// Returned by three callers, which differ in whether the operation may + /// be **retried**: + /// - `platform_wallet_manager_destroy`: the manager **IS** torn down + /// (removed from storage) regardless — do **not** retry `destroy`; the + /// handle is already gone. Only the callback-context lifetime caveat + /// above applies. + /// - `platform_wallet_manager_shielded_sync_stop`: the manager is **NOT** + /// torn down — only the shielded loop's drain was non-clean. The host + /// may retry the stop (or proceed to `destroy`); the handle stays valid. + /// - `platform_wallet_manager_shielded_clear`: the manager is **NOT** torn + /// down and the store was left **intact** (Clear aborted before touching + /// it). The host may retry the clear, and must **not** commit its own + /// persistence wipe — doing so would desync the host's rows from the + /// still-populated shared tree. + /// + /// Distinct from a normal operation error (the underlying operation may + /// well have made progress); the terminal coordinator status is rendered + /// into the result message. + ErrorShutdownIncomplete = 19, + NotFound = 98, // Used exclusively for all the Option that are retuned as errors ErrorUnknown = 99, } @@ -237,6 +264,14 @@ impl From for PlatformWalletFFIResult { PlatformWalletError::ShieldedSpendUnconfirmed { .. } => { PlatformWalletFFIResultCode::ErrorShieldedSpendUnconfirmed } + // A Clear that refused because the in-flight shielded pass didn't + // drain cleanly: surface it as ErrorShutdownIncomplete (symmetric + // with `platform_wallet_manager_destroy`) so the host defers + // freeing its callback context AND does not commit its own + // persistence wipe — the store was intentionally left intact. + PlatformWalletError::ShieldedShutdownIncomplete { .. } => { + PlatformWalletFFIResultCode::ErrorShutdownIncomplete + } _ => PlatformWalletFFIResultCode::ErrorUnknown, }; PlatformWalletFFIResult::err(code, error.to_string()) @@ -595,6 +630,29 @@ mod tests { assert_eq!(msg, rendered, "Display payload must survive verbatim"); } + /// A Clear that refused on a non-clean shielded drain must surface as + /// `ErrorShutdownIncomplete` (symmetric with `destroy`), not flatten to + /// `ErrorUnknown`, so the host knows to defer freeing its callback + /// context and to NOT commit its own persistence wipe. The typed Display + /// rendering (carrying the terminal coordinator status) survives verbatim. + #[test] + fn shielded_shutdown_incomplete_maps_to_dedicated_code() { + let err = PlatformWalletError::ShieldedShutdownIncomplete { + status: platform_wallet::CoordinatorThreadStatus::Timeout, + }; + let rendered = err.to_string(); + let result: PlatformWalletFFIResult = err.into(); + assert_eq!( + result.code, + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + "ShieldedShutdownIncomplete should map to ErrorShutdownIncomplete (rendered: {rendered})" + ); + let msg = unsafe { std::ffi::CStr::from_ptr(result.message) } + .to_string_lossy() + .into_owned(); + assert_eq!(msg, rendered, "Display payload must survive verbatim"); + } + /// Other wallet-error variants without a dedicated FFI arm still /// fall through to `ErrorUnknown` while carrying the typed /// Display rendering as the message. Pin this so the catch-all diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs index 5930c1c4db6..986103ab477 100644 --- a/packages/rs-platform-wallet-ffi/src/manager.rs +++ b/packages/rs-platform-wallet-ffi/src/manager.rs @@ -360,7 +360,29 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy( // left alive to fire a callback against freed memory. // `shutdown()` is idempotent, so this is safe even if the host // already stopped some sync managers before calling destroy. - runtime().block_on(manager.shutdown()); + // It now joins the coordinator OS threads and returns their + // per-thread exit status; the C ABI exposes none of that, so we + // just log it (a panicked loop is worth surfacing) and drop it. + let status = runtime().block_on(manager.shutdown()); + if !status.all_clean() { + tracing::warn!( + ?status, + "platform wallet coordinator(s) did not exit cleanly; \ + host must not free the callback context immediately" + ); + // Return a distinct non-ok code so the host can delay freeing + // its callback context. A lingering coordinator thread (e.g. one + // that timed out) still holds an Arc to the event handler and may + // fire one final callback through the host-owned context pointer; + // returning ok() here would signal that the context is safe to + // free when it may not be yet. + return PlatformWalletFFIResult::err( + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + format!("coordinator(s) did not exit cleanly: {status:?}"), + ); + } else { + tracing::debug!(?status, "platform wallet coordinators joined cleanly"); + } } PlatformWalletFFIResult::ok() } diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs index 2d58d8165f6..493f84aa0f8 100644 --- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs +++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs @@ -68,12 +68,27 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_start( /// Stop the shielded sync manager and wait for any in-flight pass to /// drain before returning. No-op if not running. /// -/// Uses `quiesce` rather than cancel-only stop, so on return: the loop -/// is cancelled, no new pass will start, and any in-flight pass has +/// Uses `quiesce` rather than cancel-only stop, so on a clean return: the +/// loop is cancelled, no new pass will start, and any in-flight pass has /// fully drained — its **persistence callbacks have completed** (no /// note/sync-state row can be written after this returns) and its /// completion-event *dispatch* on the Rust side has run. /// +/// Returns `ErrorShutdownIncomplete` instead of `Success` in either of two +/// cases, so `Success` accurately implies **no live shielded worker or +/// orphan remains**: +/// - the drain did not complete cleanly (the in-flight pass timed out on the +/// join backstop, or the loop ended non-cleanly); or +/// - the drain was clean but a prior-generation shielded thread is still +/// parked alive as an orphan (a tight `stop()`->`start()` reap detached it +/// past the wedge backstop). +/// +/// The terminal coordinator status is rendered into the result message. On +/// this code the host must **not** free the callback context immediately — a +/// lingering pass or parked orphan may still fire one final callback through +/// it (symmetric with `platform_wallet_manager_destroy` and the shielded +/// Clear flow). +/// /// Caveat on host-observed events: a host that marshals the completion /// callback onto its own executor (e.g. the Swift trampoline hops it to /// the `@MainActor`) may still observe that final, already-dispatched @@ -88,9 +103,58 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop( handle: Handle, ) -> PlatformWalletFFIResult { let option = PLATFORM_WALLET_MANAGER_STORAGE.with_item(handle, |manager| { - runtime().block_on(manager.shielded_sync().quiesce()); + let status = runtime().block_on(async { + // Bound the quiesce with the same backstop `shutdown()` uses so + // a stalled in-flight pass can't hang the host's stop call + // forever. Cancellation makes the drain prompt; this only + // matters if a pass's drop wedges. A timeout (the future was + // dropped at the deadline) is reported as the non-clean + // `Timeout` status, matching `shutdown()`'s backstop + // substitution, so the host learns the drain may be incomplete. + match tokio::time::timeout( + Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS), + manager.shielded_sync().quiesce(), + ) + .await + { + Ok(status) => status, + Err(_elapsed) => platform_wallet::CoordinatorThreadStatus::Timeout, + } + }); + // Capture orphan liveness while we still hold the manager: a clean + // quiesce drains the live slot but not a prior-generation thread + // parked as an orphan. + let shielded_alive = manager.shielded_worker_alive(); + (status, shielded_alive) }); - unwrap_option_or_return!(option); + let (status, shielded_alive) = unwrap_option_or_return!(option); + // Symmetric with `platform_wallet_manager_destroy`: a non-clean drain + // means the shielded loop may still hold a reference to the host-owned + // event-handler / persister context and could fire one final callback, + // so signal the host to defer freeing that context rather than returning + // ok() and inviting a use-after-free. + if !status.is_clean() { + return PlatformWalletFFIResult::err( + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + format!( + "shielded sync stop did not drain cleanly ({status:?}); \ + host must not free the callback context immediately" + ), + ); + } + // Even on a clean drain, a parked prior-generation shielded thread may + // still be alive and holding the host's callback context — mirror + // `clear_shielded` / `destroy` and refuse the clean return so the host + // does not free that context out from under a lingering orphan. + if shielded_alive { + return PlatformWalletFFIResult::err( + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + "shielded sync stop drained cleanly but a prior-generation shielded \ + worker is still parked alive; host must not free the callback \ + context immediately" + .to_string(), + ); + } PlatformWalletFFIResult::ok() } @@ -417,7 +481,9 @@ pub unsafe extern "C" fn platform_wallet_manager_configure_shielded( /// via the changeset path. /// /// Returns `ErrorWalletOperation` if the Rust-side store reset -/// fails. The host **must** check this before wiping its own +/// fails, or `ErrorShutdownIncomplete` if the in-flight sync pass +/// did not drain cleanly first (in which case the store is left +/// intact). The host **must** check this before wiping its own /// persistence: a silent failure would leave the shared tree /// populated while the host drops its rows, and the next cold /// resync would gate-skip every re-downloaded position against the @@ -443,10 +509,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_clear( }); let result = unwrap_option_or_return!(option); if let Err(e) = result { - return PlatformWalletFFIResult::err( - PlatformWalletFFIResultCode::ErrorWalletOperation, - format!("clear_shielded failed: {e}"), - ); + // A non-clean / timed-out quiesce aborts the clear *before* the store + // is touched: surface it as ErrorShutdownIncomplete (symmetric with + // destroy / shielded_sync_stop) so the host defers freeing its + // callback context and does NOT commit its own persistence wipe — the + // store was intentionally left intact. Every other clear failure is a + // store-reset error → ErrorWalletOperation, as before. + let code = match &e { + platform_wallet::PlatformWalletError::ShieldedShutdownIncomplete { .. } => { + PlatformWalletFFIResultCode::ErrorShutdownIncomplete + } + _ => PlatformWalletFFIResultCode::ErrorWalletOperation, + }; + return PlatformWalletFFIResult::err(code, format!("clear_shielded failed: {e}")); } PlatformWalletFFIResult::ok() } diff --git a/packages/rs-platform-wallet/Cargo.toml b/packages/rs-platform-wallet/Cargo.toml index 1362523ecea..5398e9c009d 100644 --- a/packages/rs-platform-wallet/Cargo.toml +++ b/packages/rs-platform-wallet/Cargo.toml @@ -31,6 +31,7 @@ bimap = "0.6" # Async runtime tokio = { version = "1", features = ["sync", "rt", "time", "macros"] } tokio-util = { version = "0.7.12" } +dash-async = { path = "../rs-dash-async" } # Logging tracing = "0.1" @@ -80,6 +81,9 @@ name = "shielded_chunk_timing_bench" required-features = ["shielded"] [dev-dependencies] +# Enables `ThreadRegistry::park_orphan_for_test` for the manager's F2-gate +# regression tests; the seam is feature-gated so it never ships in release. +dash-async = { path = "../rs-dash-async", features = ["test-util"] } # Used by `examples/shielded_chunk_timing_bench.rs` and # `tests/shielded_decrypt_bench.rs` to assemble per-chunk wire # fixtures and decode the `ShieldedEncryptedNote` wire type. diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs index 46945667ef8..927cf8d0006 100644 --- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs +++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs @@ -3,7 +3,7 @@ //! Upstream `key_wallet_manager::WalletManager` exposes a //! `broadcast::Sender` and a `subscribe_events()` accessor //! returning a `broadcast::Receiver`; consumers attach at -//! startup and drain the stream. [`spawn_wallet_event_adapter`] is the +//! startup and drain the stream. [`wallet_event_adapter_loop`] is the //! platform-wallet-side consumer: a tokio task that pulls events off //! that broadcast, projects each one into a //! [`CoreChangeSet`](crate::changeset::CoreChangeSet), wraps it in a @@ -19,10 +19,11 @@ //! //! # Lifetime //! -//! [`spawn_wallet_event_adapter`] returns a [`JoinHandle`]. The caller -//! (typically `PlatformWalletManager`) keeps the handle for the -//! manager's lifetime; on shutdown, fire the [`CancellationToken`] to -//! make the task exit cleanly. +//! [`wallet_event_adapter_loop`] is the task body. The caller (typically +//! `PlatformWalletManager`) registers it on the shared `ThreadRegistry` +//! via `start_task`, which owns its [`JoinHandle`](tokio::task::JoinHandle) +//! and cancellation; on shutdown the registry fires the +//! [`CancellationToken`] to make the task exit cleanly and joins it. use std::sync::Arc; @@ -34,87 +35,82 @@ use key_wallet::Utxo; use key_wallet_manager::{WalletEvent, WalletId, WalletManager}; use tokio::sync::broadcast::error::RecvError; use tokio::sync::RwLock; -use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use crate::changeset::changeset::{CoreChangeSet, PlatformWalletChangeSet}; use crate::changeset::traits::PlatformWalletPersistence; use crate::wallet::platform_wallet::PlatformWalletInfo; -/// Spawn the wallet-event subscriber task. +/// The wallet-event subscriber loop (the task body owned by the registry). /// -/// Subscribes to `wallet_manager.subscribe_events()` from inside the -/// spawned task (so the call-site doesn't need to be on a tokio -/// runtime), then loops dispatching events to the persister via -/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires -/// or the upstream broadcast channel closes. +/// Subscribes to `wallet_manager.subscribe_events()` from inside the task +/// (so the call-site doesn't need to be on a tokio runtime), then loops +/// dispatching events to the persister via +/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires or the +/// upstream broadcast channel closes. /// -/// Generic over `P` so the spawned task gets static-dispatch on -/// every `persister.store(...)` call. Pass the manager's own -/// `Arc

` (not the `Arc` -/// coercion) to actually realize the static-dispatch win. -pub fn spawn_wallet_event_adapter

( +/// Generic over `P` so the task gets static-dispatch on every +/// `persister.store(...)` call. Pass the manager's own `Arc

` (not the +/// `Arc` coercion) to realize that win. +pub async fn wallet_event_adapter_loop

( wallet_manager: Arc>>, persister: Arc

, cancel: CancellationToken, -) -> JoinHandle<()> -where +) where P: PlatformWalletPersistence + 'static, { - tokio::spawn(async move { - let mut receiver = { - let guard = wallet_manager.read().await; - guard.subscribe_events() - }; - tracing::debug!("wallet-event adapter task started"); + let mut receiver = { + let guard = wallet_manager.read().await; + guard.subscribe_events() + }; + tracing::debug!("wallet-event adapter task started"); - loop { - tokio::select! { - recv = receiver.recv() => { - match recv { - Ok(event) => { - let wallet_id = event.wallet_id(); - // For events that need to consult per-wallet - // state (today only `TransactionInstantLocked`, - // which checks finality before recording the IS - // lock), grab a brief read lock on the manager. - let core = build_core_changeset(&wallet_manager, &event).await; - if core.is_empty_no_records() { - // SyncHeightAdvanced for an unknown wallet, - // empty BlockProcessed, etc. — nothing to - // persist. Skip the round-trip. - continue; - } - let cs = PlatformWalletChangeSet { - core: Some(core), - ..PlatformWalletChangeSet::default() - }; - if let Err(e) = persister.store(wallet_id, cs) { - tracing::warn!( - wallet_id = %hex::encode(wallet_id), - error = %e, - "Persister rejected core changeset; state will be re-emitted on next sync round" - ); - } - } - Err(RecvError::Closed) if cancel.is_cancelled() => break, - Err(RecvError::Closed) => { - tracing::error!("WalletEvent broadcast closed unexpectedly"); - break; + loop { + tokio::select! { + recv = receiver.recv() => { + match recv { + Ok(event) => { + let wallet_id = event.wallet_id(); + // For events that need to consult per-wallet + // state (today only `TransactionInstantLocked`, + // which checks finality before recording the IS + // lock), grab a brief read lock on the manager. + let core = build_core_changeset(&wallet_manager, &event).await; + if core.is_empty_no_records() { + // SyncHeightAdvanced for an unknown wallet, + // empty BlockProcessed, etc. — nothing to + // persist. Skip the round-trip. + continue; } - Err(RecvError::Lagged(n)) => { + let cs = PlatformWalletChangeSet { + core: Some(core), + ..PlatformWalletChangeSet::default() + }; + if let Err(e) = persister.store(wallet_id, cs) { tracing::warn!( - missed = n, - "wallet-event adapter lagged on broadcast channel; some events were dropped" + wallet_id = %hex::encode(wallet_id), + error = %e, + "Persister rejected core changeset; state will be re-emitted on next sync round" ); } } + Err(RecvError::Closed) if cancel.is_cancelled() => break, + Err(RecvError::Closed) => { + tracing::error!("WalletEvent broadcast closed unexpectedly"); + break; + } + Err(RecvError::Lagged(n)) => { + tracing::warn!( + missed = n, + "wallet-event adapter lagged on broadcast channel; some events were dropped" + ); + } } - _ = cancel.cancelled() => break, } + _ = cancel.cancelled() => break, } - tracing::debug!("wallet-event adapter task exiting"); - }) + } + tracing::debug!("wallet-event adapter task exiting"); } /// Project an upstream [`WalletEvent`] into a [`CoreChangeSet`] suitable diff --git a/packages/rs-platform-wallet/src/changeset/mod.rs b/packages/rs-platform-wallet/src/changeset/mod.rs index dc76ddd39ac..208c132e874 100644 --- a/packages/rs-platform-wallet/src/changeset/mod.rs +++ b/packages/rs-platform-wallet/src/changeset/mod.rs @@ -33,7 +33,7 @@ pub use changeset::{ }; pub use client_start_state::ClientStartState; pub use client_wallet_start_state::ClientWalletStartState; -pub use core_bridge::spawn_wallet_event_adapter; +pub use core_bridge::wallet_event_adapter_loop; pub use identity_manager_start_state::IdentityManagerStartState; pub use merge::Merge; pub use platform_address_sync_start_state::PlatformAddressSyncStartState; diff --git a/packages/rs-platform-wallet/src/error.rs b/packages/rs-platform-wallet/src/error.rs index c94cb7093d1..196d2ee5b41 100644 --- a/packages/rs-platform-wallet/src/error.rs +++ b/packages/rs-platform-wallet/src/error.rs @@ -239,6 +239,27 @@ pub enum PlatformWalletError { #[error("Shielded sub-wallet not bound: call bind_shielded first")] ShieldedNotBound, + + /// A Clear/wipe could not safely complete because the shielded sync + /// coordinator's in-flight pass did not drain cleanly first — it either + /// timed out on the join backstop or its loop ended non-cleanly + /// (cancelled / panicked). The shared commitment-tree store is therefore + /// **left intact** (not wiped): a still-running pass could re-persist + /// notes into the store immediately after a `clear()`, desyncing the + /// host's wiped rows from a repopulated tree and gate-skipping every + /// re-downloaded position on the next cold resync. The host **must not** + /// commit its own persistence wipe; retry Clear once the pass settles. + /// Carries the terminal [`CoordinatorThreadStatus`] for diagnostics. + /// + /// [`CoordinatorThreadStatus`]: crate::manager::CoordinatorThreadStatus + #[error( + "shielded clear aborted: sync coordinator did not drain cleanly \ + ({status:?}); commitment-tree store left intact so an in-flight pass \ + cannot re-persist into a wiped store — retry once the pass settles" + )] + ShieldedShutdownIncomplete { + status: crate::manager::CoordinatorThreadStatus, + }, } /// Check whether an SDK error indicates that an InstantSend lock proof was diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs index 289a71378fd..8b55948aa1b 100644 --- a/packages/rs-platform-wallet/src/lib.rs +++ b/packages/rs-platform-wallet/src/lib.rs @@ -44,7 +44,10 @@ pub use manager::platform_address_sync::{ PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome, DEFAULT_SYNC_INTERVAL_SECS, }; -pub use manager::PlatformWalletManager; +pub use manager::{ + CoordinatorExitStatus, CoordinatorThreadStatus, PlatformWalletManager, + SHUTDOWN_JOIN_TIMEOUT_SECS, +}; pub use spv::SpvRuntime; pub use wallet::asset_lock::manager::AssetLockManager; pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock}; diff --git a/packages/rs-platform-wallet/src/manager/accessors.rs b/packages/rs-platform-wallet/src/manager/accessors.rs index 7bf901bccf4..4ef045f906d 100644 --- a/packages/rs-platform-wallet/src/manager/accessors.rs +++ b/packages/rs-platform-wallet/src/manager/accessors.rs @@ -299,6 +299,25 @@ impl PlatformWalletManager

{ Arc::clone(&self.shielded_sync_manager) } + /// Whether a shielded-sync worker is still alive — either its live + /// registry slot or a prior-generation thread parked as an orphan after + /// a tight `stop()`->`start()` reap had to detach it past the wedge + /// backstop. Such an orphan still holds an `Arc` to the persister / + /// event-handler context and may fire one final callback, so a clean + /// [`quiesce`](ShieldedSyncManager::quiesce) status alone does not prove + /// the shielded worker is gone. + /// + /// This is the same shielded-scoped liveness gate + /// [`clear_shielded`](Self::clear_shielded) consults; it is exposed so + /// the FFI `shielded_sync_stop` can refuse a misleading clean return + /// while a parked orphan lingers (symmetric with `clear_shielded` / + /// `destroy`). + #[cfg(feature = "shielded")] + pub fn shielded_worker_alive(&self) -> bool { + self.registry + .any_alive_for(super::WalletWorker::ShieldedSync) + } + /// Get a clone of a wallet by its ID. pub async fn get_wallet(&self, wallet_id: &WalletId) -> Option> { let wallets = self.wallets.read().await; diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs new file mode 100644 index 00000000000..87e20fa6e58 --- /dev/null +++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs @@ -0,0 +1,420 @@ +//! Shared lifecycle state + pass protocol for the periodic sync +//! coordinators. +//! +//! The three coordinators ([`IdentitySyncManager`], [`PlatformAddressSyncManager`], +//! [`ShieldedSyncManager`]) each drive a background loop on the shared +//! [`ThreadRegistry`] and gate passes through an `is_syncing` / `quiescing` +//! handshake. That handshake, plus the interval and last-sync bookkeeping, +//! is identical across all three; it lives here so the (subtle, teardown- +//! critical) protocol has a single home and each coordinator keeps only its +//! domain-specific pass body. +//! +//! [`IdentitySyncManager`]: super::identity_sync::IdentitySyncManager +//! [`PlatformAddressSyncManager`]: super::platform_address_sync::PlatformAddressSyncManager +//! [`ShieldedSyncManager`]: super::shielded_sync::ShieldedSyncManager + +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; + +use super::{ + CoordinatorThreadStatus, WalletWorker, COORDINATOR_WEIGHT, SHUTDOWN_JOIN_TIMEOUT_SECS, +}; + +/// Shared lifecycle state and pass-gating protocol for one periodic sync +/// coordinator. Each coordinator embeds one of these and delegates its +/// `start` / `stop` / `quiesce` / `is_running` / interval / pass-gate +/// surface to it. +pub(crate) struct CoordinatorLifecycle { + registry: Arc>, + worker: WalletWorker, + interval_secs: AtomicU64, + is_syncing: AtomicBool, + /// `Arc` so the registry drain hook (a `'static` closure) can capture a + /// clone and raise the gate from inside `quiesce`. + quiescing: Arc, + last_sync_unix: AtomicU64, +} + +impl CoordinatorLifecycle { + pub(crate) fn new( + registry: Arc>, + worker: WalletWorker, + default_interval_secs: u64, + ) -> Self { + Self { + registry, + worker, + interval_secs: AtomicU64::new(default_interval_secs), + is_syncing: AtomicBool::new(false), + quiescing: Arc::new(AtomicBool::new(false)), + last_sync_unix: AtomicU64::new(0), + } + } + + /// The shared worker-lifecycle engine this coordinator's loop runs on. + pub(crate) fn registry(&self) -> &Arc> { + &self.registry + } + + /// This coordinator's registry key. + pub(crate) fn worker(&self) -> WalletWorker { + self.worker + } + + /// Set the polling interval. Clamped to a minimum of 1s. + pub(crate) fn set_interval(&self, interval: Duration) { + let secs = interval.as_secs().max(1); + self.interval_secs.store(secs, Ordering::Release); + } + + /// Current polling interval. + pub(crate) fn interval(&self) -> Duration { + Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + } + + /// Current polling interval in whole seconds (for `Debug`). + pub(crate) fn interval_secs(&self) -> u64 { + self.interval_secs.load(Ordering::Acquire) + } + + /// Whether the background loop is currently running. + pub(crate) fn is_running(&self) -> bool { + self.registry.is_running(self.worker) + } + + /// Whether a sync pass is in flight right now. + pub(crate) fn is_syncing(&self) -> bool { + self.is_syncing.load(Ordering::Acquire) + } + + /// Unix seconds of the last completed pass, or `None` if none has ever + /// completed. + pub(crate) fn last_sync_unix_seconds(&self) -> Option { + match self.last_sync_unix.load(Ordering::Acquire) { + 0 => None, + n => Some(n), + } + } + + /// Record the unix-seconds stamp of a just-completed pass. + pub(crate) fn store_last_sync_unix(&self, unix_secs: u64) { + self.last_sync_unix.store(unix_secs, Ordering::Release); + } + + /// The registry config a coordinator starts its loop with: coordinator + /// teardown weight, the shared join budget, and the `quiescing`-raising + /// drain hook. + pub(crate) fn worker_config(&self) -> WorkerConfig { + WorkerConfig { + weight: COORDINATOR_WEIGHT, + join_budget: Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: Some(self.drain_hook()), + } + } + + /// Drain hook handed to the registry: raise the `quiescing` gate so any + /// pass past its `is_syncing` CAS bails. The registry then cancels the + /// loop and joins the thread, so the barrier itself is instant. + fn drain_hook(&self) -> DrainHook { + let quiescing = Arc::clone(&self.quiescing); + Arc::new(move || { + let quiescing = Arc::clone(&quiescing); + Box::pin(async move { + // SeqCst: store-half of the `quiescing`<->`is_syncing` + // handshake (see `begin_pass`). + quiescing.store(true, Ordering::SeqCst); + }) + }) + } + + /// Reopen the `quiescing` gate so a (re)start's passes can run; a prior + /// quiesce raised it via the drain hook. + pub(crate) fn reopen_quiescing_gate(&self) { + self.quiescing.store(false, Ordering::Release); + } + + /// Cancel-only stop: signal the loop and return immediately. + pub(crate) fn stop(&self) { + self.registry.cancel(self.worker); + } + + /// Cancel the loop, drain any in-flight pass, and join the worker, + /// returning its terminal status. Reopens the `quiescing` gate on every + /// exit path (the gate is reset by the guard; reopening is safe because + /// the loop has been cancelled, so no new pass starts). + /// + /// The gate is raised **here**, not left to the registry's drain hook: + /// `registry.quiesce` early-returns `NotRunning` without running the + /// hook when no background-loop slot is registered, so a coordinator + /// with only direct pass traffic (no running loop) would never see the + /// gate go up — and a direct pass landing concurrently would slip past + /// the barrier `clear_shielded`/`stop` promise. Raising it ourselves + /// makes the "no new pass" gate hold regardless of whether a loop is + /// registered, and preserves gate-before-cancel: it is up before + /// `registry.quiesce` issues any cancel. + /// + /// "Direct pass" here means the gated entry points that take the + /// `is_syncing` slot via [`begin_pass`](Self::begin_pass): every + /// coordinator's `sync_now`, plus the shielded coordinator's + /// `sync_wallet`. The platform-address coordinator's `sync_wallet` is + /// intentionally **ungated** (it never touches `is_syncing`; callers + /// that need exclusion gate themselves), so the gate/drain barrier does + /// not apply to it. + pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus { + // Gate up first (instant) and held until the guard drops on return. + // SeqCst: store-half of the `quiescing`<->`is_syncing` handshake + // (see `begin_pass`). + self.quiescing.store(true, Ordering::SeqCst); + let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); + self.cancel_join_and_drain().await + } + + /// Like [`quiesce`](Self::quiesce) but for a caller that has **already** + /// raised the `quiescing` gate (via [`hold_quiescing_gate`](Self::hold_quiescing_gate)) + /// and will keep holding it: this neither raises nor lowers the gate, so + /// a multi-step teardown (the shielded Clear flow) keeps the "no new + /// pass" barrier raised *continuously* across the drain, the orphan- + /// liveness check, and the store wipe — with no lapse for a direct + /// `sync_now`/`sync_wallet` to slip through and re-persist into the + /// store being cleared. (`quiesce`'s own RAII guard would lower the gate + /// the instant it returned, which is why Clear cannot just call it and + /// re-raise afterwards: a single shared `AtomicFlagGuard` always clears + /// the flag on drop, so the re-raise would leave a window.) Gate-before- + /// cancel still holds: the caller raised the gate before this runs. + #[cfg(any(test, feature = "shielded"))] + pub(crate) async fn quiesce_under_held_gate(&self) -> CoordinatorThreadStatus { + debug_assert!( + self.quiescing.load(Ordering::Acquire), + "quiesce_under_held_gate requires the caller to already hold the quiescing gate" + ); + self.cancel_join_and_drain().await + } + + /// Cancel + bounded-join the background loop (if any), then drain a + /// direct in-flight pass on a clean status. Assumes the `quiescing` gate + /// is **already raised** (by [`quiesce`](Self::quiesce)'s own guard or a + /// caller's hold guard) and does not touch it. + /// + /// A wedged loop pass surfaces from `registry.quiesce` as a non-clean + /// `Timeout` rather than hanging — its orphaned thread is tracked by the + /// registry for teardown, so the drain below must not wait on it. On a + /// clean status the only possible `is_syncing` holder is a direct + /// `sync_now`/`sync_wallet` that entered before the gate rose (with no + /// loop slot `registry.quiesce` joined nothing; with an idle loop it + /// joined a thread that was not the one holding the flag). The raised + /// gate keeps a new pass from starting, so the drain converges, and a + /// panicked pass clears the flag via its own RAII guard. + async fn cancel_join_and_drain(&self) -> CoordinatorThreadStatus { + let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into(); + if status.is_clean() { + self.drain_in_flight_pass().await; + } + status + } + + /// Poll until no sync pass holds `is_syncing`. Only sound to call with + /// the `quiescing` gate already raised (so no new pass can start) and + /// after the background loop has been cancel-joined (so the only + /// possible holder is a direct, non-cancellable pass running to + /// completion). Mirrors the registry's 5ms poll cadence. Unbounded by + /// design — the caller bounds the whole teardown (the FFI `stop` / + /// `clear` bridges wrap it in a `SHUTDOWN_JOIN_TIMEOUT_SECS` timeout). + async fn drain_in_flight_pass(&self) { + // SeqCst: load-half of the `quiescing`<->`is_syncing` handshake (see + // `begin_pass`). Pairs with `begin_pass`'s SeqCst CAS so a pass that + // claimed the slot just as the gate rose is observed here and waited + // out, rather than slipping past an unsynchronized read. + while self.is_syncing.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + } + + /// Raise the `quiescing` gate and hold it raised until the returned + /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate the + /// instant it returns, this lets a multi-step teardown (e.g. Clear) + /// keep new direct passes off across a check-then-wipe so the "no new + /// pass" guarantee does not lapse between the two steps. In production + /// only the shielded Clear flow needs this today; the coordinator pass- + /// gate tests also exercise it. + #[cfg(any(test, feature = "shielded"))] + pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> { + // SeqCst: store-half of the `quiescing`<->`is_syncing` handshake (see + // `begin_pass`). The Clear flow raises the gate through here, so this + // raise must be self-fencing just like `quiesce`'s. + self.quiescing.store(true, Ordering::SeqCst); + AtomicFlagGuard::new(&self.quiescing) + } + + /// Enter a sync pass. Atomically claims the `is_syncing` slot, then + /// checks the `quiescing` gate. Returns the RAII guard that clears + /// `is_syncing` on drop, or `None` when the caller must bail without + /// doing work — because a pass is already in flight, or a teardown has + /// raised the gate. In the gated case the briefly-claimed slot is + /// released before returning (the guard drops), so a later post-quiesce + /// pass can still run. + pub(crate) fn begin_pass(&self) -> Option> { + // LOAD-BEARING MEMORY ORDERING: the `is_syncing` claim (this CAS) and + // the `quiescing` gate read below form a Dekker-style mutual-exclusion + // handshake with `quiesce`'s `store(quiescing) … load(is_syncing)`. + // The guarantee we need is that a teardown and a pass-entry can never + // BOTH miss each other — either this pass observes the raised gate and + // bails, or the drain observes our `is_syncing` claim and waits it + // out. That is a StoreLoad relationship across two distinct atomics, + // which Release/Acquire do NOT order; only SeqCst (a single total + // order over all four ops) does. So the CAS *store* here, the gate + // load here, and the matching `store(quiescing)` / `load(is_syncing)` + // on the teardown side are all `SeqCst`. (Today the lock `registry` + // takes would also fence this, but that is incidental — relying on it + // would make the handshake silently fragile to a lock-free refactor.) + if self + .is_syncing + .compare_exchange(false, true, Ordering::SeqCst, Ordering::Acquire) + .is_err() + { + return None; + } + + // RAII guard: clears `is_syncing` on every exit path, including + // panics. Without it a panic inside the pass would leave + // `is_syncing = true` forever and wedge `quiesce`'s drain loop. + let guard = AtomicFlagGuard::new(&self.is_syncing); + + // A `quiesce` may have raised the gate between our CAS and here; if + // so, bail (dropping `guard`, which clears the slot) so the drain + // can complete and teardown gets a true "no further pass" barrier. + // SeqCst — load-half of the handshake described above. + if self.quiescing.load(Ordering::SeqCst) { + return None; + } + Some(guard) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::sync::oneshot; + + fn make_lifecycle() -> Arc { + let registry = ThreadRegistry::::new(); + Arc::new(CoordinatorLifecycle::new( + registry, + WalletWorker::IdentitySync, + 60, + )) + } + + /// With NO background loop registered, `quiesce` must still raise the + /// `quiescing` gate — so a concurrent direct `sync_now`/`sync_wallet` + /// that lands after it bails — and drain an already-in-flight direct + /// pass before returning. The registry's drain hook cannot cover this: + /// `registry.quiesce` early-returns `NotRunning` WITHOUT running the + /// hook when no loop slot exists, so the gate would otherwise never go + /// up and the in-flight pass would not be drained. Regression for the + /// `clear_shielded`/`stop` contract ("a concurrent direct + /// sync_now/sync_wallet is held off"). Must fail against the pre-fix + /// `quiesce` that only delegated to the registry. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn quiesce_raises_gate_and_drains_direct_pass_without_background_loop() { + let lifecycle = make_lifecycle(); + assert!( + !lifecycle.is_running(), + "precondition: no background loop registered" + ); + + // A direct sync_now/sync_wallet pass already past `begin_pass`, held + // in flight on a task until we release it. + let (ready_tx, ready_rx) = oneshot::channel::<()>(); + let (release_tx, release_rx) = oneshot::channel::<()>(); + let lc_pass = Arc::clone(&lifecycle); + let pass_task = tokio::spawn(async move { + let _pass = lc_pass.begin_pass().expect("first pass enters the slot"); + ready_tx.send(()).expect("signal in-flight"); + release_rx.await.expect("await release"); + // `_pass` drops here → is_syncing = false + }); + + ready_rx.await.expect("pass reached in-flight"); + assert!(lifecycle.is_syncing(), "direct pass holds is_syncing"); + + // Drive `quiesce` concurrently: it must raise the gate, then block + // draining the in-flight pass. + let lc_q = Arc::clone(&lifecycle); + let quiesce_task = tokio::spawn(async move { lc_q.quiesce().await }); + + // Give `quiesce` time to raise the gate and enter the drain. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + lifecycle.quiescing.load(Ordering::Acquire), + "quiesce must raise the gate even with no background loop registered" + ); + assert!( + lifecycle.is_syncing(), + "in-flight direct pass still held; quiesce has not skipped the drain" + ); + assert!( + !quiesce_task.is_finished(), + "quiesce must block until the in-flight pass drains" + ); + + // Release the pass; `quiesce` drains `is_syncing`, then returns. + release_tx.send(()).expect("release the pass"); + let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task) + .await + .expect("quiesce completes once the pass drains") + .expect("quiesce task joined"); + assert_eq!(status, CoordinatorThreadStatus::NotRunning); + assert!( + !lifecycle.is_syncing(), + "is_syncing was drained before quiesce returned" + ); + + pass_task.await.expect("pass task joined"); + } + + /// `quiesce_under_held_gate` must NOT lower the `quiescing` gate the + /// caller is holding — the mechanism that lets the shielded Clear flow + /// keep the "no new pass" barrier raised *continuously* across the + /// drain, the liveness check, and the store wipe. The plain + /// [`quiesce`](CoordinatorLifecycle::quiesce)'s own RAII guard would + /// lower it on return, leaving a window a direct pass could slip into + /// before Clear re-raised it. Must fail against a variant that delegates + /// to `quiesce` (whose guard clears the shared flag on drop). + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn quiesce_under_held_gate_keeps_caller_gate_raised() { + let lifecycle = make_lifecycle(); + + // Caller (the Clear flow) raises and holds the gate before draining. + let hold = lifecycle.hold_quiescing_gate(); + assert!( + lifecycle.quiescing.load(Ordering::Acquire), + "caller's hold raised the gate" + ); + + // Drain under the held gate (no loop registered → NotRunning); the + // gate must remain raised across the call. + let status = lifecycle.quiesce_under_held_gate().await; + assert_eq!(status, CoordinatorThreadStatus::NotRunning); + assert!( + lifecycle.quiescing.load(Ordering::Acquire), + "gate stays raised across the drain — no lapse for a direct pass" + ); + + // A direct pass attempting to begin during Clear (gate held) is + // refused: it bails after the CAS on the raised gate. + assert!( + lifecycle.begin_pass().is_none(), + "the continuously-held gate holds off a new direct pass" + ); + + // Once Clear's own guard drops, the gate reopens for later work. + drop(hold); + assert!( + !lifecycle.quiescing.load(Ordering::Acquire), + "gate reopens once the caller's hold guard drops" + ); + } +} diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 8730398f978..165e4f45305 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -47,16 +47,17 @@ //! identities are registered and the SDK is connected. use std::collections::BTreeMap; -use std::sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex as StdMutex, -}; +use std::sync::Arc; + +use dash_async::ThreadRegistry; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use super::coordinator_lifecycle::CoordinatorLifecycle; +use super::WalletWorker; + use dpp::balances::credits::TokenAmount; use dpp::prelude::Identifier; use tokio::sync::RwLock; -use tokio_util::sync::CancellationToken; use dash_sdk::platform::tokens::identity_token_balances::{ IdentityTokenBalances, IdentityTokenBalancesQuery, @@ -158,25 +159,14 @@ where /// over `P` so every `persister.store(...)` call on the hot sync /// loop dispatches statically. persister: Arc

, - /// Cancel token for the background loop, if running. - background_cancel: StdMutex>, - /// Monotonically increasing generation counter. Incremented each - /// time `start()` installs a new cancel token so the exiting - /// thread can tell whether its token is still current. - background_generation: AtomicU64, - interval_secs: AtomicU64, - is_syncing: AtomicBool, - /// Set by [`quiesce`](Self::quiesce) to gate new passes while it - /// drains an in-flight one. `sync_now` bails (after taking the - /// `is_syncing` slot) when this is set, so once `quiesce` observes - /// `is_syncing == false` no further pass can start — giving shutdown - /// a real "no more host-visible persister stores" barrier that + /// Shared lifecycle state + pass-gating protocol under the + /// [`WalletWorker::IdentitySync`] key: the registry handle, polling + /// interval, the `is_syncing` / `quiescing` handshake, and the + /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the + /// `sync_now` pass gate delegate to it. The `quiescing` half gives + /// shutdown a real "no more host-visible persister stores" barrier that /// cancel-only [`stop`](Self::stop) does not provide. - quiescing: AtomicBool, - /// Unix seconds of the last completed pass across all identities. - /// `0` = never. Identity-level timestamps live on the per-identity - /// rows in [`IdentitySyncManager::state`]. - last_sync_unix: AtomicU64, + lifecycle: CoordinatorLifecycle, /// Per-identity registry / cache. Keyed by identity id; each row /// carries the per-(identity, token) token rows plus the /// per-identity last-sync timestamp. @@ -199,16 +189,19 @@ where /// writes). The registry starts empty — call /// [`register_identity`](Self::register_identity) before /// [`start`](Self::start). - pub fn new(sdk: Arc, persister: Arc

) -> Self { + pub fn new( + sdk: Arc, + persister: Arc

, + registry: Arc>, + ) -> Self { Self { sdk, persister, - background_cancel: StdMutex::new(None), - background_generation: AtomicU64::new(0), - interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), - is_syncing: AtomicBool::new(false), - quiescing: AtomicBool::new(false), - last_sync_unix: AtomicU64::new(0), + lifecycle: CoordinatorLifecycle::new( + registry, + WalletWorker::IdentitySync, + DEFAULT_SYNC_INTERVAL_SECS, + ), state: RwLock::new(BTreeMap::new()), } } @@ -309,35 +302,28 @@ where /// /// The running loop picks this up on its next sleep. pub fn set_interval(&self, interval: Duration) { - let secs = interval.as_secs().max(1); - self.interval_secs.store(secs, Ordering::Release); + self.lifecycle.set_interval(interval); } /// Current polling interval. pub fn interval(&self) -> Duration { - Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + self.lifecycle.interval() } /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.background_cancel - .lock() - .map(|g| g.is_some()) - .unwrap_or(false) + self.lifecycle.is_running() } /// Whether a sync pass is in flight right now. pub fn is_syncing(&self) -> bool { - self.is_syncing.load(Ordering::Acquire) + self.lifecycle.is_syncing() } /// Unix seconds of the last completed pass (across all identities), /// or `None` if no pass has ever completed. pub fn last_sync_unix_seconds(&self) -> Option { - match self.last_sync_unix.load(Ordering::Acquire) { - 0 => None, - n => Some(n), - } + self.lifecycle.last_sync_unix_seconds() } /// Per-identity last-sync timestamp. @@ -395,27 +381,34 @@ where /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). pub fn start(self: Arc) { - let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned"); - if guard.is_some() { - return; - } - let cancel = CancellationToken::new(); - *guard = Some(cancel.clone()); - let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; - drop(guard); - + // Reopen the quiescing gate so this (re)start's passes can run; a + // prior quiesce raised it via the drain hook. + self.lifecycle.reopen_quiescing_gate(); + + let cfg = self.lifecycle.worker_config(); + + // The loop drives `!Send` SDK futures via `Handle::block_on` on a + // dedicated OS thread (the registry spawns it). The handle is + // captured from this tokio context; the new thread is not itself a + // tokio worker. `biased` polls the cancel arm first, so a pass + // stalled on a hung SDK fetch is dropped at its `.await` the + // instant the registry cancels — clearing `is_syncing` promptly so + // the join lands inside the budget. let handle = tokio::runtime::Handle::current(); - let this = self; - std::thread::Builder::new() - .name("identity-sync".into()) - .spawn(move || { + let this = Arc::clone(&self); + self.lifecycle + .registry() + .start_thread(self.lifecycle.worker(), cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { break; } - - this.sync_now().await; + tokio::select! { + biased; + _ = cancel.cancelled() => break, + _ = this.sync_now() => {} + } let interval = this.interval(); tokio::select! { @@ -423,17 +416,8 @@ where _ = cancel.cancelled() => break, } } - - // Only clear the slot if no newer start() has - // installed a replacement token since we launched. - if let Ok(mut guard) = this.background_cancel.lock() { - if this.background_generation.load(Ordering::Acquire) == my_gen { - *guard = None; - } - } }); - }) - .expect("failed to spawn identity-sync thread"); + }); } /// Stop the background sync loop. No-op if not running. @@ -445,14 +429,7 @@ where /// by manager shutdown so the host can free the persister context — /// use [`quiesce`](Self::quiesce). pub fn stop(&self) { - if let Some(token) = self - .background_cancel - .lock() - .expect("bg_cancel poisoned") - .take() - { - token.cancel(); - } + self.lifecycle.stop(); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -473,13 +450,14 @@ where /// so its falling edge (with the gate up) is a sound "fully drained" /// signal. The gate is reopened before returning so a later /// start/sync works normally. - pub async fn quiesce(&self) { - self.quiescing.store(true, Ordering::Release); - self.stop(); - while self.is_syncing.load(Ordering::Acquire) { - tokio::time::sleep(Duration::from_millis(20)).await; - } - self.quiescing.store(false, Ordering::Release); + /// + /// Finally **joins** the loop's OS thread (after the drain, so the + /// thread is on its way out) and returns its terminal status. Joining + /// while the runtime is still alive is what lets the manager promise + /// the `!Send` loop has stopped touching `tokio::time` before a + /// one-shot host drops the runtime. + pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { + self.lifecycle.quiesce().await } /// Run one sync pass across every registered identity. @@ -493,22 +471,13 @@ where /// `!Send` (no `tokio::spawn`) and because the design brief /// explicitly forbids it. pub async fn sync_now(&self) { - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { + // Claim the pass slot and honour the quiescing gate; bail without + // work (and without a `persister.store(...)` after quiesce returns) + // if a pass is already in flight or a teardown raised the gate. The + // returned guard clears `is_syncing` on every exit path. + let Some(_pass) = self.lifecycle.begin_pass() else { return; - } - - // A `quiesce()` may have raised the gate between our CAS and - // here; if so, release the slot and bail without running a pass - // so the drain can complete and shutdown gets a true barrier - // (no further `persister.store(...)` after quiesce returns). - if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); - return; - } + }; // Snapshot the per-identity watch list under a short read // lock and release it before any network call. We keep @@ -531,8 +500,8 @@ where .duration_since(UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); - self.last_sync_unix.store(now, Ordering::Release); - self.is_syncing.store(false, Ordering::Release); + self.lifecycle.store_last_sync_unix(now); + // `_pass` drops here → `is_syncing = false` } /// Sync a single identity's watched tokens against Platform. @@ -673,7 +642,7 @@ where f.debug_struct("IdentitySyncManager") .field("is_running", &self.is_running()) .field("is_syncing", &self.is_syncing()) - .field("interval_secs", &self.interval_secs.load(Ordering::Acquire)) + .field("interval_secs", &self.lifecycle.interval_secs()) .field("last_sync_unix", &self.last_sync_unix_seconds()) .finish() } @@ -749,7 +718,8 @@ mod tests { fn make_manager() -> Arc> { let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); let persister = Arc::new(NoopPersister); - Arc::new(IdentitySyncManager::new(sdk, persister)) + let registry = ThreadRegistry::new(); + Arc::new(IdentitySyncManager::new(sdk, persister, registry)) } fn make_recording_manager() -> ( @@ -758,8 +728,13 @@ mod tests { ) { let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); let persister = Arc::new(RecordingPersister::new()); + let registry = ThreadRegistry::new(); ( - Arc::new(IdentitySyncManager::new(sdk, Arc::clone(&persister))), + Arc::new(IdentitySyncManager::new( + sdk, + Arc::clone(&persister), + registry, + )), persister, ) } @@ -880,63 +855,6 @@ mod tests { assert_eq!(mgr.interval(), Duration::from_secs(120)); } - /// `quiesce()` must not return while a pass is in flight, and must - /// return promptly once the pass drains. - /// - /// Drives the real `is_syncing` lifecycle: a background task takes - /// the slot via the same `compare_exchange` the real `sync_now` - /// uses, holds it across a sleep (standing in for the pass body + - /// persister fan-out, which `sync_now` keeps the flag set across), - /// then clears it. We assert `quiesce()` is still pending while the - /// flag is held and completes after it falls — i.e. the falling edge - /// of `is_syncing` is what unblocks the barrier. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn quiesce_blocks_until_in_flight_pass_drains() { - let mgr = make_manager(); - - // Stand in for an in-flight `sync_now`: take the `is_syncing` - // slot exactly as the real pass does, hold it, then release. - let holder = Arc::clone(&mgr); - let pass = tokio::spawn(async move { - assert!( - holder - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_ok(), - "test should own the is_syncing slot" - ); - tokio::time::sleep(Duration::from_millis(200)).await; - holder.is_syncing.store(false, Ordering::Release); - }); - - // Give the holder task a chance to take the slot before we - // start draining. - while !mgr.is_syncing() { - tokio::time::sleep(Duration::from_millis(5)).await; - } - - let quiesce_fut = mgr.quiesce(); - tokio::pin!(quiesce_fut); - - // While the pass holds the flag, quiesce must stay pending. - tokio::select! { - _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"), - _ = tokio::time::sleep(Duration::from_millis(50)) => {} - } - assert!(mgr.is_syncing(), "pass should still be in flight"); - - // Once the pass drains, quiesce must return (well within a - // generous bound — it polls every 20ms). - tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut) - .await - .expect("quiesce did not return after the pass drained"); - - // The gate is reopened before quiesce returns. - assert!(!mgr.quiescing.load(Ordering::Acquire)); - assert!(!mgr.is_syncing()); - pass.await.unwrap(); - } - /// A `sync_now()` invoked while `quiescing` is set must bail without /// running the pass — in particular, without calling /// `persister.store(...)`. This is the gate that prevents a pass @@ -948,8 +866,8 @@ mod tests { let token_x = Identifier::from([10u8; 32]); mgr.register_identity(id_a, [token_x]).await; - // Raise the gate as `quiesce()` would. - mgr.quiescing.store(true, Ordering::Release); + // Raise the gate as `quiesce()` would, held across the pass. + let _gate = mgr.lifecycle.hold_quiescing_gate(); mgr.sync_now().await; diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 3d04ca086d0..fd22ec6d177 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -1,6 +1,7 @@ //! Multi-wallet manager with SPV coordination. pub mod accessors; +mod coordinator_lifecycle; pub mod identity_sync; mod load; pub mod platform_address_sync; @@ -10,13 +11,12 @@ mod wallet_lifecycle; use std::sync::Arc; +use dash_async::{ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig}; use tokio::sync::{Notify, RwLock}; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; use key_wallet_manager::WalletManager; -use crate::changeset::{spawn_wallet_event_adapter, PlatformWalletPersistence}; +use crate::changeset::{wallet_event_adapter_loop, PlatformWalletPersistence}; use crate::events::{PlatformEventHandler, PlatformEventManager}; use crate::manager::identity_sync::IdentitySyncManager; use crate::manager::platform_address_sync::PlatformAddressSyncManager; @@ -28,6 +28,30 @@ use crate::wallet::core::BalanceUpdateHandler; use crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId}; use crate::wallet::PlatformWallet; +/// Identity of a background worker on the manager's shared +/// [`ThreadRegistry`]. The three periodic sync coordinators run as +/// OS-thread workers (their SDK futures are `!Send`); the wallet-event +/// adapter runs as a tokio task. Drained in ascending weight order on +/// [`shutdown`](PlatformWalletManager::shutdown): the coordinators +/// (weight 0) first, then the event adapter (weight 10) they store into. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub enum WalletWorker { + /// Platform-address (BLAST) balance sync loop. + PlatformAddressSync, + /// Per-identity token-state sync loop. + IdentitySync, + /// Shielded (Orchard) note sync loop. + ShieldedSync, + /// Wallet-event adapter task (sinks coordinator stores). + EventAdapter, +} + +/// Teardown weight of the periodic sync coordinators — drained first. +pub(crate) const COORDINATOR_WEIGHT: ShutdownWeight = ShutdownWeight(0); +/// Teardown weight of the wallet-event adapter — drained after the +/// coordinators that feed it. +pub(crate) const EVENT_ADAPTER_WEIGHT: ShutdownWeight = ShutdownWeight(10); + /// Multi-wallet coordinator with SPV sync and event handling. /// /// Events are dispatched through [`PlatformEventManager`] to all registered @@ -82,13 +106,200 @@ pub struct PlatformWalletManager { #[cfg(feature = "shielded")] pub(super) event_manager: Arc, pub(super) persister: Arc

, - /// Cancellation token + join handle for the wallet-event adapter - /// task. Held so [`shutdown`] can stop it cleanly when the manager - /// is torn down. - pub(super) event_adapter_cancel: CancellationToken, - pub(super) event_adapter_join: tokio::sync::Mutex>>, + /// Shared worker-lifecycle engine. Owns every background worker's + /// cancellation token + join handle, the restart reap-or-park, and the + /// orphan list. The coordinators hold a clone and register their loops + /// on it; the event adapter runs here as a tokio task. [`shutdown`] + /// drains it in weight order and joins every worker before returning. + pub(super) registry: Arc>, } +/// How one background coordinator thread terminated. +/// +/// The three periodic coordinators run their loops on dedicated OS +/// threads (the SDK futures are `!Send`, so they ride +/// [`Handle::block_on`](tokio::runtime::Handle::block_on) rather than +/// `tokio::spawn`). [`PlatformWalletManager::shutdown`] joins each +/// thread and reports how it ended so a host can tell a clean wind-down +/// from a panicked loop instead of silently dropping the thread. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CoordinatorThreadStatus { + /// The loop exited and its thread/task joined cleanly. + Ok, + /// The thread/task exited for a non-panic reason that is not a clean + /// return — e.g. a tokio task was cancelled or aborted. Carries a + /// reason string when one is available. + Stopped(Option), + /// The thread/task panicked; carries the best-effort panic message. + Panicked(String), + /// The join did not complete within [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. + Timeout, + /// No thread/task was running to join — never started, or already + /// joined by a previous `shutdown()`. + NotRunning, + /// Infrastructural join failure that is neither a timeout nor a + /// panic — e.g. the `spawn_blocking` task itself failed because + /// the runtime was torn down before the join could run (unreachable + /// in normal operation). + Error(String), + /// At least one coordinator OS thread that an earlier tight + /// `stop()`→`start()` reap had to detach past its 1 s wedge-backstop + /// was still alive at the shutdown deadline. + /// + /// Such a thread was parked in the shared [`ThreadRegistry`]'s orphan + /// list (not silently dropped) precisely so this case is visible. + /// A still-live detached thread keeps an `Arc` to the host event + /// handler and may fire one final callback, so the host must NOT + /// free the callback context yet — this status keeps + /// [`is_clean`](Self::is_clean) `false` so the FFI `destroy` returns + /// `ErrorShutdownIncomplete` instead of `ok()`. + Detached, +} + +impl CoordinatorThreadStatus { + /// `true` only for a fully clean outcome: joined normally (`Ok`) or + /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, + /// `Error`, and `Detached` are all considered non-clean. + pub fn is_clean(&self) -> bool { + matches!(self, Self::Ok | Self::NotRunning) + } +} + +/// Relocate a registry [`WorkerStatus`](dash_async::WorkerStatus) into the +/// FFI-stable `CoordinatorThreadStatus`. The variant sets and payloads +/// correspond 1:1, so the body is an exhaustive by-name `From` match that +/// the compiler keeps total. The two enums intentionally keep their own +/// declaration order and carry no `#[repr]`, so this is a match, never a +/// layout-compatible cast — the FFI `destroy` / shielded-stop adapters keep +/// reading the same logical shape. +impl From for CoordinatorThreadStatus { + fn from(status: dash_async::WorkerStatus) -> Self { + use dash_async::WorkerStatus as W; + match status { + W::Ok => Self::Ok, + W::Stopped(reason) => Self::Stopped(reason), + W::Panicked(msg) => Self::Panicked(msg), + W::Timeout => Self::Timeout, + W::Detached => Self::Detached, + W::NotRunning => Self::NotRunning, + W::Error(msg) => Self::Error(msg), + } + } +} + +/// Per-thread terminal status of every background worker, returned by +/// [`PlatformWalletManager::shutdown`]. +/// +/// A host that drops its tokio runtime right after `shutdown()` +/// (one-shot / headless / stdio) reads this to confirm each `!Send` +/// coordinator loop fully wound down on its OS thread *before* the +/// runtime goes away — closing the race where a still-polling loop hits +/// `tokio::time` on a shutting-down runtime and panics with +/// `A Tokio 1.x context was found, but it is being shutdown`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CoordinatorExitStatus { + /// Platform-address (BLAST) balance sync loop. + pub platform_address_sync: CoordinatorThreadStatus, + /// Per-identity token-state sync loop. + pub identity_sync: CoordinatorThreadStatus, + /// Shielded (Orchard) note sync loop. `None` in builds without the + /// `shielded` feature (the coordinator does not exist). + pub shielded_sync: Option, + /// Wallet-event adapter (a `tokio` task, not an OS thread). + pub event_adapter: CoordinatorThreadStatus, + /// Aggregate status of any coordinator OS threads that an earlier + /// tight `stop()`→`start()` reap had to detach past its 1 s + /// wedge-backstop and park in the shared [`ThreadRegistry`]'s orphan + /// list. + /// + /// [`Ok`](CoordinatorThreadStatus::Ok) when none were detached (or + /// every detached thread has since joined cleanly); + /// [`Detached`](CoordinatorThreadStatus::Detached) when at least one + /// is still alive at the shutdown deadline. This is what keeps + /// [`all_clean`](Self::all_clean) honest for the wedge case the rest + /// of the teardown can't see — without it a detached-but-still-live + /// thread would let the host free a callback context the thread may + /// still touch (a residual use-after-free). + pub detached_threads: CoordinatorThreadStatus, +} + +impl CoordinatorExitStatus { + /// `true` only when every worker — including any parked + /// [`detached_threads`](Self::detached_threads) — is + /// [`Ok`](CoordinatorThreadStatus::Ok) or + /// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any + /// `Stopped`, `Panicked`, `Timeout`, `Error`, or `Detached` slot + /// makes it `false`. + pub fn all_clean(&self) -> bool { + self.platform_address_sync.is_clean() + && self.identity_sync.is_clean() + && self.shielded_sync.as_ref().is_none_or(|s| s.is_clean()) + && self.event_adapter.is_clean() + && self.detached_threads.is_clean() + } + + /// Build the FFI-stable exit status from the registry's weight-ordered + /// [`ShutdownReport`]. A worker absent from the report never ran, so it + /// maps to [`NotRunning`](CoordinatorThreadStatus::NotRunning); a + /// non-zero orphan-survivor count surfaces as + /// [`Detached`](CoordinatorThreadStatus::Detached), keeping + /// [`all_clean`](Self::all_clean) honest for a still-live wedged thread. + pub(crate) fn from_report(report: ShutdownReport) -> Self { + let worker = |key: WalletWorker| -> CoordinatorThreadStatus { + report + .per_worker + .get(&key) + .cloned() + .map(CoordinatorThreadStatus::from) + .unwrap_or(CoordinatorThreadStatus::NotRunning) + }; + Self { + platform_address_sync: worker(WalletWorker::PlatformAddressSync), + identity_sync: worker(WalletWorker::IdentitySync), + #[cfg(feature = "shielded")] + shielded_sync: Some(worker(WalletWorker::ShieldedSync)), + #[cfg(not(feature = "shielded"))] + shielded_sync: None, + event_adapter: worker(WalletWorker::EventAdapter), + detached_threads: if report.detached > 0 { + CoordinatorThreadStatus::Detached + } else { + CoordinatorThreadStatus::Ok + }, + } + } +} + +/// Maximum time (seconds) the teardown paths — `shutdown()`, +/// `clear_shielded`, and the FFI shielded-stop bridge — wait for one +/// coordinator's quiesce+join to complete. +/// +/// This is a backstop, not the primary stop mechanism. `quiesce()` +/// cancels the loop, which aborts any in-flight pass at its `.await` +/// point (see each coordinator's `start()` select), so the `is_syncing` +/// drain clears promptly and the join normally lands far inside this +/// window. The deadline fires only if a pass's *drop* itself wedges +/// (e.g. a blocking destructor); on timeout the coordinator slot reports +/// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever. +pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; + +/// Grace period (seconds) [`PlatformWalletManager::shutdown`] spends +/// polling any orphans parked in the shared [`ThreadRegistry`] before +/// declaring a survivor [`Detached`](CoordinatorThreadStatus::Detached). +/// +/// Unlike a live coordinator — whose `quiesce()` may legitimately spend +/// seconds draining an in-flight pass, hence the 30 s +/// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] — an orphan is a thread an earlier reap +/// already had to detach *because it was wedged past its 1 s backstop*. +/// A healthy detached thread finishes within milliseconds of the +/// cancellation it long ago received (so `is_finished()` is usually true +/// on the first poll and the join is instant); one still alive after this +/// grace is wedged in a non-yielding `Drop` and will not finish however +/// long we wait. A short grace therefore separates "finishing" from +/// "wedged" without stretching teardown, and reporting `Detached` is the +/// conservative, UAF-safe outcome (the host delays freeing its context). +pub(crate) const SHUTDOWN_ORPHAN_GRACE_SECS: u64 = 1; + impl PlatformWalletManager

{ /// Create a new PlatformWalletManager. /// @@ -104,14 +315,28 @@ impl PlatformWalletManager

{ let wallets = Arc::new(RwLock::new(std::collections::BTreeMap::new())); let lock_notify = Arc::new(Notify::new()); - // Spawn the wallet-event adapter that translates upstream - // `WalletEvent`s into `CoreChangeSet`s and forwards them to - // the persister. - let event_adapter_cancel = CancellationToken::new(); - let event_adapter_join = spawn_wallet_event_adapter( - Arc::clone(&wallet_manager), - Arc::clone(&persister), - event_adapter_cancel.clone(), + // Shared worker-lifecycle engine. The 1 s reap backstop (separate + // from the 30 s managed-join budget) is the grace a wedged prior + // thread gets before it is reported `Detached`. + let registry = ThreadRegistry::with_reap_backstop(std::time::Duration::from_secs( + SHUTDOWN_ORPHAN_GRACE_SECS, + )); + + // Register the wallet-event adapter as a tokio task on the + // registry. It sinks the coordinators' stores, so it drains AFTER + // them (weight 10 vs the coordinators' 0). + let adapter_wallet_manager = Arc::clone(&wallet_manager); + let adapter_persister = Arc::clone(&persister); + registry.start_task( + WalletWorker::EventAdapter, + WorkerConfig { + weight: EVENT_ADAPTER_WEIGHT, + join_budget: std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: None, + }, + move |cancel| { + wallet_event_adapter_loop(adapter_wallet_manager, adapter_persister, cancel) + }, ); // Build handler list: app handler + internal handlers. @@ -135,10 +360,12 @@ impl PlatformWalletManager

{ let platform_address_sync = Arc::new(PlatformAddressSyncManager::new( Arc::clone(&wallets), Arc::clone(&event_manager), + Arc::clone(®istry), )); let identity_sync = Arc::new(IdentitySyncManager::new( Arc::clone(&sdk), Arc::clone(&persister), + Arc::clone(®istry), )); #[cfg(feature = "shielded")] let shielded_coordinator: Arc< @@ -148,6 +375,7 @@ impl PlatformWalletManager

{ let shielded_sync = Arc::new(ShieldedSyncManager::new( Arc::clone(&event_manager), Arc::clone(&shielded_coordinator), + Arc::clone(®istry), )); Self { sdk, @@ -164,8 +392,7 @@ impl PlatformWalletManager

{ #[cfg(feature = "shielded")] event_manager, persister, - event_adapter_cancel, - event_adapter_join: tokio::sync::Mutex::new(Some(event_adapter_join)), + registry, } } @@ -278,47 +505,696 @@ impl PlatformWalletManager

{ /// disk but its contents are reset to empty so the next bind cold- /// resyncs from index 0. /// - /// Returns an error if the coordinator's store reset fails; the host - /// must not commit its own persistence wipe in that case. + /// Returns an error — and leaves the store untouched — in two cases, so + /// the host knows **not** to commit its own persistence wipe: + /// - the in-flight sync pass did not drain cleanly (timed out on the join + /// backstop, or its loop ended non-cleanly) → + /// [`crate::error::PlatformWalletError::ShieldedShutdownIncomplete`]; or + /// - the coordinator's store reset itself fails. + /// + /// **Host-serialization precondition**: the caller must not invoke + /// `shielded_sync_start` for this manager concurrently with `clear`. A + /// concurrent direct `sync_now`/`sync_wallet` is held off — the quiescing + /// gate is raised *continuously* for the whole clear (from before the + /// drain, across the liveness check, through the wipe), so such a pass + /// observes the gate and bails with no lapse. The one remaining residual + /// is a full `shielded_sync_start` racing `clear`: a restart spawns a + /// fresh loop and reopens the gate, so it could re-persist into the wiped + /// store. The wallet UI drives these from one place; that ordering is the + /// host's contract until the registry grows a per-key clearing latch. #[cfg(feature = "shielded")] pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> { - self.shielded_sync_manager.quiesce().await; + self.clear_shielded_inner(std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS)) + .await + } + + /// [`clear_shielded`](Self::clear_shielded) with an explicit drain + /// deadline. Split out so tests can exercise the timeout path without + /// waiting the full production budget. + #[cfg(feature = "shielded")] + async fn clear_shielded_inner( + &self, + drain_timeout: std::time::Duration, + ) -> Result<(), crate::error::PlatformWalletError> { + // Raise and HOLD the shielded quiescing gate for the WHOLE clear, + // BEFORE quiescing — so the "no new pass" barrier never lapses + // between the drain, the liveness check, and the store wipe: a direct + // `sync_now`/`sync_wallet` landing anywhere in here observes the gate + // and bails instead of re-persisting into the store we are about to + // clear. `quiesce_under_held_gate` deliberately does NOT touch the + // gate (a single `AtomicFlagGuard` always clears the flag on drop, so + // letting `quiesce` manage it and re-raising afterwards would leave a + // window). The guard lowers the gate on return (every path). + let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate(); + + // Cancel the loop and drain any in-flight pass (incl. its persister + // fan-out). Bound the drain (mirroring `shielded_sync_stop`'s + // timeout) so a heavy direct pass cannot hang the host's Clear: on + // timeout the clear reports `Timeout` and aborts BEFORE the wipe, + // leaving the store intact. + let status = match tokio::time::timeout( + drain_timeout, + self.shielded_sync_manager.quiesce_under_held_gate(), + ) + .await + { + Ok(status) => status, + Err(_elapsed) => CoordinatorThreadStatus::Timeout, + }; + + // Only commit the store wipe once the in-flight pass has fully + // drained. A partial/timed-out drain could let a surviving pass + // write into a store we just cleared, desyncing the host's own + // wipe from a repopulated tree. + if !status.is_clean() { + return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status }); + } + + // Also refuse if a prior-generation shielded thread is still parked + // alive: it holds an `Arc` to the persister/store and could re-persist + // notes into the store we are about to wipe. The check is shielded- + // scoped (shares the `shielded_worker_alive` gate), so the other + // coordinators / the always-on event adapter running normally do not + // block Clear. + if self.shielded_worker_alive() { + return Err( + crate::error::PlatformWalletError::ShieldedShutdownIncomplete { + status: CoordinatorThreadStatus::Detached, + }, + ); + } if let Some(coord) = self.shielded_coordinator().await { coord.clear().await?; } Ok(()) } - /// Stop all background tasks and wait for them to exit. + /// Stop all background workers and wait for them to exit. /// - /// **Quiesces** the periodic coordinators - /// (`PlatformAddressSyncManager`, `IdentitySyncManager`, - /// `ShieldedSyncManager`) — cancelling each loop *and draining any - /// in-flight pass to completion*, including its persister / - /// host-callback fan-out — then drains the wallet-event adapter task. - /// Idempotent. Call before dropping the manager when a clean - /// shutdown is required (e.g. on app termination); a dirty drop - /// simply leaks the tasks until the runtime exits. + /// Delegates to the shared [`ThreadRegistry::shutdown`], which drains + /// in ascending weight order: the periodic coordinators (weight 0) + /// first — concurrently, since they share no lock — then the + /// wallet-event adapter (weight 10) that sinks their stores, then any + /// parked orphans. Each worker's drain raises its `quiescing` gate, + /// cancels the loop, and **joins** its OS thread / task, so when this + /// returns every `!Send` loop has fully exited. Idempotent. /// /// Ordering matters: cancel-only `stop()` would let a pass already /// inside `sync_now` keep running and call `persister.store(...)` / - /// fire a host completion callback after the FFI's `destroy` - /// returned and the host freed the persister / event-handler - /// context — a use-after-free. So we `quiesce()` the sync managers - /// FIRST (so no further persister store or host callback can start), - /// and only THEN cancel + join the event adapter, which is the sink - /// those stores feed into. - pub async fn shutdown(&self) { - self.platform_address_sync_manager.quiesce().await; - self.identity_sync_manager.quiesce().await; + /// fire a host completion callback after the FFI's `destroy` returned + /// and the host freed the persister / event-handler context — a + /// use-after-free. Quiescing the coordinators (weight 0) before the + /// event adapter (weight 10) closes that window: no further store can + /// start before its sink is torn down. + /// + /// A host that drops the tokio runtime right after `shutdown()` + /// (one-shot / headless / stdio) is therefore safe — no coordinator + /// can still be polling `tokio::time` on a shutting-down runtime. The + /// returned [`CoordinatorExitStatus`] reports per-worker how each ended. + /// + /// **Precondition: must be called from a multi-thread Tokio runtime.** + /// Each coordinator's OS thread drives its loop via + /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs the + /// runtime's timer/IO driver; a `current_thread` runtime can only + /// service one `block_on` at a time, so the join would deadlock. + /// [`ThreadRegistry::shutdown`] asserts this in both debug and release. + /// + /// Each worker's join is bounded by its own + /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] budget; on timeout its handle is + /// re-parked and the slot reports + /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever + /// (the F1 fix — a dropped/timed-out join can never detach a live + /// thread). The clear-on-panic half rides on unwinding, so it holds + /// under `panic = "unwind"`; under the iOS `panic = "abort"` profiles a + /// pass panic aborts the process outright. + pub async fn shutdown(&self) -> CoordinatorExitStatus { + CoordinatorExitStatus::from_report(self.registry.shutdown().await) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering as AO}; + use std::time::Duration; + + use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet}; + use crate::manager::platform_address_sync::PlatformAddressSyncSummary; + + /// No-op persister — the lifecycle tests below never exercise the + /// real persistence pipeline, they just need a handle that satisfies + /// the manager's `P` bound. + struct NoopPersister; + + impl PlatformWalletPersistence for NoopPersister { + fn store( + &self, + _wallet_id: WalletId, + _changeset: PlatformWalletChangeSet, + ) -> Result<(), PersistenceError> { + Ok(()) + } + + fn flush(&self, _wallet_id: WalletId) -> Result<(), PersistenceError> { + Ok(()) + } + + fn load(&self) -> Result { + Ok(ClientStartState::default()) + } + } + + /// No-op event handler standing in for the host's FFI handler. + struct NoopHandler; + impl dash_spv::EventHandler for NoopHandler {} + impl PlatformEventHandler for NoopHandler {} + + /// Build a manager over a mock SDK + no-op persister/handler. Cheap: + /// `new` wires the sub-managers and spawns the event adapter but + /// starts no coordinator threads. + fn make_manager() -> PlatformWalletManager { + let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); + let persister = Arc::new(NoopPersister); + let handler: Arc = Arc::new(NoopHandler); + PlatformWalletManager::new(sdk, persister, handler) + } + + /// Build a manager that fires a slow (300 ms std::thread::sleep) callback + /// on `on_platform_address_sync_completed`. Used by the in-flight-pass + /// drain test. + fn make_manager_with_slow_handler( + started: Arc, + completed: Arc, + ) -> PlatformWalletManager { + struct SlowHandler { + started: Arc, + completed: Arc, + } + impl dash_spv::EventHandler for SlowHandler {} + impl PlatformEventHandler for SlowHandler { + fn on_platform_address_sync_completed(&self, _: &PlatformAddressSyncSummary) { + self.started.store(true, AO::Release); + std::thread::sleep(Duration::from_millis(300)); + self.completed.store(true, AO::Release); + } + } + + let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); + let persister = Arc::new(NoopPersister); + let handler: Arc = Arc::new(SlowHandler { started, completed }); + PlatformWalletManager::new(sdk, persister, handler) + } + + /// Start every periodic coordinator's background OS-thread loop. + fn start_coordinators(m: &PlatformWalletManager

) { + Arc::clone(&m.platform_address_sync_manager).start(); + Arc::clone(&m.identity_sync_manager).start(); + #[cfg(feature = "shielded")] + Arc::clone(&m.shielded_sync_manager).start(); + } + + /// Happy path: `shutdown()` joins every started worker and reports + /// `Ok`; it completes within a bounded time (no `spawn_blocking` + /// starvation/deadlock); a second `shutdown()` finds nothing left to + /// join (`NotRunning`) — idempotent. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() { + let manager = make_manager(); + start_coordinators(&manager); + // Let the loops enter `block_on` so we exercise the live-loop + // join path (a thread cancelled before its first poll joins too). + tokio::time::sleep(Duration::from_millis(50)).await; + + let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown()) + .await + .expect("shutdown join must complete within bound"); + assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok); #[cfg(feature = "shielded")] - self.shielded_sync_manager.quiesce().await; + assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok)); + #[cfg(not(feature = "shielded"))] + assert_eq!(status.shielded_sync, None); + assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok); + assert!(status.all_clean()); - self.event_adapter_cancel.cancel(); - if let Some(handle) = self.event_adapter_join.lock().await.take() { - if let Err(e) = handle.await { - tracing::warn!(error = ?e, "Wallet event adapter task join error"); + // Handles consumed by the first join → nothing left to join. + let again = manager.shutdown().await; + assert_eq!( + again.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + assert_eq!(again.identity_sync, CoordinatorThreadStatus::NotRunning); + assert_eq!(again.event_adapter, CoordinatorThreadStatus::NotRunning); + assert!(again.all_clean()); + } + + /// Never-started coordinators report `NotRunning` (no thread to + /// join). The event adapter is spawned in `new`, so it still joins + /// `Ok`. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn shutdown_without_starting_reports_not_running() { + let manager = make_manager(); + + let status = manager.shutdown().await; + assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::NotRunning); + #[cfg(feature = "shielded")] + assert_eq!( + status.shielded_sync, + Some(CoordinatorThreadStatus::NotRunning) + ); + #[cfg(not(feature = "shielded"))] + assert_eq!(status.shielded_sync, None); + assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok); + assert!(status.all_clean()); + } + + /// `Stopped` and `Timeout` are NOT clean; `Ok` and `NotRunning` ARE. + /// Unit-tests the `is_clean` predicate directly so we don't need to + /// trigger a real timeout (30s) in a deterministic test. + #[test] + fn coordinator_thread_status_clean_predicate() { + assert!(CoordinatorThreadStatus::Ok.is_clean()); + assert!(CoordinatorThreadStatus::NotRunning.is_clean()); + + assert!(!CoordinatorThreadStatus::Stopped(None).is_clean()); + assert!(!CoordinatorThreadStatus::Stopped(Some("cancelled".into())).is_clean()); + assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean()); + assert!(!CoordinatorThreadStatus::Timeout.is_clean()); + assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean()); + // A detached-but-still-live coordinator thread is non-clean: the + // host must not free its callback context yet. + assert!(!CoordinatorThreadStatus::Detached.is_clean()); + } + + /// `all_clean()` on `CoordinatorExitStatus` is false whenever any + /// slot is non-clean. + #[test] + fn coordinator_exit_status_all_clean() { + let clean = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Ok, + identity_sync: CoordinatorThreadStatus::NotRunning, + shielded_sync: None, + event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Ok, + }; + assert!(clean.all_clean()); + + let with_timeout = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Timeout, + identity_sync: CoordinatorThreadStatus::Ok, + shielded_sync: None, + event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Ok, + }; + assert!(!with_timeout.all_clean()); + + let with_stopped = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Ok, + identity_sync: CoordinatorThreadStatus::Ok, + shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))), + event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Ok, + }; + assert!(!with_stopped.all_clean()); + + // A still-live detached orphan alone makes the aggregate + // non-clean — the slot the rest of the teardown can't see. + let with_detached = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Ok, + identity_sync: CoordinatorThreadStatus::Ok, + shielded_sync: None, + event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Detached, + }; + assert!(!with_detached.all_clean()); + } + + /// `shutdown()` must wait for an in-flight sync pass to drain before + /// joining the coordinator thread. + /// + /// A slow `on_platform_address_sync_completed` callback (300 ms) + /// keeps `is_syncing=true` while it runs. We call `shutdown()` while + /// the callback is in-flight and assert that `shutdown()` blocks + /// until the callback completes, then returns `Ok`. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_waits_for_in_flight_pass_to_drain() { + let handler_started = Arc::new(AtomicBool::new(false)); + let handler_completed = Arc::new(AtomicBool::new(false)); + let manager = make_manager_with_slow_handler( + Arc::clone(&handler_started), + Arc::clone(&handler_completed), + ); + + // Start the address-sync coordinator; first pass fires immediately. + Arc::clone(&manager.platform_address_sync_manager).start(); + + // Wait until the slow completion callback is running + // (`is_syncing` stays true for its 300 ms duration). + tokio::time::timeout(Duration::from_secs(5), async { + while !handler_started.load(AO::Acquire) { + tokio::time::sleep(Duration::from_millis(5)).await; } + }) + .await + .expect("handler did not start within 5s"); + + // Shutdown must drain the in-flight pass before joining. + let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown()) + .await + .expect("shutdown must complete within 5 s"); + + assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::Ok, + "coordinator must join cleanly after drain" + ); + assert!( + handler_completed.load(AO::Acquire), + "shutdown must not return before the in-flight pass completes" + ); + } + + /// Race regression — start coordinators with a long sleep interval so + /// they spend nearly all their time in a live `tokio::time::sleep`, + /// then `shutdown()` and drop the runtime. + /// + /// With the thread join in `shutdown()` every coordinator has fully + /// exited its `block_on` before `drop(runtime)` — no race possible. + /// Loop 10 times to give any latent race a reliable window: WITHOUT + /// the join, the coordinator's `select!` wakeup (via tokio) would + /// race the runtime teardown and reliably trigger the + /// "Tokio … being shutdown" panic across the 10 iterations. + /// + /// Uses `std::panic::catch_unwind` around `drop(runtime)` rather than + /// a process-global panic hook; the hook would be live for seconds and + /// could swallow diagnostics from other concurrently-running tests. + #[test] + fn shutdown_then_drop_runtime_does_not_panic() { + static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0); + + for _ in 0..10 { + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("build runtime"); + + let status = runtime.block_on(async { + let manager = make_manager(); + // Long interval: coordinator spends ~10 s in a live + // tokio::time::sleep, maximising the race window for a + // join-less runtime drop. + manager + .platform_address_sync_manager + .set_interval(Duration::from_secs(10)); + manager + .identity_sync_manager + .set_interval(Duration::from_secs(10)); + #[cfg(feature = "shielded")] + manager + .shielded_sync_manager + .set_interval(Duration::from_secs(10)); + start_coordinators(&manager); + // Wait for coordinators to finish their first (instant) + // pass and enter the long sleep. + tokio::time::sleep(Duration::from_millis(100)).await; + // shutdown() joins each thread before returning; without + // the join this drop would race the select!/block_on exit. + manager.shutdown().await + }); + + // Wrap the runtime drop in catch_unwind to intercept the specific + // "A Tokio 1.x context ... being shutdown" panic without installing + // a process-wide hook that would suppress diagnostics from other + // concurrently running tests. + let drop_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + drop(runtime); + })); + if let Err(payload) = drop_result { + let msg = payload + .downcast_ref::() + .map(String::as_str) + .or_else(|| payload.downcast_ref::<&str>().copied()) + .unwrap_or(""); + if msg.contains("being shutdown") { + SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst); + } else { + // Unexpected panic — propagate so the test fails loudly. + std::panic::resume_unwind(payload); + } + } + + // Brief settle — any stray thread activity surfaces here. + std::thread::sleep(Duration::from_millis(50)); + + assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok); + assert!(status.all_clean(), "workers did not wind down: {status:?}"); } + + assert_eq!( + SHUTDOWN_PANICS.load(AO::SeqCst), + 0, + "dropping the runtime after shutdown raced a coordinator thread \ + ({} panics across 10 iterations)", + SHUTDOWN_PANICS.load(AO::SeqCst) + ); + } + + /// Spawn a thread that parks until `release` is signalled (or the + /// sender drops), standing in for a coordinator thread wedged in a + /// non-yielding `Drop` that ignores the cancellation it received. + fn spawn_wedged_thread() -> (std::sync::mpsc::Sender<()>, std::thread::JoinHandle<()>) { + let (release_tx, release_rx) = std::sync::mpsc::channel::<()>(); + let handle = std::thread::spawn(move || { + // Block here regardless of any cancellation, exactly like a + // Drop that never yields, until the test releases us. + let _ = release_rx.recv(); + }); + (release_tx, handle) + } + + /// Headline regression: a coordinator thread detached past the reap + /// backstop and parked in the orphans list makes a subsequent + /// `shutdown()` report the result as **non-clean** — so the FFI + /// `destroy` returns `ErrorShutdownIncomplete` and the host delays + /// freeing the callback context the still-live thread may touch. + /// + /// Non-vacuous: if the registry dropped the orphan at reap instead of + /// parking it, `detached_threads` would be `Ok` and `all_clean()` would + /// be `true`, failing both assertions. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_reports_detached_orphan_as_non_clean() { + let manager = make_manager(); + + // Stand in for the genuine-wedge outcome: an earlier tight + // stop()->start() reap had to detach a still-live coordinator thread + // past its backstop, so the registry parked it as an orphan. + let (release_tx, wedged) = spawn_wedged_thread(); + manager + .registry + .park_orphan_for_test(WalletWorker::ShieldedSync, wedged); + + let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown()) + .await + .expect("shutdown must complete within bound"); + + assert_eq!( + status.detached_threads, + CoordinatorThreadStatus::Detached, + "a still-live detached orphan must surface as Detached" + ); + assert!( + !status.all_clean(), + "all_clean() must be false while a detached coordinator thread is \ + still alive: {status:?}" + ); + + // Cleanup: shutdown() re-parked the survivor; release it and reap so + // no live thread leaks past the test. + release_tx.send(()).unwrap(); + let _ = manager.registry.reap_orphans(Duration::from_secs(5)).await; + } + + /// TC-002 (F2): `clear_shielded` must refuse while a prior-generation + /// shielded thread is parked alive — even though the current shielded + /// quiesce is clean and the other coordinators / the always-on event + /// adapter are legitimately running. Releasing + reaping the orphan + /// lets a retry succeed. + /// + /// Non-vacuous: against the pre-fix gate (only `!status.is_clean()`), + /// the clean `NotRunning` quiesce would pass the guard and wipe the + /// store under the live orphan — `clear_shielded` would return `Ok`. + #[cfg(feature = "shielded")] + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn clear_shielded_refuses_while_shielded_orphan_alive() { + let manager = make_manager(); + + // Park a wedged thread under the ShieldedSync key: a prior- + // generation shielded thread an earlier reap could not join. + let (release_tx, wedged) = spawn_wedged_thread(); + manager + .registry + .park_orphan_for_test(WalletWorker::ShieldedSync, wedged); + + assert!(manager.registry.any_alive_for(WalletWorker::ShieldedSync)); + assert!(!manager.shielded_sync_manager.is_running()); + + // Refuses: the live shielded orphan could re-persist into the store + // the wipe is about to clear. + let err = manager + .clear_shielded() + .await + .expect_err("clear_shielded must refuse while a shielded orphan is alive"); + assert!( + matches!( + err, + crate::error::PlatformWalletError::ShieldedShutdownIncomplete { .. } + ), + "expected ShieldedShutdownIncomplete, got {err:?}" + ); + + // Release + reap the orphan; the shielded-scoped gate now clears and + // a retry succeeds (no shielded store configured → clear is a no-op). + release_tx.send(()).unwrap(); + let _ = manager.registry.reap_orphans(Duration::from_secs(5)).await; + assert!(!manager.registry.any_alive_for(WalletWorker::ShieldedSync)); + manager + .clear_shielded() + .await + .expect("clear_shielded must succeed once the orphan is reaped"); + } + + /// SEC-001: `clear_shielded` must BOUND its in-flight-pass drain so a + /// heavy direct `sync_now`/`sync_wallet` that won't drain in time cannot + /// hang the host's Clear. On the drain deadline the clear reports + /// `ShieldedShutdownIncomplete` and aborts BEFORE the store wipe, leaving + /// the store intact. + /// + /// Non-vacuous: against an unbounded drain the held pass keeps + /// `is_syncing` set forever and `clear_shielded_inner` never returns — the + /// test's outer timeout fires and the `expect` below panics. + #[cfg(feature = "shielded")] + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn clear_shielded_aborts_without_wiping_when_drain_times_out() { + let manager = Arc::new(make_manager()); + + // A direct sync pass already in flight (holds `is_syncing`); it never + // drains within the clear's drain budget. + let (ready_tx, ready_rx) = tokio::sync::oneshot::channel::<()>(); + let (release_tx, release_rx) = tokio::sync::oneshot::channel::<()>(); + let ssm = Arc::clone(&manager.shielded_sync_manager); + let pass_task = tokio::spawn(async move { + let _pass = ssm + .begin_pass_for_test() + .expect("direct pass enters the slot"); + ready_tx.send(()).expect("signal in-flight"); + release_rx.await.expect("await release"); + // `_pass` drops here → is_syncing = false + }); + + ready_rx.await.expect("pass reached in-flight"); + assert!(manager.shielded_sync_manager.is_syncing()); + + // Clear with a short drain budget: the held pass can't drain in time, + // so the clear must return ShieldedShutdownIncomplete — bounded, never + // hanging — and never reach the wipe. + let result = tokio::time::timeout( + Duration::from_secs(5), + manager.clear_shielded_inner(Duration::from_millis(100)), + ) + .await + .expect("clear must return within its bounded drain, never hang"); + assert!( + matches!( + result, + Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { .. }) + ), + "bounded drain timeout must surface as ShieldedShutdownIncomplete, got {result:?}" + ); + + // Release the held pass and join. + release_tx.send(()).expect("release the pass"); + pass_task.await.expect("pass task joined"); + } + + /// TC-015 (R5): `from_report` maps the registry's [`ShutdownReport`] + /// onto the FFI-stable `CoordinatorExitStatus` with identical field / + /// variant shape and `all_clean()` semantics. The full `WorkerStatus` + /// -> `CoordinatorThreadStatus` variant table is exercised. + #[test] + fn from_report_maps_to_ffi_stable_exit_status() { + use dash_async::WorkerStatus; + use std::collections::BTreeMap; + + // All Ok, no orphans. + let per = BTreeMap::from([ + (WalletWorker::PlatformAddressSync, WorkerStatus::Ok), + (WalletWorker::IdentitySync, WorkerStatus::Ok), + (WalletWorker::ShieldedSync, WorkerStatus::Ok), + (WalletWorker::EventAdapter, WorkerStatus::Ok), + ]); + let status = CoordinatorExitStatus::from_report(ShutdownReport { + per_worker: per, + detached: 0, + }); + assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok); + #[cfg(feature = "shielded")] + assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok)); + #[cfg(not(feature = "shielded"))] + assert_eq!(status.shielded_sync, None); + assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok); + assert_eq!(status.detached_threads, CoordinatorThreadStatus::Ok); + assert!(status.all_clean()); + + // A surviving orphan -> Detached -> non-clean; absent workers -> + // NotRunning. + let status = CoordinatorExitStatus::from_report(ShutdownReport { + per_worker: BTreeMap::new(), + detached: 1, + }); + assert_eq!(status.detached_threads, CoordinatorThreadStatus::Detached); + assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + assert!(!status.all_clean()); + + // A per-worker Timeout propagates and is non-clean. + let per = BTreeMap::from([(WalletWorker::IdentitySync, WorkerStatus::Timeout)]); + let status = CoordinatorExitStatus::from_report(ShutdownReport { + per_worker: per, + detached: 0, + }); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Timeout); + assert!(!status.all_clean()); + + // Full variant mapping table. + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Stopped(Some("x".into()))), + CoordinatorThreadStatus::Stopped(Some("x".into())) + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Panicked("p".into())), + CoordinatorThreadStatus::Panicked("p".into()) + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Error("e".into())), + CoordinatorThreadStatus::Error("e".into()) + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::NotRunning), + CoordinatorThreadStatus::NotRunning + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Detached), + CoordinatorThreadStatus::Detached + ); } } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index e1a229806c2..5cb15b048ea 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -9,19 +9,19 @@ //! wallets are registered and the SPV runtime is up. use std::collections::BTreeMap; -use std::sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex as StdMutex, -}; +use std::sync::Arc; + +use dash_async::ThreadRegistry; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use arc_swap::ArcSwapOption; use dash_sdk::platform::address_sync::{AddressSyncConfig, AddressSyncResult}; use key_wallet::PlatformP2PKHAddress; +use super::coordinator_lifecycle::CoordinatorLifecycle; +use super::WalletWorker; use crate::wallet::PlatformAddressTag; use tokio::sync::RwLock; -use tokio_util::sync::CancellationToken; use crate::error::PlatformWalletError; use crate::events::PlatformEventManager; @@ -95,19 +95,14 @@ impl PlatformAddressSyncSummary { pub struct PlatformAddressSyncManager { wallets: Arc>>>, event_manager: Arc, - /// Cancel token for the background loop, if running. - background_cancel: StdMutex>, - interval_secs: AtomicU64, - is_syncing: AtomicBool, - /// Set by [`quiesce`](Self::quiesce) to gate new passes while it - /// drains an in-flight one. `sync_now` bails (after taking the - /// `is_syncing` slot) when this is set, so once `quiesce` observes - /// `is_syncing == false` no further pass can start — giving shutdown - /// a real "no more host-visible sync-completed callbacks" barrier - /// that cancel-only [`stop`](Self::stop) does not provide. - quiescing: AtomicBool, - /// Unix seconds of the last completed pass. `0` = never. - last_sync_unix: AtomicU64, + /// Shared lifecycle state + pass-gating protocol under the + /// [`WalletWorker::PlatformAddressSync`] key: registry handle, polling + /// interval, the `is_syncing` / `quiescing` handshake, and the + /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the + /// `sync_now` pass gate delegate to it. The `quiescing` half gives + /// shutdown a real "no more host-visible sync-completed callbacks" + /// barrier that cancel-only [`stop`](Self::stop) does not provide. + lifecycle: CoordinatorLifecycle, /// Shared config applied uniformly across wallets and accounts. /// /// `ArcSwapOption` instead of a mutex because writes are rare @@ -120,15 +115,16 @@ impl PlatformAddressSyncManager { pub fn new( wallets: Arc>>>, event_manager: Arc, + registry: Arc>, ) -> Self { Self { wallets, event_manager, - background_cancel: StdMutex::new(None), - interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), - is_syncing: AtomicBool::new(false), - quiescing: AtomicBool::new(false), - last_sync_unix: AtomicU64::new(0), + lifecycle: CoordinatorLifecycle::new( + registry, + WalletWorker::PlatformAddressSync, + DEFAULT_SYNC_INTERVAL_SECS, + ), config: ArcSwapOption::empty(), } } @@ -137,13 +133,12 @@ impl PlatformAddressSyncManager { /// /// The running loop picks this up on its next sleep. pub fn set_interval(&self, interval: Duration) { - let secs = interval.as_secs().max(1); - self.interval_secs.store(secs, Ordering::Release); + self.lifecycle.set_interval(interval); } /// Current polling interval. pub fn interval(&self) -> Duration { - Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + self.lifecycle.interval() } /// Replace the shared [`AddressSyncConfig`] used on every pass. @@ -160,24 +155,18 @@ impl PlatformAddressSyncManager { /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.background_cancel - .lock() - .map(|g| g.is_some()) - .unwrap_or(false) + self.lifecycle.is_running() } /// Whether a sync pass is in flight right now. pub fn is_syncing(&self) -> bool { - self.is_syncing.load(Ordering::Acquire) + self.lifecycle.is_syncing() } /// Unix seconds of the last completed pass, or `None` if no pass /// has ever completed. pub fn last_sync_unix_seconds(&self) -> Option { - match self.last_sync_unix.load(Ordering::Acquire) { - 0 => None, - n => Some(n), - } + self.lifecycle.last_sync_unix_seconds() } /// Start the background sync loop. Idempotent — calling while @@ -195,26 +184,30 @@ impl PlatformAddressSyncManager { /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). pub fn start(self: Arc) { - let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned"); - if guard.is_some() { - return; - } - let cancel = CancellationToken::new(); - *guard = Some(cancel.clone()); - drop(guard); + // Reopen the quiescing gate so this (re)start's passes can run. + self.lifecycle.reopen_quiescing_gate(); + + let cfg = self.lifecycle.worker_config(); + // The loop drives `!Send` SDK futures via `Handle::block_on` on a + // dedicated OS thread (spawned by the registry). `biased` polls the + // cancel arm first so a pass stalled on a hung SDK fetch is dropped + // at its `.await` the instant the registry cancels. let handle = tokio::runtime::Handle::current(); - let this = self; - std::thread::Builder::new() - .name("platform-address-sync".into()) - .spawn(move || { + let this = Arc::clone(&self); + self.lifecycle + .registry() + .start_thread(self.lifecycle.worker(), cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { break; } - - this.sync_now().await; + tokio::select! { + biased; + _ = cancel.cancelled() => break, + _ = this.sync_now() => {} + } let interval = this.interval(); tokio::select! { @@ -222,13 +215,8 @@ impl PlatformAddressSyncManager { _ = cancel.cancelled() => break, } } - - if let Ok(mut guard) = this.background_cancel.lock() { - *guard = None; - } }); - }) - .expect("failed to spawn platform-address-sync thread"); + }); } /// Stop the background sync loop. No-op if not running. @@ -241,14 +229,7 @@ impl PlatformAddressSyncManager { /// the host can free the event-handler context — use /// [`quiesce`](Self::quiesce). pub fn stop(&self) { - if let Some(token) = self - .background_cancel - .lock() - .expect("bg_cancel poisoned") - .take() - { - token.cancel(); - } + self.lifecycle.stop(); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -270,13 +251,14 @@ impl PlatformAddressSyncManager { /// falling edge (with the gate up) is a sound "fully drained" signal. /// The gate is reopened before returning so a later start/sync works /// normally. - pub async fn quiesce(&self) { - self.quiescing.store(true, Ordering::Release); - self.stop(); - while self.is_syncing.load(Ordering::Acquire) { - tokio::time::sleep(Duration::from_millis(20)).await; - } - self.quiescing.store(false, Ordering::Release); + /// + /// Finally **joins** the loop's OS thread (after the drain, so the + /// thread is on its way out) and returns its terminal status. Joining + /// while the runtime is still alive is what lets the manager promise + /// the `!Send` loop has stopped touching `tokio::time` before a + /// one-shot host drops the runtime. + pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { + self.lifecycle.quiesce().await } /// Run one sync pass across every registered wallet. @@ -284,23 +266,13 @@ impl PlatformAddressSyncManager { /// If a pass is already in flight, returns an empty summary and /// skips — the caller can inspect [`is_syncing`] to distinguish. pub async fn sync_now(&self) -> PlatformAddressSyncSummary { - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { + // Claim the pass slot and honour the quiescing gate; bail with an + // empty summary (and without a host completion callback after + // quiesce returns) if a pass is already in flight or a teardown + // raised the gate. The guard clears `is_syncing` on every exit path. + let Some(_pass) = self.lifecycle.begin_pass() else { return PlatformAddressSyncSummary::default(); - } - - // A `quiesce()` may have raised the gate between our CAS and - // here; if so, release the slot and bail without running a pass - // so the drain can complete and shutdown gets a true barrier - // (no further `on_platform_address_sync_completed` host callback - // after quiesce returns). - if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); - return PlatformAddressSyncSummary::default(); - } + }; let snapshot: Vec<(WalletId, Arc)> = { let wallets = self.wallets.read().await; @@ -330,22 +302,20 @@ impl PlatformAddressSyncManager { .map(|d| d.as_secs()) .unwrap_or(0); summary.sync_unix_seconds = now; - self.last_sync_unix.store(now, Ordering::Release); - - // Dispatch the completion event BEFORE clearing `is_syncing`. - // `quiesce()` drains on the falling edge of `is_syncing`, so if - // we cleared the flag first a shutdown caller could unblock and - // free the host event-handler context while this completion - // event (FFI callback → host handler) is still pending — a - // use-after-free. Holding the flag across the dispatch makes - // quiesce's barrier cover the host callback too. Mirrors the - // ordering in `ShieldedSyncManager::sync_now`. + self.lifecycle.store_last_sync_unix(now); + + // Dispatch the completion event BEFORE the `_pass` guard drops. + // `quiesce()` drains on the falling edge of `is_syncing`; if the + // guard cleared the flag before the dispatch a shutdown caller + // could unblock and free the host event-handler context while + // the callback is still pending — a use-after-free. The guard + // drops (clearing `is_syncing`) after this call returns, when + // the function frame unwinds. self.event_manager .on_platform_address_sync_completed(&summary); - self.is_syncing.store(false, Ordering::Release); - summary + // `_pass` drops here → `is_syncing = false` } /// Sync a single wallet on demand. Does not set the global @@ -373,7 +343,7 @@ impl std::fmt::Debug for PlatformAddressSyncManager { f.debug_struct("PlatformAddressSyncManager") .field("is_running", &self.is_running()) .field("is_syncing", &self.is_syncing()) - .field("interval_secs", &self.interval_secs.load(Ordering::Acquire)) + .field("interval_secs", &self.lifecycle.interval_secs()) .field("last_sync_unix", &self.last_sync_unix_seconds()) .finish() } @@ -422,8 +392,13 @@ mod tests { let event_manager = Arc::new(PlatformEventManager::new(vec![ Arc::clone(&counter) as Arc ])); + let registry = ThreadRegistry::new(); ( - Arc::new(PlatformAddressSyncManager::new(wallets, event_manager)), + Arc::new(PlatformAddressSyncManager::new( + wallets, + event_manager, + registry, + )), counter, ) } @@ -438,53 +413,6 @@ mod tests { assert!(!mgr.is_syncing()); } - /// `quiesce()` must not return while a pass is in flight, and must - /// return promptly once the pass drains. - /// - /// Drives the real `is_syncing` lifecycle: a background task takes - /// the slot via the same `compare_exchange` the real `sync_now` - /// uses, holds it across a sleep (standing in for the pass body + - /// completion-event dispatch, which `sync_now` keeps the flag set - /// across), then clears it. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn quiesce_blocks_until_in_flight_pass_drains() { - let (mgr, _counter) = make_manager(); - - let holder = Arc::clone(&mgr); - let pass = tokio::spawn(async move { - assert!( - holder - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_ok(), - "test should own the is_syncing slot" - ); - tokio::time::sleep(Duration::from_millis(200)).await; - holder.is_syncing.store(false, Ordering::Release); - }); - - while !mgr.is_syncing() { - tokio::time::sleep(Duration::from_millis(5)).await; - } - - let quiesce_fut = mgr.quiesce(); - tokio::pin!(quiesce_fut); - - tokio::select! { - _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"), - _ = tokio::time::sleep(Duration::from_millis(50)) => {} - } - assert!(mgr.is_syncing(), "pass should still be in flight"); - - tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut) - .await - .expect("quiesce did not return after the pass drained"); - - assert!(!mgr.quiescing.load(Ordering::Acquire)); - assert!(!mgr.is_syncing()); - pass.await.unwrap(); - } - /// A `sync_now()` invoked while `quiescing` is set must bail without /// running the pass — in particular, without firing the /// `on_platform_address_sync_completed` host callback. This is the @@ -494,8 +422,8 @@ mod tests { async fn sync_now_bails_when_quiescing() { let (mgr, counter) = make_manager(); - // Raise the gate as `quiesce()` would. - mgr.quiescing.store(true, Ordering::Release); + // Raise the gate as `quiesce()` would, held across the pass. + let _gate = mgr.lifecycle.hold_quiescing_gate(); let summary = mgr.sync_now().await; diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 482674b4322..f949b48dd12 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -26,15 +26,15 @@ //! [`configure_shielded`]: crate::manager::PlatformWalletManager::configure_shielded use std::collections::BTreeMap; -use std::sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex as StdMutex, -}; +use std::sync::Arc; + +use dash_async::{AtomicFlagGuard, ThreadRegistry}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; -use tokio_util::sync::CancellationToken; +use super::coordinator_lifecycle::CoordinatorLifecycle; +use super::WalletWorker; use crate::events::PlatformEventManager; use crate::wallet::platform_wallet::WalletId; use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary}; @@ -139,43 +139,30 @@ pub struct ShieldedSyncManager { /// run first, so an empty slot guarantees no shielded state /// exists). coordinator_slot: Arc>>>, - /// Cancel token for the background loop, if running. - background_cancel: StdMutex>, - /// Monotonically increasing generation counter. Bumped on every - /// `start()` so the exiting thread can tell whether its - /// generation is still the active one before clearing - /// `background_cancel`. Without this, a `stop()` → `start()` - /// overlap lets the prior thread's cleanup strip the new - /// generation's token, leaving the new loop running but - /// untrackable via `is_running()`. - background_generation: AtomicU64, - interval_secs: AtomicU64, - is_syncing: AtomicBool, - /// Set by [`quiesce`](Self::quiesce) to gate new passes while it - /// drains an in-flight one. `sync_now` / `sync_wallet` bail (after - /// taking the `is_syncing` slot) when this is set, so once `quiesce` - /// observes `is_syncing == false` no further pass can start — giving - /// Clear / stop a real "no more host-visible mutations" barrier that - /// cancel-only [`stop`](Self::stop) does not provide. - quiescing: AtomicBool, - /// Unix seconds of the last completed pass. `0` = never. - last_sync_unix: AtomicU64, + /// Shared lifecycle state + pass-gating protocol under the + /// [`WalletWorker::ShieldedSync`] key: registry handle, polling + /// interval, the `is_syncing` / `quiescing` handshake, and the + /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the + /// `sync_now` / `sync_wallet` pass gate delegate to it. The `quiescing` + /// half gives Clear / stop a real "no more host-visible mutations" + /// barrier that cancel-only [`stop`](Self::stop) does not provide. + lifecycle: CoordinatorLifecycle, } impl ShieldedSyncManager { pub fn new( event_manager: Arc, coordinator_slot: Arc>>>, + registry: Arc>, ) -> Self { Self { event_manager, coordinator_slot, - background_cancel: StdMutex::new(None), - background_generation: AtomicU64::new(0), - interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), - is_syncing: AtomicBool::new(false), - quiescing: AtomicBool::new(false), - last_sync_unix: AtomicU64::new(0), + lifecycle: CoordinatorLifecycle::new( + registry, + WalletWorker::ShieldedSync, + DEFAULT_SYNC_INTERVAL_SECS, + ), } } @@ -183,35 +170,28 @@ impl ShieldedSyncManager { /// /// The running loop picks this up on its next sleep. pub fn set_interval(&self, interval: Duration) { - let secs = interval.as_secs().max(1); - self.interval_secs.store(secs, Ordering::Release); + self.lifecycle.set_interval(interval); } /// Current polling interval. pub fn interval(&self) -> Duration { - Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + self.lifecycle.interval() } /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.background_cancel - .lock() - .map(|g| g.is_some()) - .unwrap_or(false) + self.lifecycle.is_running() } /// Whether a sync pass is in flight right now. pub fn is_syncing(&self) -> bool { - self.is_syncing.load(Ordering::Acquire) + self.lifecycle.is_syncing() } /// Unix seconds of the last completed pass, or `None` if no pass /// has ever completed. pub fn last_sync_unix_seconds(&self) -> Option { - match self.last_sync_unix.load(Ordering::Acquire) { - 0 => None, - n => Some(n), - } + self.lifecycle.last_sync_unix_seconds() } /// Start the background sync loop. Idempotent — calling while @@ -222,37 +202,32 @@ impl ShieldedSyncManager { /// GRPC client state isn't `Send + Sync`). Same trade-off as /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start). pub fn start(self: Arc) { - let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned"); - if guard.is_some() { - return; - } - let cancel = CancellationToken::new(); - *guard = Some(cancel.clone()); - // Bump the generation while we still hold the slot lock so - // the load below in any prior thread's cleanup observes - // `current_gen != my_gen` ordered against this token swap. - let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; - drop(guard); + // Reopen the quiescing gate so this (re)start's passes can run. + self.lifecycle.reopen_quiescing_gate(); + let cfg = self.lifecycle.worker_config(); + + // The loop drives `!Send` SDK futures via `Handle::block_on` on a + // dedicated OS thread (spawned by the registry). The background + // cadence passes `force=false` to honor the per-wallet caught-up + // cooldown; user-initiated syncs pass `force=true` via the FFI. + // `biased` polls the cancel arm first so a pass stalled on a hung + // SDK fetch is dropped the instant the registry cancels. let handle = tokio::runtime::Handle::current(); - let this = self; - std::thread::Builder::new() - .name("shielded-sync".into()) - .spawn(move || { + let this = Arc::clone(&self); + self.lifecycle + .registry() + .start_thread(self.lifecycle.worker(), cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { break; } - - // Background-loop cadence — honor the - // per-wallet caught-up cooldown so a - // sleepy network doesn't refetch + - // re-trial-decrypt the partial buffer - // chunk every interval. User-initiated - // syncs pass `force=true` to the FFI - // entry point below and bypass this. - this.sync_now(false).await; + tokio::select! { + biased; + _ = cancel.cancelled() => break, + _ = this.sync_now(false) => {} + } let interval = this.interval(); tokio::select! { @@ -260,21 +235,8 @@ impl ShieldedSyncManager { _ = cancel.cancelled() => break, } } - - // Only clear `background_cancel` if the active - // generation is still ours. Without this guard a - // tight `stop()` → `start()` reschedule has the - // exiting thread overwrite the *new* generation's - // token, leaving the new loop running but - // unreflectable via `is_running()` / `stop()`. - if this.background_generation.load(Ordering::Acquire) == my_gen { - if let Ok(mut guard) = this.background_cancel.lock() { - *guard = None; - } - } }); - }) - .expect("failed to spawn shielded-sync thread"); + }); } /// Stop the background sync loop. No-op if not running. @@ -286,14 +248,7 @@ impl ShieldedSyncManager { /// nothing more will be persisted" barrier — required by Clear, /// unregister, and rebind — use [`quiesce`](Self::quiesce). pub fn stop(&self) { - if let Some(token) = self - .background_cancel - .lock() - .expect("bg_cancel poisoned") - .take() - { - token.cancel(); - } + self.lifecycle.stop(); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -313,13 +268,42 @@ impl ShieldedSyncManager { /// including the persister fan-out, so its falling edge (with the /// gate up) is a sound "fully drained" signal. The gate is reopened /// before returning so a later start/sync works normally. - pub async fn quiesce(&self) { - self.quiescing.store(true, Ordering::Release); - self.stop(); - while self.is_syncing.load(Ordering::Acquire) { - tokio::time::sleep(Duration::from_millis(20)).await; - } - self.quiescing.store(false, Ordering::Release); + /// + /// Finally **joins** the loop's OS thread (after the drain, so the + /// thread is on its way out) and returns its terminal status. Joining + /// while the runtime is still alive is what lets the manager promise + /// the `!Send` loop has stopped touching `tokio::time` before a + /// one-shot host drops the runtime. + pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { + self.lifecycle.quiesce().await + } + + /// Drain + join **without touching the `quiescing` gate**, for a caller + /// (the Clear flow) that already holds it raised via + /// [`hold_quiescing_gate`](Self::hold_quiescing_gate) and keeps holding + /// it across the whole teardown. See + /// [`CoordinatorLifecycle::quiesce_under_held_gate`]. + pub(crate) async fn quiesce_under_held_gate(&self) -> super::CoordinatorThreadStatus { + self.lifecycle.quiesce_under_held_gate().await + } + + /// Test seam: enter a sync pass directly (claim `is_syncing` via the + /// pass gate) so a teardown test can stand in for a direct + /// `sync_now`/`sync_wallet` already in flight, without driving the real + /// (coordinator-backed) sync path. The returned guard clears the flag + /// on drop. + #[cfg(test)] + pub(crate) fn begin_pass_for_test(&self) -> Option> { + self.lifecycle.begin_pass() + } + + /// Raise the `quiescing` gate and hold it raised until the returned + /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate as + /// soon as it returns, this lets a multi-step teardown (Clear) keep new + /// direct `sync_now` / `sync_wallet` passes off across a check-then-wipe + /// so the "no new pass" guarantee does not lapse between the two steps. + pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> { + self.lifecycle.hold_quiescing_gate() } /// Run one sync pass across every registered wallet. @@ -334,21 +318,13 @@ impl ShieldedSyncManager { /// If a pass is already in flight, returns an empty summary and /// skips — the caller can inspect [`is_syncing`] to distinguish. pub async fn sync_now(&self, force: bool) -> ShieldedSyncPassSummary { - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { + // Claim the pass slot and honour the quiescing gate; bail with an + // empty summary if a pass is already in flight or a teardown + // (Clear/stop) raised the gate. The guard clears `is_syncing` on + // every exit path. + let Some(_pass) = self.lifecycle.begin_pass() else { return ShieldedSyncPassSummary::default(); - } - - // A `quiesce()` may have raised the gate between our CAS and - // here; if so, release the slot and bail without running a pass - // so the drain can complete and Clear/stop get a true barrier. - if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); - return ShieldedSyncPassSummary::default(); - } + }; // Snapshot the coordinator Arc and release the slot lock // before awaiting so a concurrent `configure_shielded` @@ -380,21 +356,18 @@ impl ShieldedSyncManager { if summary.sync_unix_seconds == 0 { summary.sync_unix_seconds = now; } - self.last_sync_unix - .store(summary.sync_unix_seconds, Ordering::Release); - - // Dispatch the completion event BEFORE clearing `is_syncing`. - // `quiesce()` drains on the falling edge of `is_syncing`, so if - // we cleared the flag first a stop/clear caller could unblock - // while this completion event (FFI callback → Swift - // `handleShieldedSyncCompleted`) is still pending — surfacing a - // stale post-stop/post-clear event. Holding the flag across the - // dispatch makes quiesce's barrier cover the event too. + self.lifecycle + .store_last_sync_unix(summary.sync_unix_seconds); + + // Dispatch the completion event BEFORE the `_pass` guard drops. + // `quiesce()` drains on the falling edge of `is_syncing`; if + // the guard cleared the flag before the dispatch a stop/clear + // caller could unblock while the callback is still pending — + // surfacing a stale post-stop/post-clear event. self.event_manager.on_shielded_sync_completed(&summary); - self.is_syncing.store(false, Ordering::Release); - summary + // `_pass` drops here → `is_syncing = false` } /// Sync a single wallet on demand. @@ -425,27 +398,16 @@ impl ShieldedSyncManager { }; // Reuse the manager-wide `is_syncing` flag so a per-wallet - // `sync_wallet()` can't race the periodic `sync_now()` - // against the same store — both go through - // `coordinator.sync()`, which serializes per-coordinator - // but the manager flag is what the host UI watches. - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { - return Ok(None); - } - - // Bail if a `quiesce()` raised the gate after our CAS (see - // `sync_now`) so the drain barrier holds. - if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); + // `sync_wallet()` can't race the periodic `sync_now()` against the + // same store — both go through `coordinator.sync()`, which + // serializes per-coordinator, but the manager flag is what the host + // UI watches. Bail (Ok(None)) if a pass is already in flight or a + // teardown raised the quiescing gate. + let Some(_pass) = self.lifecycle.begin_pass() else { return Ok(None); - } + }; let pass = coordinator.sync(force).await; - self.is_syncing.store(false, Ordering::Release); // Extract this wallet's slice from the network-wide pass // summary. If the wallet is registered, we'll get back an @@ -470,7 +432,7 @@ impl std::fmt::Debug for ShieldedSyncManager { f.debug_struct("ShieldedSyncManager") .field("is_running", &self.is_running()) .field("is_syncing", &self.is_syncing()) - .field("interval_secs", &self.interval_secs.load(Ordering::Acquire)) + .field("interval_secs", &self.lifecycle.interval_secs()) .field("last_sync_unix", &self.last_sync_unix_seconds()) .finish() } diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift index 0e433d368ef..34137d15e58 100644 --- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift +++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift @@ -152,10 +152,48 @@ public class PlatformWalletManager: ObservableObject { deinit { progressPollTask?.cancel() - if handle != NULL_HANDLE { - platform_wallet_manager_platform_address_sync_stop(handle).discard() - platform_wallet_manager_shielded_sync_stop(handle).discard() - platform_wallet_manager_destroy(handle).discard() + guard handle != NULL_HANDLE else { return } + + // Tear down the Rust manager: cancel the address-sync loop, drain + // the shielded loop, then destroy. The first stop is cancel-only + // and never reports an incomplete drain, so we still `discard()` it. + platform_wallet_manager_platform_address_sync_stop(handle).discard() + + // Capture the CODE (not just free the message) for the two calls + // that CAN report `.errorShutdownIncomplete`: `shielded_sync_stop` + // and `destroy`. Rust returns that code when a background + // coordinator did not drain within the join deadline, OR — for + // `shielded_sync_stop` — when the drain was clean but a prior- + // generation shielded thread is still parked alive as an orphan + // (a tight `stop()`→`start()` reap that had to detach it past the + // wedge backstop). In either case a lingering `!Send` coordinator + // thread may still hold the `passUnretained` context pointers Rust + // was handed for our `persistenceHandler` / `eventHandler` and fire + // ONE final callback through them. The contract: on that code the + // host must NOT free the callback context immediately. + let shieldedStopCode = + platform_wallet_manager_shielded_sync_stop(handle).discardReturningCode() + let destroyCode = + platform_wallet_manager_destroy(handle).discardReturningCode() + + // Both handlers are passed to Rust via `Unmanaged.passUnretained` + // (see `PlatformWalletPersistenceHandler`/`PlatformWalletEventHandler` + // `makeCallbacks()`), so Rust holds non-owning pointers and these + // objects are kept alive ONLY by the stored properties below. The + // instant this deinit returns, ARC releases them — which would be a + // use-after-free if a lingering coordinator then fires its final + // callback. So, ONLY on an incomplete shutdown, deliberately leak one + // extra strong reference to each (an unbalanced `passRetained` that is + // never released) so they outlive any lingering thread. A clean + // shutdown (the common case) takes neither branch and releases the + // handlers normally — we never leak unconditionally. The leak is + // bounded by how often a shutdown wedges (rare) and trades two small + // objects for guaranteed callback safety, since an incomplete drain + // gives no later signal that the lingering thread has finally exited. + if shieldedStopCode == .errorShutdownIncomplete + || destroyCode == .errorShutdownIncomplete { + if let persistenceHandler { _ = Unmanaged.passRetained(persistenceHandler) } + if let eventHandler { _ = Unmanaged.passRetained(eventHandler) } } } diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift index 2c311f91e9d..c24f72fbf8c 100644 --- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift +++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift @@ -39,6 +39,12 @@ public enum PlatformWalletResultCode: Int32, Sendable { /// outcome. Do NOT auto-retry — a retry would rebuild the bundle and /// could double-execute if the original landed. case errorShieldedSpendUnconfirmed = 18 + /// A destroy/stop/clear completed but a background coordinator did not + /// exit cleanly (timed out or ended non-cleanly). The host should defer + /// freeing its callback context — a lingering coordinator may still fire + /// one final callback through it — and, on the clear path, must NOT + /// commit its own persistence wipe (the Rust store was left intact). + case errorShutdownIncomplete = 19 case notFound = 98 case errorUnknown = 99 @@ -82,6 +88,8 @@ public enum PlatformWalletResultCode: Int32, Sendable { self = .errorShieldedBroadcastUnconfirmed case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHIELDED_SPEND_UNCONFIRMED: self = .errorShieldedSpendUnconfirmed + case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHUTDOWN_INCOMPLETE: + self = .errorShutdownIncomplete case PLATFORM_WALLET_FFI_RESULT_CODE_NOT_FOUND: self = .notFound case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_UNKNOWN: @@ -177,6 +185,12 @@ public enum PlatformWalletError: LocalizedError { /// notes reserved wallet-side (a shield reserves nothing) until the /// next sync reconciles the outcome. Do NOT auto-retry. case shieldedSpendUnconfirmed(String) + /// A destroy / stop / clear completed but a background coordinator did + /// not exit cleanly. The host should defer freeing its callback context + /// (a lingering coordinator may still fire one final callback) and, on + /// the clear path, must NOT commit its own persistence wipe — the Rust + /// store was left intact so it can be retried once the pass settles. + case shutdownIncomplete(String) case notFound(String) case unknown(String) @@ -192,6 +206,7 @@ public enum PlatformWalletError: LocalizedError { .arithmeticOverflow(let m), .noSelectableInputs(let m), .walletAlreadyExists(let m), .shieldedBroadcastFailed(let m), .shieldedBroadcastUnconfirmed(let m), .shieldedSpendUnconfirmed(let m), + .shutdownIncomplete(let m), .notFound(let m), .unknown(let m): return m } @@ -222,6 +237,7 @@ public enum PlatformWalletError: LocalizedError { case .errorShieldedBroadcastFailed: self = .shieldedBroadcastFailed(detail) case .errorShieldedBroadcastUnconfirmed: self = .shieldedBroadcastUnconfirmed(detail) case .errorShieldedSpendUnconfirmed: self = .shieldedSpendUnconfirmed(detail) + case .errorShutdownIncomplete: self = .shutdownIncomplete(detail) case .notFound: self = .notFound(detail) case .errorUnknown: self = .unknown(detail) } @@ -240,4 +256,16 @@ extension PlatformWalletFFIResult { func discard() { _ = PlatformWalletResult(self) } + + /// Free the result's Rust-owned message and return its typed code. + /// + /// Like `discard()`, but hands back the code so the caller can branch + /// on it — used by `PlatformWalletManager.deinit`, which must detect + /// `.errorShutdownIncomplete` to decide whether to keep its callback + /// context alive. The message is still freed deterministically (the + /// temporary `PlatformWalletResult` frees it on drop). + @inline(__always) + func discardReturningCode() -> PlatformWalletResultCode { + PlatformWalletResult(self).code + } }