From f3354f6617a9407cf4c630ade2a30a87db4f9680 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Mon, 22 Jun 2026 21:46:48 +0200 Subject: [PATCH 01/29] feat(platform-wallet)!: shutdown() joins coordinator threads and returns CoordinatorExitStatus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The three periodic sync coordinators (platform-address, identity, shielded) run their `!Send` loops on detached OS threads via `Handle::block_on`. `shutdown()`/`quiesce()` previously only drained the in-flight pass (the `is_syncing` barrier) and never joined the threads, so a consumer that drops the tokio runtime right after `shutdown()` (one-shot / headless / stdio) could race a coordinator still polling `tokio::time` on a shutting-down runtime and panic with "A Tokio 1.x context was found, but it is being shutdown". Each coordinator now stores its OS-thread `JoinHandle`; `quiesce()` joins it (via `spawn_blocking`, after the existing drain) and returns a `CoordinatorThreadStatus` (NotRunning / Ok / Panicked / Error). Joining while the runtime is still alive guarantees the loop has stopped touching `tokio::time` before the host drops the runtime. `shutdown()` aggregates the three into `CoordinatorExitStatus`, so a panicked loop surfaces in the status instead of being silently dropped. JoinHandle-join chosen over a oneshot/Notify signal: `JoinHandle::join` natively distinguishes a clean return from a panic and waits for the actual OS thread to terminate (not just a signal fired mid-teardown), yielding the per-thread status for free. The generation-guard reschedule and quiesce-drain behavior are preserved. BREAKING CHANGE: `PlatformWalletManager::shutdown()` now returns `CoordinatorExitStatus` instead of `()`. FFI: the internal `shutdown()` call logs the new status; the `extern "C"` `platform_wallet_manager_destroy` signature and C ABI are unchanged. 
🤖 Co-authored by [Claudius the Magnificent](https://github.com/lklimek/claudius) AI Agent --- .../rs-platform-wallet-ffi/src/manager.rs | 13 +- .../src/manager/identity_sync.rs | 26 +- .../rs-platform-wallet/src/manager/mod.rs | 277 +++++++++++++++++- .../src/manager/platform_address_sync.rs | 26 +- .../src/manager/shielded_sync.rs | 26 +- 5 files changed, 354 insertions(+), 14 deletions(-) diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs index 5930c1c4db..d09d98a1e8 100644 --- a/packages/rs-platform-wallet-ffi/src/manager.rs +++ b/packages/rs-platform-wallet-ffi/src/manager.rs @@ -360,7 +360,18 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy( // left alive to fire a callback against freed memory. // `shutdown()` is idempotent, so this is safe even if the host // already stopped some sync managers before calling destroy. - runtime().block_on(manager.shutdown()); + // It now joins the coordinator OS threads and returns their + // per-thread exit status; the C ABI exposes none of that, so we + // just log it (a panicked loop is worth surfacing) and drop it. + let status = runtime().block_on(manager.shutdown()); + if !status.all_clean() { + tracing::warn!( + ?status, + "platform wallet coordinator(s) did not exit cleanly" + ); + } else { + tracing::debug!(?status, "platform wallet coordinators joined cleanly"); + } } PlatformWalletFFIResult::ok() } diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 8730398f97..ae5ae879f7 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -160,6 +160,11 @@ where persister: Arc

, /// Cancel token for the background loop, if running. background_cancel: StdMutex>, + /// Join handle for the background loop's OS thread, if running. + /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can + /// confirm the `!Send` loop fully exited before the host drops the + /// runtime. + background_join: StdMutex>>, /// Monotonically increasing generation counter. Incremented each /// time `start()` installs a new cancel token so the exiting /// thread can tell whether its token is still current. @@ -204,6 +209,7 @@ where sdk, persister, background_cancel: StdMutex::new(None), + background_join: StdMutex::new(None), background_generation: AtomicU64::new(0), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), @@ -405,8 +411,8 @@ where drop(guard); let handle = tokio::runtime::Handle::current(); - let this = self; - std::thread::Builder::new() + let this = Arc::clone(&self); + let join = std::thread::Builder::new() .name("identity-sync".into()) .spawn(move || { handle.block_on(async move { @@ -434,6 +440,8 @@ where }); }) .expect("failed to spawn identity-sync thread"); + // Store the handle so `quiesce` can join the OS thread. + *self.background_join.lock().expect("bg_join poisoned") = Some(join); } /// Stop the background sync loop. No-op if not running. @@ -473,13 +481,25 @@ where /// so its falling edge (with the gate up) is a sound "fully drained" /// signal. The gate is reopened before returning so a later /// start/sync works normally. - pub async fn quiesce(&self) { + /// + /// Finally **joins** the loop's OS thread (after the drain, so the + /// thread is on its way out) and returns its terminal status. Joining + /// while the runtime is still alive is what lets the manager promise + /// the `!Send` loop has stopped touching `tokio::time` before a + /// one-shot host drops the runtime. 
+ pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { self.quiescing.store(true, Ordering::Release); self.stop(); while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(20)).await; } self.quiescing.store(false, Ordering::Release); + let handle = self + .background_join + .lock() + .expect("bg_join poisoned") + .take(); + super::join_coordinator_thread(handle).await } /// Run one sync pass across every registered identity. diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 3d04ca086d..3529356170 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -89,6 +89,99 @@ pub struct PlatformWalletManager { pub(super) event_adapter_join: tokio::sync::Mutex>>, } +/// Terminal status of one background coordinator's OS thread. +/// +/// The three periodic coordinators run their loops on dedicated OS +/// threads (the SDK futures are `!Send`, so they ride +/// [`Handle::block_on`](tokio::runtime::Handle::block_on) rather than +/// `tokio::spawn`). [`PlatformWalletManager::shutdown`] joins each +/// thread and reports how it ended so a host can tell a clean wind-down +/// from a panicked loop instead of silently dropping the thread. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CoordinatorThreadStatus { + /// No thread was running to join — the loop was never started, or + /// was already stopped and joined. + NotRunning, + /// The loop exited and its OS thread joined cleanly. + Ok, + /// The OS thread panicked; carries the best-effort panic message. + Panicked(String), + /// The join itself could not complete (the blocking join task + /// failed). Distinct from the thread panicking. + Error(String), +} + +impl CoordinatorThreadStatus { + /// `true` for a non-failure outcome (joined cleanly or never ran). 
+ pub fn is_clean(&self) -> bool { + matches!(self, Self::Ok | Self::NotRunning) + } +} + +/// Per-thread terminal status of every background coordinator, returned +/// by [`PlatformWalletManager::shutdown`]. +/// +/// A host that drops its tokio runtime right after `shutdown()` +/// (one-shot / headless / stdio) reads this to confirm each `!Send` +/// coordinator loop fully wound down on its OS thread *before* the +/// runtime goes away — closing the race where a still-polling loop hits +/// `tokio::time` on a shutting-down runtime and panics with +/// `A Tokio 1.x context was found, but it is being shutdown`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CoordinatorExitStatus { + /// Platform-address (BLAST) balance sync loop. + pub platform_address: CoordinatorThreadStatus, + /// Per-identity token-state sync loop. + pub identity: CoordinatorThreadStatus, + /// Shielded (Orchard) note sync loop. Always + /// [`CoordinatorThreadStatus::NotRunning`] in builds without the + /// `shielded` feature. + pub shielded: CoordinatorThreadStatus, +} + +impl CoordinatorExitStatus { + /// `true` when every coordinator wound down without a panic or join + /// failure (each is [`Ok`](CoordinatorThreadStatus::Ok) or + /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)). + pub fn all_clean(&self) -> bool { + self.platform_address.is_clean() && self.identity.is_clean() && self.shielded.is_clean() + } +} + +/// Join a coordinator's background OS thread and classify how it ended. +/// +/// Awaited by [`quiesce`](IdentitySyncManager::quiesce) *after* the loop +/// is cancelled and its in-flight pass drained, so the thread is already +/// on its way out. The blocking [`JoinHandle::join`](std::thread::JoinHandle::join) +/// runs on the blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking)) +/// to avoid parking a runtime worker. 
Joining here — while the runtime +/// is still alive — is what guarantees the `!Send` loop has stopped +/// touching `tokio::time` before the host drops the runtime. +pub(crate) async fn join_coordinator_thread( + handle: Option>, +) -> CoordinatorThreadStatus { + let Some(handle) = handle else { + return CoordinatorThreadStatus::NotRunning; + }; + match tokio::task::spawn_blocking(move || handle.join()).await { + Ok(Ok(())) => CoordinatorThreadStatus::Ok, + Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)), + Err(join_err) => CoordinatorThreadStatus::Error(join_err.to_string()), + } +} + +/// Best-effort extraction of a panic message from a joined thread's +/// payload (`&str` and `String` are the common cases). +fn panic_message(payload: Box) -> String { + if let Some(s) = payload.downcast_ref::<&str>() { + (*s).to_string() + } else if let Some(s) = payload.downcast_ref::() { + s.clone() + } else { + "unknown panic payload".to_string() + } +} + impl PlatformWalletManager

{ /// Create a new PlatformWalletManager. /// @@ -308,11 +401,20 @@ impl PlatformWalletManager

{ /// FIRST (so no further persister store or host callback can start), /// and only THEN cancel + join the event adapter, which is the sink /// those stores feed into. - pub async fn shutdown(&self) { - self.platform_address_sync_manager.quiesce().await; - self.identity_sync_manager.quiesce().await; + /// + /// Each `quiesce()` now also **joins** its coordinator's OS thread, + /// so when this returns every `!Send` loop has fully exited. A host + /// that drops the tokio runtime right after `shutdown()` (one-shot / + /// headless / stdio) is therefore safe — no coordinator can still be + /// polling `tokio::time` on a shutting-down runtime. The returned + /// [`CoordinatorExitStatus`] reports per-thread how each loop ended. + pub async fn shutdown(&self) -> CoordinatorExitStatus { + let platform_address = self.platform_address_sync_manager.quiesce().await; + let identity = self.identity_sync_manager.quiesce().await; #[cfg(feature = "shielded")] - self.shielded_sync_manager.quiesce().await; + let shielded = self.shielded_sync_manager.quiesce().await; + #[cfg(not(feature = "shielded"))] + let shielded = CoordinatorThreadStatus::NotRunning; self.event_adapter_cancel.cancel(); if let Some(handle) = self.event_adapter_join.lock().await.take() { @@ -320,5 +422,172 @@ impl PlatformWalletManager

{ tracing::warn!(error = ?e, "Wallet event adapter task join error"); } } + + CoordinatorExitStatus { + platform_address, + identity, + shielded, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::time::Duration; + + use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet}; + + /// No-op persister — the lifecycle tests below never exercise the + /// real persistence pipeline, they just need a handle that satisfies + /// the manager's `P` bound. + struct NoopPersister; + + impl PlatformWalletPersistence for NoopPersister { + fn store( + &self, + _wallet_id: WalletId, + _changeset: PlatformWalletChangeSet, + ) -> Result<(), PersistenceError> { + Ok(()) + } + + fn flush(&self, _wallet_id: WalletId) -> Result<(), PersistenceError> { + Ok(()) + } + + fn load(&self) -> Result { + Ok(ClientStartState::default()) + } + } + + /// No-op event handler standing in for the host's FFI handler. + struct NoopHandler; + impl dash_spv::EventHandler for NoopHandler {} + impl PlatformEventHandler for NoopHandler {} + + /// Build a manager over a mock SDK + no-op persister/handler. Cheap: + /// `new` wires the sub-managers and spawns the event adapter but + /// starts no coordinator threads. + fn make_manager() -> PlatformWalletManager { + let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); + let persister = Arc::new(NoopPersister); + let handler: Arc = Arc::new(NoopHandler); + PlatformWalletManager::new(sdk, persister, handler) + } + + /// Start every periodic coordinator's background OS-thread loop. + fn start_coordinators(m: &PlatformWalletManager

) { + Arc::clone(&m.platform_address_sync_manager).start(); + Arc::clone(&m.identity_sync_manager).start(); + #[cfg(feature = "shielded")] + Arc::clone(&m.shielded_sync_manager).start(); + } + + /// (a) `shutdown()` joins all coordinator OS threads and reports an + /// all-clean status; a second call has nothing left to join. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_joins_all_coordinators_and_reports_ok() { + let manager = make_manager(); + start_coordinators(&manager); + // Let the loops enter `block_on` so we exercise the live-loop + // join path (a thread cancelled before its first poll joins too). + tokio::time::sleep(Duration::from_millis(50)).await; + + let status = manager.shutdown().await; + assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity, CoordinatorThreadStatus::Ok); + #[cfg(feature = "shielded")] + assert_eq!(status.shielded, CoordinatorThreadStatus::Ok); + #[cfg(not(feature = "shielded"))] + assert_eq!(status.shielded, CoordinatorThreadStatus::NotRunning); + assert!(status.all_clean()); + + // Handles consumed by the join → nothing left to join. + let again = manager.shutdown().await; + assert_eq!(again.platform_address, CoordinatorThreadStatus::NotRunning); + assert_eq!(again.identity, CoordinatorThreadStatus::NotRunning); + } + + /// (b) A coordinator thread that panics surfaces in the status rather + /// than being silently dropped. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn join_coordinator_thread_surfaces_panic() { + let handle = std::thread::spawn(|| panic!("boom in coordinator")); + match join_coordinator_thread(Some(handle)).await { + CoordinatorThreadStatus::Panicked(msg) => { + assert!(msg.contains("boom in coordinator"), "msg was {msg:?}"); + } + other => panic!("expected Panicked, got {other:?}"), + } + } + + /// A cleanly-returning thread joins as `Ok`; an absent handle is + /// `NotRunning`. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn join_coordinator_thread_clean_and_absent() { + let handle = std::thread::spawn(|| {}); + assert_eq!( + join_coordinator_thread(Some(handle)).await, + CoordinatorThreadStatus::Ok + ); + assert_eq!( + join_coordinator_thread(None).await, + CoordinatorThreadStatus::NotRunning + ); + } + + /// (c) Race regression: model the one-shot / headless path — start + /// the coordinators, `shutdown()`, then **drop the runtime**. Because + /// `shutdown()` joined every loop while the runtime was still alive + /// (asserted via the all-`Ok` status), nothing is left polling + /// `tokio::time`, so the drop raises no "Tokio … being shutdown" + /// panic. A scoped hook counts only that specific panic so a + /// concurrent unrelated panic can't trip the assertion. + #[test] + fn shutdown_then_drop_runtime_does_not_panic() { + use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; + + static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0); + let prev_hook = std::panic::take_hook(); + std::panic::set_hook(Box::new(|info| { + if info.to_string().contains("being shutdown") { + SHUTDOWN_PANICS.fetch_add(1, AtomicOrdering::SeqCst); + } + })); + + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("build runtime"); + + let status = runtime.block_on(async { + let manager = make_manager(); + start_coordinators(&manager); + tokio::time::sleep(Duration::from_millis(50)).await; + manager.shutdown().await + }); + + // The headless drop: with every coordinator already joined, this + // cannot race a loop still touching the timer. + drop(runtime); + std::thread::sleep(Duration::from_millis(100)); + let racing_panics = SHUTDOWN_PANICS.load(AtomicOrdering::SeqCst); + + // Restore the hook before asserting so a failure prints normally. 
+ std::panic::set_hook(prev_hook); + + assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity, CoordinatorThreadStatus::Ok); + assert!( + status.all_clean(), + "coordinators did not wind down: {status:?}" + ); + assert_eq!( + racing_panics, 0, + "dropping the runtime after shutdown raced a coordinator thread" + ); } } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index e1a229806c..baa6111e02 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -97,6 +97,11 @@ pub struct PlatformAddressSyncManager { event_manager: Arc, /// Cancel token for the background loop, if running. background_cancel: StdMutex>, + /// Join handle for the background loop's OS thread, if running. + /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can + /// confirm the `!Send` loop fully exited before the host drops the + /// runtime. + background_join: StdMutex>>, interval_secs: AtomicU64, is_syncing: AtomicBool, /// Set by [`quiesce`](Self::quiesce) to gate new passes while it @@ -125,6 +130,7 @@ impl PlatformAddressSyncManager { wallets, event_manager, background_cancel: StdMutex::new(None), + background_join: StdMutex::new(None), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), quiescing: AtomicBool::new(false), @@ -204,8 +210,8 @@ impl PlatformAddressSyncManager { drop(guard); let handle = tokio::runtime::Handle::current(); - let this = self; - std::thread::Builder::new() + let this = Arc::clone(&self); + let join = std::thread::Builder::new() .name("platform-address-sync".into()) .spawn(move || { handle.block_on(async move { @@ -229,6 +235,8 @@ impl PlatformAddressSyncManager { }); }) .expect("failed to spawn platform-address-sync thread"); + // Store the handle so `quiesce` can join the OS thread. 
+ *self.background_join.lock().expect("bg_join poisoned") = Some(join); } /// Stop the background sync loop. No-op if not running. @@ -270,13 +278,25 @@ impl PlatformAddressSyncManager { /// falling edge (with the gate up) is a sound "fully drained" signal. /// The gate is reopened before returning so a later start/sync works /// normally. - pub async fn quiesce(&self) { + /// + /// Finally **joins** the loop's OS thread (after the drain, so the + /// thread is on its way out) and returns its terminal status. Joining + /// while the runtime is still alive is what lets the manager promise + /// the `!Send` loop has stopped touching `tokio::time` before a + /// one-shot host drops the runtime. + pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { self.quiescing.store(true, Ordering::Release); self.stop(); while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(20)).await; } self.quiescing.store(false, Ordering::Release); + let handle = self + .background_join + .lock() + .expect("bg_join poisoned") + .take(); + super::join_coordinator_thread(handle).await } /// Run one sync pass across every registered wallet. diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 482674b432..d634c65398 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -141,6 +141,11 @@ pub struct ShieldedSyncManager { coordinator_slot: Arc>>>, /// Cancel token for the background loop, if running. background_cancel: StdMutex>, + /// Join handle for the background loop's OS thread, if running. + /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can + /// confirm the `!Send` loop fully exited before the host drops the + /// runtime. + background_join: StdMutex>>, /// Monotonically increasing generation counter. 
Bumped on every /// `start()` so the exiting thread can tell whether its /// generation is still the active one before clearing @@ -171,6 +176,7 @@ impl ShieldedSyncManager { event_manager, coordinator_slot, background_cancel: StdMutex::new(None), + background_join: StdMutex::new(None), background_generation: AtomicU64::new(0), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), @@ -235,8 +241,8 @@ impl ShieldedSyncManager { drop(guard); let handle = tokio::runtime::Handle::current(); - let this = self; - std::thread::Builder::new() + let this = Arc::clone(&self); + let join = std::thread::Builder::new() .name("shielded-sync".into()) .spawn(move || { handle.block_on(async move { @@ -275,6 +281,8 @@ impl ShieldedSyncManager { }); }) .expect("failed to spawn shielded-sync thread"); + // Store the handle so `quiesce` can join the OS thread. + *self.background_join.lock().expect("bg_join poisoned") = Some(join); } /// Stop the background sync loop. No-op if not running. @@ -313,13 +321,25 @@ impl ShieldedSyncManager { /// including the persister fan-out, so its falling edge (with the /// gate up) is a sound "fully drained" signal. The gate is reopened /// before returning so a later start/sync works normally. - pub async fn quiesce(&self) { + /// + /// Finally **joins** the loop's OS thread (after the drain, so the + /// thread is on its way out) and returns its terminal status. Joining + /// while the runtime is still alive is what lets the manager promise + /// the `!Send` loop has stopped touching `tokio::time` before a + /// one-shot host drops the runtime. 
+ pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { self.quiescing.store(true, Ordering::Release); self.stop(); while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(20)).await; } self.quiescing.store(false, Ordering::Release); + let handle = self + .background_join + .lock() + .expect("bg_join poisoned") + .take(); + super::join_coordinator_thread(handle).await } /// Run one sync pass across every registered wallet. From 261178e8ae1897fdebb4f0e8fcba61826ad3336b Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Mon, 22 Jun 2026 22:41:31 +0200 Subject: [PATCH 02/29] fix(platform-wallet): RAII-guard is_syncing so a coordinator panic cannot wedge shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEC-001: Add `IsSyncingGuard` RAII struct to all three coordinator `sync_now` (and shielded `sync_wallet`) implementations. The guard clears `is_syncing=false` on every exit path — normal return, early return, and panic-unwind — so `quiesce()`'s drain loop can never spin forever on a panicked pass, and the `Panicked` thread-exit status becomes reachable. SEC-002: Wrap each coordinator's `quiesce()` call in `shutdown()` with `tokio::time::timeout(30 s)`. On timeout the slot reports `CoordinatorThreadStatus::Error("join timed out")` rather than hanging forever. SEC-003: Add `debug_assert!` in `shutdown()` that the current runtime is `MultiThread`; document the precondition in the method doc. F-5: In all three coordinators' `start()`, store the `JoinHandle` in `background_join` while still holding the `background_cancel` lock — eliminates the theoretical window where a concurrent `quiesce()` could take a `None` handle because spawn completed before the store. 
Rename `CoordinatorThreadExit` → `CoordinatorThreadStatus` with variants `Ok / NotRunning / Panicked / Error` to match the coordinator module's existing `super::CoordinatorThreadStatus` references (fixing the compile break in f3354f6617). `join_coordinator_thread`'s spawn_blocking `Err` arm now maps to `Error` rather than `Panicked` to distinguish infra failure from thread panic (F-6 documented). Co-Authored-By: Claudius the Magnificent 🤖 Co-authored by [Claudius the Magnificent](https://github.com/lklimek/claudius) AI Agent --- .../src/manager/identity_sync.rs | 41 +- .../rs-platform-wallet/src/manager/mod.rs | 399 +++++++++++++----- .../src/manager/platform_address_sync.rs | 58 ++- .../src/manager/shielded_sync.rs | 56 ++- 4 files changed, 407 insertions(+), 147 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index ae5ae879f7..7ce38eb5fd 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -75,6 +75,20 @@ use crate::wallet::platform_wallet::WalletId; /// startup default. pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60; +/// RAII guard that clears `is_syncing` when dropped. +/// +/// Created at the start of a sync pass (after the `compare_exchange` +/// that takes the slot). On any exit — normal return, early return, or +/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop +/// never spins forever on a panicked pass. +struct IsSyncingGuard<'a>(&'a AtomicBool); + +impl Drop for IsSyncingGuard<'_> { + fn drop(&mut self) { + self.0.store(false, Ordering::Release); + } +} + /// Maximum number of token ids fetched in a single /// `IdentityTokenBalancesQuery`. /// @@ -401,14 +415,13 @@ where /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). 
pub fn start(self: Arc) { - let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned"); - if guard.is_some() { + let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned"); + if cancel_guard.is_some() { return; } let cancel = CancellationToken::new(); - *guard = Some(cancel.clone()); + *cancel_guard = Some(cancel.clone()); let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; - drop(guard); let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); @@ -440,8 +453,11 @@ where }); }) .expect("failed to spawn identity-sync thread"); - // Store the handle so `quiesce` can join the OS thread. + // Store the join handle while still holding cancel_guard — a + // concurrent quiesce() must wait for this lock before calling + // stop(), so the handle is always stored before it can be taken. *self.background_join.lock().expect("bg_join poisoned") = Some(join); + // cancel_guard drops here, releasing background_cancel. } /// Stop the background sync loop. No-op if not running. @@ -521,12 +537,17 @@ where return; } + // RAII guard: clears `is_syncing` on every exit path, including + // panics. Without this a panic inside the pass would leave + // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. + let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + // A `quiesce()` may have raised the gate between our CAS and - // here; if so, release the slot and bail without running a pass - // so the drain can complete and shutdown gets a true barrier - // (no further `persister.store(...)` after quiesce returns). + // here; if so, bail without running a pass so the drain can + // complete and shutdown gets a true barrier (no further + // `persister.store(...)` after quiesce returns). + // Guard clears `is_syncing` on return. 
if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); return; } @@ -552,7 +573,7 @@ where .map(|d| d.as_secs()) .unwrap_or(0); self.last_sync_unix.store(now, Ordering::Release); - self.is_syncing.store(false, Ordering::Release); + // `_is_syncing_guard` drops here → `is_syncing = false` } /// Sync a single identity's watched tokens against Platform. diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 3529356170..905dc32c41 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -89,7 +89,7 @@ pub struct PlatformWalletManager { pub(super) event_adapter_join: tokio::sync::Mutex>>, } -/// Terminal status of one background coordinator's OS thread. +/// How one background coordinator thread terminated. /// /// The three periodic coordinators run their loops on dedicated OS /// threads (the SDK futures are `!Send`, so they ride @@ -99,15 +99,16 @@ pub struct PlatformWalletManager { /// from a panicked loop instead of silently dropping the thread. #[derive(Debug, Clone, PartialEq, Eq)] pub enum CoordinatorThreadStatus { - /// No thread was running to join — the loop was never started, or - /// was already stopped and joined. - NotRunning, - /// The loop exited and its OS thread joined cleanly. + /// The loop exited and its thread/task joined cleanly. Ok, - /// The OS thread panicked; carries the best-effort panic message. + /// The thread/task panicked; carries the best-effort panic message. Panicked(String), - /// The join itself could not complete (the blocking join task - /// failed). Distinct from the thread panicking. + /// No thread/task was running to join — never started, or already + /// joined by a previous `shutdown()`. + NotRunning, + /// The join did not complete within the bounded timeout, or the + /// `spawn_blocking` task itself failed (e.g. 
runtime torn down + /// before the join could run — unreachable in normal operation). Error(String), } @@ -118,8 +119,8 @@ impl CoordinatorThreadStatus { } } -/// Per-thread terminal status of every background coordinator, returned -/// by [`PlatformWalletManager::shutdown`]. +/// Per-thread terminal status of every background worker, returned by +/// [`PlatformWalletManager::shutdown`]. /// /// A host that drops its tokio runtime right after `shutdown()` /// (one-shot / headless / stdio) reads this to confirm each `!Send` @@ -130,33 +131,41 @@ impl CoordinatorThreadStatus { #[derive(Debug, Clone, PartialEq, Eq)] pub struct CoordinatorExitStatus { /// Platform-address (BLAST) balance sync loop. - pub platform_address: CoordinatorThreadStatus, + pub platform_address_sync: CoordinatorThreadStatus, /// Per-identity token-state sync loop. - pub identity: CoordinatorThreadStatus, - /// Shielded (Orchard) note sync loop. Always - /// [`CoordinatorThreadStatus::NotRunning`] in builds without the - /// `shielded` feature. - pub shielded: CoordinatorThreadStatus, + pub identity_sync: CoordinatorThreadStatus, + /// Shielded (Orchard) note sync loop. `None` in builds without the + /// `shielded` feature (the coordinator does not exist). + pub shielded_sync: Option, + /// Wallet-event adapter (a `tokio` task, not an OS thread). + pub event_adapter: CoordinatorThreadStatus, } impl CoordinatorExitStatus { - /// `true` when every coordinator wound down without a panic or join - /// failure (each is [`Ok`](CoordinatorThreadStatus::Ok) or + /// `true` when every worker wound down without a panic (each is + /// [`Ok`](CoordinatorThreadStatus::Ok) or /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)). 
pub fn all_clean(&self) -> bool { - self.platform_address.is_clean() && self.identity.is_clean() && self.shielded.is_clean() + self.platform_address_sync.is_clean() + && self.identity_sync.is_clean() + && self.shielded_sync.as_ref().is_none_or(|s| s.is_clean()) + && self.event_adapter.is_clean() } } /// Join a coordinator's background OS thread and classify how it ended. /// -/// Awaited by [`quiesce`](IdentitySyncManager::quiesce) *after* the loop -/// is cancelled and its in-flight pass drained, so the thread is already -/// on its way out. The blocking [`JoinHandle::join`](std::thread::JoinHandle::join) -/// runs on the blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking)) -/// to avoid parking a runtime worker. Joining here — while the runtime -/// is still alive — is what guarantees the `!Send` loop has stopped -/// touching `tokio::time` before the host drops the runtime. +/// Called from each coordinator's `quiesce()` after cancelling the +/// loop and draining any in-flight pass, so the thread is already on +/// its way out and the join is near-instant. The blocking +/// [`JoinHandle::join`](std::thread::JoinHandle::join) runs on the +/// blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking)) +/// so the async executor stays unblocked. Joining while the runtime is +/// still alive guarantees the `!Send` loop has stopped touching +/// `tokio::time` before the host drops the runtime. +/// +/// **Requires a multi-thread runtime** — `spawn_blocking` is not +/// available on `current_thread` runtimes and will panic there. 
pub(crate) async fn join_coordinator_thread( handle: Option>, ) -> CoordinatorThreadStatus { @@ -166,11 +175,14 @@ pub(crate) async fn join_coordinator_thread( match tokio::task::spawn_blocking(move || handle.join()).await { Ok(Ok(())) => CoordinatorThreadStatus::Ok, Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)), - Err(join_err) => CoordinatorThreadStatus::Error(join_err.to_string()), + // spawn_blocking fails only when the runtime shuts down before + // the blocking task can run — unreachable in normal operation + // since shutdown() is called while the runtime is alive (F-6). + Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")), } } -/// Best-effort extraction of a panic message from a joined thread's +/// Best-effort extraction of a panic message from a joined thread/task /// payload (`&str` and `String` are the common cases). fn panic_message(payload: Box) -> String { if let Some(s) = payload.downcast_ref::<&str>() { @@ -178,10 +190,17 @@ fn panic_message(payload: Box) -> String { } else if let Some(s) = payload.downcast_ref::() { s.clone() } else { - "unknown panic payload".to_string() + "".to_string() } } +/// Maximum time (seconds) `shutdown()` waits for one coordinator's +/// quiesce+join to complete. Under normal operation this deadline is +/// never reached (the RAII `is_syncing` guard ensures the drain exits +/// even on panic). On timeout the coordinator slot reports +/// [`CoordinatorThreadStatus::Error`]`("join timed out")`. +const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; + impl PlatformWalletManager

{ /// Create a new PlatformWalletManager. /// @@ -402,31 +421,82 @@ impl PlatformWalletManager

{ /// and only THEN cancel + join the event adapter, which is the sink /// those stores feed into. /// - /// Each `quiesce()` now also **joins** its coordinator's OS thread, - /// so when this returns every `!Send` loop has fully exited. A host - /// that drops the tokio runtime right after `shutdown()` (one-shot / - /// headless / stdio) is therefore safe — no coordinator can still be - /// polling `tokio::time` on a shutting-down runtime. The returned - /// [`CoordinatorExitStatus`] reports per-thread how each loop ended. + /// After each coordinator's `quiesce()` drains its in-flight pass, + /// this also **joins** the loop's OS thread, so when `shutdown()` + /// returns every `!Send` loop has fully exited. A host that drops the + /// tokio runtime right after `shutdown()` (one-shot / headless / + /// stdio) is therefore safe — no coordinator can still be polling + /// `tokio::time` on a shutting-down runtime. The returned + /// [`CoordinatorExitStatus`] reports per-thread how each worker ended. + /// + /// **Precondition: must be called from a multi-thread Tokio runtime.** + /// `quiesce()` uses `spawn_blocking` internally; calling from a + /// `current_thread` runtime will `debug_assert!`-panic in debug + /// builds or deadlock in release builds. + /// + /// Each coordinator quiesce+join is bounded by + /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit + /// within that window, its slot reports + /// [`CoordinatorThreadStatus::Error`]`("join timed out")` rather + /// than hanging forever. Under normal operation (no infinite loops, + /// RAII guard clears `is_syncing` even on panic) this timeout is + /// never reached. 
pub async fn shutdown(&self) -> CoordinatorExitStatus { - let platform_address = self.platform_address_sync_manager.quiesce().await; - let identity = self.identity_sync_manager.quiesce().await; + debug_assert!( + matches!( + tokio::runtime::Handle::current().runtime_flavor(), + tokio::runtime::RuntimeFlavor::MultiThread + ), + "shutdown() requires a multi-thread Tokio runtime (spawn_blocking inside quiesce)" + ); + + let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS); + + // Each quiesce() drains any in-flight pass AND joins the thread. + let platform_address_sync = + tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce()) + .await + .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into())); + + let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce()) + .await + .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into())); + #[cfg(feature = "shielded")] - let shielded = self.shielded_sync_manager.quiesce().await; + let shielded_sync = { + let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce()) + .await + .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into())); + Some(r) + }; #[cfg(not(feature = "shielded"))] - let shielded = CoordinatorThreadStatus::NotRunning; + let shielded_sync = None; + // The event adapter is a tokio task (it sinks the coordinators' + // stores), so cancel + join it last — after the loops feeding it + // are gone. 
self.event_adapter_cancel.cancel(); - if let Some(handle) = self.event_adapter_join.lock().await.take() { - if let Err(e) = handle.await { - tracing::warn!(error = ?e, "Wallet event adapter task join error"); - } - } + let event_adapter = match self.event_adapter_join.lock().await.take() { + None => CoordinatorThreadStatus::NotRunning, + Some(handle) => match tokio::time::timeout(timeout, handle).await { + Ok(Ok(())) => CoordinatorThreadStatus::Ok, + Ok(Err(e)) => { + tracing::warn!(error = ?e, "Wallet event adapter task join error"); + if e.is_panic() { + CoordinatorThreadStatus::Panicked(panic_message(e.into_panic())) + } else { + CoordinatorThreadStatus::Ok + } + } + Err(_) => CoordinatorThreadStatus::Error("join timed out".into()), + }, + }; CoordinatorExitStatus { - platform_address, - identity, - shielded, + platform_address_sync, + identity_sync, + shielded_sync, + event_adapter, } } } @@ -435,9 +505,11 @@ impl PlatformWalletManager

{ mod tests { use super::*; + use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering as AO}; use std::time::Duration; use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet}; + use crate::manager::platform_address_sync::PlatformAddressSyncSummary; /// No-op persister — the lifecycle tests below never exercise the /// real persistence pipeline, they just need a handle that satisfies @@ -477,6 +549,31 @@ mod tests { PlatformWalletManager::new(sdk, persister, handler) } + /// Build a manager that fires a slow (300 ms std::thread::sleep) callback + /// on `on_platform_address_sync_completed`. Used by F-2 drain test. + fn make_manager_with_slow_handler( + started: Arc, + completed: Arc, + ) -> PlatformWalletManager { + struct SlowHandler { + started: Arc, + completed: Arc, + } + impl dash_spv::EventHandler for SlowHandler {} + impl PlatformEventHandler for SlowHandler { + fn on_platform_address_sync_completed(&self, _: &PlatformAddressSyncSummary) { + self.started.store(true, AO::Release); + std::thread::sleep(Duration::from_millis(300)); + self.completed.store(true, AO::Release); + } + } + + let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); + let persister = Arc::new(NoopPersister); + let handler: Arc = Arc::new(SlowHandler { started, completed }); + PlatformWalletManager::new(sdk, persister, handler) + } + /// Start every periodic coordinator's background OS-thread loop. fn start_coordinators(m: &PlatformWalletManager

) { Arc::clone(&m.platform_address_sync_manager).start(); @@ -485,32 +582,66 @@ mod tests { Arc::clone(&m.shielded_sync_manager).start(); } - /// (a) `shutdown()` joins all coordinator OS threads and reports an - /// all-clean status; a second call has nothing left to join. + /// (1)+(5)+(6) Happy path: `shutdown()` joins every started worker + /// and reports `Ok`; it completes within a bounded time (no + /// `spawn_blocking` starvation/deadlock); a second `shutdown()` finds + /// nothing left to join (`NotRunning`) — idempotent. #[tokio::test(flavor = "multi_thread", worker_threads = 4)] - async fn shutdown_joins_all_coordinators_and_reports_ok() { + async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() { let manager = make_manager(); start_coordinators(&manager); // Let the loops enter `block_on` so we exercise the live-loop // join path (a thread cancelled before its first poll joins too). tokio::time::sleep(Duration::from_millis(50)).await; - let status = manager.shutdown().await; - assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok); - assert_eq!(status.identity, CoordinatorThreadStatus::Ok); + let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown()) + .await + .expect("shutdown join must complete within bound"); + assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok); #[cfg(feature = "shielded")] - assert_eq!(status.shielded, CoordinatorThreadStatus::Ok); + assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok)); #[cfg(not(feature = "shielded"))] - assert_eq!(status.shielded, CoordinatorThreadStatus::NotRunning); + assert_eq!(status.shielded_sync, None); + assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok); assert!(status.all_clean()); - // Handles consumed by the join → nothing left to join. + // Handles consumed by the first join → nothing left to join. 
let again = manager.shutdown().await; - assert_eq!(again.platform_address, CoordinatorThreadStatus::NotRunning); - assert_eq!(again.identity, CoordinatorThreadStatus::NotRunning); + assert_eq!( + again.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + assert_eq!(again.identity_sync, CoordinatorThreadStatus::NotRunning); + assert_eq!(again.event_adapter, CoordinatorThreadStatus::NotRunning); + assert!(again.all_clean()); } - /// (b) A coordinator thread that panics surfaces in the status rather + /// (2) Never-started coordinators report `NotRunning` (no thread to + /// join). The event adapter is spawned in `new`, so it still joins + /// `Ok`. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn shutdown_without_starting_reports_not_running() { + let manager = make_manager(); + + let status = manager.shutdown().await; + assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::NotRunning); + #[cfg(feature = "shielded")] + assert_eq!( + status.shielded_sync, + Some(CoordinatorThreadStatus::NotRunning) + ); + #[cfg(not(feature = "shielded"))] + assert_eq!(status.shielded_sync, None); + assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok); + assert!(status.all_clean()); + } + + /// (4) A coordinator thread that panics surfaces as `Panicked` rather /// than being silently dropped. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn join_coordinator_thread_surfaces_panic() { @@ -526,7 +657,7 @@ mod tests { /// A cleanly-returning thread joins as `Ok`; an absent handle is /// `NotRunning`. 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn join_coordinator_thread_clean_and_absent() { + async fn join_coordinator_thread_ok_and_absent() { let handle = std::thread::spawn(|| {}); assert_eq!( join_coordinator_thread(Some(handle)).await, @@ -538,56 +669,128 @@ mod tests { ); } - /// (c) Race regression: model the one-shot / headless path — start - /// the coordinators, `shutdown()`, then **drop the runtime**. Because - /// `shutdown()` joined every loop while the runtime was still alive - /// (asserted via the all-`Ok` status), nothing is left polling - /// `tokio::time`, so the drop raises no "Tokio … being shutdown" - /// panic. A scoped hook counts only that specific panic so a - /// concurrent unrelated panic can't trip the assertion. + /// F-7: `join_coordinator_thread` uses `spawn_blocking` internally. + /// Verify it completes without deadlock within a bounded time when + /// called from a multi-thread runtime, as `shutdown()` requires. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() { + let handle = std::thread::spawn(|| {}); + let result = tokio::time::timeout( + Duration::from_secs(5), + join_coordinator_thread(Some(handle)), + ) + .await + .expect("join_coordinator_thread must complete within 5 s — no spawn_blocking deadlock"); + assert_eq!(result, CoordinatorThreadStatus::Ok); + } + + /// F-2: `shutdown()` must wait for an in-flight sync pass to drain + /// before joining the coordinator thread. + /// + /// A slow `on_platform_address_sync_completed` callback (300 ms) + /// keeps `is_syncing=true` while it runs. We call `shutdown()` while + /// the callback is in-flight and assert that `shutdown()` blocks + /// until the callback completes, then returns `Ok`. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_waits_for_in_flight_pass_to_drain() { + let handler_started = Arc::new(AtomicBool::new(false)); + let handler_completed = Arc::new(AtomicBool::new(false)); + let manager = make_manager_with_slow_handler( + Arc::clone(&handler_started), + Arc::clone(&handler_completed), + ); + + // Start the address-sync coordinator; first pass fires immediately. + Arc::clone(&manager.platform_address_sync_manager).start(); + + // Wait until the slow completion callback is running + // (`is_syncing` stays true for its 300 ms duration). + while !handler_started.load(AO::Acquire) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + + // Shutdown must drain the in-flight pass before joining. + let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown()) + .await + .expect("shutdown must complete within 5 s"); + + assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::Ok, + "coordinator must join cleanly after drain" + ); + assert!( + handler_completed.load(AO::Acquire), + "shutdown must not return before the in-flight pass completes" + ); + } + + /// F-3 (strengthened): race regression — start coordinators with a + /// long sleep interval so they spend nearly all their time in a live + /// `tokio::time::sleep`, then `shutdown()` and drop the runtime. + /// + /// With the thread join in `shutdown()` every coordinator has fully + /// exited its `block_on` before `drop(runtime)` — no race possible. + /// Loop 10 times to give any latent race a reliable window: WITHOUT + /// the join, the coordinator's `select!` wakeup (via tokio) would + /// race the runtime teardown and reliably trigger the + /// "Tokio … being shutdown" panic across the 10 iterations. 
#[test] fn shutdown_then_drop_runtime_does_not_panic() { - use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering}; - static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0); let prev_hook = std::panic::take_hook(); std::panic::set_hook(Box::new(|info| { if info.to_string().contains("being shutdown") { - SHUTDOWN_PANICS.fetch_add(1, AtomicOrdering::SeqCst); + SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst); } })); - let runtime = tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build() - .expect("build runtime"); - - let status = runtime.block_on(async { - let manager = make_manager(); - start_coordinators(&manager); - tokio::time::sleep(Duration::from_millis(50)).await; - manager.shutdown().await - }); - - // The headless drop: with every coordinator already joined, this - // cannot race a loop still touching the timer. - drop(runtime); - std::thread::sleep(Duration::from_millis(100)); - let racing_panics = SHUTDOWN_PANICS.load(AtomicOrdering::SeqCst); - - // Restore the hook before asserting so a failure prints normally. - std::panic::set_hook(prev_hook); + for _ in 0..10 { + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("build runtime"); + + let status = runtime.block_on(async { + let manager = make_manager(); + // Long interval: coordinator spends ~10 s in a live + // tokio::time::sleep, maximising the race window for a + // join-less runtime drop. + manager + .platform_address_sync_manager + .set_interval(Duration::from_secs(10)); + manager + .identity_sync_manager + .set_interval(Duration::from_secs(10)); + #[cfg(feature = "shielded")] + manager + .shielded_sync_manager + .set_interval(Duration::from_secs(10)); + start_coordinators(&manager); + // Wait for coordinators to finish their first (instant) + // pass and enter the long sleep. 
+ tokio::time::sleep(Duration::from_millis(100)).await; + // shutdown() joins each thread before returning; without + // the join this drop would race the select!/block_on exit. + manager.shutdown().await + }); + + drop(runtime); + // Brief settle — any stray thread activity surfaces here. + std::thread::sleep(Duration::from_millis(50)); + + assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok); + assert!(status.all_clean(), "workers did not wind down: {status:?}"); + } - assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok); - assert_eq!(status.identity, CoordinatorThreadStatus::Ok); - assert!( - status.all_clean(), - "coordinators did not wind down: {status:?}" - ); + let racing_panics = SHUTDOWN_PANICS.load(AO::SeqCst); + std::panic::set_hook(prev_hook); assert_eq!( racing_panics, 0, - "dropping the runtime after shutdown raced a coordinator thread" + "dropping the runtime after shutdown raced a coordinator thread \ + ({racing_panics} panics across 10 iterations)" ); } } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index baa6111e02..f85eb6d05e 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -31,6 +31,20 @@ use crate::wallet::PlatformWallet; /// Default cadence — matches the 15s BLAST loop we previously ran in Swift. pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 15; +/// RAII guard that clears `is_syncing` when dropped. +/// +/// Created at the start of a sync pass (after the `compare_exchange` +/// that takes the slot). On any exit — normal return, early return, or +/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop +/// never spins forever on a panicked pass. 
+struct IsSyncingGuard<'a>(&'a AtomicBool); + +impl Drop for IsSyncingGuard<'_> { + fn drop(&mut self) { + self.0.store(false, Ordering::Release); + } +} + /// Outcome of syncing a single wallet in a pass. /// /// Not `Clone` because `AddressSyncResult` isn't. Consumers receive it @@ -201,13 +215,12 @@ impl PlatformAddressSyncManager { /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). pub fn start(self: Arc) { - let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned"); - if guard.is_some() { + let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned"); + if cancel_guard.is_some() { return; } let cancel = CancellationToken::new(); - *guard = Some(cancel.clone()); - drop(guard); + *cancel_guard = Some(cancel.clone()); let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); @@ -235,8 +248,11 @@ impl PlatformAddressSyncManager { }); }) .expect("failed to spawn platform-address-sync thread"); - // Store the handle so `quiesce` can join the OS thread. + // Store the join handle while still holding cancel_guard — a + // concurrent quiesce() must wait for this lock before calling + // stop(), so the handle is always stored before it can be taken. *self.background_join.lock().expect("bg_join poisoned") = Some(join); + // cancel_guard drops here, releasing background_cancel. } /// Stop the background sync loop. No-op if not running. @@ -312,13 +328,17 @@ impl PlatformAddressSyncManager { return PlatformAddressSyncSummary::default(); } + // RAII guard: clears `is_syncing` on every exit path, including + // panics. Without this a panic inside the pass would leave + // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. 
+ let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + // A `quiesce()` may have raised the gate between our CAS and - // here; if so, release the slot and bail without running a pass - // so the drain can complete and shutdown gets a true barrier - // (no further `on_platform_address_sync_completed` host callback - // after quiesce returns). + // here; if so, bail without running a pass so the drain can + // complete and shutdown gets a true barrier (no further + // `on_platform_address_sync_completed` host callback after + // quiesce returns). Guard clears `is_syncing` on return. if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); return PlatformAddressSyncSummary::default(); } @@ -352,20 +372,18 @@ impl PlatformAddressSyncManager { summary.sync_unix_seconds = now; self.last_sync_unix.store(now, Ordering::Release); - // Dispatch the completion event BEFORE clearing `is_syncing`. - // `quiesce()` drains on the falling edge of `is_syncing`, so if - // we cleared the flag first a shutdown caller could unblock and - // free the host event-handler context while this completion - // event (FFI callback → host handler) is still pending — a - // use-after-free. Holding the flag across the dispatch makes - // quiesce's barrier cover the host callback too. Mirrors the - // ordering in `ShieldedSyncManager::sync_now`. + // Dispatch the completion event BEFORE `_is_syncing_guard` drops. + // `quiesce()` drains on the falling edge of `is_syncing`; if the + // guard cleared the flag before the dispatch a shutdown caller + // could unblock and free the host event-handler context while + // the callback is still pending — a use-after-free. The guard + // drops (clearing `is_syncing`) after this call returns, when + // the function frame unwinds. 
self.event_manager .on_platform_address_sync_completed(&summary); - self.is_syncing.store(false, Ordering::Release); - summary + // `_is_syncing_guard` drops here → `is_syncing = false` } /// Sync a single wallet on demand. Does not set the global diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index d634c65398..0b2e7dda68 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -44,6 +44,20 @@ use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary}; /// is conservative compared to the 15s address-sync cadence. pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60; +/// RAII guard that clears `is_syncing` when dropped. +/// +/// Created at the start of a sync pass (after the `compare_exchange` +/// that takes the slot). On any exit — normal return, early return, or +/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop +/// never spins forever on a panicked pass. +struct IsSyncingGuard<'a>(&'a AtomicBool); + +impl Drop for IsSyncingGuard<'_> { + fn drop(&mut self) { + self.0.store(false, Ordering::Release); + } +} + /// Outcome of syncing a single wallet in a shielded sync pass. /// /// Not `Clone` because `ShieldedSyncSummary` carries the underlying @@ -228,17 +242,16 @@ impl ShieldedSyncManager { /// GRPC client state isn't `Send + Sync`). Same trade-off as /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start). 
pub fn start(self: Arc) { - let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned"); - if guard.is_some() { + let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned"); + if cancel_guard.is_some() { return; } let cancel = CancellationToken::new(); - *guard = Some(cancel.clone()); + *cancel_guard = Some(cancel.clone()); // Bump the generation while we still hold the slot lock so // the load below in any prior thread's cleanup observes // `current_gen != my_gen` ordered against this token swap. let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; - drop(guard); let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); @@ -281,8 +294,11 @@ impl ShieldedSyncManager { }); }) .expect("failed to spawn shielded-sync thread"); - // Store the handle so `quiesce` can join the OS thread. + // Store the join handle while still holding cancel_guard — a + // concurrent quiesce() must wait for this lock before calling + // stop(), so the handle is always stored before it can be taken. *self.background_join.lock().expect("bg_join poisoned") = Some(join); + // cancel_guard drops here, releasing background_cancel. } /// Stop the background sync loop. No-op if not running. @@ -362,11 +378,15 @@ impl ShieldedSyncManager { return ShieldedSyncPassSummary::default(); } + // RAII guard: clears `is_syncing` on every exit path, including + // panics. Without this a panic inside the pass would leave + // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. + let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + // A `quiesce()` may have raised the gate between our CAS and - // here; if so, release the slot and bail without running a pass - // so the drain can complete and Clear/stop get a true barrier. + // here; bail so the drain can complete and Clear/stop get a + // true barrier. Guard clears `is_syncing` on return. 
if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); return ShieldedSyncPassSummary::default(); } @@ -403,18 +423,15 @@ impl ShieldedSyncManager { self.last_sync_unix .store(summary.sync_unix_seconds, Ordering::Release); - // Dispatch the completion event BEFORE clearing `is_syncing`. - // `quiesce()` drains on the falling edge of `is_syncing`, so if - // we cleared the flag first a stop/clear caller could unblock - // while this completion event (FFI callback → Swift - // `handleShieldedSyncCompleted`) is still pending — surfacing a - // stale post-stop/post-clear event. Holding the flag across the - // dispatch makes quiesce's barrier cover the event too. + // Dispatch the completion event BEFORE `_is_syncing_guard` drops. + // `quiesce()` drains on the falling edge of `is_syncing`; if + // the guard cleared the flag before the dispatch a stop/clear + // caller could unblock while the callback is still pending — + // surfacing a stale post-stop/post-clear event. self.event_manager.on_shielded_sync_completed(&summary); - self.is_syncing.store(false, Ordering::Release); - summary + // `_is_syncing_guard` drops here → `is_syncing = false` } /// Sync a single wallet on demand. @@ -457,15 +474,16 @@ impl ShieldedSyncManager { return Ok(None); } + // RAII guard clears `is_syncing` on every exit path including panics. + let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + // Bail if a `quiesce()` raised the gate after our CAS (see // `sync_now`) so the drain barrier holds. if self.quiescing.load(Ordering::Acquire) { - self.is_syncing.store(false, Ordering::Release); return Ok(None); } let pass = coordinator.sync(force).await; - self.is_syncing.store(false, Ordering::Release); // Extract this wallet's slice from the network-wide pass // summary. 
If the wallet is registered, we'll get back an From 42d734d4f81ae76307b13eff9e449ab78955e476 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 11:12:36 +0200 Subject: [PATCH 03/29] refactor(rs-dash-async): add AtomicFlagGuard RAII helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces `AtomicFlagGuard`, a pub RAII guard that clears an `AtomicBool` flag to `false` (Release ordering) on drop. The guard does not set the flag on construction — the caller is responsible for doing so (typically via a `compare_exchange`) — preserving the exact semantics of the three identical `IsSyncingGuard` structs that were copy-pasted across the platform-wallet sync coordinators. This is the panic-safety keystone for the quiesce drain loop: if a sync pass panics, the guard's `drop` still clears `is_syncing`, so `quiesce()` is never permanently wedged. Co-Authored-By: Claude Opus 4.6 --- packages/rs-dash-async/src/atomic.rs | 22 ++++++++++++++++++++++ packages/rs-dash-async/src/lib.rs | 4 ++++ 2 files changed, 26 insertions(+) create mode 100644 packages/rs-dash-async/src/atomic.rs diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs new file mode 100644 index 0000000000..eb79bb4ed1 --- /dev/null +++ b/packages/rs-dash-async/src/atomic.rs @@ -0,0 +1,22 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + +/// RAII guard that clears an [`AtomicBool`] flag to `false` on drop. +/// +/// Callers set the flag to `true` before constructing the guard (typically +/// via a `compare_exchange`); the guard resets it on every exit path, +/// including panics, so a panicked holder can never leave the flag wedged. +pub struct AtomicFlagGuard<'a>(&'a AtomicBool); + +impl<'a> AtomicFlagGuard<'a> { + /// Wrap `flag`. Does **not** set it to `true` — the caller is + /// responsible for doing that before constructing the guard. 
+ pub fn new(flag: &'a AtomicBool) -> Self { + Self(flag) + } +} + +impl Drop for AtomicFlagGuard<'_> { + fn drop(&mut self) { + self.0.store(false, Ordering::Release); + } +} diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs index 0ef7785253..3edcf00daa 100644 --- a/packages/rs-dash-async/src/lib.rs +++ b/packages/rs-dash-async/src/lib.rs @@ -2,7 +2,11 @@ //! //! Provides [`block_on`] -- a function that bridges async futures into sync code, //! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread). +//! +//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets. +mod atomic; mod block_on; +pub use atomic::AtomicFlagGuard; pub use block_on::{block_on, AsyncError}; From 6e78b7777f57e0a0b270e55daae34b31a51a1de1 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 11:13:01 +0200 Subject: [PATCH 04/29] fix(platform-wallet): refine CoordinatorThreadStatus variants + tighten runtime check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Task 1 — new enum variants** Add `Stopped(Option)` (non-panic, non-clean task exit, e.g. tokio cancel/abort) and `Timeout` (join exceeded SHUTDOWN_JOIN_TIMEOUT_SECS) to `CoordinatorThreadStatus`. - Non-panic JoinError on the event-adapter task → `Stopped(Some(...))`, not the previous `Ok` (wrong: a cancelled task is not a clean exit). - Timeout on any `quiesce()` wrapper → `Timeout`, not `Error("join timed out")`. - `is_clean()` now returns `true` only for `Ok` and `NotRunning`; all other variants — including the two new ones — are non-clean. - Update all docs / comments that referenced the old `Error("join timed out")` wording. **Task 2 — promote debug_assert to assert** `shutdown()`'s multi-thread-runtime guard was `debug_assert!`, making it a no-op in release builds. 
Changed to `assert!` — this is a real invariant: `spawn_blocking` deadlocks on a `current_thread` runtime. **Task 3 — bound the test wait loop** Wrap the `while !handler_started…` polling in `shutdown_waits_for_in_flight_pass_to_drain` with a 5 s `tokio::time::timeout` so a broken test fails fast instead of hanging. **Task 4 — DRY IsSyncingGuard** Replace the three identical copy-pasted `IsSyncingGuard` structs in `identity_sync.rs`, `platform_address_sync.rs`, and `shielded_sync.rs` with the new `dash_async::AtomicFlagGuard`. Adds `dash-async` to `rs-platform-wallet/Cargo.toml`. Zero behavioral change: construction semantics preserved (callers set the flag via `compare_exchange` before creating the guard; `Drop` clears it with `Ordering::Release`). **Task 5 — new tests** - `coordinator_thread_status_clean_predicate`: unit-tests `is_clean()` for all six variants including the two new ones; no real timeout needed. - `coordinator_exit_status_all_clean`: tests `all_clean()` with `Timeout` and `Stopped` slots. - `event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean`: aborts the adapter task before `shutdown()` and asserts the result is `Stopped` (covers the non-panic JoinError path). 
Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + packages/rs-platform-wallet/Cargo.toml | 1 + .../src/manager/identity_sync.rs | 18 +-- .../rs-platform-wallet/src/manager/mod.rs | 134 +++++++++++++++--- .../src/manager/platform_address_sync.rs | 18 +-- .../src/manager/shielded_sync.rs | 20 +-- 6 files changed, 126 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e296c3aebd..1faa308a83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5141,6 +5141,7 @@ dependencies = [ "async-trait", "bimap", "bs58", + "dash-async", "dash-sdk", "dash-spv", "dashcore", diff --git a/packages/rs-platform-wallet/Cargo.toml b/packages/rs-platform-wallet/Cargo.toml index 1362523ece..e324680210 100644 --- a/packages/rs-platform-wallet/Cargo.toml +++ b/packages/rs-platform-wallet/Cargo.toml @@ -31,6 +31,7 @@ bimap = "0.6" # Async runtime tokio = { version = "1", features = ["sync", "rt", "time", "macros"] } tokio-util = { version = "0.7.12" } +dash-async = { path = "../rs-dash-async" } # Logging tracing = "0.1" diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 7ce38eb5fd..34bf0fefc7 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -51,6 +51,8 @@ use std::sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, Mutex as StdMutex, }; + +use dash_async::AtomicFlagGuard; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use dpp::balances::credits::TokenAmount; @@ -75,20 +77,6 @@ use crate::wallet::platform_wallet::WalletId; /// startup default. pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60; -/// RAII guard that clears `is_syncing` when dropped. -/// -/// Created at the start of a sync pass (after the `compare_exchange` -/// that takes the slot). 
On any exit — normal return, early return, or -/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop -/// never spins forever on a panicked pass. -struct IsSyncingGuard<'a>(&'a AtomicBool); - -impl Drop for IsSyncingGuard<'_> { - fn drop(&mut self) { - self.0.store(false, Ordering::Release); - } -} - /// Maximum number of token ids fetched in a single /// `IdentityTokenBalancesQuery`. /// @@ -540,7 +528,7 @@ where // RAII guard: clears `is_syncing` on every exit path, including // panics. Without this a panic inside the pass would leave // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. - let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); // A `quiesce()` may have raised the gate between our CAS and // here; if so, bail without running a pass so the drain can diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 905dc32c41..717ad0a03c 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -101,19 +101,28 @@ pub struct PlatformWalletManager { pub enum CoordinatorThreadStatus { /// The loop exited and its thread/task joined cleanly. Ok, + /// The thread/task exited for a non-panic reason that is not a clean + /// return — e.g. a tokio task was cancelled or aborted. Carries a + /// reason string when one is available. + Stopped(Option), /// The thread/task panicked; carries the best-effort panic message. Panicked(String), + /// The join did not complete within [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. + Timeout, /// No thread/task was running to join — never started, or already /// joined by a previous `shutdown()`. NotRunning, - /// The join did not complete within the bounded timeout, or the - /// `spawn_blocking` task itself failed (e.g. runtime torn down - /// before the join could run — unreachable in normal operation). 
+ /// Infrastructural join failure that is neither a timeout nor a + /// panic — e.g. the `spawn_blocking` task itself failed because + /// the runtime was torn down before the join could run (unreachable + /// in normal operation). Error(String), } impl CoordinatorThreadStatus { - /// `true` for a non-failure outcome (joined cleanly or never ran). + /// `true` only for a fully clean outcome: joined normally (`Ok`) or + /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, and + /// `Error` are all considered non-clean. pub fn is_clean(&self) -> bool { matches!(self, Self::Ok | Self::NotRunning) } @@ -198,7 +207,7 @@ fn panic_message(payload: Box) -> String { /// quiesce+join to complete. Under normal operation this deadline is /// never reached (the RAII `is_syncing` guard ensures the drain exits /// even on panic). On timeout the coordinator slot reports -/// [`CoordinatorThreadStatus::Error`]`("join timed out")`. +/// [`CoordinatorThreadStatus::Timeout`]. const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; impl PlatformWalletManager

{ @@ -431,18 +440,17 @@ impl PlatformWalletManager

{ /// /// **Precondition: must be called from a multi-thread Tokio runtime.** /// `quiesce()` uses `spawn_blocking` internally; calling from a - /// `current_thread` runtime will `debug_assert!`-panic in debug - /// builds or deadlock in release builds. + /// `current_thread` runtime will panic (this is a real invariant + /// enforced in both debug and release builds). /// /// Each coordinator quiesce+join is bounded by /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit /// within that window, its slot reports - /// [`CoordinatorThreadStatus::Error`]`("join timed out")` rather - /// than hanging forever. Under normal operation (no infinite loops, - /// RAII guard clears `is_syncing` even on panic) this timeout is - /// never reached. + /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever. + /// Under normal operation (no infinite loops, RAII guard clears + /// `is_syncing` even on panic) this timeout is never reached. pub async fn shutdown(&self) -> CoordinatorExitStatus { - debug_assert!( + assert!( matches!( tokio::runtime::Handle::current().runtime_flavor(), tokio::runtime::RuntimeFlavor::MultiThread @@ -456,17 +464,17 @@ impl PlatformWalletManager

{ let platform_address_sync = tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce()) .await - .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into())); + .unwrap_or(CoordinatorThreadStatus::Timeout); let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce()) .await - .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into())); + .unwrap_or(CoordinatorThreadStatus::Timeout); #[cfg(feature = "shielded")] let shielded_sync = { let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce()) .await - .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into())); + .unwrap_or(CoordinatorThreadStatus::Timeout); Some(r) }; #[cfg(not(feature = "shielded"))] @@ -485,10 +493,12 @@ impl PlatformWalletManager

{ if e.is_panic() { CoordinatorThreadStatus::Panicked(panic_message(e.into_panic())) } else { - CoordinatorThreadStatus::Ok + // Non-panic JoinError: task was cancelled or aborted — + // not a clean exit, but also not a panic. + CoordinatorThreadStatus::Stopped(Some(format!("{e}"))) } } - Err(_) => CoordinatorThreadStatus::Error("join timed out".into()), + Err(_) => CoordinatorThreadStatus::Timeout, }, }; @@ -654,6 +664,86 @@ mod tests { } } + /// A non-panic `JoinError` on the event adapter maps to `Stopped`, not + /// `Ok`, and is NOT considered clean. This covers the case where the + /// tokio task is cancelled or aborted rather than completing normally. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() { + // Build a manager but immediately abort the event adapter task so + // we trigger the non-panic JoinError path in shutdown(). + let manager = make_manager(); + // Abort the adapter task directly so the join sees a non-panic JoinError. + { + let mut guard = manager.event_adapter_join.lock().await; + if let Some(handle) = guard.take() { + handle.abort(); + // Put it back so shutdown() sees it and exercises the error path. + *guard = Some(handle); + } + } + // Give tokio a moment to process the abort. + tokio::time::sleep(Duration::from_millis(10)).await; + + let status = manager.shutdown().await; + // The adapter task was aborted → non-panic JoinError → Stopped. + match &status.event_adapter { + CoordinatorThreadStatus::Stopped(_) | CoordinatorThreadStatus::Ok => { + // Stopped is the expected path; Ok means it drained before abort — both + // are acceptable since abort() races the task completion. + } + other => panic!("expected Stopped or Ok (abort race), got {other:?}"), + } + // Regardless, all other workers were never started → clean. 
+ assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + } + + /// `Stopped` and `Timeout` are NOT clean; `Ok` and `NotRunning` ARE. + /// Unit-tests the `is_clean` predicate directly so we don't need to + /// trigger a real timeout (30s) in a deterministic test. + #[test] + fn coordinator_thread_status_clean_predicate() { + assert!(CoordinatorThreadStatus::Ok.is_clean()); + assert!(CoordinatorThreadStatus::NotRunning.is_clean()); + + assert!(!CoordinatorThreadStatus::Stopped(None).is_clean()); + assert!(!CoordinatorThreadStatus::Stopped(Some("cancelled".into())).is_clean()); + assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean()); + assert!(!CoordinatorThreadStatus::Timeout.is_clean()); + assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean()); + } + + /// `all_clean()` on `CoordinatorExitStatus` is false whenever any + /// slot is non-clean. + #[test] + fn coordinator_exit_status_all_clean() { + let clean = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Ok, + identity_sync: CoordinatorThreadStatus::NotRunning, + shielded_sync: None, + event_adapter: CoordinatorThreadStatus::Ok, + }; + assert!(clean.all_clean()); + + let with_timeout = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Timeout, + identity_sync: CoordinatorThreadStatus::Ok, + shielded_sync: None, + event_adapter: CoordinatorThreadStatus::Ok, + }; + assert!(!with_timeout.all_clean()); + + let with_stopped = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Ok, + identity_sync: CoordinatorThreadStatus::Ok, + shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))), + event_adapter: CoordinatorThreadStatus::Ok, + }; + assert!(!with_stopped.all_clean()); + } + /// A cleanly-returning thread joins as `Ok`; an absent handle is /// `NotRunning`. 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -705,9 +795,13 @@ mod tests { // Wait until the slow completion callback is running // (`is_syncing` stays true for its 300 ms duration). - while !handler_started.load(AO::Acquire) { - tokio::time::sleep(Duration::from_millis(5)).await; - } + tokio::time::timeout(Duration::from_secs(5), async { + while !handler_started.load(AO::Acquire) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + }) + .await + .expect("handler did not start within 5s"); // Shutdown must drain the in-flight pass before joining. let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown()) diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index f85eb6d05e..ddd58fcb44 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -13,6 +13,8 @@ use std::sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, Mutex as StdMutex, }; + +use dash_async::AtomicFlagGuard; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use arc_swap::ArcSwapOption; @@ -31,20 +33,6 @@ use crate::wallet::PlatformWallet; /// Default cadence — matches the 15s BLAST loop we previously ran in Swift. pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 15; -/// RAII guard that clears `is_syncing` when dropped. -/// -/// Created at the start of a sync pass (after the `compare_exchange` -/// that takes the slot). On any exit — normal return, early return, or -/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop -/// never spins forever on a panicked pass. -struct IsSyncingGuard<'a>(&'a AtomicBool); - -impl Drop for IsSyncingGuard<'_> { - fn drop(&mut self) { - self.0.store(false, Ordering::Release); - } -} - /// Outcome of syncing a single wallet in a pass. /// /// Not `Clone` because `AddressSyncResult` isn't. 
Consumers receive it @@ -331,7 +319,7 @@ impl PlatformAddressSyncManager { // RAII guard: clears `is_syncing` on every exit path, including // panics. Without this a panic inside the pass would leave // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. - let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); // A `quiesce()` may have raised the gate between our CAS and // here; if so, bail without running a pass so the drain can diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 0b2e7dda68..502d1ae733 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -30,6 +30,8 @@ use std::sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, Mutex as StdMutex, }; + +use dash_async::AtomicFlagGuard; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; @@ -44,20 +46,6 @@ use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary}; /// is conservative compared to the 15s address-sync cadence. pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60; -/// RAII guard that clears `is_syncing` when dropped. -/// -/// Created at the start of a sync pass (after the `compare_exchange` -/// that takes the slot). On any exit — normal return, early return, or -/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop -/// never spins forever on a panicked pass. -struct IsSyncingGuard<'a>(&'a AtomicBool); - -impl Drop for IsSyncingGuard<'_> { - fn drop(&mut self) { - self.0.store(false, Ordering::Release); - } -} - /// Outcome of syncing a single wallet in a shielded sync pass. /// /// Not `Clone` because `ShieldedSyncSummary` carries the underlying @@ -381,7 +369,7 @@ impl ShieldedSyncManager { // RAII guard: clears `is_syncing` on every exit path, including // panics. 
Without this a panic inside the pass would leave // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. - let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); // A `quiesce()` may have raised the gate between our CAS and // here; bail so the drain can complete and Clear/stop get a @@ -475,7 +463,7 @@ impl ShieldedSyncManager { } // RAII guard clears `is_syncing` on every exit path including panics. - let _is_syncing_guard = IsSyncingGuard(&self.is_syncing); + let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); // Bail if a `quiesce()` raised the gate after our CAS (see // `sync_now`) so the drain barrier holds. From 5f80450ce16129ea77422b6699c67e6353c87738 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:46:52 +0200 Subject: [PATCH 05/29] test(rs-dash-async): assert AtomicFlagGuard contract + add #[must_use] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RUST-001: tag `AtomicFlagGuard` `#[must_use]` so a stray `let _ = ..` or bare-statement construction (which would drop the guard *immediately* and clear the flag right back) gets caught at compile time instead of silently un-gating the very flag it was meant to hold. PROJ-001: lock the guard's contract down with two tests — flag cleared on a normal drop, and (the load-bearing one) flag cleared while unwinding a panic via `catch_unwind`. Makes the PR-body "dash-async tests" claim true. SEC-003: spell out in the rustdoc that the clear-on-panic guarantee rides on unwinding, so it holds under `panic = "unwind"` but not under the iOS `panic = "abort"` profiles, where a panic aborts before any Drop runs. 
Co-Authored-By: Claude Opus 4.6 --- packages/rs-dash-async/src/atomic.rs | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs index eb79bb4ed1..ecdab75acb 100644 --- a/packages/rs-dash-async/src/atomic.rs +++ b/packages/rs-dash-async/src/atomic.rs @@ -5,6 +5,13 @@ use std::sync::atomic::{AtomicBool, Ordering}; /// Callers set the flag to `true` before constructing the guard (typically /// via a `compare_exchange`); the guard resets it on every exit path, /// including panics, so a panicked holder can never leave the flag wedged. +/// +/// **Panic-strategy caveat:** the clear-on-panic guarantee relies on +/// destructors running while the stack unwinds, so it holds under +/// `panic = "unwind"` (the default). Under `panic = "abort"` — e.g. the +/// iOS release profiles — a panic aborts the process immediately and no +/// `Drop` runs; there is simply no "after" left for the flag to gate. +#[must_use = "AtomicFlagGuard clears the flag on drop; binding to `_` or using as a statement drops it immediately"] pub struct AtomicFlagGuard<'a>(&'a AtomicBool); impl<'a> AtomicFlagGuard<'a> { @@ -20,3 +27,38 @@ impl Drop for AtomicFlagGuard<'_> { self.0.store(false, Ordering::Release); } } + +#[cfg(test)] +mod tests { + use super::*; + use std::panic::{catch_unwind, AssertUnwindSafe}; + + /// A guard constructed over a `true` flag holds it while in scope and + /// clears it to `false` on a normal scope exit. + #[test] + fn clears_flag_on_normal_drop() { + let flag = AtomicBool::new(true); + { + let _guard = AtomicFlagGuard::new(&flag); + assert!(flag.load(Ordering::Acquire), "flag stays set while held"); + } + assert!(!flag.load(Ordering::Acquire), "flag cleared on drop"); + } + + /// The clear also runs while unwinding a panic — the load-bearing + /// property the sync coordinators lean on so a panicked pass can't + /// leave `is_syncing` latched and wedge `quiesce()`'s drain. 
+ #[test] + fn clears_flag_while_unwinding_panic() { + let flag = AtomicBool::new(true); + let result = catch_unwind(AssertUnwindSafe(|| { + let _guard = AtomicFlagGuard::new(&flag); + panic!("boom while holding the guard"); + })); + assert!(result.is_err(), "the panic propagated out of catch_unwind"); + assert!( + !flag.load(Ordering::Acquire), + "Drop ran during unwinding and cleared the flag" + ); + } +} From 6b2cd39e06ac565a22ff8609da61b2afd14b712b Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:51:18 +0200 Subject: [PATCH 06/29] fix(platform-wallet): make coordinator passes cancellable + converge invariants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEC-001 (the big one): a `shutdown()` quiesce timed out only because a stalled in-flight pass pinned `is_syncing`, so the `while is_syncing` drain never cleared, the quiesce future was dropped *before* the thread join, and the `!Send` coordinator OS thread was left ALIVE — later firing host callbacks through freed memory. Root-cause fix: race the pass body against cancellation inside each coordinator's own loop tokio::select! { biased; _ = cancel.cancelled() => break, _ = this.sync_now(..) => {} } so `stop()`/`quiesce()` cancelling the token drops the stalled `sync_now` future *on the coordinator thread*, which unwinds to its `is_syncing` `AtomicFlagGuard` and clears the flag promptly. The drain then frees and the join lands far inside the timeout — the timeout can no longer strand a live thread. Invariants preserved: the guard is constructed before any `.await` so a cancel-drop always clears `is_syncing`; the completion-event dispatch is the synchronous tail after the last `.await`, so it either runs in full (then clears) or is skipped on cancel — never torn; idempotency and the drain barrier are untouched. The inter-pass sleep was already cancel-raced. 
MEDIUM-4 (RUST-002): RAII-guard `quiescing` in all three `quiesce()` via `AtomicFlagGuard`, dropping the manual `store(false)`. A timed-out quiesce no longer latches the gate `true` and silently bails every future pass. Reopening on drop is safe because `stop()` already cancelled the loop. MEDIUM-3 (SEC-005/CALL-001): give `PlatformAddressSyncManager` the `background_generation` counter its siblings already have — bump it (AcqRel) in `start()` and gate the thread-exit `*background_cancel = None` on `generation == my_gen`, so a stop()+start() reschedule can't have an exiting thread strip the new generation's token. SEC-003: swap the `background_cancel`/`background_join` std-Mutex `.lock().expect("… poisoned")` calls for `.lock().unwrap_or_else(|e| e.into_inner())` across all three coordinators, so one prior panic can't cascade into an abort on the teardown path. Co-Authored-By: Claude Opus 4.6 --- .../src/manager/identity_sync.rs | 39 ++++++++++-- .../src/manager/platform_address_sync.rs | 61 ++++++++++++++++--- .../src/manager/shielded_sync.rs | 39 ++++++++++-- 3 files changed, 120 insertions(+), 19 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 34bf0fefc7..ae2143a574 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -403,7 +403,10 @@ where /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). pub fn start(self: Arc) { - let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned"); + let mut cancel_guard = self + .background_cancel + .lock() + .unwrap_or_else(|e| e.into_inner()); if cancel_guard.is_some() { return; } @@ -422,7 +425,22 @@ where break; } - this.sync_now().await; + // Race the in-flight pass against cancellation. 
+ // `stop()` / `quiesce()` cancel the token; with + // `biased` the cancel arm is polled first, so a + // pass stalled on a hung SDK fetch is dropped at + // its `.await` the instant we cancel. Dropping the + // `sync_now` future unwinds to the `is_syncing` + // `AtomicFlagGuard` it holds, clearing the flag + // promptly — so `quiesce()`'s drain loop frees and + // the join lands well inside `shutdown()`'s + // timeout. A stalled pass can no longer strand a + // live `!Send` thread past `shutdown()`. + tokio::select! { + biased; + _ = cancel.cancelled() => break, + _ = this.sync_now() => {} + } let interval = this.interval(); tokio::select! { @@ -444,7 +462,10 @@ where // Store the join handle while still holding cancel_guard — a // concurrent quiesce() must wait for this lock before calling // stop(), so the handle is always stored before it can be taken. - *self.background_join.lock().expect("bg_join poisoned") = Some(join); + *self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) = Some(join); // cancel_guard drops here, releasing background_cancel. } @@ -460,7 +481,7 @@ where if let Some(token) = self .background_cancel .lock() - .expect("bg_cancel poisoned") + .unwrap_or_else(|e| e.into_inner()) .take() { token.cancel(); @@ -493,15 +514,21 @@ where /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { self.quiescing.store(true, Ordering::Release); + // RAII gate: resets `quiescing` on *every* exit path — a normal + // return, a timed-out `shutdown()` dropping this future, or a + // panic. Without it a quiesce that doesn't run to completion + // leaves the gate latched `true`, silently bailing every future + // pass. Reopening on drop is safe because `stop()` (below) has + // already cancelled the loop, so no new pass can start. 
+ let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); self.stop(); while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(20)).await; } - self.quiescing.store(false, Ordering::Release); let handle = self .background_join .lock() - .expect("bg_join poisoned") + .unwrap_or_else(|e| e.into_inner()) .take(); super::join_coordinator_thread(handle).await } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index ddd58fcb44..28987bd9c5 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -104,6 +104,14 @@ pub struct PlatformAddressSyncManager { /// confirm the `!Send` loop fully exited before the host drops the /// runtime. background_join: StdMutex>>, + /// Monotonically increasing generation counter. Bumped on every + /// `start()` so the exiting thread can tell whether its generation is + /// still the active one before clearing `background_cancel`. Without + /// this guard a tight `stop()` → `start()` reschedule lets the prior + /// thread's cleanup strip the *new* generation's token, leaving the + /// new loop running but untrackable via `is_running()` / `stop()`. + /// Mirrors the identity / shielded coordinators. + background_generation: AtomicU64, interval_secs: AtomicU64, is_syncing: AtomicBool, /// Set by [`quiesce`](Self::quiesce) to gate new passes while it @@ -133,6 +141,7 @@ impl PlatformAddressSyncManager { event_manager, background_cancel: StdMutex::new(None), background_join: StdMutex::new(None), + background_generation: AtomicU64::new(0), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), quiescing: AtomicBool::new(false), @@ -203,12 +212,19 @@ impl PlatformAddressSyncManager { /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). 
pub fn start(self: Arc) { - let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned"); + let mut cancel_guard = self + .background_cancel + .lock() + .unwrap_or_else(|e| e.into_inner()); if cancel_guard.is_some() { return; } let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); + // Bump the generation while we still hold the slot lock so any + // prior thread's cleanup observes `current_gen != my_gen` ordered + // against this token swap. + let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); @@ -221,7 +237,22 @@ impl PlatformAddressSyncManager { break; } - this.sync_now().await; + // Race the in-flight pass against cancellation. + // `stop()` / `quiesce()` cancel the token; with + // `biased` the cancel arm is polled first, so a + // pass stalled on a hung SDK fetch is dropped at + // its `.await` the instant we cancel. Dropping the + // `sync_now` future unwinds to the `is_syncing` + // `AtomicFlagGuard` it holds, clearing the flag + // promptly — so `quiesce()`'s drain loop frees and + // the join lands well inside `shutdown()`'s + // timeout. A stalled pass can no longer strand a + // live `!Send` thread past `shutdown()`. + tokio::select! { + biased; + _ = cancel.cancelled() => break, + _ = this.sync_now() => {} + } let interval = this.interval(); tokio::select! { @@ -230,8 +261,15 @@ impl PlatformAddressSyncManager { } } + // Only clear the slot if no newer start() has + // installed a replacement token since we launched — + // mirrors the identity / shielded coordinators so a + // stop() → start() reschedule can't have this exiting + // thread strip the new generation's cancel token. 
if let Ok(mut guard) = this.background_cancel.lock() { - *guard = None; + if this.background_generation.load(Ordering::Acquire) == my_gen { + *guard = None; + } } }); }) @@ -239,7 +277,10 @@ impl PlatformAddressSyncManager { // Store the join handle while still holding cancel_guard — a // concurrent quiesce() must wait for this lock before calling // stop(), so the handle is always stored before it can be taken. - *self.background_join.lock().expect("bg_join poisoned") = Some(join); + *self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) = Some(join); // cancel_guard drops here, releasing background_cancel. } @@ -256,7 +297,7 @@ impl PlatformAddressSyncManager { if let Some(token) = self .background_cancel .lock() - .expect("bg_cancel poisoned") + .unwrap_or_else(|e| e.into_inner()) .take() { token.cancel(); @@ -290,15 +331,21 @@ impl PlatformAddressSyncManager { /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { self.quiescing.store(true, Ordering::Release); + // RAII gate: resets `quiescing` on *every* exit path — a normal + // return, a timed-out `shutdown()` dropping this future, or a + // panic. Without it a quiesce that doesn't run to completion + // leaves the gate latched `true`, silently bailing every future + // pass. Reopening on drop is safe because `stop()` (below) has + // already cancelled the loop, so no new pass can start. 
+ let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); self.stop(); while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(20)).await; } - self.quiescing.store(false, Ordering::Release); let handle = self .background_join .lock() - .expect("bg_join poisoned") + .unwrap_or_else(|e| e.into_inner()) .take(); super::join_coordinator_thread(handle).await } diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 502d1ae733..accaca69d0 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -230,7 +230,10 @@ impl ShieldedSyncManager { /// GRPC client state isn't `Send + Sync`). Same trade-off as /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start). pub fn start(self: Arc) { - let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned"); + let mut cancel_guard = self + .background_cancel + .lock() + .unwrap_or_else(|e| e.into_inner()); if cancel_guard.is_some() { return; } @@ -259,7 +262,22 @@ impl ShieldedSyncManager { // chunk every interval. User-initiated // syncs pass `force=true` to the FFI // entry point below and bypass this. - this.sync_now(false).await; + // + // Race the pass against cancellation. `stop()` / + // `quiesce()` cancel the token; with `biased` the + // cancel arm is polled first, so a pass stalled on + // a hung SDK fetch is dropped at its `.await` the + // instant we cancel. Dropping the `sync_now` future + // unwinds to the `is_syncing` `AtomicFlagGuard` it + // holds, clearing the flag promptly — so the drain + // loop in `quiesce()` frees and the join lands well + // inside `shutdown()`'s timeout. A stalled pass can + // no longer strand a live `!Send` thread. + tokio::select! 
{ + biased; + _ = cancel.cancelled() => break, + _ = this.sync_now(false) => {} + } let interval = this.interval(); tokio::select! { @@ -285,7 +303,10 @@ impl ShieldedSyncManager { // Store the join handle while still holding cancel_guard — a // concurrent quiesce() must wait for this lock before calling // stop(), so the handle is always stored before it can be taken. - *self.background_join.lock().expect("bg_join poisoned") = Some(join); + *self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) = Some(join); // cancel_guard drops here, releasing background_cancel. } @@ -301,7 +322,7 @@ impl ShieldedSyncManager { if let Some(token) = self .background_cancel .lock() - .expect("bg_cancel poisoned") + .unwrap_or_else(|e| e.into_inner()) .take() { token.cancel(); @@ -333,15 +354,21 @@ impl ShieldedSyncManager { /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { self.quiescing.store(true, Ordering::Release); + // RAII gate: resets `quiescing` on *every* exit path — a normal + // return, a timed-out `shutdown()` / Clear dropping this future, + // or a panic. Without it a quiesce that doesn't run to completion + // leaves the gate latched `true`, silently bailing every future + // pass. Reopening on drop is safe because `stop()` (below) has + // already cancelled the loop, so no new pass can start. 
+ let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); self.stop(); while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(20)).await; } - self.quiescing.store(false, Ordering::Release); let handle = self .background_join .lock() - .expect("bg_join poisoned") + .unwrap_or_else(|e| e.into_inner()) .take(); super::join_coordinator_thread(handle).await } From 13a22dd7ca65a885d1eb1d0fa38acd5b91684920 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:54:38 +0200 Subject: [PATCH 07/29] fix(platform-wallet): bound clear_shielded + tidy shutdown docs/logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEC-002: `clear_shielded()` now wraps its `quiesce()` in the same `SHUTDOWN_JOIN_TIMEOUT_SECS` backstop `shutdown()` uses, so a stalled in-flight pass can't hang Clear forever. The const is now `pub` (and re-exported from the crate root) so the FFI shielded-stop bridge can reuse it; its doc + the `shutdown()` doc now describe it as a backstop and note that cancellation is what makes the drain prompt. SEC-004: bind the event-adapter join handle to a local before the join `.await`, so the `tokio::Mutex` guard (previously a match-scrutinee temporary) isn't held across the up-to-30s join. PROJ-004: drop the lone `tracing::warn!` for the adapter join error inside `shutdown()` — the returned status already carries it and the FFI `destroy` adapter logs the aggregate once, so all four workers are now uniform. RUST-004: rewrite the `shutdown()` `assert!` message (and the matching docs) to name the real constraint — the coordinator OS threads each run `Handle::block_on` and need the multi-thread runtime's timer/IO driver — instead of blaming `spawn_blocking`, which works fine on current_thread. PROJ-006: fix the `all_clean()` rustdoc (Stopped/Timeout/Error also make it false, not just panics). 
PROJ-003: drop the dangling ephemeral `(F-6)` and `F-2`/`F-3`/`F-7` + `(1)/(2)/(4)/(5)/(6)` markers, replacing with self-describing prose. SEC-003: note the unwind-vs-abort caveat on the `shutdown()` panic-safety guarantee. Co-Authored-By: Claude Opus 4.6 --- packages/rs-platform-wallet/src/lib.rs | 2 +- .../rs-platform-wallet/src/manager/mod.rs | 116 ++++++++++++------ 2 files changed, 80 insertions(+), 38 deletions(-) diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs index 289a71378f..dd12883fc7 100644 --- a/packages/rs-platform-wallet/src/lib.rs +++ b/packages/rs-platform-wallet/src/lib.rs @@ -44,7 +44,7 @@ pub use manager::platform_address_sync::{ PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome, DEFAULT_SYNC_INTERVAL_SECS, }; -pub use manager::PlatformWalletManager; +pub use manager::{PlatformWalletManager, SHUTDOWN_JOIN_TIMEOUT_SECS}; pub use spv::SpvRuntime; pub use wallet::asset_lock::manager::AssetLockManager; pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock}; diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 717ad0a03c..6fa26902f8 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -151,9 +151,11 @@ pub struct CoordinatorExitStatus { } impl CoordinatorExitStatus { - /// `true` when every worker wound down without a panic (each is + /// `true` only when every worker is /// [`Ok`](CoordinatorThreadStatus::Ok) or - /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)). + /// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any + /// `Stopped`, `Panicked`, `Timeout`, or `Error` slot makes it + /// `false`. 
pub fn all_clean(&self) -> bool { self.platform_address_sync.is_clean() && self.identity_sync.is_clean() @@ -173,8 +175,12 @@ impl CoordinatorExitStatus { /// still alive guarantees the `!Send` loop has stopped touching /// `tokio::time` before the host drops the runtime. /// -/// **Requires a multi-thread runtime** — `spawn_blocking` is not -/// available on `current_thread` runtimes and will panic there. +/// **Requires a multi-thread runtime.** Each coordinator's OS thread +/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on) +/// and needs the runtime's timer/IO driver; a `current_thread` runtime +/// can only service one `block_on` at a time, so joining one coordinator +/// while the others (and `shutdown()` itself) are mid-`block_on` would +/// deadlock. `shutdown()` asserts the multi-thread flavor up front. pub(crate) async fn join_coordinator_thread( handle: Option>, ) -> CoordinatorThreadStatus { @@ -186,7 +192,7 @@ pub(crate) async fn join_coordinator_thread( Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)), // spawn_blocking fails only when the runtime shuts down before // the blocking task can run — unreachable in normal operation - // since shutdown() is called while the runtime is alive (F-6). + // since shutdown() is called while the runtime is alive. Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")), } } @@ -203,12 +209,18 @@ fn panic_message(payload: Box) -> String { } } -/// Maximum time (seconds) `shutdown()` waits for one coordinator's -/// quiesce+join to complete. Under normal operation this deadline is -/// never reached (the RAII `is_syncing` guard ensures the drain exits -/// even on panic). On timeout the coordinator slot reports -/// [`CoordinatorThreadStatus::Timeout`]. 
-const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; +/// Maximum time (seconds) the teardown paths — `shutdown()`, +/// `clear_shielded`, and the FFI shielded-stop bridge — wait for one +/// coordinator's quiesce+join to complete. +/// +/// This is a backstop, not the primary stop mechanism. `quiesce()` +/// cancels the loop, which aborts any in-flight pass at its `.await` +/// point (see each coordinator's `start()` select), so the `is_syncing` +/// drain clears promptly and the join normally lands far inside this +/// window. The deadline fires only if a pass's *drop* itself wedges +/// (e.g. a blocking destructor); on timeout the coordinator slot reports +/// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever. +pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; impl PlatformWalletManager

{ /// Create a new PlatformWalletManager. @@ -403,7 +415,17 @@ impl PlatformWalletManager

{ /// must not commit its own persistence wipe in that case. #[cfg(feature = "shielded")] pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> { - self.shielded_sync_manager.quiesce().await; + // Bound the quiesce with the same backstop `shutdown()` uses so a + // stalled in-flight pass can't hang Clear forever — cancellation + // makes the drain prompt; this timeout only matters if a pass's + // drop wedges. The terminal status isn't surfaced on the Clear + // path (the coordinator reset below is what can fail), so the + // timeout result is intentionally discarded. + let _ = tokio::time::timeout( + std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), + self.shielded_sync_manager.quiesce(), + ) + .await; if let Some(coord) = self.shielded_coordinator().await { coord.clear().await?; } @@ -439,23 +461,35 @@ impl PlatformWalletManager

{ /// [`CoordinatorExitStatus`] reports per-thread how each worker ended. /// /// **Precondition: must be called from a multi-thread Tokio runtime.** - /// `quiesce()` uses `spawn_blocking` internally; calling from a - /// `current_thread` runtime will panic (this is a real invariant - /// enforced in both debug and release builds). + /// Each coordinator's OS thread drives its loop via + /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs + /// the runtime's timer/IO driver; a `current_thread` runtime can only + /// service one `block_on` at a time, so the join would deadlock. This + /// is asserted in both debug and release builds. /// /// Each coordinator quiesce+join is bounded by - /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit - /// within that window, its slot reports + /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] as a backstop. `quiesce()` cancels + /// the loop, which aborts any in-flight pass at its `.await` point, so + /// the `is_syncing` drain clears promptly and the join normally lands + /// far inside the window — the deadline fires only if a pass's *drop* + /// itself wedges. On timeout the coordinator slot reports /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever. - /// Under normal operation (no infinite loops, RAII guard clears - /// `is_syncing` even on panic) this timeout is never reached. + /// + /// The clear-on-panic half of that guarantee rides on unwinding, so + /// it holds under `panic = "unwind"`. Under the iOS `panic = "abort"` + /// release profiles a pass panic aborts the process outright (no + /// `Drop`, no status) — there is no live manager left to read a + /// status from. 
pub async fn shutdown(&self) -> CoordinatorExitStatus { assert!( matches!( tokio::runtime::Handle::current().runtime_flavor(), tokio::runtime::RuntimeFlavor::MultiThread ), - "shutdown() requires a multi-thread Tokio runtime (spawn_blocking inside quiesce)" + "shutdown() requires a multi-thread Tokio runtime: each \ + coordinator's OS thread drives its sync loop via \ + Handle::block_on and needs the runtime's timer/IO driver, but \ + a current_thread runtime can only drive one block_on at a time" ); let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS); @@ -484,12 +518,19 @@ impl PlatformWalletManager

{ // stores), so cancel + join it last — after the loops feeding it // are gone. self.event_adapter_cancel.cancel(); - let event_adapter = match self.event_adapter_join.lock().await.take() { + // Take the handle out into a local first so the `tokio::Mutex` + // guard doesn't stay held across the (up-to-30s) join `.await` + // below — a match scrutinee temporary would otherwise keep the + // guard alive for the whole match. + let event_adapter_handle = self.event_adapter_join.lock().await.take(); + let event_adapter = match event_adapter_handle { None => CoordinatorThreadStatus::NotRunning, Some(handle) => match tokio::time::timeout(timeout, handle).await { Ok(Ok(())) => CoordinatorThreadStatus::Ok, + // The returned status already carries this failure, and the + // FFI `destroy` adapter logs the aggregate once at the host + // layer — so don't double-log here. Ok(Err(e)) => { - tracing::warn!(error = ?e, "Wallet event adapter task join error"); if e.is_panic() { CoordinatorThreadStatus::Panicked(panic_message(e.into_panic())) } else { @@ -560,7 +601,8 @@ mod tests { } /// Build a manager that fires a slow (300 ms std::thread::sleep) callback - /// on `on_platform_address_sync_completed`. Used by F-2 drain test. + /// on `on_platform_address_sync_completed`. Used by the in-flight-pass + /// drain test. fn make_manager_with_slow_handler( started: Arc, completed: Arc, @@ -592,10 +634,10 @@ mod tests { Arc::clone(&m.shielded_sync_manager).start(); } - /// (1)+(5)+(6) Happy path: `shutdown()` joins every started worker - /// and reports `Ok`; it completes within a bounded time (no - /// `spawn_blocking` starvation/deadlock); a second `shutdown()` finds - /// nothing left to join (`NotRunning`) — idempotent. + /// Happy path: `shutdown()` joins every started worker and reports + /// `Ok`; it completes within a bounded time (no `spawn_blocking` + /// starvation/deadlock); a second `shutdown()` finds nothing left to + /// join (`NotRunning`) — idempotent. 
#[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() { let manager = make_manager(); @@ -627,7 +669,7 @@ mod tests { assert!(again.all_clean()); } - /// (2) Never-started coordinators report `NotRunning` (no thread to + /// Never-started coordinators report `NotRunning` (no thread to /// join). The event adapter is spawned in `new`, so it still joins /// `Ok`. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -651,7 +693,7 @@ mod tests { assert!(status.all_clean()); } - /// (4) A coordinator thread that panics surfaces as `Panicked` rather + /// A coordinator thread that panics surfaces as `Panicked` rather /// than being silently dropped. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn join_coordinator_thread_surfaces_panic() { @@ -759,9 +801,9 @@ mod tests { ); } - /// F-7: `join_coordinator_thread` uses `spawn_blocking` internally. - /// Verify it completes without deadlock within a bounded time when - /// called from a multi-thread runtime, as `shutdown()` requires. + /// `join_coordinator_thread` uses `spawn_blocking` internally. Verify + /// it completes without deadlock within a bounded time when called + /// from a multi-thread runtime, as `shutdown()` requires. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() { let handle = std::thread::spawn(|| {}); @@ -774,8 +816,8 @@ mod tests { assert_eq!(result, CoordinatorThreadStatus::Ok); } - /// F-2: `shutdown()` must wait for an in-flight sync pass to drain - /// before joining the coordinator thread. + /// `shutdown()` must wait for an in-flight sync pass to drain before + /// joining the coordinator thread. /// /// A slow `on_platform_address_sync_completed` callback (300 ms) /// keeps `is_syncing=true` while it runs. 
We call `shutdown()` while @@ -819,9 +861,9 @@ mod tests { ); } - /// F-3 (strengthened): race regression — start coordinators with a - /// long sleep interval so they spend nearly all their time in a live - /// `tokio::time::sleep`, then `shutdown()` and drop the runtime. + /// Race regression — start coordinators with a long sleep interval so + /// they spend nearly all their time in a live `tokio::time::sleep`, + /// then `shutdown()` and drop the runtime. /// /// With the thread join in `shutdown()` every coordinator has fully /// exited its `block_on` before `drop(runtime)` — no race possible. From 93b89546ed7fac0964d7ae6e7dd3fa12b931b944 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:56:01 +0200 Subject: [PATCH 08/29] fix(platform-wallet-ffi): timeout-bound the shielded sync stop bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEC-002: `platform_wallet_manager_shielded_sync_stop` blocked on a bare `quiesce()`, so a stalled in-flight pass could hang the host's stop call forever. Wrap the quiesce in `tokio::time::timeout` reusing the library's `SHUTDOWN_JOIN_TIMEOUT_SECS` backstop — same guarantee as `shutdown()`. Cancellation makes the drain prompt; the timeout only matters if a pass's drop wedges. The C signature is unchanged and the result is still discarded (`ok` as before) — we only need the call not to hang. Add `tokio/time` to the crate's direct features rather than leaning on `platform-wallet` pulling it in transitively (the crate now calls `tokio::time::timeout` directly). 
Co-Authored-By: Claude Opus 4.6 --- packages/rs-platform-wallet-ffi/Cargo.toml | 2 +- .../rs-platform-wallet-ffi/src/shielded_sync.rs | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/packages/rs-platform-wallet-ffi/Cargo.toml b/packages/rs-platform-wallet-ffi/Cargo.toml index 8a2bd4ef2b..7e60b05d69 100644 --- a/packages/rs-platform-wallet-ffi/Cargo.toml +++ b/packages/rs-platform-wallet-ffi/Cargo.toml @@ -22,7 +22,7 @@ rs-sdk-ffi = { path = "../rs-sdk-ffi" } once_cell = "1.19" parking_lot = { version = "0.12", features = ["send_guard"] } lazy_static = "1.4" -tokio = { version = "1", features = ["rt-multi-thread"] } +tokio = { version = "1", features = ["rt-multi-thread", "time"] } tokio-metrics = { workspace = true, optional = true } # Core dependencies (for Network type) diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs index 2d58d8165f..da285e422e 100644 --- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs +++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs @@ -88,7 +88,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop( handle: Handle, ) -> PlatformWalletFFIResult { let option = PLATFORM_WALLET_MANAGER_STORAGE.with_item(handle, |manager| { - runtime().block_on(manager.shielded_sync().quiesce()); + runtime().block_on(async { + // Bound the quiesce with the same backstop `shutdown()` uses so + // a stalled in-flight pass can't hang the host's stop call + // forever. Cancellation makes the drain prompt; this only + // matters if a pass's drop wedges. The terminal status is + // discarded — the C ABI exposes none of it, we only need the + // drain not to wedge. 
+ let _ = tokio::time::timeout( + Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS), + manager.shielded_sync().quiesce(), + ) + .await; + }); }); unwrap_option_or_return!(option); PlatformWalletFFIResult::ok() From 2bd9501a0edde17c2c1bc3c6d8f6844eca46a973 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:13:38 +0200 Subject: [PATCH 09/29] fix(platform-wallet)!: close residual coordinator-thread UAF on shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the spawn_blocking-based join in join_coordinator_thread with an is_finished() poll loop that awaits a 5ms sleep each step. spawn_blocking tasks cannot be cancelled once started, so the prior approach left the blocking join alive past the tokio::time::timeout wrapping quiesce() — defeating the timeout boundary. Polling yields at each .await so the external timeout is truly binding (threads are confirmed-exited or the caller times out). Each coordinator's start() now drains any handle left by a prior stop() (is_finished spin-wait, 1s bound) before overwriting background_join, so a stop()->start() reschedule can no longer detach a live, untracked thread that shutdown() would miss. FFI platform_wallet_manager_destroy now returns the new ErrorShutdownIncomplete (19) when shutdown is not all-clean, signalling the host must not immediately free the callback context — a lingering coordinator may still fire one final callback. The C ABI is unchanged (additive enum variant + degraded-path return code). Tests: deterministic Stopped path via spawn(pending).abort() -> asserts Stopped(_) and !is_clean(); race test uses per-iteration catch_unwind instead of a process-global panic hook. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-platform-wallet-ffi/src/error.rs | 9 ++ .../rs-platform-wallet-ffi/src/manager.rs | 13 +- .../src/manager/identity_sync.rs | 32 ++++ .../rs-platform-wallet/src/manager/mod.rs | 142 ++++++++++++------ .../src/manager/platform_address_sync.rs | 32 ++++ .../src/manager/shielded_sync.rs | 32 ++++ 6 files changed, 211 insertions(+), 49 deletions(-) diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs index de1a6cb944..b50b5d79c5 100644 --- a/packages/rs-platform-wallet-ffi/src/error.rs +++ b/packages/rs-platform-wallet-ffi/src/error.rs @@ -125,6 +125,15 @@ pub enum PlatformWalletFFIResultCode { /// and could double-send if the original spend landed. ErrorShieldedSpendUnconfirmed = 18, + /// One or more background coordinator threads did not exit cleanly before + /// the 30 s join deadline. The host **must not** free the callback context + /// immediately — a lingering thread may still hold a reference to it and + /// fire one final callback. Either keep the context alive for a further + /// grace period, or accept the potential (but statistically tiny) race. + /// This is distinct from a normal operation error; the manager IS torn + /// down; the host should not retry `destroy`. 
+ ErrorShutdownIncomplete = 19, + NotFound = 98, // Used exclusively for all the Option that are retuned as errors ErrorUnknown = 99, } diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs index d09d98a1e8..986103ab47 100644 --- a/packages/rs-platform-wallet-ffi/src/manager.rs +++ b/packages/rs-platform-wallet-ffi/src/manager.rs @@ -367,7 +367,18 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy( if !status.all_clean() { tracing::warn!( ?status, - "platform wallet coordinator(s) did not exit cleanly" + "platform wallet coordinator(s) did not exit cleanly; \ + host must not free the callback context immediately" + ); + // Return a distinct non-ok code so the host can delay freeing + // its callback context. A lingering coordinator thread (e.g. one + // that timed out) still holds an Arc to the event handler and may + // fire one final callback through the host-owned context pointer; + // returning ok() here would signal that the context is safe to + // free when it may not be yet. + return PlatformWalletFFIResult::err( + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + format!("coordinator(s) did not exit cleanly: {status:?}"), ); } else { tracing::debug!(?status, "platform wallet coordinators joined cleanly"); diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index ae2143a574..6e87261e0a 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -410,6 +410,38 @@ where if cancel_guard.is_some() { return; } + + // Drain any handle left by a prior stop() call. stop() takes-and-cancels + // the token but never touches background_join, so a stop()→start() + // sequence would otherwise overwrite (detach) the old handle — + // shutdown() would then miss that thread and join() only the new one. 
+ // The old thread was already cancellation-signalled, so is_finished() + // becomes true within a few milliseconds; we spin-wait to guarantee + // no detached thread can fire callbacks after destroy() returns. + { + let prior = self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); + if let Some(h) = prior { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); + while !h.is_finished() { + if std::time::Instant::now() >= deadline { + tracing::warn!( + "identity-sync prior thread did not finish within 1 s \ + after cancellation; detaching to unblock start()" + ); + break; // Drop h — detaches; thread was already cancelled. + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + if h.is_finished() { + let _ = h.join(); // Reap resources; near-instant since finished. + } + } + } + let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 6fa26902f8..a9569dd00e 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -168,13 +168,20 @@ impl CoordinatorExitStatus { /// /// Called from each coordinator's `quiesce()` after cancelling the /// loop and draining any in-flight pass, so the thread is already on -/// its way out and the join is near-instant. The blocking -/// [`JoinHandle::join`](std::thread::JoinHandle::join) runs on the -/// blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking)) -/// so the async executor stays unblocked. Joining while the runtime is -/// still alive guarantees the `!Send` loop has stopped touching +/// its way out and the join is near-instant. Joining while the runtime +/// is still alive guarantees the `!Send` loop has stopped touching /// `tokio::time` before the host drops the runtime. 
/// +/// **Polling approach**: we poll [`JoinHandle::is_finished`] in 5 ms +/// steps rather than wrapping `handle.join()` in +/// [`spawn_blocking`](tokio::task::spawn_blocking). The +/// `spawn_blocking` approach spawns a blocking-pool task that cannot be +/// cancelled once started — so dropping the timeout future that wraps +/// `quiesce()` would leave the blocking task alive and `handle.join()` +/// still running, defeating the timeout boundary. Polling lets the +/// executor yield at each `.await` step so `tokio::time::timeout` +/// wrapping `quiesce()` can truly interrupt this call. +/// /// **Requires a multi-thread runtime.** Each coordinator's OS thread /// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on) /// and needs the runtime's timer/IO driver; a `current_thread` runtime @@ -187,13 +194,20 @@ pub(crate) async fn join_coordinator_thread( let Some(handle) = handle else { return CoordinatorThreadStatus::NotRunning; }; - match tokio::task::spawn_blocking(move || handle.join()).await { - Ok(Ok(())) => CoordinatorThreadStatus::Ok, - Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)), - // spawn_blocking fails only when the runtime shuts down before - // the blocking task can run — unreachable in normal operation - // since shutdown() is called while the runtime is alive. - Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")), + // Poll until the thread exits. The coordinator was already cancelled + // (stop() fires before quiesce() calls us), so is_finished() becomes + // true nearly immediately — typically within a single 5 ms step. + loop { + if handle.is_finished() { + return match handle.join() { + Ok(()) => CoordinatorThreadStatus::Ok, + Err(payload) => CoordinatorThreadStatus::Panicked(panic_message(payload)), + }; + } + // Yield to the executor so the outer tokio::time::timeout wrapping + // quiesce() can fire if the deadline has passed. 
Without this yield + // the loop would busy-spin and block the task. + tokio::time::sleep(std::time::Duration::from_millis(5)).await; } } @@ -711,31 +725,44 @@ mod tests { /// tokio task is cancelled or aborted rather than completing normally. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() { - // Build a manager but immediately abort the event adapter task so - // we trigger the non-panic JoinError path in shutdown(). + // Replace the real adapter handle with a guaranteed-pending task, then + // abort it. A `pending::<()>()` future can never complete on its own, + // so abort() always produces a non-panic JoinError — deterministically + // exercising the Stopped branch regardless of scheduler timing. + // (The original approach aborted the real adapter handle, which could + // race the task's own completion and silently yield `Ok` instead.) let manager = make_manager(); - // Abort the adapter task directly so the join sees a non-panic JoinError. - { + + // Drain and discard the real adapter (may already be finished). + let original = { let mut guard = manager.event_adapter_join.lock().await; - if let Some(handle) = guard.take() { - handle.abort(); - // Put it back so shutdown() sees it and exercises the error path. - *guard = Some(handle); - } + guard.take() + }; + if let Some(h) = original { + h.abort(); + let _ = h.await; } - // Give tokio a moment to process the abort. - tokio::time::sleep(Duration::from_millis(10)).await; + + // Install a permanently-pending task and abort it so the JoinError + // path in shutdown() is 100 % deterministic. + let pending = tokio::spawn(std::future::pending::<()>()); + pending.abort(); + *manager.event_adapter_join.lock().await = Some(pending); let status = manager.shutdown().await; - // The adapter task was aborted → non-panic JoinError → Stopped. 
- match &status.event_adapter { - CoordinatorThreadStatus::Stopped(_) | CoordinatorThreadStatus::Ok => { - // Stopped is the expected path; Ok means it drained before abort — both - // are acceptable since abort() races the task completion. - } - other => panic!("expected Stopped or Ok (abort race), got {other:?}"), - } - // Regardless, all other workers were never started → clean. + + // The aborted pending task always yields a non-panic JoinError → + // shutdown() maps it to Stopped. + assert!( + matches!(status.event_adapter, CoordinatorThreadStatus::Stopped(_)), + "expected Stopped from a non-panic JoinError, got {:?}", + status.event_adapter + ); + assert!( + !status.event_adapter.is_clean(), + "Stopped must not count as clean" + ); + // Coordinators were never started → their slots are clean. assert_eq!( status.platform_address_sync, CoordinatorThreadStatus::NotRunning @@ -801,18 +828,18 @@ mod tests { ); } - /// `join_coordinator_thread` uses `spawn_blocking` internally. Verify - /// it completes without deadlock within a bounded time when called - /// from a multi-thread runtime, as `shutdown()` requires. + /// `join_coordinator_thread` uses `is_finished()` polling. Verify + /// it completes within a bounded time on a multi-thread runtime, as + /// `shutdown()` requires (and that it doesn't busy-spin indefinitely). 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() { + async fn join_coordinator_thread_completes_within_deadline() { let handle = std::thread::spawn(|| {}); let result = tokio::time::timeout( Duration::from_secs(5), join_coordinator_thread(Some(handle)), ) .await - .expect("join_coordinator_thread must complete within 5 s — no spawn_blocking deadlock"); + .expect("join_coordinator_thread must complete within 5 s"); assert_eq!(result, CoordinatorThreadStatus::Ok); } @@ -871,15 +898,14 @@ mod tests { /// the join, the coordinator's `select!` wakeup (via tokio) would /// race the runtime teardown and reliably trigger the /// "Tokio … being shutdown" panic across the 10 iterations. + /// + /// Uses `std::panic::catch_unwind` around `drop(runtime)` rather than + /// a process-global panic hook; the hook would be live for seconds and + /// could swallow diagnostics from concurrently-running tests (e.g. + /// `join_coordinator_thread_surfaces_panic`). #[test] fn shutdown_then_drop_runtime_does_not_panic() { static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0); - let prev_hook = std::panic::take_hook(); - std::panic::set_hook(Box::new(|info| { - if info.to_string().contains("being shutdown") { - SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst); - } - })); for _ in 0..10 { let runtime = tokio::runtime::Builder::new_multi_thread() @@ -912,7 +938,27 @@ mod tests { manager.shutdown().await }); - drop(runtime); + // Wrap the runtime drop in catch_unwind to intercept the specific + // "A Tokio 1.x context ... being shutdown" panic without installing + // a process-wide hook that would suppress diagnostics from other + // concurrently running tests. 
+ let drop_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + drop(runtime); + })); + if let Err(payload) = drop_result { + let msg = payload + .downcast_ref::<String>() + .map(String::as_str) + .or_else(|| payload.downcast_ref::<&str>().copied()) + .unwrap_or(""); + if msg.contains("being shutdown") { + SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst); + } else { + // Unexpected panic — propagate so the test fails loudly. + std::panic::resume_unwind(payload); + } + } + // Brief settle — any stray thread activity surfaces here. std::thread::sleep(Duration::from_millis(50)); @@ -921,12 +967,12 @@ mod tests { assert!(status.all_clean(), "workers did not wind down: {status:?}"); } - let racing_panics = SHUTDOWN_PANICS.load(AO::SeqCst); - std::panic::set_hook(prev_hook); assert_eq!( - racing_panics, 0, + SHUTDOWN_PANICS.load(AO::SeqCst), + 0, "dropping the runtime after shutdown raced a coordinator thread \ - ({racing_panics} panics across 10 iterations)" + ({} panics across 10 iterations)", + SHUTDOWN_PANICS.load(AO::SeqCst) ); } } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index 28987bd9c5..7e72f2fe74 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -219,6 +219,38 @@ impl PlatformAddressSyncManager { if cancel_guard.is_some() { return; } + + // Drain any handle left by a prior stop() call. stop() takes-and-cancels + // the token but never touches background_join, so a stop()→start() + // sequence would otherwise overwrite (detach) the old handle — + // shutdown() would then miss that thread and join() only the new one. + // The old thread was already cancellation-signalled, so is_finished() + // becomes true within a few milliseconds; we spin-wait to guarantee + // no detached thread can fire callbacks after destroy() returns. 
+ { + let prior = self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); + if let Some(h) = prior { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); + while !h.is_finished() { + if std::time::Instant::now() >= deadline { + tracing::warn!( + "platform-address-sync prior thread did not finish within 1 s \ + after cancellation; detaching to unblock start()" + ); + break; // Drop h — detaches; thread was already cancelled. + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + if h.is_finished() { + let _ = h.join(); // Reap resources; near-instant since finished. + } + } + } + let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); // Bump the generation while we still hold the slot lock so any diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index accaca69d0..365b0be17b 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -237,6 +237,38 @@ impl ShieldedSyncManager { if cancel_guard.is_some() { return; } + + // Drain any handle left by a prior stop() call. stop() takes-and-cancels + // the token but never touches background_join, so a stop()→start() + // sequence would otherwise overwrite (detach) the old handle — + // shutdown() would then miss that thread and join() only the new one. + // The old thread was already cancellation-signalled, so is_finished() + // becomes true within a few milliseconds; we spin-wait to guarantee + // no detached thread can fire callbacks after destroy() returns. 
+ { + let prior = self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); + if let Some(h) = prior { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); + while !h.is_finished() { + if std::time::Instant::now() >= deadline { + tracing::warn!( + "shielded-sync prior thread did not finish within 1 s \ + after cancellation; detaching to unblock start()" + ); + break; // Drop h — detaches; thread was already cancelled. + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + if h.is_finished() { + let _ = h.join(); // Reap resources; near-instant since finished. + } + } + } + let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); // Bump the generation while we still hold the slot lock so From 7c975ed5a632c5de60eb32761748dc93f8a35416 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:18:21 +0200 Subject: [PATCH 10/29] fix(platform-wallet)!: surface non-clean shielded drain on clear/stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the destroy UAF-surfacing discipline (which already returns ErrorShutdownIncomplete=19 on a non-clean shutdown) to the shielded clear/stop paths, so a partial/timed-out coordinator drain can no longer be silently swallowed. - clear_shielded now captures the quiesce result instead of discarding it: on a timed-out or non-clean drain it returns the new typed PlatformWalletError::ShieldedShutdownIncomplete (carrying the terminal CoordinatorThreadStatus) and leaves the commitment-tree store INTACT, rather than unconditionally wiping a store an in-flight pass may still write into. The store is wiped only on a clean drain. - FFI shielded_sync_stop now returns ErrorShutdownIncomplete (with the status rendered into the message) on a non-clean/timed-out drain, instead of always returning ok() — symmetric with destroy. 
A timeout is reported as the Timeout status. - FFI shielded_clear maps the new ShieldedShutdownIncomplete variant to ErrorShutdownIncomplete (store-reset failures still map to ErrorWalletOperation); the blanket From<PlatformWalletError> gains the same arm, pinned by a unit test. - Swift mirror gains errorShutdownIncomplete=19 plus a richer PlatformWalletError.shutdownIncomplete case, wired through both the init(ffi:) and init(result:) switches. - Re-export CoordinatorThreadStatus / CoordinatorExitStatus from the crate root so the FFI can name the status type. BREAKING CHANGE: clear_shielded / shielded_sync_stop / shielded_clear now report a non-clean coordinator drain instead of succeeding silently; hosts must defer freeing their callback context and must not commit their own persistence wipe on ErrorShutdownIncomplete. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-platform-wallet-ffi/src/error.rs | 31 +++++++++ .../src/shielded_sync.rs | 66 +++++++++++++++---- packages/rs-platform-wallet/src/error.rs | 21 ++++++ packages/rs-platform-wallet/src/lib.rs | 5 +- .../rs-platform-wallet/src/manager/mod.rs | 32 +++++++-- .../PlatformWallet/PlatformWalletResult.swift | 16 +++++ 6 files changed, 149 insertions(+), 22 deletions(-) diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs index b50b5d79c5..5769ffcc43 100644 --- a/packages/rs-platform-wallet-ffi/src/error.rs +++ b/packages/rs-platform-wallet-ffi/src/error.rs @@ -246,6 +246,14 @@ impl From<PlatformWalletError> for PlatformWalletFFIResult { PlatformWalletError::ShieldedSpendUnconfirmed { .. 
} => { PlatformWalletFFIResultCode::ErrorShieldedSpendUnconfirmed } + // A Clear that refused because the in-flight shielded pass didn't + // drain cleanly: surface it as ErrorShutdownIncomplete (symmetric + // with `platform_wallet_manager_destroy`) so the host defers + // freeing its callback context AND does not commit its own + // persistence wipe — the store was intentionally left intact. + PlatformWalletError::ShieldedShutdownIncomplete { .. } => { + PlatformWalletFFIResultCode::ErrorShutdownIncomplete + } _ => PlatformWalletFFIResultCode::ErrorUnknown, }; PlatformWalletFFIResult::err(code, error.to_string()) @@ -604,6 +612,29 @@ mod tests { assert_eq!(msg, rendered, "Display payload must survive verbatim"); } + /// A Clear that refused on a non-clean shielded drain must surface as + /// `ErrorShutdownIncomplete` (symmetric with `destroy`), not flatten to + /// `ErrorUnknown`, so the host knows to defer freeing its callback + /// context and to NOT commit its own persistence wipe. The typed Display + /// rendering (carrying the terminal coordinator status) survives verbatim. + #[test] + fn shielded_shutdown_incomplete_maps_to_dedicated_code() { + let err = PlatformWalletError::ShieldedShutdownIncomplete { + status: platform_wallet::CoordinatorThreadStatus::Timeout, + }; + let rendered = err.to_string(); + let result: PlatformWalletFFIResult = err.into(); + assert_eq!( + result.code, + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + "ShieldedShutdownIncomplete should map to ErrorShutdownIncomplete (rendered: {rendered})" + ); + let msg = unsafe { std::ffi::CStr::from_ptr(result.message) } + .to_string_lossy() + .into_owned(); + assert_eq!(msg, rendered, "Display payload must survive verbatim"); + } + /// Other wallet-error variants without a dedicated FFI arm still /// fall through to `ErrorUnknown` while carrying the typed /// Display rendering as the message. 
Pin this so the catch-all diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs index da285e422e..14082628e4 100644 --- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs +++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs @@ -68,12 +68,20 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_start( /// Stop the shielded sync manager and wait for any in-flight pass to /// drain before returning. No-op if not running. /// -/// Uses `quiesce` rather than cancel-only stop, so on return: the loop -/// is cancelled, no new pass will start, and any in-flight pass has +/// Uses `quiesce` rather than cancel-only stop, so on a clean return: the +/// loop is cancelled, no new pass will start, and any in-flight pass has /// fully drained — its **persistence callbacks have completed** (no /// note/sync-state row can be written after this returns) and its /// completion-event *dispatch* on the Rust side has run. /// +/// Returns `ErrorShutdownIncomplete` instead of `Success` when that drain +/// did **not** complete cleanly (the in-flight pass timed out on the join +/// backstop, or the loop ended non-cleanly). The terminal coordinator +/// status is rendered into the result message. On this code the host must +/// **not** free the callback context immediately — a lingering pass may +/// still fire one final callback through it (symmetric with +/// `platform_wallet_manager_destroy`). +/// /// Caveat on host-observed events: a host that marshals the completion /// callback onto its own executor (e.g. the Swift trampoline hops it to /// the `@MainActor`) may still observe that final, already-dispatched @@ -92,17 +100,36 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop( // Bound the quiesce with the same backstop `shutdown()` uses so // a stalled in-flight pass can't hang the host's stop call // forever. 
Cancellation makes the drain prompt; this only - // matters if a pass's drop wedges. The terminal status is - // discarded — the C ABI exposes none of it, we only need the - // drain not to wedge. - let _ = tokio::time::timeout( + // matters if a pass's drop wedges. A timeout (the future was + // dropped at the deadline) is reported as the non-clean + // `Timeout` status, matching `shutdown()`'s backstop + // substitution, so the host learns the drain may be incomplete. + match tokio::time::timeout( Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS), manager.shielded_sync().quiesce(), ) - .await; - }); + .await + { + Ok(status) => status, + Err(_elapsed) => platform_wallet::CoordinatorThreadStatus::Timeout, + } + }) }); - unwrap_option_or_return!(option); + let status = unwrap_option_or_return!(option); + // Symmetric with `platform_wallet_manager_destroy`: a non-clean drain + // means the shielded loop may still hold a reference to the host-owned + // event-handler / persister context and could fire one final callback, + // so signal the host to defer freeing that context rather than returning + // ok() and inviting a use-after-free. + if !status.is_clean() { + return PlatformWalletFFIResult::err( + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + format!( + "shielded sync stop did not drain cleanly ({status:?}); \ + host must not free the callback context immediately" + ), + ); + } PlatformWalletFFIResult::ok() } @@ -429,7 +456,9 @@ pub unsafe extern "C" fn platform_wallet_manager_configure_shielded( /// via the changeset path. /// /// Returns `ErrorWalletOperation` if the Rust-side store reset -/// fails. The host **must** check this before wiping its own +/// fails, or `ErrorShutdownIncomplete` if the in-flight sync pass +/// did not drain cleanly first (in which case the store is left +/// intact). 
The host **must** check this before wiping its own /// persistence: a silent failure would leave the shared tree /// populated while the host drops its rows, and the next cold /// resync would gate-skip every re-downloaded position against the @@ -455,10 +484,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_clear( }); let result = unwrap_option_or_return!(option); if let Err(e) = result { - return PlatformWalletFFIResult::err( - PlatformWalletFFIResultCode::ErrorWalletOperation, - format!("clear_shielded failed: {e}"), - ); + // A non-clean / timed-out quiesce aborts the clear *before* the store + // is touched: surface it as ErrorShutdownIncomplete (symmetric with + // destroy / shielded_sync_stop) so the host defers freeing its + // callback context and does NOT commit its own persistence wipe — the + // store was intentionally left intact. Every other clear failure is a + // store-reset error → ErrorWalletOperation, as before. + let code = match &e { + platform_wallet::PlatformWalletError::ShieldedShutdownIncomplete { .. } => { + PlatformWalletFFIResultCode::ErrorShutdownIncomplete + } + _ => PlatformWalletFFIResultCode::ErrorWalletOperation, + }; + return PlatformWalletFFIResult::err(code, format!("clear_shielded failed: {e}")); } PlatformWalletFFIResult::ok() } diff --git a/packages/rs-platform-wallet/src/error.rs b/packages/rs-platform-wallet/src/error.rs index c94cb7093d..196d2ee5b4 100644 --- a/packages/rs-platform-wallet/src/error.rs +++ b/packages/rs-platform-wallet/src/error.rs @@ -239,6 +239,27 @@ pub enum PlatformWalletError { #[error("Shielded sub-wallet not bound: call bind_shielded first")] ShieldedNotBound, + + /// A Clear/wipe could not safely complete because the shielded sync + /// coordinator's in-flight pass did not drain cleanly first — it either + /// timed out on the join backstop or its loop ended non-cleanly + /// (cancelled / panicked). 
The shared commitment-tree store is therefore + /// **left intact** (not wiped): a still-running pass could re-persist + /// notes into the store immediately after a `clear()`, desyncing the + /// host's wiped rows from a repopulated tree and gate-skipping every + /// re-downloaded position on the next cold resync. The host **must not** + /// commit its own persistence wipe; retry Clear once the pass settles. + /// Carries the terminal [`CoordinatorThreadStatus`] for diagnostics. + /// + /// [`CoordinatorThreadStatus`]: crate::manager::CoordinatorThreadStatus + #[error( + "shielded clear aborted: sync coordinator did not drain cleanly \ + ({status:?}); commitment-tree store left intact so an in-flight pass \ + cannot re-persist into a wiped store — retry once the pass settles" + )] + ShieldedShutdownIncomplete { + status: crate::manager::CoordinatorThreadStatus, + }, } /// Check whether an SDK error indicates that an InstantSend lock proof was diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs index dd12883fc7..8b55948aa1 100644 --- a/packages/rs-platform-wallet/src/lib.rs +++ b/packages/rs-platform-wallet/src/lib.rs @@ -44,7 +44,10 @@ pub use manager::platform_address_sync::{ PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome, DEFAULT_SYNC_INTERVAL_SECS, }; -pub use manager::{PlatformWalletManager, SHUTDOWN_JOIN_TIMEOUT_SECS}; +pub use manager::{ + CoordinatorExitStatus, CoordinatorThreadStatus, PlatformWalletManager, + SHUTDOWN_JOIN_TIMEOUT_SECS, +}; pub use spv::SpvRuntime; pub use wallet::asset_lock::manager::AssetLockManager; pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock}; diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index a9569dd00e..2de6ad6d5a 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -425,21 +425,39 @@ impl PlatformWalletManager

{ /// disk but its contents are reset to empty so the next bind cold- /// resyncs from index 0. /// - /// Returns an error if the coordinator's store reset fails; the host - /// must not commit its own persistence wipe in that case. + /// Returns an error — and leaves the store untouched — in two cases, so + /// the host knows **not** to commit its own persistence wipe: + /// - the in-flight sync pass did not drain cleanly (timed out on the join + /// backstop, or its loop ended non-cleanly) → + /// [`crate::error::PlatformWalletError::ShieldedShutdownIncomplete`]; or + /// - the coordinator's store reset itself fails. #[cfg(feature = "shielded")] pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> { // Bound the quiesce with the same backstop `shutdown()` uses so a // stalled in-flight pass can't hang Clear forever — cancellation // makes the drain prompt; this timeout only matters if a pass's - // drop wedges. The terminal status isn't surfaced on the Clear - // path (the coordinator reset below is what can fail), so the - // timeout result is intentionally discarded. - let _ = tokio::time::timeout( + // drop wedges. Unlike `shutdown()`, the terminal status is + // load-bearing HERE: a non-clean drain means the in-flight pass may + // still be running and could re-persist notes into the very store + // the `clear()` below is about to wipe. A timeout (the future was + // dropped at the deadline) is treated as the non-clean `Timeout` + // status, matching `shutdown()`'s backstop substitution. + let status = match tokio::time::timeout( std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), self.shielded_sync_manager.quiesce(), ) - .await; + .await + { + Ok(status) => status, + Err(_elapsed) => CoordinatorThreadStatus::Timeout, + }; + // Only commit the store wipe once the in-flight pass has fully + // drained. 
Otherwise refuse: a partial/timed-out drain could let a + // surviving pass write into a store we just cleared, desyncing the + // host's own wipe from a repopulated tree. + if !status.is_clean() { + return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status }); + } if let Some(coord) = self.shielded_coordinator().await { coord.clear().await?; } diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift index 2c311f91e9..31ef07ad4a 100644 --- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift +++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift @@ -39,6 +39,12 @@ public enum PlatformWalletResultCode: Int32, Sendable { /// outcome. Do NOT auto-retry — a retry would rebuild the bundle and /// could double-execute if the original landed. case errorShieldedSpendUnconfirmed = 18 + /// A destroy/stop/clear completed but a background coordinator did not + /// exit cleanly (timed out or ended non-cleanly). The host should defer + /// freeing its callback context — a lingering coordinator may still fire + /// one final callback through it — and, on the clear path, must NOT + /// commit its own persistence wipe (the Rust store was left intact). 
+ case errorShutdownIncomplete = 19 case notFound = 98 case errorUnknown = 99 @@ -82,6 +88,8 @@ public enum PlatformWalletResultCode: Int32, Sendable { self = .errorShieldedBroadcastUnconfirmed case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHIELDED_SPEND_UNCONFIRMED: self = .errorShieldedSpendUnconfirmed + case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHUTDOWN_INCOMPLETE: + self = .errorShutdownIncomplete case PLATFORM_WALLET_FFI_RESULT_CODE_NOT_FOUND: self = .notFound case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_UNKNOWN: @@ -177,6 +185,12 @@ public enum PlatformWalletError: LocalizedError { /// notes reserved wallet-side (a shield reserves nothing) until the /// next sync reconciles the outcome. Do NOT auto-retry. case shieldedSpendUnconfirmed(String) + /// A destroy / stop / clear completed but a background coordinator did + /// not exit cleanly. The host should defer freeing its callback context + /// (a lingering coordinator may still fire one final callback) and, on + /// the clear path, must NOT commit its own persistence wipe — the Rust + /// store was left intact so it can be retried once the pass settles. 
+ case shutdownIncomplete(String) case notFound(String) case unknown(String) @@ -192,6 +206,7 @@ public enum PlatformWalletError: LocalizedError { .arithmeticOverflow(let m), .noSelectableInputs(let m), .walletAlreadyExists(let m), .shieldedBroadcastFailed(let m), .shieldedBroadcastUnconfirmed(let m), .shieldedSpendUnconfirmed(let m), + .shutdownIncomplete(let m), .notFound(let m), .unknown(let m): return m } @@ -222,6 +237,7 @@ public enum PlatformWalletError: LocalizedError { case .errorShieldedBroadcastFailed: self = .shieldedBroadcastFailed(detail) case .errorShieldedBroadcastUnconfirmed: self = .shieldedBroadcastUnconfirmed(detail) case .errorShieldedSpendUnconfirmed: self = .shieldedSpendUnconfirmed(detail) + case .errorShutdownIncomplete: self = .shutdownIncomplete(detail) case .notFound: self = .notFound(detail) case .errorUnknown: self = .unknown(detail) } From 5f63c9544c84c44c7a62eeed14c73634a27e45e6 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:30:26 +0200 Subject: [PATCH 11/29] fix(platform-wallet): reap prior coordinator thread outside background_cancel lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All three coordinators (identity_sync, platform_address_sync, shielded_sync) reaped the prior loop's OS thread inside start() WHILE holding background_cancel. But the exiting prior thread's epilogue also locks background_cancel to clear its slot, so a tight stop()→start() deadlocked the reap: the prior thread blocked on the lock start() held, never finished, and the is_finished() spin-wait burned the full 1 s deadline then DETACHED the handle — a 1 s stall plus a transient untracked thread, on the exact stop()→start() path the reap exists for. Reorder start() to install the new cancel token + bump the generation under the lock, then drop(cancel_guard) to release background_cancel, and only THEN run the spin-wait + join. 
The prior thread's epilogue now acquires the lock (or, for shielded, observes the bumped generation), skips clearing the freshly-installed token, and returns, so is_finished() trips in milliseconds and the join is near-instant. start() stays synchronous; the 1 s deadline remains only as a genuine-wedge backstop. Adds restart_after_stop_reaps_prior_thread regression tests to the identity and platform-address coordinators: start → (stop+start back-to-back) → assert the restart returns well under the 1 s deadline. Verified non-vacuous — against the old lock-held ordering it stalls ~1.0 s and fails. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/manager/identity_sync.rs | 132 ++++++++++++++---- .../src/manager/platform_address_sync.rs | 132 ++++++++++++++---- .../src/manager/shielded_sync.rs | 78 +++++++---- 3 files changed, 249 insertions(+), 93 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 6e87261e0a..9cc14ac831 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -411,36 +411,22 @@ where return; } - // Drain any handle left by a prior stop() call. stop() takes-and-cancels - // the token but never touches background_join, so a stop()→start() - // sequence would otherwise overwrite (detach) the old handle — - // shutdown() would then miss that thread and join() only the new one. - // The old thread was already cancellation-signalled, so is_finished() - // becomes true within a few milliseconds; we spin-wait to guarantee - // no detached thread can fire callbacks after destroy() returns. 
- { - let prior = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - if let Some(h) = prior { - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); - while !h.is_finished() { - if std::time::Instant::now() >= deadline { - tracing::warn!( - "identity-sync prior thread did not finish within 1 s \ - after cancellation; detaching to unblock start()" - ); - break; // Drop h — detaches; thread was already cancelled. - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - if h.is_finished() { - let _ = h.join(); // Reap resources; near-instant since finished. - } - } - } + // Take any handle left by a prior stop() call so we can reap it — but + // DON'T join it here, while we still hold background_cancel. stop() + // takes-and-cancels the token but never touches background_join, so a + // stop()→start() sequence would otherwise overwrite (detach) the old + // handle and shutdown() would miss that thread. Joining it under + // background_cancel would DEADLOCK the reap into its 1 s backstop: the + // exiting prior thread's epilogue also locks background_cancel (to + // clear its slot), so it would block on the lock we hold → never + // finish → get detached on the exact stop()→start() path the reap + // exists for. We install the new token + bump the generation below, + // release the lock, and only THEN reap (after this fn's tail). + let prior = self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); @@ -498,7 +484,37 @@ where .background_join .lock() .unwrap_or_else(|e| e.into_inner()) = Some(join); - // cancel_guard drops here, releasing background_cancel. + + // Release background_cancel BEFORE reaping the prior thread, so its + // epilogue can acquire the lock, observe the bumped generation, skip + // clearing our freshly-installed token, and return. 
Holding the lock + // across the join below is what would block the prior thread, spin + // the full 1 s deadline, and detach — the very stall this ordering + // removes. + drop(cancel_guard); + + // Now reap the prior thread. It was already cancellation-signalled by + // stop(), and with the lock released its epilogue completes promptly, + // so is_finished() trips within a few milliseconds and the join is + // near-instant. The 1 s deadline survives only as a genuine-wedge + // backstop (e.g. a pass wedged in a Drop that never yields); if it + // fires we detach the already-cancelled thread to unblock start(). + if let Some(h) = prior { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); + while !h.is_finished() { + if std::time::Instant::now() >= deadline { + tracing::warn!( + "identity-sync prior thread did not finish within 1 s \ + after cancellation; detaching to unblock start()" + ); + break; // Drop h — detaches; thread was already cancelled. + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + if h.is_finished() { + let _ = h.join(); // Reap resources; near-instant since finished. + } + } } /// Stop the background sync loop. No-op if not running. @@ -1025,6 +1041,60 @@ mod tests { pass.await.unwrap(); } + /// Regression: a tight `stop()` → `start()` must reap the prior loop's + /// OS thread promptly, NOT stall on the 1 s detach backstop. + /// + /// The prior thread's exit epilogue locks `background_cancel` to + /// conditionally clear its slot. The earlier ordering held + /// `background_cancel` across the prior-handle join inside `start()`, so + /// on a back-to-back `stop()` → `start()` the exiting thread blocked on + /// that lock, never finished, and the reap spin-waited the full second + /// before detaching — a 1 s stall plus a transient untracked thread. 
The + /// fix installs the new token + generation, releases `background_cancel`, + /// and only then reaps, so the prior thread's epilogue runs and the join + /// lands in milliseconds. + /// + /// `stop()` and `start()` run back-to-back in one blocking closure + /// (mirroring the real call site) so `start()` re-acquires the lock + /// microseconds after `stop()` frees it — before the async-woken prior + /// thread can reach its epilogue. Against the old lock-held ordering this + /// reliably stalls ~1 s and fails the bound below. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn restart_after_stop_reaps_prior_thread() { + let mgr = make_manager(); + + // Launch the first loop and let its immediate (no-op, nothing + // registered) pass complete so the thread parks in the interval + // sleep, where cancellation lands cleanly. + Arc::clone(&mgr).start(); + assert!(mgr.is_running()); + tokio::time::sleep(Duration::from_millis(50)).await; + + // Back-to-back cancel-only stop + restart, off the runtime so the + // synchronous reap can't starve a worker. `start()` re-grabs + // background_cancel right after `stop()` frees it. + let restart = Arc::clone(&mgr); + let elapsed = tokio::task::spawn_blocking(move || { + restart.stop(); + let started = std::time::Instant::now(); + Arc::clone(&restart).start(); + started.elapsed() + }) + .await + .unwrap(); + + assert!( + elapsed < Duration::from_millis(500), + "stop()→start() stalled for {elapsed:?}: prior thread was not \ + reaped promptly (background_cancel held across the join?)" + ); + assert!(mgr.is_running(), "restart must leave the new loop tracked"); + + // Wind the new loop down so the test leaves no live !Send thread. + mgr.quiesce().await; + assert!(!mgr.is_running()); + } + /// A `sync_now()` invoked while `quiescing` is set must bail without /// running the pass — in particular, without calling /// `persister.store(...)`. 
This is the gate that prevents a pass diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index 7e72f2fe74..87b6595e53 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -220,36 +220,22 @@ impl PlatformAddressSyncManager { return; } - // Drain any handle left by a prior stop() call. stop() takes-and-cancels - // the token but never touches background_join, so a stop()→start() - // sequence would otherwise overwrite (detach) the old handle — - // shutdown() would then miss that thread and join() only the new one. - // The old thread was already cancellation-signalled, so is_finished() - // becomes true within a few milliseconds; we spin-wait to guarantee - // no detached thread can fire callbacks after destroy() returns. - { - let prior = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - if let Some(h) = prior { - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); - while !h.is_finished() { - if std::time::Instant::now() >= deadline { - tracing::warn!( - "platform-address-sync prior thread did not finish within 1 s \ - after cancellation; detaching to unblock start()" - ); - break; // Drop h — detaches; thread was already cancelled. - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - if h.is_finished() { - let _ = h.join(); // Reap resources; near-instant since finished. - } - } - } + // Take any handle left by a prior stop() call so we can reap it — but + // DON'T join it here, while we still hold background_cancel. stop() + // takes-and-cancels the token but never touches background_join, so a + // stop()→start() sequence would otherwise overwrite (detach) the old + // handle and shutdown() would miss that thread. 
Joining it under + // background_cancel would DEADLOCK the reap into its 1 s backstop: the + // exiting prior thread's epilogue also locks background_cancel (to + // clear its slot), so it would block on the lock we hold → never + // finish → get detached on the exact stop()→start() path the reap + // exists for. We install the new token + bump the generation below, + // release the lock, and only THEN reap (after this fn's tail). + let prior = self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); @@ -313,7 +299,37 @@ impl PlatformAddressSyncManager { .background_join .lock() .unwrap_or_else(|e| e.into_inner()) = Some(join); - // cancel_guard drops here, releasing background_cancel. + + // Release background_cancel BEFORE reaping the prior thread, so its + // epilogue can acquire the lock, observe the bumped generation, skip + // clearing our freshly-installed token, and return. Holding the lock + // across the join below is what would block the prior thread, spin + // the full 1 s deadline, and detach — the very stall this ordering + // removes. + drop(cancel_guard); + + // Now reap the prior thread. It was already cancellation-signalled by + // stop(), and with the lock released its epilogue completes promptly, + // so is_finished() trips within a few milliseconds and the join is + // near-instant. The 1 s deadline survives only as a genuine-wedge + // backstop (e.g. a pass wedged in a Drop that never yields); if it + // fires we detach the already-cancelled thread to unblock start(). + if let Some(h) = prior { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); + while !h.is_finished() { + if std::time::Instant::now() >= deadline { + tracing::warn!( + "platform-address-sync prior thread did not finish within 1 s \ + after cancellation; detaching to unblock start()" + ); + break; // Drop h — detaches; thread was already cancelled. 
+ } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + if h.is_finished() { + let _ = h.join(); // Reap resources; near-instant since finished. + } + } } /// Stop the background sync loop. No-op if not running. @@ -590,6 +606,60 @@ mod tests { pass.await.unwrap(); } + /// Regression: a tight `stop()` → `start()` must reap the prior loop's + /// OS thread promptly, NOT stall on the 1 s detach backstop. + /// + /// The prior thread's exit epilogue locks `background_cancel` to + /// conditionally clear its slot. The earlier ordering held + /// `background_cancel` across the prior-handle join inside `start()`, so + /// on a back-to-back `stop()` → `start()` the exiting thread blocked on + /// that lock, never finished, and the reap spin-waited the full second + /// before detaching — a 1 s stall plus a transient untracked thread. The + /// fix installs the new token + generation, releases `background_cancel`, + /// and only then reaps, so the prior thread's epilogue runs and the join + /// lands in milliseconds. + /// + /// `stop()` and `start()` run back-to-back in one blocking closure + /// (mirroring the real call site) so `start()` re-acquires the lock + /// microseconds after `stop()` frees it — before the async-woken prior + /// thread can reach its epilogue. Against the old lock-held ordering this + /// reliably stalls ~1 s and fails the bound below. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn restart_after_stop_reaps_prior_thread() { + let (mgr, _counter) = make_manager(); + + // Launch the first loop and let its immediate (no-op, empty wallet + // map) pass complete so the thread parks in the interval sleep, where + // cancellation lands cleanly. + Arc::clone(&mgr).start(); + assert!(mgr.is_running()); + tokio::time::sleep(Duration::from_millis(50)).await; + + // Back-to-back cancel-only stop + restart, off the runtime so the + // synchronous reap can't starve a worker. 
`start()` re-grabs + // background_cancel right after `stop()` frees it. + let restart = Arc::clone(&mgr); + let elapsed = tokio::task::spawn_blocking(move || { + restart.stop(); + let started = std::time::Instant::now(); + Arc::clone(&restart).start(); + started.elapsed() + }) + .await + .unwrap(); + + assert!( + elapsed < Duration::from_millis(500), + "stop()→start() stalled for {elapsed:?}: prior thread was not \ + reaped promptly (background_cancel held across the join?)" + ); + assert!(mgr.is_running(), "restart must leave the new loop tracked"); + + // Wind the new loop down so the test leaves no live !Send thread. + mgr.quiesce().await; + assert!(!mgr.is_running()); + } + /// A `sync_now()` invoked while `quiescing` is set must bail without /// running the pass — in particular, without firing the /// `on_platform_address_sync_completed` host callback. This is the diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 365b0be17b..d0aa75a843 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -238,36 +238,22 @@ impl ShieldedSyncManager { return; } - // Drain any handle left by a prior stop() call. stop() takes-and-cancels - // the token but never touches background_join, so a stop()→start() - // sequence would otherwise overwrite (detach) the old handle — - // shutdown() would then miss that thread and join() only the new one. - // The old thread was already cancellation-signalled, so is_finished() - // becomes true within a few milliseconds; we spin-wait to guarantee - // no detached thread can fire callbacks after destroy() returns. 
- { - let prior = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - if let Some(h) = prior { - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); - while !h.is_finished() { - if std::time::Instant::now() >= deadline { - tracing::warn!( - "shielded-sync prior thread did not finish within 1 s \ - after cancellation; detaching to unblock start()" - ); - break; // Drop h — detaches; thread was already cancelled. - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - if h.is_finished() { - let _ = h.join(); // Reap resources; near-instant since finished. - } - } - } + // Take any handle left by a prior stop() call so we can reap it — but + // DON'T join it here, while we still hold background_cancel. stop() + // takes-and-cancels the token but never touches background_join, so a + // stop()→start() sequence would otherwise overwrite (detach) the old + // handle and shutdown() would miss that thread. Joining it under + // background_cancel would DEADLOCK the reap into its 1 s backstop: the + // exiting prior thread's epilogue also locks background_cancel (to + // clear its slot), so it would block on the lock we hold → never + // finish → get detached on the exact stop()→start() path the reap + // exists for. We install the new token + bump the generation below, + // release the lock, and only THEN reap (after this fn's tail). + let prior = self + .background_join + .lock() + .unwrap_or_else(|e| e.into_inner()) + .take(); let cancel = CancellationToken::new(); *cancel_guard = Some(cancel.clone()); @@ -339,7 +325,37 @@ impl ShieldedSyncManager { .background_join .lock() .unwrap_or_else(|e| e.into_inner()) = Some(join); - // cancel_guard drops here, releasing background_cancel. + + // Release background_cancel BEFORE reaping the prior thread, so its + // epilogue can observe the bumped generation (and skip clearing our + // freshly-installed token) without contending the lock we hold. 
+ // Holding the lock across the join below is what would block the + // prior thread, spin the full 1 s deadline, and detach — the very + // stall this ordering removes. + drop(cancel_guard); + + // Now reap the prior thread. It was already cancellation-signalled by + // stop(), and with the lock released its epilogue completes promptly, + // so is_finished() trips within a few milliseconds and the join is + // near-instant. The 1 s deadline survives only as a genuine-wedge + // backstop (e.g. a pass wedged in a Drop that never yields); if it + // fires we detach the already-cancelled thread to unblock start(). + if let Some(h) = prior { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); + while !h.is_finished() { + if std::time::Instant::now() >= deadline { + tracing::warn!( + "shielded-sync prior thread did not finish within 1 s \ + after cancellation; detaching to unblock start()" + ); + break; // Drop h — detaches; thread was already cancelled. + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + if h.is_finished() { + let _ = h.join(); // Reap resources; near-instant since finished. + } + } } /// Stop the background sync loop. No-op if not running. From 2b068ba57564e836f8b48f9fb6c643943b73336c Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:34:10 +0200 Subject: [PATCH 12/29] fix(platform-wallet): close shielded epilogue TOCTOU + pin restart reap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three shielded-sync hardening fixes, bringing it in line with its identity-sync and platform-address-sync siblings. - shielded_sync.rs exit epilogue read `background_generation` BEFORE acquiring `background_cancel` (load-then-lock). 
That stale-read TOCTOU let a prior thread observe a pre-bump generation, block on the lock until a concurrent start() released it, then null the freshly-installed token — leaving the new loop running but untracked via is_running()/stop(). Acquire the lock first and compare the generation under it, exactly like the siblings. - Add the `restart_after_stop_reaps_prior_thread` regression test the siblings already carry. It pins the reap-after-drop(cancel_guard) reorder: a back-to-back stop()+start() must reap the prior OS thread in <500 ms, not stall ~1 s on the detach backstop. Confirmed non-vacuous — it fails at ~1.0 s with the reap moved back inside the lock. - platform-wallet-ffi: the ErrorShutdownIncomplete doc only described destroy. It is now also returned by shielded_sync_stop and shielded_clear, where the manager is NOT torn down and the operation can be retried. Document all three callers and their differing retry semantics. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-platform-wallet-ffi/src/error.rs | 32 ++++-- .../src/manager/shielded_sync.rs | 97 +++++++++++++++++-- 2 files changed, 115 insertions(+), 14 deletions(-) diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs index 5769ffcc43..e5b5184a82 100644 --- a/packages/rs-platform-wallet-ffi/src/error.rs +++ b/packages/rs-platform-wallet-ffi/src/error.rs @@ -125,13 +125,31 @@ pub enum PlatformWalletFFIResultCode { /// and could double-send if the original spend landed. ErrorShieldedSpendUnconfirmed = 18, - /// One or more background coordinator threads did not exit cleanly before - /// the 30 s join deadline. The host **must not** free the callback context - /// immediately — a lingering thread may still hold a reference to it and - /// fire one final callback. 
Either keep the context alive for a further - /// grace period, or accept the potential (but statistically tiny) race. - /// This is distinct from a normal operation error; the manager IS torn - /// down; the host should not retry `destroy`. + /// A background coordinator drain did not complete cleanly within the + /// join deadline — one or more `!Send` sync threads may still be alive + /// and still hold a reference to the host-owned callback context, so they + /// could fire one final callback through it. On this code the host **must + /// not** free the callback context immediately: either keep it alive for a + /// further grace period, or accept the (statistically tiny) race. + /// + /// Returned by three callers, which differ in whether the operation may + /// be **retried**: + /// - `platform_wallet_manager_destroy`: the manager **IS** torn down + /// (removed from storage) regardless — do **not** retry `destroy`; the + /// handle is already gone. Only the callback-context lifetime caveat + /// above applies. + /// - `platform_wallet_manager_shielded_sync_stop`: the manager is **NOT** + /// torn down — only the shielded loop's drain was non-clean. The host + /// may retry the stop (or proceed to `destroy`); the handle stays valid. + /// - `platform_wallet_manager_shielded_clear`: the manager is **NOT** torn + /// down and the store was left **intact** (Clear aborted before touching + /// it). The host may retry the clear, and must **not** commit its own + /// persistence wipe — doing so would desync the host's rows from the + /// still-populated shared tree. + /// + /// Distinct from a normal operation error (the underlying operation may + /// well have made progress); the terminal coordinator status is rendered + /// into the result message. 
ErrorShutdownIncomplete = 19, NotFound = 98, // Used exclusively for all the Option that are retuned as errors diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index d0aa75a843..98e94035aa 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -305,13 +305,20 @@ impl ShieldedSyncManager { } // Only clear `background_cancel` if the active - // generation is still ours. Without this guard a - // tight `stop()` → `start()` reschedule has the - // exiting thread overwrite the *new* generation's - // token, leaving the new loop running but - // unreflectable via `is_running()` / `stop()`. - if this.background_generation.load(Ordering::Acquire) == my_gen { - if let Ok(mut guard) = this.background_cancel.lock() { + // generation is still ours. Acquire the lock FIRST, + // then read/compare `background_generation` under it + // (matching identity_sync / platform_address_sync). + // Reading the generation BEFORE locking opens a + // stale-read TOCTOU: this exiting thread could observe + // a pre-bump generation, then block on the lock until a + // concurrent `start()` released it, and null the + // freshly-installed token — leaving the new loop + // running but unreflectable via `is_running()` / + // `stop()`. `start()` bumps the generation while it + // holds this same lock, so comparing under the lock + // guarantees we observe the post-swap value. + if let Ok(mut guard) = this.background_cancel.lock() { + if this.background_generation.load(Ordering::Acquire) == my_gen { *guard = None; } } @@ -576,3 +583,79 @@ impl std::fmt::Debug for ShieldedSyncManager { .finish() } } + +// The whole module is already `#[cfg(feature = "shielded")]`-gated at its +// `mod` declaration (manager/mod.rs), so these tests compile only under that +// feature — no extra per-test gate needed. 
+#[cfg(test)] +mod tests { + use super::*; + + /// Build a manager over an **empty** coordinator slot wired to a + /// handler-less event manager. An empty slot makes every `sync_now` + /// pass a no-op (empty-coordinator handling returns immediately), so + /// the background loop parks in its interval sleep — exactly where + /// cancellation lands cleanly — without needing a live SDK / network. + /// That is all the start/stop/restart thread-lifecycle tests below + /// exercise. + fn make_manager() -> Arc { + let coordinator_slot = Arc::new(RwLock::new(None)); + let event_manager = Arc::new(PlatformEventManager::new(vec![])); + Arc::new(ShieldedSyncManager::new(event_manager, coordinator_slot)) + } + + /// Regression: a tight `stop()` → `start()` must reap the prior loop's + /// OS thread promptly, NOT stall on the 1 s detach backstop. + /// + /// The prior thread's exit epilogue locks `background_cancel` to + /// conditionally clear its slot. The earlier ordering held + /// `background_cancel` across the prior-handle join inside `start()`, so + /// on a back-to-back `stop()` → `start()` the exiting thread blocked on + /// that lock, never finished, and the reap spin-waited the full second + /// before detaching — a 1 s stall plus a transient untracked thread. The + /// fix installs the new token + generation, releases `background_cancel`, + /// and only then reaps, so the prior thread's epilogue runs and the join + /// lands in milliseconds. Mirrors the identity-sync and + /// platform-address-sync siblings. + /// + /// `stop()` and `start()` run back-to-back in one blocking closure + /// (mirroring the real call site) so `start()` re-acquires the lock + /// microseconds after `stop()` frees it — before the async-woken prior + /// thread can reach its epilogue. Against the old lock-held ordering this + /// reliably stalls ~1 s and fails the bound below. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn restart_after_stop_reaps_prior_thread() { + let mgr = make_manager(); + + // Launch the first loop and let its immediate (no-op, empty + // coordinator) pass complete so the thread parks in the interval + // sleep, where cancellation lands cleanly. + Arc::clone(&mgr).start(); + assert!(mgr.is_running()); + tokio::time::sleep(Duration::from_millis(50)).await; + + // Back-to-back cancel-only stop + restart, off the runtime so the + // synchronous reap can't starve a worker. `start()` re-grabs + // background_cancel right after `stop()` frees it. + let restart = Arc::clone(&mgr); + let elapsed = tokio::task::spawn_blocking(move || { + restart.stop(); + let started = std::time::Instant::now(); + Arc::clone(&restart).start(); + started.elapsed() + }) + .await + .unwrap(); + + assert!( + elapsed < Duration::from_millis(500), + "stop()→start() stalled for {elapsed:?}: prior thread was not \ + reaped promptly (background_cancel held across the join?)" + ); + assert!(mgr.is_running(), "restart must leave the new loop tracked"); + + // Wind the new loop down so the test leaves no live !Send thread. + mgr.quiesce().await; + assert!(!mgr.is_running()); + } +} From 5017ba13136e8a1f1d818ac6547290f716836e39 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:34:18 +0200 Subject: [PATCH 13/29] fix(swift-sdk): retain wallet callback context on incomplete shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PlatformWalletManager.deinit called .discard() on shielded_sync_stop and destroy, tossing the result code. Rust now returns ErrorShutdownIncomplete (19) on a non-clean drain, with the contract: a lingering coordinator thread may still fire one final callback through the host-owned callback context. 
But persistenceHandler/eventHandler are handed to Rust via Unmanaged.passUnretained and kept alive only by this object's fields, so the instant deinit returns ARC frees them — a use-after-free on that final callback. Capture the code via a new discardReturningCode() helper; on ErrorShutdownIncomplete from shielded_sync_stop OR destroy, deliberately leak one extra strong reference (an unbalanced passRetained, never released) to each handler so it outlives any lingering thread. A clean shutdown — the common case — takes neither branch and releases the handlers normally; we never leak unconditionally. UNVERIFIED locally: no Swift toolchain / xcframework on this host. Reasoned correct-by-construction; needs an iOS-environment build to confirm. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../PlatformWalletManager.swift | 43 +++++++++++++++++-- .../PlatformWallet/PlatformWalletResult.swift | 12 ++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift index 0e433d368e..36bafa37d1 100644 --- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift +++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift @@ -152,10 +152,45 @@ public class PlatformWalletManager: ObservableObject { deinit { progressPollTask?.cancel() - if handle != NULL_HANDLE { - platform_wallet_manager_platform_address_sync_stop(handle).discard() - platform_wallet_manager_shielded_sync_stop(handle).discard() - platform_wallet_manager_destroy(handle).discard() + guard handle != NULL_HANDLE else { return } + + // Tear down the Rust manager: cancel the address-sync loop, drain + // the shielded loop, then destroy. 
The first stop is cancel-only + // and never reports an incomplete drain, so we still `discard()` it. + platform_wallet_manager_platform_address_sync_stop(handle).discard() + + // Capture the CODE (not just free the message) for the two calls + // that CAN report `.errorShutdownIncomplete`: `shielded_sync_stop` + // and `destroy`. Rust returns that code when a background + // coordinator did not drain within the join deadline — meaning a + // lingering `!Send` coordinator thread may still hold the + // `passUnretained` context pointers Rust was handed for our + // `persistenceHandler` / `eventHandler` and fire ONE final callback + // through them. The contract: on that code the host must NOT free + // the callback context immediately. + let shieldedStopCode = + platform_wallet_manager_shielded_sync_stop(handle).discardReturningCode() + let destroyCode = + platform_wallet_manager_destroy(handle).discardReturningCode() + + // Both handlers are passed to Rust via `Unmanaged.passUnretained` + // (see `PlatformWalletPersistenceHandler`/`PlatformWalletEventHandler` + // `makeCallbacks()`), so Rust holds non-owning pointers and these + // objects are kept alive ONLY by the stored properties below. The + // instant this deinit returns, ARC releases them — which would be a + // use-after-free if a lingering coordinator then fires its final + // callback. So, ONLY on an incomplete shutdown, deliberately leak one + // extra strong reference to each (an unbalanced `passRetained` that is + // never released) so they outlive any lingering thread. A clean + // shutdown (the common case) takes neither branch and releases the + // handlers normally — we never leak unconditionally. The leak is + // bounded by how often a shutdown wedges (rare) and trades two small + // objects for guaranteed callback safety, since an incomplete drain + // gives no later signal that the lingering thread has finally exited. 
+ if shieldedStopCode == .errorShutdownIncomplete + || destroyCode == .errorShutdownIncomplete { + if let persistenceHandler { _ = Unmanaged.passRetained(persistenceHandler) } + if let eventHandler { _ = Unmanaged.passRetained(eventHandler) } } } diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift index 31ef07ad4a..c24f72fbf8 100644 --- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift +++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift @@ -256,4 +256,16 @@ extension PlatformWalletFFIResult { func discard() { _ = PlatformWalletResult(self) } + + /// Free the result's Rust-owned message and return its typed code. + /// + /// Like `discard()`, but hands back the code so the caller can branch + /// on it — used by `PlatformWalletManager.deinit`, which must detect + /// `.errorShutdownIncomplete` to decide whether to keep its callback + /// context alive. The message is still freed deterministically (the + /// temporary `PlatformWalletResult` frees it on drop). + @inline(__always) + func discardReturningCode() -> PlatformWalletResultCode { + PlatformWalletResult(self).code + } } From b4917732a39eb3a86ec706c9ac115f2011c185f9 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:50:44 +0200 Subject: [PATCH 14/29] test(platform-wallet): bound cleanup quiesce in restart-reap regression tests Wrap the cleanup `mgr.quiesce().await` in all three `restart_after_stop_reaps_prior_thread` tests with a 2-second `tokio::time::timeout`. An unbounded quiesce after the restarted loop would hang CI forever if the loop wedges; now it fails fast with a clear message. Also assert `status.is_clean()` on the returned `CoordinatorThreadStatus`. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-platform-wallet/src/manager/identity_sync.rs | 8 +++++++- .../src/manager/platform_address_sync.rs | 8 +++++++- packages/rs-platform-wallet/src/manager/shielded_sync.rs | 8 +++++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 9cc14ac831..ab6fa6033e 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -1091,7 +1091,13 @@ mod tests { assert!(mgr.is_running(), "restart must leave the new loop tracked"); // Wind the new loop down so the test leaves no live !Send thread. - mgr.quiesce().await; + let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce()) + .await + .expect("cleanup quiesce did not complete within 2s after restart"); + assert!( + status.is_clean(), + "cleanup quiesce ended non-cleanly: {status:?}" + ); assert!(!mgr.is_running()); } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index 87b6595e53..094ae1a25b 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -656,7 +656,13 @@ mod tests { assert!(mgr.is_running(), "restart must leave the new loop tracked"); // Wind the new loop down so the test leaves no live !Send thread. 
- mgr.quiesce().await; + let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce()) + .await + .expect("cleanup quiesce did not complete within 2s after restart"); + assert!( + status.is_clean(), + "cleanup quiesce ended non-cleanly: {status:?}" + ); assert!(!mgr.is_running()); } diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 98e94035aa..ba7b752315 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -655,7 +655,13 @@ mod tests { assert!(mgr.is_running(), "restart must leave the new loop tracked"); // Wind the new loop down so the test leaves no live !Send thread. - mgr.quiesce().await; + let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce()) + .await + .expect("cleanup quiesce did not complete within 2s after restart"); + assert!( + status.is_clean(), + "cleanup quiesce ended non-cleanly: {status:?}" + ); assert!(!mgr.is_running()); } } From 76c8bee0060f2bd5b473c62ef633bcfc9bd69a81 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 12:16:40 +0200 Subject: [PATCH 15/29] fix(platform-wallet): track detached coordinator threads so shutdown() reports them as non-clean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the residual use-after-free window left by the coordinator reap backstop. On a tight stop()->start(), each sync coordinator waits ~1s for the prior OS thread to finish; if that thread is genuinely wedged in a non-yielding Drop, the backstop previously DROPPED the still-live JoinHandle (detaching it). A later shutdown() joined only the current handle, all_clean() returned true, and the FFI destroy returned ok() — at which point the host could free the callback context the detached, still-running thread might still touch. 
Fix (review option i): the manager now owns a shared CoordinatorOrphans list (Arc<Mutex<Vec<JoinHandle>>>) cloned into every coordinator. The duplicated reap blocks in identity/platform-address/shielded start() are consolidated into reap_prior_or_park(), which PARKS a wedged prior thread in that list instead of dropping it (lock-ordering preserved: drop(cancel_guard) still happens before the reap). shutdown() then drains the list via join_detached_orphans() within a bounded, yielding is_finished() poll and reports a new CoordinatorThreadStatus::Detached (non-clean) in CoordinatorExitStatus::detached_threads when any orphan is still alive at the grace deadline. all_clean() folds it in, so the FFI destroy correctly returns ErrorShutdownIncomplete and the host delays freeing its context. The new Detached variant re-exports through lib.rs with its sibling statuses. Tests (manager/mod.rs): reap_prior_or_park parks a force-wedged thread; join_detached_orphans reports Detached then Ok; and a manager shutdown() with a parked still-live orphan reports non-clean. All proven non-vacuous by neutering the park/join. Cleanup quiesce/join in tests is bounded; a wedged stand-in thread is released and joined so none leak. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/manager/identity_sync.rs | 49 ++- .../rs-platform-wallet/src/manager/mod.rs | 405 +++++++++++++++++- .../src/manager/platform_address_sync.rs | 41 +- .../src/manager/shielded_sync.rs | 41 +- 4 files changed, 475 insertions(+), 61 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index ab6fa6033e..40329bad74 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -167,6 +167,12 @@ where /// confirm the `!Send` loop fully exited before the host drops the /// runtime. 
background_join: StdMutex>>, + /// Manager-owned orphans list (shared `Arc`). On a tight + /// `stop()`→`start()` where the prior thread is wedged past the 1 s + /// reap backstop, [`start`](Self::start) parks the still-live handle + /// here (via [`reap_prior_or_park`](super::reap_prior_or_park)) + /// instead of dropping it, so manager `shutdown()` accounts for it. + coordinator_orphans: super::CoordinatorOrphans, /// Monotonically increasing generation counter. Incremented each /// time `start()` installs a new cancel token so the exiting /// thread can tell whether its token is still current. @@ -206,12 +212,17 @@ where /// writes). The registry starts empty — call /// [`register_identity`](Self::register_identity) before /// [`start`](Self::start). - pub fn new(sdk: Arc, persister: Arc

) -> Self { + pub fn new( + sdk: Arc, + persister: Arc

, + coordinator_orphans: super::CoordinatorOrphans, + ) -> Self { Self { sdk, persister, background_cancel: StdMutex::new(None), background_join: StdMutex::new(None), + coordinator_orphans, background_generation: AtomicU64::new(0), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), @@ -498,23 +509,15 @@ where // so is_finished() trips within a few milliseconds and the join is // near-instant. The 1 s deadline survives only as a genuine-wedge // backstop (e.g. a pass wedged in a Drop that never yields); if it - // fires we detach the already-cancelled thread to unblock start(). - if let Some(h) = prior { - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); - while !h.is_finished() { - if std::time::Instant::now() >= deadline { - tracing::warn!( - "identity-sync prior thread did not finish within 1 s \ - after cancellation; detaching to unblock start()" - ); - break; // Drop h — detaches; thread was already cancelled. - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - if h.is_finished() { - let _ = h.join(); // Reap resources; near-instant since finished. - } - } + // fires `reap_prior_or_park` parks the still-live, already-cancelled + // thread in the manager orphans list so `shutdown()` joins it and + // reports it non-clean rather than dropping it (residual UAF). + super::reap_prior_or_park( + prior, + &self.coordinator_orphans, + std::time::Duration::from_secs(1), + "identity-sync", + ); } /// Stop the background sync loop. No-op if not running. 
@@ -853,7 +856,8 @@ mod tests { fn make_manager() -> Arc> { let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); let persister = Arc::new(NoopPersister); - Arc::new(IdentitySyncManager::new(sdk, persister)) + let orphans = Arc::new(StdMutex::new(Vec::new())); + Arc::new(IdentitySyncManager::new(sdk, persister, orphans)) } fn make_recording_manager() -> ( @@ -862,8 +866,13 @@ mod tests { ) { let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); let persister = Arc::new(RecordingPersister::new()); + let orphans = Arc::new(StdMutex::new(Vec::new())); ( - Arc::new(IdentitySyncManager::new(sdk, Arc::clone(&persister))), + Arc::new(IdentitySyncManager::new( + sdk, + Arc::clone(&persister), + orphans, + )), persister, ) } diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 2de6ad6d5a..0e02d430b7 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -28,6 +28,22 @@ use crate::wallet::core::BalanceUpdateHandler; use crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId}; use crate::wallet::PlatformWallet; +/// Shared list of coordinator OS threads that a tight `stop()`→`start()` +/// reap had to detach past its 1 s wedge-backstop. +/// +/// A coordinator's `start()` reap normally joins the prior thread within +/// a few milliseconds. If that thread is genuinely wedged in a +/// non-yielding `Drop` (vanishingly rare — the loop exits via a +/// cancellable `select!`), [`reap_prior_or_park`] parks its still-live +/// `JoinHandle` here instead of dropping it. 
The manager owns this list +/// and shares a clone (`Arc`) with every coordinator, so +/// [`PlatformWalletManager::shutdown`] can join everything parked here +/// within its timeout and report +/// [`CoordinatorThreadStatus::Detached`] if any thread is still alive — +/// telling the host NOT to free a callback context the thread may still +/// touch (closing a residual use-after-free window). +pub(crate) type CoordinatorOrphans = Arc>>>; + /// Multi-wallet coordinator with SPV sync and event handling. /// /// Events are dispatched through [`PlatformEventManager`] to all registered @@ -87,6 +103,11 @@ pub struct PlatformWalletManager { /// is torn down. pub(super) event_adapter_cancel: CancellationToken, pub(super) event_adapter_join: tokio::sync::Mutex>>, + /// Coordinator OS threads detached by a tight `stop()`→`start()` + /// reap (see [`CoordinatorOrphans`]). Shared (cloned `Arc`) with + /// every coordinator so their `start()` reaps can park a wedged + /// prior thread here, and drained/joined by [`shutdown`](Self::shutdown). + pub(super) coordinator_orphans: CoordinatorOrphans, } /// How one background coordinator thread terminated. @@ -117,12 +138,24 @@ pub enum CoordinatorThreadStatus { /// the runtime was torn down before the join could run (unreachable /// in normal operation). Error(String), + /// At least one coordinator OS thread that an earlier tight + /// `stop()`→`start()` reap had to detach past its 1 s wedge-backstop + /// was still alive at the shutdown deadline. + /// + /// Such a thread was parked in the manager's [`CoordinatorOrphans`] + /// list (not silently dropped) precisely so this case is visible. + /// A still-live detached thread keeps an `Arc` to the host event + /// handler and may fire one final callback, so the host must NOT + /// free the callback context yet — this status keeps + /// [`is_clean`](Self::is_clean) `false` so the FFI `destroy` returns + /// `ErrorShutdownIncomplete` instead of `ok()`. 
+ Detached, } impl CoordinatorThreadStatus { /// `true` only for a fully clean outcome: joined normally (`Ok`) or - /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, and - /// `Error` are all considered non-clean. + /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, + /// `Error`, and `Detached` are all considered non-clean. pub fn is_clean(&self) -> bool { matches!(self, Self::Ok | Self::NotRunning) } @@ -148,19 +181,35 @@ pub struct CoordinatorExitStatus { pub shielded_sync: Option, /// Wallet-event adapter (a `tokio` task, not an OS thread). pub event_adapter: CoordinatorThreadStatus, + /// Aggregate status of any coordinator OS threads that an earlier + /// tight `stop()`→`start()` reap had to detach past its 1 s + /// wedge-backstop and park in the manager's [`CoordinatorOrphans`] + /// list. + /// + /// [`Ok`](CoordinatorThreadStatus::Ok) when none were detached (or + /// every detached thread has since joined cleanly); + /// [`Detached`](CoordinatorThreadStatus::Detached) when at least one + /// is still alive at the shutdown deadline. This is what keeps + /// [`all_clean`](Self::all_clean) honest for the wedge case the rest + /// of the teardown can't see — without it a detached-but-still-live + /// thread would let the host free a callback context the thread may + /// still touch (a residual use-after-free). + pub detached_threads: CoordinatorThreadStatus, } impl CoordinatorExitStatus { - /// `true` only when every worker is + /// `true` only when every worker — including any parked + /// [`detached_threads`](Self::detached_threads) — is /// [`Ok`](CoordinatorThreadStatus::Ok) or /// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any - /// `Stopped`, `Panicked`, `Timeout`, or `Error` slot makes it - /// `false`. + /// `Stopped`, `Panicked`, `Timeout`, `Error`, or `Detached` slot + /// makes it `false`. 
pub fn all_clean(&self) -> bool { self.platform_address_sync.is_clean() && self.identity_sync.is_clean() && self.shielded_sync.as_ref().is_none_or(|s| s.is_clean()) && self.event_adapter.is_clean() + && self.detached_threads.is_clean() } } @@ -223,6 +272,138 @@ fn panic_message(payload: Box) -> String { } } +/// Reap a coordinator's prior OS thread after a `stop()`→`start()` +/// reschedule — or park it for [`PlatformWalletManager::shutdown`] if it +/// is genuinely wedged. +/// +/// Shared by all three coordinators' `start()` (identity / platform- +/// address / shielded), called at the tail of `start()` *after* the +/// `background_cancel` lock has been released, so the exiting prior +/// thread's epilogue (which also takes that lock) can complete and the +/// join lands in milliseconds. +/// +/// `prior` was cancellation-signalled by the preceding `stop()`, so its +/// `select!` loop exits and the thread finishes almost immediately. The +/// `backstop` deadline fires only if the thread is wedged in a +/// non-yielding `Drop` that never observes the cancellation (vanishingly +/// rare). On that wedge we must NOT silently drop the still-live handle: +/// the thread still holds an `Arc` to the host event handler and could +/// fire a callback, so a later `destroy` that freed the host context +/// would hit a use-after-free. Instead we park the handle in `orphans` +/// so `shutdown()` joins it within its own timeout and reports +/// [`CoordinatorThreadStatus::Detached`] if it is still alive — keeping +/// [`CoordinatorExitStatus::all_clean`] honest. +pub(crate) fn reap_prior_or_park( + prior: Option>, + orphans: &CoordinatorOrphans, + backstop: std::time::Duration, + coordinator: &str, +) { + let Some(handle) = prior else { + return; + }; + let deadline = std::time::Instant::now() + backstop; + loop { + if handle.is_finished() { + // Near-instant since finished; reaps the thread's resources. 
+ let _ = handle.join(); + return; + } + if std::time::Instant::now() >= deadline { + tracing::warn!( + coordinator, + ?backstop, + "prior sync thread did not finish within the backstop after \ + cancellation; parking it in the manager orphans list for \ + shutdown() to join rather than detaching it" + ); + // Park the still-live (but already-cancelled) handle so a + // later shutdown() can join it and report it non-clean, + // instead of dropping it and leaving a UAF window where the + // host frees a callback context the thread may still touch. + orphans + .lock() + .unwrap_or_else(|e| e.into_inner()) + .push(handle); + return; + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } +} + +/// Drain the manager's [`CoordinatorOrphans`] list and classify how the +/// parked threads ended, polling until `deadline`. +/// +/// Threads land in the list only when a tight `stop()`→`start()` reap had +/// to detach a prior coordinator thread past its 1 s wedge-backstop (see +/// [`reap_prior_or_park`]). They were parked rather than dropped so this +/// final teardown can account for them: a still-live detached thread +/// keeps an `Arc` to the host event handler and could fire one last +/// callback, so the host must not free its context until every such +/// thread has exited. +/// +/// Polls [`JoinHandle::is_finished`](std::thread::JoinHandle::is_finished) +/// in 5 ms steps, yielding at each `.await` so a wrapping +/// `tokio::time::timeout` can still interrupt it (no uncancellable +/// blocking join — `join()` is only ever called on an already-finished +/// handle). Returns: +/// - [`Ok`](CoordinatorThreadStatus::Ok) — the list was empty, or every +/// parked thread joined cleanly; +/// - [`Panicked`](CoordinatorThreadStatus::Panicked) — a parked thread +/// had panicked (and none were left alive at the deadline); +/// - [`Detached`](CoordinatorThreadStatus::Detached) — at least one +/// parked thread was still alive at `deadline`. 
Any still-live handles +/// are re-parked so a later (idempotent) `shutdown()` can retry. +pub(crate) async fn join_detached_orphans( + orphans: &CoordinatorOrphans, + deadline: std::time::Instant, +) -> CoordinatorThreadStatus { + // Take the whole list out under the lock; we re-park any survivors + // at the deadline, but never hold the lock across an `.await`. + let mut pending: Vec> = { + let mut guard = orphans.lock().unwrap_or_else(|e| e.into_inner()); + std::mem::take(&mut *guard) + }; + if pending.is_empty() { + return CoordinatorThreadStatus::Ok; + } + + let mut panicked: Option = None; + loop { + // Reap every thread that has finished this pass; retain the rest. + let mut still_live = Vec::with_capacity(pending.len()); + for handle in pending.drain(..) { + if handle.is_finished() { + if let Err(payload) = handle.join() { + // Keep the first panic message; a live `Detached` + // thread still takes precedence at the deadline below. + panicked.get_or_insert_with(|| panic_message(payload)); + } + } else { + still_live.push(handle); + } + } + pending = still_live; + + if pending.is_empty() { + return match panicked { + Some(msg) => CoordinatorThreadStatus::Panicked(msg), + None => CoordinatorThreadStatus::Ok, + }; + } + if std::time::Instant::now() >= deadline { + // Re-park survivors so an idempotent re-`shutdown()` retries + // rather than losing track of a still-live thread. + orphans + .lock() + .unwrap_or_else(|e| e.into_inner()) + .extend(pending); + return CoordinatorThreadStatus::Detached; + } + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + } +} + /// Maximum time (seconds) the teardown paths — `shutdown()`, /// `clear_shielded`, and the FFI shielded-stop bridge — wait for one /// coordinator's quiesce+join to complete. @@ -236,6 +417,23 @@ fn panic_message(payload: Box) -> String { /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever. 
pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; +/// Grace period (seconds) [`PlatformWalletManager::shutdown`] spends +/// polling any parked [`CoordinatorOrphans`] before declaring a survivor +/// [`Detached`](CoordinatorThreadStatus::Detached). +/// +/// Unlike a live coordinator — whose `quiesce()` may legitimately spend +/// seconds draining an in-flight pass, hence the 30 s +/// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] — an orphan is a thread an earlier reap +/// already had to detach *because it was wedged past its 1 s backstop*. +/// A healthy detached thread finishes within milliseconds of the +/// cancellation it long ago received (so `is_finished()` is usually true +/// on the first poll and the join is instant); one still alive after this +/// grace is wedged in a non-yielding `Drop` and will not finish however +/// long we wait. A short grace therefore separates "finishing" from +/// "wedged" without stretching teardown, and reporting `Detached` is the +/// conservative, UAF-safe outcome (the host delays freeing its context). +pub(crate) const SHUTDOWN_ORPHAN_GRACE_SECS: u64 = 1; + impl PlatformWalletManager

{ /// Create a new PlatformWalletManager. /// @@ -275,6 +473,13 @@ impl PlatformWalletManager

{ balance_handler, ])); + // Shared orphans list: a coordinator's `start()` reap parks here + // any prior thread it had to detach past its 1 s wedge-backstop, + // and `shutdown()` joins them. Every coordinator gets a clone of + // this same `Arc` so they all park into the one list the manager + // drains. + let coordinator_orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new())); + let spv = Arc::new(SpvRuntime::new( Arc::clone(&wallet_manager), Arc::clone(&event_manager), @@ -282,10 +487,12 @@ impl PlatformWalletManager

{ let platform_address_sync = Arc::new(PlatformAddressSyncManager::new( Arc::clone(&wallets), Arc::clone(&event_manager), + Arc::clone(&coordinator_orphans), )); let identity_sync = Arc::new(IdentitySyncManager::new( Arc::clone(&sdk), Arc::clone(&persister), + Arc::clone(&coordinator_orphans), )); #[cfg(feature = "shielded")] let shielded_coordinator: Arc< @@ -295,6 +502,7 @@ impl PlatformWalletManager

{ let shielded_sync = Arc::new(ShieldedSyncManager::new( Arc::clone(&event_manager), Arc::clone(&shielded_coordinator), + Arc::clone(&coordinator_orphans), )); Self { sdk, @@ -313,6 +521,7 @@ impl PlatformWalletManager

{ persister, event_adapter_cancel, event_adapter_join: tokio::sync::Mutex::new(Some(event_adapter_join)), + coordinator_orphans, } } @@ -575,11 +784,26 @@ impl PlatformWalletManager

{ }, }; + // Finally, account for any coordinator threads an earlier tight + // stop()→start() reap had to detach past its 1 s wedge-backstop. + // They were parked in `coordinator_orphans` (not dropped) so we + // can join them here; a survivor at the grace deadline reports + // `Detached`, which keeps `all_clean()` false so the FFI `destroy` + // returns `ErrorShutdownIncomplete` rather than letting the host + // free a callback context the live thread may still touch. The + // grace poll yields, so it never blocks teardown uncancellably. + let detached_threads = join_detached_orphans( + &self.coordinator_orphans, + std::time::Instant::now() + std::time::Duration::from_secs(SHUTDOWN_ORPHAN_GRACE_SECS), + ) + .await; + CoordinatorExitStatus { platform_address_sync, identity_sync, shielded_sync, event_adapter, + detached_threads, } } } @@ -800,6 +1024,9 @@ mod tests { assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean()); assert!(!CoordinatorThreadStatus::Timeout.is_clean()); assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean()); + // A detached-but-still-live coordinator thread is non-clean: the + // host must not free its callback context yet. 
+ assert!(!CoordinatorThreadStatus::Detached.is_clean()); } /// `all_clean()` on `CoordinatorExitStatus` is false whenever any @@ -811,6 +1038,7 @@ mod tests { identity_sync: CoordinatorThreadStatus::NotRunning, shielded_sync: None, event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Ok, }; assert!(clean.all_clean()); @@ -819,6 +1047,7 @@ mod tests { identity_sync: CoordinatorThreadStatus::Ok, shielded_sync: None, event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Ok, }; assert!(!with_timeout.all_clean()); @@ -827,8 +1056,20 @@ mod tests { identity_sync: CoordinatorThreadStatus::Ok, shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))), event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Ok, }; assert!(!with_stopped.all_clean()); + + // A still-live detached orphan alone makes the aggregate + // non-clean — the slot the rest of the teardown can't see. + let with_detached = CoordinatorExitStatus { + platform_address_sync: CoordinatorThreadStatus::Ok, + identity_sync: CoordinatorThreadStatus::Ok, + shielded_sync: None, + event_adapter: CoordinatorThreadStatus::Ok, + detached_threads: CoordinatorThreadStatus::Detached, + }; + assert!(!with_detached.all_clean()); } /// A cleanly-returning thread joins as `Ok`; an absent handle is @@ -993,4 +1234,158 @@ mod tests { SHUTDOWN_PANICS.load(AO::SeqCst) ); } + + /// Spawn a thread that parks until `release` is signalled (or the + /// sender drops), standing in for a coordinator thread wedged in a + /// non-yielding `Drop` that ignores the cancellation it received. + fn spawn_wedged_thread() -> (std::sync::mpsc::Sender<()>, std::thread::JoinHandle<()>) { + let (release_tx, release_rx) = std::sync::mpsc::channel::<()>(); + let handle = std::thread::spawn(move || { + // Block here regardless of any cancellation, exactly like a + // Drop that never yields, until the test releases us. 
+ let _ = release_rx.recv(); + }); + (release_tx, handle) + } + + /// A prior coordinator thread that is still alive past the reap + /// backstop must be **parked in the orphans list**, not dropped — + /// otherwise `shutdown()` would never know it exists and could let the + /// host free a callback context the live thread still touches. + /// + /// Non-vacuous: if `reap_prior_or_park` dropped the wedged handle + /// (the old behavior) the list would stay empty and the length + /// assertion below would fail. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn reap_prior_or_park_parks_wedged_thread() { + let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new())); + let (release_tx, wedged) = spawn_wedged_thread(); + + // `reap_prior_or_park` is synchronous and spins a std sleep until + // its backstop, so run it off the runtime workers. A short backstop + // (real `start()` uses 1 s) keeps the test fast. + let orphans_for_reap = Arc::clone(&orphans); + tokio::task::spawn_blocking(move || { + reap_prior_or_park( + Some(wedged), + &orphans_for_reap, + Duration::from_millis(100), + "test-coordinator", + ); + }) + .await + .unwrap(); + + assert_eq!( + orphans.lock().unwrap().len(), + 1, + "a prior thread wedged past the backstop must be parked, not dropped" + ); + + // Cleanup: release + join the parked thread so none leaks. + release_tx.send(()).unwrap(); + let parked = orphans.lock().unwrap().pop().unwrap(); + tokio::task::spawn_blocking(move || { + let _ = parked.join(); + }) + .await + .unwrap(); + } + + /// `join_detached_orphans` classifies the parked threads: empty list → + /// `Ok`; a survivor at the deadline → `Detached` (re-parked for a later + /// retry); once the survivor exits, a fresh join reports `Ok` and + /// drains the list. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn join_detached_orphans_reports_detached_then_ok() { + let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new())); + + // Nothing parked → clean. + assert_eq!( + join_detached_orphans(&orphans, std::time::Instant::now()).await, + CoordinatorThreadStatus::Ok + ); + + // Park a still-live thread; a short deadline elapses with it alive. + let (release_tx, wedged) = spawn_wedged_thread(); + orphans.lock().unwrap().push(wedged); + let status = join_detached_orphans( + &orphans, + std::time::Instant::now() + Duration::from_millis(50), + ) + .await; + assert_eq!( + status, + CoordinatorThreadStatus::Detached, + "a survivor at the deadline must report Detached" + ); + assert_eq!( + orphans.lock().unwrap().len(), + 1, + "a survivor must be re-parked so an idempotent re-shutdown retries" + ); + + // Release it; the next join reaps it cleanly and empties the list. + release_tx.send(()).unwrap(); + let status = tokio::time::timeout( + Duration::from_secs(5), + join_detached_orphans(&orphans, std::time::Instant::now() + Duration::from_secs(5)), + ) + .await + .expect("orphan join must complete once the thread is released"); + assert_eq!(status, CoordinatorThreadStatus::Ok); + assert!( + orphans.lock().unwrap().is_empty(), + "a joined orphan must be drained from the list" + ); + } + + /// Headline regression: a coordinator thread detached past the reap + /// backstop and parked in the orphans list makes a subsequent + /// `shutdown()` report the result as **non-clean** — so the FFI + /// `destroy` returns `ErrorShutdownIncomplete` and the host delays + /// freeing the callback context the still-live thread may touch. + /// + /// Non-vacuous: if `join_detached_orphans` ignored the list (or the + /// orphan were dropped at reap instead of parked), `detached_threads` + /// would be `Ok` and `all_clean()` would be `true`, failing both + /// assertions. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_reports_detached_orphan_as_non_clean() { + let manager = make_manager(); + + // Stand in for the genuine-wedge outcome: an earlier tight + // stop()→start() reap had to detach a still-live coordinator thread + // past its 1 s backstop, so `reap_prior_or_park` parked it here. + let (release_tx, wedged) = spawn_wedged_thread(); + manager.coordinator_orphans.lock().unwrap().push(wedged); + + let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown()) + .await + .expect("shutdown must complete within bound"); + + assert_eq!( + status.detached_threads, + CoordinatorThreadStatus::Detached, + "a still-live detached orphan must surface as Detached" + ); + assert!( + !status.all_clean(), + "all_clean() must be false while a detached coordinator thread is \ + still alive: {status:?}" + ); + + // Cleanup: shutdown() re-parked the survivor; release + join it so + // no live thread leaks past the test. Pop into a local first so the + // std MutexGuard is not held across the await below. + release_tx.send(()).unwrap(); + let parked = manager.coordinator_orphans.lock().unwrap().pop(); + if let Some(parked) = parked { + tokio::task::spawn_blocking(move || { + let _ = parked.join(); + }) + .await + .unwrap(); + } + } } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index 094ae1a25b..40457c4a87 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -104,6 +104,12 @@ pub struct PlatformAddressSyncManager { /// confirm the `!Send` loop fully exited before the host drops the /// runtime. background_join: StdMutex>>, + /// Manager-owned orphans list (shared `Arc`). 
On a tight + /// `stop()`→`start()` where the prior thread is wedged past the 1 s + /// reap backstop, [`start`](Self::start) parks the still-live handle + /// here (via [`reap_prior_or_park`](super::reap_prior_or_park)) + /// instead of dropping it, so manager `shutdown()` accounts for it. + coordinator_orphans: super::CoordinatorOrphans, /// Monotonically increasing generation counter. Bumped on every /// `start()` so the exiting thread can tell whether its generation is /// still the active one before clearing `background_cancel`. Without @@ -135,12 +141,14 @@ impl PlatformAddressSyncManager { pub fn new( wallets: Arc>>>, event_manager: Arc, + coordinator_orphans: super::CoordinatorOrphans, ) -> Self { Self { wallets, event_manager, background_cancel: StdMutex::new(None), background_join: StdMutex::new(None), + coordinator_orphans, background_generation: AtomicU64::new(0), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), @@ -313,23 +321,15 @@ impl PlatformAddressSyncManager { // so is_finished() trips within a few milliseconds and the join is // near-instant. The 1 s deadline survives only as a genuine-wedge // backstop (e.g. a pass wedged in a Drop that never yields); if it - // fires we detach the already-cancelled thread to unblock start(). - if let Some(h) = prior { - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); - while !h.is_finished() { - if std::time::Instant::now() >= deadline { - tracing::warn!( - "platform-address-sync prior thread did not finish within 1 s \ - after cancellation; detaching to unblock start()" - ); - break; // Drop h — detaches; thread was already cancelled. - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - if h.is_finished() { - let _ = h.join(); // Reap resources; near-instant since finished. 
- } - } + // fires `reap_prior_or_park` parks the still-live, already-cancelled + // thread in the manager orphans list so `shutdown()` joins it and + // reports it non-clean rather than dropping it (residual UAF). + super::reap_prior_or_park( + prior, + &self.coordinator_orphans, + std::time::Duration::from_secs(1), + "platform-address-sync", + ); } /// Stop the background sync loop. No-op if not running. @@ -543,8 +543,13 @@ mod tests { let event_manager = Arc::new(PlatformEventManager::new(vec![ Arc::clone(&counter) as Arc ])); + let orphans = Arc::new(StdMutex::new(Vec::new())); ( - Arc::new(PlatformAddressSyncManager::new(wallets, event_manager)), + Arc::new(PlatformAddressSyncManager::new( + wallets, + event_manager, + orphans, + )), counter, ) } diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index ba7b752315..3c84bd7071 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -148,6 +148,12 @@ pub struct ShieldedSyncManager { /// confirm the `!Send` loop fully exited before the host drops the /// runtime. background_join: StdMutex>>, + /// Manager-owned orphans list (shared `Arc`). On a tight + /// `stop()`→`start()` where the prior thread is wedged past the 1 s + /// reap backstop, [`start`](Self::start) parks the still-live handle + /// here (via [`reap_prior_or_park`](super::reap_prior_or_park)) + /// instead of dropping it, so manager `shutdown()` accounts for it. + coordinator_orphans: super::CoordinatorOrphans, /// Monotonically increasing generation counter. 
Bumped on every /// `start()` so the exiting thread can tell whether its /// generation is still the active one before clearing @@ -173,12 +179,14 @@ impl ShieldedSyncManager { pub fn new( event_manager: Arc, coordinator_slot: Arc>>>, + coordinator_orphans: super::CoordinatorOrphans, ) -> Self { Self { event_manager, coordinator_slot, background_cancel: StdMutex::new(None), background_join: StdMutex::new(None), + coordinator_orphans, background_generation: AtomicU64::new(0), interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), @@ -346,23 +354,15 @@ impl ShieldedSyncManager { // so is_finished() trips within a few milliseconds and the join is // near-instant. The 1 s deadline survives only as a genuine-wedge // backstop (e.g. a pass wedged in a Drop that never yields); if it - // fires we detach the already-cancelled thread to unblock start(). - if let Some(h) = prior { - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1); - while !h.is_finished() { - if std::time::Instant::now() >= deadline { - tracing::warn!( - "shielded-sync prior thread did not finish within 1 s \ - after cancellation; detaching to unblock start()" - ); - break; // Drop h — detaches; thread was already cancelled. - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - if h.is_finished() { - let _ = h.join(); // Reap resources; near-instant since finished. - } - } + // fires `reap_prior_or_park` parks the still-live, already-cancelled + // thread in the manager orphans list so `shutdown()` joins it and + // reports it non-clean rather than dropping it (residual UAF). + super::reap_prior_or_park( + prior, + &self.coordinator_orphans, + std::time::Duration::from_secs(1), + "shielded-sync", + ); } /// Stop the background sync loop. No-op if not running. 
@@ -601,7 +601,12 @@ mod tests { fn make_manager() -> Arc { let coordinator_slot = Arc::new(RwLock::new(None)); let event_manager = Arc::new(PlatformEventManager::new(vec![])); - Arc::new(ShieldedSyncManager::new(event_manager, coordinator_slot)) + let orphans = Arc::new(StdMutex::new(Vec::new())); + Arc::new(ShieldedSyncManager::new( + event_manager, + coordinator_slot, + orphans, + )) } /// Regression: a tight `stop()` → `start()` must reap the prior loop's From 3cca1cf833e1a2aaf7dadd9df722323634678cae Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 13:36:19 +0200 Subject: [PATCH 16/29] perf(platform-wallet): drain coordinators concurrently in shutdown() via tokio::join! The three periodic coordinators (platform-address, identity, shielded) were quiesced sequentially in shutdown(), making the worst-case wait additive (~3 x SHUTDOWN_JOIN_TIMEOUT_SECS). Each quiesce() touches only its own state (its quiescing/is_syncing atomics and its own background_cancel/background_join mutexes) and joins its own OS thread, sharing no lock, so racing them is sound. Drain them concurrently via tokio::join!, collapsing the worst case to ~max(timeouts). Each join! arm keeps its OWN inner tokio::time::timeout, so every coordinator still yields its own per-coordinator CoordinatorThreadStatus (a single outer timeout would flatten all three to Timeout). The event adapter teardown and join_detached_orphans stay sequential and ordered strictly AFTER the coordinator join!, since the adapter sinks the coordinators' stores. The multi-thread runtime assert is unchanged. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../rs-platform-wallet/src/manager/mod.rs | 57 +++++++++++++------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 0e02d430b7..7e9690d066 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -691,7 +691,13 @@ impl PlatformWalletManager

{ /// context — a use-after-free. So we `quiesce()` the sync managers /// FIRST (so no further persister store or host callback can start), /// and only THEN cancel + join the event adapter, which is the sink - /// those stores feed into. + /// those stores feed into. The three coordinators are independent — + /// each `quiesce()` touches only its own state (its `quiescing` / + /// `is_syncing` atomics and its own `background_cancel` / + /// `background_join` mutexes) and joins its own OS thread, sharing no + /// lock — so they are drained *concurrently* via `tokio::join!`; only + /// the event-adapter teardown stays ordered strictly after them, + /// because it is the sink the coordinators store into. /// /// After each coordinator's `quiesce()` drains its in-flight pass, /// this also **joins** the loop's OS thread, so when `shutdown()` @@ -708,8 +714,10 @@ impl PlatformWalletManager

{ /// service one `block_on` at a time, so the join would deadlock. This /// is asserted in both debug and release builds. /// - /// Each coordinator quiesce+join is bounded by - /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] as a backstop. `quiesce()` cancels + /// Each coordinator quiesce+join is bounded by its own + /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] backstop. Because the three drain + /// concurrently, the worst-case wait collapses to ~that single + /// backstop instead of the sum of all three. `quiesce()` cancels /// the loop, which aborts any in-flight pass at its `.await` point, so /// the `is_syncing` drain clears promptly and the join normally lands /// far inside the window — the deadline fires only if a pass's *drop* @@ -735,25 +743,42 @@ impl PlatformWalletManager

{ let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS); - // Each quiesce() drains any in-flight pass AND joins the thread. - let platform_address_sync = + // Drain the three independent periodic coordinators *concurrently*. + // Each quiesce() drains any in-flight pass AND joins its own OS + // thread, touching only that coordinator's own state (no shared + // lock), so racing them is sound and collapses the worst case from + // the sum of the three backstops to ~max(...). Each drain keeps its + // OWN inner `tokio::time::timeout`, so it still yields its own + // per-coordinator `CoordinatorThreadStatus` — a single outer timeout + // around the whole join! would flatten all three to `Timeout` and + // lose that detail. + let drain_platform_address = async { tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce()) .await - .unwrap_or(CoordinatorThreadStatus::Timeout); - - let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce()) - .await - .unwrap_or(CoordinatorThreadStatus::Timeout); - + .unwrap_or(CoordinatorThreadStatus::Timeout) + }; + let drain_identity = async { + tokio::time::timeout(timeout, self.identity_sync_manager.quiesce()) + .await + .unwrap_or(CoordinatorThreadStatus::Timeout) + }; #[cfg(feature = "shielded")] - let shielded_sync = { - let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce()) + let drain_shielded = async { + tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce()) .await - .unwrap_or(CoordinatorThreadStatus::Timeout); - Some(r) + .unwrap_or(CoordinatorThreadStatus::Timeout) + }; + + #[cfg(feature = "shielded")] + let (platform_address_sync, identity_sync, shielded_sync) = { + let (p, i, s) = tokio::join!(drain_platform_address, drain_identity, drain_shielded); + (p, i, Some(s)) }; #[cfg(not(feature = "shielded"))] - let shielded_sync = None; + let (platform_address_sync, identity_sync, shielded_sync) = { + let (p, i) = 
tokio::join!(drain_platform_address, drain_identity); + (p, i, None) + }; // The event adapter is a tokio task (it sinks the coordinators' // stores), so cancel + join it last — after the loops feeding it From 8c528116a30ac0cb5173236c0fd83d12a2cabc41 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 22:22:19 +0200 Subject: [PATCH 17/29] feat(dash-async): add shared ThreadRegistry worker-lifecycle engine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Centralizes the previously-triplicated background-worker lifecycle — generation-match exit epilogue, restart reap-or-park, orphan drain — into one tested engine in the shared dash-async crate, generic over a worker key and supporting both OS-thread (`!Send` block_on loops) and tokio-task workers. Makes two confirmed bugs impossible by construction: - F1: quiesce/join paths take `&self`; the live JoinHandle stays owned by the slot and is never moved into a cancellable future's frame. A dropped/timed-out quiesce re-parks the handle into orphans (Timeout), never drop-and-detach to a clean NotRunning. - F2: any_alive() is the single liveness gate spanning live slots AND parked orphans, so store-wiping paths can refuse while a prior thread is alive. Weight-ordered shutdown drains tiers ascending, concurrently within a tier. WorkerStatus variants are byte-identical to the wallet's CoordinatorThreadStatus for a stable FFI mapping. Adds the full registry test suite (TC-001/001b/003-014, F1 shutdown-path GAP-006, compile-fail DrainHook Send check, default-config and idempotent- shutdown gaps). 22 tests + 1 doctest green. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- Cargo.lock | 2 + packages/rs-dash-async/Cargo.toml | 4 +- packages/rs-dash-async/src/lib.rs | 11 +- packages/rs-dash-async/src/registry.rs | 1257 ++++++++++++++++++++++++ 4 files changed, 1272 insertions(+), 2 deletions(-) create mode 100644 packages/rs-dash-async/src/registry.rs diff --git a/Cargo.lock b/Cargo.lock index 1faa308a83..2108bed826 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1617,8 +1617,10 @@ dependencies = [ name = "dash-async" version = "4.0.0-rc.2" dependencies = [ + "futures", "thiserror 2.0.18", "tokio", + "tokio-util", "tracing", ] diff --git a/packages/rs-dash-async/Cargo.toml b/packages/rs-dash-async/Cargo.toml index 26e2c8fdeb..69d180e568 100644 --- a/packages/rs-dash-async/Cargo.toml +++ b/packages/rs-dash-async/Cargo.toml @@ -13,6 +13,8 @@ tracing = "0.1.41" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] tokio = { version = "1.40", features = ["rt", "rt-multi-thread", "time", "net"] } +tokio-util = { version = "0.7.12" } +futures = { version = "0.3.30" } [dev-dependencies] -tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync"] } +tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync", "time"] } diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs index 3edcf00daa..1ce0820359 100644 --- a/packages/rs-dash-async/src/lib.rs +++ b/packages/rs-dash-async/src/lib.rs @@ -3,10 +3,19 @@ //! Provides [`block_on`] -- a function that bridges async futures into sync code, //! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread). //! -//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets. +//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets, +//! and [`ThreadRegistry`] — a shared lifecycle engine for background OS-thread / tokio-task +//! 
workers (start, cancel, weight-ordered quiesce + join, orphan reap). mod atomic; mod block_on; +#[cfg(not(target_arch = "wasm32"))] +mod registry; pub use atomic::AtomicFlagGuard; pub use block_on::{block_on, AsyncError}; +#[cfg(not(target_arch = "wasm32"))] +pub use registry::{ + DrainHook, RegistryKey, ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig, + WorkerStatus, DEFAULT_JOIN_BUDGET, DEFAULT_REAP_BACKSTOP, +}; diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs new file mode 100644 index 0000000000..e7cd835cd8 --- /dev/null +++ b/packages/rs-dash-async/src/registry.rs @@ -0,0 +1,1257 @@ +//! Shared lifecycle engine for background workers (`ThreadRegistry`). +//! +//! Centralizes the dangerous, previously-triplicated 80% of a background +//! worker's lifecycle — the generation-match exit epilogue, the +//! reap-or-park of a restarted worker's prior thread, and the orphan +//! drain — into one tested place, while deliberately leaving the +//! domain-specific 20% (the "is a pass in flight?" drain barrier) to the +//! consumer as a [`DrainHook`]. +//! +//! Two worker kinds are supported: +//! - [`start_thread`](ThreadRegistry::start_thread) — a dedicated OS +//! thread, for loops that `block_on` `!Send` futures internally (the +//! `!Send` value never crosses the spawn boundary; the body itself is +//! `Send`). +//! - [`start_task`](ThreadRegistry::start_task) — a tokio task, for +//! `Send` futures. +//! +//! # Why F1 and F2 cannot recur +//! +//! - **F1** (timeout-dropped quiesce detaches a live thread): every join +//! path takes `&self`; the live join handle stays owned by the slot +//! and is never moved into a cancellable future's frame. A +//! dropped/timed-out [`quiesce`](ThreadRegistry::quiesce) therefore +//! cannot drop-and-detach the handle — on timeout (or on an external +//! drop) the handle is deterministically re-parked into the orphan +//! 
list, and the slot reports [`WorkerStatus::Timeout`], never a clean +//! `NotRunning`. +//! - **F2** (store wipe races a parked prior-generation thread): +//! orphans live in the registry and [`any_alive`](ThreadRegistry::any_alive) +//! is the single liveness gate spanning live slots **and** parked +//! orphans. Every store-wiping path consults it, so a parked +//! still-live thread blocks the wipe. + +use std::collections::BTreeMap; +use std::future::Future; +use std::pin::Pin; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use futures::future::FutureExt; +use tokio::runtime::RuntimeFlavor; +use tokio_util::sync::CancellationToken; + +// --------------------------------------------------------------------- +// Key & weight +// --------------------------------------------------------------------- + +/// Worker identity. A wallet supplies a fixed enum; rs-dapi a generated +/// id. Blanket-implemented — consumers just derive the listed bounds on +/// their own key type. +pub trait RegistryKey: Copy + Ord + Eq + std::fmt::Debug + Send + Sync + 'static {} +impl RegistryKey for T {} + +/// Teardown order. Lower weights drain first; equal weights drain +/// concurrently within a tier. Default `0`. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default)] +pub struct ShutdownWeight(pub i32); + +// --------------------------------------------------------------------- +// Status +// --------------------------------------------------------------------- + +/// Terminal status of one worker. Variant set and payloads are +/// byte-identical to the wallet's `CoordinatorThreadStatus`, which is +/// constructed from this via `From` so the FFI surface stays stable. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum WorkerStatus { + /// The loop exited and its thread/task joined cleanly. + Ok, + /// A tokio task ended for a non-panic, non-clean reason (cancelled / + /// aborted at the runtime level). Carries a reason when available. 
+ /// Only the `Task` kind can produce this; an OS thread never does. + Stopped(Option), + /// The thread/task panicked; carries the best-effort panic message. + Panicked(String), + /// The managed join exceeded this worker's `join_budget`. The live + /// handle was re-parked into the orphan list — UAF-safe, non-clean. + Timeout, + /// A parked orphan was still alive after the reap grace — UAF-safe, + /// non-clean. + Detached, + /// No thread/task was running to join — never started, or already + /// joined by a prior teardown. + NotRunning, + /// Infrastructural join failure that is neither a timeout nor a + /// panic (unreachable in normal operation). + Error(String), +} + +impl WorkerStatus { + /// `true` only for a fully clean outcome: joined normally (`Ok`) or + /// never ran (`NotRunning`). + pub fn is_clean(&self) -> bool { + matches!(self, Self::Ok | Self::NotRunning) + } +} + +/// Aggregate result of [`ThreadRegistry::shutdown`]. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShutdownReport { + /// Per-worker terminal status, keyed by worker id. + pub per_worker: BTreeMap, + /// Number of parked orphans still alive at the reap deadline. + pub detached: usize, +} + +impl ShutdownReport { + /// `true` only when every per-worker status is clean and no orphan + /// survived the reap. + pub fn all_clean(&self) -> bool { + self.detached == 0 && self.per_worker.values().all(WorkerStatus::is_clean) + } +} + +// --------------------------------------------------------------------- +// Per-worker registration options +// --------------------------------------------------------------------- + +/// Async drain hook the registry awaits **before** cancelling a worker, +/// in weight order. The domain barrier (raise a `quiescing` gate, wait +/// out an in-flight pass) lives here, supplied by the consumer — the +/// registry never owns domain semantics. 
+/// +/// The captured state must be `Send + Sync`; a `!Send` capture does not +/// compile as a `DrainHook`: +/// +/// ```compile_fail +/// use std::rc::Rc; +/// use std::sync::Arc; +/// use dash_async::DrainHook; +/// let rc = Rc::new(42u32); // !Send +/// let _hook: DrainHook = +/// Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) }); +/// ``` +pub type DrainHook = + Arc Pin + Send>> + Send + Sync>; + +/// Default managed-join budget when a [`WorkerConfig`] does not override +/// it. Pinned so an accidental change surfaces in tests. +pub const DEFAULT_JOIN_BUDGET: Duration = Duration::from_secs(30); + +/// Default orphan reap backstop (start-time reap and shutdown grace). +pub const DEFAULT_REAP_BACKSTOP: Duration = Duration::from_secs(1); + +/// Per-worker registration options. +pub struct WorkerConfig { + /// Teardown tier; lower drains first, equal weights concurrently. + pub weight: ShutdownWeight, + /// Optional drain barrier awaited before cancellation. + pub drain: Option, + /// Managed-join timeout for this worker. + pub join_budget: Duration, +} + +impl Default for WorkerConfig { + fn default() -> Self { + Self { + weight: ShutdownWeight::default(), + drain: None, + join_budget: DEFAULT_JOIN_BUDGET, + } + } +} + +// --------------------------------------------------------------------- +// Internal handle + slot state +// --------------------------------------------------------------------- + +/// A live worker's join handle. Kept owned by its slot so a cancellable +/// caller can never move it into a future frame and detach it on drop. +enum WorkerHandle { + OsThread(std::thread::JoinHandle<()>), + Task(tokio::task::JoinHandle<()>), +} + +impl WorkerHandle { + fn is_finished(&self) -> bool { + match self { + WorkerHandle::OsThread(h) => h.is_finished(), + WorkerHandle::Task(h) => h.is_finished(), + } + } + + /// Classify a **finished** handle. 
Kind-dispatched (R3): an OS thread + /// yields only `Ok` / `Panicked`; a task can also yield `Stopped` + /// (cancelled / aborted at the runtime level). + fn classify(self) -> WorkerStatus { + match self { + WorkerHandle::OsThread(j) => match j.join() { + Ok(()) => WorkerStatus::Ok, + Err(payload) => WorkerStatus::Panicked(panic_message(payload)), + }, + WorkerHandle::Task(j) => match j.now_or_never() { + Some(Ok(())) => WorkerStatus::Ok, + Some(Err(e)) if e.is_panic() => { + WorkerStatus::Panicked(panic_message(e.into_panic())) + } + Some(Err(e)) => WorkerStatus::Stopped(Some(e.to_string())), + // Only ever called on a finished handle, so a finished + // task is always ready; this arm is defensive. + None => WorkerStatus::Error("task handle not ready at join".to_string()), + }, + } + } +} + +/// Best-effort extraction of a panic message (`&str` / `String` cases). +fn panic_message(payload: Box) -> String { + if let Some(s) = payload.downcast_ref::<&str>() { + (*s).to_string() + } else if let Some(s) = payload.downcast_ref::() { + s.clone() + } else { + "".to_string() + } +} + +/// One key's slot. The entry is created on first start and never removed, +/// so `generation` stays monotonic across the key's whole lifetime — a +/// parked prior-generation thread can therefore always tell that its +/// generation is stale. `cancel.is_some()` is the running indicator; +/// `handle` is the join handle, reaped by the next start or by quiesce. 
+struct SlotState { + generation: u64, + cancel: Option, + handle: Option, + weight: ShutdownWeight, + drain: Option, + join_budget: Duration, +} + +impl SlotState { + fn dormant() -> Self { + Self { + generation: 0, + cancel: None, + handle: None, + weight: ShutdownWeight::default(), + drain: None, + join_budget: DEFAULT_JOIN_BUDGET, + } + } +} + +// --------------------------------------------------------------------- +// The registry +// --------------------------------------------------------------------- + +/// Shared lifecycle engine for background workers. See the module docs. +pub struct ThreadRegistry { + slots: Mutex>, + orphans: Mutex>, + reap_backstop: Duration, +} + +impl ThreadRegistry { + /// New registry with the default reap backstop ([`DEFAULT_REAP_BACKSTOP`]). + pub fn new() -> Arc { + Self::with_reap_backstop(DEFAULT_REAP_BACKSTOP) + } + + /// New registry with an explicit orphan reap backstop (the wallet + /// uses 1s — the same grace separates "finishing" from "wedged"). + pub fn with_reap_backstop(backstop: Duration) -> Arc { + Arc::new(Self { + slots: Mutex::new(BTreeMap::new()), + orphans: Mutex::new(Vec::new()), + reap_backstop: backstop, + }) + } + + /// Start an OS-thread worker for `!Send` loops. `body` runs on a + /// fresh `std::thread` and may build and `block_on` `!Send` futures + /// internally — the `!Send` value never crosses the spawn boundary + /// (`body` itself is `Send`). Starting a key that already has a live + /// worker is a no-op; a key whose prior thread has not been reaped is + /// reaped-or-parked first (the restart-reap path). + /// + /// **Requires a multi-thread runtime**: the worker drives its loop + /// via `Handle::block_on` and needs the shared timer/IO driver. 
+ pub fn start_thread(self: &Arc, key: K, cfg: WorkerConfig, body: F) + where + F: FnOnce(CancellationToken) + Send + 'static, + { + Self::assert_multi_thread("start_thread"); + let prior = { + let mut slots = self.lock_slots(); + let slot = slots.entry(key).or_insert_with(SlotState::dormant); + if slot.cancel.is_some() { + return; + } + // Take the prior handle to reap below; bump generation and + // install the new token under this one lock so a prior + // thread's epilogue observes the post-swap generation. + let prior = slot.handle.take(); + let token = CancellationToken::new(); + slot.cancel = Some(token.clone()); + slot.generation += 1; + let my_gen = slot.generation; + slot.weight = cfg.weight; + slot.drain = cfg.drain; + slot.join_budget = cfg.join_budget; + + let reg = Arc::clone(self); + let body_token = token; + let join = std::thread::Builder::new() + .name(format!("tr-worker-{key:?}")) + .spawn(move || { + body(body_token); + reg.run_epilogue(key, my_gen); + }) + .expect("failed to spawn registry worker thread"); + // Store the handle while still under the slot lock; the guard + // is released at the end of this block, BEFORE the reap below + // (R1: store handle -> drop guard -> THEN reap-or-park). + slot.handle = Some(WorkerHandle::OsThread(join)); + prior + }; + + // The prior thread was cancellation-signalled by a preceding + // cancel(); with the slot lock released its epilogue completes + // promptly and the join lands in milliseconds. The backstop fires + // only on a genuine wedge, in which case the still-live handle is + // parked (not dropped) so teardown can account for it. + self.reap_prior_or_park(prior, key); + } + + /// Start a tokio-task worker for `Send` futures. Same restart-reap + /// semantics as [`start_thread`](Self::start_thread); does not require + /// a multi-thread runtime. 
+ pub fn start_task(self: &Arc, key: K, cfg: WorkerConfig, body: F) + where + F: FnOnce(CancellationToken) -> Fut + Send + 'static, + Fut: Future + Send + 'static, + { + let prior = { + let mut slots = self.lock_slots(); + let slot = slots.entry(key).or_insert_with(SlotState::dormant); + if slot.cancel.is_some() { + return; + } + let prior = slot.handle.take(); + let token = CancellationToken::new(); + slot.cancel = Some(token.clone()); + slot.generation += 1; + let my_gen = slot.generation; + slot.weight = cfg.weight; + slot.drain = cfg.drain; + slot.join_budget = cfg.join_budget; + + let reg = Arc::clone(self); + let body_token = token; + let join = tokio::spawn(async move { + body(body_token).await; + reg.run_epilogue(key, my_gen); + }); + slot.handle = Some(WorkerHandle::Task(join)); + prior + }; + self.reap_prior_or_park(prior, key); + } + + /// Whether a worker is currently registered and running for `key`. + pub fn is_running(&self, key: K) -> bool { + self.lock_slots() + .get(&key) + .map(|s| s.cancel.is_some()) + .unwrap_or(false) + } + + /// Signal-only cancellation of one worker (was `stop()`). + pub fn cancel(&self, key: K) { + if let Some(slot) = self.lock_slots().get_mut(&key) { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } + } + } + + /// Signal-only cancellation of every registered worker. + pub fn cancel_all(&self) { + for slot in self.lock_slots().values_mut() { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } + } + } + + /// Await this worker's drain hook, cancel it, then join within its + /// budget. The live handle is owned by the slot and is **never** moved + /// into this future's frame, so a dropped/timed-out call cannot detach + /// it; on the managed timeout — or if this future is dropped + /// mid-poll — the handle is re-parked into the orphan list. 
[F1 FIX] + pub async fn quiesce(&self, key: K) -> WorkerStatus { + // Snapshot the drain hook + budget, and bail early if nothing is + // registered for this key. + let (drain, budget) = { + let slots = self.lock_slots(); + match slots.get(&key) { + Some(s) if s.cancel.is_some() || s.handle.is_some() => { + (s.drain.clone(), s.join_budget) + } + _ => return WorkerStatus::NotRunning, + } + }; + + // R2: gate-before-cancel — fully await the drain hook before the + // cancel signal is observed. + if let Some(drain) = drain { + drain().await; + } + + // Signal-only cancel. + if let Some(slot) = self.lock_slots().get_mut(&key) { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } + } + + // Poll-join within budget. The re-park guard moves the slot's + // still-live handle into orphans if this future is dropped before + // the loop finishes — the handle is never owned by this frame. + let _repark = Repark { reg: self, key }; + let deadline = Instant::now() + budget; + loop { + enum Step { + Classify(WorkerHandle), + Park(WorkerHandle), + NotRunning, + Wait, + } + let step = { + let mut slots = self.lock_slots(); + match slots.get_mut(&key) { + None => Step::NotRunning, + Some(slot) => match slot.handle.take_if(|h| h.is_finished()) { + Some(h) => Step::Classify(h), + None if slot.handle.is_none() => Step::NotRunning, + None if Instant::now() >= deadline => { + Step::Park(slot.handle.take().expect("handle present")) + } + None => Step::Wait, + }, + } + }; + match step { + Step::Classify(h) => return h.classify(), + Step::Park(h) => { + self.lock_orphans().push(h); + return WorkerStatus::Timeout; + } + Step::NotRunning => return WorkerStatus::NotRunning, + Step::Wait => tokio::time::sleep(Duration::from_millis(5)).await, + } + } + } + + /// Is any registered worker **or** parked orphan still alive? + /// Store-wiping paths must gate on this returning `false` before + /// destroying shared state. 
[F2 FIX] + pub fn any_alive(&self) -> bool { + { + let slots = self.lock_slots(); + for slot in slots.values() { + if slot.cancel.is_some() { + return true; + } + if let Some(handle) = &slot.handle { + if !handle.is_finished() { + return true; + } + } + } + } + self.lock_orphans().iter().any(|h| !h.is_finished()) + } + + /// Reap parked orphans with a short grace; survivors are re-parked and + /// reported as [`WorkerStatus::Detached`] (idempotent retry). + pub async fn reap_orphans(&self, grace: Duration) -> WorkerStatus { + self.reap_orphans_impl(grace).await.0 + } + + /// Weight-ordered teardown: ascending tier by tier, each worker's + /// (drain-hook -> cancel -> join) run concurrently within a tier; + /// orphan reap runs last. **Requires a multi-thread runtime.** + pub async fn shutdown(&self) -> ShutdownReport { + Self::assert_multi_thread("shutdown"); + + // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in + // ascending weight order, giving the lower-first drain. + let tiers: BTreeMap> = { + let slots = self.lock_slots(); + let mut tiers: BTreeMap> = BTreeMap::new(); + for (key, slot) in slots.iter() { + tiers.entry(slot.weight).or_default().push(*key); + } + tiers + }; + + let mut per_worker = BTreeMap::new(); + for (_weight, keys) in tiers { + // Drain every worker in this tier concurrently: each + // quiesce() drives its own drain-hook -> cancel -> join, and + // `join_all` polls them on one task so their drain hooks + // interleave (equal-weight concurrency). + let drained = keys.into_iter().map(|key| async move { (key, self.quiesce(key).await) }); + for (key, status) in futures::future::join_all(drained).await { + per_worker.insert(key, status); + } + } + + // Account for parked orphans last. 
+ let (_status, detached) = self.reap_orphans_impl(self.reap_backstop).await; + ShutdownReport { + per_worker, + detached, + } + } + + // ----------------------------------------------------------------- + // Internal helpers + // ----------------------------------------------------------------- + + fn lock_slots(&self) -> std::sync::MutexGuard<'_, BTreeMap> { + self.slots.lock().unwrap_or_else(|e| e.into_inner()) + } + + fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec> { + self.orphans.lock().unwrap_or_else(|e| e.into_inner()) + } + + fn assert_multi_thread(ctx: &str) { + assert!( + matches!( + tokio::runtime::Handle::current().runtime_flavor(), + RuntimeFlavor::MultiThread + ), + "ThreadRegistry::{ctx}() requires a multi-thread Tokio runtime: an \ + OS-thread worker drives its loop via Handle::block_on and needs the \ + runtime's timer/IO driver, but a current_thread runtime can only \ + drive one block_on at a time" + ); + } + + /// Gen-gated exit epilogue, run on the worker after its body returns: + /// clear this slot's running flag only if a newer start has not since + /// installed a replacement. + fn run_epilogue(&self, key: K, my_gen: u64) { + if let Some(slot) = self.lock_slots().get_mut(&key) { + if slot.generation == my_gen { + slot.cancel = None; + } + } + } + + /// Reap a restarted key's prior worker — or park it if it is genuinely + /// wedged past the reap backstop. Must be called with no registry lock + /// held (it spins synchronously for an OS thread). 
+ fn reap_prior_or_park(&self, prior: Option, key: K) { + let Some(handle) = prior else { + return; + }; + match handle { + WorkerHandle::OsThread(h) => { + let deadline = Instant::now() + self.reap_backstop; + loop { + if h.is_finished() { + let _ = h.join(); + return; + } + if Instant::now() >= deadline { + tracing::warn!( + ?key, + backstop = ?self.reap_backstop, + "prior worker thread did not finish within the reap \ + backstop after cancellation; parking it as an orphan \ + for teardown to join rather than detaching it" + ); + self.lock_orphans().push(WorkerHandle::OsThread(h)); + return; + } + std::thread::sleep(Duration::from_millis(5)); + } + } + // A task can't be joined synchronously here; park a still-live + // one for async reap. A finished one is dropped (detaching a + // finished task is a no-op). + task => { + if !task.is_finished() { + self.lock_orphans().push(task); + } + } + } + } + + /// Drain the orphan list, polling until `grace`. Returns the terminal + /// status and the number of survivors re-parked for an idempotent + /// retry. + async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) { + let mut pending: Vec = { + let mut guard = self.lock_orphans(); + std::mem::take(&mut *guard) + }; + if pending.is_empty() { + return (WorkerStatus::Ok, 0); + } + + let deadline = Instant::now() + grace; + // Keep the first non-clean terminal status; a live survivor still + // takes precedence at the deadline. + let mut non_clean: Option = None; + loop { + let mut still_live = Vec::with_capacity(pending.len()); + for handle in pending.drain(..) 
{ + if handle.is_finished() { + let status = handle.classify(); + if !status.is_clean() { + non_clean.get_or_insert(status); + } + } else { + still_live.push(handle); + } + } + pending = still_live; + + if pending.is_empty() { + return (non_clean.unwrap_or(WorkerStatus::Ok), 0); + } + if Instant::now() >= deadline { + let survivors = pending.len(); + self.lock_orphans().extend(pending); + return (WorkerStatus::Detached, survivors); + } + tokio::time::sleep(Duration::from_millis(5)).await; + } + } + + /// Test-only seam: park a raw thread handle as an orphan. Used by + /// cross-crate regression tests (e.g. the wallet's F2 gate) that must + /// inject a wedged prior-generation thread without driving the full + /// restart-reap path. + #[doc(hidden)] + pub fn park_orphan_for_test(&self, handle: std::thread::JoinHandle<()>) { + self.lock_orphans().push(WorkerHandle::OsThread(handle)); + } +} + +/// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future +/// is dropped before it finishes (e.g. an outer timeout fires), this moves +/// the slot's still-live handle into the orphan list instead of letting it +/// be dropped-and-detached. On normal completion the handle has already +/// been taken from the slot, so this is a no-op. +struct Repark<'a, K: RegistryKey> { + reg: &'a ThreadRegistry, + key: K, +} + +impl Drop for Repark<'_, K> { + fn drop(&mut self) { + // Take the handle under the slot lock, release it, then push to + // orphans — never nest the two locks. 
+ let handle = self + .reg + .lock_slots() + .get_mut(&self.key) + .and_then(|slot| slot.handle.take()); + if let Some(handle) = handle { + self.reg.lock_orphans().push(handle); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::panic::{catch_unwind, AssertUnwindSafe}; + use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::mpsc; + use tokio::runtime::{Builder, Handle}; + use tokio::sync::Barrier; + + type Reg = Arc>; + + /// Start an OS-thread worker that exits cleanly when cancelled. The + /// runtime handle is captured from the caller's context (the worker + /// thread is not itself a tokio worker, so it can't fetch its own). + fn start_clean(reg: &Reg, key: &'static str, cfg: WorkerConfig) { + let handle = Handle::current(); + reg.start_thread(key, cfg, move |cancel| { + handle.block_on(async move { cancel.cancelled().await }); + }); + } + + /// Body for a worker wedged in a non-yielding section: blocks on a + /// channel and ignores its cancellation token (stands in for a thread + /// stuck in a `Drop` that never observes cancel). + fn wedged_body(rx: mpsc::Receiver<()>) -> impl FnOnce(CancellationToken) + Send + 'static { + move |_cancel| { + let _ = rx.recv(); + } + } + + fn orphan_len(reg: &Reg) -> usize { + reg.lock_orphans().len() + } + + // ----- Group 1: F1 regression ------------------------------------- + + /// TC-001 — a `quiesce` whose outer future is dropped (a tiny enclosing + /// timeout) must re-park the live handle, never drop-and-detach it. The + /// slot is cleared (`is_running == false`) but the handle lives in + /// orphans and `any_alive()` stays true. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc001_quiesce_drop_reparks_handle_not_detach() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + reg.start_thread("alpha", WorkerConfig::default(), wedged_body(release_rx)); + assert!(reg.is_running("alpha")); + + // The wedged worker never observes cancel, so the internal 30s + // budget can't fire here; the tiny outer timeout drops the quiesce + // future mid-poll. A naive by-value-into-future impl would detach + // the handle (orphans empty, any_alive false); the fix re-parks it. + let result = + tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await; + assert!(result.is_err(), "outer timeout must fire on the wedged worker"); + + assert!(reg.any_alive(), "re-parked handle keeps any_alive true"); + assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)"); + assert_eq!(orphan_len(®), 1, "handle was re-parked, not detached"); + assert!(!WorkerStatus::Timeout.is_clean()); + + // Release + reap: the orphan joins cleanly and liveness clears. + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert!(!reg.any_alive()); + } + + /// TC-001b — internal-budget variant: a wedged worker with a tiny + /// `join_budget` makes `quiesce` itself time out, re-park, and return + /// `Timeout` (no outer drop involved). 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc001b_quiesce_internal_budget_timeout_reparks() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let cfg = WorkerConfig { + join_budget: Duration::from_millis(50), + ..WorkerConfig::default() + }; + reg.start_thread("alpha", cfg, wedged_body(release_rx)); + + let status = reg.quiesce("alpha").await; + assert_eq!(status, WorkerStatus::Timeout); + assert_eq!(orphan_len(®), 1); + assert!(reg.any_alive()); + assert!(!reg.is_running("alpha")); + + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert!(!reg.any_alive()); + } + + /// GAP-006 — the F1 scenario via the `shutdown()` path: a wedged worker + /// with a tiny budget surfaces as `Timeout` in the report, its handle + /// is re-parked (`detached == 1`, `any_alive`), and the result is + /// non-clean — never a clean detach. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn gap006_shutdown_path_reparks_wedged_worker() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let cfg = WorkerConfig { + join_budget: Duration::from_millis(50), + ..WorkerConfig::default() + }; + reg.start_thread("alpha", cfg, wedged_body(release_rx)); + + let report = tokio::time::timeout(Duration::from_secs(10), reg.shutdown()) + .await + .expect("shutdown must complete within bound"); + assert_eq!(report.per_worker.get("alpha"), Some(&WorkerStatus::Timeout)); + assert_eq!(report.detached, 1, "wedged handle re-parked, survived reap"); + assert!(!report.all_clean()); + assert!(reg.any_alive()); + + // Cleanup. 
+ release_tx.send(()).unwrap(); + let _ = reg.reap_orphans(Duration::from_secs(5)).await; + assert!(!reg.any_alive()); + } + + // ----- Group 3: registry unit suite ------------------------------- + + /// TC-003 — a slow prior-generation thread's epilogue must NOT clear a + /// newer generation's token. Restarting reaps the prior generation + /// fully (its epilogue runs); the new generation stays tracked. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc003_generation_match_epilogue_preserves_new_token() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "beta", WorkerConfig::default()); // gen 1 + assert!(reg.is_running("beta")); + + // Cancel gen 1, then restart. start_thread's reap joins gen 1 + // (running its gen-gated epilogue) before returning, so this is + // deterministic: if the epilogue ignored generation it would have + // cleared gen 2's token during that join. + reg.cancel("beta"); + start_clean(®, "beta", WorkerConfig::default()); // gen 2 + + assert!( + reg.is_running("beta"), + "gen-2 token must survive gen-1's epilogue" + ); + assert_eq!(reg.quiesce("beta").await, WorkerStatus::Ok); + } + + /// TC-004 — a naturally-finished prior thread is joined cleanly on + /// restart, with no parking. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc004_restart_reaps_finished_prior_without_parking() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "gamma", WorkerConfig::default()); + // Cancel so the prior exits, then restart: the reap must join it, + // not park it. + reg.cancel("gamma"); + start_clean(®, "gamma", WorkerConfig::default()); + assert_eq!(orphan_len(®), 0, "finished prior was joined, not parked"); + assert!(reg.is_running("gamma")); + assert_eq!(reg.quiesce("gamma").await, WorkerStatus::Ok); + } + + /// TC-005 — a prior thread wedged past the reap backstop is parked in + /// orphans (not dropped), then drained after release. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc005_restart_parks_wedged_prior() { + let reg = ThreadRegistry::with_reap_backstop(Duration::from_millis(100)); + let (release_tx, release_rx) = mpsc::channel::<()>(); + + // gen 1: wedged (ignores cancel). + reg.start_thread("delta", WorkerConfig::default(), wedged_body(release_rx)); + reg.cancel("delta"); + + // gen 2: clean. The restart reaps gen 1 — wedged past the 100ms + // backstop, so it is parked. Run off the runtime workers since the + // reap spins synchronously. + let reg_for_start = Arc::clone(®); + let parent = Handle::current(); + tokio::task::spawn_blocking(move || { + let handle = parent.clone(); + reg_for_start.start_thread("delta", WorkerConfig::default(), move |cancel| { + handle.block_on(async move { cancel.cancelled().await }); + }); + }) + .await + .unwrap(); + + assert_eq!(orphan_len(®), 1, "wedged prior parked, not dropped"); + assert!(reg.any_alive()); + assert!(reg.is_running("delta"), "gen-2 loop started"); + + // Release the wedged prior; reap drains it. + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert_eq!(orphan_len(®), 0); + + // Cleanup gen 2. + assert_eq!(reg.quiesce("delta").await, WorkerStatus::Ok); + } + + /// TC-006 — orphan drain: a survivor at the grace deadline is reported + /// `Detached` and re-parked; once released it reaps `Ok`. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc006_orphan_drain_detached_then_ok() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let wedged = std::thread::spawn(move || { + let _ = release_rx.recv(); + }); + reg.park_orphan_for_test(wedged); + + assert_eq!( + reg.reap_orphans(Duration::from_millis(50)).await, + WorkerStatus::Detached + ); + assert_eq!(orphan_len(®), 1, "survivor re-parked for retry"); + assert!(reg.any_alive()); + + release_tx.send(()).unwrap(); + assert_eq!( + reg.reap_orphans(Duration::from_secs(2)).await, + WorkerStatus::Ok + ); + assert_eq!(orphan_len(®), 0); + assert!(!reg.any_alive()); + } + + /// TC-007 — weight-ordered shutdown drains a lower tier before a higher + /// one. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc007_weight_ordered_shutdown_drains_low_first() { + let reg = ThreadRegistry::<&str>::new(); + let log = Arc::new(Mutex::new(Vec::<&'static str>::new())); + + let mk_hook = |tag: &'static str, log: Arc>>| -> DrainHook { + Arc::new(move || { + let log = Arc::clone(&log); + Box::pin(async move { + log.lock().unwrap().push(tag); + }) + }) + }; + + start_clean( + ®, + "w0", + WorkerConfig { + weight: ShutdownWeight(0), + drain: Some(mk_hook("w0", Arc::clone(&log))), + ..WorkerConfig::default() + }, + ); + start_clean( + ®, + "w5", + WorkerConfig { + weight: ShutdownWeight(5), + drain: Some(mk_hook("w5", Arc::clone(&log))), + ..WorkerConfig::default() + }, + ); + start_clean( + ®, + "w10", + WorkerConfig { + weight: ShutdownWeight(10), + drain: Some(mk_hook("w10", Arc::clone(&log))), + ..WorkerConfig::default() + }, + ); + + let report = reg.shutdown().await; + assert!(report.all_clean()); + + let log = log.lock().unwrap(); + let pos = |tag| log.iter().position(|t| *t == tag).unwrap(); + assert!(pos("w0") < pos("w5")); + assert!(pos("w5") < pos("w10")); + } + + /// TC-008 — equal-weight workers drain concurrently. 
A shared + /// `Barrier(2)` in both drain hooks would deadlock under sequential + /// draining (caught by the enclosing timeout); the event log proves + /// both arrived before either passed. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc008_equal_weight_drains_concurrently() { + let reg = ThreadRegistry::<&str>::new(); + let log = Arc::new(Mutex::new(Vec::<&'static str>::new())); + let barrier = Arc::new(Barrier::new(2)); + + let mk_hook = |arrived: &'static str, + passed: &'static str, + log: Arc>>, + barrier: Arc| + -> DrainHook { + Arc::new(move || { + let log = Arc::clone(&log); + let barrier = Arc::clone(&barrier); + Box::pin(async move { + log.lock().unwrap().push(arrived); + barrier.wait().await; + log.lock().unwrap().push(passed); + }) + }) + }; + + start_clean( + ®, + "a", + WorkerConfig { + weight: ShutdownWeight(0), + drain: Some(mk_hook("a_arrived", "a_passed", Arc::clone(&log), Arc::clone(&barrier))), + ..WorkerConfig::default() + }, + ); + start_clean( + ®, + "b", + WorkerConfig { + weight: ShutdownWeight(0), + drain: Some(mk_hook("b_arrived", "b_passed", Arc::clone(&log), Arc::clone(&barrier))), + ..WorkerConfig::default() + }, + ); + + let report = tokio::time::timeout(Duration::from_secs(5), reg.shutdown()) + .await + .expect("equal-weight drain must not deadlock (proves concurrency)"); + assert!(report.all_clean()); + + let log = log.lock().unwrap(); + let pos = |tag| log.iter().position(|t| *t == tag).unwrap(); + let last_arrived = pos("a_arrived").max(pos("b_arrived")); + let first_passed = pos("a_passed").min(pos("b_passed")); + assert!( + last_arrived < first_passed, + "both hooks must reach the barrier before either passes: {log:?}" + ); + } + + /// TC-009 — `any_alive()` accounts for both live slots and orphans. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc009_any_alive_spans_slots_and_orphans() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "alpha", WorkerConfig::default()); + assert!(reg.any_alive()); + + let (release_tx, release_rx) = mpsc::channel::<()>(); + let wedged = std::thread::spawn(move || { + let _ = release_rx.recv(); + }); + reg.park_orphan_for_test(wedged); + assert!(reg.any_alive()); + + assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok); + assert!(reg.any_alive(), "orphan still contributes after slot drains"); + assert!(!reg.is_running("alpha")); + + release_tx.send(()).unwrap(); + let _ = reg.reap_orphans(Duration::from_secs(2)).await; + assert!(!reg.any_alive()); + } + + /// TC-010 — `shutdown()` panics with a documented message on a + /// current-thread runtime (R4, variant B). + #[test] + fn tc010_shutdown_asserts_multi_thread_runtime() { + let rt = Builder::new_current_thread().enable_all().build().unwrap(); + let reg = ThreadRegistry::<&str>::new(); + let result = catch_unwind(AssertUnwindSafe(|| { + rt.block_on(async { reg.shutdown().await }); + })); + let payload = result.expect_err("shutdown must panic on current_thread"); + let msg = payload + .downcast_ref::() + .map(String::as_str) + .or_else(|| payload.downcast_ref::<&str>().copied()) + .unwrap_or(""); + assert!( + msg.contains("multi-thread"), + "panic must name the runtime constraint, got: {msg}" + ); + } + + // ----- Group 4: DrainHook ordering -------------------------------- + + /// TC-011 — the drain hook is fully awaited before the cancel signal is + /// observed by the worker. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc011_drain_hook_completes_before_cancel() { + let reg = ThreadRegistry::<&str>::new(); + let log = Arc::new(Mutex::new(Vec::<&'static str>::new())); + + let log_hook = Arc::clone(&log); + let drain: DrainHook = Arc::new(move || { + let log = Arc::clone(&log_hook); + Box::pin(async move { + log.lock().unwrap().push("drain_hook_start"); + tokio::time::sleep(Duration::from_millis(10)).await; + log.lock().unwrap().push("drain_hook_complete"); + }) + }); + + let log_worker = Arc::clone(&log); + let handle = Handle::current(); + reg.start_thread( + "epsilon", + WorkerConfig { + drain: Some(drain), + ..WorkerConfig::default() + }, + move |cancel| { + handle.block_on(async move { + cancel.cancelled().await; + log_worker.lock().unwrap().push("cancel_observed"); + }); + }, + ); + + assert_eq!(reg.quiesce("epsilon").await, WorkerStatus::Ok); + assert!(!reg.is_running("epsilon")); + + let log = log.lock().unwrap(); + let pos = |tag| log.iter().position(|t| *t == tag).unwrap(); + assert!(pos("drain_hook_start") < pos("drain_hook_complete")); + assert!(pos("drain_hook_complete") < pos("cancel_observed")); + } + + /// TC-012 — a `quiesce` blocks in the drain hook until an `is_syncing` + /// barrier the hook polls falls, and only then cancels + joins. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc012_drain_hook_observes_barrier_before_join() { + let reg = ThreadRegistry::<&str>::new(); + let is_syncing = Arc::new(AtomicBool::new(true)); + + let gate = Arc::clone(&is_syncing); + let drain: DrainHook = Arc::new(move || { + let gate = Arc::clone(&gate); + Box::pin(async move { + while gate.load(Ordering::Acquire) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + }) + }); + start_clean( + ®, + "zeta", + WorkerConfig { + drain: Some(drain), + ..WorkerConfig::default() + }, + ); + + let quiesce_completed = Arc::new(AtomicBool::new(false)); + let reg_q = Arc::clone(®); + let done = Arc::clone(&quiesce_completed); + let quiesce_task = tokio::spawn(async move { + let status = reg_q.quiesce("zeta").await; + done.store(true, Ordering::Release); + status + }); + + // While the barrier is held, quiesce must stay pending. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !quiesce_completed.load(Ordering::Acquire), + "quiesce must block while is_syncing is held" + ); + + // Release the barrier; quiesce drains, cancels, joins. + is_syncing.store(false, Ordering::Release); + let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task) + .await + .expect("quiesce must complete once the barrier falls") + .unwrap(); + assert_eq!(status, WorkerStatus::Ok); + assert!(quiesce_completed.load(Ordering::Acquire)); + } + + // ----- Group 5: status classification ----------------------------- + + /// TC-013 — only the `Task` kind can classify as `Stopped` (from a + /// runtime-level cancel/abort JoinError); a cooperatively token- + /// cancelled task exits normally as `Ok`. Verifies the kind-dispatch + /// at the classification boundary. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc013_task_kind_classifies_stopped_and_ok() { + // Stopped: an aborted task yields a cancelled JoinError. 
+ let aborted = tokio::spawn(std::future::pending::<()>()); + aborted.abort(); + while !aborted.is_finished() { + tokio::time::sleep(Duration::from_millis(1)).await; + } + let status = WorkerHandle::Task(aborted).classify(); + assert!(matches!(status, WorkerStatus::Stopped(_)), "got {status:?}"); + assert!(!status.is_clean()); + + // Ok: a cooperatively token-cancelled task returns normally. + let reg = ThreadRegistry::<&str>::new(); + reg.start_task("task_a", WorkerConfig::default(), |cancel| async move { + cancel.cancelled().await; + }); + assert_eq!(reg.quiesce("task_a").await, WorkerStatus::Ok); + assert!(!reg.is_running("task_a")); + } + + /// TC-014 — an `OsThread` worker yields `Ok` (clean) or `Panicked` + /// (`&str` and `String` payloads), never `Stopped`. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn tc014_os_thread_ok_and_panicked_never_stopped() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "os_clean", WorkerConfig::default()); + let ok = reg.quiesce("os_clean").await; + assert_eq!(ok, WorkerStatus::Ok); + assert!(ok.is_clean()); + + // &str panic payload. + reg.start_thread("os_panic_str", WorkerConfig::default(), |_cancel| { + panic!("deliberate test panic"); + }); + match reg.quiesce("os_panic_str").await { + WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate test panic")), + other => panic!("expected Panicked, got {other:?}"), + } + + // String panic payload. 
+ reg.start_thread("os_panic_string", WorkerConfig::default(), |_cancel| { + std::panic::panic_any(String::from("deliberate string panic")); + }); + match reg.quiesce("os_panic_string").await { + WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate string panic")), + other => panic!("expected Panicked, got {other:?}"), + } + } + + // ----- Gaps ------------------------------------------------------- + + /// GAP-003 — `shutdown()` is idempotent: a second call finds every slot + /// already joined and reports `NotRunning`, still clean. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn gap003_shutdown_is_idempotent() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "alpha", WorkerConfig::default()); + + let first = reg.shutdown().await; + assert_eq!(first.per_worker.get("alpha"), Some(&WorkerStatus::Ok)); + assert!(first.all_clean()); + + let second = reg.shutdown().await; + assert_eq!( + second.per_worker.get("alpha"), + Some(&WorkerStatus::NotRunning) + ); + assert!(second.all_clean()); + } + + /// GAP-004 — `cancel(key)` is selective: cancelling A does not touch B. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn gap004_cancel_is_selective() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(®, "a", WorkerConfig::default()); + start_clean(®, "b", WorkerConfig::default()); + + reg.cancel("a"); + assert!(reg.is_running("b"), "cancel(a) must not cancel b"); + assert_eq!(reg.quiesce("a").await, WorkerStatus::Ok); + assert!(reg.is_running("b"), "b still running after a drains"); + assert_eq!(reg.quiesce("b").await, WorkerStatus::Ok); + } + + /// GAP-005 — `WorkerConfig::default()` values are pinned. 
+ #[test] + fn gap005_worker_config_defaults_pinned() { + let cfg = WorkerConfig::default(); + assert_eq!(cfg.weight, ShutdownWeight(0)); + assert!(cfg.drain.is_none()); + assert_eq!(cfg.join_budget, DEFAULT_JOIN_BUDGET); + } +} From ac9a51a7c70f25cc307ef076cb0e9498c6a67f9b Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 22:27:04 +0200 Subject: [PATCH 18/29] feat(dash-async): key-scope parked orphans for any_alive_for() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tag each parked orphan with its originating worker key and add any_alive_for(key), so a store-wiping path scoped to one worker (the wallet's clear_shielded F2 gate) can refuse only while that worker — its slot or a parked prior-generation thread under its key — is alive, without being blocked by unrelated workers that are legitimately running (e.g. the always-on event-adapter task). Registry-wide any_alive() is retained. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-dash-async/src/registry.rs | 108 ++++++++++++++++++------- 1 file changed, 81 insertions(+), 27 deletions(-) diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs index e7cd835cd8..d04d8cbbbc 100644 --- a/packages/rs-dash-async/src/registry.rs +++ b/packages/rs-dash-async/src/registry.rs @@ -247,9 +247,13 @@ impl SlotState { // --------------------------------------------------------------------- /// Shared lifecycle engine for background workers. See the module docs. +/// +/// Parked orphans carry their originating key so a store-wiping path for +/// one worker can gate on [`any_alive_for`](Self::any_alive_for) without +/// being blocked by an unrelated worker still legitimately running. 
pub struct ThreadRegistry { slots: Mutex>, - orphans: Mutex>, + orphans: Mutex>, reap_backstop: Duration, } @@ -446,7 +450,7 @@ impl ThreadRegistry { match step { Step::Classify(h) => return h.classify(), Step::Park(h) => { - self.lock_orphans().push(h); + self.lock_orphans().push((key, h)); return WorkerStatus::Timeout; } Step::NotRunning => return WorkerStatus::NotRunning, @@ -455,24 +459,34 @@ impl ThreadRegistry { } } - /// Is any registered worker **or** parked orphan still alive? - /// Store-wiping paths must gate on this returning `false` before - /// destroying shared state. [F2 FIX] + /// Is any registered worker **or** parked orphan still alive across + /// the whole registry? pub fn any_alive(&self) -> bool { { let slots = self.lock_slots(); for slot in slots.values() { - if slot.cancel.is_some() { + if slot_alive(slot) { return true; } - if let Some(handle) = &slot.handle { - if !handle.is_finished() { - return true; - } - } } } - self.lock_orphans().iter().any(|h| !h.is_finished()) + self.lock_orphans().iter().any(|(_, h)| !h.is_finished()) + } + + /// Is the worker for `key` — its live slot **or** any orphan parked + /// under that key — still alive? A store-wiping path scoped to one + /// worker must gate on this (rather than the registry-wide + /// [`any_alive`](Self::any_alive)) so an unrelated worker that is + /// legitimately running does not block the wipe. 
[F2 FIX] + pub fn any_alive_for(&self, key: K) -> bool { + if let Some(slot) = self.lock_slots().get(&key) { + if slot_alive(slot) { + return true; + } + } + self.lock_orphans() + .iter() + .any(|(k, h)| *k == key && !h.is_finished()) } /// Reap parked orphans with a short grace; survivors are re-parked and @@ -526,7 +540,7 @@ impl ThreadRegistry { self.slots.lock().unwrap_or_else(|e| e.into_inner()) } - fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec> { + fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<(K, WorkerHandle)>> { self.orphans.lock().unwrap_or_else(|e| e.into_inner()) } @@ -577,7 +591,7 @@ impl ThreadRegistry { backstop after cancellation; parking it as an orphan \ for teardown to join rather than detaching it" ); - self.lock_orphans().push(WorkerHandle::OsThread(h)); + self.lock_orphans().push((key, WorkerHandle::OsThread(h))); return; } std::thread::sleep(Duration::from_millis(5)); @@ -588,7 +602,7 @@ impl ThreadRegistry { // finished task is a no-op). task => { if !task.is_finished() { - self.lock_orphans().push(task); + self.lock_orphans().push((key, task)); } } } @@ -598,7 +612,7 @@ impl ThreadRegistry { /// status and the number of survivors re-parked for an idempotent /// retry. async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) { - let mut pending: Vec = { + let mut pending: Vec<(K, WorkerHandle)> = { let mut guard = self.lock_orphans(); std::mem::take(&mut *guard) }; @@ -612,14 +626,14 @@ impl ThreadRegistry { let mut non_clean: Option = None; loop { let mut still_live = Vec::with_capacity(pending.len()); - for handle in pending.drain(..) { + for (key, handle) in pending.drain(..) { if handle.is_finished() { let status = handle.classify(); if !status.is_clean() { non_clean.get_or_insert(status); } } else { - still_live.push(handle); + still_live.push((key, handle)); } } pending = still_live; @@ -636,16 +650,21 @@ impl ThreadRegistry { } } - /// Test-only seam: park a raw thread handle as an orphan. 
Used by - /// cross-crate regression tests (e.g. the wallet's F2 gate) that must - /// inject a wedged prior-generation thread without driving the full - /// restart-reap path. + /// Test-only seam: park a raw thread handle as an orphan under `key`. + /// Used by cross-crate regression tests (e.g. the wallet's F2 gate) + /// that must inject a wedged prior-generation thread without driving + /// the full restart-reap path. #[doc(hidden)] - pub fn park_orphan_for_test(&self, handle: std::thread::JoinHandle<()>) { - self.lock_orphans().push(WorkerHandle::OsThread(handle)); + pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) { + self.lock_orphans().push((key, WorkerHandle::OsThread(handle))); } } +/// `true` if a slot is running or holds an unfinished handle. +fn slot_alive(slot: &SlotState) -> bool { + slot.cancel.is_some() || slot.handle.as_ref().is_some_and(|h| !h.is_finished()) +} + /// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future /// is dropped before it finishes (e.g. 
an outer timeout fires), this moves /// the slot's still-live handle into the orphan list instead of letting it @@ -666,7 +685,7 @@ impl Drop for Repark<'_, K> { .get_mut(&self.key) .and_then(|slot| slot.handle.take()); if let Some(handle) = handle { - self.reg.lock_orphans().push(handle); + self.reg.lock_orphans().push((self.key, handle)); } } } @@ -885,7 +904,7 @@ mod tests { let wedged = std::thread::spawn(move || { let _ = release_rx.recv(); }); - reg.park_orphan_for_test(wedged); + reg.park_orphan_for_test("orphan", wedged); assert_eq!( reg.reap_orphans(Duration::from_millis(50)).await, @@ -1027,7 +1046,7 @@ mod tests { let wedged = std::thread::spawn(move || { let _ = release_rx.recv(); }); - reg.park_orphan_for_test(wedged); + reg.park_orphan_for_test("orphan", wedged); assert!(reg.any_alive()); assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok); @@ -1039,6 +1058,41 @@ mod tests { assert!(!reg.any_alive()); } + /// `any_alive_for(key)` is scoped: an orphan parked under one key does + /// not make a different key look alive (the F2 gate must not be + /// blocked by unrelated workers). + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn any_alive_for_is_key_scoped() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + let wedged = std::thread::spawn(move || { + let _ = release_rx.recv(); + }); + reg.park_orphan_for_test("shielded", wedged); + + // A live, unrelated worker. + start_clean(®, "identity", WorkerConfig::default()); + + assert!(reg.any_alive(), "registry-wide liveness sees both"); + assert!(reg.any_alive_for("shielded"), "shielded orphan is alive"); + assert!( + !reg.any_alive_for("address"), + "an unrelated key with no slot/orphan is not alive" + ); + + // The running 'identity' worker must not make 'shielded' look alive + // beyond its own orphan, and vice versa. 
+ assert!(reg.any_alive_for("identity"), "running identity is alive"); + + release_tx.send(()).unwrap(); + let _ = reg.reap_orphans(Duration::from_secs(2)).await; + assert!( + !reg.any_alive_for("shielded"), + "shielded clear once its orphan is reaped" + ); + assert_eq!(reg.quiesce("identity").await, WorkerStatus::Ok); + } + /// TC-010 — `shutdown()` panics with a documented message on a /// current-thread runtime (R4, variant B). #[test] From d20aed0027929efc32c6a33ddff9461dc812b8e5 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 22:55:04 +0200 Subject: [PATCH 19/29] refactor(platform-wallet): migrate sync coordinators onto shared ThreadRegistry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the triplicated per-coordinator lifecycle machinery (background_cancel/join/generation mutexes, the cloned coordinator_orphans list, and the mod.rs free fns join_coordinator_thread / reap_prior_or_park / join_detached_orphans / panic_message) with the shared dash-async ThreadRegistry. The manager holds one Arc>; each coordinator's start/stop/ is_running/quiesce now delegate to it under a fixed WalletWorker key, and each exposes its quiescing-gate raise as a registry DrainHook. The wallet-event adapter becomes a registry start_task worker (weight 10, draining after the coordinators it sinks at weight 0). Fixes the two confirmed bugs structurally: - F1: shutdown() is now CoordinatorExitStatus::from_report(registry .shutdown()); each worker's join is bounded by its own join_budget inside the registry, where the live handle stays owned by the slot. A dropped/timed-out join can no longer detach a live thread — it re-parks to Timeout. 
- F2: clear_shielded() gates the store wipe on registry.any_alive_for(ShieldedSync) (shielded-scoped, so the always-on event adapter and the other coordinators running normally do not block Clear), refusing while a prior-generation shielded thread is parked alive. CoordinatorThreadStatus / CoordinatorExitStatus / all_clean() are byte-stable (FFI destroy maps !all_clean -> ErrorShutdownIncomplete); WorkerStatus maps onto them 1:1 via From. The three wall-clock restart-reap regression tests and the relocated free-fn tests are deleted (subsumed by the dash-async registry suite); F2 (clear_shielded) and R5 (from_report) gain wallet-level tests. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-dash-async/src/registry.rs | 37 +- .../src/changeset/core_bridge.rs | 122 ++- .../rs-platform-wallet/src/changeset/mod.rs | 2 +- .../src/manager/identity_sync.rs | 319 ++----- .../rs-platform-wallet/src/manager/mod.rs | 898 ++++++------------ .../src/manager/platform_address_sync.rs | 304 +----- .../src/manager/shielded_sync.rs | 297 +----- 7 files changed, 537 insertions(+), 1442 deletions(-) diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs index d04d8cbbbc..802ca3598c 100644 --- a/packages/rs-dash-async/src/registry.rs +++ b/packages/rs-dash-async/src/registry.rs @@ -132,8 +132,7 @@ impl ShutdownReport { /// let _hook: DrainHook = /// Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) }); /// ``` -pub type DrainHook = - Arc Pin + Send>> + Send + Sync>; +pub type DrainHook = Arc Pin + Send>> + Send + Sync>; /// Default managed-join budget when a [`WorkerConfig`] does not override /// it. Pinned so an accidental change surfaces in tests. 
@@ -518,7 +517,9 @@ impl ThreadRegistry { // quiesce() drives its own drain-hook -> cancel -> join, and // `join_all` polls them on one task so their drain hooks // interleave (equal-weight concurrency). - let drained = keys.into_iter().map(|key| async move { (key, self.quiesce(key).await) }); + let drained = keys + .into_iter() + .map(|key| async move { (key, self.quiesce(key).await) }); for (key, status) in futures::future::join_all(drained).await { per_worker.insert(key, status); } @@ -656,7 +657,8 @@ impl ThreadRegistry { /// the full restart-reap path. #[doc(hidden)] pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) { - self.lock_orphans().push((key, WorkerHandle::OsThread(handle))); + self.lock_orphans() + .push((key, WorkerHandle::OsThread(handle))); } } @@ -741,9 +743,11 @@ mod tests { // budget can't fire here; the tiny outer timeout drops the quiesce // future mid-poll. A naive by-value-into-future impl would detach // the handle (orphans empty, any_alive false); the fix re-parks it. 
- let result = - tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await; - assert!(result.is_err(), "outer timeout must fire on the wedged worker"); + let result = tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await; + assert!( + result.is_err(), + "outer timeout must fire on the wedged worker" + ); assert!(reg.any_alive(), "re-parked handle keeps any_alive true"); assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)"); @@ -1006,7 +1010,12 @@ mod tests { "a", WorkerConfig { weight: ShutdownWeight(0), - drain: Some(mk_hook("a_arrived", "a_passed", Arc::clone(&log), Arc::clone(&barrier))), + drain: Some(mk_hook( + "a_arrived", + "a_passed", + Arc::clone(&log), + Arc::clone(&barrier), + )), ..WorkerConfig::default() }, ); @@ -1015,7 +1024,12 @@ mod tests { "b", WorkerConfig { weight: ShutdownWeight(0), - drain: Some(mk_hook("b_arrived", "b_passed", Arc::clone(&log), Arc::clone(&barrier))), + drain: Some(mk_hook( + "b_arrived", + "b_passed", + Arc::clone(&log), + Arc::clone(&barrier), + )), ..WorkerConfig::default() }, ); @@ -1050,7 +1064,10 @@ mod tests { assert!(reg.any_alive()); assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok); - assert!(reg.any_alive(), "orphan still contributes after slot drains"); + assert!( + reg.any_alive(), + "orphan still contributes after slot drains" + ); assert!(!reg.is_running("alpha")); release_tx.send(()).unwrap(); diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs index 46945667ef..9e22d9e6f2 100644 --- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs +++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs @@ -19,10 +19,11 @@ //! //! # Lifetime //! -//! [`spawn_wallet_event_adapter`] returns a [`JoinHandle`]. The caller -//! (typically `PlatformWalletManager`) keeps the handle for the -//! manager's lifetime; on shutdown, fire the [`CancellationToken`] to -//! 
make the task exit cleanly. +//! [`wallet_event_adapter_loop`] is the task body. The caller (typically +//! `PlatformWalletManager`) registers it on the shared `ThreadRegistry` +//! via `start_task`, which owns its [`JoinHandle`] and cancellation; on +//! shutdown the registry fires the [`CancellationToken`] to make the task +//! exit cleanly and joins it. use std::sync::Arc; @@ -34,87 +35,82 @@ use key_wallet::Utxo; use key_wallet_manager::{WalletEvent, WalletId, WalletManager}; use tokio::sync::broadcast::error::RecvError; use tokio::sync::RwLock; -use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use crate::changeset::changeset::{CoreChangeSet, PlatformWalletChangeSet}; use crate::changeset::traits::PlatformWalletPersistence; use crate::wallet::platform_wallet::PlatformWalletInfo; -/// Spawn the wallet-event subscriber task. +/// The wallet-event subscriber loop (the task body owned by the registry). /// -/// Subscribes to `wallet_manager.subscribe_events()` from inside the -/// spawned task (so the call-site doesn't need to be on a tokio -/// runtime), then loops dispatching events to the persister via -/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires -/// or the upstream broadcast channel closes. +/// Subscribes to `wallet_manager.subscribe_events()` from inside the task +/// (so the call-site doesn't need to be on a tokio runtime), then loops +/// dispatching events to the persister via +/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires or the +/// upstream broadcast channel closes. /// -/// Generic over `P` so the spawned task gets static-dispatch on -/// every `persister.store(...)` call. Pass the manager's own -/// `Arc

` (not the `Arc` -/// coercion) to actually realize the static-dispatch win. -pub fn spawn_wallet_event_adapter

( +/// Generic over `P` so the task gets static-dispatch on every +/// `persister.store(...)` call. Pass the manager's own `Arc

` (not the +/// `Arc` coercion) to realize that win. +pub async fn wallet_event_adapter_loop

( wallet_manager: Arc>>, persister: Arc

, cancel: CancellationToken, -) -> JoinHandle<()> -where +) where P: PlatformWalletPersistence + 'static, { - tokio::spawn(async move { - let mut receiver = { - let guard = wallet_manager.read().await; - guard.subscribe_events() - }; - tracing::debug!("wallet-event adapter task started"); + let mut receiver = { + let guard = wallet_manager.read().await; + guard.subscribe_events() + }; + tracing::debug!("wallet-event adapter task started"); - loop { - tokio::select! { - recv = receiver.recv() => { - match recv { - Ok(event) => { - let wallet_id = event.wallet_id(); - // For events that need to consult per-wallet - // state (today only `TransactionInstantLocked`, - // which checks finality before recording the IS - // lock), grab a brief read lock on the manager. - let core = build_core_changeset(&wallet_manager, &event).await; - if core.is_empty_no_records() { - // SyncHeightAdvanced for an unknown wallet, - // empty BlockProcessed, etc. — nothing to - // persist. Skip the round-trip. - continue; - } - let cs = PlatformWalletChangeSet { - core: Some(core), - ..PlatformWalletChangeSet::default() - }; - if let Err(e) = persister.store(wallet_id, cs) { - tracing::warn!( - wallet_id = %hex::encode(wallet_id), - error = %e, - "Persister rejected core changeset; state will be re-emitted on next sync round" - ); - } - } - Err(RecvError::Closed) if cancel.is_cancelled() => break, - Err(RecvError::Closed) => { - tracing::error!("WalletEvent broadcast closed unexpectedly"); - break; + loop { + tokio::select! { + recv = receiver.recv() => { + match recv { + Ok(event) => { + let wallet_id = event.wallet_id(); + // For events that need to consult per-wallet + // state (today only `TransactionInstantLocked`, + // which checks finality before recording the IS + // lock), grab a brief read lock on the manager. 
+ let core = build_core_changeset(&wallet_manager, &event).await; + if core.is_empty_no_records() { + // SyncHeightAdvanced for an unknown wallet, + // empty BlockProcessed, etc. — nothing to + // persist. Skip the round-trip. + continue; } - Err(RecvError::Lagged(n)) => { + let cs = PlatformWalletChangeSet { + core: Some(core), + ..PlatformWalletChangeSet::default() + }; + if let Err(e) = persister.store(wallet_id, cs) { tracing::warn!( - missed = n, - "wallet-event adapter lagged on broadcast channel; some events were dropped" + wallet_id = %hex::encode(wallet_id), + error = %e, + "Persister rejected core changeset; state will be re-emitted on next sync round" ); } } + Err(RecvError::Closed) if cancel.is_cancelled() => break, + Err(RecvError::Closed) => { + tracing::error!("WalletEvent broadcast closed unexpectedly"); + break; + } + Err(RecvError::Lagged(n)) => { + tracing::warn!( + missed = n, + "wallet-event adapter lagged on broadcast channel; some events were dropped" + ); + } } - _ = cancel.cancelled() => break, } + _ = cancel.cancelled() => break, } - tracing::debug!("wallet-event adapter task exiting"); - }) + } + tracing::debug!("wallet-event adapter task exiting"); } /// Project an upstream [`WalletEvent`] into a [`CoreChangeSet`] suitable diff --git a/packages/rs-platform-wallet/src/changeset/mod.rs b/packages/rs-platform-wallet/src/changeset/mod.rs index dc76ddd39a..208c132e87 100644 --- a/packages/rs-platform-wallet/src/changeset/mod.rs +++ b/packages/rs-platform-wallet/src/changeset/mod.rs @@ -33,7 +33,7 @@ pub use changeset::{ }; pub use client_start_state::ClientStartState; pub use client_wallet_start_state::ClientWalletStartState; -pub use core_bridge::spawn_wallet_event_adapter; +pub use core_bridge::wallet_event_adapter_loop; pub use identity_manager_start_state::IdentityManagerStartState; pub use merge::Merge; pub use platform_address_sync_start_state::PlatformAddressSyncStartState; diff --git 
a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 40329bad74..8dfe83eede 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -49,16 +49,17 @@ use std::collections::BTreeMap; use std::sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex as StdMutex, + Arc, }; -use dash_async::AtomicFlagGuard; +use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use super::WalletWorker; + use dpp::balances::credits::TokenAmount; use dpp::prelude::Identifier; use tokio::sync::RwLock; -use tokio_util::sync::CancellationToken; use dash_sdk::platform::tokens::identity_token_balances::{ IdentityTokenBalances, IdentityTokenBalancesQuery, @@ -160,23 +161,11 @@ where /// over `P` so every `persister.store(...)` call on the hot sync /// loop dispatches statically. persister: Arc

, - /// Cancel token for the background loop, if running. - background_cancel: StdMutex>, - /// Join handle for the background loop's OS thread, if running. - /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can - /// confirm the `!Send` loop fully exited before the host drops the - /// runtime. - background_join: StdMutex>>, - /// Manager-owned orphans list (shared `Arc`). On a tight - /// `stop()`→`start()` where the prior thread is wedged past the 1 s - /// reap backstop, [`start`](Self::start) parks the still-live handle - /// here (via [`reap_prior_or_park`](super::reap_prior_or_park)) - /// instead of dropping it, so manager `shutdown()` accounts for it. - coordinator_orphans: super::CoordinatorOrphans, - /// Monotonically increasing generation counter. Incremented each - /// time `start()` installs a new cancel token so the exiting - /// thread can tell whether its token is still current. - background_generation: AtomicU64, + /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` / + /// `quiesce` delegate to it under the [`WalletWorker::IdentitySync`] + /// key; it owns the loop's cancel token, OS-thread join handle, the + /// restart reap-or-park, and the orphan list. + registry: Arc>, interval_secs: AtomicU64, is_syncing: AtomicBool, /// Set by [`quiesce`](Self::quiesce) to gate new passes while it @@ -215,15 +204,12 @@ where pub fn new( sdk: Arc, persister: Arc

, - coordinator_orphans: super::CoordinatorOrphans, + registry: Arc>, ) -> Self { Self { sdk, persister, - background_cancel: StdMutex::new(None), - background_join: StdMutex::new(None), - coordinator_orphans, - background_generation: AtomicU64::new(0), + registry, interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), quiescing: AtomicBool::new(false), @@ -339,10 +325,22 @@ where /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.background_cancel - .lock() - .map(|g| g.is_some()) - .unwrap_or(false) + self.registry.is_running(WalletWorker::IdentitySync) + } + + /// The drain barrier handed to the registry: raise the `quiescing` + /// gate so any pass past its `is_syncing` CAS bails. The registry then + /// cancels the loop and joins the thread (the join waits for the + /// in-flight pass to drop and `is_syncing` to clear), so the barrier + /// itself is instant and never blocks teardown. + fn drain_hook(self: &Arc) -> DrainHook { + let this = Arc::clone(self); + Arc::new(move || { + let this = Arc::clone(&this); + Box::pin(async move { + this.quiescing.store(true, Ordering::Release); + }) + }) } /// Whether a sync pass is in flight right now. @@ -414,57 +412,32 @@ where /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). pub fn start(self: Arc) { - let mut cancel_guard = self - .background_cancel - .lock() - .unwrap_or_else(|e| e.into_inner()); - if cancel_guard.is_some() { - return; - } - - // Take any handle left by a prior stop() call so we can reap it — but - // DON'T join it here, while we still hold background_cancel. stop() - // takes-and-cancels the token but never touches background_join, so a - // stop()→start() sequence would otherwise overwrite (detach) the old - // handle and shutdown() would miss that thread. 
Joining it under - // background_cancel would DEADLOCK the reap into its 1 s backstop: the - // exiting prior thread's epilogue also locks background_cancel (to - // clear its slot), so it would block on the lock we hold → never - // finish → get detached on the exact stop()→start() path the reap - // exists for. We install the new token + bump the generation below, - // release the lock, and only THEN reap (after this fn's tail). - let prior = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - - let cancel = CancellationToken::new(); - *cancel_guard = Some(cancel.clone()); - let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; + // Reopen the quiescing gate so this (re)start's passes can run; a + // prior quiesce raised it via the drain hook. + self.quiescing.store(false, Ordering::Release); + + let cfg = WorkerConfig { + weight: super::COORDINATOR_WEIGHT, + join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: Some(self.drain_hook()), + }; + // The loop drives `!Send` SDK futures via `Handle::block_on` on a + // dedicated OS thread (the registry spawns it). The handle is + // captured from this tokio context; the new thread is not itself a + // tokio worker. `biased` polls the cancel arm first, so a pass + // stalled on a hung SDK fetch is dropped at its `.await` the + // instant the registry cancels — clearing `is_syncing` promptly so + // the join lands inside the budget. let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); - let join = std::thread::Builder::new() - .name("identity-sync".into()) - .spawn(move || { + self.registry + .start_thread(WalletWorker::IdentitySync, cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { break; } - - // Race the in-flight pass against cancellation. 
- // `stop()` / `quiesce()` cancel the token; with - // `biased` the cancel arm is polled first, so a - // pass stalled on a hung SDK fetch is dropped at - // its `.await` the instant we cancel. Dropping the - // `sync_now` future unwinds to the `is_syncing` - // `AtomicFlagGuard` it holds, clearing the flag - // promptly — so `quiesce()`'s drain loop frees and - // the join lands well inside `shutdown()`'s - // timeout. A stalled pass can no longer strand a - // live `!Send` thread past `shutdown()`. tokio::select! { biased; _ = cancel.cancelled() => break, @@ -477,47 +450,8 @@ where _ = cancel.cancelled() => break, } } - - // Only clear the slot if no newer start() has - // installed a replacement token since we launched. - if let Ok(mut guard) = this.background_cancel.lock() { - if this.background_generation.load(Ordering::Acquire) == my_gen { - *guard = None; - } - } }); - }) - .expect("failed to spawn identity-sync thread"); - // Store the join handle while still holding cancel_guard — a - // concurrent quiesce() must wait for this lock before calling - // stop(), so the handle is always stored before it can be taken. - *self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) = Some(join); - - // Release background_cancel BEFORE reaping the prior thread, so its - // epilogue can acquire the lock, observe the bumped generation, skip - // clearing our freshly-installed token, and return. Holding the lock - // across the join below is what would block the prior thread, spin - // the full 1 s deadline, and detach — the very stall this ordering - // removes. - drop(cancel_guard); - - // Now reap the prior thread. It was already cancellation-signalled by - // stop(), and with the lock released its epilogue completes promptly, - // so is_finished() trips within a few milliseconds and the join is - // near-instant. The 1 s deadline survives only as a genuine-wedge - // backstop (e.g. 
a pass wedged in a Drop that never yields); if it - // fires `reap_prior_or_park` parks the still-live, already-cancelled - // thread in the manager orphans list so `shutdown()` joins it and - // reports it non-clean rather than dropping it (residual UAF). - super::reap_prior_or_park( - prior, - &self.coordinator_orphans, - std::time::Duration::from_secs(1), - "identity-sync", - ); + }); } /// Stop the background sync loop. No-op if not running. @@ -529,14 +463,7 @@ where /// by manager shutdown so the host can free the persister context — /// use [`quiesce`](Self::quiesce). pub fn stop(&self) { - if let Some(token) = self - .background_cancel - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take() - { - token.cancel(); - } + self.registry.cancel(WalletWorker::IdentitySync); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -564,24 +491,17 @@ where /// the `!Send` loop has stopped touching `tokio::time` before a /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { - self.quiescing.store(true, Ordering::Release); - // RAII gate: resets `quiescing` on *every* exit path — a normal - // return, a timed-out `shutdown()` dropping this future, or a - // panic. Without it a quiesce that doesn't run to completion - // leaves the gate latched `true`, silently bailing every future - // pass. Reopening on drop is safe because `stop()` (below) has - // already cancelled the loop, so no new pass can start. + // RAII gate: reopen `quiescing` on *every* exit path — normal + // return, a dropped future, or a panic. The registry's drain hook + // raises it inside `quiesce` below; without this reset a quiesce + // that doesn't complete would leave the gate latched and silently + // bail every future pass. Reopening is safe because the loop has + // been cancelled, so no new pass can start. 
let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.stop(); - while self.is_syncing.load(Ordering::Acquire) { - tokio::time::sleep(Duration::from_millis(20)).await; - } - let handle = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - super::join_coordinator_thread(handle).await + self.registry + .quiesce(WalletWorker::IdentitySync) + .await + .into() } /// Run one sync pass across every registered identity. @@ -856,8 +776,8 @@ mod tests { fn make_manager() -> Arc> { let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); let persister = Arc::new(NoopPersister); - let orphans = Arc::new(StdMutex::new(Vec::new())); - Arc::new(IdentitySyncManager::new(sdk, persister, orphans)) + let registry = ThreadRegistry::new(); + Arc::new(IdentitySyncManager::new(sdk, persister, registry)) } fn make_recording_manager() -> ( @@ -866,12 +786,12 @@ mod tests { ) { let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk")); let persister = Arc::new(RecordingPersister::new()); - let orphans = Arc::new(StdMutex::new(Vec::new())); + let registry = ThreadRegistry::new(); ( Arc::new(IdentitySyncManager::new( sdk, Arc::clone(&persister), - orphans, + registry, )), persister, ) @@ -993,123 +913,6 @@ mod tests { assert_eq!(mgr.interval(), Duration::from_secs(120)); } - /// `quiesce()` must not return while a pass is in flight, and must - /// return promptly once the pass drains. - /// - /// Drives the real `is_syncing` lifecycle: a background task takes - /// the slot via the same `compare_exchange` the real `sync_now` - /// uses, holds it across a sleep (standing in for the pass body + - /// persister fan-out, which `sync_now` keeps the flag set across), - /// then clears it. We assert `quiesce()` is still pending while the - /// flag is held and completes after it falls — i.e. the falling edge - /// of `is_syncing` is what unblocks the barrier. 
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn quiesce_blocks_until_in_flight_pass_drains() { - let mgr = make_manager(); - - // Stand in for an in-flight `sync_now`: take the `is_syncing` - // slot exactly as the real pass does, hold it, then release. - let holder = Arc::clone(&mgr); - let pass = tokio::spawn(async move { - assert!( - holder - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_ok(), - "test should own the is_syncing slot" - ); - tokio::time::sleep(Duration::from_millis(200)).await; - holder.is_syncing.store(false, Ordering::Release); - }); - - // Give the holder task a chance to take the slot before we - // start draining. - while !mgr.is_syncing() { - tokio::time::sleep(Duration::from_millis(5)).await; - } - - let quiesce_fut = mgr.quiesce(); - tokio::pin!(quiesce_fut); - - // While the pass holds the flag, quiesce must stay pending. - tokio::select! { - _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"), - _ = tokio::time::sleep(Duration::from_millis(50)) => {} - } - assert!(mgr.is_syncing(), "pass should still be in flight"); - - // Once the pass drains, quiesce must return (well within a - // generous bound — it polls every 20ms). - tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut) - .await - .expect("quiesce did not return after the pass drained"); - - // The gate is reopened before quiesce returns. - assert!(!mgr.quiescing.load(Ordering::Acquire)); - assert!(!mgr.is_syncing()); - pass.await.unwrap(); - } - - /// Regression: a tight `stop()` → `start()` must reap the prior loop's - /// OS thread promptly, NOT stall on the 1 s detach backstop. - /// - /// The prior thread's exit epilogue locks `background_cancel` to - /// conditionally clear its slot. 
The earlier ordering held - /// `background_cancel` across the prior-handle join inside `start()`, so - /// on a back-to-back `stop()` → `start()` the exiting thread blocked on - /// that lock, never finished, and the reap spin-waited the full second - /// before detaching — a 1 s stall plus a transient untracked thread. The - /// fix installs the new token + generation, releases `background_cancel`, - /// and only then reaps, so the prior thread's epilogue runs and the join - /// lands in milliseconds. - /// - /// `stop()` and `start()` run back-to-back in one blocking closure - /// (mirroring the real call site) so `start()` re-acquires the lock - /// microseconds after `stop()` frees it — before the async-woken prior - /// thread can reach its epilogue. Against the old lock-held ordering this - /// reliably stalls ~1 s and fails the bound below. - #[tokio::test(flavor = "multi_thread", worker_threads = 4)] - async fn restart_after_stop_reaps_prior_thread() { - let mgr = make_manager(); - - // Launch the first loop and let its immediate (no-op, nothing - // registered) pass complete so the thread parks in the interval - // sleep, where cancellation lands cleanly. - Arc::clone(&mgr).start(); - assert!(mgr.is_running()); - tokio::time::sleep(Duration::from_millis(50)).await; - - // Back-to-back cancel-only stop + restart, off the runtime so the - // synchronous reap can't starve a worker. `start()` re-grabs - // background_cancel right after `stop()` frees it. 
- let restart = Arc::clone(&mgr); - let elapsed = tokio::task::spawn_blocking(move || { - restart.stop(); - let started = std::time::Instant::now(); - Arc::clone(&restart).start(); - started.elapsed() - }) - .await - .unwrap(); - - assert!( - elapsed < Duration::from_millis(500), - "stop()→start() stalled for {elapsed:?}: prior thread was not \ - reaped promptly (background_cancel held across the join?)" - ); - assert!(mgr.is_running(), "restart must leave the new loop tracked"); - - // Wind the new loop down so the test leaves no live !Send thread. - let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce()) - .await - .expect("cleanup quiesce did not complete within 2s after restart"); - assert!( - status.is_clean(), - "cleanup quiesce ended non-cleanly: {status:?}" - ); - assert!(!mgr.is_running()); - } - /// A `sync_now()` invoked while `quiescing` is set must bail without /// running the pass — in particular, without calling /// `persister.store(...)`. This is the gate that prevents a pass diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 7e9690d066..d03dcccf7b 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -10,13 +10,12 @@ mod wallet_lifecycle; use std::sync::Arc; +use dash_async::{ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig}; use tokio::sync::{Notify, RwLock}; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; use key_wallet_manager::WalletManager; -use crate::changeset::{spawn_wallet_event_adapter, PlatformWalletPersistence}; +use crate::changeset::{wallet_event_adapter_loop, PlatformWalletPersistence}; use crate::events::{PlatformEventHandler, PlatformEventManager}; use crate::manager::identity_sync::IdentitySyncManager; use crate::manager::platform_address_sync::PlatformAddressSyncManager; @@ -28,21 +27,29 @@ use crate::wallet::core::BalanceUpdateHandler; use 
crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId}; use crate::wallet::PlatformWallet; -/// Shared list of coordinator OS threads that a tight `stop()`→`start()` -/// reap had to detach past its 1 s wedge-backstop. -/// -/// A coordinator's `start()` reap normally joins the prior thread within -/// a few milliseconds. If that thread is genuinely wedged in a -/// non-yielding `Drop` (vanishingly rare — the loop exits via a -/// cancellable `select!`), [`reap_prior_or_park`] parks its still-live -/// `JoinHandle` here instead of dropping it. The manager owns this list -/// and shares a clone (`Arc`) with every coordinator, so -/// [`PlatformWalletManager::shutdown`] can join everything parked here -/// within its timeout and report -/// [`CoordinatorThreadStatus::Detached`] if any thread is still alive — -/// telling the host NOT to free a callback context the thread may still -/// touch (closing a residual use-after-free window). -pub(crate) type CoordinatorOrphans = Arc>>>; +/// Identity of a background worker on the manager's shared +/// [`ThreadRegistry`]. The three periodic sync coordinators run as +/// OS-thread workers (their SDK futures are `!Send`); the wallet-event +/// adapter runs as a tokio task. Drained in ascending weight order on +/// [`shutdown`](PlatformWalletManager::shutdown): the coordinators +/// (weight 0) first, then the event adapter (weight 10) they store into. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub enum WalletWorker { + /// Platform-address (BLAST) balance sync loop. + PlatformAddressSync, + /// Per-identity token-state sync loop. + IdentitySync, + /// Shielded (Orchard) note sync loop. + ShieldedSync, + /// Wallet-event adapter task (sinks coordinator stores). + EventAdapter, +} + +/// Teardown weight of the periodic sync coordinators — drained first. 
+pub(crate) const COORDINATOR_WEIGHT: ShutdownWeight = ShutdownWeight(0); +/// Teardown weight of the wallet-event adapter — drained after the +/// coordinators that feed it. +pub(crate) const EVENT_ADAPTER_WEIGHT: ShutdownWeight = ShutdownWeight(10); /// Multi-wallet coordinator with SPV sync and event handling. /// @@ -98,16 +105,12 @@ pub struct PlatformWalletManager { #[cfg(feature = "shielded")] pub(super) event_manager: Arc, pub(super) persister: Arc

, - /// Cancellation token + join handle for the wallet-event adapter - /// task. Held so [`shutdown`] can stop it cleanly when the manager - /// is torn down. - pub(super) event_adapter_cancel: CancellationToken, - pub(super) event_adapter_join: tokio::sync::Mutex>>, - /// Coordinator OS threads detached by a tight `stop()`→`start()` - /// reap (see [`CoordinatorOrphans`]). Shared (cloned `Arc`) with - /// every coordinator so their `start()` reaps can park a wedged - /// prior thread here, and drained/joined by [`shutdown`](Self::shutdown). - pub(super) coordinator_orphans: CoordinatorOrphans, + /// Shared worker-lifecycle engine. Owns every background worker's + /// cancellation token + join handle, the restart reap-or-park, and the + /// orphan list. The coordinators hold a clone and register their loops + /// on it; the event adapter runs here as a tokio task. [`shutdown`] + /// drains it in weight order and joins every worker before returning. + pub(super) registry: Arc>, } /// How one background coordinator thread terminated. @@ -161,6 +164,25 @@ impl CoordinatorThreadStatus { } } +/// Relocate a registry [`WorkerStatus`](dash_async::WorkerStatus) into the +/// FFI-stable `CoordinatorThreadStatus`. The variant set and payloads are +/// identical by construction, so this is a byte-stable 1:1 mapping — the +/// FFI `destroy` / shielded-stop adapters keep reading the same shape. +impl From for CoordinatorThreadStatus { + fn from(status: dash_async::WorkerStatus) -> Self { + use dash_async::WorkerStatus as W; + match status { + W::Ok => Self::Ok, + W::Stopped(reason) => Self::Stopped(reason), + W::Panicked(msg) => Self::Panicked(msg), + W::Timeout => Self::Timeout, + W::Detached => Self::Detached, + W::NotRunning => Self::NotRunning, + W::Error(msg) => Self::Error(msg), + } + } +} + /// Per-thread terminal status of every background worker, returned by /// [`PlatformWalletManager::shutdown`]. 
/// @@ -211,196 +233,36 @@ impl CoordinatorExitStatus { && self.event_adapter.is_clean() && self.detached_threads.is_clean() } -} -/// Join a coordinator's background OS thread and classify how it ended. -/// -/// Called from each coordinator's `quiesce()` after cancelling the -/// loop and draining any in-flight pass, so the thread is already on -/// its way out and the join is near-instant. Joining while the runtime -/// is still alive guarantees the `!Send` loop has stopped touching -/// `tokio::time` before the host drops the runtime. -/// -/// **Polling approach**: we poll [`JoinHandle::is_finished`] in 5 ms -/// steps rather than wrapping `handle.join()` in -/// [`spawn_blocking`](tokio::task::spawn_blocking). The -/// `spawn_blocking` approach spawns a blocking-pool task that cannot be -/// cancelled once started — so dropping the timeout future that wraps -/// `quiesce()` would leave the blocking task alive and `handle.join()` -/// still running, defeating the timeout boundary. Polling lets the -/// executor yield at each `.await` step so `tokio::time::timeout` -/// wrapping `quiesce()` can truly interrupt this call. -/// -/// **Requires a multi-thread runtime.** Each coordinator's OS thread -/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on) -/// and needs the runtime's timer/IO driver; a `current_thread` runtime -/// can only service one `block_on` at a time, so joining one coordinator -/// while the others (and `shutdown()` itself) are mid-`block_on` would -/// deadlock. `shutdown()` asserts the multi-thread flavor up front. -pub(crate) async fn join_coordinator_thread( - handle: Option>, -) -> CoordinatorThreadStatus { - let Some(handle) = handle else { - return CoordinatorThreadStatus::NotRunning; - }; - // Poll until the thread exits. The coordinator was already cancelled - // (stop() fires before quiesce() calls us), so is_finished() becomes - // true nearly immediately — typically within a single 5 ms step. 
- loop { - if handle.is_finished() { - return match handle.join() { - Ok(()) => CoordinatorThreadStatus::Ok, - Err(payload) => CoordinatorThreadStatus::Panicked(panic_message(payload)), - }; - } - // Yield to the executor so the outer tokio::time::timeout wrapping - // quiesce() can fire if the deadline has passed. Without this yield - // the loop would busy-spin and block the task. - tokio::time::sleep(std::time::Duration::from_millis(5)).await; - } -} - -/// Best-effort extraction of a panic message from a joined thread/task -/// payload (`&str` and `String` are the common cases). -fn panic_message(payload: Box) -> String { - if let Some(s) = payload.downcast_ref::<&str>() { - (*s).to_string() - } else if let Some(s) = payload.downcast_ref::() { - s.clone() - } else { - "".to_string() - } -} - -/// Reap a coordinator's prior OS thread after a `stop()`→`start()` -/// reschedule — or park it for [`PlatformWalletManager::shutdown`] if it -/// is genuinely wedged. -/// -/// Shared by all three coordinators' `start()` (identity / platform- -/// address / shielded), called at the tail of `start()` *after* the -/// `background_cancel` lock has been released, so the exiting prior -/// thread's epilogue (which also takes that lock) can complete and the -/// join lands in milliseconds. -/// -/// `prior` was cancellation-signalled by the preceding `stop()`, so its -/// `select!` loop exits and the thread finishes almost immediately. The -/// `backstop` deadline fires only if the thread is wedged in a -/// non-yielding `Drop` that never observes the cancellation (vanishingly -/// rare). On that wedge we must NOT silently drop the still-live handle: -/// the thread still holds an `Arc` to the host event handler and could -/// fire a callback, so a later `destroy` that freed the host context -/// would hit a use-after-free. 
Instead we park the handle in `orphans` -/// so `shutdown()` joins it within its own timeout and reports -/// [`CoordinatorThreadStatus::Detached`] if it is still alive — keeping -/// [`CoordinatorExitStatus::all_clean`] honest. -pub(crate) fn reap_prior_or_park( - prior: Option>, - orphans: &CoordinatorOrphans, - backstop: std::time::Duration, - coordinator: &str, -) { - let Some(handle) = prior else { - return; - }; - let deadline = std::time::Instant::now() + backstop; - loop { - if handle.is_finished() { - // Near-instant since finished; reaps the thread's resources. - let _ = handle.join(); - return; - } - if std::time::Instant::now() >= deadline { - tracing::warn!( - coordinator, - ?backstop, - "prior sync thread did not finish within the backstop after \ - cancellation; parking it in the manager orphans list for \ - shutdown() to join rather than detaching it" - ); - // Park the still-live (but already-cancelled) handle so a - // later shutdown() can join it and report it non-clean, - // instead of dropping it and leaving a UAF window where the - // host frees a callback context the thread may still touch. - orphans - .lock() - .unwrap_or_else(|e| e.into_inner()) - .push(handle); - return; - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } -} - -/// Drain the manager's [`CoordinatorOrphans`] list and classify how the -/// parked threads ended, polling until `deadline`. -/// -/// Threads land in the list only when a tight `stop()`→`start()` reap had -/// to detach a prior coordinator thread past its 1 s wedge-backstop (see -/// [`reap_prior_or_park`]). They were parked rather than dropped so this -/// final teardown can account for them: a still-live detached thread -/// keeps an `Arc` to the host event handler and could fire one last -/// callback, so the host must not free its context until every such -/// thread has exited. 
-/// -/// Polls [`JoinHandle::is_finished`](std::thread::JoinHandle::is_finished) -/// in 5 ms steps, yielding at each `.await` so a wrapping -/// `tokio::time::timeout` can still interrupt it (no uncancellable -/// blocking join — `join()` is only ever called on an already-finished -/// handle). Returns: -/// - [`Ok`](CoordinatorThreadStatus::Ok) — the list was empty, or every -/// parked thread joined cleanly; -/// - [`Panicked`](CoordinatorThreadStatus::Panicked) — a parked thread -/// had panicked (and none were left alive at the deadline); -/// - [`Detached`](CoordinatorThreadStatus::Detached) — at least one -/// parked thread was still alive at `deadline`. Any still-live handles -/// are re-parked so a later (idempotent) `shutdown()` can retry. -pub(crate) async fn join_detached_orphans( - orphans: &CoordinatorOrphans, - deadline: std::time::Instant, -) -> CoordinatorThreadStatus { - // Take the whole list out under the lock; we re-park any survivors - // at the deadline, but never hold the lock across an `.await`. - let mut pending: Vec> = { - let mut guard = orphans.lock().unwrap_or_else(|e| e.into_inner()); - std::mem::take(&mut *guard) - }; - if pending.is_empty() { - return CoordinatorThreadStatus::Ok; - } - - let mut panicked: Option = None; - loop { - // Reap every thread that has finished this pass; retain the rest. - let mut still_live = Vec::with_capacity(pending.len()); - for handle in pending.drain(..) { - if handle.is_finished() { - if let Err(payload) = handle.join() { - // Keep the first panic message; a live `Detached` - // thread still takes precedence at the deadline below. - panicked.get_or_insert_with(|| panic_message(payload)); - } + /// Build the FFI-stable exit status from the registry's weight-ordered + /// [`ShutdownReport`]. 
A worker absent from the report never ran, so it + /// maps to [`NotRunning`](CoordinatorThreadStatus::NotRunning); a + /// non-zero orphan-survivor count surfaces as + /// [`Detached`](CoordinatorThreadStatus::Detached), keeping + /// [`all_clean`](Self::all_clean) honest for a still-live wedged thread. + pub(crate) fn from_report(report: ShutdownReport) -> Self { + let worker = |key: WalletWorker| -> CoordinatorThreadStatus { + report + .per_worker + .get(&key) + .cloned() + .map(CoordinatorThreadStatus::from) + .unwrap_or(CoordinatorThreadStatus::NotRunning) + }; + Self { + platform_address_sync: worker(WalletWorker::PlatformAddressSync), + identity_sync: worker(WalletWorker::IdentitySync), + #[cfg(feature = "shielded")] + shielded_sync: Some(worker(WalletWorker::ShieldedSync)), + #[cfg(not(feature = "shielded"))] + shielded_sync: None, + event_adapter: worker(WalletWorker::EventAdapter), + detached_threads: if report.detached > 0 { + CoordinatorThreadStatus::Detached } else { - still_live.push(handle); - } - } - pending = still_live; - - if pending.is_empty() { - return match panicked { - Some(msg) => CoordinatorThreadStatus::Panicked(msg), - None => CoordinatorThreadStatus::Ok, - }; - } - if std::time::Instant::now() >= deadline { - // Re-park survivors so an idempotent re-`shutdown()` retries - // rather than losing track of a still-live thread. - orphans - .lock() - .unwrap_or_else(|e| e.into_inner()) - .extend(pending); - return CoordinatorThreadStatus::Detached; + CoordinatorThreadStatus::Ok + }, } - tokio::time::sleep(std::time::Duration::from_millis(5)).await; } } @@ -449,14 +311,28 @@ impl PlatformWalletManager

{ let wallets = Arc::new(RwLock::new(std::collections::BTreeMap::new())); let lock_notify = Arc::new(Notify::new()); - // Spawn the wallet-event adapter that translates upstream - // `WalletEvent`s into `CoreChangeSet`s and forwards them to - // the persister. - let event_adapter_cancel = CancellationToken::new(); - let event_adapter_join = spawn_wallet_event_adapter( - Arc::clone(&wallet_manager), - Arc::clone(&persister), - event_adapter_cancel.clone(), + // Shared worker-lifecycle engine. The 1 s reap backstop (separate + // from the 30 s managed-join budget) is the grace a wedged prior + // thread gets before it is reported `Detached`. + let registry = ThreadRegistry::with_reap_backstop(std::time::Duration::from_secs( + SHUTDOWN_ORPHAN_GRACE_SECS, + )); + + // Register the wallet-event adapter as a tokio task on the + // registry. It sinks the coordinators' stores, so it drains AFTER + // them (weight 10 vs the coordinators' 0). + let adapter_wallet_manager = Arc::clone(&wallet_manager); + let adapter_persister = Arc::clone(&persister); + registry.start_task( + WalletWorker::EventAdapter, + WorkerConfig { + weight: EVENT_ADAPTER_WEIGHT, + join_budget: std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: None, + }, + move |cancel| { + wallet_event_adapter_loop(adapter_wallet_manager, adapter_persister, cancel) + }, ); // Build handler list: app handler + internal handlers. @@ -473,13 +349,6 @@ impl PlatformWalletManager

{ balance_handler, ])); - // Shared orphans list: a coordinator's `start()` reap parks here - // any prior thread it had to detach past its 1 s wedge-backstop, - // and `shutdown()` joins them. Every coordinator gets a clone of - // this same `Arc` so they all park into the one list the manager - // drains. - let coordinator_orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new())); - let spv = Arc::new(SpvRuntime::new( Arc::clone(&wallet_manager), Arc::clone(&event_manager), @@ -487,12 +356,12 @@ impl PlatformWalletManager

{ let platform_address_sync = Arc::new(PlatformAddressSyncManager::new( Arc::clone(&wallets), Arc::clone(&event_manager), - Arc::clone(&coordinator_orphans), + Arc::clone(&registry), )); let identity_sync = Arc::new(IdentitySyncManager::new( Arc::clone(&sdk), Arc::clone(&persister), - Arc::clone(&coordinator_orphans), + Arc::clone(&registry), )); #[cfg(feature = "shielded")] let shielded_coordinator: Arc< @@ -502,7 +371,7 @@ impl PlatformWalletManager

{ let shielded_sync = Arc::new(ShieldedSyncManager::new( Arc::clone(&event_manager), Arc::clone(&shielded_coordinator), - Arc::clone(&coordinator_orphans), + Arc::clone(&registry), )); Self { sdk, @@ -519,9 +388,7 @@ impl PlatformWalletManager

{ #[cfg(feature = "shielded")] event_manager, persister, - event_adapter_cancel, - event_adapter_join: tokio::sync::Mutex::new(Some(event_adapter_join)), - coordinator_orphans, + registry, } } @@ -642,194 +509,79 @@ impl PlatformWalletManager

{ /// - the coordinator's store reset itself fails. #[cfg(feature = "shielded")] pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> { - // Bound the quiesce with the same backstop `shutdown()` uses so a - // stalled in-flight pass can't hang Clear forever — cancellation - // makes the drain prompt; this timeout only matters if a pass's - // drop wedges. Unlike `shutdown()`, the terminal status is - // load-bearing HERE: a non-clean drain means the in-flight pass may - // still be running and could re-persist notes into the very store - // the `clear()` below is about to wipe. A timeout (the future was - // dropped at the deadline) is treated as the non-clean `Timeout` - // status, matching `shutdown()`'s backstop substitution. - let status = match tokio::time::timeout( - std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), - self.shielded_sync_manager.quiesce(), - ) - .await - { - Ok(status) => status, - Err(_elapsed) => CoordinatorThreadStatus::Timeout, - }; + // Quiesce the shielded loop: cancel it, drain any in-flight pass + // (incl. its persister fan-out), and join its OS thread. The + // registry bounds the join by the coordinator's own + // `SHUTDOWN_JOIN_TIMEOUT_SECS` budget — returning `Timeout` rather + // than hanging if a pass's drop wedges — so no outer timeout is + // needed here. + let status = self.shielded_sync_manager.quiesce().await; + // Only commit the store wipe once the in-flight pass has fully - // drained. Otherwise refuse: a partial/timed-out drain could let a - // surviving pass write into a store we just cleared, desyncing the - // host's own wipe from a repopulated tree. + // drained. A partial/timed-out drain could let a surviving pass + // write into a store we just cleared, desyncing the host's own + // wipe from a repopulated tree. 
if !status.is_clean() { return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status }); } + // [F2 FIX] Also refuse if a prior-generation shielded thread is + // still parked alive: it holds an `Arc` to the persister/store and + // could re-persist notes into the store we are about to wipe. The + // check is shielded-scoped, so the other coordinators / the + // always-on event adapter running normally do not block Clear. + if self.registry.any_alive_for(WalletWorker::ShieldedSync) { + return Err( + crate::error::PlatformWalletError::ShieldedShutdownIncomplete { + status: CoordinatorThreadStatus::Detached, + }, + ); + } if let Some(coord) = self.shielded_coordinator().await { coord.clear().await?; } Ok(()) } - /// Stop all background tasks and wait for them to exit. + /// Stop all background workers and wait for them to exit. /// - /// **Quiesces** the periodic coordinators - /// (`PlatformAddressSyncManager`, `IdentitySyncManager`, - /// `ShieldedSyncManager`) — cancelling each loop *and draining any - /// in-flight pass to completion*, including its persister / - /// host-callback fan-out — then drains the wallet-event adapter task. - /// Idempotent. Call before dropping the manager when a clean - /// shutdown is required (e.g. on app termination); a dirty drop - /// simply leaks the tasks until the runtime exits. + /// Delegates to the shared [`ThreadRegistry::shutdown`], which drains + /// in ascending weight order: the periodic coordinators (weight 0) + /// first — concurrently, since they share no lock — then the + /// wallet-event adapter (weight 10) that sinks their stores, then any + /// parked orphans. Each worker's drain raises its `quiescing` gate, + /// cancels the loop, and **joins** its OS thread / task, so when this + /// returns every `!Send` loop has fully exited. Idempotent. 
/// /// Ordering matters: cancel-only `stop()` would let a pass already /// inside `sync_now` keep running and call `persister.store(...)` / - /// fire a host completion callback after the FFI's `destroy` - /// returned and the host freed the persister / event-handler - /// context — a use-after-free. So we `quiesce()` the sync managers - /// FIRST (so no further persister store or host callback can start), - /// and only THEN cancel + join the event adapter, which is the sink - /// those stores feed into. The three coordinators are independent — - /// each `quiesce()` touches only its own state (its `quiescing` / - /// `is_syncing` atomics and its own `background_cancel` / - /// `background_join` mutexes) and joins its own OS thread, sharing no - /// lock — so they are drained *concurrently* via `tokio::join!`; only - /// the event-adapter teardown stays ordered strictly after them, - /// because it is the sink the coordinators store into. + /// fire a host completion callback after the FFI's `destroy` returned + /// and the host freed the persister / event-handler context — a + /// use-after-free. Quiescing the coordinators (weight 0) before the + /// event adapter (weight 10) closes that window: no further store can + /// start before its sink is torn down. /// - /// After each coordinator's `quiesce()` drains its in-flight pass, - /// this also **joins** the loop's OS thread, so when `shutdown()` - /// returns every `!Send` loop has fully exited. A host that drops the - /// tokio runtime right after `shutdown()` (one-shot / headless / - /// stdio) is therefore safe — no coordinator can still be polling - /// `tokio::time` on a shutting-down runtime. The returned - /// [`CoordinatorExitStatus`] reports per-thread how each worker ended. + /// A host that drops the tokio runtime right after `shutdown()` + /// (one-shot / headless / stdio) is therefore safe — no coordinator + /// can still be polling `tokio::time` on a shutting-down runtime. 
The + /// returned [`CoordinatorExitStatus`] reports per-worker how each ended. /// /// **Precondition: must be called from a multi-thread Tokio runtime.** /// Each coordinator's OS thread drives its loop via - /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs - /// the runtime's timer/IO driver; a `current_thread` runtime can only - /// service one `block_on` at a time, so the join would deadlock. This - /// is asserted in both debug and release builds. - /// - /// Each coordinator quiesce+join is bounded by its own - /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] backstop. Because the three drain - /// concurrently, the worst-case wait collapses to ~that single - /// backstop instead of the sum of all three. `quiesce()` cancels - /// the loop, which aborts any in-flight pass at its `.await` point, so - /// the `is_syncing` drain clears promptly and the join normally lands - /// far inside the window — the deadline fires only if a pass's *drop* - /// itself wedges. On timeout the coordinator slot reports - /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever. + /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs the + /// runtime's timer/IO driver; a `current_thread` runtime can only + /// service one `block_on` at a time, so the join would deadlock. + /// [`ThreadRegistry::shutdown`] asserts this in both debug and release. /// - /// The clear-on-panic half of that guarantee rides on unwinding, so - /// it holds under `panic = "unwind"`. Under the iOS `panic = "abort"` - /// release profiles a pass panic aborts the process outright (no - /// `Drop`, no status) — there is no live manager left to read a - /// status from. + /// Each worker's join is bounded by its own + /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] budget; on timeout its handle is + /// re-parked and the slot reports + /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever + /// (the F1 fix — a dropped/timed-out join can never detach a live + /// thread). 
The clear-on-panic half rides on unwinding, so it holds + /// under `panic = "unwind"`; under the iOS `panic = "abort"` profiles a + /// pass panic aborts the process outright. pub async fn shutdown(&self) -> CoordinatorExitStatus { - assert!( - matches!( - tokio::runtime::Handle::current().runtime_flavor(), - tokio::runtime::RuntimeFlavor::MultiThread - ), - "shutdown() requires a multi-thread Tokio runtime: each \ - coordinator's OS thread drives its sync loop via \ - Handle::block_on and needs the runtime's timer/IO driver, but \ - a current_thread runtime can only drive one block_on at a time" - ); - - let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS); - - // Drain the three independent periodic coordinators *concurrently*. - // Each quiesce() drains any in-flight pass AND joins its own OS - // thread, touching only that coordinator's own state (no shared - // lock), so racing them is sound and collapses the worst case from - // the sum of the three backstops to ~max(...). Each drain keeps its - // OWN inner `tokio::time::timeout`, so it still yields its own - // per-coordinator `CoordinatorThreadStatus` — a single outer timeout - // around the whole join! would flatten all three to `Timeout` and - // lose that detail. 
- let drain_platform_address = async { - tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce()) - .await - .unwrap_or(CoordinatorThreadStatus::Timeout) - }; - let drain_identity = async { - tokio::time::timeout(timeout, self.identity_sync_manager.quiesce()) - .await - .unwrap_or(CoordinatorThreadStatus::Timeout) - }; - #[cfg(feature = "shielded")] - let drain_shielded = async { - tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce()) - .await - .unwrap_or(CoordinatorThreadStatus::Timeout) - }; - - #[cfg(feature = "shielded")] - let (platform_address_sync, identity_sync, shielded_sync) = { - let (p, i, s) = tokio::join!(drain_platform_address, drain_identity, drain_shielded); - (p, i, Some(s)) - }; - #[cfg(not(feature = "shielded"))] - let (platform_address_sync, identity_sync, shielded_sync) = { - let (p, i) = tokio::join!(drain_platform_address, drain_identity); - (p, i, None) - }; - - // The event adapter is a tokio task (it sinks the coordinators' - // stores), so cancel + join it last — after the loops feeding it - // are gone. - self.event_adapter_cancel.cancel(); - // Take the handle out into a local first so the `tokio::Mutex` - // guard doesn't stay held across the (up-to-30s) join `.await` - // below — a match scrutinee temporary would otherwise keep the - // guard alive for the whole match. - let event_adapter_handle = self.event_adapter_join.lock().await.take(); - let event_adapter = match event_adapter_handle { - None => CoordinatorThreadStatus::NotRunning, - Some(handle) => match tokio::time::timeout(timeout, handle).await { - Ok(Ok(())) => CoordinatorThreadStatus::Ok, - // The returned status already carries this failure, and the - // FFI `destroy` adapter logs the aggregate once at the host - // layer — so don't double-log here. 
- Ok(Err(e)) => { - if e.is_panic() { - CoordinatorThreadStatus::Panicked(panic_message(e.into_panic())) - } else { - // Non-panic JoinError: task was cancelled or aborted — - // not a clean exit, but also not a panic. - CoordinatorThreadStatus::Stopped(Some(format!("{e}"))) - } - } - Err(_) => CoordinatorThreadStatus::Timeout, - }, - }; - - // Finally, account for any coordinator threads an earlier tight - // stop()→start() reap had to detach past its 1 s wedge-backstop. - // They were parked in `coordinator_orphans` (not dropped) so we - // can join them here; a survivor at the grace deadline reports - // `Detached`, which keeps `all_clean()` false so the FFI `destroy` - // returns `ErrorShutdownIncomplete` rather than letting the host - // free a callback context the live thread may still touch. The - // grace poll yields, so it never blocks teardown uncancellably. - let detached_threads = join_detached_orphans( - &self.coordinator_orphans, - std::time::Instant::now() + std::time::Duration::from_secs(SHUTDOWN_ORPHAN_GRACE_SECS), - ) - .await; - - CoordinatorExitStatus { - platform_address_sync, - identity_sync, - shielded_sync, - event_adapter, - detached_threads, - } + CoordinatorExitStatus::from_report(self.registry.shutdown().await) } } @@ -974,68 +726,6 @@ mod tests { assert!(status.all_clean()); } - /// A coordinator thread that panics surfaces as `Panicked` rather - /// than being silently dropped. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn join_coordinator_thread_surfaces_panic() { - let handle = std::thread::spawn(|| panic!("boom in coordinator")); - match join_coordinator_thread(Some(handle)).await { - CoordinatorThreadStatus::Panicked(msg) => { - assert!(msg.contains("boom in coordinator"), "msg was {msg:?}"); - } - other => panic!("expected Panicked, got {other:?}"), - } - } - - /// A non-panic `JoinError` on the event adapter maps to `Stopped`, not - /// `Ok`, and is NOT considered clean. 
This covers the case where the - /// tokio task is cancelled or aborted rather than completing normally. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() { - // Replace the real adapter handle with a guaranteed-pending task, then - // abort it. A `pending::<()>()` future can never complete on its own, - // so abort() always produces a non-panic JoinError — deterministically - // exercising the Stopped branch regardless of scheduler timing. - // (The original approach aborted the real adapter handle, which could - // race the task's own completion and silently yield `Ok` instead.) - let manager = make_manager(); - - // Drain and discard the real adapter (may already be finished). - let original = { - let mut guard = manager.event_adapter_join.lock().await; - guard.take() - }; - if let Some(h) = original { - h.abort(); - let _ = h.await; - } - - // Install a permanently-pending task and abort it so the JoinError - // path in shutdown() is 100 % deterministic. - let pending = tokio::spawn(std::future::pending::<()>()); - pending.abort(); - *manager.event_adapter_join.lock().await = Some(pending); - - let status = manager.shutdown().await; - - // The aborted pending task always yields a non-panic JoinError → - // shutdown() maps it to Stopped. - assert!( - matches!(status.event_adapter, CoordinatorThreadStatus::Stopped(_)), - "expected Stopped from a non-panic JoinError, got {:?}", - status.event_adapter - ); - assert!( - !status.event_adapter.is_clean(), - "Stopped must not count as clean" - ); - // Coordinators were never started → their slots are clean. - assert_eq!( - status.platform_address_sync, - CoordinatorThreadStatus::NotRunning - ); - } - /// `Stopped` and `Timeout` are NOT clean; `Ok` and `NotRunning` ARE. /// Unit-tests the `is_clean` predicate directly so we don't need to /// trigger a real timeout (30s) in a deterministic test. 
@@ -1097,36 +787,6 @@ mod tests { assert!(!with_detached.all_clean()); } - /// A cleanly-returning thread joins as `Ok`; an absent handle is - /// `NotRunning`. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn join_coordinator_thread_ok_and_absent() { - let handle = std::thread::spawn(|| {}); - assert_eq!( - join_coordinator_thread(Some(handle)).await, - CoordinatorThreadStatus::Ok - ); - assert_eq!( - join_coordinator_thread(None).await, - CoordinatorThreadStatus::NotRunning - ); - } - - /// `join_coordinator_thread` uses `is_finished()` polling. Verify - /// it completes within a bounded time on a multi-thread runtime, as - /// `shutdown()` requires (and that it doesn't busy-spin indefinitely). - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn join_coordinator_thread_completes_within_deadline() { - let handle = std::thread::spawn(|| {}); - let result = tokio::time::timeout( - Duration::from_secs(5), - join_coordinator_thread(Some(handle)), - ) - .await - .expect("join_coordinator_thread must complete within 5 s"); - assert_eq!(result, CoordinatorThreadStatus::Ok); - } - /// `shutdown()` must wait for an in-flight sync pass to drain before /// joining the coordinator thread. /// @@ -1185,8 +845,7 @@ mod tests { /// /// Uses `std::panic::catch_unwind` around `drop(runtime)` rather than /// a process-global panic hook; the hook would be live for seconds and - /// could swallow diagnostics from concurrently-running tests (e.g. - /// `join_coordinator_thread_surfaces_panic`). + /// could swallow diagnostics from other concurrently-running tests. 
#[test] fn shutdown_then_drop_runtime_does_not_panic() { static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0); @@ -1273,117 +932,26 @@ mod tests { (release_tx, handle) } - /// A prior coordinator thread that is still alive past the reap - /// backstop must be **parked in the orphans list**, not dropped — - /// otherwise `shutdown()` would never know it exists and could let the - /// host free a callback context the live thread still touches. - /// - /// Non-vacuous: if `reap_prior_or_park` dropped the wedged handle - /// (the old behavior) the list would stay empty and the length - /// assertion below would fail. - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn reap_prior_or_park_parks_wedged_thread() { - let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new())); - let (release_tx, wedged) = spawn_wedged_thread(); - - // `reap_prior_or_park` is synchronous and spins a std sleep until - // its backstop, so run it off the runtime workers. A short backstop - // (real `start()` uses 1 s) keeps the test fast. - let orphans_for_reap = Arc::clone(&orphans); - tokio::task::spawn_blocking(move || { - reap_prior_or_park( - Some(wedged), - &orphans_for_reap, - Duration::from_millis(100), - "test-coordinator", - ); - }) - .await - .unwrap(); - - assert_eq!( - orphans.lock().unwrap().len(), - 1, - "a prior thread wedged past the backstop must be parked, not dropped" - ); - - // Cleanup: release + join the parked thread so none leaks. - release_tx.send(()).unwrap(); - let parked = orphans.lock().unwrap().pop().unwrap(); - tokio::task::spawn_blocking(move || { - let _ = parked.join(); - }) - .await - .unwrap(); - } - - /// `join_detached_orphans` classifies the parked threads: empty list → - /// `Ok`; a survivor at the deadline → `Detached` (re-parked for a later - /// retry); once the survivor exits, a fresh join reports `Ok` and - /// drains the list. 
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn join_detached_orphans_reports_detached_then_ok() { - let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new())); - - // Nothing parked → clean. - assert_eq!( - join_detached_orphans(&orphans, std::time::Instant::now()).await, - CoordinatorThreadStatus::Ok - ); - - // Park a still-live thread; a short deadline elapses with it alive. - let (release_tx, wedged) = spawn_wedged_thread(); - orphans.lock().unwrap().push(wedged); - let status = join_detached_orphans( - &orphans, - std::time::Instant::now() + Duration::from_millis(50), - ) - .await; - assert_eq!( - status, - CoordinatorThreadStatus::Detached, - "a survivor at the deadline must report Detached" - ); - assert_eq!( - orphans.lock().unwrap().len(), - 1, - "a survivor must be re-parked so an idempotent re-shutdown retries" - ); - - // Release it; the next join reaps it cleanly and empties the list. - release_tx.send(()).unwrap(); - let status = tokio::time::timeout( - Duration::from_secs(5), - join_detached_orphans(&orphans, std::time::Instant::now() + Duration::from_secs(5)), - ) - .await - .expect("orphan join must complete once the thread is released"); - assert_eq!(status, CoordinatorThreadStatus::Ok); - assert!( - orphans.lock().unwrap().is_empty(), - "a joined orphan must be drained from the list" - ); - } - /// Headline regression: a coordinator thread detached past the reap /// backstop and parked in the orphans list makes a subsequent /// `shutdown()` report the result as **non-clean** — so the FFI /// `destroy` returns `ErrorShutdownIncomplete` and the host delays /// freeing the callback context the still-live thread may touch. /// - /// Non-vacuous: if `join_detached_orphans` ignored the list (or the - /// orphan were dropped at reap instead of parked), `detached_threads` - /// would be `Ok` and `all_clean()` would be `true`, failing both - /// assertions. 
+ /// Non-vacuous: if the registry dropped the orphan at reap instead of + /// parking it, `detached_threads` would be `Ok` and `all_clean()` would + /// be `true`, failing both assertions. #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn shutdown_reports_detached_orphan_as_non_clean() { let manager = make_manager(); // Stand in for the genuine-wedge outcome: an earlier tight - // stop()→start() reap had to detach a still-live coordinator thread - // past its 1 s backstop, so `reap_prior_or_park` parked it here. + // stop()->start() reap had to detach a still-live coordinator thread + // past its backstop, so the registry parked it as an orphan. let (release_tx, wedged) = spawn_wedged_thread(); - manager.coordinator_orphans.lock().unwrap().push(wedged); + manager + .registry + .park_orphan_for_test(WalletWorker::ShieldedSync, wedged); let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown()) .await @@ -1400,17 +968,133 @@ mod tests { still alive: {status:?}" ); - // Cleanup: shutdown() re-parked the survivor; release + join it so - // no live thread leaks past the test. Pop into a local first so the - // std MutexGuard is not held across the await below. + // Cleanup: shutdown() re-parked the survivor; release it and reap so + // no live thread leaks past the test. release_tx.send(()).unwrap(); - let parked = manager.coordinator_orphans.lock().unwrap().pop(); - if let Some(parked) = parked { - tokio::task::spawn_blocking(move || { - let _ = parked.join(); - }) + let _ = manager.registry.reap_orphans(Duration::from_secs(5)).await; + } + + /// TC-002 (F2): `clear_shielded` must refuse while a prior-generation + /// shielded thread is parked alive — even though the current shielded + /// quiesce is clean and the other coordinators / the always-on event + /// adapter are legitimately running. Releasing + reaping the orphan + /// lets a retry succeed. 
+ /// + /// Non-vacuous: against the pre-fix gate (only `!status.is_clean()`), + /// the clean `NotRunning` quiesce would pass the guard and wipe the + /// store under the live orphan — `clear_shielded` would return `Ok`. + #[cfg(feature = "shielded")] + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn clear_shielded_refuses_while_shielded_orphan_alive() { + let manager = make_manager(); + + // Park a wedged thread under the ShieldedSync key: a prior- + // generation shielded thread an earlier reap could not join. + let (release_tx, wedged) = spawn_wedged_thread(); + manager + .registry + .park_orphan_for_test(WalletWorker::ShieldedSync, wedged); + + assert!(manager.registry.any_alive_for(WalletWorker::ShieldedSync)); + assert!(!manager.shielded_sync_manager.is_running()); + + // Refuses: the live shielded orphan could re-persist into the store + // the wipe is about to clear. + let err = manager + .clear_shielded() .await - .unwrap(); - } + .expect_err("clear_shielded must refuse while a shielded orphan is alive"); + assert!( + matches!( + err, + crate::error::PlatformWalletError::ShieldedShutdownIncomplete { .. } + ), + "expected ShieldedShutdownIncomplete, got {err:?}" + ); + + // Release + reap the orphan; the shielded-scoped gate now clears and + // a retry succeeds (no shielded store configured → clear is a no-op). + release_tx.send(()).unwrap(); + let _ = manager.registry.reap_orphans(Duration::from_secs(5)).await; + assert!(!manager.registry.any_alive_for(WalletWorker::ShieldedSync)); + manager + .clear_shielded() + .await + .expect("clear_shielded must succeed once the orphan is reaped"); + } + + /// TC-015 (R5): `from_report` maps the registry's [`ShutdownReport`] + /// onto the FFI-stable `CoordinatorExitStatus` with identical field / + /// variant shape and `all_clean()` semantics. The full `WorkerStatus` + /// -> `CoordinatorThreadStatus` variant table is exercised. 
+ #[test] + fn from_report_maps_to_ffi_stable_exit_status() { + use dash_async::WorkerStatus; + use std::collections::BTreeMap; + + // All Ok, no orphans. + let per = BTreeMap::from([ + (WalletWorker::PlatformAddressSync, WorkerStatus::Ok), + (WalletWorker::IdentitySync, WorkerStatus::Ok), + (WalletWorker::ShieldedSync, WorkerStatus::Ok), + (WalletWorker::EventAdapter, WorkerStatus::Ok), + ]); + let status = CoordinatorExitStatus::from_report(ShutdownReport { + per_worker: per, + detached: 0, + }); + assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok); + #[cfg(feature = "shielded")] + assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok)); + #[cfg(not(feature = "shielded"))] + assert_eq!(status.shielded_sync, None); + assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok); + assert_eq!(status.detached_threads, CoordinatorThreadStatus::Ok); + assert!(status.all_clean()); + + // A surviving orphan -> Detached -> non-clean; absent workers -> + // NotRunning. + let status = CoordinatorExitStatus::from_report(ShutdownReport { + per_worker: BTreeMap::new(), + detached: 1, + }); + assert_eq!(status.detached_threads, CoordinatorThreadStatus::Detached); + assert_eq!( + status.platform_address_sync, + CoordinatorThreadStatus::NotRunning + ); + assert!(!status.all_clean()); + + // A per-worker Timeout propagates and is non-clean. + let per = BTreeMap::from([(WalletWorker::IdentitySync, WorkerStatus::Timeout)]); + let status = CoordinatorExitStatus::from_report(ShutdownReport { + per_worker: per, + detached: 0, + }); + assert_eq!(status.identity_sync, CoordinatorThreadStatus::Timeout); + assert!(!status.all_clean()); + + // Full variant mapping table. 
+ assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Stopped(Some("x".into()))), + CoordinatorThreadStatus::Stopped(Some("x".into())) + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Panicked("p".into())), + CoordinatorThreadStatus::Panicked("p".into()) + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Error("e".into())), + CoordinatorThreadStatus::Error("e".into()) + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::NotRunning), + CoordinatorThreadStatus::NotRunning + ); + assert_eq!( + CoordinatorThreadStatus::from(WorkerStatus::Detached), + CoordinatorThreadStatus::Detached + ); } } diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index 40457c4a87..e68fcfef7c 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -11,19 +11,19 @@ use std::collections::BTreeMap; use std::sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex as StdMutex, + Arc, }; -use dash_async::AtomicFlagGuard; +use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use arc_swap::ArcSwapOption; use dash_sdk::platform::address_sync::{AddressSyncConfig, AddressSyncResult}; use key_wallet::PlatformP2PKHAddress; +use super::WalletWorker; use crate::wallet::PlatformAddressTag; use tokio::sync::RwLock; -use tokio_util::sync::CancellationToken; use crate::error::PlatformWalletError; use crate::events::PlatformEventManager; @@ -97,27 +97,10 @@ impl PlatformAddressSyncSummary { pub struct PlatformAddressSyncManager { wallets: Arc>>>, event_manager: Arc, - /// Cancel token for the background loop, if running. - background_cancel: StdMutex>, - /// Join handle for the background loop's OS thread, if running. 
- /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can - /// confirm the `!Send` loop fully exited before the host drops the - /// runtime. - background_join: StdMutex>>, - /// Manager-owned orphans list (shared `Arc`). On a tight - /// `stop()`→`start()` where the prior thread is wedged past the 1 s - /// reap backstop, [`start`](Self::start) parks the still-live handle - /// here (via [`reap_prior_or_park`](super::reap_prior_or_park)) - /// instead of dropping it, so manager `shutdown()` accounts for it. - coordinator_orphans: super::CoordinatorOrphans, - /// Monotonically increasing generation counter. Bumped on every - /// `start()` so the exiting thread can tell whether its generation is - /// still the active one before clearing `background_cancel`. Without - /// this guard a tight `stop()` → `start()` reschedule lets the prior - /// thread's cleanup strip the *new* generation's token, leaving the - /// new loop running but untrackable via `is_running()` / `stop()`. - /// Mirrors the identity / shielded coordinators. - background_generation: AtomicU64, + /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` / + /// `quiesce` delegate to it under the + /// [`WalletWorker::PlatformAddressSync`] key. 
+ registry: Arc>, interval_secs: AtomicU64, is_syncing: AtomicBool, /// Set by [`quiesce`](Self::quiesce) to gate new passes while it @@ -141,15 +124,12 @@ impl PlatformAddressSyncManager { pub fn new( wallets: Arc>>>, event_manager: Arc, - coordinator_orphans: super::CoordinatorOrphans, + registry: Arc>, ) -> Self { Self { wallets, event_manager, - background_cancel: StdMutex::new(None), - background_join: StdMutex::new(None), - coordinator_orphans, - background_generation: AtomicU64::new(0), + registry, interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), quiescing: AtomicBool::new(false), @@ -185,10 +165,22 @@ impl PlatformAddressSyncManager { /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.background_cancel - .lock() - .map(|g| g.is_some()) - .unwrap_or(false) + self.registry.is_running(WalletWorker::PlatformAddressSync) + } + + /// The drain barrier handed to the registry: raise the `quiescing` + /// gate so any pass past its `is_syncing` CAS bails. The registry then + /// cancels the loop and joins the thread (the join waits for the + /// in-flight pass — incl. its completion-event dispatch — to drop and + /// `is_syncing` to clear), so this barrier is instant. + fn drain_hook(self: &Arc) -> DrainHook { + let this = Arc::clone(self); + Arc::new(move || { + let this = Arc::clone(&this); + Box::pin(async move { + this.quiescing.store(true, Ordering::Release); + }) + }) } /// Whether a sync pass is in flight right now. @@ -220,60 +212,28 @@ impl PlatformAddressSyncManager { /// The first pass runs immediately; subsequent passes fire every /// [`interval`](Self::interval). pub fn start(self: Arc) { - let mut cancel_guard = self - .background_cancel - .lock() - .unwrap_or_else(|e| e.into_inner()); - if cancel_guard.is_some() { - return; - } + // Reopen the quiescing gate so this (re)start's passes can run. 
+ self.quiescing.store(false, Ordering::Release); - // Take any handle left by a prior stop() call so we can reap it — but - // DON'T join it here, while we still hold background_cancel. stop() - // takes-and-cancels the token but never touches background_join, so a - // stop()→start() sequence would otherwise overwrite (detach) the old - // handle and shutdown() would miss that thread. Joining it under - // background_cancel would DEADLOCK the reap into its 1 s backstop: the - // exiting prior thread's epilogue also locks background_cancel (to - // clear its slot), so it would block on the lock we hold → never - // finish → get detached on the exact stop()→start() path the reap - // exists for. We install the new token + bump the generation below, - // release the lock, and only THEN reap (after this fn's tail). - let prior = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - - let cancel = CancellationToken::new(); - *cancel_guard = Some(cancel.clone()); - // Bump the generation while we still hold the slot lock so any - // prior thread's cleanup observes `current_gen != my_gen` ordered - // against this token swap. - let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; + let cfg = WorkerConfig { + weight: super::COORDINATOR_WEIGHT, + join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: Some(self.drain_hook()), + }; + // The loop drives `!Send` SDK futures via `Handle::block_on` on a + // dedicated OS thread (spawned by the registry). `biased` polls the + // cancel arm first so a pass stalled on a hung SDK fetch is dropped + // at its `.await` the instant the registry cancels. 
let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); - let join = std::thread::Builder::new() - .name("platform-address-sync".into()) - .spawn(move || { + self.registry + .start_thread(WalletWorker::PlatformAddressSync, cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { break; } - - // Race the in-flight pass against cancellation. - // `stop()` / `quiesce()` cancel the token; with - // `biased` the cancel arm is polled first, so a - // pass stalled on a hung SDK fetch is dropped at - // its `.await` the instant we cancel. Dropping the - // `sync_now` future unwinds to the `is_syncing` - // `AtomicFlagGuard` it holds, clearing the flag - // promptly — so `quiesce()`'s drain loop frees and - // the join lands well inside `shutdown()`'s - // timeout. A stalled pass can no longer strand a - // live `!Send` thread past `shutdown()`. tokio::select! { biased; _ = cancel.cancelled() => break, @@ -286,50 +246,8 @@ impl PlatformAddressSyncManager { _ = cancel.cancelled() => break, } } - - // Only clear the slot if no newer start() has - // installed a replacement token since we launched — - // mirrors the identity / shielded coordinators so a - // stop() → start() reschedule can't have this exiting - // thread strip the new generation's cancel token. - if let Ok(mut guard) = this.background_cancel.lock() { - if this.background_generation.load(Ordering::Acquire) == my_gen { - *guard = None; - } - } }); - }) - .expect("failed to spawn platform-address-sync thread"); - // Store the join handle while still holding cancel_guard — a - // concurrent quiesce() must wait for this lock before calling - // stop(), so the handle is always stored before it can be taken. 
- *self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) = Some(join); - - // Release background_cancel BEFORE reaping the prior thread, so its - // epilogue can acquire the lock, observe the bumped generation, skip - // clearing our freshly-installed token, and return. Holding the lock - // across the join below is what would block the prior thread, spin - // the full 1 s deadline, and detach — the very stall this ordering - // removes. - drop(cancel_guard); - - // Now reap the prior thread. It was already cancellation-signalled by - // stop(), and with the lock released its epilogue completes promptly, - // so is_finished() trips within a few milliseconds and the join is - // near-instant. The 1 s deadline survives only as a genuine-wedge - // backstop (e.g. a pass wedged in a Drop that never yields); if it - // fires `reap_prior_or_park` parks the still-live, already-cancelled - // thread in the manager orphans list so `shutdown()` joins it and - // reports it non-clean rather than dropping it (residual UAF). - super::reap_prior_or_park( - prior, - &self.coordinator_orphans, - std::time::Duration::from_secs(1), - "platform-address-sync", - ); + }); } /// Stop the background sync loop. No-op if not running. @@ -342,14 +260,7 @@ impl PlatformAddressSyncManager { /// the host can free the event-handler context — use /// [`quiesce`](Self::quiesce). pub fn stop(&self) { - if let Some(token) = self - .background_cancel - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take() - { - token.cancel(); - } + self.registry.cancel(WalletWorker::PlatformAddressSync); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -378,24 +289,14 @@ impl PlatformAddressSyncManager { /// the `!Send` loop has stopped touching `tokio::time` before a /// one-shot host drops the runtime. 
pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { - self.quiescing.store(true, Ordering::Release); - // RAII gate: resets `quiescing` on *every* exit path — a normal - // return, a timed-out `shutdown()` dropping this future, or a - // panic. Without it a quiesce that doesn't run to completion - // leaves the gate latched `true`, silently bailing every future - // pass. Reopening on drop is safe because `stop()` (below) has - // already cancelled the loop, so no new pass can start. + // RAII gate: reopen `quiescing` on every exit path. The registry's + // drain hook raises it inside `quiesce`; reopening on return is + // safe because the loop has been cancelled, so no new pass starts. let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.stop(); - while self.is_syncing.load(Ordering::Acquire) { - tokio::time::sleep(Duration::from_millis(20)).await; - } - let handle = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - super::join_coordinator_thread(handle).await + self.registry + .quiesce(WalletWorker::PlatformAddressSync) + .await + .into() } /// Run one sync pass across every registered wallet. @@ -543,12 +444,12 @@ mod tests { let event_manager = Arc::new(PlatformEventManager::new(vec![ Arc::clone(&counter) as Arc ])); - let orphans = Arc::new(StdMutex::new(Vec::new())); + let registry = ThreadRegistry::new(); ( Arc::new(PlatformAddressSyncManager::new( wallets, event_manager, - orphans, + registry, )), counter, ) @@ -564,113 +465,6 @@ mod tests { assert!(!mgr.is_syncing()); } - /// `quiesce()` must not return while a pass is in flight, and must - /// return promptly once the pass drains. - /// - /// Drives the real `is_syncing` lifecycle: a background task takes - /// the slot via the same `compare_exchange` the real `sync_now` - /// uses, holds it across a sleep (standing in for the pass body + - /// completion-event dispatch, which `sync_now` keeps the flag set - /// across), then clears it. 
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn quiesce_blocks_until_in_flight_pass_drains() { - let (mgr, _counter) = make_manager(); - - let holder = Arc::clone(&mgr); - let pass = tokio::spawn(async move { - assert!( - holder - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_ok(), - "test should own the is_syncing slot" - ); - tokio::time::sleep(Duration::from_millis(200)).await; - holder.is_syncing.store(false, Ordering::Release); - }); - - while !mgr.is_syncing() { - tokio::time::sleep(Duration::from_millis(5)).await; - } - - let quiesce_fut = mgr.quiesce(); - tokio::pin!(quiesce_fut); - - tokio::select! { - _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"), - _ = tokio::time::sleep(Duration::from_millis(50)) => {} - } - assert!(mgr.is_syncing(), "pass should still be in flight"); - - tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut) - .await - .expect("quiesce did not return after the pass drained"); - - assert!(!mgr.quiescing.load(Ordering::Acquire)); - assert!(!mgr.is_syncing()); - pass.await.unwrap(); - } - - /// Regression: a tight `stop()` → `start()` must reap the prior loop's - /// OS thread promptly, NOT stall on the 1 s detach backstop. - /// - /// The prior thread's exit epilogue locks `background_cancel` to - /// conditionally clear its slot. The earlier ordering held - /// `background_cancel` across the prior-handle join inside `start()`, so - /// on a back-to-back `stop()` → `start()` the exiting thread blocked on - /// that lock, never finished, and the reap spin-waited the full second - /// before detaching — a 1 s stall plus a transient untracked thread. The - /// fix installs the new token + generation, releases `background_cancel`, - /// and only then reaps, so the prior thread's epilogue runs and the join - /// lands in milliseconds. 
- /// - /// `stop()` and `start()` run back-to-back in one blocking closure - /// (mirroring the real call site) so `start()` re-acquires the lock - /// microseconds after `stop()` frees it — before the async-woken prior - /// thread can reach its epilogue. Against the old lock-held ordering this - /// reliably stalls ~1 s and fails the bound below. - #[tokio::test(flavor = "multi_thread", worker_threads = 4)] - async fn restart_after_stop_reaps_prior_thread() { - let (mgr, _counter) = make_manager(); - - // Launch the first loop and let its immediate (no-op, empty wallet - // map) pass complete so the thread parks in the interval sleep, where - // cancellation lands cleanly. - Arc::clone(&mgr).start(); - assert!(mgr.is_running()); - tokio::time::sleep(Duration::from_millis(50)).await; - - // Back-to-back cancel-only stop + restart, off the runtime so the - // synchronous reap can't starve a worker. `start()` re-grabs - // background_cancel right after `stop()` frees it. - let restart = Arc::clone(&mgr); - let elapsed = tokio::task::spawn_blocking(move || { - restart.stop(); - let started = std::time::Instant::now(); - Arc::clone(&restart).start(); - started.elapsed() - }) - .await - .unwrap(); - - assert!( - elapsed < Duration::from_millis(500), - "stop()→start() stalled for {elapsed:?}: prior thread was not \ - reaped promptly (background_cancel held across the join?)" - ); - assert!(mgr.is_running(), "restart must leave the new loop tracked"); - - // Wind the new loop down so the test leaves no live !Send thread. 
- let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce()) - .await - .expect("cleanup quiesce did not complete within 2s after restart"); - assert!( - status.is_clean(), - "cleanup quiesce ended non-cleanly: {status:?}" - ); - assert!(!mgr.is_running()); - } - /// A `sync_now()` invoked while `quiescing` is set must bail without /// running the pass — in particular, without firing the /// `on_platform_address_sync_completed` host callback. This is the diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 3c84bd7071..a930febdc7 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -28,15 +28,15 @@ use std::collections::BTreeMap; use std::sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex as StdMutex, + Arc, }; -use dash_async::AtomicFlagGuard; +use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; -use tokio_util::sync::CancellationToken; +use super::WalletWorker; use crate::events::PlatformEventManager; use crate::wallet::platform_wallet::WalletId; use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary}; @@ -141,27 +141,10 @@ pub struct ShieldedSyncManager { /// run first, so an empty slot guarantees no shielded state /// exists). coordinator_slot: Arc>>>, - /// Cancel token for the background loop, if running. - background_cancel: StdMutex>, - /// Join handle for the background loop's OS thread, if running. - /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can - /// confirm the `!Send` loop fully exited before the host drops the - /// runtime. - background_join: StdMutex>>, - /// Manager-owned orphans list (shared `Arc`). 
On a tight - /// `stop()`→`start()` where the prior thread is wedged past the 1 s - /// reap backstop, [`start`](Self::start) parks the still-live handle - /// here (via [`reap_prior_or_park`](super::reap_prior_or_park)) - /// instead of dropping it, so manager `shutdown()` accounts for it. - coordinator_orphans: super::CoordinatorOrphans, - /// Monotonically increasing generation counter. Bumped on every - /// `start()` so the exiting thread can tell whether its - /// generation is still the active one before clearing - /// `background_cancel`. Without this, a `stop()` → `start()` - /// overlap lets the prior thread's cleanup strip the new - /// generation's token, leaving the new loop running but - /// untrackable via `is_running()`. - background_generation: AtomicU64, + /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` / + /// `quiesce` delegate to it under the [`WalletWorker::ShieldedSync`] + /// key. + registry: Arc>, interval_secs: AtomicU64, is_syncing: AtomicBool, /// Set by [`quiesce`](Self::quiesce) to gate new passes while it @@ -179,15 +162,12 @@ impl ShieldedSyncManager { pub fn new( event_manager: Arc, coordinator_slot: Arc>>>, - coordinator_orphans: super::CoordinatorOrphans, + registry: Arc>, ) -> Self { Self { event_manager, coordinator_slot, - background_cancel: StdMutex::new(None), - background_join: StdMutex::new(None), - coordinator_orphans, - background_generation: AtomicU64::new(0), + registry, interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), is_syncing: AtomicBool::new(false), quiescing: AtomicBool::new(false), @@ -210,10 +190,22 @@ impl ShieldedSyncManager { /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.background_cancel - .lock() - .map(|g| g.is_some()) - .unwrap_or(false) + self.registry.is_running(WalletWorker::ShieldedSync) + } + + /// The drain barrier handed to the registry: raise the `quiescing` + /// gate so any pass past its `is_syncing` CAS bails. 
The registry then + /// cancels the loop and joins the thread (the join waits for the + /// in-flight pass — incl. its persister fan-out — to drop and + /// `is_syncing` to clear), so this barrier is instant. + fn drain_hook(self: &Arc) -> DrainHook { + let this = Arc::clone(self); + Arc::new(move || { + let this = Arc::clone(&this); + Box::pin(async move { + this.quiescing.store(true, Ordering::Release); + }) + }) } /// Whether a sync pass is in flight right now. @@ -238,67 +230,30 @@ impl ShieldedSyncManager { /// GRPC client state isn't `Send + Sync`). Same trade-off as /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start). pub fn start(self: Arc) { - let mut cancel_guard = self - .background_cancel - .lock() - .unwrap_or_else(|e| e.into_inner()); - if cancel_guard.is_some() { - return; - } + // Reopen the quiescing gate so this (re)start's passes can run. + self.quiescing.store(false, Ordering::Release); - // Take any handle left by a prior stop() call so we can reap it — but - // DON'T join it here, while we still hold background_cancel. stop() - // takes-and-cancels the token but never touches background_join, so a - // stop()→start() sequence would otherwise overwrite (detach) the old - // handle and shutdown() would miss that thread. Joining it under - // background_cancel would DEADLOCK the reap into its 1 s backstop: the - // exiting prior thread's epilogue also locks background_cancel (to - // clear its slot), so it would block on the lock we hold → never - // finish → get detached on the exact stop()→start() path the reap - // exists for. We install the new token + bump the generation below, - // release the lock, and only THEN reap (after this fn's tail). 
- let prior = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - - let cancel = CancellationToken::new(); - *cancel_guard = Some(cancel.clone()); - // Bump the generation while we still hold the slot lock so - // the load below in any prior thread's cleanup observes - // `current_gen != my_gen` ordered against this token swap. - let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1; + let cfg = WorkerConfig { + weight: super::COORDINATOR_WEIGHT, + join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: Some(self.drain_hook()), + }; + // The loop drives `!Send` SDK futures via `Handle::block_on` on a + // dedicated OS thread (spawned by the registry). The background + // cadence passes `force=false` to honor the per-wallet caught-up + // cooldown; user-initiated syncs pass `force=true` via the FFI. + // `biased` polls the cancel arm first so a pass stalled on a hung + // SDK fetch is dropped the instant the registry cancels. let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); - let join = std::thread::Builder::new() - .name("shielded-sync".into()) - .spawn(move || { + self.registry + .start_thread(WalletWorker::ShieldedSync, cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { break; } - - // Background-loop cadence — honor the - // per-wallet caught-up cooldown so a - // sleepy network doesn't refetch + - // re-trial-decrypt the partial buffer - // chunk every interval. User-initiated - // syncs pass `force=true` to the FFI - // entry point below and bypass this. - // - // Race the pass against cancellation. `stop()` / - // `quiesce()` cancel the token; with `biased` the - // cancel arm is polled first, so a pass stalled on - // a hung SDK fetch is dropped at its `.await` the - // instant we cancel. 
Dropping the `sync_now` future - // unwinds to the `is_syncing` `AtomicFlagGuard` it - // holds, clearing the flag promptly — so the drain - // loop in `quiesce()` frees and the join lands well - // inside `shutdown()`'s timeout. A stalled pass can - // no longer strand a live `!Send` thread. tokio::select! { biased; _ = cancel.cancelled() => break, @@ -311,58 +266,8 @@ impl ShieldedSyncManager { _ = cancel.cancelled() => break, } } - - // Only clear `background_cancel` if the active - // generation is still ours. Acquire the lock FIRST, - // then read/compare `background_generation` under it - // (matching identity_sync / platform_address_sync). - // Reading the generation BEFORE locking opens a - // stale-read TOCTOU: this exiting thread could observe - // a pre-bump generation, then block on the lock until a - // concurrent `start()` released it, and null the - // freshly-installed token — leaving the new loop - // running but unreflectable via `is_running()` / - // `stop()`. `start()` bumps the generation while it - // holds this same lock, so comparing under the lock - // guarantees we observe the post-swap value. - if let Ok(mut guard) = this.background_cancel.lock() { - if this.background_generation.load(Ordering::Acquire) == my_gen { - *guard = None; - } - } }); - }) - .expect("failed to spawn shielded-sync thread"); - // Store the join handle while still holding cancel_guard — a - // concurrent quiesce() must wait for this lock before calling - // stop(), so the handle is always stored before it can be taken. - *self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) = Some(join); - - // Release background_cancel BEFORE reaping the prior thread, so its - // epilogue can observe the bumped generation (and skip clearing our - // freshly-installed token) without contending the lock we hold. 
- // Holding the lock across the join below is what would block the - // prior thread, spin the full 1 s deadline, and detach — the very - // stall this ordering removes. - drop(cancel_guard); - - // Now reap the prior thread. It was already cancellation-signalled by - // stop(), and with the lock released its epilogue completes promptly, - // so is_finished() trips within a few milliseconds and the join is - // near-instant. The 1 s deadline survives only as a genuine-wedge - // backstop (e.g. a pass wedged in a Drop that never yields); if it - // fires `reap_prior_or_park` parks the still-live, already-cancelled - // thread in the manager orphans list so `shutdown()` joins it and - // reports it non-clean rather than dropping it (residual UAF). - super::reap_prior_or_park( - prior, - &self.coordinator_orphans, - std::time::Duration::from_secs(1), - "shielded-sync", - ); + }); } /// Stop the background sync loop. No-op if not running. @@ -374,14 +279,7 @@ impl ShieldedSyncManager { /// nothing more will be persisted" barrier — required by Clear, /// unregister, and rebind — use [`quiesce`](Self::quiesce). pub fn stop(&self) { - if let Some(token) = self - .background_cancel - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take() - { - token.cancel(); - } + self.registry.cancel(WalletWorker::ShieldedSync); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -408,24 +306,14 @@ impl ShieldedSyncManager { /// the `!Send` loop has stopped touching `tokio::time` before a /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { - self.quiescing.store(true, Ordering::Release); - // RAII gate: resets `quiescing` on *every* exit path — a normal - // return, a timed-out `shutdown()` / Clear dropping this future, - // or a panic. Without it a quiesce that doesn't run to completion - // leaves the gate latched `true`, silently bailing every future - // pass. 
Reopening on drop is safe because `stop()` (below) has - // already cancelled the loop, so no new pass can start. + // RAII gate: reopen `quiescing` on every exit path. The registry's + // drain hook raises it inside `quiesce`; reopening on return is + // safe because the loop has been cancelled, so no new pass starts. let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.stop(); - while self.is_syncing.load(Ordering::Acquire) { - tokio::time::sleep(Duration::from_millis(20)).await; - } - let handle = self - .background_join - .lock() - .unwrap_or_else(|e| e.into_inner()) - .take(); - super::join_coordinator_thread(handle).await + self.registry + .quiesce(WalletWorker::ShieldedSync) + .await + .into() } /// Run one sync pass across every registered wallet. @@ -583,90 +471,3 @@ impl std::fmt::Debug for ShieldedSyncManager { .finish() } } - -// The whole module is already `#[cfg(feature = "shielded")]`-gated at its -// `mod` declaration (manager/mod.rs), so these tests compile only under that -// feature — no extra per-test gate needed. -#[cfg(test)] -mod tests { - use super::*; - - /// Build a manager over an **empty** coordinator slot wired to a - /// handler-less event manager. An empty slot makes every `sync_now` - /// pass a no-op (empty-coordinator handling returns immediately), so - /// the background loop parks in its interval sleep — exactly where - /// cancellation lands cleanly — without needing a live SDK / network. - /// That is all the start/stop/restart thread-lifecycle tests below - /// exercise. - fn make_manager() -> Arc { - let coordinator_slot = Arc::new(RwLock::new(None)); - let event_manager = Arc::new(PlatformEventManager::new(vec![])); - let orphans = Arc::new(StdMutex::new(Vec::new())); - Arc::new(ShieldedSyncManager::new( - event_manager, - coordinator_slot, - orphans, - )) - } - - /// Regression: a tight `stop()` → `start()` must reap the prior loop's - /// OS thread promptly, NOT stall on the 1 s detach backstop. 
- /// - /// The prior thread's exit epilogue locks `background_cancel` to - /// conditionally clear its slot. The earlier ordering held - /// `background_cancel` across the prior-handle join inside `start()`, so - /// on a back-to-back `stop()` → `start()` the exiting thread blocked on - /// that lock, never finished, and the reap spin-waited the full second - /// before detaching — a 1 s stall plus a transient untracked thread. The - /// fix installs the new token + generation, releases `background_cancel`, - /// and only then reaps, so the prior thread's epilogue runs and the join - /// lands in milliseconds. Mirrors the identity-sync and - /// platform-address-sync siblings. - /// - /// `stop()` and `start()` run back-to-back in one blocking closure - /// (mirroring the real call site) so `start()` re-acquires the lock - /// microseconds after `stop()` frees it — before the async-woken prior - /// thread can reach its epilogue. Against the old lock-held ordering this - /// reliably stalls ~1 s and fails the bound below. - #[tokio::test(flavor = "multi_thread", worker_threads = 4)] - async fn restart_after_stop_reaps_prior_thread() { - let mgr = make_manager(); - - // Launch the first loop and let its immediate (no-op, empty - // coordinator) pass complete so the thread parks in the interval - // sleep, where cancellation lands cleanly. - Arc::clone(&mgr).start(); - assert!(mgr.is_running()); - tokio::time::sleep(Duration::from_millis(50)).await; - - // Back-to-back cancel-only stop + restart, off the runtime so the - // synchronous reap can't starve a worker. `start()` re-grabs - // background_cancel right after `stop()` frees it. 
- let restart = Arc::clone(&mgr); - let elapsed = tokio::task::spawn_blocking(move || { - restart.stop(); - let started = std::time::Instant::now(); - Arc::clone(&restart).start(); - started.elapsed() - }) - .await - .unwrap(); - - assert!( - elapsed < Duration::from_millis(500), - "stop()→start() stalled for {elapsed:?}: prior thread was not \ - reaped promptly (background_cancel held across the join?)" - ); - assert!(mgr.is_running(), "restart must leave the new loop tracked"); - - // Wind the new loop down so the test leaves no live !Send thread. - let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce()) - .await - .expect("cleanup quiesce did not complete within 2s after restart"); - assert!( - status.is_clean(), - "cleanup quiesce ended non-cleanly: {status:?}" - ); - assert!(!mgr.is_running()); - } -} From d190f298d1ead1f056e46a13550419d42f660a45 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Wed, 24 Jun 2026 23:29:50 +0200 Subject: [PATCH 20/29] test(dash-async): anchor DrainHook compile_fail doctest to E0277 + note assert asymmetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QA-002: anchor the DrainHook Send+Sync compile_fail doctest to E0277 so it verifies the !Send capture is rejected for the right reason (unsatisfied Send bound) and cannot pass vacuously on an unrelated compile error. QA-001: document the runtime-flavor assert asymmetry — start_thread and shutdown assert a multi-thread runtime but start_task does not, so a task-only consumer (rs-dapi) on a current_thread runtime would panic late at shutdown(). The wallet always uses start_thread, so it trips the assert at start and is unaffected. Fix deferred to the rs-dapi adoption PR. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-dash-async/src/registry.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs index 802ca3598c..328ef419bc 100644 --- a/packages/rs-dash-async/src/registry.rs +++ b/packages/rs-dash-async/src/registry.rs @@ -122,9 +122,11 @@ impl ShutdownReport { /// registry never owns domain semantics. /// /// The captured state must be `Send + Sync`; a `!Send` capture does not -/// compile as a `DrainHook`: +/// compile as a `DrainHook`. The fence is anchored to `E0277` (unsatisfied +/// `Send` bound) so the test cannot pass vacuously on some unrelated +/// compile error: /// -/// ```compile_fail +/// ```compile_fail,E0277 /// use std::rc::Rc; /// use std::sync::Arc; /// use dash_async::DrainHook; @@ -331,6 +333,18 @@ impl ThreadRegistry { /// Start a tokio-task worker for `Send` futures. Same restart-reap /// semantics as [`start_thread`](Self::start_thread); does not require /// a multi-thread runtime. + /// + // TODO(rs-dapi-adoption): runtime-flavor assert is asymmetric. + // `start_thread` and `shutdown` assert a multi-thread runtime (the + // OS-thread `block_on` needs the shared reactor), but `start_task` does + // not — a task only needs a runtime handle. So a TASK-ONLY consumer + // (rs-dapi, no `start_thread`) can register and run workers on a + // `current_thread` runtime, then panic LATE when it finally calls + // `shutdown()`. The wallet (which always uses `start_thread`) trips the + // assert at start, so it is unaffected. Fix when rs-dapi adopts the + // registry: either drop the assert from `shutdown` for all-task + // registries (track whether any OS-thread worker was ever started) or + // assert in `start_task` too and require multi-thread everywhere. 
pub fn start_task(self: &Arc, key: K, cfg: WorkerConfig, body: F) where F: FnOnce(CancellationToken) -> Fut + Send + 'static, @@ -498,6 +512,8 @@ impl ThreadRegistry { /// (drain-hook -> cancel -> join) run concurrently within a tier; /// orphan reap runs last. **Requires a multi-thread runtime.** pub async fn shutdown(&self) -> ShutdownReport { + // TODO(rs-dapi-adoption): see `start_task` — this assert is the late + // panic point for a task-only consumer on a current_thread runtime. Self::assert_multi_thread("shutdown"); // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in From 3e81fc1bb9c0ede295c0d865c98afe858ea0cb99 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 01:03:37 +0200 Subject: [PATCH 21/29] fix(dash-async,platform-wallet): harden ThreadRegistry lifecycle + doc accuracy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply consolidated review findings to the shared ThreadRegistry and the wallet's shutdown/clear paths. Concurrency hardening (with regression tests): - generation-guard quiesce(): a concurrent same-key restart can no longer have its fresh live handle parked or reported Timeout — the superseded quiesce returns NotRunning and the gen-scoped Repark leaves the new handle alone. - graceful spawn-failure rollback: start_thread no longer .expect()s on thread spawn; a failure re-installs the prior handle (never detached), clears the running flag, and returns instead of panicking across the FFI boundary. - panic-safe epilogue: an EpilogueGuard drop-guard runs the gen-gated epilogue even when a worker body unwinds, so is_running() reflects a crash and start() can relaunch it. - closing latch: shutdown() latches the registry closed under the slot lock before snapshotting tiers; start_thread/start_task refuse new workers once teardown begins, so a start racing shutdown cannot leave an un-joined worker behind. 
- clear_shielded holds the shielded quiescing gate across its liveness check and store wipe (closes the direct sync_now/sync_wallet TOCTOU) and documents the residual start-vs-clear host-serialization precondition. Hygiene: - feature-gate park_orphan_for_test behind a new test-util feature so the mutation seam never ships in a downstream production build. - add Debug to WorkerConfig/ThreadRegistry; add # Panics docs to start_thread/start_task/shutdown; drop a tombstone comment. - correct the WorkerStatus<->CoordinatorThreadStatus "byte-identical" claim to "exhaustive by-name match, never a layout cast". - repoint dead [CoordinatorOrphans] and [spawn_wallet_event_adapter] intra-doc links at the surviving registry / renamed loop. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-dash-async/Cargo.toml | 6 + packages/rs-dash-async/src/registry.rs | 391 ++++++++++++++++-- packages/rs-platform-wallet/Cargo.toml | 3 + .../src/changeset/core_bridge.rs | 2 +- .../rs-platform-wallet/src/manager/mod.rs | 34 +- .../src/manager/shielded_sync.rs | 10 + 6 files changed, 404 insertions(+), 42 deletions(-) diff --git a/packages/rs-dash-async/Cargo.toml b/packages/rs-dash-async/Cargo.toml index 69d180e568..a567cc60ae 100644 --- a/packages/rs-dash-async/Cargo.toml +++ b/packages/rs-dash-async/Cargo.toml @@ -7,6 +7,12 @@ authors = ["Dash Core Team"] license = "MIT" description = "Async-sync bridging utilities for Dash Platform" +[features] +# Exposes cross-crate test seams (e.g. `ThreadRegistry::park_orphan_for_test`) +# so downstream crates can drive registry regression tests without shipping +# the seam in their production builds. 
+test-util = [] + [dependencies] thiserror = "2.0" tracing = "0.1.41" diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs index 328ef419bc..982dd6b57c 100644 --- a/packages/rs-dash-async/src/registry.rs +++ b/packages/rs-dash-async/src/registry.rs @@ -34,6 +34,7 @@ use std::collections::BTreeMap; use std::future::Future; use std::pin::Pin; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -60,9 +61,11 @@ pub struct ShutdownWeight(pub i32); // Status // --------------------------------------------------------------------- -/// Terminal status of one worker. Variant set and payloads are -/// byte-identical to the wallet's `CoordinatorThreadStatus`, which is -/// constructed from this via `From` so the FFI surface stays stable. +/// Terminal status of one worker. Its variant set and payloads correspond +/// 1:1 to the wallet's `CoordinatorThreadStatus`, which is built from this +/// via an exhaustive by-name `From` so the FFI surface stays stable. The +/// two enums keep their own declaration order and carry no `#[repr]`, so +/// the mapping is a match, never a layout-compatible cast. #[derive(Clone, Debug, PartialEq, Eq)] pub enum WorkerStatus { /// The loop exited and its thread/task joined cleanly. @@ -163,6 +166,18 @@ impl Default for WorkerConfig { } } +impl std::fmt::Debug for WorkerConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // `drain` is a boxed closure with no useful `Debug`; render its + // presence instead. 
+ f.debug_struct("WorkerConfig") + .field("weight", &self.weight) + .field("drain", &self.drain.is_some()) + .field("join_budget", &self.join_budget) + .finish() + } +} + // --------------------------------------------------------------------- // Internal handle + slot state // --------------------------------------------------------------------- @@ -256,6 +271,28 @@ pub struct ThreadRegistry { slots: Mutex>, orphans: Mutex>, reap_backstop: Duration, + /// One-way teardown latch. [`shutdown`](Self::shutdown) sets it under + /// the slot lock before snapshotting tiers; `start_thread`/`start_task` + /// honour it under the same lock and refuse to register a new worker + /// once teardown has begun, so a start racing shutdown can never leave + /// an un-joined worker behind. + closing: AtomicBool, + /// Test seam: when set, the next OS-thread spawn returns an injected + /// `io::Error` instead of really spawning, so the spawn-failure + /// rollback path can be exercised deterministically. + #[cfg(test)] + force_spawn_failure: AtomicBool, +} + +impl std::fmt::Debug for ThreadRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ThreadRegistry") + .field("live_slots", &self.lock_slots().len()) + .field("orphans", &self.lock_orphans().len()) + .field("reap_backstop", &self.reap_backstop) + .field("closing", &self.closing.load(Ordering::Acquire)) + .finish() + } } impl ThreadRegistry { @@ -271,6 +308,9 @@ impl ThreadRegistry { slots: Mutex::new(BTreeMap::new()), orphans: Mutex::new(Vec::new()), reap_backstop: backstop, + closing: AtomicBool::new(false), + #[cfg(test)] + force_spawn_failure: AtomicBool::new(false), }) } @@ -279,10 +319,21 @@ impl ThreadRegistry { /// internally — the `!Send` value never crosses the spawn boundary /// (`body` itself is `Send`). Starting a key that already has a live /// worker is a no-op; a key whose prior thread has not been reaped is - /// reaped-or-parked first (the restart-reap path). 
+ /// reaped-or-parked first (the restart-reap path). After + /// [`shutdown`](Self::shutdown) has begun the call is also a no-op (the + /// one-way closing latch). /// /// **Requires a multi-thread runtime**: the worker drives its loop /// via `Handle::block_on` and needs the shared timer/IO driver. + /// + /// # Panics + /// + /// Panics if called outside a multi-thread Tokio runtime (see + /// [`shutdown`](Self::shutdown)). It does **not** panic on thread-spawn + /// failure: a failed spawn (e.g. the OS thread-count limit) is rolled + /// back — the prior handle is re-installed rather than detached and the + /// slot returns to not-running — and the call simply does not start a + /// worker. pub fn start_thread(self: &Arc, key: K, cfg: WorkerConfig, body: F) where F: FnOnce(CancellationToken) + Send + 'static, @@ -290,6 +341,11 @@ impl ThreadRegistry { Self::assert_multi_thread("start_thread"); let prior = { let mut slots = self.lock_slots(); + // One-way teardown latch: refuse new workers once shutdown has + // begun, under the same lock shutdown snapshots tiers with. + if self.closing.load(Ordering::Acquire) { + return; + } let slot = slots.entry(key).or_insert_with(SlotState::dormant); if slot.cancel.is_some() { return; @@ -308,18 +364,45 @@ impl ThreadRegistry { let reg = Arc::clone(self); let body_token = token; - let join = std::thread::Builder::new() - .name(format!("tr-worker-{key:?}")) - .spawn(move || { - body(body_token); - reg.run_epilogue(key, my_gen); - }) - .expect("failed to spawn registry worker thread"); - // Store the handle while still under the slot lock; the guard - // is released at the end of this block, BEFORE the reap below - // (R1: store handle -> drop guard -> THEN reap-or-park). 
- slot.handle = Some(WorkerHandle::OsThread(join)); - prior + // Build the epilogue drop-guard INSIDE the worker closure, not + // here: on a spawn failure the closure is dropped while we still + // hold the slot lock, and a guard constructed out here would run + // `run_epilogue` (which re-locks `slots`) on that drop and + // deadlock. Constructing it inside means it only exists once the + // thread is actually running. A panicking `body` then still + // clears this generation's running flag via the guard's Drop + // (under `panic = "unwind"`), and the panic keeps unwinding so + // the join handle still classifies as `Panicked`. + match self.spawn_os_thread(key, move || { + let _epilogue = EpilogueGuard { reg, key, my_gen }; + body(body_token); + }) { + Ok(join) => { + // Store the handle while still under the slot lock; the + // guard is released at the end of this block, BEFORE the + // reap below (R1: store handle -> drop guard -> THEN + // reap-or-park). + slot.handle = Some(WorkerHandle::OsThread(join)); + prior + } + Err(e) => { + // Spawn failed (e.g. EAGAIN at the OS thread ceiling). + // Roll back so the prior handle is never detached and + // the slot is not left wedged "running": re-install + // prior, clear the running flag. `generation` stays + // bumped (it is only ever monotonic), which is harmless + // — the next start reaps the re-installed prior. + tracing::error!( + ?key, + error = %e, + "failed to spawn registry worker thread; rolling back \ + start (prior handle re-installed, not detached)" + ); + slot.cancel = None; + slot.handle = prior; + None + } + } }; // The prior thread was cancellation-signalled by a preceding @@ -345,6 +428,12 @@ impl ThreadRegistry { // registry: either drop the assert from `shutdown` for all-task // registries (track whether any OS-thread worker was ever started) or // assert in `start_task` too and require multi-thread everywhere. 
+ /// + /// # Panics + /// + /// Panics if called outside a Tokio runtime context (`tokio::spawn`'s + /// own precondition). After [`shutdown`](Self::shutdown) has begun the + /// call is a no-op (the one-way closing latch). pub fn start_task(self: &Arc, key: K, cfg: WorkerConfig, body: F) where F: FnOnce(CancellationToken) -> Fut + Send + 'static, @@ -352,6 +441,10 @@ impl ThreadRegistry { { let prior = { let mut slots = self.lock_slots(); + // One-way teardown latch — see `start_thread`. + if self.closing.load(Ordering::Acquire) { + return; + } let slot = slots.entry(key).or_insert_with(SlotState::dormant); if slot.cancel.is_some() { return; @@ -367,9 +460,12 @@ impl ThreadRegistry { let reg = Arc::clone(self); let body_token = token; + // Drop-guard epilogue, same rationale as `start_thread`: a task + // whose future panics still clears its running flag via the + // guard's Drop during unwind. let join = tokio::spawn(async move { + let _epilogue = EpilogueGuard { reg, key, my_gen }; body(body_token).await; - reg.run_epilogue(key, my_gen); }); slot.handle = Some(WorkerHandle::Task(join)); prior @@ -385,7 +481,7 @@ impl ThreadRegistry { .unwrap_or(false) } - /// Signal-only cancellation of one worker (was `stop()`). + /// Signal-only cancellation of one worker. pub fn cancel(&self, key: K) { if let Some(slot) = self.lock_slots().get_mut(&key) { if let Some(token) = slot.cancel.take() { @@ -409,13 +505,14 @@ impl ThreadRegistry { /// it; on the managed timeout — or if this future is dropped /// mid-poll — the handle is re-parked into the orphan list. [F1 FIX] pub async fn quiesce(&self, key: K) -> WorkerStatus { - // Snapshot the drain hook + budget, and bail early if nothing is - // registered for this key. - let (drain, budget) = { + // Snapshot the drain hook + budget + generation, and bail early if + // nothing is registered for this key. The generation is the anchor + // for the supersede guard below. 
+ let (drain, budget, my_gen) = { let slots = self.lock_slots(); match slots.get(&key) { Some(s) if s.cancel.is_some() || s.handle.is_some() => { - (s.drain.clone(), s.join_budget) + (s.drain.clone(), s.join_budget, s.generation) } _ => return WorkerStatus::NotRunning, } @@ -427,29 +524,46 @@ impl ThreadRegistry { drain().await; } - // Signal-only cancel. + // Signal-only cancel — but only if this is still the generation we + // snapshotted. A concurrent restart (which can proceed the instant + // we take `cancel` below) bumps the generation; taking the new + // token here would silently un-track the fresh worker. if let Some(slot) = self.lock_slots().get_mut(&key) { - if let Some(token) = slot.cancel.take() { - token.cancel(); + if slot.generation == my_gen { + if let Some(token) = slot.cancel.take() { + token.cancel(); + } } } // Poll-join within budget. The re-park guard moves the slot's // still-live handle into orphans if this future is dropped before - // the loop finishes — the handle is never owned by this frame. - let _repark = Repark { reg: self, key }; + // the loop finishes — the handle is never owned by this frame. Both + // the guard and the loop are generation-scoped, so a concurrent + // same-key restart's live handle is never parked or classified by + // the quiesce that cancelled the *prior* generation. + let _repark = Repark { + reg: self, + key, + my_gen, + }; let deadline = Instant::now() + budget; loop { enum Step { Classify(WorkerHandle), Park(WorkerHandle), NotRunning, + Superseded, Wait, } let step = { let mut slots = self.lock_slots(); match slots.get_mut(&key) { None => Step::NotRunning, + // A restart replaced the generation we were draining: + // the handle now in the slot belongs to a newer, live + // worker the restart owns. Leave it untouched. 
+ Some(slot) if slot.generation != my_gen => Step::Superseded, Some(slot) => match slot.handle.take_if(|h| h.is_finished()) { Some(h) => Step::Classify(h), None if slot.handle.is_none() => Step::NotRunning, @@ -466,7 +580,7 @@ impl ThreadRegistry { self.lock_orphans().push((key, h)); return WorkerStatus::Timeout; } - Step::NotRunning => return WorkerStatus::NotRunning, + Step::NotRunning | Step::Superseded => return WorkerStatus::NotRunning, Step::Wait => tokio::time::sleep(Duration::from_millis(5)).await, } } @@ -511,15 +625,31 @@ impl ThreadRegistry { /// Weight-ordered teardown: ascending tier by tier, each worker's /// (drain-hook -> cancel -> join) run concurrently within a tier; /// orphan reap runs last. **Requires a multi-thread runtime.** + /// + /// Latches the registry closed first (under the slot lock, before the + /// tier snapshot), so any `start_thread`/`start_task` racing teardown is + /// either already in the snapshot or refused outright — shutdown is a + /// one-way door and never leaves a worker un-joined. Idempotent. + /// + /// # Panics + /// + /// Panics if called outside a multi-thread Tokio runtime: an OS-thread + /// worker drives its loop via `Handle::block_on` and needs the shared + /// timer/IO driver, so a `current_thread` runtime would deadlock the + /// join. pub async fn shutdown(&self) -> ShutdownReport { // TODO(rs-dapi-adoption): see `start_task` — this assert is the late // panic point for a task-only consumer on a current_thread runtime. Self::assert_multi_thread("shutdown"); // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in - // ascending weight order, giving the lower-first drain. + // ascending weight order, giving the lower-first drain. Latch the + // registry closed under the same lock and before the snapshot so a + // racing start is serialized: it either landed before this lock (and + // is in the snapshot) or sees `closing` and bails. 
let tiers: BTreeMap> = { let slots = self.lock_slots(); + self.closing.store(true, Ordering::Release); let mut tiers: BTreeMap> = BTreeMap::new(); for (key, slot) in slots.iter() { tiers.entry(slot.weight).or_default().push(*key); @@ -574,9 +704,9 @@ impl ThreadRegistry { ); } - /// Gen-gated exit epilogue, run on the worker after its body returns: - /// clear this slot's running flag only if a newer start has not since - /// installed a replacement. + /// Gen-gated exit epilogue, run on the worker after its body returns + /// (or unwinds): clear this slot's running flag only if a newer start + /// has not since installed a replacement. fn run_epilogue(&self, key: K, my_gen: u64) { if let Some(slot) = self.lock_slots().get_mut(&key) { if slot.generation == my_gen { @@ -585,6 +715,22 @@ impl ThreadRegistry { } } + /// Spawn the named OS worker thread, surfacing a spawn failure as + /// `io::Result` instead of panicking so the caller can roll back. The + /// `#[cfg(test)]` seam forces a synthetic failure to exercise that path. + fn spawn_os_thread(&self, key: K, closure: C) -> std::io::Result> + where + C: FnOnce() + Send + 'static, + { + #[cfg(test)] + if self.force_spawn_failure.load(Ordering::Acquire) { + return Err(std::io::Error::other("forced spawn failure (test seam)")); + } + std::thread::Builder::new() + .name(format!("tr-worker-{key:?}")) + .spawn(closure) + } + /// Reap a restarted key's prior worker — or park it if it is genuinely /// wedged past the reap backstop. Must be called with no registry lock /// held (it spins synchronously for an OS thread). @@ -670,7 +816,9 @@ impl ThreadRegistry { /// Test-only seam: park a raw thread handle as an orphan under `key`. /// Used by cross-crate regression tests (e.g. the wallet's F2 gate) /// that must inject a wedged prior-generation thread without driving - /// the full restart-reap path. + /// the full restart-reap path. 
Feature-gated behind `test-util` so it + /// never ships in a production build of a downstream consumer. + #[cfg(any(test, feature = "test-util"))] #[doc(hidden)] pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) { self.lock_orphans() @@ -688,19 +836,27 @@ fn slot_alive(slot: &SlotState) -> bool { /// the slot's still-live handle into the orphan list instead of letting it /// be dropped-and-detached. On normal completion the handle has already /// been taken from the slot, so this is a no-op. +/// +/// Generation-scoped: it only re-parks the handle if the slot still holds +/// the generation `quiesce` was draining. A concurrent same-key restart +/// bumps the generation and installs its own live handle; this guard leaves +/// that fresh handle alone. struct Repark<'a, K: RegistryKey> { reg: &'a ThreadRegistry, key: K, + my_gen: u64, } impl Drop for Repark<'_, K> { fn drop(&mut self) { // Take the handle under the slot lock, release it, then push to - // orphans — never nest the two locks. + // orphans — never nest the two locks. Skip if a restart superseded + // our generation (the handle is the new worker's, not ours). let handle = self .reg .lock_slots() .get_mut(&self.key) + .filter(|slot| slot.generation == self.my_gen) .and_then(|slot| slot.handle.take()); if let Some(handle) = handle { self.reg.lock_orphans().push((self.key, handle)); @@ -708,6 +864,27 @@ impl Drop for Repark<'_, K> { } } +/// Worker-side exit guard. Runs the generation-gated [`run_epilogue`] +/// from its `Drop`, so a worker whose `body` returns normally **or** +/// unwinds on panic still clears its running flag — `is_running()` then +/// reflects reality and `start()` can relaunch a crashed loop. +/// +/// Panic-strategy caveat (same as `AtomicFlagGuard`): the clear-on-panic +/// half relies on `Drop` running while the stack unwinds, so it holds under +/// `panic = "unwind"`. 
Under `panic = "abort"` a worker panic aborts the +/// process and there is no "after" to gate. +struct EpilogueGuard { + reg: Arc>, + key: K, + my_gen: u64, +} + +impl Drop for EpilogueGuard { + fn drop(&mut self) { + self.reg.run_epilogue(self.key, self.my_gen); + } +} + #[cfg(test)] mod tests { use super::*; @@ -1341,4 +1518,150 @@ mod tests { assert!(cfg.drain.is_none()); assert_eq!(cfg.join_budget, DEFAULT_JOIN_BUDGET); } + + // ----- Group 6: concurrency-hazard regressions -------------------- + + /// `quiesce` is generation-guarded. A same-key restart that lands after + /// quiesce takes the prior's cancel must not have its fresh, live handle + /// parked or reported `Timeout`: the superseded quiesce returns + /// `NotRunning` and the new generation survives. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn quiesce_generation_guard_spares_concurrent_restart() { + let reg = ThreadRegistry::<&str>::new(); + // gen-1: a task that ignores cancellation (pending forever), with a + // tiny join budget so a non-guarded quiesce would Timeout quickly. + reg.start_task( + "k", + WorkerConfig { + join_budget: Duration::from_millis(150), + ..WorkerConfig::default() + }, + |_cancel| async move { std::future::pending::<()>().await }, + ); + + // Drive quiesce concurrently; it snapshots gen=1, cancels (ignored), + // and enters the poll loop with cancel already taken. + let reg_q = Arc::clone(&reg); + let q = tokio::spawn(async move { reg_q.quiesce("k").await }); + + // Let quiesce pass cancel.take() so a restart can proceed. + tokio::time::sleep(Duration::from_millis(40)).await; + + // Restart: cancel is now None, so this proceeds — it takes gen-1's + // live handle as its prior (parked) and installs gen-2. + reg.start_task("k", WorkerConfig::default(), |cancel| async move { + cancel.cancelled().await; + }); + + // The superseded quiesce must NOT park gen-2 / report Timeout. 
+ let status = q.await.unwrap(); + assert_eq!( + status, + WorkerStatus::NotRunning, + "superseded quiesce returns NotRunning, never a spurious Timeout" + ); + assert!(reg.is_running("k"), "gen-2 survives the racing quiesce"); + + // gen-2 quiesces cleanly. + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + } + + /// A thread-spawn failure must neither panic nor detach the live prior + /// handle: it rolls back (prior re-installed, running flag cleared) and + /// the slot stays usable / reapable. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn spawn_failure_reparks_live_prior_without_panic() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + // gen-1: wedged (ignores cancel), stays live until released. + reg.start_thread("k", WorkerConfig::default(), wedged_body(release_rx)); + // cancel() takes the token (slot.cancel = None) but the wedged thread + // keeps running — the slot now holds a LIVE prior handle with cancel + // cleared, the exact shape a racing restart would take as its prior. + reg.cancel("k"); + assert!(!reg.is_running("k")); + + // Force the restart's spawn to fail; it must not panic. + reg.force_spawn_failure.store(true, Ordering::Release); + reg.start_thread("k", WorkerConfig::default(), |_cancel| {}); + assert!( + !reg.is_running("k"), + "failed spawn clears the running flag, never leaves it wedged" + ); + assert!(reg.any_alive(), "live prior re-installed, never detached"); + + // Recover: release the prior; quiesce reaps the now-finished handle + // cleanly, proving it was owned (not leaked/detached) and the slot is + // not wedged. 
+ reg.force_spawn_failure.store(false, Ordering::Release); + release_tx.send(()).unwrap(); + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + assert!(!reg.any_alive()); + } + + /// A panicking worker body still runs its epilogue (via the drop-guard), + /// so `is_running()` reflects the crash and `start()` can relaunch the + /// loop instead of silently no-op'ing. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn panicked_worker_clears_running_and_allows_restart() { + let reg = ThreadRegistry::<&str>::new(); + // A worker whose body panics immediately. + reg.start_thread("k", WorkerConfig::default(), |_cancel| { + panic!("deliberate worker-body panic"); + }); + + // The drop-guard epilogue clears the running flag despite the panic. + let mut waited = Duration::ZERO; + while reg.is_running("k") && waited < Duration::from_secs(2) { + tokio::time::sleep(Duration::from_millis(5)).await; + waited += Duration::from_millis(5); + } + assert!( + !reg.is_running("k"), + "panicked worker clears its running flag via the epilogue guard" + ); + + // start() can relaunch a crashed loop (no longer a silent no-op). + let ran = Arc::new(AtomicBool::new(false)); + let ran_w = Arc::clone(&ran); + let handle = Handle::current(); + reg.start_thread("k", WorkerConfig::default(), move |cancel| { + ran_w.store(true, Ordering::Release); + handle.block_on(async move { cancel.cancelled().await }); + }); + assert!( + reg.is_running("k"), + "start() relaunches a previously-panicked worker" + ); + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + assert!( + ran.load(Ordering::Acquire), + "restarted worker body executed" + ); + } + + /// `shutdown()` latches the registry closed: a start racing (or + /// following) teardown is refused, so no worker is left un-joined. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn shutdown_latches_closed_refusing_new_workers() { + let reg = ThreadRegistry::<&str>::new(); + start_clean(&reg, "live", WorkerConfig::default()); + let report = reg.shutdown().await; + assert!(report.all_clean()); + + // One-way door: both worker kinds are refused after shutdown. + start_clean(&reg, "late_thread", WorkerConfig::default()); + assert!( + !reg.is_running("late_thread"), + "start_thread after shutdown is refused" + ); + reg.start_task("late_task", WorkerConfig::default(), |cancel| async move { + cancel.cancelled().await; + }); + assert!( + !reg.is_running("late_task"), + "start_task after shutdown is refused" + ); + assert!(!reg.any_alive(), "nothing started post-shutdown"); + } } diff --git a/packages/rs-platform-wallet/Cargo.toml b/packages/rs-platform-wallet/Cargo.toml index e324680210..5398e9c009 100644 --- a/packages/rs-platform-wallet/Cargo.toml +++ b/packages/rs-platform-wallet/Cargo.toml @@ -81,6 +81,9 @@ name = "shielded_chunk_timing_bench" required-features = ["shielded"] [dev-dependencies] +# Enables `ThreadRegistry::park_orphan_for_test` for the manager's F2-gate +# regression tests; the seam is feature-gated so it never ships in release. +dash-async = { path = "../rs-dash-async", features = ["test-util"] } # Used by `examples/shielded_chunk_timing_bench.rs` and # `tests/shielded_decrypt_bench.rs` to assemble per-chunk wire # fixtures and decode the `ShieldedEncryptedNote` wire type. diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs index 9e22d9e6f2..13a177cb47 100644 --- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs +++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs @@ -3,7 +3,7 @@ //! Upstream `key_wallet_manager::WalletManager` exposes a //! `broadcast::Sender` and a `subscribe_events()` accessor //! returning a `broadcast::Receiver`; consumers attach at -//! 
startup and drain the stream. [`spawn_wallet_event_adapter`] is the +//! startup and drain the stream. [`wallet_event_adapter_loop`] is the //! platform-wallet-side consumer: a tokio task that pulls events off //! that broadcast, projects each one into a //! [`CoreChangeSet`](crate::changeset::CoreChangeSet), wraps it in a diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index d03dcccf7b..51e14c2524 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -145,7 +145,7 @@ pub enum CoordinatorThreadStatus { /// `stop()`→`start()` reap had to detach past its 1 s wedge-backstop /// was still alive at the shutdown deadline. /// - /// Such a thread was parked in the manager's [`CoordinatorOrphans`] + /// Such a thread was parked in the shared [`ThreadRegistry`]'s orphan /// list (not silently dropped) precisely so this case is visible. /// A still-live detached thread keeps an `Arc` to the host event /// handler and may fire one final callback, so the host must NOT @@ -165,9 +165,12 @@ impl CoordinatorThreadStatus { } /// Relocate a registry [`WorkerStatus`](dash_async::WorkerStatus) into the -/// FFI-stable `CoordinatorThreadStatus`. The variant set and payloads are -/// identical by construction, so this is a byte-stable 1:1 mapping — the -/// FFI `destroy` / shielded-stop adapters keep reading the same shape. +/// FFI-stable `CoordinatorThreadStatus`. The variant sets and payloads +/// correspond 1:1, so the body is an exhaustive by-name `From` match that +/// the compiler keeps total. The two enums intentionally keep their own +/// declaration order and carry no `#[repr]`, so this is a match, never a +/// layout-compatible cast — the FFI `destroy` / shielded-stop adapters keep +/// reading the same logical shape. 
impl From for CoordinatorThreadStatus { fn from(status: dash_async::WorkerStatus) -> Self { use dash_async::WorkerStatus as W; @@ -205,7 +208,7 @@ pub struct CoordinatorExitStatus { pub event_adapter: CoordinatorThreadStatus, /// Aggregate status of any coordinator OS threads that an earlier /// tight `stop()`→`start()` reap had to detach past its 1 s - /// wedge-backstop and park in the manager's [`CoordinatorOrphans`] + /// wedge-backstop and park in the shared [`ThreadRegistry`]'s orphan /// list. /// /// [`Ok`](CoordinatorThreadStatus::Ok) when none were detached (or @@ -280,8 +283,8 @@ impl CoordinatorExitStatus { pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30; /// Grace period (seconds) [`PlatformWalletManager::shutdown`] spends -/// polling any parked [`CoordinatorOrphans`] before declaring a survivor -/// [`Detached`](CoordinatorThreadStatus::Detached). +/// polling any orphans parked in the shared [`ThreadRegistry`] before +/// declaring a survivor [`Detached`](CoordinatorThreadStatus::Detached). /// /// Unlike a live coordinator — whose `quiesce()` may legitimately spend /// seconds draining an in-flight pass, hence the 30 s @@ -507,6 +510,15 @@ impl PlatformWalletManager

{ /// backstop, or its loop ended non-cleanly) → /// [`crate::error::PlatformWalletError::ShieldedShutdownIncomplete`]; or /// - the coordinator's store reset itself fails. + /// + /// **Host-serialization precondition**: the caller must not invoke + /// `shielded_sync_start` for this manager concurrently with `clear`. A + /// concurrent direct `sync_now`/`sync_wallet` is held off (the quiescing + /// gate stays raised across the liveness check and the wipe), but a full + /// restart re-opens that gate as it spawns a fresh loop, so a `start` + /// racing `clear` can still re-persist into the wiped store. The wallet + /// UI drives these from one place; that ordering is the host's contract + /// until the registry grows a per-key clearing latch. #[cfg(feature = "shielded")] pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> { // Quiesce the shielded loop: cancel it, drain any in-flight pass @@ -524,6 +536,14 @@ impl PlatformWalletManager

{ if !status.is_clean() { return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status }); } + // Hold the shielded quiescing gate raised across BOTH the liveness + // check below and the store wipe, so the gate guarding "no new pass" + // does not lapse between check and act: a direct `sync_now` / + // `sync_wallet` that lands here observes the gate and bails instead + // of writing into the store we are about to clear. The guard lowers + // the gate on return (every path), so a later start/sync works. + let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate(); + // [F2 FIX] Also refuse if a prior-generation shielded thread is // still parked alive: it holds an `Arc` to the persister/store and // could re-persist notes into the store we are about to wipe. The diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index a930febdc7..ea0a0566f9 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -316,6 +316,16 @@ impl ShieldedSyncManager { .into() } + /// Raise the `quiescing` gate and hold it raised until the returned + /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate as + /// soon as it returns, this lets a multi-step teardown (Clear) keep new + /// direct `sync_now` / `sync_wallet` passes off across a check-then-wipe + /// so the "no new pass" guarantee does not lapse between the two steps. + pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> { + self.quiescing.store(true, Ordering::Release); + AtomicFlagGuard::new(&self.quiescing) + } + /// Run one sync pass across every registered wallet. 
/// /// `force` is propagated to each wallet's From 911f99f7ce569ebe727c7439bf44fcb9358cc2b9 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 01:03:37 +0200 Subject: [PATCH 22/29] refactor(platform-wallet): extract CoordinatorLifecycle to dedup the three sync coordinators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The migration onto ThreadRegistry left the coordinator-side delegate + pass-gating boilerplate copy-pasted across IdentitySyncManager, PlatformAddressSyncManager, and ShieldedSyncManager: the same five fields (registry, interval, is_syncing, quiescing, last_sync), byte- identical interval/is_syncing/last_sync/drain_hook/stop/quiesce delegations, and — critically — the subtle is_syncing-CAS + quiescing-gate pass preamble reproduced four times. Hoist all of it into one CoordinatorLifecycle helper that each coordinator embeds and delegates to, so the teardown-critical handshake has a single home. begin_pass() folds the CAS + guard + gate check into one RAII-guard- returning call; hold_quiescing_gate() is the shared primitive the shielded Clear flow holds across its check-then-wipe. Each coordinator now keeps only its domain-specific pass body. Behaviour-preserving: the full platform-wallet lib suite (307 tests, both default and shielded) and the coordinator pass-gate tests are unchanged and green. Also fixes the dangling [JoinHandle] intra-doc link in core_bridge (its import was removed in the migration) by fully qualifying it. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/changeset/core_bridge.rs | 6 +- .../src/manager/coordinator_lifecycle.rs | 193 ++++++++++++++++++ .../src/manager/identity_sync.rs | 134 ++++-------- .../rs-platform-wallet/src/manager/mod.rs | 1 + .../src/manager/platform_address_sync.rs | 132 ++++-------- .../src/manager/shielded_sync.rs | 157 +++++--------- 6 files changed, 322 insertions(+), 301 deletions(-) create mode 100644 packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs index 13a177cb47..927cf8d000 100644 --- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs +++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs @@ -21,9 +21,9 @@ //! //! [`wallet_event_adapter_loop`] is the task body. The caller (typically //! `PlatformWalletManager`) registers it on the shared `ThreadRegistry` -//! via `start_task`, which owns its [`JoinHandle`] and cancellation; on -//! shutdown the registry fires the [`CancellationToken`] to make the task -//! exit cleanly and joins it. +//! via `start_task`, which owns its [`JoinHandle`](tokio::task::JoinHandle) +//! and cancellation; on shutdown the registry fires the +//! [`CancellationToken`] to make the task exit cleanly and joins it. use std::sync::Arc; diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs new file mode 100644 index 0000000000..440c67c676 --- /dev/null +++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs @@ -0,0 +1,193 @@ +//! Shared lifecycle state + pass protocol for the periodic sync +//! coordinators. +//! +//! The three coordinators ([`IdentitySyncManager`], [`PlatformAddressSyncManager`], +//! [`ShieldedSyncManager`]) each drive a background loop on the shared +//! 
[`ThreadRegistry`] and gate passes through an `is_syncing` / `quiescing` +//! handshake. That handshake, plus the interval and last-sync bookkeeping, +//! is identical across all three; it lives here so the (subtle, teardown- +//! critical) protocol has a single home and each coordinator keeps only its +//! domain-specific pass body. +//! +//! [`IdentitySyncManager`]: super::identity_sync::IdentitySyncManager +//! [`PlatformAddressSyncManager`]: super::platform_address_sync::PlatformAddressSyncManager +//! [`ShieldedSyncManager`]: super::shielded_sync::ShieldedSyncManager + +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; + +use super::{ + CoordinatorThreadStatus, WalletWorker, COORDINATOR_WEIGHT, SHUTDOWN_JOIN_TIMEOUT_SECS, +}; + +/// Shared lifecycle state and pass-gating protocol for one periodic sync +/// coordinator. Each coordinator embeds one of these and delegates its +/// `start` / `stop` / `quiesce` / `is_running` / interval / pass-gate +/// surface to it. +pub(crate) struct CoordinatorLifecycle { + registry: Arc>, + worker: WalletWorker, + interval_secs: AtomicU64, + is_syncing: AtomicBool, + /// `Arc` so the registry drain hook (a `'static` closure) can capture a + /// clone and raise the gate from inside `quiesce`. + quiescing: Arc, + last_sync_unix: AtomicU64, +} + +impl CoordinatorLifecycle { + pub(crate) fn new( + registry: Arc>, + worker: WalletWorker, + default_interval_secs: u64, + ) -> Self { + Self { + registry, + worker, + interval_secs: AtomicU64::new(default_interval_secs), + is_syncing: AtomicBool::new(false), + quiescing: Arc::new(AtomicBool::new(false)), + last_sync_unix: AtomicU64::new(0), + } + } + + /// The shared worker-lifecycle engine this coordinator's loop runs on. + pub(crate) fn registry(&self) -> &Arc> { + &self.registry + } + + /// This coordinator's registry key. 
+ pub(crate) fn worker(&self) -> WalletWorker { + self.worker + } + + /// Set the polling interval. Clamped to a minimum of 1s. + pub(crate) fn set_interval(&self, interval: Duration) { + let secs = interval.as_secs().max(1); + self.interval_secs.store(secs, Ordering::Release); + } + + /// Current polling interval. + pub(crate) fn interval(&self) -> Duration { + Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + } + + /// Current polling interval in whole seconds (for `Debug`). + pub(crate) fn interval_secs(&self) -> u64 { + self.interval_secs.load(Ordering::Acquire) + } + + /// Whether the background loop is currently running. + pub(crate) fn is_running(&self) -> bool { + self.registry.is_running(self.worker) + } + + /// Whether a sync pass is in flight right now. + pub(crate) fn is_syncing(&self) -> bool { + self.is_syncing.load(Ordering::Acquire) + } + + /// Unix seconds of the last completed pass, or `None` if none has ever + /// completed. + pub(crate) fn last_sync_unix_seconds(&self) -> Option { + match self.last_sync_unix.load(Ordering::Acquire) { + 0 => None, + n => Some(n), + } + } + + /// Record the unix-seconds stamp of a just-completed pass. + pub(crate) fn store_last_sync_unix(&self, unix_secs: u64) { + self.last_sync_unix.store(unix_secs, Ordering::Release); + } + + /// The registry config a coordinator starts its loop with: coordinator + /// teardown weight, the shared join budget, and the `quiescing`-raising + /// drain hook. + pub(crate) fn worker_config(&self) -> WorkerConfig { + WorkerConfig { + weight: COORDINATOR_WEIGHT, + join_budget: Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS), + drain: Some(self.drain_hook()), + } + } + + /// Drain hook handed to the registry: raise the `quiescing` gate so any + /// pass past its `is_syncing` CAS bails. The registry then cancels the + /// loop and joins the thread, so the barrier itself is instant. 
+ fn drain_hook(&self) -> DrainHook { + let quiescing = Arc::clone(&self.quiescing); + Arc::new(move || { + let quiescing = Arc::clone(&quiescing); + Box::pin(async move { + quiescing.store(true, Ordering::Release); + }) + }) + } + + /// Reopen the `quiescing` gate so a (re)start's passes can run; a prior + /// quiesce raised it via the drain hook. + pub(crate) fn reopen_quiescing_gate(&self) { + self.quiescing.store(false, Ordering::Release); + } + + /// Cancel-only stop: signal the loop and return immediately. + pub(crate) fn stop(&self) { + self.registry.cancel(self.worker); + } + + /// Cancel the loop, drain any in-flight pass, and join the worker, + /// returning its terminal status. Reopens the `quiescing` gate on every + /// exit path (the registry's drain hook raised it; reopening is safe + /// because the loop has been cancelled, so no new pass starts). + pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus { + let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); + self.registry.quiesce(self.worker).await.into() + } + + /// Raise the `quiescing` gate and hold it raised until the returned + /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate the + /// instant it returns, this lets a multi-step teardown (e.g. Clear) + /// keep new direct passes off across a check-then-wipe so the "no new + /// pass" guarantee does not lapse between the two steps. In production + /// only the shielded Clear flow needs this today; the coordinator pass- + /// gate tests also exercise it. + #[cfg(any(test, feature = "shielded"))] + pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> { + self.quiescing.store(true, Ordering::Release); + AtomicFlagGuard::new(&self.quiescing) + } + + /// Enter a sync pass. Atomically claims the `is_syncing` slot, then + /// checks the `quiescing` gate. 
Returns the RAII guard that clears + /// `is_syncing` on drop, or `None` when the caller must bail without + /// doing work — because a pass is already in flight, or a teardown has + /// raised the gate. In the gated case the briefly-claimed slot is + /// released before returning (the guard drops), so a later post-quiesce + /// pass can still run. + pub(crate) fn begin_pass(&self) -> Option> { + if self + .is_syncing + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return None; + } + + // RAII guard: clears `is_syncing` on every exit path, including + // panics. Without it a panic inside the pass would leave + // `is_syncing = true` forever and wedge `quiesce`'s drain loop. + let guard = AtomicFlagGuard::new(&self.is_syncing); + + // A `quiesce` may have raised the gate between our CAS and here; if + // so, bail (dropping `guard`, which clears the slot) so the drain + // can complete and teardown gets a true "no further pass" barrier. + if self.quiescing.load(Ordering::Acquire) { + return None; + } + Some(guard) + } +} diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs index 8dfe83eede..165e4f4530 100644 --- a/packages/rs-platform-wallet/src/manager/identity_sync.rs +++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs @@ -47,14 +47,12 @@ //! identities are registered and the SDK is connected. use std::collections::BTreeMap; -use std::sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, -}; +use std::sync::Arc; -use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; +use dash_async::ThreadRegistry; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use super::coordinator_lifecycle::CoordinatorLifecycle; use super::WalletWorker; use dpp::balances::credits::TokenAmount; @@ -161,24 +159,14 @@ where /// over `P` so every `persister.store(...)` call on the hot sync /// loop dispatches statically. persister: Arc

, - /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` / - /// `quiesce` delegate to it under the [`WalletWorker::IdentitySync`] - /// key; it owns the loop's cancel token, OS-thread join handle, the - /// restart reap-or-park, and the orphan list. - registry: Arc>, - interval_secs: AtomicU64, - is_syncing: AtomicBool, - /// Set by [`quiesce`](Self::quiesce) to gate new passes while it - /// drains an in-flight one. `sync_now` bails (after taking the - /// `is_syncing` slot) when this is set, so once `quiesce` observes - /// `is_syncing == false` no further pass can start — giving shutdown - /// a real "no more host-visible persister stores" barrier that + /// Shared lifecycle state + pass-gating protocol under the + /// [`WalletWorker::IdentitySync`] key: the registry handle, polling + /// interval, the `is_syncing` / `quiescing` handshake, and the + /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the + /// `sync_now` pass gate delegate to it. The `quiescing` half gives + /// shutdown a real "no more host-visible persister stores" barrier that /// cancel-only [`stop`](Self::stop) does not provide. - quiescing: AtomicBool, - /// Unix seconds of the last completed pass across all identities. - /// `0` = never. Identity-level timestamps live on the per-identity - /// rows in [`IdentitySyncManager::state`]. - last_sync_unix: AtomicU64, + lifecycle: CoordinatorLifecycle, /// Per-identity registry / cache. Keyed by identity id; each row /// carries the per-(identity, token) token rows plus the /// per-identity last-sync timestamp. 
@@ -209,11 +197,11 @@ where Self { sdk, persister, - registry, - interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), - is_syncing: AtomicBool::new(false), - quiescing: AtomicBool::new(false), - last_sync_unix: AtomicU64::new(0), + lifecycle: CoordinatorLifecycle::new( + registry, + WalletWorker::IdentitySync, + DEFAULT_SYNC_INTERVAL_SECS, + ), state: RwLock::new(BTreeMap::new()), } } @@ -314,47 +302,28 @@ where /// /// The running loop picks this up on its next sleep. pub fn set_interval(&self, interval: Duration) { - let secs = interval.as_secs().max(1); - self.interval_secs.store(secs, Ordering::Release); + self.lifecycle.set_interval(interval); } /// Current polling interval. pub fn interval(&self) -> Duration { - Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + self.lifecycle.interval() } /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.registry.is_running(WalletWorker::IdentitySync) - } - - /// The drain barrier handed to the registry: raise the `quiescing` - /// gate so any pass past its `is_syncing` CAS bails. The registry then - /// cancels the loop and joins the thread (the join waits for the - /// in-flight pass to drop and `is_syncing` to clear), so the barrier - /// itself is instant and never blocks teardown. - fn drain_hook(self: &Arc) -> DrainHook { - let this = Arc::clone(self); - Arc::new(move || { - let this = Arc::clone(&this); - Box::pin(async move { - this.quiescing.store(true, Ordering::Release); - }) - }) + self.lifecycle.is_running() } /// Whether a sync pass is in flight right now. pub fn is_syncing(&self) -> bool { - self.is_syncing.load(Ordering::Acquire) + self.lifecycle.is_syncing() } /// Unix seconds of the last completed pass (across all identities), /// or `None` if no pass has ever completed. 
pub fn last_sync_unix_seconds(&self) -> Option { - match self.last_sync_unix.load(Ordering::Acquire) { - 0 => None, - n => Some(n), - } + self.lifecycle.last_sync_unix_seconds() } /// Per-identity last-sync timestamp. @@ -414,13 +383,9 @@ where pub fn start(self: Arc) { // Reopen the quiescing gate so this (re)start's passes can run; a // prior quiesce raised it via the drain hook. - self.quiescing.store(false, Ordering::Release); + self.lifecycle.reopen_quiescing_gate(); - let cfg = WorkerConfig { - weight: super::COORDINATOR_WEIGHT, - join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS), - drain: Some(self.drain_hook()), - }; + let cfg = self.lifecycle.worker_config(); // The loop drives `!Send` SDK futures via `Handle::block_on` on a // dedicated OS thread (the registry spawns it). The handle is @@ -431,8 +396,9 @@ where // the join lands inside the budget. let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); - self.registry - .start_thread(WalletWorker::IdentitySync, cfg, move |cancel| { + self.lifecycle + .registry() + .start_thread(self.lifecycle.worker(), cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { @@ -463,7 +429,7 @@ where /// by manager shutdown so the host can free the persister context — /// use [`quiesce`](Self::quiesce). pub fn stop(&self) { - self.registry.cancel(WalletWorker::IdentitySync); + self.lifecycle.stop(); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -491,17 +457,7 @@ where /// the `!Send` loop has stopped touching `tokio::time` before a /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { - // RAII gate: reopen `quiescing` on *every* exit path — normal - // return, a dropped future, or a panic. The registry's drain hook - // raises it inside `quiesce` below; without this reset a quiesce - // that doesn't complete would leave the gate latched and silently - // bail every future pass. 
Reopening is safe because the loop has - // been cancelled, so no new pass can start. - let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.registry - .quiesce(WalletWorker::IdentitySync) - .await - .into() + self.lifecycle.quiesce().await } /// Run one sync pass across every registered identity. @@ -515,27 +471,13 @@ where /// `!Send` (no `tokio::spawn`) and because the design brief /// explicitly forbids it. pub async fn sync_now(&self) { - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { + // Claim the pass slot and honour the quiescing gate; bail without + // work (and without a `persister.store(...)` after quiesce returns) + // if a pass is already in flight or a teardown raised the gate. The + // returned guard clears `is_syncing` on every exit path. + let Some(_pass) = self.lifecycle.begin_pass() else { return; - } - - // RAII guard: clears `is_syncing` on every exit path, including - // panics. Without this a panic inside the pass would leave - // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. - let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); - - // A `quiesce()` may have raised the gate between our CAS and - // here; if so, bail without running a pass so the drain can - // complete and shutdown gets a true barrier (no further - // `persister.store(...)` after quiesce returns). - // Guard clears `is_syncing` on return. - if self.quiescing.load(Ordering::Acquire) { - return; - } + }; // Snapshot the per-identity watch list under a short read // lock and release it before any network call. We keep @@ -558,8 +500,8 @@ where .duration_since(UNIX_EPOCH) .map(|d| d.as_secs()) .unwrap_or(0); - self.last_sync_unix.store(now, Ordering::Release); - // `_is_syncing_guard` drops here → `is_syncing = false` + self.lifecycle.store_last_sync_unix(now); + // `_pass` drops here → `is_syncing = false` } /// Sync a single identity's watched tokens against Platform. 
@@ -700,7 +642,7 @@ where f.debug_struct("IdentitySyncManager") .field("is_running", &self.is_running()) .field("is_syncing", &self.is_syncing()) - .field("interval_secs", &self.interval_secs.load(Ordering::Acquire)) + .field("interval_secs", &self.lifecycle.interval_secs()) .field("last_sync_unix", &self.last_sync_unix_seconds()) .finish() } @@ -924,8 +866,8 @@ mod tests { let token_x = Identifier::from([10u8; 32]); mgr.register_identity(id_a, [token_x]).await; - // Raise the gate as `quiesce()` would. - mgr.quiescing.store(true, Ordering::Release); + // Raise the gate as `quiesce()` would, held across the pass. + let _gate = mgr.lifecycle.hold_quiescing_gate(); mgr.sync_now().await; diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 51e14c2524..840dd13c7a 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -1,6 +1,7 @@ //! Multi-wallet manager with SPV coordination. pub mod accessors; +mod coordinator_lifecycle; pub mod identity_sync; mod load; pub mod platform_address_sync; diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs index e68fcfef7c..5cb15b048e 100644 --- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs +++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs @@ -9,18 +9,16 @@ //! wallets are registered and the SPV runtime is up. 
use std::collections::BTreeMap; -use std::sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, -}; +use std::sync::Arc; -use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; +use dash_async::ThreadRegistry; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use arc_swap::ArcSwapOption; use dash_sdk::platform::address_sync::{AddressSyncConfig, AddressSyncResult}; use key_wallet::PlatformP2PKHAddress; +use super::coordinator_lifecycle::CoordinatorLifecycle; use super::WalletWorker; use crate::wallet::PlatformAddressTag; use tokio::sync::RwLock; @@ -97,21 +95,14 @@ impl PlatformAddressSyncSummary { pub struct PlatformAddressSyncManager { wallets: Arc>>>, event_manager: Arc, - /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` / - /// `quiesce` delegate to it under the - /// [`WalletWorker::PlatformAddressSync`] key. - registry: Arc>, - interval_secs: AtomicU64, - is_syncing: AtomicBool, - /// Set by [`quiesce`](Self::quiesce) to gate new passes while it - /// drains an in-flight one. `sync_now` bails (after taking the - /// `is_syncing` slot) when this is set, so once `quiesce` observes - /// `is_syncing == false` no further pass can start — giving shutdown - /// a real "no more host-visible sync-completed callbacks" barrier - /// that cancel-only [`stop`](Self::stop) does not provide. - quiescing: AtomicBool, - /// Unix seconds of the last completed pass. `0` = never. - last_sync_unix: AtomicU64, + /// Shared lifecycle state + pass-gating protocol under the + /// [`WalletWorker::PlatformAddressSync`] key: registry handle, polling + /// interval, the `is_syncing` / `quiescing` handshake, and the + /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the + /// `sync_now` pass gate delegate to it. The `quiescing` half gives + /// shutdown a real "no more host-visible sync-completed callbacks" + /// barrier that cancel-only [`stop`](Self::stop) does not provide. 
+ lifecycle: CoordinatorLifecycle, /// Shared config applied uniformly across wallets and accounts. /// /// `ArcSwapOption` instead of a mutex because writes are rare @@ -129,11 +120,11 @@ impl PlatformAddressSyncManager { Self { wallets, event_manager, - registry, - interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), - is_syncing: AtomicBool::new(false), - quiescing: AtomicBool::new(false), - last_sync_unix: AtomicU64::new(0), + lifecycle: CoordinatorLifecycle::new( + registry, + WalletWorker::PlatformAddressSync, + DEFAULT_SYNC_INTERVAL_SECS, + ), config: ArcSwapOption::empty(), } } @@ -142,13 +133,12 @@ impl PlatformAddressSyncManager { /// /// The running loop picks this up on its next sleep. pub fn set_interval(&self, interval: Duration) { - let secs = interval.as_secs().max(1); - self.interval_secs.store(secs, Ordering::Release); + self.lifecycle.set_interval(interval); } /// Current polling interval. pub fn interval(&self) -> Duration { - Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + self.lifecycle.interval() } /// Replace the shared [`AddressSyncConfig`] used on every pass. @@ -165,36 +155,18 @@ impl PlatformAddressSyncManager { /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.registry.is_running(WalletWorker::PlatformAddressSync) - } - - /// The drain barrier handed to the registry: raise the `quiescing` - /// gate so any pass past its `is_syncing` CAS bails. The registry then - /// cancels the loop and joins the thread (the join waits for the - /// in-flight pass — incl. its completion-event dispatch — to drop and - /// `is_syncing` to clear), so this barrier is instant. - fn drain_hook(self: &Arc) -> DrainHook { - let this = Arc::clone(self); - Arc::new(move || { - let this = Arc::clone(&this); - Box::pin(async move { - this.quiescing.store(true, Ordering::Release); - }) - }) + self.lifecycle.is_running() } /// Whether a sync pass is in flight right now. 
pub fn is_syncing(&self) -> bool { - self.is_syncing.load(Ordering::Acquire) + self.lifecycle.is_syncing() } /// Unix seconds of the last completed pass, or `None` if no pass /// has ever completed. pub fn last_sync_unix_seconds(&self) -> Option { - match self.last_sync_unix.load(Ordering::Acquire) { - 0 => None, - n => Some(n), - } + self.lifecycle.last_sync_unix_seconds() } /// Start the background sync loop. Idempotent — calling while @@ -213,13 +185,9 @@ impl PlatformAddressSyncManager { /// [`interval`](Self::interval). pub fn start(self: Arc) { // Reopen the quiescing gate so this (re)start's passes can run. - self.quiescing.store(false, Ordering::Release); + self.lifecycle.reopen_quiescing_gate(); - let cfg = WorkerConfig { - weight: super::COORDINATOR_WEIGHT, - join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS), - drain: Some(self.drain_hook()), - }; + let cfg = self.lifecycle.worker_config(); // The loop drives `!Send` SDK futures via `Handle::block_on` on a // dedicated OS thread (spawned by the registry). `biased` polls the @@ -227,8 +195,9 @@ impl PlatformAddressSyncManager { // at its `.await` the instant the registry cancels. let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); - self.registry - .start_thread(WalletWorker::PlatformAddressSync, cfg, move |cancel| { + self.lifecycle + .registry() + .start_thread(self.lifecycle.worker(), cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { @@ -260,7 +229,7 @@ impl PlatformAddressSyncManager { /// the host can free the event-handler context — use /// [`quiesce`](Self::quiesce). pub fn stop(&self) { - self.registry.cancel(WalletWorker::PlatformAddressSync); + self.lifecycle.stop(); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -289,14 +258,7 @@ impl PlatformAddressSyncManager { /// the `!Send` loop has stopped touching `tokio::time` before a /// one-shot host drops the runtime. 
pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { - // RAII gate: reopen `quiescing` on every exit path. The registry's - // drain hook raises it inside `quiesce`; reopening on return is - // safe because the loop has been cancelled, so no new pass starts. - let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.registry - .quiesce(WalletWorker::PlatformAddressSync) - .await - .into() + self.lifecycle.quiesce().await } /// Run one sync pass across every registered wallet. @@ -304,27 +266,13 @@ impl PlatformAddressSyncManager { /// If a pass is already in flight, returns an empty summary and /// skips — the caller can inspect [`is_syncing`] to distinguish. pub async fn sync_now(&self) -> PlatformAddressSyncSummary { - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { + // Claim the pass slot and honour the quiescing gate; bail with an + // empty summary (and without a host completion callback after + // quiesce returns) if a pass is already in flight or a teardown + // raised the gate. The guard clears `is_syncing` on every exit path. + let Some(_pass) = self.lifecycle.begin_pass() else { return PlatformAddressSyncSummary::default(); - } - - // RAII guard: clears `is_syncing` on every exit path, including - // panics. Without this a panic inside the pass would leave - // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. - let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); - - // A `quiesce()` may have raised the gate between our CAS and - // here; if so, bail without running a pass so the drain can - // complete and shutdown gets a true barrier (no further - // `on_platform_address_sync_completed` host callback after - // quiesce returns). Guard clears `is_syncing` on return. 
- if self.quiescing.load(Ordering::Acquire) { - return PlatformAddressSyncSummary::default(); - } + }; let snapshot: Vec<(WalletId, Arc)> = { let wallets = self.wallets.read().await; @@ -354,9 +302,9 @@ impl PlatformAddressSyncManager { .map(|d| d.as_secs()) .unwrap_or(0); summary.sync_unix_seconds = now; - self.last_sync_unix.store(now, Ordering::Release); + self.lifecycle.store_last_sync_unix(now); - // Dispatch the completion event BEFORE `_is_syncing_guard` drops. + // Dispatch the completion event BEFORE the `_pass` guard drops. // `quiesce()` drains on the falling edge of `is_syncing`; if the // guard cleared the flag before the dispatch a shutdown caller // could unblock and free the host event-handler context while @@ -367,7 +315,7 @@ impl PlatformAddressSyncManager { .on_platform_address_sync_completed(&summary); summary - // `_is_syncing_guard` drops here → `is_syncing = false` + // `_pass` drops here → `is_syncing = false` } /// Sync a single wallet on demand. Does not set the global @@ -395,7 +343,7 @@ impl std::fmt::Debug for PlatformAddressSyncManager { f.debug_struct("PlatformAddressSyncManager") .field("is_running", &self.is_running()) .field("is_syncing", &self.is_syncing()) - .field("interval_secs", &self.interval_secs.load(Ordering::Acquire)) + .field("interval_secs", &self.lifecycle.interval_secs()) .field("last_sync_unix", &self.last_sync_unix_seconds()) .finish() } @@ -474,8 +422,8 @@ mod tests { async fn sync_now_bails_when_quiescing() { let (mgr, counter) = make_manager(); - // Raise the gate as `quiesce()` would. - mgr.quiescing.store(true, Ordering::Release); + // Raise the gate as `quiesce()` would, held across the pass. 
+ let _gate = mgr.lifecycle.hold_quiescing_gate(); let summary = mgr.sync_now().await; diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index ea0a0566f9..6a66e30ba6 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -26,16 +26,14 @@ //! [`configure_shielded`]: crate::manager::PlatformWalletManager::configure_shielded use std::collections::BTreeMap; -use std::sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, -}; +use std::sync::Arc; -use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig}; +use dash_async::{AtomicFlagGuard, ThreadRegistry}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; +use super::coordinator_lifecycle::CoordinatorLifecycle; use super::WalletWorker; use crate::events::PlatformEventManager; use crate::wallet::platform_wallet::WalletId; @@ -141,21 +139,14 @@ pub struct ShieldedSyncManager { /// run first, so an empty slot guarantees no shielded state /// exists). coordinator_slot: Arc>>>, - /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` / - /// `quiesce` delegate to it under the [`WalletWorker::ShieldedSync`] - /// key. - registry: Arc>, - interval_secs: AtomicU64, - is_syncing: AtomicBool, - /// Set by [`quiesce`](Self::quiesce) to gate new passes while it - /// drains an in-flight one. `sync_now` / `sync_wallet` bail (after - /// taking the `is_syncing` slot) when this is set, so once `quiesce` - /// observes `is_syncing == false` no further pass can start — giving - /// Clear / stop a real "no more host-visible mutations" barrier that - /// cancel-only [`stop`](Self::stop) does not provide. - quiescing: AtomicBool, - /// Unix seconds of the last completed pass. `0` = never. 
- last_sync_unix: AtomicU64, + /// Shared lifecycle state + pass-gating protocol under the + /// [`WalletWorker::ShieldedSync`] key: registry handle, polling + /// interval, the `is_syncing` / `quiescing` handshake, and the + /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the + /// `sync_now` / `sync_wallet` pass gate delegate to it. The `quiescing` + /// half gives Clear / stop a real "no more host-visible mutations" + /// barrier that cancel-only [`stop`](Self::stop) does not provide. + lifecycle: CoordinatorLifecycle, } impl ShieldedSyncManager { @@ -167,11 +158,11 @@ impl ShieldedSyncManager { Self { event_manager, coordinator_slot, - registry, - interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS), - is_syncing: AtomicBool::new(false), - quiescing: AtomicBool::new(false), - last_sync_unix: AtomicU64::new(0), + lifecycle: CoordinatorLifecycle::new( + registry, + WalletWorker::ShieldedSync, + DEFAULT_SYNC_INTERVAL_SECS, + ), } } @@ -179,47 +170,28 @@ impl ShieldedSyncManager { /// /// The running loop picks this up on its next sleep. pub fn set_interval(&self, interval: Duration) { - let secs = interval.as_secs().max(1); - self.interval_secs.store(secs, Ordering::Release); + self.lifecycle.set_interval(interval); } /// Current polling interval. pub fn interval(&self) -> Duration { - Duration::from_secs(self.interval_secs.load(Ordering::Acquire)) + self.lifecycle.interval() } /// Whether the background loop is currently running. pub fn is_running(&self) -> bool { - self.registry.is_running(WalletWorker::ShieldedSync) - } - - /// The drain barrier handed to the registry: raise the `quiescing` - /// gate so any pass past its `is_syncing` CAS bails. The registry then - /// cancels the loop and joins the thread (the join waits for the - /// in-flight pass — incl. its persister fan-out — to drop and - /// `is_syncing` to clear), so this barrier is instant. 
- fn drain_hook(self: &Arc) -> DrainHook { - let this = Arc::clone(self); - Arc::new(move || { - let this = Arc::clone(&this); - Box::pin(async move { - this.quiescing.store(true, Ordering::Release); - }) - }) + self.lifecycle.is_running() } /// Whether a sync pass is in flight right now. pub fn is_syncing(&self) -> bool { - self.is_syncing.load(Ordering::Acquire) + self.lifecycle.is_syncing() } /// Unix seconds of the last completed pass, or `None` if no pass /// has ever completed. pub fn last_sync_unix_seconds(&self) -> Option { - match self.last_sync_unix.load(Ordering::Acquire) { - 0 => None, - n => Some(n), - } + self.lifecycle.last_sync_unix_seconds() } /// Start the background sync loop. Idempotent — calling while @@ -231,13 +203,9 @@ impl ShieldedSyncManager { /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start). pub fn start(self: Arc) { // Reopen the quiescing gate so this (re)start's passes can run. - self.quiescing.store(false, Ordering::Release); + self.lifecycle.reopen_quiescing_gate(); - let cfg = WorkerConfig { - weight: super::COORDINATOR_WEIGHT, - join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS), - drain: Some(self.drain_hook()), - }; + let cfg = self.lifecycle.worker_config(); // The loop drives `!Send` SDK futures via `Handle::block_on` on a // dedicated OS thread (spawned by the registry). The background @@ -247,8 +215,9 @@ impl ShieldedSyncManager { // SDK fetch is dropped the instant the registry cancels. 
let handle = tokio::runtime::Handle::current(); let this = Arc::clone(&self); - self.registry - .start_thread(WalletWorker::ShieldedSync, cfg, move |cancel| { + self.lifecycle + .registry() + .start_thread(self.lifecycle.worker(), cfg, move |cancel| { handle.block_on(async move { loop { if cancel.is_cancelled() { @@ -279,7 +248,7 @@ impl ShieldedSyncManager { /// nothing more will be persisted" barrier — required by Clear, /// unregister, and rebind — use [`quiesce`](Self::quiesce). pub fn stop(&self) { - self.registry.cancel(WalletWorker::ShieldedSync); + self.lifecycle.stop(); } /// Cancel the background loop **and wait for any in-flight sync pass @@ -306,14 +275,7 @@ impl ShieldedSyncManager { /// the `!Send` loop has stopped touching `tokio::time` before a /// one-shot host drops the runtime. pub async fn quiesce(&self) -> super::CoordinatorThreadStatus { - // RAII gate: reopen `quiescing` on every exit path. The registry's - // drain hook raises it inside `quiesce`; reopening on return is - // safe because the loop has been cancelled, so no new pass starts. - let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.registry - .quiesce(WalletWorker::ShieldedSync) - .await - .into() + self.lifecycle.quiesce().await } /// Raise the `quiescing` gate and hold it raised until the returned @@ -322,8 +284,7 @@ impl ShieldedSyncManager { /// direct `sync_now` / `sync_wallet` passes off across a check-then-wipe /// so the "no new pass" guarantee does not lapse between the two steps. pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> { - self.quiescing.store(true, Ordering::Release); - AtomicFlagGuard::new(&self.quiescing) + self.lifecycle.hold_quiescing_gate() } /// Run one sync pass across every registered wallet. @@ -338,25 +299,13 @@ impl ShieldedSyncManager { /// If a pass is already in flight, returns an empty summary and /// skips — the caller can inspect [`is_syncing`] to distinguish. 
pub async fn sync_now(&self, force: bool) -> ShieldedSyncPassSummary { - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { - return ShieldedSyncPassSummary::default(); - } - - // RAII guard: clears `is_syncing` on every exit path, including - // panics. Without this a panic inside the pass would leave - // `is_syncing=true` forever and wedge `quiesce()`'s drain loop. - let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); - - // A `quiesce()` may have raised the gate between our CAS and - // here; bail so the drain can complete and Clear/stop get a - // true barrier. Guard clears `is_syncing` on return. - if self.quiescing.load(Ordering::Acquire) { + // Claim the pass slot and honour the quiescing gate; bail with an + // empty summary if a pass is already in flight or a teardown + // (Clear/stop) raised the gate. The guard clears `is_syncing` on + // every exit path. + let Some(_pass) = self.lifecycle.begin_pass() else { return ShieldedSyncPassSummary::default(); - } + }; // Snapshot the coordinator Arc and release the slot lock // before awaiting so a concurrent `configure_shielded` @@ -388,10 +337,10 @@ impl ShieldedSyncManager { if summary.sync_unix_seconds == 0 { summary.sync_unix_seconds = now; } - self.last_sync_unix - .store(summary.sync_unix_seconds, Ordering::Release); + self.lifecycle + .store_last_sync_unix(summary.sync_unix_seconds); - // Dispatch the completion event BEFORE `_is_syncing_guard` drops. + // Dispatch the completion event BEFORE the `_pass` guard drops. 
// `quiesce()` drains on the falling edge of `is_syncing`; if // the guard cleared the flag before the dispatch a stop/clear // caller could unblock while the callback is still pending — @@ -399,7 +348,7 @@ impl ShieldedSyncManager { self.event_manager.on_shielded_sync_completed(&summary); summary - // `_is_syncing_guard` drops here → `is_syncing = false` + // `_pass` drops here → `is_syncing = false` } /// Sync a single wallet on demand. @@ -430,26 +379,14 @@ impl ShieldedSyncManager { }; // Reuse the manager-wide `is_syncing` flag so a per-wallet - // `sync_wallet()` can't race the periodic `sync_now()` - // against the same store — both go through - // `coordinator.sync()`, which serializes per-coordinator - // but the manager flag is what the host UI watches. - if self - .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) - .is_err() - { + // `sync_wallet()` can't race the periodic `sync_now()` against the + // same store — both go through `coordinator.sync()`, which + // serializes per-coordinator, but the manager flag is what the host + // UI watches. Bail (Ok(None)) if a pass is already in flight or a + // teardown raised the quiescing gate. + let Some(_pass) = self.lifecycle.begin_pass() else { return Ok(None); - } - - // RAII guard clears `is_syncing` on every exit path including panics. - let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing); - - // Bail if a `quiesce()` raised the gate after our CAS (see - // `sync_now`) so the drain barrier holds. 
- if self.quiescing.load(Ordering::Acquire) { - return Ok(None); - } + }; let pass = coordinator.sync(force).await; @@ -476,7 +413,7 @@ impl std::fmt::Debug for ShieldedSyncManager { f.debug_struct("ShieldedSyncManager") .field("is_running", &self.is_running()) .field("is_syncing", &self.is_syncing()) - .field("interval_secs", &self.interval_secs.load(Ordering::Acquire)) + .field("interval_secs", &self.lifecycle.interval_secs()) .field("last_sync_unix", &self.last_sync_unix_seconds()) .finish() } From 22647a7fde77c5ed0c71d58d70971ecc96e3c801 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:34:47 +0200 Subject: [PATCH 23/29] fix(platform-wallet): raise quiescing gate in CoordinatorLifecycle::quiesce regardless of a running loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit registry.quiesce early-returns NotRunning WITHOUT running the DrainHook when no background-loop slot is registered, so quiesce left the quiescing gate down and never drained an in-flight pass. A concurrent direct sync_now/sync_wallet that had already cleared begin_pass() was therefore not held off, breaking the clear_shielded/stop contract. Fix: quiesce now raises quiescing itself (gate-before-cancel preserved; the AtomicFlagGuard still reopens it on return), then — after the registry's bounded cancel+join — drains is_syncing when the status is clean. Draining only on a clean status keeps a wedged loop pass (reported Timeout, its thread orphaned) from reintroducing the shutdown stall the bounded join exists to prevent, while still covering the no-loop and idle-loop+direct-pass cases. TDD: new test fails against the pre-fix delegating quiesce. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/manager/coordinator_lifecycle.rs | 135 +++++++++++++++++- 1 file changed, 132 insertions(+), 3 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs index 440c67c676..84a6c02ed9 100644 --- a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs +++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs @@ -141,11 +141,57 @@ impl CoordinatorLifecycle { /// Cancel the loop, drain any in-flight pass, and join the worker, /// returning its terminal status. Reopens the `quiescing` gate on every - /// exit path (the registry's drain hook raised it; reopening is safe - /// because the loop has been cancelled, so no new pass starts). + /// exit path (the gate is reset by the guard; reopening is safe because + /// the loop has been cancelled, so no new pass starts). + /// + /// The gate is raised **here**, not left to the registry's drain hook: + /// `registry.quiesce` early-returns `NotRunning` without running the + /// hook when no background-loop slot is registered, so a coordinator + /// with only direct `sync_now`/`sync_wallet` traffic (no running loop) + /// would never see the gate go up — and a direct pass landing + /// concurrently would slip past the barrier `clear_shielded`/`stop` + /// promise. Raising it ourselves makes the "no new pass" gate hold + /// regardless of whether a loop is registered, and preserves + /// gate-before-cancel: it is up before `registry.quiesce` issues any + /// cancel. pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus { + // Gate up first (instant) and held until the guard drops on return. 
+ self.quiescing.store(true, Ordering::Release); let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); - self.registry.quiesce(self.worker).await.into() + + // Cancel + bounded join of the background loop (if any). A wedged + // loop pass surfaces here as a non-clean `Timeout` rather than + // hanging — its orphaned thread is tracked by the registry for + // teardown, so we must not wait on it below. + let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into(); + + // Drain a *direct* in-flight pass the registry could not: with no + // loop slot, `registry.quiesce` returned `NotRunning` without + // joining anything; with an idle loop it joined a thread that was + // not the one holding `is_syncing`. Either way a `sync_now`/ + // `sync_wallet` that entered before the gate rose may still be in + // flight. The gate keeps a new pass from starting, so this + // converges, and a panicked pass clears the flag via its own RAII + // guard. Only drain on a clean status: a non-clean one means a + // wedged loop pass is the `is_syncing` holder (its thread was + // orphaned, not joined), and waiting on it would reintroduce the + // shutdown stall the registry's bounded join exists to prevent. + if status.is_clean() { + self.drain_in_flight_pass().await; + } + + status + } + + /// Poll until no sync pass holds `is_syncing`. Only sound to call with + /// the `quiescing` gate already raised (so no new pass can start) and + /// after the background loop has been cancel-joined (so the only + /// possible holder is a direct, non-cancellable pass running to + /// completion). Mirrors the registry's 5ms poll cadence. 
+ async fn drain_in_flight_pass(&self) { + while self.is_syncing.load(Ordering::Acquire) { + tokio::time::sleep(Duration::from_millis(5)).await; + } } /// Raise the `quiescing` gate and hold it raised until the returned @@ -191,3 +237,86 @@ impl CoordinatorLifecycle { Some(guard) } } + +#[cfg(test)] +mod tests { + use super::*; + use tokio::sync::oneshot; + + fn make_lifecycle() -> Arc { + let registry = ThreadRegistry::::new(); + Arc::new(CoordinatorLifecycle::new( + registry, + WalletWorker::IdentitySync, + 60, + )) + } + + /// With NO background loop registered, `quiesce` must still raise the + /// `quiescing` gate — so a concurrent direct `sync_now`/`sync_wallet` + /// that lands after it bails — and drain an already-in-flight direct + /// pass before returning. The registry's drain hook cannot cover this: + /// `registry.quiesce` early-returns `NotRunning` WITHOUT running the + /// hook when no loop slot exists, so the gate would otherwise never go + /// up and the in-flight pass would not be drained. Regression for the + /// `clear_shielded`/`stop` contract ("a concurrent direct + /// sync_now/sync_wallet is held off"). Must fail against the pre-fix + /// `quiesce` that only delegated to the registry. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn quiesce_raises_gate_and_drains_direct_pass_without_background_loop() { + let lifecycle = make_lifecycle(); + assert!( + !lifecycle.is_running(), + "precondition: no background loop registered" + ); + + // A direct sync_now/sync_wallet pass already past `begin_pass`, held + // in flight on a task until we release it. 
+ let (ready_tx, ready_rx) = oneshot::channel::<()>(); + let (release_tx, release_rx) = oneshot::channel::<()>(); + let lc_pass = Arc::clone(&lifecycle); + let pass_task = tokio::spawn(async move { + let _pass = lc_pass.begin_pass().expect("first pass enters the slot"); + ready_tx.send(()).expect("signal in-flight"); + release_rx.await.expect("await release"); + // `_pass` drops here → is_syncing = false + }); + + ready_rx.await.expect("pass reached in-flight"); + assert!(lifecycle.is_syncing(), "direct pass holds is_syncing"); + + // Drive `quiesce` concurrently: it must raise the gate, then block + // draining the in-flight pass. + let lc_q = Arc::clone(&lifecycle); + let quiesce_task = tokio::spawn(async move { lc_q.quiesce().await }); + + // Give `quiesce` time to raise the gate and enter the drain. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + lifecycle.quiescing.load(Ordering::Acquire), + "quiesce must raise the gate even with no background loop registered" + ); + assert!( + lifecycle.is_syncing(), + "in-flight direct pass still held; quiesce has not skipped the drain" + ); + assert!( + !quiesce_task.is_finished(), + "quiesce must block until the in-flight pass drains" + ); + + // Release the pass; `quiesce` drains `is_syncing`, then returns. 
+ release_tx.send(()).expect("release the pass"); + let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task) + .await + .expect("quiesce completes once the pass drains") + .expect("quiesce task joined"); + assert_eq!(status, CoordinatorThreadStatus::NotRunning); + assert!( + !lifecycle.is_syncing(), + "is_syncing was drained before quiesce returned" + ); + + pass_task.await.expect("pass task joined"); + } +} From 7f3aeb59f8e90085ed5f1921a60e8dc6b2664fd4 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:34:56 +0200 Subject: [PATCH 24/29] fix(dash-async): park a restarted worker's prior under the slot lock so shutdown can't miss it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit start_thread took the prior handle out of the slot and ran reap_prior_or_park (a ~1s backstop spin) OUT of the slot lock. A concurrent shutdown() could take the lock, latch closing, snapshot tiers seeing only the NEW handle, release, and reap an EMPTY orphan list — reporting clean while the wedged prior was still live and un-joined. Fix: park the prior into orphans UNDER the slot lock (park_prior_locked), making take-prior + park-prior atomic from shutdown's under-lock view; the bounded join stays out of the lock (reap_parked_prior, which finds the prior by ThreadId, removes+joins it when finished, or leaves a genuine wedge parked). start_task parks under the lock too. This introduces the module's only slots->orphans nesting; it is deadlock-free since no path takes slots while holding orphans. TDD: long-backstop test asserts the prior is parked before the spin could elapse; fails pre-fix. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-dash-async/src/registry.rs | 221 ++++++++++++++++++++----- 1 file changed, 179 insertions(+), 42 deletions(-) diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs index 982dd6b57c..7f103ffcae 100644 --- a/packages/rs-dash-async/src/registry.rs +++ b/packages/rs-dash-async/src/registry.rs @@ -339,7 +339,7 @@ impl ThreadRegistry { F: FnOnce(CancellationToken) + Send + 'static, { Self::assert_multi_thread("start_thread"); - let prior = { + let prior_tid = { let mut slots = self.lock_slots(); // One-way teardown latch: refuse new workers once shutdown has // begun, under the same lock shutdown snapshots tiers with. @@ -378,12 +378,27 @@ impl ThreadRegistry { body(body_token); }) { Ok(join) => { - // Store the handle while still under the slot lock; the - // guard is released at the end of this block, BEFORE the - // reap below (R1: store handle -> drop guard -> THEN - // reap-or-park). + // Store the new handle, then park the prior into orphans + // — both while still under THIS slot lock (R1: store + // handle -> park prior -> drop guard -> THEN bounded + // reap below). slot.handle = Some(WorkerHandle::OsThread(join)); - prior + // [F3 FIX] Park the prior UNDER the slot lock, before + // releasing it. `shutdown` latches `closing` and + // snapshots tiers under this same lock; parking here + // means the take-prior + park-prior is atomic from its + // view, so it can never observe the new slot without + // also seeing the prior accounted in orphans. (The old + // out-of-lock reap left a window: the prior was moved out + // of the slot but not yet parked, so a shutdown + // snapshotting in that gap reaped an empty orphan list + // and reported clean while a wedged prior was still + // live.) The bounded join stays OUT of the lock — + // `reap_parked_prior` below. 
The `slots`->`orphans` + // nesting this introduces is the only such nesting in the + // module and is deadlock-free: no path acquires `slots` + // while holding `orphans`. + self.park_prior_locked(key, prior) } Err(e) => { // Spawn failed (e.g. EAGAIN at the OS thread ceiling). @@ -391,7 +406,8 @@ impl ThreadRegistry { // the slot is not left wedged "running": re-install // prior, clear the running flag. `generation` stays // bumped (it is only ever monotonic), which is harmless - // — the next start reaps the re-installed prior. + // — the next start reaps the re-installed prior. Nothing + // was parked, so there is no prior to reap below. tracing::error!( ?key, error = %e, @@ -407,10 +423,11 @@ impl ThreadRegistry { // The prior thread was cancellation-signalled by a preceding // cancel(); with the slot lock released its epilogue completes - // promptly and the join lands in milliseconds. The backstop fires - // only on a genuine wedge, in which case the still-live handle is + // promptly and the join lands in milliseconds — `reap_parked_prior` + // then removes it from orphans and joins it. The backstop fires only + // on a genuine wedge, in which case the still-live handle is left // parked (not dropped) so teardown can account for it. - self.reap_prior_or_park(prior, key); + self.reap_parked_prior(key, prior_tid); } /// Start a tokio-task worker for `Send` futures. Same restart-reap @@ -439,7 +456,7 @@ impl ThreadRegistry { F: FnOnce(CancellationToken) -> Fut + Send + 'static, Fut: Future + Send + 'static, { - let prior = { + { let mut slots = self.lock_slots(); // One-way teardown latch — see `start_thread`. 
if self.closing.load(Ordering::Acquire) { @@ -468,9 +485,17 @@ impl ThreadRegistry { body(body_token).await; }); slot.handle = Some(WorkerHandle::Task(join)); - prior - }; - self.reap_prior_or_park(prior, key); + // [F3 FIX] Park the prior UNDER this slot lock, same rationale as + // `start_thread`: it keeps `shutdown`'s under-lock tier snapshot + // from ever missing the prior. A task cannot be joined + // synchronously, so there is no bounded reap here — a live prior + // is parked for the async orphan reap (`reap_orphans` / + // `shutdown`) and a finished one is dropped. The returned thread + // id is unused: a task prior has none, and a (mixed-usage) + // OS-thread prior is likewise left to the async reap rather than + // spun on synchronously from this (possibly async) caller. + let _ = self.park_prior_locked(key, prior); + } } /// Whether a worker is currently registered and running for `key`. @@ -731,43 +756,86 @@ impl ThreadRegistry { .spawn(closure) } - /// Reap a restarted key's prior worker — or park it if it is genuinely - /// wedged past the reap backstop. Must be called with no registry lock - /// held (it spins synchronously for an OS thread). - fn reap_prior_or_park(&self, prior: Option, key: K) { - let Some(handle) = prior else { + /// Park a restarted key's prior handle into orphans. **Must be called + /// while the slot lock is held** — the resulting `slots`->`orphans` + /// nesting is the only such nesting in this module and is deadlock-free + /// (no path ever acquires `slots` while holding `orphans`, so there is no + /// cycle). Parking the prior here, rather than after the slot lock is + /// released, is what lets `shutdown`'s under-lock tier snapshot never + /// miss it: the take-prior and the park-prior are then atomic from + /// `shutdown`'s view. A finished task is dropped (detaching a finished + /// task is a no-op); a live task and any OS thread are parked. 
Returns + /// the parked OS thread's id so [`reap_parked_prior`](Self::reap_parked_prior) + /// can find and bounded-join it; tasks (reaped asynchronously) return + /// `None`. + fn park_prior_locked( + &self, + key: K, + prior: Option, + ) -> Option { + match prior { + Some(WorkerHandle::OsThread(h)) => { + let tid = h.thread().id(); + self.lock_orphans().push((key, WorkerHandle::OsThread(h))); + Some(tid) + } + Some(task) => { + if !task.is_finished() { + self.lock_orphans().push((key, task)); + } + None + } + None => None, + } + } + + /// Bounded reap of an OS-thread prior that [`park_prior_locked`](Self::park_prior_locked) + /// parked under `key` at restart. Must be called with no registry lock + /// held (it spins synchronously). The instant the parked thread finishes + /// it is removed from orphans and joined — the join itself stays OUT of + /// any lock (only the bookkeeping is taken under the orphans lock). A + /// genuine wedge past the reap backstop is left parked, so teardown can + /// still account for it. No-op when no OS thread was parked (`None`), or + /// when the orphan was already taken by a concurrent reaper / `shutdown` + /// (which then owns the join). + fn reap_parked_prior(&self, key: K, prior_tid: Option) { + let Some(tid) = prior_tid else { return; }; - match handle { - WorkerHandle::OsThread(h) => { - let deadline = Instant::now() + self.reap_backstop; - loop { - if h.is_finished() { - let _ = h.join(); - return; - } - if Instant::now() >= deadline { + let deadline = Instant::now() + self.reap_backstop; + loop { + // Bookkeeping under the orphans lock only: locate our parked + // prior by thread id and, once it has finished, take it out to + // join after the lock is released. Never hold the lock across the + // join. 
+ let taken = { + let mut orphans = self.lock_orphans(); + let pos = orphans.iter().position(|(k, h)| { + *k == key && matches!(h, WorkerHandle::OsThread(t) if t.thread().id() == tid) + }); + match pos { + // Already taken by a concurrent reaper / shutdown: it owns + // the join now. + None => return, + Some(i) if orphans[i].1.is_finished() => Some(orphans.remove(i).1), + Some(_) if Instant::now() >= deadline => { tracing::warn!( ?key, backstop = ?self.reap_backstop, "prior worker thread did not finish within the reap \ - backstop after cancellation; parking it as an orphan \ - for teardown to join rather than detaching it" + backstop after cancellation; leaving it parked as an \ + orphan for teardown to join rather than detaching it" ); - self.lock_orphans().push((key, WorkerHandle::OsThread(h))); return; } - std::thread::sleep(Duration::from_millis(5)); - } - } - // A task can't be joined synchronously here; park a still-live - // one for async reap. A finished one is dropped (detaching a - // finished task is a no-op). - task => { - if !task.is_finished() { - self.lock_orphans().push((key, task)); + Some(_) => None, } + }; + if let Some(WorkerHandle::OsThread(h)) = taken { + let _ = h.join(); + return; } + std::thread::sleep(Duration::from_millis(5)); } } @@ -850,8 +918,12 @@ struct Repark<'a, K: RegistryKey> { impl Drop for Repark<'_, K> { fn drop(&mut self) { // Take the handle under the slot lock, release it, then push to - // orphans — never nest the two locks. Skip if a restart superseded - // our generation (the handle is the new worker's, not ours). + // orphans. This path holds only one lock at a time; the single + // sanctioned nesting in the module is `slots`->`orphans` in + // `park_prior_locked`, and nothing ever takes `slots` while holding + // `orphans`, so the ordering stays acyclic. Skip if a restart + // superseded our generation (the handle is the new worker's, not + // ours). 
let handle = self .reg .lock_slots() @@ -1664,4 +1736,69 @@ mod tests { ); assert!(!reg.any_alive(), "nothing started post-shutdown"); } + + /// [F3 FIX] `start_thread` must park a restarted key's still-wedged prior + /// into the orphan list UNDER the slot lock — at the START of the + /// restart, not only after the out-of-lock reap backstop elapses. + /// Otherwise a `shutdown()` that snapshots tiers in the window between + /// "prior taken out of the slot" and "prior parked" sees neither the + /// prior (already moved out of the slot) nor an orphan, and reports + /// clean while the wedged prior is still live and un-joined. + /// + /// Deterministic via a long backstop: with the fix the prior is + /// observable in orphans well before the backstop could elapse; the + /// pre-fix code parks it only at the end of the out-of-lock spin, so the + /// early assertion fails. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn start_thread_parks_wedged_prior_under_slot_lock_at_restart() { + // Long backstop so the under-lock parking is observable well before + // it could possibly elapse. + let reg = ThreadRegistry::with_reap_backstop(Duration::from_secs(10)); + let (release_tx, release_rx) = mpsc::channel::<()>(); + + // gen-1: wedged (ignores cancel), stays live until released. + reg.start_thread("k", WorkerConfig::default(), wedged_body(release_rx)); + reg.cancel("k"); + + // gen-2 restart on a blocking thread: its bounded reap of the wedged + // gen-1 spins the (long) backstop, so start_thread does not return + // promptly. The fix parks gen-1 under the slot lock at the start of + // this call, before that spin. 
+ let reg2 = Arc::clone(&reg); + let parent = Handle::current(); + let restart = tokio::task::spawn_blocking(move || { + let handle = parent.clone(); + reg2.start_thread("k", WorkerConfig::default(), move |cancel| { + handle.block_on(async move { cancel.cancelled().await }); + }); + }); + + // The wedged prior must appear in orphans far sooner than the 10s + // backstop — it was parked under the slot lock at restart. + let mut waited = Duration::ZERO; + while orphan_len(&reg) == 0 && waited < Duration::from_secs(2) { + tokio::time::sleep(Duration::from_millis(10)).await; + waited += Duration::from_millis(10); + } + assert_eq!( + orphan_len(&reg), + 1, + "wedged prior must be parked under the slot lock at restart, not \ + only after the backstop spin" + ); + assert!(reg.is_running("k"), "gen-2 installed under the same lock"); + + // Release the wedged prior: the restart's bounded reap then finds it + // finished, removes it from orphans, and joins it. + release_tx.send(()).unwrap(); + restart.await.unwrap(); + assert_eq!( + orphan_len(&reg), + 0, + "finished prior removed from orphans by the bounded reap" + ); + + // gen-2 quiesces cleanly. + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + } } From 41791c06c512821e9fbd5354e064a165e76bc880 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:35:05 +0200 Subject: [PATCH 25/29] fix(platform-wallet-ffi): gate shielded_sync_stop success on orphan liveness, like clear_shielded/destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shielded_sync_stop returned Success on status.is_clean() alone, ignoring a prior-generation shielded thread still parked alive as an orphan — asymmetric with clear_shielded/destroy and a misleading contract (the orphan still holds the host callback context). No live UAF today since Swift always does stop->destroy, but Success should imply no live shielded worker/orphan.
Add manager::shielded_worker_alive() (the same shielded-scoped any_alive_for gate clear_shielded consults) and have shielded_sync_stop return ErrorShutdownIncomplete when a parked orphan survives a clean drain. FFI ABI unchanged (same return-code semantics); docstring updated so Success accurately implies no live shielded worker/orphan. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/shielded_sync.rs | 45 ++++++++++++++----- .../src/manager/accessors.rs | 19 ++++++++ 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs index 14082628e4..493f84aa0f 100644 --- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs +++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs @@ -74,13 +74,20 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_start( /// note/sync-state row can be written after this returns) and its /// completion-event *dispatch* on the Rust side has run. /// -/// Returns `ErrorShutdownIncomplete` instead of `Success` when that drain -/// did **not** complete cleanly (the in-flight pass timed out on the join -/// backstop, or the loop ended non-cleanly). The terminal coordinator -/// status is rendered into the result message. On this code the host must -/// **not** free the callback context immediately — a lingering pass may -/// still fire one final callback through it (symmetric with -/// `platform_wallet_manager_destroy`). 
+/// Returns `ErrorShutdownIncomplete` instead of `Success` in either of two +/// cases, so `Success` accurately implies **no live shielded worker or +/// orphan remains**: +/// - the drain did not complete cleanly (the in-flight pass timed out on the +/// join backstop, or the loop ended non-cleanly); or +/// - the drain was clean but a prior-generation shielded thread is still +/// parked alive as an orphan (a tight `stop()`->`start()` reap detached it +/// past the wedge backstop). +/// +/// The terminal coordinator status is rendered into the result message. On +/// this code the host must **not** free the callback context immediately — a +/// lingering pass or parked orphan may still fire one final callback through +/// it (symmetric with `platform_wallet_manager_destroy` and the shielded +/// Clear flow). /// /// Caveat on host-observed events: a host that marshals the completion /// callback onto its own executor (e.g. the Swift trampoline hops it to @@ -96,7 +103,7 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop( handle: Handle, ) -> PlatformWalletFFIResult { let option = PLATFORM_WALLET_MANAGER_STORAGE.with_item(handle, |manager| { - runtime().block_on(async { + let status = runtime().block_on(async { // Bound the quiesce with the same backstop `shutdown()` uses so // a stalled in-flight pass can't hang the host's stop call // forever. Cancellation makes the drain prompt; this only @@ -113,9 +120,14 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop( Ok(status) => status, Err(_elapsed) => platform_wallet::CoordinatorThreadStatus::Timeout, } - }) + }); + // Capture orphan liveness while we still hold the manager: a clean + // quiesce drains the live slot but not a prior-generation thread + // parked as an orphan. 
+ let shielded_alive = manager.shielded_worker_alive(); + (status, shielded_alive) }); - let status = unwrap_option_or_return!(option); + let (status, shielded_alive) = unwrap_option_or_return!(option); // Symmetric with `platform_wallet_manager_destroy`: a non-clean drain // means the shielded loop may still hold a reference to the host-owned // event-handler / persister context and could fire one final callback, @@ -130,6 +142,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop( ), ); } + // Even on a clean drain, a parked prior-generation shielded thread may + // still be alive and holding the host's callback context — mirror + // `clear_shielded` / `destroy` and refuse the clean return so the host + // does not free that context out from under a lingering orphan. + if shielded_alive { + return PlatformWalletFFIResult::err( + PlatformWalletFFIResultCode::ErrorShutdownIncomplete, + "shielded sync stop drained cleanly but a prior-generation shielded \ + worker is still parked alive; host must not free the callback \ + context immediately" + .to_string(), + ); + } PlatformWalletFFIResult::ok() } diff --git a/packages/rs-platform-wallet/src/manager/accessors.rs b/packages/rs-platform-wallet/src/manager/accessors.rs index 7bf901bccf..4ef045f906 100644 --- a/packages/rs-platform-wallet/src/manager/accessors.rs +++ b/packages/rs-platform-wallet/src/manager/accessors.rs @@ -299,6 +299,25 @@ impl PlatformWalletManager

{ Arc::clone(&self.shielded_sync_manager) } + /// Whether a shielded-sync worker is still alive — either its live + /// registry slot or a prior-generation thread parked as an orphan after + /// a tight `stop()`->`start()` reap had to detach it past the wedge + /// backstop. Such an orphan still holds an `Arc` to the persister / + /// event-handler context and may fire one final callback, so a clean + /// [`quiesce`](ShieldedSyncManager::quiesce) status alone does not prove + /// the shielded worker is gone. + /// + /// This is the same shielded-scoped liveness gate + /// [`clear_shielded`](Self::clear_shielded) consults; it is exposed so + /// the FFI `shielded_sync_stop` can refuse a misleading clean return + /// while a parked orphan lingers (symmetric with `clear_shielded` / + /// `destroy`). + #[cfg(feature = "shielded")] + pub fn shielded_worker_alive(&self) -> bool { + self.registry + .any_alive_for(super::WalletWorker::ShieldedSync) + } + /// Get a clone of a wallet by its ID. pub async fn get_wallet(&self, wallet_id: &WalletId) -> Option> { let wallets = self.wallets.read().await; From 4b099a92dcd8a5a339b7e926a6e4b04008c107eb Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:18:57 +0200 Subject: [PATCH 26/29] fix(platform-wallet): bound clear_shielded's drain and hold its quiescing gate continuously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEC-001: clear_shielded's in-flight-pass drain was unbounded and the FFI clear bridge a bare block_on, so a heavy direct pass could hang the host (ANR). Bound it with a SHUTDOWN_JOIN_TIMEOUT_SECS timeout (mirroring shielded_sync_stop); on timeout the clear reports Timeout and aborts BEFORE the wipe, leaving the store intact. Split out clear_shielded_inner(drain_timeout) so the timeout path is testable without the 30s budget. 
SEC-002/RUST-002: the gate lapsed between quiesce() returning (its RAII guard lowers the shared flag) and the post-drain re-raise, letting a direct sync_now/sync_wallet slip past any_alive_for and re-persist into the wiped store. Fix: raise+HOLD the gate via clear's own guard BEFORE draining, and drain via a new gate-neutral quiesce_under_held_gate (extracted cancel_join_and_drain shared with quiesce, which stays byte-identical — Fix-1 invariant untouched). The gate now stays raised continuously across drain, liveness check, and wipe; doc softened to note the only residual is a full start() racing clear (per-key-latch follow-up). PROJ-004: clear now calls shielded_worker_alive() instead of re-inlining any_alive_for. Also clarifies the quiesce doc that only shielded gates sync_wallet (platform-address's is intentionally ungated). TDD: both SEC tests proven non-vacuous (revert->fail->restore). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/manager/coordinator_lifecycle.rs | 123 ++++++++++++---- .../rs-platform-wallet/src/manager/mod.rs | 131 ++++++++++++++---- .../src/manager/shielded_sync.rs | 19 +++ 3 files changed, 222 insertions(+), 51 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs index 84a6c02ed9..ebcc73419c 100644 --- a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs +++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs @@ -147,39 +147,67 @@ impl CoordinatorLifecycle { /// The gate is raised **here**, not left to the registry's drain hook: /// `registry.quiesce` early-returns `NotRunning` without running the /// hook when no background-loop slot is registered, so a coordinator - /// with only direct `sync_now`/`sync_wallet` traffic (no running loop) - /// would never see the gate go up — and a direct pass landing - /// concurrently would 
slip past the barrier `clear_shielded`/`stop` - /// promise. Raising it ourselves makes the "no new pass" gate hold - /// regardless of whether a loop is registered, and preserves - /// gate-before-cancel: it is up before `registry.quiesce` issues any - /// cancel. + /// with only direct pass traffic (no running loop) would never see the + /// gate go up — and a direct pass landing concurrently would slip past + /// the barrier `clear_shielded`/`stop` promise. Raising it ourselves + /// makes the "no new pass" gate hold regardless of whether a loop is + /// registered, and preserves gate-before-cancel: it is up before + /// `registry.quiesce` issues any cancel. + /// + /// "Direct pass" here means the gated entry points that take the + /// `is_syncing` slot via [`begin_pass`](Self::begin_pass): every + /// coordinator's `sync_now`, plus the shielded coordinator's + /// `sync_wallet`. The platform-address coordinator's `sync_wallet` is + /// intentionally **ungated** (it never touches `is_syncing`; callers + /// that need exclusion gate themselves), so the gate/drain barrier does + /// not apply to it. pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus { // Gate up first (instant) and held until the guard drops on return. self.quiescing.store(true, Ordering::Release); let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); + self.cancel_join_and_drain().await + } - // Cancel + bounded join of the background loop (if any). A wedged - // loop pass surfaces here as a non-clean `Timeout` rather than - // hanging — its orphaned thread is tracked by the registry for - // teardown, so we must not wait on it below. 
- let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into(); + /// Like [`quiesce`](Self::quiesce) but for a caller that has **already** + /// raised the `quiescing` gate (via [`hold_quiescing_gate`](Self::hold_quiescing_gate)) + /// and will keep holding it: this neither raises nor lowers the gate, so + /// a multi-step teardown (the shielded Clear flow) keeps the "no new + /// pass" barrier raised *continuously* across the drain, the orphan- + /// liveness check, and the store wipe — with no lapse for a direct + /// `sync_now`/`sync_wallet` to slip through and re-persist into the + /// store being cleared. (`quiesce`'s own RAII guard would lower the gate + /// the instant it returned, which is why Clear cannot just call it and + /// re-raise afterwards: a single shared `AtomicFlagGuard` always clears + /// the flag on drop, so the re-raise would leave a window.) Gate-before- + /// cancel still holds: the caller raised the gate before this runs. + #[cfg(any(test, feature = "shielded"))] + pub(crate) async fn quiesce_under_held_gate(&self) -> CoordinatorThreadStatus { + debug_assert!( + self.quiescing.load(Ordering::Acquire), + "quiesce_under_held_gate requires the caller to already hold the quiescing gate" + ); + self.cancel_join_and_drain().await + } - // Drain a *direct* in-flight pass the registry could not: with no - // loop slot, `registry.quiesce` returned `NotRunning` without - // joining anything; with an idle loop it joined a thread that was - // not the one holding `is_syncing`. Either way a `sync_now`/ - // `sync_wallet` that entered before the gate rose may still be in - // flight. The gate keeps a new pass from starting, so this - // converges, and a panicked pass clears the flag via its own RAII - // guard. 
Only drain on a clean status: a non-clean one means a - // wedged loop pass is the `is_syncing` holder (its thread was - // orphaned, not joined), and waiting on it would reintroduce the - // shutdown stall the registry's bounded join exists to prevent. + /// Cancel + bounded-join the background loop (if any), then drain a + /// direct in-flight pass on a clean status. Assumes the `quiescing` gate + /// is **already raised** (by [`quiesce`](Self::quiesce)'s own guard or a + /// caller's hold guard) and does not touch it. + /// + /// A wedged loop pass surfaces from `registry.quiesce` as a non-clean + /// `Timeout` rather than hanging — its orphaned thread is tracked by the + /// registry for teardown, so the drain below must not wait on it. On a + /// clean status the only possible `is_syncing` holder is a direct + /// `sync_now`/`sync_wallet` that entered before the gate rose (with no + /// loop slot `registry.quiesce` joined nothing; with an idle loop it + /// joined a thread that was not the one holding the flag). The raised + /// gate keeps a new pass from starting, so the drain converges, and a + /// panicked pass clears the flag via its own RAII guard. + async fn cancel_join_and_drain(&self) -> CoordinatorThreadStatus { + let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into(); if status.is_clean() { self.drain_in_flight_pass().await; } - status } @@ -187,7 +215,9 @@ impl CoordinatorLifecycle { /// the `quiescing` gate already raised (so no new pass can start) and /// after the background loop has been cancel-joined (so the only /// possible holder is a direct, non-cancellable pass running to - /// completion). Mirrors the registry's 5ms poll cadence. + /// completion). Mirrors the registry's 5ms poll cadence. Unbounded by + /// design — the caller bounds the whole teardown (the FFI `stop` / + /// `clear` bridges wrap it in a `SHUTDOWN_JOIN_TIMEOUT_SECS` timeout). 
async fn drain_in_flight_pass(&self) { while self.is_syncing.load(Ordering::Acquire) { tokio::time::sleep(Duration::from_millis(5)).await; @@ -319,4 +349,47 @@ mod tests { pass_task.await.expect("pass task joined"); } + + /// `quiesce_under_held_gate` must NOT lower the `quiescing` gate the + /// caller is holding — the mechanism that lets the shielded Clear flow + /// keep the "no new pass" barrier raised *continuously* across the + /// drain, the liveness check, and the store wipe. The plain + /// [`quiesce`](CoordinatorLifecycle::quiesce)'s own RAII guard would + /// lower it on return, leaving a window a direct pass could slip into + /// before Clear re-raised it. Must fail against a variant that delegates + /// to `quiesce` (whose guard clears the shared flag on drop). + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn quiesce_under_held_gate_keeps_caller_gate_raised() { + let lifecycle = make_lifecycle(); + + // Caller (the Clear flow) raises and holds the gate before draining. + let hold = lifecycle.hold_quiescing_gate(); + assert!( + lifecycle.quiescing.load(Ordering::Acquire), + "caller's hold raised the gate" + ); + + // Drain under the held gate (no loop registered → NotRunning); the + // gate must remain raised across the call. + let status = lifecycle.quiesce_under_held_gate().await; + assert_eq!(status, CoordinatorThreadStatus::NotRunning); + assert!( + lifecycle.quiescing.load(Ordering::Acquire), + "gate stays raised across the drain — no lapse for a direct pass" + ); + + // A direct pass attempting to begin during Clear (gate held) is + // refused: it bails after the CAS on the raised gate. + assert!( + lifecycle.begin_pass().is_none(), + "the continuously-held gate holds off a new direct pass" + ); + + // Once Clear's own guard drops, the gate reopens for later work. 
+ drop(hold); + assert!( + !lifecycle.quiescing.load(Ordering::Acquire), + "gate reopens once the caller's hold guard drops" + ); + } } diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs index 840dd13c7a..fd22ec6d17 100644 --- a/packages/rs-platform-wallet/src/manager/mod.rs +++ b/packages/rs-platform-wallet/src/manager/mod.rs @@ -514,21 +514,53 @@ impl PlatformWalletManager

{ /// /// **Host-serialization precondition**: the caller must not invoke /// `shielded_sync_start` for this manager concurrently with `clear`. A - /// concurrent direct `sync_now`/`sync_wallet` is held off (the quiescing - /// gate stays raised across the liveness check and the wipe), but a full - /// restart re-opens that gate as it spawns a fresh loop, so a `start` - /// racing `clear` can still re-persist into the wiped store. The wallet - /// UI drives these from one place; that ordering is the host's contract - /// until the registry grows a per-key clearing latch. + /// concurrent direct `sync_now`/`sync_wallet` is held off — the quiescing + /// gate is raised *continuously* for the whole clear (from before the + /// drain, across the liveness check, through the wipe), so such a pass + /// observes the gate and bails with no lapse. The one remaining residual + /// is a full `shielded_sync_start` racing `clear`: a restart spawns a + /// fresh loop and reopens the gate, so it could re-persist into the wiped + /// store. The wallet UI drives these from one place; that ordering is the + /// host's contract until the registry grows a per-key clearing latch. #[cfg(feature = "shielded")] pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> { - // Quiesce the shielded loop: cancel it, drain any in-flight pass - // (incl. its persister fan-out), and join its OS thread. The - // registry bounds the join by the coordinator's own - // `SHUTDOWN_JOIN_TIMEOUT_SECS` budget — returning `Timeout` rather - // than hanging if a pass's drop wedges — so no outer timeout is - // needed here. - let status = self.shielded_sync_manager.quiesce().await; + self.clear_shielded_inner(std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS)) + .await + } + + /// [`clear_shielded`](Self::clear_shielded) with an explicit drain + /// deadline. Split out so tests can exercise the timeout path without + /// waiting the full production budget. 
+ #[cfg(feature = "shielded")] + async fn clear_shielded_inner( + &self, + drain_timeout: std::time::Duration, + ) -> Result<(), crate::error::PlatformWalletError> { + // Raise and HOLD the shielded quiescing gate for the WHOLE clear, + // BEFORE quiescing — so the "no new pass" barrier never lapses + // between the drain, the liveness check, and the store wipe: a direct + // `sync_now`/`sync_wallet` landing anywhere in here observes the gate + // and bails instead of re-persisting into the store we are about to + // clear. `quiesce_under_held_gate` deliberately does NOT touch the + // gate (a single `AtomicFlagGuard` always clears the flag on drop, so + // letting `quiesce` manage it and re-raising afterwards would leave a + // window). The guard lowers the gate on return (every path). + let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate(); + + // Cancel the loop and drain any in-flight pass (incl. its persister + // fan-out). Bound the drain (mirroring `shielded_sync_stop`'s + // timeout) so a heavy direct pass cannot hang the host's Clear: on + // timeout the clear reports `Timeout` and aborts BEFORE the wipe, + // leaving the store intact. + let status = match tokio::time::timeout( + drain_timeout, + self.shielded_sync_manager.quiesce_under_held_gate(), + ) + .await + { + Ok(status) => status, + Err(_elapsed) => CoordinatorThreadStatus::Timeout, + }; // Only commit the store wipe once the in-flight pass has fully // drained. A partial/timed-out drain could let a surviving pass @@ -537,20 +569,14 @@ impl PlatformWalletManager

{ if !status.is_clean() { return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status }); } - // Hold the shielded quiescing gate raised across BOTH the liveness - // check below and the store wipe, so the gate guarding "no new pass" - // does not lapse between check and act: a direct `sync_now` / - // `sync_wallet` that lands here observes the gate and bails instead - // of writing into the store we are about to clear. The guard lowers - // the gate on return (every path), so a later start/sync works. - let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate(); - // [F2 FIX] Also refuse if a prior-generation shielded thread is - // still parked alive: it holds an `Arc` to the persister/store and - // could re-persist notes into the store we are about to wipe. The - // check is shielded-scoped, so the other coordinators / the - // always-on event adapter running normally do not block Clear. - if self.registry.any_alive_for(WalletWorker::ShieldedSync) { + // Also refuse if a prior-generation shielded thread is still parked + // alive: it holds an `Arc` to the persister/store and could re-persist + // notes into the store we are about to wipe. The check is shielded- + // scoped (shares the `shielded_worker_alive` gate), so the other + // coordinators / the always-on event adapter running normally do not + // block Clear. + if self.shielded_worker_alive() { return Err( crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status: CoordinatorThreadStatus::Detached, @@ -1044,6 +1070,59 @@ mod tests { .expect("clear_shielded must succeed once the orphan is reaped"); } + /// SEC-001: `clear_shielded` must BOUND its in-flight-pass drain so a + /// heavy direct `sync_now`/`sync_wallet` that won't drain in time cannot + /// hang the host's Clear. On the drain deadline the clear reports + /// `ShieldedShutdownIncomplete` and aborts BEFORE the store wipe, leaving + /// the store intact. 
+ /// + /// Non-vacuous: against an unbounded drain the held pass keeps + /// `is_syncing` set forever and `clear_shielded_inner` never returns — the + /// test's outer timeout fires and the `expect` below panics. + #[cfg(feature = "shielded")] + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn clear_shielded_aborts_without_wiping_when_drain_times_out() { + let manager = Arc::new(make_manager()); + + // A direct sync pass already in flight (holds `is_syncing`); it never + // drains within the clear's drain budget. + let (ready_tx, ready_rx) = tokio::sync::oneshot::channel::<()>(); + let (release_tx, release_rx) = tokio::sync::oneshot::channel::<()>(); + let ssm = Arc::clone(&manager.shielded_sync_manager); + let pass_task = tokio::spawn(async move { + let _pass = ssm + .begin_pass_for_test() + .expect("direct pass enters the slot"); + ready_tx.send(()).expect("signal in-flight"); + release_rx.await.expect("await release"); + // `_pass` drops here → is_syncing = false + }); + + ready_rx.await.expect("pass reached in-flight"); + assert!(manager.shielded_sync_manager.is_syncing()); + + // Clear with a short drain budget: the held pass can't drain in time, + // so the clear must return ShieldedShutdownIncomplete — bounded, never + // hanging — and never reach the wipe. + let result = tokio::time::timeout( + Duration::from_secs(5), + manager.clear_shielded_inner(Duration::from_millis(100)), + ) + .await + .expect("clear must return within its bounded drain, never hang"); + assert!( + matches!( + result, + Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { .. }) + ), + "bounded drain timeout must surface as ShieldedShutdownIncomplete, got {result:?}" + ); + + // Release the held pass and join. 
+ release_tx.send(()).expect("release the pass"); + pass_task.await.expect("pass task joined"); + } + /// TC-015 (R5): `from_report` maps the registry's [`ShutdownReport`] /// onto the FFI-stable `CoordinatorExitStatus` with identical field / /// variant shape and `all_clean()` semantics. The full `WorkerStatus` diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs index 6a66e30ba6..f949b48dd1 100644 --- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs +++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs @@ -278,6 +278,25 @@ impl ShieldedSyncManager { self.lifecycle.quiesce().await } + /// Drain + join **without touching the `quiescing` gate**, for a caller + /// (the Clear flow) that already holds it raised via + /// [`hold_quiescing_gate`](Self::hold_quiescing_gate) and keeps holding + /// it across the whole teardown. See + /// [`CoordinatorLifecycle::quiesce_under_held_gate`]. + pub(crate) async fn quiesce_under_held_gate(&self) -> super::CoordinatorThreadStatus { + self.lifecycle.quiesce_under_held_gate().await + } + + /// Test seam: enter a sync pass directly (claim `is_syncing` via the + /// pass gate) so a teardown test can stand in for a direct + /// `sync_now`/`sync_wallet` already in flight, without driving the real + /// (coordinator-backed) sync path. The returned guard clears the flag + /// on drop. + #[cfg(test)] + pub(crate) fn begin_pass_for_test(&self) -> Option<AtomicFlagGuard<'_>> { + self.lifecycle.begin_pass() + } + /// Raise the `quiescing` gate and hold it raised until the returned /// guard drops.
Where [`quiesce`](Self::quiesce) reopens the gate as /// soon as it returns, this lets a multi-step teardown (Clear) keep new From 7be68c561846eb0114b449398ca83440d914603f Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:19:07 +0200 Subject: [PATCH 27/29] refactor(dash-async): full spawn-failure rollback + drop stale doc history-tags RUST-004: a failed thread spawn left the slot carrying the FAILED start's weight/drain/join_budget (and a bumped generation). Now snapshot the pre-start config and restore ALL of it on the Err path, so the re-installed prior keeps its own teardown config; generation rolls back too (the +1 is only observed under the slot lock and the failed start spawns no thread, so the rollback is net-zero and the externally-visible generation stays monotonic). New regression test asserts the restored config. RUST-005: trimmed the duplicated park-under-lock rustdoc block in start_thread that repeated park_prior_locked's doc. RUST-003/PROJ hygiene: removed [F1/F2/F3 FIX] history-tags from committed comments, replaced the 'Why F1 and F2 cannot recur' module section with present-state invariant descriptions, and fixed the glossary to reference the key-scoped any_alive_for (the gate store-wiping paths actually consult) rather than the registry-wide any_alive. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- packages/rs-dash-async/src/registry.rs | 160 ++++++++++++++++++------- 1 file changed, 119 insertions(+), 41 deletions(-) diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs index 7f103ffcae..01795a5e73 100644 --- a/packages/rs-dash-async/src/registry.rs +++ b/packages/rs-dash-async/src/registry.rs @@ -15,21 +15,24 @@ //! - [`start_task`](ThreadRegistry::start_task) — a tokio task, for //! `Send` futures. //! -//! # Why F1 and F2 cannot recur +//! # Safety invariants //! -//! 
- **F1** (timeout-dropped quiesce detaches a live thread): every join -//! path takes `&self`; the live join handle stays owned by the slot -//! and is never moved into a cancellable future's frame. A +//! - **A timed-out or dropped quiesce never detaches a live thread.** +//! Every join path takes `&self`; the live join handle stays owned by +//! the slot and is never moved into a cancellable future's frame. A //! dropped/timed-out [`quiesce`](ThreadRegistry::quiesce) therefore //! cannot drop-and-detach the handle — on timeout (or on an external //! drop) the handle is deterministically re-parked into the orphan //! list, and the slot reports [`WorkerStatus::Timeout`], never a clean //! `NotRunning`. -//! - **F2** (store wipe races a parked prior-generation thread): -//! orphans live in the registry and [`any_alive`](ThreadRegistry::any_alive) -//! is the single liveness gate spanning live slots **and** parked -//! orphans. Every store-wiping path consults it, so a parked -//! still-live thread blocks the wipe. +//! - **A store wipe cannot race a parked prior-generation thread.** +//! Orphans live in the registry and +//! [`any_alive_for`](ThreadRegistry::any_alive_for) is the key-scoped +//! liveness gate spanning a key's live slot **and** its parked orphans +//! (with [`any_alive`](ThreadRegistry::any_alive) the registry-wide +//! variant). A store-wiping path scoped to one worker consults the +//! key-scoped gate, so a parked still-live thread blocks the wipe of its +//! own worker's store without an unrelated worker blocking it. use std::collections::BTreeMap; use std::future::Future; @@ -354,6 +357,17 @@ impl ThreadRegistry { // install the new token under this one lock so a prior // thread's epilogue observes the post-swap generation. 
let prior = slot.handle.take(); + // Snapshot the slot's pre-start config so a spawn failure can roll + // the slot back to exactly its prior state: a re-installed prior + // worker must keep its OWN teardown config, not inherit the failed + // start's weight/drain/join_budget. Generation is rolled back too — + // the bump is only ever observed under this lock and a failed start + // spawns no thread to reference it, so the rollback is net-zero and + // the externally-visible generation stays monotonic. + let prev_generation = slot.generation; + let prev_weight = slot.weight; + let prev_join_budget = slot.join_budget; + let prev_drain = slot.drain.take(); let token = CancellationToken::new(); slot.cancel = Some(token.clone()); slot.generation += 1; @@ -378,36 +392,27 @@ impl ThreadRegistry { body(body_token); }) { Ok(join) => { - // Store the new handle, then park the prior into orphans - // — both while still under THIS slot lock (R1: store - // handle -> park prior -> drop guard -> THEN bounded - // reap below). + // Store the new handle, then park the prior into orphans — + // both still under THIS slot lock, so `shutdown`'s + // under-lock tier snapshot can never see the new slot + // without also seeing the prior accounted (R1: store handle + // -> park prior -> drop guard -> THEN bounded reap below). + // See `park_prior_locked` for the lock-order rationale; the + // bounded join stays out of the lock in `reap_parked_prior`. slot.handle = Some(WorkerHandle::OsThread(join)); - // [F3 FIX] Park the prior UNDER the slot lock, before - // releasing it. `shutdown` latches `closing` and - // snapshots tiers under this same lock; parking here - // means the take-prior + park-prior is atomic from its - // view, so it can never observe the new slot without - // also seeing the prior accounted in orphans. 
(The old - // out-of-lock reap left a window: the prior was moved out - // of the slot but not yet parked, so a shutdown - // snapshotting in that gap reaped an empty orphan list - // and reported clean while a wedged prior was still - // live.) The bounded join stays OUT of the lock — - // `reap_parked_prior` below. The `slots`->`orphans` - // nesting this introduces is the only such nesting in the - // module and is deadlock-free: no path acquires `slots` - // while holding `orphans`. self.park_prior_locked(key, prior) } Err(e) => { - // Spawn failed (e.g. EAGAIN at the OS thread ceiling). - // Roll back so the prior handle is never detached and - // the slot is not left wedged "running": re-install - // prior, clear the running flag. `generation` stays - // bumped (it is only ever monotonic), which is harmless - // — the next start reaps the re-installed prior. Nothing - // was parked, so there is no prior to reap below. + // Spawn failed (e.g. EAGAIN at the OS thread ceiling). Roll + // the slot back to exactly its pre-start state: clear the + // running flag, re-install the prior handle (never + // detached), and restore the prior teardown config + + // generation so nothing of the failed start lingers. The + // re-installed prior keeps its own weight/drain/join_budget + // for a later quiesce/shutdown, and generation returns to + // its pre-bump value (the bump was never observed outside + // this lock and spawned no thread). Nothing was parked, so + // there is no prior to reap below. 
tracing::error!( ?key, error = %e, @@ -416,6 +421,10 @@ impl ThreadRegistry { ); slot.cancel = None; slot.handle = prior; + slot.generation = prev_generation; + slot.weight = prev_weight; + slot.drain = prev_drain; + slot.join_budget = prev_join_budget; None } } @@ -485,7 +494,7 @@ impl ThreadRegistry { body(body_token).await; }); slot.handle = Some(WorkerHandle::Task(join)); - // [F3 FIX] Park the prior UNDER this slot lock, same rationale as + // Park the prior UNDER this slot lock, same rationale as // `start_thread`: it keeps `shutdown`'s under-lock tier snapshot // from ever missing the prior. A task cannot be joined // synchronously, so there is no bounded reap here — a live prior @@ -528,7 +537,7 @@ impl ThreadRegistry { /// budget. The live handle is owned by the slot and is **never** moved /// into this future's frame, so a dropped/timed-out call cannot detach /// it; on the managed timeout — or if this future is dropped - /// mid-poll — the handle is re-parked into the orphan list. [F1 FIX] + /// mid-poll — the handle is re-parked into the orphan list. pub async fn quiesce(&self, key: K) -> WorkerStatus { // Snapshot the drain hook + budget + generation, and bail early if // nothing is registered for this key. The generation is the anchor @@ -629,7 +638,7 @@ impl ThreadRegistry { /// under that key — still alive? A store-wiping path scoped to one /// worker must gate on this (rather than the registry-wide /// [`any_alive`](Self::any_alive)) so an unrelated worker that is - /// legitimately running does not block the wipe. [F2 FIX] + /// legitimately running does not block the wipe. 
pub fn any_alive_for(&self, key: K) -> bool { if let Some(slot) = self.lock_slots().get(&key) { if slot_alive(slot) { @@ -1671,6 +1680,75 @@ mod tests { assert!(!reg.any_alive()); } + /// A thread-spawn failure must roll the slot back to its PRIOR config, not + /// leave the failed start's weight / drain / join_budget / generation + /// behind: the re-installed prior worker keeps its own teardown config for + /// a later quiesce/shutdown. + /// + /// Non-vacuous: against a partial rollback (only cancel/handle restored), + /// the slot would carry the failed start's weight/budget, a `None` drain, + /// and the bumped generation. + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn spawn_failure_restores_prior_slot_config() { + let reg = ThreadRegistry::<&str>::new(); + let (release_tx, release_rx) = mpsc::channel::<()>(); + + // gen-1 with a DISTINCTIVE config (drain hook + non-default weight and + // join budget). Wedged so it stays the live prior after cancel. + let hook: DrainHook = Arc::new(|| Box::pin(async {})); + let cfg1 = WorkerConfig { + weight: ShutdownWeight(7), + join_budget: Duration::from_secs(11), + drain: Some(hook), + }; + reg.start_thread("k", cfg1, wedged_body(release_rx)); + reg.cancel("k"); + let gen_after_gen1 = reg.lock_slots().get("k").unwrap().generation; + + // Failed restart with a DIFFERENT config; the rollback must discard it. 
+ reg.force_spawn_failure.store(true, Ordering::Release); + let cfg2 = WorkerConfig { + weight: ShutdownWeight(99), + join_budget: Duration::from_secs(99), + drain: None, + }; + reg.start_thread("k", cfg2, |_cancel| {}); + reg.force_spawn_failure.store(false, Ordering::Release); + + { + let slots = reg.lock_slots(); + let slot = slots.get("k").expect("slot present"); + assert_eq!(slot.weight, ShutdownWeight(7), "weight restored to prior"); + assert_eq!( + slot.join_budget, + Duration::from_secs(11), + "join_budget restored to prior" + ); + assert!( + slot.drain.is_some(), + "prior drain hook restored, not the failed start's None" + ); + assert_eq!( + slot.generation, gen_after_gen1, + "generation rolled back to its pre-bump value" + ); + assert!( + slot.cancel.is_none(), + "running flag cleared after failed spawn" + ); + assert!( + slot.handle.is_some(), + "prior handle re-installed (alive), not detached" + ); + } + assert!(reg.any_alive(), "live prior still accounted for"); + + // Recover: release + quiesce reaps the prior cleanly. + release_tx.send(()).unwrap(); + assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok); + assert!(!reg.any_alive()); + } + /// A panicking worker body still runs its epilogue (via the drop-guard), /// so `is_running()` reflects the crash and `start()` can relaunch the /// loop instead of silently no-op'ing. @@ -1737,9 +1815,9 @@ mod tests { assert!(!reg.any_alive(), "nothing started post-shutdown"); } - /// [F3 FIX] `start_thread` must park a restarted key's still-wedged prior - /// into the orphan list UNDER the slot lock — at the START of the - /// restart, not only after the out-of-lock reap backstop elapses. + /// `start_thread` must park a restarted key's still-wedged prior into the + /// orphan list UNDER the slot lock — at the START of the restart, not only + /// after the out-of-lock reap backstop elapses. 
/// Otherwise a `shutdown()` that snapshots tiers in the window between /// "prior taken out of the slot" and "prior parked" sees neither the /// prior (already moved out of the slot) nor an orphan, and reports From 3821389cfd5d603585bff5a52c3e25a0b91fa4d5 Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:19:18 +0200 Subject: [PATCH 28/29] docs(swift-sdk): broaden deinit comment for shielded_sync_stop's orphan-trigger contract shielded_sync_stop now also returns .errorShutdownIncomplete when the drain was clean but a prior-generation shielded thread is still parked alive as an orphan (not only when the in-flight drain times out). Update the manager deinit comment to reflect both triggers; behaviour unchanged (the deinit already leaks one strong ref to the handlers on .errorShutdownIncomplete). Comment-only. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../PlatformWallet/PlatformWalletManager.swift | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift index 36bafa37d1..34137d15e5 100644 --- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift +++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift @@ -162,12 +162,15 @@ public class PlatformWalletManager: ObservableObject { // Capture the CODE (not just free the message) for the two calls // that CAN report `.errorShutdownIncomplete`: `shielded_sync_stop` // and `destroy`. 
Rust returns that code when a background - // coordinator did not drain within the join deadline — meaning a - // lingering `!Send` coordinator thread may still hold the - // `passUnretained` context pointers Rust was handed for our - // `persistenceHandler` / `eventHandler` and fire ONE final callback - // through them. The contract: on that code the host must NOT free - // the callback context immediately. + // coordinator did not drain within the join deadline, OR — for + // `shielded_sync_stop` — when the drain was clean but a prior- + // generation shielded thread is still parked alive as an orphan + // (a tight `stop()`→`start()` reap that had to detach it past the + // wedge backstop). In either case a lingering `!Send` coordinator + // thread may still hold the `passUnretained` context pointers Rust + // was handed for our `persistenceHandler` / `eventHandler` and fire + // ONE final callback through them. The contract: on that code the + // host must NOT free the callback context immediately. let shieldedStopCode = platform_wallet_manager_shielded_sync_stop(handle).discardReturningCode() let destroyCode = From 748c4f826c24c0093743d66d3dc5f9157448a07b Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:28:19 +0200 Subject: [PATCH 29/29] fix(platform-wallet): make the quiescing<->is_syncing handshake self-fencing (SeqCst) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEC-003: the pass-gate handshake is a Dekker-style StoreLoad pair across two distinct atomics — quiesce does store(quiescing) … load(is_syncing); begin_pass does CAS(is_syncing) … load(quiescing). Release/Acquire do NOT order StoreLoad across separate locations, so by the annotations alone both sides could miss each other (begin_pass reads a stale quiescing==false and runs a pass past a raised gate while the drain reads a stale is_syncing==false and returns). 
It was sound only incidentally — registry.quiesce happens to take the slots Mutex (a fence) before returning; a lock-free refactor of that path would make the race live. Promote the four handshake ops to SeqCst (a single total order guarantees at least one side observes the other): the gate-raise stores in quiesce, hold_quiescing_gate, and the registry drain hook; the is_syncing CAS (success) and quiescing load in begin_pass; and the is_syncing load in the drain. Gate lowering (reopen / RAII drop) and observational reads stay Release/Acquire — a stale-high gate read only makes a pass bail conservatively. Fix-1's gate-before-cancel + never-latched invariant is preserved (SeqCst is strictly stronger than the prior Release). Added a load-bearing-ordering comment at begin_pass. Not unit-testable (ordering); the existing gate/drain handshake tests still pass. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7 --- .../src/manager/coordinator_lifecycle.rs | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs index ebcc73419c..87e20fa6e5 100644 --- a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs +++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs @@ -123,7 +123,9 @@ impl CoordinatorLifecycle { Arc::new(move || { let quiescing = Arc::clone(&quiescing); Box::pin(async move { - quiescing.store(true, Ordering::Release); + // SeqCst: store-half of the `quiescing`<->`is_syncing` + // handshake (see `begin_pass`). + quiescing.store(true, Ordering::SeqCst); }) }) } @@ -163,7 +165,9 @@ impl CoordinatorLifecycle { /// not apply to it. pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus { // Gate up first (instant) and held until the guard drops on return. 
- self.quiescing.store(true, Ordering::Release); + // SeqCst: store-half of the `quiescing`<->`is_syncing` handshake + // (see `begin_pass`). + self.quiescing.store(true, Ordering::SeqCst); let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing); self.cancel_join_and_drain().await } @@ -219,7 +223,11 @@ impl CoordinatorLifecycle { /// design — the caller bounds the whole teardown (the FFI `stop` / /// `clear` bridges wrap it in a `SHUTDOWN_JOIN_TIMEOUT_SECS` timeout). async fn drain_in_flight_pass(&self) { - while self.is_syncing.load(Ordering::Acquire) { + // SeqCst: load-half of the `quiescing`<->`is_syncing` handshake (see + // `begin_pass`). Pairs with `begin_pass`'s SeqCst CAS so a pass that + // claimed the slot just as the gate rose is observed here and waited + // out, rather than slipping past an unsynchronized read. + while self.is_syncing.load(Ordering::SeqCst) { tokio::time::sleep(Duration::from_millis(5)).await; } } @@ -233,7 +241,10 @@ impl CoordinatorLifecycle { /// gate tests also exercise it. #[cfg(any(test, feature = "shielded"))] pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> { - self.quiescing.store(true, Ordering::Release); + // SeqCst: store-half of the `quiescing`<->`is_syncing` handshake (see + // `begin_pass`). The Clear flow raises the gate through here, so this + // raise must be self-fencing just like `quiesce`'s. + self.quiescing.store(true, Ordering::SeqCst); AtomicFlagGuard::new(&self.quiescing) } @@ -245,9 +256,22 @@ impl CoordinatorLifecycle { /// released before returning (the guard drops), so a later post-quiesce /// pass can still run. pub(crate) fn begin_pass(&self) -> Option<AtomicFlagGuard<'_>> { + // LOAD-BEARING MEMORY ORDERING: the `is_syncing` claim (this CAS) and + // the `quiescing` gate read below form a Dekker-style mutual-exclusion + // handshake with `quiesce`'s `store(quiescing) … load(is_syncing)`.
+ // The guarantee we need is that a teardown and a pass-entry can never + // BOTH miss each other — either this pass observes the raised gate and + // bails, or the drain observes our `is_syncing` claim and waits it + // out. That is a StoreLoad relationship across two distinct atomics, + // which Release/Acquire do NOT order; only SeqCst (a single total + // order over all four ops) does. So the CAS *store* here, the gate + // load here, and the matching `store(quiescing)` / `load(is_syncing)` + // on the teardown side are all `SeqCst`. (Today the lock `registry` + // takes would also fence this, but that is incidental — relying on it + // would make the handshake silently fragile to a lock-free refactor.) if self .is_syncing - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .compare_exchange(false, true, Ordering::SeqCst, Ordering::Acquire) .is_err() { return None; @@ -261,7 +285,8 @@ impl CoordinatorLifecycle { // A `quiesce` may have raised the gate between our CAS and here; if // so, bail (dropping `guard`, which clears the slot) so the drain // can complete and teardown gets a true "no further pass" barrier. - if self.quiescing.load(Ordering::Acquire) { + // SeqCst — load-half of the handshake described above. + if self.quiescing.load(Ordering::SeqCst) { return None; } Some(guard)