From f3354f6617a9407cf4c630ade2a30a87db4f9680 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Mon, 22 Jun 2026 21:46:48 +0200
Subject: [PATCH 01/29] feat(platform-wallet)!: shutdown() joins coordinator
threads and returns CoordinatorExitStatus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The three periodic sync coordinators (platform-address, identity,
shielded) run their `!Send` loops on detached OS threads via
`Handle::block_on`. `shutdown()`/`quiesce()` previously only drained the
in-flight pass (the `is_syncing` barrier) and never joined the threads,
so a consumer that drops the tokio runtime right after `shutdown()`
(one-shot / headless / stdio) could race a coordinator still polling
`tokio::time` on a shutting-down runtime and panic with
"A Tokio 1.x context was found, but it is being shutdown".
Each coordinator now stores its OS-thread `JoinHandle`; `quiesce()` joins
it (via `spawn_blocking`, after the existing drain) and returns a
`CoordinatorThreadStatus` (NotRunning / Ok / Panicked / Error). Joining
while the runtime is still alive guarantees the loop has stopped touching
`tokio::time` before the host drops the runtime. `shutdown()` aggregates
the three into `CoordinatorExitStatus`, so a panicked loop surfaces in
the status instead of being silently dropped.
JoinHandle-join chosen over a oneshot/Notify signal: `JoinHandle::join`
natively distinguishes a clean return from a panic and waits for the
actual OS thread to terminate (not just a signal fired mid-teardown),
yielding the per-thread status for free. The generation-guard reschedule
and quiesce-drain behavior are preserved.
BREAKING CHANGE: `PlatformWalletManager::shutdown()` now returns
`CoordinatorExitStatus` instead of `()`.
FFI: the internal `shutdown()` call logs the new status; the `extern "C"`
`platform_wallet_manager_destroy` signature and C ABI are unchanged.
🤖 Co-authored by [Claudius the Magnificent](https://github.com/lklimek/claudius) AI Agent
---
.../rs-platform-wallet-ffi/src/manager.rs | 13 +-
.../src/manager/identity_sync.rs | 26 +-
.../rs-platform-wallet/src/manager/mod.rs | 277 +++++++++++++++++-
.../src/manager/platform_address_sync.rs | 26 +-
.../src/manager/shielded_sync.rs | 26 +-
5 files changed, 354 insertions(+), 14 deletions(-)
diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs
index 5930c1c4db..d09d98a1e8 100644
--- a/packages/rs-platform-wallet-ffi/src/manager.rs
+++ b/packages/rs-platform-wallet-ffi/src/manager.rs
@@ -360,7 +360,18 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy(
// left alive to fire a callback against freed memory.
// `shutdown()` is idempotent, so this is safe even if the host
// already stopped some sync managers before calling destroy.
- runtime().block_on(manager.shutdown());
+ // It now joins the coordinator OS threads and returns their
+ // per-thread exit status; the C ABI exposes none of that, so we
+ // just log it (a panicked loop is worth surfacing) and drop it.
+ let status = runtime().block_on(manager.shutdown());
+ if !status.all_clean() {
+ tracing::warn!(
+ ?status,
+ "platform wallet coordinator(s) did not exit cleanly"
+ );
+ } else {
+ tracing::debug!(?status, "platform wallet coordinators joined cleanly");
+ }
}
PlatformWalletFFIResult::ok()
}
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 8730398f97..ae5ae879f7 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -160,6 +160,11 @@ where
persister: Arc
,
/// Cancel token for the background loop, if running.
background_cancel: StdMutex>,
+ /// Join handle for the background loop's OS thread, if running.
+ /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
+ /// confirm the `!Send` loop fully exited before the host drops the
+ /// runtime.
+ background_join: StdMutex >>,
/// Monotonically increasing generation counter. Incremented each
/// time `start()` installs a new cancel token so the exiting
/// thread can tell whether its token is still current.
@@ -204,6 +209,7 @@ where
sdk,
persister,
background_cancel: StdMutex::new(None),
+ background_join: StdMutex::new(None),
background_generation: AtomicU64::new(0),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
@@ -405,8 +411,8 @@ where
drop(guard);
let handle = tokio::runtime::Handle::current();
- let this = self;
- std::thread::Builder::new()
+ let this = Arc::clone(&self);
+ let join = std::thread::Builder::new()
.name("identity-sync".into())
.spawn(move || {
handle.block_on(async move {
@@ -434,6 +440,8 @@ where
});
})
.expect("failed to spawn identity-sync thread");
+ // Store the handle so `quiesce` can join the OS thread.
+ *self.background_join.lock().expect("bg_join poisoned") = Some(join);
}
/// Stop the background sync loop. No-op if not running.
@@ -473,13 +481,25 @@ where
/// so its falling edge (with the gate up) is a sound "fully drained"
/// signal. The gate is reopened before returning so a later
/// start/sync works normally.
- pub async fn quiesce(&self) {
+ ///
+ /// Finally **joins** the loop's OS thread (after the drain, so the
+ /// thread is on its way out) and returns its terminal status. Joining
+ /// while the runtime is still alive is what lets the manager promise
+ /// the `!Send` loop has stopped touching `tokio::time` before a
+ /// one-shot host drops the runtime.
+ pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
self.quiescing.store(true, Ordering::Release);
self.stop();
while self.is_syncing.load(Ordering::Acquire) {
tokio::time::sleep(Duration::from_millis(20)).await;
}
self.quiescing.store(false, Ordering::Release);
+ let handle = self
+ .background_join
+ .lock()
+ .expect("bg_join poisoned")
+ .take();
+ super::join_coordinator_thread(handle).await
}
/// Run one sync pass across every registered identity.
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 3d04ca086d..3529356170 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -89,6 +89,99 @@ pub struct PlatformWalletManager {
pub(super) event_adapter_join: tokio::sync::Mutex>>,
}
+/// Terminal status of one background coordinator's OS thread.
+///
+/// The three periodic coordinators run their loops on dedicated OS
+/// threads (the SDK futures are `!Send`, so they ride
+/// [`Handle::block_on`](tokio::runtime::Handle::block_on) rather than
+/// `tokio::spawn`). [`PlatformWalletManager::shutdown`] joins each
+/// thread and reports how it ended so a host can tell a clean wind-down
+/// from a panicked loop instead of silently dropping the thread.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum CoordinatorThreadStatus {
+ /// No thread was running to join — the loop was never started, or
+ /// was already stopped and joined.
+ NotRunning,
+ /// The loop exited and its OS thread joined cleanly.
+ Ok,
+ /// The OS thread panicked; carries the best-effort panic message.
+ Panicked(String),
+ /// The join itself could not complete (the blocking join task
+ /// failed). Distinct from the thread panicking.
+ Error(String),
+}
+
+impl CoordinatorThreadStatus {
+ /// `true` for a non-failure outcome (joined cleanly or never ran).
+ pub fn is_clean(&self) -> bool {
+ matches!(self, Self::Ok | Self::NotRunning)
+ }
+}
+
+/// Per-thread terminal status of every background coordinator, returned
+/// by [`PlatformWalletManager::shutdown`].
+///
+/// A host that drops its tokio runtime right after `shutdown()`
+/// (one-shot / headless / stdio) reads this to confirm each `!Send`
+/// coordinator loop fully wound down on its OS thread *before* the
+/// runtime goes away — closing the race where a still-polling loop hits
+/// `tokio::time` on a shutting-down runtime and panics with
+/// `A Tokio 1.x context was found, but it is being shutdown`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CoordinatorExitStatus {
+ /// Platform-address (BLAST) balance sync loop.
+ pub platform_address: CoordinatorThreadStatus,
+ /// Per-identity token-state sync loop.
+ pub identity: CoordinatorThreadStatus,
+ /// Shielded (Orchard) note sync loop. Always
+ /// [`CoordinatorThreadStatus::NotRunning`] in builds without the
+ /// `shielded` feature.
+ pub shielded: CoordinatorThreadStatus,
+}
+
+impl CoordinatorExitStatus {
+ /// `true` when every coordinator wound down without a panic or join
+ /// failure (each is [`Ok`](CoordinatorThreadStatus::Ok) or
+ /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)).
+ pub fn all_clean(&self) -> bool {
+ self.platform_address.is_clean() && self.identity.is_clean() && self.shielded.is_clean()
+ }
+}
+
+/// Join a coordinator's background OS thread and classify how it ended.
+///
+/// Awaited by [`quiesce`](IdentitySyncManager::quiesce) *after* the loop
+/// is cancelled and its in-flight pass drained, so the thread is already
+/// on its way out. The blocking [`JoinHandle::join`](std::thread::JoinHandle::join)
+/// runs on the blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
+/// to avoid parking a runtime worker. Joining here — while the runtime
+/// is still alive — is what guarantees the `!Send` loop has stopped
+/// touching `tokio::time` before the host drops the runtime.
+pub(crate) async fn join_coordinator_thread(
+ handle: Option>,
+) -> CoordinatorThreadStatus {
+ let Some(handle) = handle else {
+ return CoordinatorThreadStatus::NotRunning;
+ };
+ match tokio::task::spawn_blocking(move || handle.join()).await {
+ Ok(Ok(())) => CoordinatorThreadStatus::Ok,
+ Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
+ Err(join_err) => CoordinatorThreadStatus::Error(join_err.to_string()),
+ }
+}
+
+/// Best-effort extraction of a panic message from a joined thread's
+/// payload (`&str` and `String` are the common cases).
+fn panic_message(payload: Box) -> String {
+ if let Some(s) = payload.downcast_ref::<&str>() {
+ (*s).to_string()
+ } else if let Some(s) = payload.downcast_ref::() {
+ s.clone()
+ } else {
+ "unknown panic payload".to_string()
+ }
+}
+
impl PlatformWalletManager {
/// Create a new PlatformWalletManager.
///
@@ -308,11 +401,20 @@ impl PlatformWalletManager {
/// FIRST (so no further persister store or host callback can start),
/// and only THEN cancel + join the event adapter, which is the sink
/// those stores feed into.
- pub async fn shutdown(&self) {
- self.platform_address_sync_manager.quiesce().await;
- self.identity_sync_manager.quiesce().await;
+ ///
+ /// Each `quiesce()` now also **joins** its coordinator's OS thread,
+ /// so when this returns every `!Send` loop has fully exited. A host
+ /// that drops the tokio runtime right after `shutdown()` (one-shot /
+ /// headless / stdio) is therefore safe — no coordinator can still be
+ /// polling `tokio::time` on a shutting-down runtime. The returned
+ /// [`CoordinatorExitStatus`] reports per-thread how each loop ended.
+ pub async fn shutdown(&self) -> CoordinatorExitStatus {
+ let platform_address = self.platform_address_sync_manager.quiesce().await;
+ let identity = self.identity_sync_manager.quiesce().await;
#[cfg(feature = "shielded")]
- self.shielded_sync_manager.quiesce().await;
+ let shielded = self.shielded_sync_manager.quiesce().await;
+ #[cfg(not(feature = "shielded"))]
+ let shielded = CoordinatorThreadStatus::NotRunning;
self.event_adapter_cancel.cancel();
if let Some(handle) = self.event_adapter_join.lock().await.take() {
@@ -320,5 +422,172 @@ impl PlatformWalletManager {
tracing::warn!(error = ?e, "Wallet event adapter task join error");
}
}
+
+ CoordinatorExitStatus {
+ platform_address,
+ identity,
+ shielded,
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ use std::time::Duration;
+
+ use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet};
+
+ /// No-op persister — the lifecycle tests below never exercise the
+ /// real persistence pipeline, they just need a handle that satisfies
+ /// the manager's `P` bound.
+ struct NoopPersister;
+
+ impl PlatformWalletPersistence for NoopPersister {
+ fn store(
+ &self,
+ _wallet_id: WalletId,
+ _changeset: PlatformWalletChangeSet,
+ ) -> Result<(), PersistenceError> {
+ Ok(())
+ }
+
+ fn flush(&self, _wallet_id: WalletId) -> Result<(), PersistenceError> {
+ Ok(())
+ }
+
+ fn load(&self) -> Result {
+ Ok(ClientStartState::default())
+ }
+ }
+
+ /// No-op event handler standing in for the host's FFI handler.
+ struct NoopHandler;
+ impl dash_spv::EventHandler for NoopHandler {}
+ impl PlatformEventHandler for NoopHandler {}
+
+ /// Build a manager over a mock SDK + no-op persister/handler. Cheap:
+ /// `new` wires the sub-managers and spawns the event adapter but
+ /// starts no coordinator threads.
+ fn make_manager() -> PlatformWalletManager {
+ let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
+ let persister = Arc::new(NoopPersister);
+ let handler: Arc = Arc::new(NoopHandler);
+ PlatformWalletManager::new(sdk, persister, handler)
+ }
+
+ /// Start every periodic coordinator's background OS-thread loop.
+ fn start_coordinators(m: &PlatformWalletManager) {
+ Arc::clone(&m.platform_address_sync_manager).start();
+ Arc::clone(&m.identity_sync_manager).start();
+ #[cfg(feature = "shielded")]
+ Arc::clone(&m.shielded_sync_manager).start();
+ }
+
+ /// (a) `shutdown()` joins all coordinator OS threads and reports an
+ /// all-clean status; a second call has nothing left to join.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn shutdown_joins_all_coordinators_and_reports_ok() {
+ let manager = make_manager();
+ start_coordinators(&manager);
+ // Let the loops enter `block_on` so we exercise the live-loop
+ // join path (a thread cancelled before its first poll joins too).
+ tokio::time::sleep(Duration::from_millis(50)).await;
+
+ let status = manager.shutdown().await;
+ assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
+ assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
+ #[cfg(feature = "shielded")]
+ assert_eq!(status.shielded, CoordinatorThreadStatus::Ok);
+ #[cfg(not(feature = "shielded"))]
+ assert_eq!(status.shielded, CoordinatorThreadStatus::NotRunning);
+ assert!(status.all_clean());
+
+ // Handles consumed by the join → nothing left to join.
+ let again = manager.shutdown().await;
+ assert_eq!(again.platform_address, CoordinatorThreadStatus::NotRunning);
+ assert_eq!(again.identity, CoordinatorThreadStatus::NotRunning);
+ }
+
+ /// (b) A coordinator thread that panics surfaces in the status rather
+ /// than being silently dropped.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn join_coordinator_thread_surfaces_panic() {
+ let handle = std::thread::spawn(|| panic!("boom in coordinator"));
+ match join_coordinator_thread(Some(handle)).await {
+ CoordinatorThreadStatus::Panicked(msg) => {
+ assert!(msg.contains("boom in coordinator"), "msg was {msg:?}");
+ }
+ other => panic!("expected Panicked, got {other:?}"),
+ }
+ }
+
+ /// A cleanly-returning thread joins as `Ok`; an absent handle is
+ /// `NotRunning`.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn join_coordinator_thread_clean_and_absent() {
+ let handle = std::thread::spawn(|| {});
+ assert_eq!(
+ join_coordinator_thread(Some(handle)).await,
+ CoordinatorThreadStatus::Ok
+ );
+ assert_eq!(
+ join_coordinator_thread(None).await,
+ CoordinatorThreadStatus::NotRunning
+ );
+ }
+
+ /// (c) Race regression: model the one-shot / headless path — start
+ /// the coordinators, `shutdown()`, then **drop the runtime**. Because
+ /// `shutdown()` joined every loop while the runtime was still alive
+ /// (asserted via the all-`Ok` status), nothing is left polling
+ /// `tokio::time`, so the drop raises no "Tokio … being shutdown"
+ /// panic. The temporary (process-global) hook counts only that specific
+ /// panic, so a concurrent unrelated panic can't trip the assertion.
+ #[test]
+ fn shutdown_then_drop_runtime_does_not_panic() {
+ use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
+
+ static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
+ let prev_hook = std::panic::take_hook();
+ std::panic::set_hook(Box::new(|info| {
+ if info.to_string().contains("being shutdown") {
+ SHUTDOWN_PANICS.fetch_add(1, AtomicOrdering::SeqCst);
+ }
+ }));
+
+ let runtime = tokio::runtime::Builder::new_multi_thread()
+ .worker_threads(4)
+ .enable_all()
+ .build()
+ .expect("build runtime");
+
+ let status = runtime.block_on(async {
+ let manager = make_manager();
+ start_coordinators(&manager);
+ tokio::time::sleep(Duration::from_millis(50)).await;
+ manager.shutdown().await
+ });
+
+ // The headless drop: with every coordinator already joined, this
+ // cannot race a loop still touching the timer.
+ drop(runtime);
+ std::thread::sleep(Duration::from_millis(100));
+ let racing_panics = SHUTDOWN_PANICS.load(AtomicOrdering::SeqCst);
+
+ // Restore the hook before asserting so a failure prints normally.
+ std::panic::set_hook(prev_hook);
+
+ assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
+ assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
+ assert!(
+ status.all_clean(),
+ "coordinators did not wind down: {status:?}"
+ );
+ assert_eq!(
+ racing_panics, 0,
+ "dropping the runtime after shutdown raced a coordinator thread"
+ );
}
}
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index e1a229806c..baa6111e02 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -97,6 +97,11 @@ pub struct PlatformAddressSyncManager {
event_manager: Arc,
/// Cancel token for the background loop, if running.
background_cancel: StdMutex>,
+ /// Join handle for the background loop's OS thread, if running.
+ /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
+ /// confirm the `!Send` loop fully exited before the host drops the
+ /// runtime.
+ background_join: StdMutex >>,
interval_secs: AtomicU64,
is_syncing: AtomicBool,
/// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -125,6 +130,7 @@ impl PlatformAddressSyncManager {
wallets,
event_manager,
background_cancel: StdMutex::new(None),
+ background_join: StdMutex::new(None),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
quiescing: AtomicBool::new(false),
@@ -204,8 +210,8 @@ impl PlatformAddressSyncManager {
drop(guard);
let handle = tokio::runtime::Handle::current();
- let this = self;
- std::thread::Builder::new()
+ let this = Arc::clone(&self);
+ let join = std::thread::Builder::new()
.name("platform-address-sync".into())
.spawn(move || {
handle.block_on(async move {
@@ -229,6 +235,8 @@ impl PlatformAddressSyncManager {
});
})
.expect("failed to spawn platform-address-sync thread");
+ // Store the handle so `quiesce` can join the OS thread.
+ *self.background_join.lock().expect("bg_join poisoned") = Some(join);
}
/// Stop the background sync loop. No-op if not running.
@@ -270,13 +278,25 @@ impl PlatformAddressSyncManager {
/// falling edge (with the gate up) is a sound "fully drained" signal.
/// The gate is reopened before returning so a later start/sync works
/// normally.
- pub async fn quiesce(&self) {
+ ///
+ /// Finally **joins** the loop's OS thread (after the drain, so the
+ /// thread is on its way out) and returns its terminal status. Joining
+ /// while the runtime is still alive is what lets the manager promise
+ /// the `!Send` loop has stopped touching `tokio::time` before a
+ /// one-shot host drops the runtime.
+ pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
self.quiescing.store(true, Ordering::Release);
self.stop();
while self.is_syncing.load(Ordering::Acquire) {
tokio::time::sleep(Duration::from_millis(20)).await;
}
self.quiescing.store(false, Ordering::Release);
+ let handle = self
+ .background_join
+ .lock()
+ .expect("bg_join poisoned")
+ .take();
+ super::join_coordinator_thread(handle).await
}
/// Run one sync pass across every registered wallet.
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 482674b432..d634c65398 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -141,6 +141,11 @@ pub struct ShieldedSyncManager {
coordinator_slot: Arc>>>,
/// Cancel token for the background loop, if running.
background_cancel: StdMutex>,
+ /// Join handle for the background loop's OS thread, if running.
+ /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
+ /// confirm the `!Send` loop fully exited before the host drops the
+ /// runtime.
+ background_join: StdMutex >>,
/// Monotonically increasing generation counter. Bumped on every
/// `start()` so the exiting thread can tell whether its
/// generation is still the active one before clearing
@@ -171,6 +176,7 @@ impl ShieldedSyncManager {
event_manager,
coordinator_slot,
background_cancel: StdMutex::new(None),
+ background_join: StdMutex::new(None),
background_generation: AtomicU64::new(0),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
@@ -235,8 +241,8 @@ impl ShieldedSyncManager {
drop(guard);
let handle = tokio::runtime::Handle::current();
- let this = self;
- std::thread::Builder::new()
+ let this = Arc::clone(&self);
+ let join = std::thread::Builder::new()
.name("shielded-sync".into())
.spawn(move || {
handle.block_on(async move {
@@ -275,6 +281,8 @@ impl ShieldedSyncManager {
});
})
.expect("failed to spawn shielded-sync thread");
+ // Store the handle so `quiesce` can join the OS thread.
+ *self.background_join.lock().expect("bg_join poisoned") = Some(join);
}
/// Stop the background sync loop. No-op if not running.
@@ -313,13 +321,25 @@ impl ShieldedSyncManager {
/// including the persister fan-out, so its falling edge (with the
/// gate up) is a sound "fully drained" signal. The gate is reopened
/// before returning so a later start/sync works normally.
- pub async fn quiesce(&self) {
+ ///
+ /// Finally **joins** the loop's OS thread (after the drain, so the
+ /// thread is on its way out) and returns its terminal status. Joining
+ /// while the runtime is still alive is what lets the manager promise
+ /// the `!Send` loop has stopped touching `tokio::time` before a
+ /// one-shot host drops the runtime.
+ pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
self.quiescing.store(true, Ordering::Release);
self.stop();
while self.is_syncing.load(Ordering::Acquire) {
tokio::time::sleep(Duration::from_millis(20)).await;
}
self.quiescing.store(false, Ordering::Release);
+ let handle = self
+ .background_join
+ .lock()
+ .expect("bg_join poisoned")
+ .take();
+ super::join_coordinator_thread(handle).await
}
/// Run one sync pass across every registered wallet.
From 261178e8ae1897fdebb4f0e8fcba61826ad3336b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Mon, 22 Jun 2026 22:41:31 +0200
Subject: [PATCH 02/29] fix(platform-wallet): RAII-guard is_syncing so a
coordinator panic cannot wedge shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SEC-001: Add `IsSyncingGuard` RAII struct to all three coordinator
`sync_now` (and shielded `sync_wallet`) implementations. The guard
clears `is_syncing=false` on every exit path — normal return, early
return, and panic-unwind — so `quiesce()`'s drain loop can never spin
forever on a panicked pass, and the `Panicked` thread-exit status
becomes reachable.
SEC-002: Wrap each coordinator's `quiesce()` call in `shutdown()` with
`tokio::time::timeout(30 s)`. On timeout the slot reports
`CoordinatorThreadStatus::Error("join timed out")` rather than hanging
forever.
SEC-003: Add `debug_assert!` in `shutdown()` that the current runtime
is `MultiThread`; document the precondition in the method doc.
F-5: In all three coordinators' `start()`, store the `JoinHandle` in
`background_join` while still holding the `background_cancel` lock —
closes the theoretical window where a concurrent `quiesce()` could run
between the thread spawn and the handle store and take a `None` handle.
Keep `CoordinatorThreadStatus` (variants `Ok / Panicked / NotRunning /
Error`, reordered here) as the per-thread type the coordinator modules
already reference via `super::CoordinatorThreadStatus`. Rename the
`CoordinatorExitStatus` fields to `platform_address_sync` /
`identity_sync` / `shielded_sync` (now an `Option`, `None` without the
`shielded` feature) and add `event_adapter` for the wallet-event task.
`join_coordinator_thread`'s spawn_blocking `Err` arm maps to `Error`
rather than `Panicked` to distinguish infra failure from thread panic
(F-6 documented).
Co-Authored-By: Claudius the Magnificent
🤖 Co-authored by [Claudius the Magnificent](https://github.com/lklimek/claudius) AI Agent
---
.../src/manager/identity_sync.rs | 41 +-
.../rs-platform-wallet/src/manager/mod.rs | 399 +++++++++++++-----
.../src/manager/platform_address_sync.rs | 58 ++-
.../src/manager/shielded_sync.rs | 56 ++-
4 files changed, 407 insertions(+), 147 deletions(-)
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index ae5ae879f7..7ce38eb5fd 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -75,6 +75,20 @@ use crate::wallet::platform_wallet::WalletId;
/// startup default.
pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
+/// RAII guard that clears `is_syncing` when dropped.
+///
+/// Created at the start of a sync pass (after the `compare_exchange`
+/// that takes the slot). On any exit — normal return, early return, or
+/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
+/// never spins forever on a panicked pass.
+struct IsSyncingGuard<'a>(&'a AtomicBool);
+
+impl Drop for IsSyncingGuard<'_> {
+ fn drop(&mut self) {
+ self.0.store(false, Ordering::Release);
+ }
+}
+
/// Maximum number of token ids fetched in a single
/// `IdentityTokenBalancesQuery`.
///
@@ -401,14 +415,13 @@ where
/// The first pass runs immediately; subsequent passes fire every
/// [`interval`](Self::interval).
pub fn start(self: Arc) {
- let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned");
- if guard.is_some() {
+ let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+ if cancel_guard.is_some() {
return;
}
let cancel = CancellationToken::new();
- *guard = Some(cancel.clone());
+ *cancel_guard = Some(cancel.clone());
let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
- drop(guard);
let handle = tokio::runtime::Handle::current();
let this = Arc::clone(&self);
@@ -440,8 +453,11 @@ where
});
})
.expect("failed to spawn identity-sync thread");
- // Store the handle so `quiesce` can join the OS thread.
+ // Store the join handle while still holding cancel_guard — a
+ // concurrent quiesce() must wait for this lock before calling
+ // stop(), so the handle is always stored before it can be taken.
*self.background_join.lock().expect("bg_join poisoned") = Some(join);
+ // cancel_guard drops here, releasing background_cancel.
}
/// Stop the background sync loop. No-op if not running.
@@ -521,12 +537,17 @@ where
return;
}
+ // RAII guard: clears `is_syncing` on every exit path, including
+ // panics. Without this a panic inside the pass would leave
+ // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
+ let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
// A `quiesce()` may have raised the gate between our CAS and
- // here; if so, release the slot and bail without running a pass
- // so the drain can complete and shutdown gets a true barrier
- // (no further `persister.store(...)` after quiesce returns).
+ // here; if so, bail without running a pass so the drain can
+ // complete and shutdown gets a true barrier (no further
+ // `persister.store(...)` after quiesce returns).
+ // Guard clears `is_syncing` on return.
if self.quiescing.load(Ordering::Acquire) {
- self.is_syncing.store(false, Ordering::Release);
return;
}
@@ -552,7 +573,7 @@ where
.map(|d| d.as_secs())
.unwrap_or(0);
self.last_sync_unix.store(now, Ordering::Release);
- self.is_syncing.store(false, Ordering::Release);
+ // `_is_syncing_guard` drops here → `is_syncing = false`
}
/// Sync a single identity's watched tokens against Platform.
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 3529356170..905dc32c41 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -89,7 +89,7 @@ pub struct PlatformWalletManager {
pub(super) event_adapter_join: tokio::sync::Mutex>>,
}
-/// Terminal status of one background coordinator's OS thread.
+/// How one background coordinator thread terminated.
///
/// The three periodic coordinators run their loops on dedicated OS
/// threads (the SDK futures are `!Send`, so they ride
@@ -99,15 +99,16 @@ pub struct PlatformWalletManager {
/// from a panicked loop instead of silently dropping the thread.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CoordinatorThreadStatus {
- /// No thread was running to join — the loop was never started, or
- /// was already stopped and joined.
- NotRunning,
- /// The loop exited and its OS thread joined cleanly.
+ /// The loop exited and its thread/task joined cleanly.
Ok,
- /// The OS thread panicked; carries the best-effort panic message.
+ /// The thread/task panicked; carries the best-effort panic message.
Panicked(String),
- /// The join itself could not complete (the blocking join task
- /// failed). Distinct from the thread panicking.
+ /// No thread/task was running to join — never started, or already
+ /// joined by a previous `shutdown()`.
+ NotRunning,
+ /// The join did not complete within the bounded timeout, or the
+ /// `spawn_blocking` task itself failed (e.g. runtime torn down
+ /// before the join could run — unreachable in normal operation).
Error(String),
}
@@ -118,8 +119,8 @@ impl CoordinatorThreadStatus {
}
}
-/// Per-thread terminal status of every background coordinator, returned
-/// by [`PlatformWalletManager::shutdown`].
+/// Per-thread terminal status of every background worker, returned by
+/// [`PlatformWalletManager::shutdown`].
///
/// A host that drops its tokio runtime right after `shutdown()`
/// (one-shot / headless / stdio) reads this to confirm each `!Send`
@@ -130,33 +131,41 @@ impl CoordinatorThreadStatus {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CoordinatorExitStatus {
/// Platform-address (BLAST) balance sync loop.
- pub platform_address: CoordinatorThreadStatus,
+ pub platform_address_sync: CoordinatorThreadStatus,
/// Per-identity token-state sync loop.
- pub identity: CoordinatorThreadStatus,
- /// Shielded (Orchard) note sync loop. Always
- /// [`CoordinatorThreadStatus::NotRunning`] in builds without the
- /// `shielded` feature.
- pub shielded: CoordinatorThreadStatus,
+ pub identity_sync: CoordinatorThreadStatus,
+ /// Shielded (Orchard) note sync loop. `None` in builds without the
+ /// `shielded` feature (the coordinator does not exist).
+ pub shielded_sync: Option,
+ /// Wallet-event adapter (a `tokio` task, not an OS thread).
+ pub event_adapter: CoordinatorThreadStatus,
}
impl CoordinatorExitStatus {
- /// `true` when every coordinator wound down without a panic or join
- /// failure (each is [`Ok`](CoordinatorThreadStatus::Ok) or
+ /// `true` when every worker exited without a panic or join failure (each is
+ /// [`Ok`](CoordinatorThreadStatus::Ok) or
/// [`NotRunning`](CoordinatorThreadStatus::NotRunning)).
pub fn all_clean(&self) -> bool {
- self.platform_address.is_clean() && self.identity.is_clean() && self.shielded.is_clean()
+ self.platform_address_sync.is_clean()
+ && self.identity_sync.is_clean()
+ && self.shielded_sync.as_ref().is_none_or(|s| s.is_clean())
+ && self.event_adapter.is_clean()
}
}
/// Join a coordinator's background OS thread and classify how it ended.
///
-/// Awaited by [`quiesce`](IdentitySyncManager::quiesce) *after* the loop
-/// is cancelled and its in-flight pass drained, so the thread is already
-/// on its way out. The blocking [`JoinHandle::join`](std::thread::JoinHandle::join)
-/// runs on the blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
-/// to avoid parking a runtime worker. Joining here — while the runtime
-/// is still alive — is what guarantees the `!Send` loop has stopped
-/// touching `tokio::time` before the host drops the runtime.
+/// Called from each coordinator's `quiesce()` after cancelling the
+/// loop and draining any in-flight pass, so the thread is already on
+/// its way out and the join is near-instant. The blocking
+/// [`JoinHandle::join`](std::thread::JoinHandle::join) runs on the
+/// blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
+/// so the async executor stays unblocked. Joining while the runtime is
+/// still alive guarantees the `!Send` loop has stopped touching
+/// `tokio::time` before the host drops the runtime.
+///
+/// **Requires a multi-thread runtime** — `shutdown()` asserts this in
+/// debug builds; do not await this from a `current_thread` runtime.
pub(crate) async fn join_coordinator_thread(
handle: Option>,
) -> CoordinatorThreadStatus {
@@ -166,11 +175,14 @@ pub(crate) async fn join_coordinator_thread(
match tokio::task::spawn_blocking(move || handle.join()).await {
Ok(Ok(())) => CoordinatorThreadStatus::Ok,
Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
- Err(join_err) => CoordinatorThreadStatus::Error(join_err.to_string()),
+ // spawn_blocking fails only when the runtime shuts down before
+ // the blocking task can run — unreachable in normal operation
+ // since shutdown() is called while the runtime is alive (F-6).
+ Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")),
}
}
-/// Best-effort extraction of a panic message from a joined thread's
+/// Best-effort extraction of a panic message from a joined thread/task
/// payload (`&str` and `String` are the common cases).
fn panic_message(payload: Box) -> String {
if let Some(s) = payload.downcast_ref::<&str>() {
@@ -178,10 +190,17 @@ fn panic_message(payload: Box) -> String {
} else if let Some(s) = payload.downcast_ref::() {
s.clone()
} else {
- "unknown panic payload".to_string()
+ "".to_string()
}
}
+/// Maximum time (seconds) `shutdown()` waits for each worker — one
+/// coordinator's quiesce+join, or the event-adapter task join. Under
+/// normal operation this deadline is never reached (the RAII `is_syncing`
+/// guard ensures the drain exits even on panic). On timeout the worker's
+/// slot reports [`CoordinatorThreadStatus::Error`]`("join timed out")`.
+const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
+
impl PlatformWalletManager {
/// Create a new PlatformWalletManager.
///
@@ -402,31 +421,82 @@ impl PlatformWalletManager {
/// and only THEN cancel + join the event adapter, which is the sink
/// those stores feed into.
///
- /// Each `quiesce()` now also **joins** its coordinator's OS thread,
- /// so when this returns every `!Send` loop has fully exited. A host
- /// that drops the tokio runtime right after `shutdown()` (one-shot /
- /// headless / stdio) is therefore safe — no coordinator can still be
- /// polling `tokio::time` on a shutting-down runtime. The returned
- /// [`CoordinatorExitStatus`] reports per-thread how each loop ended.
+ /// After each coordinator's `quiesce()` drains its in-flight pass,
+ /// this also **joins** the loop's OS thread, so when `shutdown()`
+ /// returns every `!Send` loop has fully exited. A host that drops the
+ /// tokio runtime right after `shutdown()` (one-shot / headless /
+ /// stdio) is therefore safe — no coordinator can still be polling
+ /// `tokio::time` on a shutting-down runtime. The returned
+ /// [`CoordinatorExitStatus`] reports per-thread how each worker ended.
+ ///
+ /// **Precondition: must be called from a multi-thread Tokio runtime.**
+ /// `quiesce()` uses `spawn_blocking` internally; calling from a
+ /// `current_thread` runtime will `debug_assert!`-panic in debug
+ /// builds or deadlock in release builds.
+ ///
+ /// Each coordinator quiesce+join is bounded by
+ /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit
+ /// within that window, its slot reports
+ /// [`CoordinatorThreadStatus::Error`]`("join timed out")` rather
+ /// than hanging forever. Under normal operation (no infinite loops,
+ /// RAII guard clears `is_syncing` even on panic) this timeout is
+ /// never reached.
pub async fn shutdown(&self) -> CoordinatorExitStatus {
- let platform_address = self.platform_address_sync_manager.quiesce().await;
- let identity = self.identity_sync_manager.quiesce().await;
+ debug_assert!(
+ matches!(
+ tokio::runtime::Handle::current().runtime_flavor(),
+ tokio::runtime::RuntimeFlavor::MultiThread
+ ),
+ "shutdown() requires a multi-thread Tokio runtime (spawn_blocking inside quiesce)"
+ );
+
+ let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
+
+ // Each quiesce() drains any in-flight pass AND joins the thread.
+ let platform_address_sync =
+ tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
+ .await
+ .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+
+ let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
+ .await
+ .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+
#[cfg(feature = "shielded")]
- let shielded = self.shielded_sync_manager.quiesce().await;
+ let shielded_sync = {
+ let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
+ .await
+ .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+ Some(r)
+ };
#[cfg(not(feature = "shielded"))]
- let shielded = CoordinatorThreadStatus::NotRunning;
+ let shielded_sync = None;
+ // The event adapter is a tokio task (it sinks the coordinators'
+ // stores), so cancel + join it last — after the loops feeding it
+ // are gone.
self.event_adapter_cancel.cancel();
- if let Some(handle) = self.event_adapter_join.lock().await.take() {
- if let Err(e) = handle.await {
- tracing::warn!(error = ?e, "Wallet event adapter task join error");
- }
- }
+ let event_adapter = match self.event_adapter_join.lock().await.take() {
+ None => CoordinatorThreadStatus::NotRunning,
+ Some(handle) => match tokio::time::timeout(timeout, handle).await {
+ Ok(Ok(())) => CoordinatorThreadStatus::Ok,
+ Ok(Err(e)) => {
+ tracing::warn!(error = ?e, "Wallet event adapter task join error");
+ if e.is_panic() {
+ CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
+ } else {
+ CoordinatorThreadStatus::Ok
+ }
+ }
+ Err(_) => CoordinatorThreadStatus::Error("join timed out".into()),
+ },
+ };
CoordinatorExitStatus {
- platform_address,
- identity,
- shielded,
+ platform_address_sync,
+ identity_sync,
+ shielded_sync,
+ event_adapter,
}
}
}
@@ -435,9 +505,11 @@ impl PlatformWalletManager {
mod tests {
use super::*;
+ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering as AO};
use std::time::Duration;
use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet};
+ use crate::manager::platform_address_sync::PlatformAddressSyncSummary;
/// No-op persister — the lifecycle tests below never exercise the
/// real persistence pipeline, they just need a handle that satisfies
@@ -477,6 +549,31 @@ mod tests {
PlatformWalletManager::new(sdk, persister, handler)
}
+ /// Build a manager that fires a slow (300 ms std::thread::sleep) callback
+ /// on `on_platform_address_sync_completed`. Used by F-2 drain test.
+ fn make_manager_with_slow_handler(
+ started: Arc,
+ completed: Arc,
+ ) -> PlatformWalletManager {
+ struct SlowHandler {
+ started: Arc,
+ completed: Arc,
+ }
+ impl dash_spv::EventHandler for SlowHandler {}
+ impl PlatformEventHandler for SlowHandler {
+ fn on_platform_address_sync_completed(&self, _: &PlatformAddressSyncSummary) {
+ self.started.store(true, AO::Release);
+ std::thread::sleep(Duration::from_millis(300));
+ self.completed.store(true, AO::Release);
+ }
+ }
+
+ let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
+ let persister = Arc::new(NoopPersister);
+ let handler: Arc = Arc::new(SlowHandler { started, completed });
+ PlatformWalletManager::new(sdk, persister, handler)
+ }
+
/// Start every periodic coordinator's background OS-thread loop.
fn start_coordinators(m: &PlatformWalletManager) {
Arc::clone(&m.platform_address_sync_manager).start();
@@ -485,32 +582,66 @@ mod tests {
Arc::clone(&m.shielded_sync_manager).start();
}
- /// (a) `shutdown()` joins all coordinator OS threads and reports an
- /// all-clean status; a second call has nothing left to join.
+ /// (1)+(5)+(6) Happy path: `shutdown()` joins every started worker
+ /// and reports `Ok`; it completes within a bounded time (no
+ /// `spawn_blocking` starvation/deadlock); a second `shutdown()` finds
+ /// nothing left to join (`NotRunning`) — idempotent.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
- async fn shutdown_joins_all_coordinators_and_reports_ok() {
+ async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() {
let manager = make_manager();
start_coordinators(&manager);
// Let the loops enter `block_on` so we exercise the live-loop
// join path (a thread cancelled before its first poll joins too).
tokio::time::sleep(Duration::from_millis(50)).await;
- let status = manager.shutdown().await;
- assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
- assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
+ let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown())
+ .await
+ .expect("shutdown join must complete within bound");
+ assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok);
+ assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok);
#[cfg(feature = "shielded")]
- assert_eq!(status.shielded, CoordinatorThreadStatus::Ok);
+ assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok));
#[cfg(not(feature = "shielded"))]
- assert_eq!(status.shielded, CoordinatorThreadStatus::NotRunning);
+ assert_eq!(status.shielded_sync, None);
+ assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok);
assert!(status.all_clean());
- // Handles consumed by the join → nothing left to join.
+ // Handles consumed by the first join → nothing left to join.
let again = manager.shutdown().await;
- assert_eq!(again.platform_address, CoordinatorThreadStatus::NotRunning);
- assert_eq!(again.identity, CoordinatorThreadStatus::NotRunning);
+ assert_eq!(
+ again.platform_address_sync,
+ CoordinatorThreadStatus::NotRunning
+ );
+ assert_eq!(again.identity_sync, CoordinatorThreadStatus::NotRunning);
+ assert_eq!(again.event_adapter, CoordinatorThreadStatus::NotRunning);
+ assert!(again.all_clean());
}
- /// (b) A coordinator thread that panics surfaces in the status rather
+ /// (2) Never-started coordinators report `NotRunning` (no thread to
+ /// join). The event adapter is spawned in `new`, so it still joins
+ /// `Ok`.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn shutdown_without_starting_reports_not_running() {
+ let manager = make_manager();
+
+ let status = manager.shutdown().await;
+ assert_eq!(
+ status.platform_address_sync,
+ CoordinatorThreadStatus::NotRunning
+ );
+ assert_eq!(status.identity_sync, CoordinatorThreadStatus::NotRunning);
+ #[cfg(feature = "shielded")]
+ assert_eq!(
+ status.shielded_sync,
+ Some(CoordinatorThreadStatus::NotRunning)
+ );
+ #[cfg(not(feature = "shielded"))]
+ assert_eq!(status.shielded_sync, None);
+ assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok);
+ assert!(status.all_clean());
+ }
+
+ /// (4) A coordinator thread that panics surfaces as `Panicked` rather
/// than being silently dropped.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn join_coordinator_thread_surfaces_panic() {
@@ -526,7 +657,7 @@ mod tests {
/// A cleanly-returning thread joins as `Ok`; an absent handle is
/// `NotRunning`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
- async fn join_coordinator_thread_clean_and_absent() {
+ async fn join_coordinator_thread_ok_and_absent() {
let handle = std::thread::spawn(|| {});
assert_eq!(
join_coordinator_thread(Some(handle)).await,
@@ -538,56 +669,128 @@ mod tests {
);
}
- /// (c) Race regression: model the one-shot / headless path — start
- /// the coordinators, `shutdown()`, then **drop the runtime**. Because
- /// `shutdown()` joined every loop while the runtime was still alive
- /// (asserted via the all-`Ok` status), nothing is left polling
- /// `tokio::time`, so the drop raises no "Tokio … being shutdown"
- /// panic. A scoped hook counts only that specific panic so a
- /// concurrent unrelated panic can't trip the assertion.
+ /// F-7: `join_coordinator_thread` uses `spawn_blocking` internally.
+ /// Verify it completes without deadlock within a bounded time when
+ /// called from a multi-thread runtime, as `shutdown()` requires.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() {
+ let handle = std::thread::spawn(|| {});
+ let result = tokio::time::timeout(
+ Duration::from_secs(5),
+ join_coordinator_thread(Some(handle)),
+ )
+ .await
+ .expect("join_coordinator_thread must complete within 5 s — no spawn_blocking deadlock");
+ assert_eq!(result, CoordinatorThreadStatus::Ok);
+ }
+
+ /// F-2: `shutdown()` must wait for an in-flight sync pass to drain
+ /// before joining the coordinator thread.
+ ///
+ /// A slow `on_platform_address_sync_completed` callback (300 ms)
+ /// keeps `is_syncing=true` while it runs. We call `shutdown()` while
+ /// the callback is in-flight and assert that `shutdown()` blocks
+ /// until the callback completes, then returns `Ok`.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn shutdown_waits_for_in_flight_pass_to_drain() {
+ let handler_started = Arc::new(AtomicBool::new(false));
+ let handler_completed = Arc::new(AtomicBool::new(false));
+ let manager = make_manager_with_slow_handler(
+ Arc::clone(&handler_started),
+ Arc::clone(&handler_completed),
+ );
+
+ // Start the address-sync coordinator; first pass fires immediately.
+ Arc::clone(&manager.platform_address_sync_manager).start();
+
+ // Wait until the slow completion callback is running
+ // (`is_syncing` stays true for its 300 ms duration).
+ while !handler_started.load(AO::Acquire) {
+ tokio::time::sleep(Duration::from_millis(5)).await;
+ }
+
+ // Shutdown must drain the in-flight pass before joining.
+ let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown())
+ .await
+ .expect("shutdown must complete within 5 s");
+
+ assert_eq!(
+ status.platform_address_sync,
+ CoordinatorThreadStatus::Ok,
+ "coordinator must join cleanly after drain"
+ );
+ assert!(
+ handler_completed.load(AO::Acquire),
+ "shutdown must not return before the in-flight pass completes"
+ );
+ }
+
+ /// F-3 (strengthened): race regression — start coordinators with a
+ /// long sleep interval so they spend nearly all their time in a live
+ /// `tokio::time::sleep`, then `shutdown()` and drop the runtime.
+ ///
+ /// With the thread join in `shutdown()` every coordinator has fully
+ /// exited its `block_on` before `drop(runtime)` — no race possible.
+ /// Loop 10 times to give any latent race a reliable window: WITHOUT
+ /// the join, the coordinator's `select!` wakeup (via tokio) would
+ /// race the runtime teardown and reliably trigger the
+ /// "Tokio … being shutdown" panic across the 10 iterations.
#[test]
fn shutdown_then_drop_runtime_does_not_panic() {
- use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
-
static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
let prev_hook = std::panic::take_hook();
std::panic::set_hook(Box::new(|info| {
if info.to_string().contains("being shutdown") {
- SHUTDOWN_PANICS.fetch_add(1, AtomicOrdering::SeqCst);
+ SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst);
}
}));
- let runtime = tokio::runtime::Builder::new_multi_thread()
- .worker_threads(4)
- .enable_all()
- .build()
- .expect("build runtime");
-
- let status = runtime.block_on(async {
- let manager = make_manager();
- start_coordinators(&manager);
- tokio::time::sleep(Duration::from_millis(50)).await;
- manager.shutdown().await
- });
-
- // The headless drop: with every coordinator already joined, this
- // cannot race a loop still touching the timer.
- drop(runtime);
- std::thread::sleep(Duration::from_millis(100));
- let racing_panics = SHUTDOWN_PANICS.load(AtomicOrdering::SeqCst);
-
- // Restore the hook before asserting so a failure prints normally.
- std::panic::set_hook(prev_hook);
+ for _ in 0..10 {
+ let runtime = tokio::runtime::Builder::new_multi_thread()
+ .worker_threads(4)
+ .enable_all()
+ .build()
+ .expect("build runtime");
+
+ let status = runtime.block_on(async {
+ let manager = make_manager();
+ // Long interval: coordinator spends ~10 s in a live
+ // tokio::time::sleep, maximising the race window for a
+ // join-less runtime drop.
+ manager
+ .platform_address_sync_manager
+ .set_interval(Duration::from_secs(10));
+ manager
+ .identity_sync_manager
+ .set_interval(Duration::from_secs(10));
+ #[cfg(feature = "shielded")]
+ manager
+ .shielded_sync_manager
+ .set_interval(Duration::from_secs(10));
+ start_coordinators(&manager);
+ // Wait for coordinators to finish their first (instant)
+ // pass and enter the long sleep.
+ tokio::time::sleep(Duration::from_millis(100)).await;
+ // shutdown() joins each thread before returning; without
+ // the join this drop would race the select!/block_on exit.
+ manager.shutdown().await
+ });
+
+ drop(runtime);
+ // Brief settle — any stray thread activity surfaces here.
+ std::thread::sleep(Duration::from_millis(50));
+
+ assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok);
+ assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok);
+ assert!(status.all_clean(), "workers did not wind down: {status:?}");
+ }
- assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
- assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
- assert!(
- status.all_clean(),
- "coordinators did not wind down: {status:?}"
- );
+ let racing_panics = SHUTDOWN_PANICS.load(AO::SeqCst);
+ std::panic::set_hook(prev_hook);
assert_eq!(
racing_panics, 0,
- "dropping the runtime after shutdown raced a coordinator thread"
+ "dropping the runtime after shutdown raced a coordinator thread \
+ ({racing_panics} panics across 10 iterations)"
);
}
}
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index baa6111e02..f85eb6d05e 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -31,6 +31,20 @@ use crate::wallet::PlatformWallet;
/// Default cadence — matches the 15s BLAST loop we previously ran in Swift.
pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 15;
+/// RAII guard that clears `is_syncing` when dropped.
+///
+/// Created at the start of a sync pass (after the `compare_exchange`
+/// that takes the slot). On any exit — normal return, early return, or
+/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
+/// never spins forever on a panicked pass.
+struct IsSyncingGuard<'a>(&'a AtomicBool);
+
+impl Drop for IsSyncingGuard<'_> {
+ fn drop(&mut self) {
+ self.0.store(false, Ordering::Release);
+ }
+}
+
/// Outcome of syncing a single wallet in a pass.
///
/// Not `Clone` because `AddressSyncResult` isn't. Consumers receive it
@@ -201,13 +215,12 @@ impl PlatformAddressSyncManager {
/// The first pass runs immediately; subsequent passes fire every
/// [`interval`](Self::interval).
pub fn start(self: Arc) {
- let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned");
- if guard.is_some() {
+ let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+ if cancel_guard.is_some() {
return;
}
let cancel = CancellationToken::new();
- *guard = Some(cancel.clone());
- drop(guard);
+ *cancel_guard = Some(cancel.clone());
let handle = tokio::runtime::Handle::current();
let this = Arc::clone(&self);
@@ -235,8 +248,11 @@ impl PlatformAddressSyncManager {
});
})
.expect("failed to spawn platform-address-sync thread");
- // Store the handle so `quiesce` can join the OS thread.
+ // Store the join handle while still holding cancel_guard — a
+ // concurrent quiesce() must wait for this lock before calling
+ // stop(), so the handle is always stored before it can be taken.
*self.background_join.lock().expect("bg_join poisoned") = Some(join);
+ // cancel_guard drops here, releasing background_cancel.
}
/// Stop the background sync loop. No-op if not running.
@@ -312,13 +328,17 @@ impl PlatformAddressSyncManager {
return PlatformAddressSyncSummary::default();
}
+ // RAII guard: clears `is_syncing` on every exit path, including
+ // panics. Without this a panic inside the pass would leave
+ // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
+ let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
// A `quiesce()` may have raised the gate between our CAS and
- // here; if so, release the slot and bail without running a pass
- // so the drain can complete and shutdown gets a true barrier
- // (no further `on_platform_address_sync_completed` host callback
- // after quiesce returns).
+ // here; if so, bail without running a pass so the drain can
+ // complete and shutdown gets a true barrier (no further
+ // `on_platform_address_sync_completed` host callback after
+ // quiesce returns). Guard clears `is_syncing` on return.
if self.quiescing.load(Ordering::Acquire) {
- self.is_syncing.store(false, Ordering::Release);
return PlatformAddressSyncSummary::default();
}
@@ -352,20 +372,18 @@ impl PlatformAddressSyncManager {
summary.sync_unix_seconds = now;
self.last_sync_unix.store(now, Ordering::Release);
- // Dispatch the completion event BEFORE clearing `is_syncing`.
- // `quiesce()` drains on the falling edge of `is_syncing`, so if
- // we cleared the flag first a shutdown caller could unblock and
- // free the host event-handler context while this completion
- // event (FFI callback → host handler) is still pending — a
- // use-after-free. Holding the flag across the dispatch makes
- // quiesce's barrier cover the host callback too. Mirrors the
- // ordering in `ShieldedSyncManager::sync_now`.
+ // Dispatch the completion event BEFORE `_is_syncing_guard` drops.
+ // `quiesce()` drains on the falling edge of `is_syncing`; if the
+ // guard cleared the flag before the dispatch a shutdown caller
+ // could unblock and free the host event-handler context while
+ // the callback is still pending — a use-after-free. The guard
+ // drops (clearing `is_syncing`) after this call returns, when
+ // the function frame unwinds.
self.event_manager
.on_platform_address_sync_completed(&summary);
- self.is_syncing.store(false, Ordering::Release);
-
summary
+ // `_is_syncing_guard` drops here → `is_syncing = false`
}
/// Sync a single wallet on demand. Does not set the global
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index d634c65398..0b2e7dda68 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -44,6 +44,20 @@ use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary};
/// is conservative compared to the 15s address-sync cadence.
pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
+/// RAII guard that clears `is_syncing` when dropped.
+///
+/// Created at the start of a sync pass (after the `compare_exchange`
+/// that takes the slot). On any exit — normal return, early return, or
+/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
+/// never spins forever on a panicked pass.
+struct IsSyncingGuard<'a>(&'a AtomicBool);
+
+impl Drop for IsSyncingGuard<'_> {
+ fn drop(&mut self) {
+ self.0.store(false, Ordering::Release);
+ }
+}
+
/// Outcome of syncing a single wallet in a shielded sync pass.
///
/// Not `Clone` because `ShieldedSyncSummary` carries the underlying
@@ -228,17 +242,16 @@ impl ShieldedSyncManager {
/// GRPC client state isn't `Send + Sync`). Same trade-off as
/// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start).
pub fn start(self: Arc) {
- let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned");
- if guard.is_some() {
+ let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+ if cancel_guard.is_some() {
return;
}
let cancel = CancellationToken::new();
- *guard = Some(cancel.clone());
+ *cancel_guard = Some(cancel.clone());
// Bump the generation while we still hold the slot lock so
// the load below in any prior thread's cleanup observes
// `current_gen != my_gen` ordered against this token swap.
let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
- drop(guard);
let handle = tokio::runtime::Handle::current();
let this = Arc::clone(&self);
@@ -281,8 +294,11 @@ impl ShieldedSyncManager {
});
})
.expect("failed to spawn shielded-sync thread");
- // Store the handle so `quiesce` can join the OS thread.
+ // Store the join handle while still holding cancel_guard — a
+ // concurrent quiesce() must wait for this lock before calling
+ // stop(), so the handle is always stored before it can be taken.
*self.background_join.lock().expect("bg_join poisoned") = Some(join);
+ // cancel_guard drops here, releasing background_cancel.
}
/// Stop the background sync loop. No-op if not running.
@@ -362,11 +378,15 @@ impl ShieldedSyncManager {
return ShieldedSyncPassSummary::default();
}
+ // RAII guard: clears `is_syncing` on every exit path, including
+ // panics. Without this a panic inside the pass would leave
+ // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
+ let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
// A `quiesce()` may have raised the gate between our CAS and
- // here; if so, release the slot and bail without running a pass
- // so the drain can complete and Clear/stop get a true barrier.
+ // here; bail so the drain can complete and Clear/stop get a
+ // true barrier. Guard clears `is_syncing` on return.
if self.quiescing.load(Ordering::Acquire) {
- self.is_syncing.store(false, Ordering::Release);
return ShieldedSyncPassSummary::default();
}
@@ -403,18 +423,15 @@ impl ShieldedSyncManager {
self.last_sync_unix
.store(summary.sync_unix_seconds, Ordering::Release);
- // Dispatch the completion event BEFORE clearing `is_syncing`.
- // `quiesce()` drains on the falling edge of `is_syncing`, so if
- // we cleared the flag first a stop/clear caller could unblock
- // while this completion event (FFI callback → Swift
- // `handleShieldedSyncCompleted`) is still pending — surfacing a
- // stale post-stop/post-clear event. Holding the flag across the
- // dispatch makes quiesce's barrier cover the event too.
+ // Dispatch the completion event BEFORE `_is_syncing_guard` drops.
+ // `quiesce()` drains on the falling edge of `is_syncing`; if
+ // the guard cleared the flag before the dispatch a stop/clear
+ // caller could unblock while the callback is still pending —
+ // surfacing a stale post-stop/post-clear event.
self.event_manager.on_shielded_sync_completed(&summary);
- self.is_syncing.store(false, Ordering::Release);
-
summary
+ // `_is_syncing_guard` drops here → `is_syncing = false`
}
/// Sync a single wallet on demand.
@@ -457,15 +474,16 @@ impl ShieldedSyncManager {
return Ok(None);
}
+ // RAII guard clears `is_syncing` on every exit path including panics.
+ let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
// Bail if a `quiesce()` raised the gate after our CAS (see
// `sync_now`) so the drain barrier holds.
if self.quiescing.load(Ordering::Acquire) {
- self.is_syncing.store(false, Ordering::Release);
return Ok(None);
}
let pass = coordinator.sync(force).await;
- self.is_syncing.store(false, Ordering::Release);
// Extract this wallet's slice from the network-wide pass
// summary. If the wallet is registered, we'll get back an
From 42d734d4f81ae76307b13eff9e449ab78955e476 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 11:12:36 +0200
Subject: [PATCH 03/29] refactor(rs-dash-async): add AtomicFlagGuard RAII
helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduces `AtomicFlagGuard`, a pub RAII guard that clears an
`AtomicBool` flag to `false` (Release ordering) on drop. The guard
does not set the flag on construction — the caller is responsible for
doing so (typically via a `compare_exchange`) — preserving the exact
semantics of the three identical `IsSyncingGuard` structs that were
copy-pasted across the platform-wallet sync coordinators.
This is the panic-safety keystone for the quiesce drain loop: if a sync
pass panics, the guard's `drop` still clears `is_syncing`, so
`quiesce()` is never permanently wedged.
Co-Authored-By: Claude Opus 4.6
---
packages/rs-dash-async/src/atomic.rs | 22 ++++++++++++++++++++++
packages/rs-dash-async/src/lib.rs | 4 ++++
2 files changed, 26 insertions(+)
create mode 100644 packages/rs-dash-async/src/atomic.rs
diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs
new file mode 100644
index 0000000000..eb79bb4ed1
--- /dev/null
+++ b/packages/rs-dash-async/src/atomic.rs
@@ -0,0 +1,22 @@
+use std::sync::atomic::{AtomicBool, Ordering};
+
+/// RAII guard that clears an [`AtomicBool`] flag to `false` on drop.
+///
+/// Callers set the flag to `true` before constructing the guard (typically
+/// via a `compare_exchange`); the guard resets it on every exit path,
+/// including panics, so a panicked holder can never leave the flag wedged.
+pub struct AtomicFlagGuard<'a>(&'a AtomicBool);
+
+impl<'a> AtomicFlagGuard<'a> {
+ /// Wrap `flag`. Does **not** set it to `true` — the caller is
+ /// responsible for doing that before constructing the guard.
+ pub fn new(flag: &'a AtomicBool) -> Self {
+ Self(flag)
+ }
+}
+
+impl Drop for AtomicFlagGuard<'_> {
+ fn drop(&mut self) {
+ self.0.store(false, Ordering::Release);
+ }
+}
diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs
index 0ef7785253..3edcf00daa 100644
--- a/packages/rs-dash-async/src/lib.rs
+++ b/packages/rs-dash-async/src/lib.rs
@@ -2,7 +2,11 @@
//!
//! Provides [`block_on`] -- a function that bridges async futures into sync code,
//! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread).
+//!
+//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets.
+mod atomic;
mod block_on;
+pub use atomic::AtomicFlagGuard;
pub use block_on::{block_on, AsyncError};
From 6e78b7777f57e0a0b270e55daae34b31a51a1de1 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 11:13:01 +0200
Subject: [PATCH 04/29] fix(platform-wallet): refine CoordinatorThreadStatus
variants + tighten runtime check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
**Task 1 — new enum variants**
Add `Stopped(Option<String>)` (non-panic, non-clean task exit, e.g.
tokio cancel/abort) and `Timeout` (join exceeded
SHUTDOWN_JOIN_TIMEOUT_SECS) to `CoordinatorThreadStatus`.
- Non-panic JoinError on the event-adapter task → `Stopped(Some(...))`,
not the previous `Ok` (wrong: a cancelled task is not a clean exit).
- Timeout on any `quiesce()` wrapper → `Timeout`, not `Error("join
timed out")`.
- `is_clean()` now returns `true` only for `Ok` and `NotRunning`; all
other variants — including the two new ones — are non-clean.
- Update all docs / comments that referenced the old `Error("join timed
out")` wording.
**Task 2 — promote debug_assert to assert**
`shutdown()`'s multi-thread-runtime guard was `debug_assert!`, making
it a no-op in release builds. Changed to `assert!` — this is a real
invariant: `spawn_blocking` deadlocks on a `current_thread` runtime.
**Task 3 — bound the test wait loop**
Wrap the `while !handler_started…` polling in
`shutdown_waits_for_in_flight_pass_to_drain` with a 5 s
`tokio::time::timeout` so a broken test fails fast instead of hanging.
**Task 4 — DRY IsSyncingGuard**
Replace the three identical copy-pasted `IsSyncingGuard` structs in
`identity_sync.rs`, `platform_address_sync.rs`, and `shielded_sync.rs`
with the new `dash_async::AtomicFlagGuard`. Adds `dash-async` to
`rs-platform-wallet/Cargo.toml`. Zero behavioral change: construction
semantics preserved (callers set the flag via `compare_exchange` before
creating the guard; `Drop` clears it with `Ordering::Release`).
**Task 5 — new tests**
- `coordinator_thread_status_clean_predicate`: unit-tests `is_clean()`
for all six variants including the two new ones; no real timeout needed.
- `coordinator_exit_status_all_clean`: tests `all_clean()` with
`Timeout` and `Stopped` slots.
- `event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean`:
aborts the adapter task before `shutdown()` and asserts the result is
`Stopped` — or `Ok` when the task finishes before the abort lands, since
`abort()` races task completion — exercising the non-panic JoinError path.
Co-Authored-By: Claude Opus 4.6
---
Cargo.lock | 1 +
packages/rs-platform-wallet/Cargo.toml | 1 +
.../src/manager/identity_sync.rs | 18 +--
.../rs-platform-wallet/src/manager/mod.rs | 134 +++++++++++++++---
.../src/manager/platform_address_sync.rs | 18 +--
.../src/manager/shielded_sync.rs | 20 +--
6 files changed, 126 insertions(+), 66 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e296c3aebd..1faa308a83 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5141,6 +5141,7 @@ dependencies = [
"async-trait",
"bimap",
"bs58",
+ "dash-async",
"dash-sdk",
"dash-spv",
"dashcore",
diff --git a/packages/rs-platform-wallet/Cargo.toml b/packages/rs-platform-wallet/Cargo.toml
index 1362523ece..e324680210 100644
--- a/packages/rs-platform-wallet/Cargo.toml
+++ b/packages/rs-platform-wallet/Cargo.toml
@@ -31,6 +31,7 @@ bimap = "0.6"
# Async runtime
tokio = { version = "1", features = ["sync", "rt", "time", "macros"] }
tokio-util = { version = "0.7.12" }
+dash-async = { path = "../rs-dash-async" }
# Logging
tracing = "0.1"
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 7ce38eb5fd..34bf0fefc7 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -51,6 +51,8 @@ use std::sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
Arc, Mutex as StdMutex,
};
+
+use dash_async::AtomicFlagGuard;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use dpp::balances::credits::TokenAmount;
@@ -75,20 +77,6 @@ use crate::wallet::platform_wallet::WalletId;
/// startup default.
pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
-/// RAII guard that clears `is_syncing` when dropped.
-///
-/// Created at the start of a sync pass (after the `compare_exchange`
-/// that takes the slot). On any exit — normal return, early return, or
-/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
-/// never spins forever on a panicked pass.
-struct IsSyncingGuard<'a>(&'a AtomicBool);
-
-impl Drop for IsSyncingGuard<'_> {
- fn drop(&mut self) {
- self.0.store(false, Ordering::Release);
- }
-}
-
/// Maximum number of token ids fetched in a single
/// `IdentityTokenBalancesQuery`.
///
@@ -540,7 +528,7 @@ where
// RAII guard: clears `is_syncing` on every exit path, including
// panics. Without this a panic inside the pass would leave
// `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
- let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+ let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
// A `quiesce()` may have raised the gate between our CAS and
// here; if so, bail without running a pass so the drain can
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 905dc32c41..717ad0a03c 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -101,19 +101,28 @@ pub struct PlatformWalletManager {
pub enum CoordinatorThreadStatus {
/// The loop exited and its thread/task joined cleanly.
Ok,
+ /// The thread/task exited for a non-panic reason that is not a clean
+ /// return — e.g. a tokio task was cancelled or aborted. Carries a
+ /// reason string when one is available.
+ Stopped(Option<String>),
/// The thread/task panicked; carries the best-effort panic message.
Panicked(String),
+ /// The join did not complete within [`SHUTDOWN_JOIN_TIMEOUT_SECS`].
+ Timeout,
/// No thread/task was running to join — never started, or already
/// joined by a previous `shutdown()`.
NotRunning,
- /// The join did not complete within the bounded timeout, or the
- /// `spawn_blocking` task itself failed (e.g. runtime torn down
- /// before the join could run — unreachable in normal operation).
+ /// Infrastructural join failure that is neither a timeout nor a
+ /// panic — e.g. the `spawn_blocking` task itself failed because
+ /// the runtime was torn down before the join could run (unreachable
+ /// in normal operation).
Error(String),
}
impl CoordinatorThreadStatus {
- /// `true` for a non-failure outcome (joined cleanly or never ran).
+ /// `true` only for a fully clean outcome: joined normally (`Ok`) or
+ /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, and
+ /// `Error` are all considered non-clean.
pub fn is_clean(&self) -> bool {
matches!(self, Self::Ok | Self::NotRunning)
}
@@ -198,7 +207,7 @@ fn panic_message(payload: Box) -> String {
/// quiesce+join to complete. Under normal operation this deadline is
/// never reached (the RAII `is_syncing` guard ensures the drain exits
/// even on panic). On timeout the coordinator slot reports
-/// [`CoordinatorThreadStatus::Error`]`("join timed out")`.
+/// [`CoordinatorThreadStatus::Timeout`].
const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
impl PlatformWalletManager {
@@ -431,18 +440,17 @@ impl PlatformWalletManager {
///
/// **Precondition: must be called from a multi-thread Tokio runtime.**
/// `quiesce()` uses `spawn_blocking` internally; calling from a
- /// `current_thread` runtime will `debug_assert!`-panic in debug
- /// builds or deadlock in release builds.
+ /// `current_thread` runtime will panic (this is a real invariant
+ /// enforced in both debug and release builds).
///
/// Each coordinator quiesce+join is bounded by
/// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit
/// within that window, its slot reports
- /// [`CoordinatorThreadStatus::Error`]`("join timed out")` rather
- /// than hanging forever. Under normal operation (no infinite loops,
- /// RAII guard clears `is_syncing` even on panic) this timeout is
- /// never reached.
+ /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
+ /// Under normal operation (no infinite loops, RAII guard clears
+ /// `is_syncing` even on panic) this timeout is never reached.
pub async fn shutdown(&self) -> CoordinatorExitStatus {
- debug_assert!(
+ assert!(
matches!(
tokio::runtime::Handle::current().runtime_flavor(),
tokio::runtime::RuntimeFlavor::MultiThread
@@ -456,17 +464,17 @@ impl PlatformWalletManager {
let platform_address_sync =
tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
.await
- .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+ .unwrap_or(CoordinatorThreadStatus::Timeout);
let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
.await
- .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+ .unwrap_or(CoordinatorThreadStatus::Timeout);
#[cfg(feature = "shielded")]
let shielded_sync = {
let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
.await
- .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+ .unwrap_or(CoordinatorThreadStatus::Timeout);
Some(r)
};
#[cfg(not(feature = "shielded"))]
@@ -485,10 +493,12 @@ impl PlatformWalletManager {
if e.is_panic() {
CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
} else {
- CoordinatorThreadStatus::Ok
+ // Non-panic JoinError: task was cancelled or aborted —
+ // not a clean exit, but also not a panic.
+ CoordinatorThreadStatus::Stopped(Some(format!("{e}")))
}
}
- Err(_) => CoordinatorThreadStatus::Error("join timed out".into()),
+ Err(_) => CoordinatorThreadStatus::Timeout,
},
};
@@ -654,6 +664,86 @@ mod tests {
}
}
+ /// A non-panic `JoinError` on the event adapter maps to `Stopped`, not
+ /// `Ok`, and is NOT considered clean. This covers the case where the
+ /// tokio task is cancelled or aborted rather than completing normally.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() {
+ // Build a manager but immediately abort the event adapter task so
+ // we trigger the non-panic JoinError path in shutdown().
+ let manager = make_manager();
+ // Abort the adapter task directly so the join sees a non-panic JoinError.
+ {
+ let mut guard = manager.event_adapter_join.lock().await;
+ if let Some(handle) = guard.take() {
+ handle.abort();
+ // Put it back so shutdown() sees it and exercises the error path.
+ *guard = Some(handle);
+ }
+ }
+ // Give tokio a moment to process the abort.
+ tokio::time::sleep(Duration::from_millis(10)).await;
+
+ let status = manager.shutdown().await;
+ // The adapter task was aborted → non-panic JoinError → Stopped.
+ match &status.event_adapter {
+ CoordinatorThreadStatus::Stopped(_) | CoordinatorThreadStatus::Ok => {
+ // Stopped is the expected path; Ok means it drained before abort — both
+ // are acceptable since abort() races the task completion.
+ }
+ other => panic!("expected Stopped or Ok (abort race), got {other:?}"),
+ }
+ // Regardless, all other workers were never started → clean.
+ assert_eq!(
+ status.platform_address_sync,
+ CoordinatorThreadStatus::NotRunning
+ );
+ }
+
+ /// `Stopped` and `Timeout` are NOT clean; `Ok` and `NotRunning` ARE.
+ /// Unit-tests the `is_clean` predicate directly so we don't need to
+ /// trigger a real timeout (30s) in a deterministic test.
+ #[test]
+ fn coordinator_thread_status_clean_predicate() {
+ assert!(CoordinatorThreadStatus::Ok.is_clean());
+ assert!(CoordinatorThreadStatus::NotRunning.is_clean());
+
+ assert!(!CoordinatorThreadStatus::Stopped(None).is_clean());
+ assert!(!CoordinatorThreadStatus::Stopped(Some("cancelled".into())).is_clean());
+ assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean());
+ assert!(!CoordinatorThreadStatus::Timeout.is_clean());
+ assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean());
+ }
+
+ /// `all_clean()` on `CoordinatorExitStatus` is false whenever any
+ /// slot is non-clean.
+ #[test]
+ fn coordinator_exit_status_all_clean() {
+ let clean = CoordinatorExitStatus {
+ platform_address_sync: CoordinatorThreadStatus::Ok,
+ identity_sync: CoordinatorThreadStatus::NotRunning,
+ shielded_sync: None,
+ event_adapter: CoordinatorThreadStatus::Ok,
+ };
+ assert!(clean.all_clean());
+
+ let with_timeout = CoordinatorExitStatus {
+ platform_address_sync: CoordinatorThreadStatus::Timeout,
+ identity_sync: CoordinatorThreadStatus::Ok,
+ shielded_sync: None,
+ event_adapter: CoordinatorThreadStatus::Ok,
+ };
+ assert!(!with_timeout.all_clean());
+
+ let with_stopped = CoordinatorExitStatus {
+ platform_address_sync: CoordinatorThreadStatus::Ok,
+ identity_sync: CoordinatorThreadStatus::Ok,
+ shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))),
+ event_adapter: CoordinatorThreadStatus::Ok,
+ };
+ assert!(!with_stopped.all_clean());
+ }
+
/// A cleanly-returning thread joins as `Ok`; an absent handle is
/// `NotRunning`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
@@ -705,9 +795,13 @@ mod tests {
// Wait until the slow completion callback is running
// (`is_syncing` stays true for its 300 ms duration).
- while !handler_started.load(AO::Acquire) {
- tokio::time::sleep(Duration::from_millis(5)).await;
- }
+ tokio::time::timeout(Duration::from_secs(5), async {
+ while !handler_started.load(AO::Acquire) {
+ tokio::time::sleep(Duration::from_millis(5)).await;
+ }
+ })
+ .await
+ .expect("handler did not start within 5s");
// Shutdown must drain the in-flight pass before joining.
let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown())
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index f85eb6d05e..ddd58fcb44 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -13,6 +13,8 @@ use std::sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
Arc, Mutex as StdMutex,
};
+
+use dash_async::AtomicFlagGuard;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use arc_swap::ArcSwapOption;
@@ -31,20 +33,6 @@ use crate::wallet::PlatformWallet;
/// Default cadence — matches the 15s BLAST loop we previously ran in Swift.
pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 15;
-/// RAII guard that clears `is_syncing` when dropped.
-///
-/// Created at the start of a sync pass (after the `compare_exchange`
-/// that takes the slot). On any exit — normal return, early return, or
-/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
-/// never spins forever on a panicked pass.
-struct IsSyncingGuard<'a>(&'a AtomicBool);
-
-impl Drop for IsSyncingGuard<'_> {
- fn drop(&mut self) {
- self.0.store(false, Ordering::Release);
- }
-}
-
/// Outcome of syncing a single wallet in a pass.
///
/// Not `Clone` because `AddressSyncResult` isn't. Consumers receive it
@@ -331,7 +319,7 @@ impl PlatformAddressSyncManager {
// RAII guard: clears `is_syncing` on every exit path, including
// panics. Without this a panic inside the pass would leave
// `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
- let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+ let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
// A `quiesce()` may have raised the gate between our CAS and
// here; if so, bail without running a pass so the drain can
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 0b2e7dda68..502d1ae733 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -30,6 +30,8 @@ use std::sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
Arc, Mutex as StdMutex,
};
+
+use dash_async::AtomicFlagGuard;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tokio::sync::RwLock;
@@ -44,20 +46,6 @@ use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary};
/// is conservative compared to the 15s address-sync cadence.
pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
-/// RAII guard that clears `is_syncing` when dropped.
-///
-/// Created at the start of a sync pass (after the `compare_exchange`
-/// that takes the slot). On any exit — normal return, early return, or
-/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
-/// never spins forever on a panicked pass.
-struct IsSyncingGuard<'a>(&'a AtomicBool);
-
-impl Drop for IsSyncingGuard<'_> {
- fn drop(&mut self) {
- self.0.store(false, Ordering::Release);
- }
-}
-
/// Outcome of syncing a single wallet in a shielded sync pass.
///
/// Not `Clone` because `ShieldedSyncSummary` carries the underlying
@@ -381,7 +369,7 @@ impl ShieldedSyncManager {
// RAII guard: clears `is_syncing` on every exit path, including
// panics. Without this a panic inside the pass would leave
// `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
- let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+ let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
// A `quiesce()` may have raised the gate between our CAS and
// here; bail so the drain can complete and Clear/stop get a
@@ -475,7 +463,7 @@ impl ShieldedSyncManager {
}
// RAII guard clears `is_syncing` on every exit path including panics.
- let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+ let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
// Bail if a `quiesce()` raised the gate after our CAS (see
// `sync_now`) so the drain barrier holds.
From 5f80450ce16129ea77422b6699c67e6353c87738 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:46:52 +0200
Subject: [PATCH 05/29] test(rs-dash-async): assert AtomicFlagGuard contract +
add #[must_use]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
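RUST-001: tag `AtomicFlagGuard` `#[must_use]` so a bare-statement
construction (which would drop the guard *immediately* and clear the flag
right back) trips the `unused_must_use` lint instead of silently un-gating
the very flag it was meant to hold. Note that a stray `let _ = ..` drops the
guard just as immediately but is NOT flagged by `#[must_use]`; always bind
the guard to a named variable such as `_guard`.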
PROJ-001: lock the guard's contract down with two tests — flag cleared on a
normal drop, and (the load-bearing one) flag cleared while unwinding a
panic via `catch_unwind`. Makes the PR-body "dash-async tests" claim true.
SEC-003: spell out in the rustdoc that the clear-on-panic guarantee rides
on unwinding, so it holds under `panic = "unwind"` but not under the iOS
`panic = "abort"` profiles, where a panic aborts before any Drop runs.
Co-Authored-By: Claude Opus 4.6
---
packages/rs-dash-async/src/atomic.rs | 42 ++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs
index eb79bb4ed1..ecdab75acb 100644
--- a/packages/rs-dash-async/src/atomic.rs
+++ b/packages/rs-dash-async/src/atomic.rs
@@ -5,6 +5,13 @@ use std::sync::atomic::{AtomicBool, Ordering};
/// Callers set the flag to `true` before constructing the guard (typically
/// via a `compare_exchange`); the guard resets it on every exit path,
/// including panics, so a panicked holder can never leave the flag wedged.
+///
+/// **Panic-strategy caveat:** the clear-on-panic guarantee relies on
+/// destructors running while the stack unwinds, so it holds under
+/// `panic = "unwind"` (the default). Under `panic = "abort"` — e.g. the
+/// iOS release profiles — a panic aborts the process immediately and no
+/// `Drop` runs; there is simply no "after" left for the flag to gate.
+#[must_use = "AtomicFlagGuard clears the flag on drop; binding to `_` or using as a statement drops it immediately"]
pub struct AtomicFlagGuard<'a>(&'a AtomicBool);
impl<'a> AtomicFlagGuard<'a> {
@@ -20,3 +27,38 @@ impl Drop for AtomicFlagGuard<'_> {
self.0.store(false, Ordering::Release);
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use std::panic::{catch_unwind, AssertUnwindSafe};
+
+ /// A guard constructed over a `true` flag holds it while in scope and
+ /// clears it to `false` on a normal scope exit.
+ #[test]
+ fn clears_flag_on_normal_drop() {
+ let flag = AtomicBool::new(true);
+ {
+ let _guard = AtomicFlagGuard::new(&flag);
+ assert!(flag.load(Ordering::Acquire), "flag stays set while held");
+ }
+ assert!(!flag.load(Ordering::Acquire), "flag cleared on drop");
+ }
+
+ /// The clear also runs while unwinding a panic — the load-bearing
+ /// property the sync coordinators lean on so a panicked pass can't
+ /// leave `is_syncing` latched and wedge `quiesce()`'s drain.
+ #[test]
+ fn clears_flag_while_unwinding_panic() {
+ let flag = AtomicBool::new(true);
+ let result = catch_unwind(AssertUnwindSafe(|| {
+ let _guard = AtomicFlagGuard::new(&flag);
+ panic!("boom while holding the guard");
+ }));
+ assert!(result.is_err(), "the panic propagated out of catch_unwind");
+ assert!(
+ !flag.load(Ordering::Acquire),
+ "Drop ran during unwinding and cleared the flag"
+ );
+ }
+}
From 6b2cd39e06ac565a22ff8609da61b2afd14b712b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:51:18 +0200
Subject: [PATCH 06/29] fix(platform-wallet): make coordinator passes
cancellable + converge invariants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SEC-001 (the big one): a `shutdown()` quiesce could time out solely because
a stalled in-flight pass pinned `is_syncing`: the `while is_syncing` drain
never cleared, the quiesce future was dropped *before* the thread join, and
the `!Send` coordinator OS thread was left ALIVE — later firing host
callbacks through freed memory. Root-cause fix: race the pass body against
cancellation inside each coordinator's own loop:

    tokio::select! {
        biased;
        _ = cancel.cancelled() => break,
        _ = this.sync_now(..) => {}
    }

so `stop()`/`quiesce()` cancelling the token drops the stalled `sync_now`
future *on the coordinator thread*, which unwinds to its `is_syncing`
`AtomicFlagGuard` and clears the flag promptly. The drain then frees and the
join lands far inside the timeout — the timeout can no longer strand a live
thread. Invariants preserved: the guard is constructed before any `.await`
so a cancel-drop always clears `is_syncing`; the completion-event dispatch
is the synchronous tail after the last `.await`, so it either runs in full
(then clears) or is skipped on cancel — never torn; idempotency and the
drain barrier are untouched. The inter-pass sleep was already cancel-raced.
MEDIUM-4 (RUST-002): RAII-guard `quiescing` in all three `quiesce()` via
`AtomicFlagGuard`, dropping the manual `store(false)`. A timed-out quiesce
no longer latches the gate `true` and silently bails every future pass.
Reopening on drop is safe because `stop()` already cancelled the loop.
MEDIUM-3 (SEC-005/CALL-001): give `PlatformAddressSyncManager` the
`background_generation` counter its siblings already have — bump it (AcqRel)
in `start()` and gate the thread-exit `*background_cancel = None` on
`generation == my_gen`, so a stop()+start() reschedule can't have an exiting
thread strip the new generation's token.
SEC-003: swap the `background_cancel`/`background_join` std-Mutex
`.lock().expect("… poisoned")` calls for `.lock().unwrap_or_else(|e|
e.into_inner())` across all three coordinators, so one prior panic can't
cascade into an abort on the teardown path.
Co-Authored-By: Claude Opus 4.6
---
.../src/manager/identity_sync.rs | 39 ++++++++++--
.../src/manager/platform_address_sync.rs | 61 ++++++++++++++++---
.../src/manager/shielded_sync.rs | 39 ++++++++++--
3 files changed, 120 insertions(+), 19 deletions(-)
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 34bf0fefc7..ae2143a574 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -403,7 +403,10 @@ where
/// The first pass runs immediately; subsequent passes fire every
/// [`interval`](Self::interval).
pub fn start(self: Arc) {
- let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+ let mut cancel_guard = self
+ .background_cancel
+ .lock()
+ .unwrap_or_else(|e| e.into_inner());
if cancel_guard.is_some() {
return;
}
@@ -422,7 +425,22 @@ where
break;
}
- this.sync_now().await;
+ // Race the in-flight pass against cancellation.
+ // `stop()` / `quiesce()` cancel the token; with
+ // `biased` the cancel arm is polled first, so a
+ // pass stalled on a hung SDK fetch is dropped at
+ // its `.await` the instant we cancel. Dropping the
+ // `sync_now` future unwinds to the `is_syncing`
+ // `AtomicFlagGuard` it holds, clearing the flag
+ // promptly — so `quiesce()`'s drain loop frees and
+ // the join lands well inside `shutdown()`'s
+ // timeout. A stalled pass can no longer strand a
+ // live `!Send` thread past `shutdown()`.
+ tokio::select! {
+ biased;
+ _ = cancel.cancelled() => break,
+ _ = this.sync_now() => {}
+ }
let interval = this.interval();
tokio::select! {
@@ -444,7 +462,10 @@ where
// Store the join handle while still holding cancel_guard — a
// concurrent quiesce() must wait for this lock before calling
// stop(), so the handle is always stored before it can be taken.
- *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+ *self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner()) = Some(join);
// cancel_guard drops here, releasing background_cancel.
}
@@ -460,7 +481,7 @@ where
if let Some(token) = self
.background_cancel
.lock()
- .expect("bg_cancel poisoned")
+ .unwrap_or_else(|e| e.into_inner())
.take()
{
token.cancel();
@@ -493,15 +514,21 @@ where
/// one-shot host drops the runtime.
pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
self.quiescing.store(true, Ordering::Release);
+ // RAII gate: resets `quiescing` on *every* exit path — a normal
+ // return, a timed-out `shutdown()` dropping this future, or a
+ // panic. Without it a quiesce that doesn't run to completion
+ // leaves the gate latched `true`, silently bailing every future
+ // pass. Reopening on drop is safe because `stop()` (below) has
+ // already cancelled the loop, so no new pass can start.
+ let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
self.stop();
while self.is_syncing.load(Ordering::Acquire) {
tokio::time::sleep(Duration::from_millis(20)).await;
}
- self.quiescing.store(false, Ordering::Release);
let handle = self
.background_join
.lock()
- .expect("bg_join poisoned")
+ .unwrap_or_else(|e| e.into_inner())
.take();
super::join_coordinator_thread(handle).await
}
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index ddd58fcb44..28987bd9c5 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -104,6 +104,14 @@ pub struct PlatformAddressSyncManager {
/// confirm the `!Send` loop fully exited before the host drops the
/// runtime.
background_join: StdMutex>>,
+ /// Monotonically increasing generation counter. Bumped on every
+ /// `start()` so the exiting thread can tell whether its generation is
+ /// still the active one before clearing `background_cancel`. Without
+ /// this guard a tight `stop()` → `start()` reschedule lets the prior
+ /// thread's cleanup strip the *new* generation's token, leaving the
+ /// new loop running but untrackable via `is_running()` / `stop()`.
+ /// Mirrors the identity / shielded coordinators.
+ background_generation: AtomicU64,
interval_secs: AtomicU64,
is_syncing: AtomicBool,
/// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -133,6 +141,7 @@ impl PlatformAddressSyncManager {
event_manager,
background_cancel: StdMutex::new(None),
background_join: StdMutex::new(None),
+ background_generation: AtomicU64::new(0),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
quiescing: AtomicBool::new(false),
@@ -203,12 +212,19 @@ impl PlatformAddressSyncManager {
/// The first pass runs immediately; subsequent passes fire every
/// [`interval`](Self::interval).
pub fn start(self: Arc) {
- let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+ let mut cancel_guard = self
+ .background_cancel
+ .lock()
+ .unwrap_or_else(|e| e.into_inner());
if cancel_guard.is_some() {
return;
}
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
+ // Bump the generation while we still hold the slot lock so any
+ // prior thread's cleanup observes `current_gen != my_gen` ordered
+ // against this token swap.
+ let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
let handle = tokio::runtime::Handle::current();
let this = Arc::clone(&self);
@@ -221,7 +237,22 @@ impl PlatformAddressSyncManager {
break;
}
- this.sync_now().await;
+ // Race the in-flight pass against cancellation.
+ // `stop()` / `quiesce()` cancel the token; with
+ // `biased` the cancel arm is polled first, so a
+ // pass stalled on a hung SDK fetch is dropped at
+ // its `.await` the instant we cancel. Dropping the
+ // `sync_now` future unwinds to the `is_syncing`
+ // `AtomicFlagGuard` it holds, clearing the flag
+ // promptly — so `quiesce()`'s drain loop frees and
+ // the join lands well inside `shutdown()`'s
+ // timeout. A stalled pass can no longer strand a
+ // live `!Send` thread past `shutdown()`.
+ tokio::select! {
+ biased;
+ _ = cancel.cancelled() => break,
+ _ = this.sync_now() => {}
+ }
let interval = this.interval();
tokio::select! {
@@ -230,8 +261,15 @@ impl PlatformAddressSyncManager {
}
}
+ // Only clear the slot if no newer start() has
+ // installed a replacement token since we launched —
+ // mirrors the identity / shielded coordinators so a
+ // stop() → start() reschedule can't have this exiting
+ // thread strip the new generation's cancel token.
if let Ok(mut guard) = this.background_cancel.lock() {
- *guard = None;
+ if this.background_generation.load(Ordering::Acquire) == my_gen {
+ *guard = None;
+ }
}
});
})
@@ -239,7 +277,10 @@ impl PlatformAddressSyncManager {
// Store the join handle while still holding cancel_guard — a
// concurrent quiesce() must wait for this lock before calling
// stop(), so the handle is always stored before it can be taken.
- *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+ *self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner()) = Some(join);
// cancel_guard drops here, releasing background_cancel.
}
@@ -256,7 +297,7 @@ impl PlatformAddressSyncManager {
if let Some(token) = self
.background_cancel
.lock()
- .expect("bg_cancel poisoned")
+ .unwrap_or_else(|e| e.into_inner())
.take()
{
token.cancel();
@@ -290,15 +331,21 @@ impl PlatformAddressSyncManager {
/// one-shot host drops the runtime.
pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
self.quiescing.store(true, Ordering::Release);
+ // RAII gate: resets `quiescing` on *every* exit path — a normal
+ // return, a timed-out `shutdown()` dropping this future, or a
+ // panic. Without it a quiesce that doesn't run to completion
+ // leaves the gate latched `true`, silently bailing every future
+ // pass. Reopening on drop is safe because `stop()` (below) has
+ // already cancelled the loop, so no new pass can start.
+ let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
self.stop();
while self.is_syncing.load(Ordering::Acquire) {
tokio::time::sleep(Duration::from_millis(20)).await;
}
- self.quiescing.store(false, Ordering::Release);
let handle = self
.background_join
.lock()
- .expect("bg_join poisoned")
+ .unwrap_or_else(|e| e.into_inner())
.take();
super::join_coordinator_thread(handle).await
}
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 502d1ae733..accaca69d0 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -230,7 +230,10 @@ impl ShieldedSyncManager {
/// GRPC client state isn't `Send + Sync`). Same trade-off as
/// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start).
pub fn start(self: Arc) {
- let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+ let mut cancel_guard = self
+ .background_cancel
+ .lock()
+ .unwrap_or_else(|e| e.into_inner());
if cancel_guard.is_some() {
return;
}
@@ -259,7 +262,22 @@ impl ShieldedSyncManager {
// chunk every interval. User-initiated
// syncs pass `force=true` to the FFI
// entry point below and bypass this.
- this.sync_now(false).await;
+ //
+ // Race the pass against cancellation. `stop()` /
+ // `quiesce()` cancel the token; with `biased` the
+ // cancel arm is polled first, so a pass stalled on
+ // a hung SDK fetch is dropped at its `.await` the
+ // instant we cancel. Dropping the `sync_now` future
+ // unwinds to the `is_syncing` `AtomicFlagGuard` it
+ // holds, clearing the flag promptly — so the drain
+ // loop in `quiesce()` frees and the join lands well
+ // inside `shutdown()`'s timeout. A stalled pass can
+ // no longer strand a live `!Send` thread.
+ tokio::select! {
+ biased;
+ _ = cancel.cancelled() => break,
+ _ = this.sync_now(false) => {}
+ }
let interval = this.interval();
tokio::select! {
@@ -285,7 +303,10 @@ impl ShieldedSyncManager {
// Store the join handle while still holding cancel_guard — a
// concurrent quiesce() must wait for this lock before calling
// stop(), so the handle is always stored before it can be taken.
- *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+ *self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner()) = Some(join);
// cancel_guard drops here, releasing background_cancel.
}
@@ -301,7 +322,7 @@ impl ShieldedSyncManager {
if let Some(token) = self
.background_cancel
.lock()
- .expect("bg_cancel poisoned")
+ .unwrap_or_else(|e| e.into_inner())
.take()
{
token.cancel();
@@ -333,15 +354,21 @@ impl ShieldedSyncManager {
/// one-shot host drops the runtime.
pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
self.quiescing.store(true, Ordering::Release);
+ // RAII gate: resets `quiescing` on *every* exit path — a normal
+ // return, a timed-out `shutdown()` / Clear dropping this future,
+ // or a panic. Without it a quiesce that doesn't run to completion
+ // leaves the gate latched `true`, silently bailing every future
+ // pass. Reopening on drop is safe because `stop()` (below) has
+ // already cancelled the loop, so no new pass can start.
+ let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
self.stop();
while self.is_syncing.load(Ordering::Acquire) {
tokio::time::sleep(Duration::from_millis(20)).await;
}
- self.quiescing.store(false, Ordering::Release);
let handle = self
.background_join
.lock()
- .expect("bg_join poisoned")
+ .unwrap_or_else(|e| e.into_inner())
.take();
super::join_coordinator_thread(handle).await
}
From 13a22dd7ca65a885d1eb1d0fa38acd5b91684920 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:54:38 +0200
Subject: [PATCH 07/29] fix(platform-wallet): bound clear_shielded + tidy
shutdown docs/logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SEC-002: `clear_shielded()` now wraps its `quiesce()` in the same
`SHUTDOWN_JOIN_TIMEOUT_SECS` backstop `shutdown()` uses, so a stalled
in-flight pass can't hang Clear forever. The const is now `pub` (and
re-exported from the crate root) so the FFI shielded-stop bridge can reuse
it; its doc + the `shutdown()` doc now describe it as a backstop and note
that cancellation is what makes the drain prompt.
SEC-004: bind the event-adapter join handle to a local before the join
`.await`, so the `tokio::Mutex` guard (previously a match-scrutinee
temporary) isn't held across the up-to-30s join.
PROJ-004: drop the lone `tracing::warn!` for the adapter join error inside
`shutdown()` — the returned status already carries it and the FFI `destroy`
adapter logs the aggregate once, so all four workers are now uniform.
RUST-004: rewrite the `shutdown()` `assert!` message (and the matching
docs) to name the real constraint — the coordinator OS threads each run
`Handle::block_on` and need the multi-thread runtime's timer/IO driver —
instead of blaming `spawn_blocking`, which works fine on current_thread.
PROJ-006: fix the `all_clean()` rustdoc (Stopped/Timeout/Error also make it
false, not just panics). PROJ-003: drop the dangling ephemeral `(F-6)` and
`F-2`/`F-3`/`F-7` + `(1)/(2)/(4)/(5)/(6)` markers, replacing with
self-describing prose. SEC-003: note the unwind-vs-abort caveat on the
`shutdown()` panic-safety guarantee.
Co-Authored-By: Claude Opus 4.6
---
packages/rs-platform-wallet/src/lib.rs | 2 +-
.../rs-platform-wallet/src/manager/mod.rs | 116 ++++++++++++------
2 files changed, 80 insertions(+), 38 deletions(-)
diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs
index 289a71378f..dd12883fc7 100644
--- a/packages/rs-platform-wallet/src/lib.rs
+++ b/packages/rs-platform-wallet/src/lib.rs
@@ -44,7 +44,7 @@ pub use manager::platform_address_sync::{
PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome,
DEFAULT_SYNC_INTERVAL_SECS,
};
-pub use manager::PlatformWalletManager;
+pub use manager::{PlatformWalletManager, SHUTDOWN_JOIN_TIMEOUT_SECS};
pub use spv::SpvRuntime;
pub use wallet::asset_lock::manager::AssetLockManager;
pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock};
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 717ad0a03c..6fa26902f8 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -151,9 +151,11 @@ pub struct CoordinatorExitStatus {
}
impl CoordinatorExitStatus {
- /// `true` when every worker wound down without a panic (each is
+ /// `true` only when every worker is
/// [`Ok`](CoordinatorThreadStatus::Ok) or
- /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)).
+ /// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any
+ /// `Stopped`, `Panicked`, `Timeout`, or `Error` slot makes it
+ /// `false`.
pub fn all_clean(&self) -> bool {
self.platform_address_sync.is_clean()
&& self.identity_sync.is_clean()
@@ -173,8 +175,12 @@ impl CoordinatorExitStatus {
/// still alive guarantees the `!Send` loop has stopped touching
/// `tokio::time` before the host drops the runtime.
///
-/// **Requires a multi-thread runtime** — `spawn_blocking` is not
-/// available on `current_thread` runtimes and will panic there.
+/// **Requires a multi-thread runtime.** Each coordinator's OS thread
+/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on)
+/// and needs the runtime's timer/IO driver; a `current_thread` runtime
+/// can only service one `block_on` at a time, so joining one coordinator
+/// while the others (and `shutdown()` itself) are mid-`block_on` would
+/// deadlock. `shutdown()` asserts the multi-thread flavor up front.
pub(crate) async fn join_coordinator_thread(
handle: Option>,
) -> CoordinatorThreadStatus {
@@ -186,7 +192,7 @@ pub(crate) async fn join_coordinator_thread(
Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
// spawn_blocking fails only when the runtime shuts down before
// the blocking task can run — unreachable in normal operation
- // since shutdown() is called while the runtime is alive (F-6).
+ // since shutdown() is called while the runtime is alive.
Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")),
}
}
@@ -203,12 +209,18 @@ fn panic_message(payload: Box) -> String {
}
}
-/// Maximum time (seconds) `shutdown()` waits for one coordinator's
-/// quiesce+join to complete. Under normal operation this deadline is
-/// never reached (the RAII `is_syncing` guard ensures the drain exits
-/// even on panic). On timeout the coordinator slot reports
-/// [`CoordinatorThreadStatus::Timeout`].
-const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
+/// Maximum time (seconds) the teardown paths — `shutdown()`,
+/// `clear_shielded`, and the FFI shielded-stop bridge — wait for one
+/// coordinator's quiesce+join to complete.
+///
+/// This is a backstop, not the primary stop mechanism. `quiesce()`
+/// cancels the loop, which aborts any in-flight pass at its `.await`
+/// point (see each coordinator's `start()` select), so the `is_syncing`
+/// drain clears promptly and the join normally lands far inside this
+/// window. The deadline fires only if a pass's *drop* itself wedges
+/// (e.g. a blocking destructor); on timeout the coordinator slot reports
+/// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
+pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
impl PlatformWalletManager {
/// Create a new PlatformWalletManager.
@@ -403,7 +415,17 @@ impl PlatformWalletManager {
/// must not commit its own persistence wipe in that case.
#[cfg(feature = "shielded")]
pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
- self.shielded_sync_manager.quiesce().await;
+ // Bound the quiesce with the same backstop `shutdown()` uses so a
+ // stalled in-flight pass can't hang Clear forever — cancellation
+ // makes the drain prompt; this timeout only matters if a pass's
+ // drop wedges. The terminal status isn't surfaced on the Clear
+ // path (the coordinator reset below is what can fail), so the
+ // timeout result is intentionally discarded.
+ let _ = tokio::time::timeout(
+ std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
+ self.shielded_sync_manager.quiesce(),
+ )
+ .await;
if let Some(coord) = self.shielded_coordinator().await {
coord.clear().await?;
}
@@ -439,23 +461,35 @@ impl PlatformWalletManager {
/// [`CoordinatorExitStatus`] reports per-thread how each worker ended.
///
/// **Precondition: must be called from a multi-thread Tokio runtime.**
- /// `quiesce()` uses `spawn_blocking` internally; calling from a
- /// `current_thread` runtime will panic (this is a real invariant
- /// enforced in both debug and release builds).
+ /// Each coordinator's OS thread drives its loop via
+ /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs
+ /// the runtime's timer/IO driver; a `current_thread` runtime can only
+ /// service one `block_on` at a time, so the join would deadlock. This
+ /// is asserted in both debug and release builds.
///
/// Each coordinator quiesce+join is bounded by
- /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit
- /// within that window, its slot reports
+ /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] as a backstop. `quiesce()` cancels
+ /// the loop, which aborts any in-flight pass at its `.await` point, so
+ /// the `is_syncing` drain clears promptly and the join normally lands
+ /// far inside the window — the deadline fires only if a pass's *drop*
+ /// itself wedges. On timeout the coordinator slot reports
/// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
- /// Under normal operation (no infinite loops, RAII guard clears
- /// `is_syncing` even on panic) this timeout is never reached.
+ ///
+ /// The clear-on-panic half of that guarantee rides on unwinding, so
+ /// it holds under `panic = "unwind"`. Under the iOS `panic = "abort"`
+ /// release profiles a pass panic aborts the process outright (no
+ /// `Drop`, no status) — there is no live manager left to read a
+ /// status from.
pub async fn shutdown(&self) -> CoordinatorExitStatus {
assert!(
matches!(
tokio::runtime::Handle::current().runtime_flavor(),
tokio::runtime::RuntimeFlavor::MultiThread
),
- "shutdown() requires a multi-thread Tokio runtime (spawn_blocking inside quiesce)"
+ "shutdown() requires a multi-thread Tokio runtime: each \
+ coordinator's OS thread drives its sync loop via \
+ Handle::block_on and needs the runtime's timer/IO driver, but \
+ a current_thread runtime can only drive one block_on at a time"
);
let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
@@ -484,12 +518,19 @@ impl PlatformWalletManager {
// stores), so cancel + join it last — after the loops feeding it
// are gone.
self.event_adapter_cancel.cancel();
- let event_adapter = match self.event_adapter_join.lock().await.take() {
+ // Take the handle out into a local first so the `tokio::Mutex`
+ // guard doesn't stay held across the (up-to-30s) join `.await`
+ // below — a match scrutinee temporary would otherwise keep the
+ // guard alive for the whole match.
+ let event_adapter_handle = self.event_adapter_join.lock().await.take();
+ let event_adapter = match event_adapter_handle {
None => CoordinatorThreadStatus::NotRunning,
Some(handle) => match tokio::time::timeout(timeout, handle).await {
Ok(Ok(())) => CoordinatorThreadStatus::Ok,
+ // The returned status already carries this failure, and the
+ // FFI `destroy` adapter logs the aggregate once at the host
+ // layer — so don't double-log here.
Ok(Err(e)) => {
- tracing::warn!(error = ?e, "Wallet event adapter task join error");
if e.is_panic() {
CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
} else {
@@ -560,7 +601,8 @@ mod tests {
}
/// Build a manager that fires a slow (300 ms std::thread::sleep) callback
- /// on `on_platform_address_sync_completed`. Used by F-2 drain test.
+ /// on `on_platform_address_sync_completed`. Used by the in-flight-pass
+ /// drain test.
fn make_manager_with_slow_handler(
started: Arc,
completed: Arc,
@@ -592,10 +634,10 @@ mod tests {
Arc::clone(&m.shielded_sync_manager).start();
}
- /// (1)+(5)+(6) Happy path: `shutdown()` joins every started worker
- /// and reports `Ok`; it completes within a bounded time (no
- /// `spawn_blocking` starvation/deadlock); a second `shutdown()` finds
- /// nothing left to join (`NotRunning`) — idempotent.
+ /// Happy path: `shutdown()` joins every started worker and reports
+ /// `Ok`; it completes within a bounded time (no `spawn_blocking`
+ /// starvation/deadlock); a second `shutdown()` finds nothing left to
+ /// join (`NotRunning`) — idempotent.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() {
let manager = make_manager();
@@ -627,7 +669,7 @@ mod tests {
assert!(again.all_clean());
}
- /// (2) Never-started coordinators report `NotRunning` (no thread to
+ /// Never-started coordinators report `NotRunning` (no thread to
/// join). The event adapter is spawned in `new`, so it still joins
/// `Ok`.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
@@ -651,7 +693,7 @@ mod tests {
assert!(status.all_clean());
}
- /// (4) A coordinator thread that panics surfaces as `Panicked` rather
+ /// A coordinator thread that panics surfaces as `Panicked` rather
/// than being silently dropped.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn join_coordinator_thread_surfaces_panic() {
@@ -759,9 +801,9 @@ mod tests {
);
}
- /// F-7: `join_coordinator_thread` uses `spawn_blocking` internally.
- /// Verify it completes without deadlock within a bounded time when
- /// called from a multi-thread runtime, as `shutdown()` requires.
+ /// `join_coordinator_thread` uses `spawn_blocking` internally. Verify
+ /// it completes without deadlock within a bounded time when called
+ /// from a multi-thread runtime, as `shutdown()` requires.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() {
let handle = std::thread::spawn(|| {});
@@ -774,8 +816,8 @@ mod tests {
assert_eq!(result, CoordinatorThreadStatus::Ok);
}
- /// F-2: `shutdown()` must wait for an in-flight sync pass to drain
- /// before joining the coordinator thread.
+ /// `shutdown()` must wait for an in-flight sync pass to drain before
+ /// joining the coordinator thread.
///
/// A slow `on_platform_address_sync_completed` callback (300 ms)
/// keeps `is_syncing=true` while it runs. We call `shutdown()` while
@@ -819,9 +861,9 @@ mod tests {
);
}
- /// F-3 (strengthened): race regression — start coordinators with a
- /// long sleep interval so they spend nearly all their time in a live
- /// `tokio::time::sleep`, then `shutdown()` and drop the runtime.
+ /// Race regression — start coordinators with a long sleep interval so
+ /// they spend nearly all their time in a live `tokio::time::sleep`,
+ /// then `shutdown()` and drop the runtime.
///
/// With the thread join in `shutdown()` every coordinator has fully
/// exited its `block_on` before `drop(runtime)` — no race possible.
From 93b89546ed7fac0964d7ae6e7dd3fa12b931b944 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:56:01 +0200
Subject: [PATCH 08/29] fix(platform-wallet-ffi): timeout-bound the shielded
sync stop bridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SEC-002: `platform_wallet_manager_shielded_sync_stop` blocked on a bare
`quiesce()`, so a stalled in-flight pass could hang the host's stop call
forever. Wrap the quiesce in `tokio::time::timeout` reusing the library's
`SHUTDOWN_JOIN_TIMEOUT_SECS` backstop — same guarantee as `shutdown()`.
Cancellation makes the drain prompt; the timeout only matters if a pass's
drop wedges. The C signature is unchanged and the result is still discarded
(`ok` as before) — we only need the call not to hang.
Add `tokio/time` to the crate's direct features rather than leaning on
`platform-wallet` pulling it in transitively (the crate now calls
`tokio::time::timeout` directly).
Co-Authored-By: Claude Opus 4.6
---
packages/rs-platform-wallet-ffi/Cargo.toml | 2 +-
.../rs-platform-wallet-ffi/src/shielded_sync.rs | 14 +++++++++++++-
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/packages/rs-platform-wallet-ffi/Cargo.toml b/packages/rs-platform-wallet-ffi/Cargo.toml
index 8a2bd4ef2b..7e60b05d69 100644
--- a/packages/rs-platform-wallet-ffi/Cargo.toml
+++ b/packages/rs-platform-wallet-ffi/Cargo.toml
@@ -22,7 +22,7 @@ rs-sdk-ffi = { path = "../rs-sdk-ffi" }
once_cell = "1.19"
parking_lot = { version = "0.12", features = ["send_guard"] }
lazy_static = "1.4"
-tokio = { version = "1", features = ["rt-multi-thread"] }
+tokio = { version = "1", features = ["rt-multi-thread", "time"] }
tokio-metrics = { workspace = true, optional = true }
# Core dependencies (for Network type)
diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
index 2d58d8165f..da285e422e 100644
--- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
+++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
@@ -88,7 +88,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
handle: Handle,
) -> PlatformWalletFFIResult {
let option = PLATFORM_WALLET_MANAGER_STORAGE.with_item(handle, |manager| {
- runtime().block_on(manager.shielded_sync().quiesce());
+ runtime().block_on(async {
+ // Bound the quiesce with the same backstop `shutdown()` uses so
+ // a stalled in-flight pass can't hang the host's stop call
+ // forever. Cancellation makes the drain prompt; this only
+ // matters if a pass's drop wedges. The terminal status is
+ // discarded — the C ABI exposes none of it, we only need the
+ // drain not to wedge.
+ let _ = tokio::time::timeout(
+ Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS),
+ manager.shielded_sync().quiesce(),
+ )
+ .await;
+ });
});
unwrap_option_or_return!(option);
PlatformWalletFFIResult::ok()
From 2bd9501a0edde17c2c1bc3c6d8f6844eca46a973 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:13:38 +0200
Subject: [PATCH 09/29] fix(platform-wallet)!: close residual
coordinator-thread UAF on shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Replace the spawn_blocking-based join in join_coordinator_thread with an
is_finished() poll loop that awaits a 5ms sleep each step. spawn_blocking
tasks cannot be cancelled once started, so the prior approach left the
blocking join alive past the tokio::time::timeout wrapping quiesce() —
defeating the timeout boundary. Polling yields at each .await so the
external timeout is truly binding (threads are confirmed-exited or the
caller times out).
Each coordinator's start() now drains any handle left by a prior stop()
(is_finished spin-wait, 1s bound) before overwriting background_join, so a
stop()->start() reschedule no longer silently detaches a live, untracked
thread that shutdown() would miss. If the prior thread outlives the 1s
bound, start() logs a warning and detaches it rather than blocking.
FFI platform_wallet_manager_destroy now returns the new
ErrorShutdownIncomplete (19) when shutdown is not all-clean, signalling the
host must not immediately free the callback context — a lingering
coordinator may still fire one final callback. The C ABI is unchanged
(additive enum variant + degraded-path return code).
Tests: deterministic Stopped path via spawn(pending).abort() -> asserts
Stopped(_) and !is_clean(); race test uses per-iteration catch_unwind
instead of a process-global panic hook.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
packages/rs-platform-wallet-ffi/src/error.rs | 9 ++
.../rs-platform-wallet-ffi/src/manager.rs | 13 +-
.../src/manager/identity_sync.rs | 32 ++++
.../rs-platform-wallet/src/manager/mod.rs | 142 ++++++++++++------
.../src/manager/platform_address_sync.rs | 32 ++++
.../src/manager/shielded_sync.rs | 32 ++++
6 files changed, 211 insertions(+), 49 deletions(-)
diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs
index de1a6cb944..b50b5d79c5 100644
--- a/packages/rs-platform-wallet-ffi/src/error.rs
+++ b/packages/rs-platform-wallet-ffi/src/error.rs
@@ -125,6 +125,15 @@ pub enum PlatformWalletFFIResultCode {
/// and could double-send if the original spend landed.
ErrorShieldedSpendUnconfirmed = 18,
+ /// One or more background workers did not wind down cleanly during
+ /// `destroy` (panicked, stopped, errored, or missed the 30 s join
+ /// deadline). The host **must not** free the callback context
+ /// immediately — a worker that timed out may still hold a reference to
+ /// it and fire one final callback. Either keep the context alive for a
+ /// further grace period, or accept the potential (but statistically tiny)
+ /// race. This is distinct from a normal operation error; the manager IS
+ /// torn down; the host should not retry `destroy`.
+
NotFound = 98, // Used exclusively for all the Option that are retuned as errors
ErrorUnknown = 99,
}
diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs
index d09d98a1e8..986103ab47 100644
--- a/packages/rs-platform-wallet-ffi/src/manager.rs
+++ b/packages/rs-platform-wallet-ffi/src/manager.rs
@@ -367,7 +367,18 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy(
if !status.all_clean() {
tracing::warn!(
?status,
- "platform wallet coordinator(s) did not exit cleanly"
+ "platform wallet coordinator(s) did not exit cleanly; \
+ host must not free the callback context immediately"
+ );
+ // Return a distinct non-ok code so the host can delay freeing
+ // its callback context. A lingering coordinator thread (e.g. one
+ // that timed out) still holds an Arc to the event handler and may
+ // fire one final callback through the host-owned context pointer;
+ // returning ok() here would signal that the context is safe to
+ // free when it may not be yet.
+ return PlatformWalletFFIResult::err(
+ PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+ format!("coordinator(s) did not exit cleanly: {status:?}"),
);
} else {
tracing::debug!(?status, "platform wallet coordinators joined cleanly");
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index ae2143a574..6e87261e0a 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -410,6 +410,38 @@ where
if cancel_guard.is_some() {
return;
}
+
+ // Drain any handle left by a prior stop() call. stop() takes-and-cancels
+ // the token but never touches background_join, so a stop()→start()
+ // sequence would otherwise overwrite (detach) the old handle —
+ // shutdown() would then miss that thread and join() only the new one.
+ // The old thread was already cancellation-signalled, so is_finished()
+ // normally becomes true within milliseconds; we spin-wait (bounded to
+ // 1 s, then detach) so no untracked thread normally outlives destroy().
+ {
+ let prior = self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .take();
+ if let Some(h) = prior {
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+ while !h.is_finished() {
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ "identity-sync prior thread did not finish within 1 s \
+ after cancellation; detaching to unblock start()"
+ );
+ break; // Drop h — detaches; thread was already cancelled.
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+ if h.is_finished() {
+ let _ = h.join(); // Reap resources; near-instant since finished.
+ }
+ }
+ }
+
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 6fa26902f8..a9569dd00e 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -168,13 +168,20 @@ impl CoordinatorExitStatus {
///
/// Called from each coordinator's `quiesce()` after cancelling the
/// loop and draining any in-flight pass, so the thread is already on
-/// its way out and the join is near-instant. The blocking
-/// [`JoinHandle::join`](std::thread::JoinHandle::join) runs on the
-/// blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
-/// so the async executor stays unblocked. Joining while the runtime is
-/// still alive guarantees the `!Send` loop has stopped touching
+/// its way out and the join is near-instant. Joining while the runtime
+/// is still alive guarantees the `!Send` loop has stopped touching
/// `tokio::time` before the host drops the runtime.
///
+/// **Polling approach**: we poll [`JoinHandle::is_finished`] in 5 ms
+/// steps rather than wrapping `handle.join()` in
+/// [`spawn_blocking`](tokio::task::spawn_blocking). The
+/// `spawn_blocking` approach spawns a blocking-pool task that cannot be
+/// cancelled once started — so dropping the timeout future that wraps
+/// `quiesce()` would leave the blocking task alive and `handle.join()`
+/// still running, defeating the timeout boundary. Polling lets the
+/// executor yield at each `.await` step so `tokio::time::timeout`
+/// wrapping `quiesce()` can truly interrupt this call.
+///
/// **Requires a multi-thread runtime.** Each coordinator's OS thread
/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on)
/// and needs the runtime's timer/IO driver; a `current_thread` runtime
@@ -187,13 +194,20 @@ pub(crate) async fn join_coordinator_thread(
let Some(handle) = handle else {
return CoordinatorThreadStatus::NotRunning;
};
- match tokio::task::spawn_blocking(move || handle.join()).await {
- Ok(Ok(())) => CoordinatorThreadStatus::Ok,
- Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
- // spawn_blocking fails only when the runtime shuts down before
- // the blocking task can run — unreachable in normal operation
- // since shutdown() is called while the runtime is alive.
- Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")),
+ // Poll until the thread exits. The coordinator was already cancelled
+ // (stop() fires before quiesce() calls us), so is_finished() becomes
+ // true nearly immediately — typically within a single 5 ms step.
+ loop {
+ if handle.is_finished() {
+ return match handle.join() {
+ Ok(()) => CoordinatorThreadStatus::Ok,
+ Err(payload) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
+ };
+ }
+ // Yield to the executor so the outer tokio::time::timeout wrapping
+ // quiesce() can fire if the deadline has passed. Without this yield
+ // the loop would busy-spin and block the task.
+ tokio::time::sleep(std::time::Duration::from_millis(5)).await;
}
}
@@ -711,31 +725,44 @@ mod tests {
/// tokio task is cancelled or aborted rather than completing normally.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() {
- // Build a manager but immediately abort the event adapter task so
- // we trigger the non-panic JoinError path in shutdown().
+ // Replace the real adapter handle with a guaranteed-pending task, then
+ // abort it. A `pending::<()>()` future can never complete on its own,
+ // so abort() always produces a non-panic JoinError — deterministically
+ // exercising the Stopped branch regardless of scheduler timing.
+ // (The original approach aborted the real adapter handle, which could
+ // race the task's own completion and silently yield `Ok` instead.)
let manager = make_manager();
- // Abort the adapter task directly so the join sees a non-panic JoinError.
- {
+
+ // Drain and discard the real adapter (may already be finished).
+ let original = {
let mut guard = manager.event_adapter_join.lock().await;
- if let Some(handle) = guard.take() {
- handle.abort();
- // Put it back so shutdown() sees it and exercises the error path.
- *guard = Some(handle);
- }
+ guard.take()
+ };
+ if let Some(h) = original {
+ h.abort();
+ let _ = h.await;
}
- // Give tokio a moment to process the abort.
- tokio::time::sleep(Duration::from_millis(10)).await;
+
+ // Install a permanently-pending task and abort it so the JoinError
+ // path in shutdown() is 100 % deterministic.
+ let pending = tokio::spawn(std::future::pending::<()>());
+ pending.abort();
+ *manager.event_adapter_join.lock().await = Some(pending);
let status = manager.shutdown().await;
- // The adapter task was aborted → non-panic JoinError → Stopped.
- match &status.event_adapter {
- CoordinatorThreadStatus::Stopped(_) | CoordinatorThreadStatus::Ok => {
- // Stopped is the expected path; Ok means it drained before abort — both
- // are acceptable since abort() races the task completion.
- }
- other => panic!("expected Stopped or Ok (abort race), got {other:?}"),
- }
- // Regardless, all other workers were never started → clean.
+
+ // The aborted pending task always yields a non-panic JoinError →
+ // shutdown() maps it to Stopped.
+ assert!(
+ matches!(status.event_adapter, CoordinatorThreadStatus::Stopped(_)),
+ "expected Stopped from a non-panic JoinError, got {:?}",
+ status.event_adapter
+ );
+ assert!(
+ !status.event_adapter.is_clean(),
+ "Stopped must not count as clean"
+ );
+ // Coordinators were never started → their slots are clean.
assert_eq!(
status.platform_address_sync,
CoordinatorThreadStatus::NotRunning
@@ -801,18 +828,18 @@ mod tests {
);
}
- /// `join_coordinator_thread` uses `spawn_blocking` internally. Verify
- /// it completes without deadlock within a bounded time when called
- /// from a multi-thread runtime, as `shutdown()` requires.
+ /// `join_coordinator_thread` uses `is_finished()` polling. Verify
+ /// it completes within a bounded time on a multi-thread runtime, as
+ /// `shutdown()` requires (and that it doesn't busy-spin indefinitely).
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
- async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() {
+ async fn join_coordinator_thread_completes_within_deadline() {
let handle = std::thread::spawn(|| {});
let result = tokio::time::timeout(
Duration::from_secs(5),
join_coordinator_thread(Some(handle)),
)
.await
- .expect("join_coordinator_thread must complete within 5 s — no spawn_blocking deadlock");
+ .expect("join_coordinator_thread must complete within 5 s");
assert_eq!(result, CoordinatorThreadStatus::Ok);
}
@@ -871,15 +898,14 @@ mod tests {
/// the join, the coordinator's `select!` wakeup (via tokio) would
/// race the runtime teardown and reliably trigger the
/// "Tokio … being shutdown" panic across the 10 iterations.
+ ///
+ /// Uses `std::panic::catch_unwind` around `drop(runtime)` rather than
+ /// a process-global panic hook; the hook would be live for seconds and
+ /// could swallow diagnostics from concurrently-running tests (e.g.
+ /// `join_coordinator_thread_surfaces_panic`).
#[test]
fn shutdown_then_drop_runtime_does_not_panic() {
static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
- let prev_hook = std::panic::take_hook();
- std::panic::set_hook(Box::new(|info| {
- if info.to_string().contains("being shutdown") {
- SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst);
- }
- }));
for _ in 0..10 {
let runtime = tokio::runtime::Builder::new_multi_thread()
@@ -912,7 +938,27 @@ mod tests {
manager.shutdown().await
});
- drop(runtime);
+ // Wrap the runtime drop in catch_unwind to intercept the specific
+ // "A Tokio 1.x context ... being shutdown" panic without installing
+ // a process-wide hook that would suppress diagnostics from other
+ // concurrently running tests.
+ let drop_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+ drop(runtime);
+ }));
+ if let Err(payload) = drop_result {
+ let msg = payload
+ .downcast_ref::<String>()
+ .map(String::as_str)
+ .or_else(|| payload.downcast_ref::<&str>().copied())
+ .unwrap_or("");
+ if msg.contains("being shutdown") {
+ SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst);
+ } else {
+ // Unexpected panic — propagate so the test fails loudly.
+ std::panic::resume_unwind(payload);
+ }
+ }
+
// Brief settle — any stray thread activity surfaces here.
std::thread::sleep(Duration::from_millis(50));
@@ -921,12 +967,12 @@ mod tests {
assert!(status.all_clean(), "workers did not wind down: {status:?}");
}
- let racing_panics = SHUTDOWN_PANICS.load(AO::SeqCst);
- std::panic::set_hook(prev_hook);
assert_eq!(
- racing_panics, 0,
+ SHUTDOWN_PANICS.load(AO::SeqCst),
+ 0,
"dropping the runtime after shutdown raced a coordinator thread \
- ({racing_panics} panics across 10 iterations)"
+ ({} panics across 10 iterations)",
+ SHUTDOWN_PANICS.load(AO::SeqCst)
);
}
}
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 28987bd9c5..7e72f2fe74 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -219,6 +219,38 @@ impl PlatformAddressSyncManager {
if cancel_guard.is_some() {
return;
}
+
+ // Drain any handle left by a prior stop() call. stop() takes-and-cancels
+ // the token but never touches background_join, so a stop()→start()
+ // sequence would otherwise overwrite (detach) the old handle —
+ // shutdown() would then miss that thread and join() only the new one.
+ // The old thread was already cancellation-signalled, so is_finished()
+ // becomes true within a few milliseconds; we spin-wait to guarantee
+ // no detached thread can fire callbacks after destroy() returns.
+ {
+ let prior = self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .take();
+ if let Some(h) = prior {
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+ while !h.is_finished() {
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ "platform-address-sync prior thread did not finish within 1 s \
+ after cancellation; detaching to unblock start()"
+ );
+ break; // Drop h — detaches; thread was already cancelled.
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+ if h.is_finished() {
+ let _ = h.join(); // Reap resources; near-instant since finished.
+ }
+ }
+ }
+
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
// Bump the generation while we still hold the slot lock so any
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index accaca69d0..365b0be17b 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -237,6 +237,38 @@ impl ShieldedSyncManager {
if cancel_guard.is_some() {
return;
}
+
+ // Drain any handle left by a prior stop() call. stop() takes-and-cancels
+ // the token but never touches background_join, so a stop()→start()
+ // sequence would otherwise overwrite (detach) the old handle —
+ // shutdown() would then miss that thread and join() only the new one.
+ // The old thread was already cancellation-signalled, so is_finished()
+ // becomes true within a few milliseconds; we spin-wait to guarantee
+ // no detached thread can fire callbacks after destroy() returns.
+ {
+ let prior = self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .take();
+ if let Some(h) = prior {
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+ while !h.is_finished() {
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ "shielded-sync prior thread did not finish within 1 s \
+ after cancellation; detaching to unblock start()"
+ );
+ break; // Drop h — detaches; thread was already cancelled.
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+ if h.is_finished() {
+ let _ = h.join(); // Reap resources; near-instant since finished.
+ }
+ }
+ }
+
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
// Bump the generation while we still hold the slot lock so
From 7c975ed5a632c5de60eb32761748dc93f8a35416 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:18:21 +0200
Subject: [PATCH 10/29] fix(platform-wallet)!: surface non-clean shielded drain
on clear/stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Extend the destroy UAF-surfacing discipline (which already returns
ErrorShutdownIncomplete=19 on a non-clean shutdown) to the shielded
clear/stop paths, so a partial/timed-out coordinator drain can no
longer be silently swallowed.
- clear_shielded now captures the quiesce result instead of discarding
it: on a timed-out or non-clean drain it returns the new typed
PlatformWalletError::ShieldedShutdownIncomplete (carrying the terminal
CoordinatorThreadStatus) and leaves the commitment-tree store INTACT,
rather than unconditionally wiping a store an in-flight pass may still
write into. The store is wiped only on a clean drain.
- FFI shielded_sync_stop now returns ErrorShutdownIncomplete (with the
status rendered into the message) on a non-clean/timed-out drain,
instead of always returning ok() — symmetric with destroy. A timeout
is reported as the Timeout status.
- FFI shielded_clear maps the new ShieldedShutdownIncomplete variant to
ErrorShutdownIncomplete (store-reset failures still map to
ErrorWalletOperation); the blanket From<PlatformWalletError> gains the
same arm, pinned by a unit test.
- Swift mirror gains errorShutdownIncomplete=19 plus a richer
PlatformWalletError.shutdownIncomplete case, wired through both the
init(ffi:) and init(result:) switches.
- Re-export CoordinatorThreadStatus / CoordinatorExitStatus from the
crate root so the FFI can name the status type.
BREAKING CHANGE: clear_shielded / shielded_sync_stop / shielded_clear
now report a non-clean coordinator drain instead of succeeding silently;
hosts must defer freeing their callback context and must not commit their
own persistence wipe on ErrorShutdownIncomplete.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
packages/rs-platform-wallet-ffi/src/error.rs | 31 +++++++++
.../src/shielded_sync.rs | 66 +++++++++++++++----
packages/rs-platform-wallet/src/error.rs | 21 ++++++
packages/rs-platform-wallet/src/lib.rs | 5 +-
.../rs-platform-wallet/src/manager/mod.rs | 32 +++++++--
.../PlatformWallet/PlatformWalletResult.swift | 16 +++++
6 files changed, 149 insertions(+), 22 deletions(-)
diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs
index b50b5d79c5..5769ffcc43 100644
--- a/packages/rs-platform-wallet-ffi/src/error.rs
+++ b/packages/rs-platform-wallet-ffi/src/error.rs
@@ -246,6 +246,14 @@ impl From<PlatformWalletError> for PlatformWalletFFIResult {
PlatformWalletError::ShieldedSpendUnconfirmed { .. } => {
PlatformWalletFFIResultCode::ErrorShieldedSpendUnconfirmed
}
+ // A Clear that refused because the in-flight shielded pass didn't
+ // drain cleanly: surface it as ErrorShutdownIncomplete (symmetric
+ // with `platform_wallet_manager_destroy`) so the host defers
+ // freeing its callback context AND does not commit its own
+ // persistence wipe — the store was intentionally left intact.
+ PlatformWalletError::ShieldedShutdownIncomplete { .. } => {
+ PlatformWalletFFIResultCode::ErrorShutdownIncomplete
+ }
_ => PlatformWalletFFIResultCode::ErrorUnknown,
};
PlatformWalletFFIResult::err(code, error.to_string())
@@ -604,6 +612,29 @@ mod tests {
assert_eq!(msg, rendered, "Display payload must survive verbatim");
}
+ /// A Clear that refused on a non-clean shielded drain must surface as
+ /// `ErrorShutdownIncomplete` (symmetric with `destroy`), not flatten to
+ /// `ErrorUnknown`, so the host knows to defer freeing its callback
+ /// context and to NOT commit its own persistence wipe. The typed Display
+ /// rendering (carrying the terminal coordinator status) survives verbatim.
+ #[test]
+ fn shielded_shutdown_incomplete_maps_to_dedicated_code() {
+ let err = PlatformWalletError::ShieldedShutdownIncomplete {
+ status: platform_wallet::CoordinatorThreadStatus::Timeout,
+ };
+ let rendered = err.to_string();
+ let result: PlatformWalletFFIResult = err.into();
+ assert_eq!(
+ result.code,
+ PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+ "ShieldedShutdownIncomplete should map to ErrorShutdownIncomplete (rendered: {rendered})"
+ );
+ let msg = unsafe { std::ffi::CStr::from_ptr(result.message) }
+ .to_string_lossy()
+ .into_owned();
+ assert_eq!(msg, rendered, "Display payload must survive verbatim");
+ }
+
/// Other wallet-error variants without a dedicated FFI arm still
/// fall through to `ErrorUnknown` while carrying the typed
/// Display rendering as the message. Pin this so the catch-all
diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
index da285e422e..14082628e4 100644
--- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
+++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
@@ -68,12 +68,20 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_start(
/// Stop the shielded sync manager and wait for any in-flight pass to
/// drain before returning. No-op if not running.
///
-/// Uses `quiesce` rather than cancel-only stop, so on return: the loop
-/// is cancelled, no new pass will start, and any in-flight pass has
+/// Uses `quiesce` rather than cancel-only stop, so on a clean return: the
+/// loop is cancelled, no new pass will start, and any in-flight pass has
/// fully drained — its **persistence callbacks have completed** (no
/// note/sync-state row can be written after this returns) and its
/// completion-event *dispatch* on the Rust side has run.
///
+/// Returns `ErrorShutdownIncomplete` instead of `Success` when that drain
+/// did **not** complete cleanly (the in-flight pass timed out on the join
+/// backstop, or the loop ended non-cleanly). The terminal coordinator
+/// status is rendered into the result message. On this code the host must
+/// **not** free the callback context immediately — a lingering pass may
+/// still fire one final callback through it (symmetric with
+/// `platform_wallet_manager_destroy`).
+///
/// Caveat on host-observed events: a host that marshals the completion
/// callback onto its own executor (e.g. the Swift trampoline hops it to
/// the `@MainActor`) may still observe that final, already-dispatched
@@ -92,17 +100,36 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
// Bound the quiesce with the same backstop `shutdown()` uses so
// a stalled in-flight pass can't hang the host's stop call
// forever. Cancellation makes the drain prompt; this only
- // matters if a pass's drop wedges. The terminal status is
- // discarded — the C ABI exposes none of it, we only need the
- // drain not to wedge.
- let _ = tokio::time::timeout(
+ // matters if a pass's drop wedges. A timeout (the future was
+ // dropped at the deadline) is reported as the non-clean
+ // `Timeout` status, matching `shutdown()`'s backstop
+ // substitution, so the host learns the drain may be incomplete.
+ match tokio::time::timeout(
Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS),
manager.shielded_sync().quiesce(),
)
- .await;
- });
+ .await
+ {
+ Ok(status) => status,
+ Err(_elapsed) => platform_wallet::CoordinatorThreadStatus::Timeout,
+ }
+ })
});
- unwrap_option_or_return!(option);
+ let status = unwrap_option_or_return!(option);
+ // Symmetric with `platform_wallet_manager_destroy`: a non-clean drain
+ // means the shielded loop may still hold a reference to the host-owned
+ // event-handler / persister context and could fire one final callback,
+ // so signal the host to defer freeing that context rather than returning
+ // ok() and inviting a use-after-free.
+ if !status.is_clean() {
+ return PlatformWalletFFIResult::err(
+ PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+ format!(
+ "shielded sync stop did not drain cleanly ({status:?}); \
+ host must not free the callback context immediately"
+ ),
+ );
+ }
PlatformWalletFFIResult::ok()
}
@@ -429,7 +456,9 @@ pub unsafe extern "C" fn platform_wallet_manager_configure_shielded(
/// via the changeset path.
///
/// Returns `ErrorWalletOperation` if the Rust-side store reset
-/// fails. The host **must** check this before wiping its own
+/// fails, or `ErrorShutdownIncomplete` if the in-flight sync pass
+/// did not drain cleanly first (in which case the store is left
+/// intact). The host **must** check this before wiping its own
/// persistence: a silent failure would leave the shared tree
/// populated while the host drops its rows, and the next cold
/// resync would gate-skip every re-downloaded position against the
@@ -455,10 +484,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_clear(
});
let result = unwrap_option_or_return!(option);
if let Err(e) = result {
- return PlatformWalletFFIResult::err(
- PlatformWalletFFIResultCode::ErrorWalletOperation,
- format!("clear_shielded failed: {e}"),
- );
+ // A non-clean / timed-out quiesce aborts the clear *before* the store
+ // is touched: surface it as ErrorShutdownIncomplete (symmetric with
+ // destroy / shielded_sync_stop) so the host defers freeing its
+ // callback context and does NOT commit its own persistence wipe — the
+ // store was intentionally left intact. Every other clear failure is a
+ // store-reset error → ErrorWalletOperation, as before.
+ let code = match &e {
+ platform_wallet::PlatformWalletError::ShieldedShutdownIncomplete { .. } => {
+ PlatformWalletFFIResultCode::ErrorShutdownIncomplete
+ }
+ _ => PlatformWalletFFIResultCode::ErrorWalletOperation,
+ };
+ return PlatformWalletFFIResult::err(code, format!("clear_shielded failed: {e}"));
}
PlatformWalletFFIResult::ok()
}
diff --git a/packages/rs-platform-wallet/src/error.rs b/packages/rs-platform-wallet/src/error.rs
index c94cb7093d..196d2ee5b4 100644
--- a/packages/rs-platform-wallet/src/error.rs
+++ b/packages/rs-platform-wallet/src/error.rs
@@ -239,6 +239,27 @@ pub enum PlatformWalletError {
#[error("Shielded sub-wallet not bound: call bind_shielded first")]
ShieldedNotBound,
+
+ /// A Clear/wipe could not safely complete because the shielded sync
+ /// coordinator's in-flight pass did not drain cleanly first — it either
+ /// timed out on the join backstop or its loop ended non-cleanly
+ /// (cancelled / panicked). The shared commitment-tree store is therefore
+ /// **left intact** (not wiped): a still-running pass could re-persist
+ /// notes into the store immediately after a `clear()`, desyncing the
+ /// host's wiped rows from a repopulated tree and gate-skipping every
+ /// re-downloaded position on the next cold resync. The host **must not**
+ /// commit its own persistence wipe; retry Clear once the pass settles.
+ /// Carries the terminal [`CoordinatorThreadStatus`] for diagnostics.
+ ///
+ /// [`CoordinatorThreadStatus`]: crate::manager::CoordinatorThreadStatus
+ #[error(
+ "shielded clear aborted: sync coordinator did not drain cleanly \
+ ({status:?}); commitment-tree store left intact so an in-flight pass \
+ cannot re-persist into a wiped store — retry once the pass settles"
+ )]
+ ShieldedShutdownIncomplete {
+ status: crate::manager::CoordinatorThreadStatus,
+ },
}
/// Check whether an SDK error indicates that an InstantSend lock proof was
diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs
index dd12883fc7..8b55948aa1 100644
--- a/packages/rs-platform-wallet/src/lib.rs
+++ b/packages/rs-platform-wallet/src/lib.rs
@@ -44,7 +44,10 @@ pub use manager::platform_address_sync::{
PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome,
DEFAULT_SYNC_INTERVAL_SECS,
};
-pub use manager::{PlatformWalletManager, SHUTDOWN_JOIN_TIMEOUT_SECS};
+pub use manager::{
+ CoordinatorExitStatus, CoordinatorThreadStatus, PlatformWalletManager,
+ SHUTDOWN_JOIN_TIMEOUT_SECS,
+};
pub use spv::SpvRuntime;
pub use wallet::asset_lock::manager::AssetLockManager;
pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock};
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index a9569dd00e..2de6ad6d5a 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -425,21 +425,39 @@ impl PlatformWalletManager {
/// disk but its contents are reset to empty so the next bind cold-
/// resyncs from index 0.
///
- /// Returns an error if the coordinator's store reset fails; the host
- /// must not commit its own persistence wipe in that case.
+ /// Returns an error — and leaves the store untouched — in two cases, so
+ /// the host knows **not** to commit its own persistence wipe:
+ /// - the in-flight sync pass did not drain cleanly (timed out on the join
+ /// backstop, or its loop ended non-cleanly) →
+ /// [`crate::error::PlatformWalletError::ShieldedShutdownIncomplete`]; or
+ /// - the coordinator's store reset itself fails.
#[cfg(feature = "shielded")]
pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
// Bound the quiesce with the same backstop `shutdown()` uses so a
// stalled in-flight pass can't hang Clear forever — cancellation
// makes the drain prompt; this timeout only matters if a pass's
- // drop wedges. The terminal status isn't surfaced on the Clear
- // path (the coordinator reset below is what can fail), so the
- // timeout result is intentionally discarded.
- let _ = tokio::time::timeout(
+ // drop wedges. Unlike `shutdown()`, the terminal status is
+ // load-bearing HERE: a non-clean drain means the in-flight pass may
+ // still be running and could re-persist notes into the very store
+ // the `clear()` below is about to wipe. A timeout (the future was
+ // dropped at the deadline) is treated as the non-clean `Timeout`
+ // status, matching `shutdown()`'s backstop substitution.
+ let status = match tokio::time::timeout(
std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
self.shielded_sync_manager.quiesce(),
)
- .await;
+ .await
+ {
+ Ok(status) => status,
+ Err(_elapsed) => CoordinatorThreadStatus::Timeout,
+ };
+ // Only commit the store wipe once the in-flight pass has fully
+ // drained. Otherwise refuse: a partial/timed-out drain could let a
+ // surviving pass write into a store we just cleared, desyncing the
+ // host's own wipe from a repopulated tree.
+ if !status.is_clean() {
+ return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status });
+ }
if let Some(coord) = self.shielded_coordinator().await {
coord.clear().await?;
}
diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
index 2c311f91e9..31ef07ad4a 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
@@ -39,6 +39,12 @@ public enum PlatformWalletResultCode: Int32, Sendable {
/// outcome. Do NOT auto-retry — a retry would rebuild the bundle and
/// could double-execute if the original landed.
case errorShieldedSpendUnconfirmed = 18
+ /// A destroy/stop/clear completed but a background coordinator did not
+ /// exit cleanly (timed out or ended non-cleanly). The host should defer
+ /// freeing its callback context — a lingering coordinator may still fire
+ /// one final callback through it — and, on the clear path, must NOT
+ /// commit its own persistence wipe (the Rust store was left intact).
+ case errorShutdownIncomplete = 19
case notFound = 98
case errorUnknown = 99
@@ -82,6 +88,8 @@ public enum PlatformWalletResultCode: Int32, Sendable {
self = .errorShieldedBroadcastUnconfirmed
case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHIELDED_SPEND_UNCONFIRMED:
self = .errorShieldedSpendUnconfirmed
+ case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHUTDOWN_INCOMPLETE:
+ self = .errorShutdownIncomplete
case PLATFORM_WALLET_FFI_RESULT_CODE_NOT_FOUND:
self = .notFound
case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_UNKNOWN:
@@ -177,6 +185,12 @@ public enum PlatformWalletError: LocalizedError {
/// notes reserved wallet-side (a shield reserves nothing) until the
/// next sync reconciles the outcome. Do NOT auto-retry.
case shieldedSpendUnconfirmed(String)
+ /// A destroy / stop / clear completed but a background coordinator did
+ /// not exit cleanly. The host should defer freeing its callback context
+ /// (a lingering coordinator may still fire one final callback) and, on
+ /// the clear path, must NOT commit its own persistence wipe — the Rust
+ /// store was left intact so it can be retried once the pass settles.
+ case shutdownIncomplete(String)
case notFound(String)
case unknown(String)
@@ -192,6 +206,7 @@ public enum PlatformWalletError: LocalizedError {
.arithmeticOverflow(let m), .noSelectableInputs(let m),
.walletAlreadyExists(let m), .shieldedBroadcastFailed(let m),
.shieldedBroadcastUnconfirmed(let m), .shieldedSpendUnconfirmed(let m),
+ .shutdownIncomplete(let m),
.notFound(let m), .unknown(let m):
return m
}
@@ -222,6 +237,7 @@ public enum PlatformWalletError: LocalizedError {
case .errorShieldedBroadcastFailed: self = .shieldedBroadcastFailed(detail)
case .errorShieldedBroadcastUnconfirmed: self = .shieldedBroadcastUnconfirmed(detail)
case .errorShieldedSpendUnconfirmed: self = .shieldedSpendUnconfirmed(detail)
+ case .errorShutdownIncomplete: self = .shutdownIncomplete(detail)
case .notFound: self = .notFound(detail)
case .errorUnknown: self = .unknown(detail)
}
From 5f63c9544c84c44c7a62eeed14c73634a27e45e6 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:30:26 +0200
Subject: [PATCH 11/29] fix(platform-wallet): reap prior coordinator thread
outside background_cancel lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
All three coordinators (identity_sync, platform_address_sync,
shielded_sync) reaped the prior loop's OS thread inside start() WHILE
holding background_cancel. But the exiting prior thread's epilogue also
locks background_cancel to clear its slot, so a tight stop()→start()
deadlocked the reap: the prior thread blocked on the lock start() held,
never finished, and the is_finished() spin-wait burned the full 1 s
deadline then DETACHED the handle — a 1 s stall plus a transient
untracked thread, on the exact stop()→start() path the reap exists for.
Reorder start() to install the new cancel token + bump the generation
under the lock, then drop(cancel_guard) to release background_cancel,
and only THEN run the spin-wait + join. The prior thread's epilogue now
acquires the lock (or, for shielded, observes the bumped generation),
skips clearing the freshly-installed token, and returns, so is_finished()
trips in milliseconds and the join is near-instant. start() stays
synchronous; the 1 s deadline remains only as a genuine-wedge backstop.
Adds restart_after_stop_reaps_prior_thread regression tests to the
identity and platform-address coordinators: start → (stop+start
back-to-back) → assert the restart returns well under the 1 s deadline.
Verified non-vacuous — against the old lock-held ordering it stalls
~1.0 s and fails.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
.../src/manager/identity_sync.rs | 132 ++++++++++++++----
.../src/manager/platform_address_sync.rs | 132 ++++++++++++++----
.../src/manager/shielded_sync.rs | 78 +++++++----
3 files changed, 249 insertions(+), 93 deletions(-)
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 6e87261e0a..9cc14ac831 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -411,36 +411,22 @@ where
return;
}
- // Drain any handle left by a prior stop() call. stop() takes-and-cancels
- // the token but never touches background_join, so a stop()→start()
- // sequence would otherwise overwrite (detach) the old handle —
- // shutdown() would then miss that thread and join() only the new one.
- // The old thread was already cancellation-signalled, so is_finished()
- // becomes true within a few milliseconds; we spin-wait to guarantee
- // no detached thread can fire callbacks after destroy() returns.
- {
- let prior = self
- .background_join
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .take();
- if let Some(h) = prior {
- let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
- while !h.is_finished() {
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- "identity-sync prior thread did not finish within 1 s \
- after cancellation; detaching to unblock start()"
- );
- break; // Drop h — detaches; thread was already cancelled.
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
- if h.is_finished() {
- let _ = h.join(); // Reap resources; near-instant since finished.
- }
- }
- }
+ // Take any handle left by a prior stop() call so we can reap it — but
+ // DON'T join it here, while we still hold background_cancel. stop()
+ // takes-and-cancels the token but never touches background_join, so a
+ // stop()→start() sequence would otherwise overwrite (detach) the old
+ // handle and shutdown() would miss that thread. Joining it under
+ // background_cancel would DEADLOCK the reap into its 1 s backstop: the
+ // exiting prior thread's epilogue also locks background_cancel (to
+ // clear its slot), so it would block on the lock we hold → never
+ // finish → get detached on the exact stop()→start() path the reap
+ // exists for. We install the new token + bump the generation below,
+ // release the lock, and only THEN reap (after this fn's tail).
+ let prior = self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .take();
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
@@ -498,7 +484,37 @@ where
.background_join
.lock()
.unwrap_or_else(|e| e.into_inner()) = Some(join);
- // cancel_guard drops here, releasing background_cancel.
+
+ // Release background_cancel BEFORE reaping the prior thread, so its
+ // epilogue can acquire the lock, observe the bumped generation, skip
+ // clearing our freshly-installed token, and return. Holding the lock
+ // across the join below is what would block the prior thread, spin
+ // the full 1 s deadline, and detach — the very stall this ordering
+ // removes.
+ drop(cancel_guard);
+
+ // Now reap the prior thread. It was already cancellation-signalled by
+ // stop(), and with the lock released its epilogue completes promptly,
+ // so is_finished() trips within a few milliseconds and the join is
+ // near-instant. The 1 s deadline survives only as a genuine-wedge
+ // backstop (e.g. a pass wedged in a Drop that never yields); if it
+ // fires we detach the already-cancelled thread to unblock start().
+ if let Some(h) = prior {
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+ while !h.is_finished() {
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ "identity-sync prior thread did not finish within 1 s \
+ after cancellation; detaching to unblock start()"
+ );
+ break; // Drop h — detaches; thread was already cancelled.
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+ if h.is_finished() {
+ let _ = h.join(); // Reap resources; near-instant since finished.
+ }
+ }
}
/// Stop the background sync loop. No-op if not running.
@@ -1025,6 +1041,60 @@ mod tests {
pass.await.unwrap();
}
+ /// Regression: a tight `stop()` → `start()` must reap the prior loop's
+ /// OS thread promptly, NOT stall on the 1 s detach backstop.
+ ///
+ /// The prior thread's exit epilogue locks `background_cancel` to
+ /// conditionally clear its slot. The earlier ordering held
+ /// `background_cancel` across the prior-handle join inside `start()`, so
+ /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
+ /// that lock, never finished, and the reap spin-waited the full second
+ /// before detaching — a 1 s stall plus a transient untracked thread. The
+ /// fix installs the new token + generation, releases `background_cancel`,
+ /// and only then reaps, so the prior thread's epilogue runs and the join
+ /// lands in milliseconds.
+ ///
+ /// `stop()` and `start()` run back-to-back in one blocking closure
+ /// (mirroring the real call site) so `start()` re-acquires the lock
+ /// microseconds after `stop()` frees it — before the async-woken prior
+ /// thread can reach its epilogue. Against the old lock-held ordering this
+ /// reliably stalls ~1 s and fails the bound below.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn restart_after_stop_reaps_prior_thread() {
+ let mgr = make_manager();
+
+ // Launch the first loop and let its immediate (no-op, nothing
+ // registered) pass complete so the thread parks in the interval
+ // sleep, where cancellation lands cleanly.
+ Arc::clone(&mgr).start();
+ assert!(mgr.is_running());
+ tokio::time::sleep(Duration::from_millis(50)).await;
+
+ // Back-to-back cancel-only stop + restart, off the runtime so the
+ // synchronous reap can't starve a worker. `start()` re-grabs
+ // background_cancel right after `stop()` frees it.
+ let restart = Arc::clone(&mgr);
+ let elapsed = tokio::task::spawn_blocking(move || {
+ restart.stop();
+ let started = std::time::Instant::now();
+ Arc::clone(&restart).start();
+ started.elapsed()
+ })
+ .await
+ .unwrap();
+
+ assert!(
+ elapsed < Duration::from_millis(500),
+ "stop()→start() stalled for {elapsed:?}: prior thread was not \
+ reaped promptly (background_cancel held across the join?)"
+ );
+ assert!(mgr.is_running(), "restart must leave the new loop tracked");
+
+ // Wind the new loop down so the test leaves no live !Send thread.
+ mgr.quiesce().await;
+ assert!(!mgr.is_running());
+ }
+
/// A `sync_now()` invoked while `quiescing` is set must bail without
/// running the pass — in particular, without calling
/// `persister.store(...)`. This is the gate that prevents a pass
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 7e72f2fe74..87b6595e53 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -220,36 +220,22 @@ impl PlatformAddressSyncManager {
return;
}
- // Drain any handle left by a prior stop() call. stop() takes-and-cancels
- // the token but never touches background_join, so a stop()→start()
- // sequence would otherwise overwrite (detach) the old handle —
- // shutdown() would then miss that thread and join() only the new one.
- // The old thread was already cancellation-signalled, so is_finished()
- // becomes true within a few milliseconds; we spin-wait to guarantee
- // no detached thread can fire callbacks after destroy() returns.
- {
- let prior = self
- .background_join
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .take();
- if let Some(h) = prior {
- let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
- while !h.is_finished() {
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- "platform-address-sync prior thread did not finish within 1 s \
- after cancellation; detaching to unblock start()"
- );
- break; // Drop h — detaches; thread was already cancelled.
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
- if h.is_finished() {
- let _ = h.join(); // Reap resources; near-instant since finished.
- }
- }
- }
+ // Take any handle left by a prior stop() call so we can reap it — but
+ // DON'T join it here, while we still hold background_cancel. stop()
+ // takes-and-cancels the token but never touches background_join, so a
+ // stop()→start() sequence would otherwise overwrite (detach) the old
+ // handle and shutdown() would miss that thread. Joining it under
+ // background_cancel would DEADLOCK the reap into its 1 s backstop: the
+ // exiting prior thread's epilogue also locks background_cancel (to
+ // clear its slot), so it would block on the lock we hold → never
+ // finish → get detached on the exact stop()→start() path the reap
+ // exists for. We install the new token + bump the generation below,
+ // release the lock, and only THEN reap (at this fn's tail).
+ let prior = self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .take();
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
@@ -313,7 +299,37 @@ impl PlatformAddressSyncManager {
.background_join
.lock()
.unwrap_or_else(|e| e.into_inner()) = Some(join);
- // cancel_guard drops here, releasing background_cancel.
+
+ // Release background_cancel BEFORE reaping the prior thread, so its
+ // epilogue can acquire the lock, observe the bumped generation, skip
+ // clearing our freshly-installed token, and return. Holding the lock
+ // across the join below is what would block the prior thread, spin
+ // the full 1 s deadline, and detach — the very stall this ordering
+ // removes.
+ drop(cancel_guard);
+
+ // Now reap the prior thread. It was already cancellation-signalled by
+ // stop(), and with the lock released its epilogue completes promptly,
+ // so is_finished() trips within a few milliseconds and the join is
+ // near-instant. The 1 s deadline survives only as a genuine-wedge
+ // backstop (e.g. a pass wedged in a Drop that never yields); if it
+ // fires we detach the already-cancelled thread to unblock start().
+ if let Some(h) = prior {
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+ while !h.is_finished() {
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ "platform-address-sync prior thread did not finish within 1 s \
+ after cancellation; detaching to unblock start()"
+ );
+ break; // Drop h — detaches; thread was already cancelled.
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+ if h.is_finished() {
+ let _ = h.join(); // Reap resources; near-instant since finished.
+ }
+ }
}
/// Stop the background sync loop. No-op if not running.
@@ -590,6 +606,60 @@ mod tests {
pass.await.unwrap();
}
+ /// Regression: a tight `stop()` → `start()` must reap the prior loop's
+ /// OS thread promptly, NOT stall on the 1 s detach backstop.
+ ///
+ /// The prior thread's exit epilogue locks `background_cancel` to
+ /// conditionally clear its slot. The earlier ordering held
+ /// `background_cancel` across the prior-handle join inside `start()`, so
+ /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
+ /// that lock, never finished, and the reap spin-waited the full second
+ /// before detaching — a 1 s stall plus a transient untracked thread. The
+ /// fix installs the new token + generation, releases `background_cancel`,
+ /// and only then reaps, so the prior thread's epilogue runs and the join
+ /// lands in milliseconds.
+ ///
+ /// `stop()` and `start()` run back-to-back in one blocking closure
+ /// (mirroring the real call site) so `start()` re-acquires the lock
+ /// microseconds after `stop()` frees it — before the async-woken prior
+ /// thread can reach its epilogue. Against the old lock-held ordering this
+ /// reliably stalls ~1 s and fails the bound below.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn restart_after_stop_reaps_prior_thread() {
+ let (mgr, _counter) = make_manager();
+
+ // Launch the first loop and let its immediate (no-op, empty wallet
+ // map) pass complete so the thread parks in the interval sleep, where
+ // cancellation lands cleanly.
+ Arc::clone(&mgr).start();
+ assert!(mgr.is_running());
+ tokio::time::sleep(Duration::from_millis(50)).await;
+
+ // Back-to-back cancel-only stop + restart, off the runtime so the
+ // synchronous reap can't starve a worker. `start()` re-grabs
+ // background_cancel right after `stop()` frees it.
+ let restart = Arc::clone(&mgr);
+ let elapsed = tokio::task::spawn_blocking(move || {
+ restart.stop();
+ let started = std::time::Instant::now();
+ Arc::clone(&restart).start();
+ started.elapsed()
+ })
+ .await
+ .unwrap();
+
+ assert!(
+ elapsed < Duration::from_millis(500),
+ "stop()→start() stalled for {elapsed:?}: prior thread was not \
+ reaped promptly (background_cancel held across the join?)"
+ );
+ assert!(mgr.is_running(), "restart must leave the new loop tracked");
+
+ // Wind the new loop down so the test leaves no live !Send thread.
+ mgr.quiesce().await;
+ assert!(!mgr.is_running());
+ }
+
/// A `sync_now()` invoked while `quiescing` is set must bail without
/// running the pass — in particular, without firing the
/// `on_platform_address_sync_completed` host callback. This is the
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 365b0be17b..d0aa75a843 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -238,36 +238,22 @@ impl ShieldedSyncManager {
return;
}
- // Drain any handle left by a prior stop() call. stop() takes-and-cancels
- // the token but never touches background_join, so a stop()→start()
- // sequence would otherwise overwrite (detach) the old handle —
- // shutdown() would then miss that thread and join() only the new one.
- // The old thread was already cancellation-signalled, so is_finished()
- // becomes true within a few milliseconds; we spin-wait to guarantee
- // no detached thread can fire callbacks after destroy() returns.
- {
- let prior = self
- .background_join
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .take();
- if let Some(h) = prior {
- let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
- while !h.is_finished() {
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- "shielded-sync prior thread did not finish within 1 s \
- after cancellation; detaching to unblock start()"
- );
- break; // Drop h — detaches; thread was already cancelled.
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
- if h.is_finished() {
- let _ = h.join(); // Reap resources; near-instant since finished.
- }
- }
- }
+ // Take any handle left by a prior stop() call so we can reap it — but
+ // DON'T join it here, while we still hold background_cancel. stop()
+ // takes-and-cancels the token but never touches background_join, so a
+ // stop()→start() sequence would otherwise overwrite (detach) the old
+ // handle and shutdown() would miss that thread. Joining it under
+ // background_cancel would DEADLOCK the reap into its 1 s backstop: the
+ // exiting prior thread's epilogue also locks background_cancel (to
+ // clear its slot), so it would block on the lock we hold → never
+ // finish → get detached on the exact stop()→start() path the reap
+ // exists for. We install the new token + bump the generation below,
+ // release the lock, and only THEN reap (at this fn's tail).
+ let prior = self
+ .background_join
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .take();
let cancel = CancellationToken::new();
*cancel_guard = Some(cancel.clone());
@@ -339,7 +325,37 @@ impl ShieldedSyncManager {
.background_join
.lock()
.unwrap_or_else(|e| e.into_inner()) = Some(join);
- // cancel_guard drops here, releasing background_cancel.
+
+ // Release background_cancel BEFORE reaping the prior thread, so its
+ // epilogue can observe the bumped generation (and skip clearing our
+ // freshly-installed token) without contending the lock we hold.
+ // Holding the lock across the join below is what would block the
+ // prior thread, spin the full 1 s deadline, and detach — the very
+ // stall this ordering removes.
+ drop(cancel_guard);
+
+ // Now reap the prior thread. It was already cancellation-signalled by
+ // stop(), and with the lock released its epilogue completes promptly,
+ // so is_finished() trips within a few milliseconds and the join is
+ // near-instant. The 1 s deadline survives only as a genuine-wedge
+ // backstop (e.g. a pass wedged in a Drop that never yields); if it
+ // fires we detach the already-cancelled thread to unblock start().
+ if let Some(h) = prior {
+ let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+ while !h.is_finished() {
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ "shielded-sync prior thread did not finish within 1 s \
+ after cancellation; detaching to unblock start()"
+ );
+ break; // Drop h — detaches; thread was already cancelled.
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+ if h.is_finished() {
+ let _ = h.join(); // Reap resources; near-instant since finished.
+ }
+ }
}
/// Stop the background sync loop. No-op if not running.
From 2b068ba57564e836f8b48f9fb6c643943b73336c Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:34:10 +0200
Subject: [PATCH 12/29] fix(platform-wallet): close shielded epilogue TOCTOU +
pin restart reap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Three shielded-sync hardening fixes, bringing it in line with its
identity-sync and platform-address-sync siblings.
- shielded_sync.rs exit epilogue read `background_generation` BEFORE
acquiring `background_cancel` (load-then-lock). That stale-read TOCTOU let
a prior thread observe a pre-bump generation, block on the lock until a
concurrent start() released it, then null the freshly-installed token —
leaving the new loop running but untracked via is_running()/stop(). Acquire
the lock first and compare the generation under it, exactly like the
siblings.
- Add the `restart_after_stop_reaps_prior_thread` regression test the
siblings already carry. It pins the reap-after-drop(cancel_guard) reorder:
a back-to-back stop()+start() must reap the prior OS thread in <500 ms, not
stall ~1 s on the detach backstop. Confirmed non-vacuous — it fails at
~1.0 s with the reap moved back inside the lock.
- platform-wallet-ffi: the ErrorShutdownIncomplete doc only described
destroy. It is now also returned by shielded_sync_stop and shielded_clear,
where the manager is NOT torn down and the operation can be retried.
Document all three callers and their differing retry semantics.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
packages/rs-platform-wallet-ffi/src/error.rs | 32 ++++--
.../src/manager/shielded_sync.rs | 97 +++++++++++++++++--
2 files changed, 115 insertions(+), 14 deletions(-)
diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs
index 5769ffcc43..e5b5184a82 100644
--- a/packages/rs-platform-wallet-ffi/src/error.rs
+++ b/packages/rs-platform-wallet-ffi/src/error.rs
@@ -125,13 +125,31 @@ pub enum PlatformWalletFFIResultCode {
/// and could double-send if the original spend landed.
ErrorShieldedSpendUnconfirmed = 18,
- /// One or more background coordinator threads did not exit cleanly before
- /// the 30 s join deadline. The host **must not** free the callback context
- /// immediately — a lingering thread may still hold a reference to it and
- /// fire one final callback. Either keep the context alive for a further
- /// grace period, or accept the potential (but statistically tiny) race.
- /// This is distinct from a normal operation error; the manager IS torn
- /// down; the host should not retry `destroy`.
+ /// A background coordinator drain did not complete cleanly within the
+ /// join deadline — one or more `!Send` sync threads may still be alive
+ /// and still hold a reference to the host-owned callback context, so they
+ /// could fire one final callback through it. On this code the host **must
+ /// not** free the callback context immediately: either keep it alive for a
+ /// further grace period, or accept the (statistically tiny) race.
+ ///
+ /// Returned by three callers, which differ in whether the operation may
+ /// be **retried**:
+ /// - `platform_wallet_manager_destroy`: the manager **IS** torn down
+ /// (removed from storage) regardless — do **not** retry `destroy`; the
+ /// handle is already gone. Only the callback-context lifetime caveat
+ /// above applies.
+ /// - `platform_wallet_manager_shielded_sync_stop`: the manager is **NOT**
+ /// torn down — only the shielded loop's drain was non-clean. The host
+ /// may retry the stop (or proceed to `destroy`); the handle stays valid.
+ /// - `platform_wallet_manager_shielded_clear`: the manager is **NOT** torn
+ /// down and the store was left **intact** (Clear aborted before touching
+ /// it). The host may retry the clear, and must **not** commit its own
+ /// persistence wipe — doing so would desync the host's rows from the
+ /// still-populated shared tree.
+ ///
+ /// Distinct from a normal operation error (the underlying operation may
+ /// well have made progress); the terminal coordinator status is rendered
+ /// into the result message.
ErrorShutdownIncomplete = 19,
NotFound = 98, // Used exclusively for all the Option that are retuned as errors
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index d0aa75a843..98e94035aa 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -305,13 +305,20 @@ impl ShieldedSyncManager {
}
// Only clear `background_cancel` if the active
- // generation is still ours. Without this guard a
- // tight `stop()` → `start()` reschedule has the
- // exiting thread overwrite the *new* generation's
- // token, leaving the new loop running but
- // unreflectable via `is_running()` / `stop()`.
- if this.background_generation.load(Ordering::Acquire) == my_gen {
- if let Ok(mut guard) = this.background_cancel.lock() {
+ // generation is still ours. Acquire the lock FIRST,
+ // then read/compare `background_generation` under it
+ // (matching identity_sync / platform_address_sync).
+ // Reading the generation BEFORE locking opens a
+ // stale-read TOCTOU: this exiting thread could observe
+ // a pre-bump generation, then block on the lock until a
+ // concurrent `start()` released it, and null the
+ // freshly-installed token — leaving the new loop
+ // running but unreflectable via `is_running()` /
+ // `stop()`. `start()` bumps the generation while it
+ // holds this same lock, so comparing under the lock
+ // guarantees we observe the post-swap value.
+ if let Ok(mut guard) = this.background_cancel.lock() {
+ if this.background_generation.load(Ordering::Acquire) == my_gen {
*guard = None;
}
}
@@ -576,3 +583,79 @@ impl std::fmt::Debug for ShieldedSyncManager {
.finish()
}
}
+
+// The whole module is already `#[cfg(feature = "shielded")]`-gated at its
+// `mod` declaration (manager/mod.rs), so these tests compile only under that
+// feature — no extra per-test gate needed.
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ /// Build a manager over an **empty** coordinator slot wired to a
+ /// handler-less event manager. An empty slot makes every `sync_now`
+ /// pass a no-op (empty-coordinator handling returns immediately), so
+ /// the background loop parks in its interval sleep — exactly where
+ /// cancellation lands cleanly — without needing a live SDK / network.
+ /// That is all the start/stop/restart thread-lifecycle tests below
+ /// exercise.
+ fn make_manager() -> Arc<ShieldedSyncManager> {
+ let coordinator_slot = Arc::new(RwLock::new(None));
+ let event_manager = Arc::new(PlatformEventManager::new(vec![]));
+ Arc::new(ShieldedSyncManager::new(event_manager, coordinator_slot))
+ }
+
+ /// Regression: a tight `stop()` → `start()` must reap the prior loop's
+ /// OS thread promptly, NOT stall on the 1 s detach backstop.
+ ///
+ /// The prior thread's exit epilogue locks `background_cancel` to
+ /// conditionally clear its slot. The earlier ordering held
+ /// `background_cancel` across the prior-handle join inside `start()`, so
+ /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
+ /// that lock, never finished, and the reap spin-waited the full second
+ /// before detaching — a 1 s stall plus a transient untracked thread. The
+ /// fix installs the new token + generation, releases `background_cancel`,
+ /// and only then reaps, so the prior thread's epilogue runs and the join
+ /// lands in milliseconds. Mirrors the identity-sync and
+ /// platform-address-sync siblings.
+ ///
+ /// `stop()` and `start()` run back-to-back in one blocking closure
+ /// (mirroring the real call site) so `start()` re-acquires the lock
+ /// microseconds after `stop()` frees it — before the async-woken prior
+ /// thread can reach its epilogue. Against the old lock-held ordering this
+ /// reliably stalls ~1 s and fails the bound below.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn restart_after_stop_reaps_prior_thread() {
+ let mgr = make_manager();
+
+ // Launch the first loop and let its immediate (no-op, empty
+ // coordinator) pass complete so the thread parks in the interval
+ // sleep, where cancellation lands cleanly.
+ Arc::clone(&mgr).start();
+ assert!(mgr.is_running());
+ tokio::time::sleep(Duration::from_millis(50)).await;
+
+ // Back-to-back cancel-only stop + restart, off the runtime so the
+ // synchronous reap can't starve a worker. `start()` re-grabs
+ // background_cancel right after `stop()` frees it.
+ let restart = Arc::clone(&mgr);
+ let elapsed = tokio::task::spawn_blocking(move || {
+ restart.stop();
+ let started = std::time::Instant::now();
+ Arc::clone(&restart).start();
+ started.elapsed()
+ })
+ .await
+ .unwrap();
+
+ assert!(
+ elapsed < Duration::from_millis(500),
+ "stop()→start() stalled for {elapsed:?}: prior thread was not \
+ reaped promptly (background_cancel held across the join?)"
+ );
+ assert!(mgr.is_running(), "restart must leave the new loop tracked");
+
+ // Wind the new loop down so the test leaves no live !Send thread.
+ mgr.quiesce().await;
+ assert!(!mgr.is_running());
+ }
+}
From 5017ba13136e8a1f1d818ac6547290f716836e39 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:34:18 +0200
Subject: [PATCH 13/29] fix(swift-sdk): retain wallet callback context on
incomplete shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
PlatformWalletManager.deinit called .discard() on shielded_sync_stop and
destroy, tossing the result code. Rust now returns ErrorShutdownIncomplete
(19) on a non-clean drain, with the contract: a lingering coordinator thread
may still fire one final callback through the host-owned callback context.
But persistenceHandler/eventHandler are handed to Rust via
Unmanaged.passUnretained and kept alive only by this object's fields, so the
instant deinit returns ARC frees them — a use-after-free on that final
callback.
Capture the code via a new discardReturningCode() helper; on
ErrorShutdownIncomplete from shielded_sync_stop OR destroy, deliberately leak
one extra strong reference (an unbalanced passRetained, never released) to
each handler so it outlives any lingering thread. A clean shutdown — the
common case — takes neither branch and releases the handlers normally; we
never leak unconditionally.
UNVERIFIED locally: no Swift toolchain / xcframework on this host. Reasoned
correct-by-construction; needs an iOS-environment build to confirm.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
.../PlatformWalletManager.swift | 43 +++++++++++++++++--
.../PlatformWallet/PlatformWalletResult.swift | 12 ++++++
2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
index 0e433d368e..36bafa37d1 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
@@ -152,10 +152,45 @@ public class PlatformWalletManager: ObservableObject {
deinit {
progressPollTask?.cancel()
- if handle != NULL_HANDLE {
- platform_wallet_manager_platform_address_sync_stop(handle).discard()
- platform_wallet_manager_shielded_sync_stop(handle).discard()
- platform_wallet_manager_destroy(handle).discard()
+ guard handle != NULL_HANDLE else { return }
+
+ // Tear down the Rust manager: cancel the address-sync loop, drain
+ // the shielded loop, then destroy. The first stop is cancel-only
+ // and never reports an incomplete drain, so we still `discard()` it.
+ platform_wallet_manager_platform_address_sync_stop(handle).discard()
+
+ // Capture the CODE (not just free the message) for the two calls
+ // that CAN report `.errorShutdownIncomplete`: `shielded_sync_stop`
+ // and `destroy`. Rust returns that code when a background
+ // coordinator did not drain within the join deadline — meaning a
+ // lingering `!Send` coordinator thread may still hold the
+ // `passUnretained` context pointers Rust was handed for our
+ // `persistenceHandler` / `eventHandler` and fire ONE final callback
+ // through them. The contract: on that code the host must NOT free
+ // the callback context immediately.
+ let shieldedStopCode =
+ platform_wallet_manager_shielded_sync_stop(handle).discardReturningCode()
+ let destroyCode =
+ platform_wallet_manager_destroy(handle).discardReturningCode()
+
+ // Both handlers are passed to Rust via `Unmanaged.passUnretained`
+ // (see `PlatformWalletPersistenceHandler`/`PlatformWalletEventHandler`
+ // `makeCallbacks()`), so Rust holds non-owning pointers and these
+ // objects are kept alive ONLY by the stored properties below. The
+ // instant this deinit returns, ARC releases them — which would be a
+ // use-after-free if a lingering coordinator then fires its final
+ // callback. So, ONLY on an incomplete shutdown, deliberately leak one
+ // extra strong reference to each (an unbalanced `passRetained` that is
+ // never released) so they outlive any lingering thread. A clean
+ // shutdown (the common case) takes neither branch and releases the
+ // handlers normally — we never leak unconditionally. The leak is
+ // bounded by how often a shutdown wedges (rare) and trades two small
+ // objects for guaranteed callback safety, since an incomplete drain
+ // gives no later signal that the lingering thread has finally exited.
+ if shieldedStopCode == .errorShutdownIncomplete
+ || destroyCode == .errorShutdownIncomplete {
+ if let persistenceHandler { _ = Unmanaged.passRetained(persistenceHandler) }
+ if let eventHandler { _ = Unmanaged.passRetained(eventHandler) }
}
}
diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
index 31ef07ad4a..c24f72fbf8 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
@@ -256,4 +256,16 @@ extension PlatformWalletFFIResult {
func discard() {
_ = PlatformWalletResult(self)
}
+
+ /// Free the result's Rust-owned message and return its typed code.
+ ///
+ /// Like `discard()`, but hands back the code so the caller can branch
+ /// on it — used by `PlatformWalletManager.deinit`, which must detect
+ /// `.errorShutdownIncomplete` to decide whether to keep its callback
+ /// context alive. The message is still freed deterministically (the
+ /// temporary `PlatformWalletResult` frees it on drop).
+ @inline(__always)
+ func discardReturningCode() -> PlatformWalletResultCode {
+ PlatformWalletResult(self).code
+ }
}
From b4917732a39eb3a86ec706c9ac115f2011c185f9 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:50:44 +0200
Subject: [PATCH 14/29] test(platform-wallet): bound cleanup quiesce in
restart-reap regression tests
Wrap the cleanup `mgr.quiesce().await` in all three
`restart_after_stop_reaps_prior_thread` tests with a 2-second
`tokio::time::timeout`. An unbounded quiesce of the restarted loop
would hang CI forever if the loop wedges; now it fails fast with a clear
message. Also assert `status.is_clean()` on the returned
`CoordinatorThreadStatus`.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
packages/rs-platform-wallet/src/manager/identity_sync.rs | 8 +++++++-
.../src/manager/platform_address_sync.rs | 8 +++++++-
packages/rs-platform-wallet/src/manager/shielded_sync.rs | 8 +++++++-
3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 9cc14ac831..ab6fa6033e 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -1091,7 +1091,13 @@ mod tests {
assert!(mgr.is_running(), "restart must leave the new loop tracked");
// Wind the new loop down so the test leaves no live !Send thread.
- mgr.quiesce().await;
+ let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
+ .await
+ .expect("cleanup quiesce did not complete within 2s after restart");
+ assert!(
+ status.is_clean(),
+ "cleanup quiesce ended non-cleanly: {status:?}"
+ );
assert!(!mgr.is_running());
}
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 87b6595e53..094ae1a25b 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -656,7 +656,13 @@ mod tests {
assert!(mgr.is_running(), "restart must leave the new loop tracked");
// Wind the new loop down so the test leaves no live !Send thread.
- mgr.quiesce().await;
+ let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
+ .await
+ .expect("cleanup quiesce did not complete within 2s after restart");
+ assert!(
+ status.is_clean(),
+ "cleanup quiesce ended non-cleanly: {status:?}"
+ );
assert!(!mgr.is_running());
}
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 98e94035aa..ba7b752315 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -655,7 +655,13 @@ mod tests {
assert!(mgr.is_running(), "restart must leave the new loop tracked");
// Wind the new loop down so the test leaves no live !Send thread.
- mgr.quiesce().await;
+ let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
+ .await
+ .expect("cleanup quiesce did not complete within 2s after restart");
+ assert!(
+ status.is_clean(),
+ "cleanup quiesce ended non-cleanly: {status:?}"
+ );
assert!(!mgr.is_running());
}
}
From 76c8bee0060f2bd5b473c62ef633bcfc9bd69a81 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 12:16:40 +0200
Subject: [PATCH 15/29] fix(platform-wallet): track detached coordinator
threads so shutdown() reports them as non-clean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Closes the residual use-after-free window left by the coordinator
reap backstop. On a tight stop()->start(), each sync coordinator
waits ~1s for the prior OS thread to finish; if that thread is
genuinely wedged in a non-yielding Drop, the backstop previously
DROPPED the still-live JoinHandle (detaching it). A later shutdown()
joined only the current handle, all_clean() returned true, and the
FFI destroy returned ok() — at which point the host could free the
callback context the detached, still-running thread might still touch.
Fix (review option i): the manager now owns a shared CoordinatorOrphans
list (an `Arc`-shared list of parked thread handles) cloned into every coordinator. The
duplicated reap blocks in identity/platform-address/shielded start()
are consolidated into reap_prior_or_park(), which PARKS a wedged prior
thread in that list instead of dropping it (lock-ordering preserved:
drop(cancel_guard) still happens before the reap). shutdown() then
drains the list via join_detached_orphans() within a bounded, yielding
is_finished() poll and reports a new CoordinatorThreadStatus::Detached
(non-clean) in CoordinatorExitStatus::detached_threads when any orphan
is still alive at the grace deadline. all_clean() folds it in, so the
FFI destroy correctly returns ErrorShutdownIncomplete and the host
delays freeing its context. The new Detached variant re-exports through
lib.rs with its sibling statuses.
Tests (manager/mod.rs): reap_prior_or_park parks a force-wedged thread;
join_detached_orphans reports Detached then Ok; and a manager shutdown()
with a parked still-live orphan reports non-clean. All proven
non-vacuous by neutering the park/join. Cleanup quiesce/join in tests is
bounded; a wedged stand-in thread is released and joined so none leak.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
.../src/manager/identity_sync.rs | 49 ++-
.../rs-platform-wallet/src/manager/mod.rs | 405 +++++++++++++++++-
.../src/manager/platform_address_sync.rs | 41 +-
.../src/manager/shielded_sync.rs | 41 +-
4 files changed, 475 insertions(+), 61 deletions(-)
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index ab6fa6033e..40329bad74 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -167,6 +167,12 @@ where
/// confirm the `!Send` loop fully exited before the host drops the
/// runtime.
background_join: StdMutex>>,
+ /// Manager-owned orphans list (shared `Arc`). On a tight
+ /// `stop()`→`start()` where the prior thread is wedged past the 1 s
+ /// reap backstop, [`start`](Self::start) parks the still-live handle
+ /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
+ /// instead of dropping it, so manager `shutdown()` accounts for it.
+ coordinator_orphans: super::CoordinatorOrphans,
/// Monotonically increasing generation counter. Incremented each
/// time `start()` installs a new cancel token so the exiting
/// thread can tell whether its token is still current.
@@ -206,12 +212,17 @@ where
/// writes). The registry starts empty — call
/// [`register_identity`](Self::register_identity) before
/// [`start`](Self::start).
- pub fn new(sdk: Arc, persister: Arc) -> Self {
+ pub fn new(
+ sdk: Arc,
+ persister: Arc,
+ coordinator_orphans: super::CoordinatorOrphans,
+ ) -> Self {
Self {
sdk,
persister,
background_cancel: StdMutex::new(None),
background_join: StdMutex::new(None),
+ coordinator_orphans,
background_generation: AtomicU64::new(0),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
@@ -498,23 +509,15 @@ where
// so is_finished() trips within a few milliseconds and the join is
// near-instant. The 1 s deadline survives only as a genuine-wedge
// backstop (e.g. a pass wedged in a Drop that never yields); if it
- // fires we detach the already-cancelled thread to unblock start().
- if let Some(h) = prior {
- let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
- while !h.is_finished() {
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- "identity-sync prior thread did not finish within 1 s \
- after cancellation; detaching to unblock start()"
- );
- break; // Drop h — detaches; thread was already cancelled.
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
- if h.is_finished() {
- let _ = h.join(); // Reap resources; near-instant since finished.
- }
- }
+ // fires `reap_prior_or_park` parks the still-live, already-cancelled
+ // thread in the manager orphans list so `shutdown()` joins it and
+ // reports it non-clean rather than dropping it (residual UAF).
+ super::reap_prior_or_park(
+ prior,
+ &self.coordinator_orphans,
+ std::time::Duration::from_secs(1),
+ "identity-sync",
+ );
}
/// Stop the background sync loop. No-op if not running.
@@ -853,7 +856,8 @@ mod tests {
fn make_manager() -> Arc> {
let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
let persister = Arc::new(NoopPersister);
- Arc::new(IdentitySyncManager::new(sdk, persister))
+ let orphans = Arc::new(StdMutex::new(Vec::new()));
+ Arc::new(IdentitySyncManager::new(sdk, persister, orphans))
}
fn make_recording_manager() -> (
@@ -862,8 +866,13 @@ mod tests {
) {
let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
let persister = Arc::new(RecordingPersister::new());
+ let orphans = Arc::new(StdMutex::new(Vec::new()));
(
- Arc::new(IdentitySyncManager::new(sdk, Arc::clone(&persister))),
+ Arc::new(IdentitySyncManager::new(
+ sdk,
+ Arc::clone(&persister),
+ orphans,
+ )),
persister,
)
}
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 2de6ad6d5a..0e02d430b7 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -28,6 +28,22 @@ use crate::wallet::core::BalanceUpdateHandler;
use crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId};
use crate::wallet::PlatformWallet;
+/// Shared list of coordinator OS threads that a tight `stop()`→`start()`
+/// reap had to detach past its 1 s wedge-backstop.
+///
+/// A coordinator's `start()` reap normally joins the prior thread within
+/// a few milliseconds. If that thread is genuinely wedged in a
+/// non-yielding `Drop` (vanishingly rare — the loop exits via a
+/// cancellable `select!`), [`reap_prior_or_park`] parks its still-live
+/// `JoinHandle` here instead of dropping it. The manager owns this list
+/// and shares a clone (`Arc`) with every coordinator, so
+/// [`PlatformWalletManager::shutdown`] can join everything parked here
+/// within its timeout and report
+/// [`CoordinatorThreadStatus::Detached`] if any thread is still alive —
+/// telling the host NOT to free a callback context the thread may still
+/// touch (closing a residual use-after-free window).
+pub(crate) type CoordinatorOrphans = Arc<std::sync::Mutex<Vec<std::thread::JoinHandle<()>>>>;
+
/// Multi-wallet coordinator with SPV sync and event handling.
///
/// Events are dispatched through [`PlatformEventManager`] to all registered
@@ -87,6 +103,11 @@ pub struct PlatformWalletManager {
/// is torn down.
pub(super) event_adapter_cancel: CancellationToken,
pub(super) event_adapter_join: tokio::sync::Mutex>>,
+ /// Coordinator OS threads detached by a tight `stop()`→`start()`
+ /// reap (see [`CoordinatorOrphans`]). Shared (cloned `Arc`) with
+ /// every coordinator so their `start()` reaps can park a wedged
+ /// prior thread here, and drained/joined by [`shutdown`](Self::shutdown).
+ pub(super) coordinator_orphans: CoordinatorOrphans,
}
/// How one background coordinator thread terminated.
@@ -117,12 +138,24 @@ pub enum CoordinatorThreadStatus {
/// the runtime was torn down before the join could run (unreachable
/// in normal operation).
Error(String),
+ /// At least one coordinator OS thread that an earlier tight
+ /// `stop()`→`start()` reap had to detach past its 1 s wedge-backstop
+ /// was still alive at the shutdown deadline.
+ ///
+ /// Such a thread was parked in the manager's [`CoordinatorOrphans`]
+ /// list (not silently dropped) precisely so this case is visible.
+ /// A still-live detached thread keeps an `Arc` to the host event
+ /// handler and may fire one final callback, so the host must NOT
+ /// free the callback context yet — this status keeps
+ /// [`is_clean`](Self::is_clean) `false` so the FFI `destroy` returns
+ /// `ErrorShutdownIncomplete` instead of `ok()`.
+ Detached,
}
impl CoordinatorThreadStatus {
/// `true` only for a fully clean outcome: joined normally (`Ok`) or
- /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, and
- /// `Error` are all considered non-clean.
+ /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`,
+ /// `Error`, and `Detached` are all considered non-clean.
pub fn is_clean(&self) -> bool {
matches!(self, Self::Ok | Self::NotRunning)
}
@@ -148,19 +181,35 @@ pub struct CoordinatorExitStatus {
pub shielded_sync: Option<CoordinatorThreadStatus>,
/// Wallet-event adapter (a `tokio` task, not an OS thread).
pub event_adapter: CoordinatorThreadStatus,
+ /// Aggregate status of any coordinator OS threads that an earlier
+ /// tight `stop()`→`start()` reap had to detach past its 1 s
+ /// wedge-backstop and park in the manager's [`CoordinatorOrphans`]
+ /// list.
+ ///
+ /// [`Ok`](CoordinatorThreadStatus::Ok) when none were detached (or
+ /// every detached thread has since joined cleanly);
+ /// [`Panicked`](CoordinatorThreadStatus::Panicked) when a parked
+ /// thread panicked; [`Detached`](CoordinatorThreadStatus::Detached)
+ /// when at least one is still alive at the shutdown grace deadline.
+ /// This keeps [`all_clean`](Self::all_clean) honest for the wedge case
+ /// the rest of the teardown can't see — a still-live thread would let
+ /// the host free a context it may still touch (use-after-free).
+ pub detached_threads: CoordinatorThreadStatus,
}
impl CoordinatorExitStatus {
- /// `true` only when every worker is
+ /// `true` only when every worker — including any parked
+ /// [`detached_threads`](Self::detached_threads) — is
/// [`Ok`](CoordinatorThreadStatus::Ok) or
/// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any
- /// `Stopped`, `Panicked`, `Timeout`, or `Error` slot makes it
- /// `false`.
+ /// `Stopped`, `Panicked`, `Timeout`, `Error`, or `Detached` slot
+ /// makes it `false`.
pub fn all_clean(&self) -> bool {
self.platform_address_sync.is_clean()
&& self.identity_sync.is_clean()
&& self.shielded_sync.as_ref().is_none_or(|s| s.is_clean())
&& self.event_adapter.is_clean()
+ && self.detached_threads.is_clean()
}
}
@@ -223,6 +272,138 @@ fn panic_message(payload: Box) -> String {
}
}
+/// Reap a coordinator's prior OS thread after a `stop()`→`start()`
+/// reschedule — or park it for [`PlatformWalletManager::shutdown`] if it
+/// is genuinely wedged.
+///
+/// Shared by all three coordinators' `start()` (identity / platform-
+/// address / shielded), called at the tail of `start()` *after* the
+/// `background_cancel` lock has been released, so the exiting prior
+/// thread's epilogue (which also takes that lock) can complete and the
+/// join lands in milliseconds.
+///
+/// `prior` was cancellation-signalled by the preceding `stop()`, so its
+/// `select!` loop exits and the thread finishes almost immediately. The
+/// `backstop` deadline fires only if the thread is wedged in a
+/// non-yielding `Drop` that never observes the cancellation (vanishingly
+/// rare). On that wedge we must NOT silently drop the still-live handle:
+/// the thread still holds an `Arc` to the host event handler and could
+/// fire a callback, so a later `destroy` that freed the host context
+/// would hit a use-after-free. Instead we park the handle in `orphans`
+/// so `shutdown()` joins it within its own timeout and reports
+/// [`CoordinatorThreadStatus::Detached`] if it is still alive — keeping
+/// [`CoordinatorExitStatus::all_clean`] honest.
+pub(crate) fn reap_prior_or_park(
+ prior: Option<std::thread::JoinHandle<()>>,
+ orphans: &CoordinatorOrphans,
+ backstop: std::time::Duration,
+ coordinator: &str,
+) {
+ let Some(handle) = prior else {
+ return;
+ };
+ let deadline = std::time::Instant::now() + backstop;
+ loop {
+ if handle.is_finished() {
+ // Near-instant since finished; reaps the thread's resources.
+ let _ = handle.join();
+ return;
+ }
+ if std::time::Instant::now() >= deadline {
+ tracing::warn!(
+ coordinator,
+ ?backstop,
+ "prior sync thread did not finish within the backstop after \
+ cancellation; parking it in the manager orphans list for \
+ shutdown() to join rather than detaching it"
+ );
+ // Park the still-live (but already-cancelled) handle so a
+ // later shutdown() can join it and report it non-clean,
+ // instead of dropping it and leaving a UAF window where the
+ // host frees a callback context the thread may still touch.
+ orphans
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .push(handle);
+ return;
+ }
+ std::thread::sleep(std::time::Duration::from_millis(5));
+ }
+}
+
+/// Drain the manager's [`CoordinatorOrphans`] list and classify how the
+/// parked threads ended, polling until `deadline`.
+///
+/// Threads land in the list only when a tight `stop()`→`start()` reap had
+/// to detach a prior coordinator thread past its 1 s wedge-backstop (see
+/// [`reap_prior_or_park`]). They were parked rather than dropped so this
+/// final teardown can account for them: a still-live detached thread
+/// keeps an `Arc` to the host event handler and could fire one last
+/// callback, so the host must not free its context until every such
+/// thread has exited.
+///
+/// Polls [`JoinHandle::is_finished`](std::thread::JoinHandle::is_finished)
+/// in 5 ms steps, yielding at each `.await`; `join()` is only ever
+/// called on an already-finished handle, so nothing blocks. Caveat: if a
+/// wrapping `tokio::time::timeout` drops this future before `deadline`,
+/// the drained handles are dropped, not re-parked. Returns:
+/// - [`Ok`](CoordinatorThreadStatus::Ok) — the list was empty, or every
+/// parked thread joined cleanly;
+/// - [`Panicked`](CoordinatorThreadStatus::Panicked) — a parked thread
+/// had panicked (and none were left alive at the deadline);
+/// - [`Detached`](CoordinatorThreadStatus::Detached) — at least one
+/// parked thread was still alive at `deadline`. Any still-live handles
+/// are re-parked so a later (idempotent) `shutdown()` can retry.
+pub(crate) async fn join_detached_orphans(
+ orphans: &CoordinatorOrphans,
+ deadline: std::time::Instant,
+) -> CoordinatorThreadStatus {
+ // Take the whole list out under the lock; we re-park any survivors
+ // at the deadline, but never hold the lock across an `.await`.
+ let mut pending: Vec<std::thread::JoinHandle<()>> = {
+ let mut guard = orphans.lock().unwrap_or_else(|e| e.into_inner());
+ std::mem::take(&mut *guard)
+ };
+ if pending.is_empty() {
+ return CoordinatorThreadStatus::Ok;
+ }
+
+ let mut panicked: Option<String> = None;
+ loop {
+ // Reap every thread that has finished this pass; retain the rest.
+ let mut still_live = Vec::with_capacity(pending.len());
+ for handle in pending.drain(..) {
+ if handle.is_finished() {
+ if let Err(payload) = handle.join() {
+ // Keep the first panic message; a live `Detached`
+ // thread still takes precedence at the deadline below.
+ panicked.get_or_insert_with(|| panic_message(payload));
+ }
+ } else {
+ still_live.push(handle);
+ }
+ }
+ pending = still_live;
+
+ if pending.is_empty() {
+ return match panicked {
+ Some(msg) => CoordinatorThreadStatus::Panicked(msg),
+ None => CoordinatorThreadStatus::Ok,
+ };
+ }
+ if std::time::Instant::now() >= deadline {
+ // Re-park survivors so an idempotent re-`shutdown()` retries
+ // rather than losing track of a still-live thread.
+ orphans
+ .lock()
+ .unwrap_or_else(|e| e.into_inner())
+ .extend(pending);
+ return CoordinatorThreadStatus::Detached;
+ }
+ tokio::time::sleep(std::time::Duration::from_millis(5)).await;
+ }
+}
+
/// Maximum time (seconds) the teardown paths — `shutdown()`,
/// `clear_shielded`, and the FFI shielded-stop bridge — wait for one
/// coordinator's quiesce+join to complete.
@@ -236,6 +417,23 @@ fn panic_message(payload: Box) -> String {
/// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
+/// Grace period (seconds) [`PlatformWalletManager::shutdown`] spends
+/// polling any parked [`CoordinatorOrphans`] before declaring a survivor
+/// [`Detached`](CoordinatorThreadStatus::Detached).
+///
+/// Unlike a live coordinator — whose `quiesce()` may legitimately spend
+/// seconds draining an in-flight pass, hence the 30 s
+/// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] — an orphan is a thread an earlier reap
+/// already had to detach *because it was wedged past its 1 s backstop*.
+/// A healthy detached thread finishes within milliseconds of the
+/// cancellation it long ago received (so `is_finished()` is usually true
+/// on the first poll and the join is instant); one still alive after this
+/// grace is wedged in a non-yielding `Drop` and will not finish however
+/// long we wait. A short grace therefore separates "finishing" from
+/// "wedged" without stretching teardown, and reporting `Detached` is the
+/// conservative, UAF-safe outcome (the host delays freeing its context).
+pub(crate) const SHUTDOWN_ORPHAN_GRACE_SECS: u64 = 1;
+
impl PlatformWalletManager {
/// Create a new PlatformWalletManager.
///
@@ -275,6 +473,13 @@ impl PlatformWalletManager {
balance_handler,
]));
+ // Shared orphans list: a coordinator's `start()` reap parks here
+ // any prior thread it had to detach past its 1 s wedge-backstop,
+ // and `shutdown()` joins them. Every coordinator gets a clone of
+ // this same `Arc` so they all park into the one list the manager
+ // drains.
+ let coordinator_orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
+
let spv = Arc::new(SpvRuntime::new(
Arc::clone(&wallet_manager),
Arc::clone(&event_manager),
@@ -282,10 +487,12 @@ impl PlatformWalletManager {
let platform_address_sync = Arc::new(PlatformAddressSyncManager::new(
Arc::clone(&wallets),
Arc::clone(&event_manager),
+ Arc::clone(&coordinator_orphans),
));
let identity_sync = Arc::new(IdentitySyncManager::new(
Arc::clone(&sdk),
Arc::clone(&persister),
+ Arc::clone(&coordinator_orphans),
));
#[cfg(feature = "shielded")]
let shielded_coordinator: Arc<
@@ -295,6 +502,7 @@ impl PlatformWalletManager {
let shielded_sync = Arc::new(ShieldedSyncManager::new(
Arc::clone(&event_manager),
Arc::clone(&shielded_coordinator),
+ Arc::clone(&coordinator_orphans),
));
Self {
sdk,
@@ -313,6 +521,7 @@ impl PlatformWalletManager {
persister,
event_adapter_cancel,
event_adapter_join: tokio::sync::Mutex::new(Some(event_adapter_join)),
+ coordinator_orphans,
}
}
@@ -575,11 +784,26 @@ impl PlatformWalletManager {
},
};
+ // Finally, account for any coordinator threads an earlier tight
+ // stop()→start() reap had to detach past its 1 s wedge-backstop.
+ // They were parked in `coordinator_orphans` (not dropped) so we
+ // can join them here; a survivor at the grace deadline reports
+ // `Detached`, which keeps `all_clean()` false so the FFI `destroy`
+ // returns `ErrorShutdownIncomplete` rather than letting the host
+ // free a callback context the live thread may still touch. The
+ // grace poll yields, so it never blocks teardown uncancellably.
+ let detached_threads = join_detached_orphans(
+ &self.coordinator_orphans,
+ std::time::Instant::now() + std::time::Duration::from_secs(SHUTDOWN_ORPHAN_GRACE_SECS),
+ )
+ .await;
+
CoordinatorExitStatus {
platform_address_sync,
identity_sync,
shielded_sync,
event_adapter,
+ detached_threads,
}
}
}
@@ -800,6 +1024,9 @@ mod tests {
assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean());
assert!(!CoordinatorThreadStatus::Timeout.is_clean());
assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean());
+ // A detached-but-still-live coordinator thread is non-clean: the
+ // host must not free its callback context yet.
+ assert!(!CoordinatorThreadStatus::Detached.is_clean());
}
/// `all_clean()` on `CoordinatorExitStatus` is false whenever any
@@ -811,6 +1038,7 @@ mod tests {
identity_sync: CoordinatorThreadStatus::NotRunning,
shielded_sync: None,
event_adapter: CoordinatorThreadStatus::Ok,
+ detached_threads: CoordinatorThreadStatus::Ok,
};
assert!(clean.all_clean());
@@ -819,6 +1047,7 @@ mod tests {
identity_sync: CoordinatorThreadStatus::Ok,
shielded_sync: None,
event_adapter: CoordinatorThreadStatus::Ok,
+ detached_threads: CoordinatorThreadStatus::Ok,
};
assert!(!with_timeout.all_clean());
@@ -827,8 +1056,20 @@ mod tests {
identity_sync: CoordinatorThreadStatus::Ok,
shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))),
event_adapter: CoordinatorThreadStatus::Ok,
+ detached_threads: CoordinatorThreadStatus::Ok,
};
assert!(!with_stopped.all_clean());
+
+ // A still-live detached orphan alone makes the aggregate
+ // non-clean — the slot the rest of the teardown can't see.
+ let with_detached = CoordinatorExitStatus {
+ platform_address_sync: CoordinatorThreadStatus::Ok,
+ identity_sync: CoordinatorThreadStatus::Ok,
+ shielded_sync: None,
+ event_adapter: CoordinatorThreadStatus::Ok,
+ detached_threads: CoordinatorThreadStatus::Detached,
+ };
+ assert!(!with_detached.all_clean());
}
/// A cleanly-returning thread joins as `Ok`; an absent handle is
@@ -993,4 +1234,158 @@ mod tests {
SHUTDOWN_PANICS.load(AO::SeqCst)
);
}
+
+ /// Spawn a thread that parks until `release` is signalled (or the
+ /// sender drops), standing in for a coordinator thread wedged in a
+ /// non-yielding `Drop` that ignores the cancellation it received.
+ fn spawn_wedged_thread() -> (std::sync::mpsc::Sender<()>, std::thread::JoinHandle<()>) {
+ let (release_tx, release_rx) = std::sync::mpsc::channel::<()>();
+ let handle = std::thread::spawn(move || {
+ // Block here regardless of any cancellation, exactly like a
+ // Drop that never yields, until the test releases us.
+ let _ = release_rx.recv();
+ });
+ (release_tx, handle)
+ }
+
+ /// A prior coordinator thread that is still alive past the reap
+ /// backstop must be **parked in the orphans list**, not dropped —
+ /// otherwise `shutdown()` would never know it exists and could let the
+ /// host free a callback context the live thread still touches.
+ ///
+ /// Non-vacuous: if `reap_prior_or_park` dropped the wedged handle
+ /// (the old behavior) the list would stay empty and the length
+ /// assertion below would fail.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn reap_prior_or_park_parks_wedged_thread() {
+ let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
+ let (release_tx, wedged) = spawn_wedged_thread();
+
+ // `reap_prior_or_park` is synchronous and spins a std sleep until
+ // its backstop, so run it off the runtime workers. A short backstop
+ // (real `start()` uses 1 s) keeps the test fast.
+ let orphans_for_reap = Arc::clone(&orphans);
+ tokio::task::spawn_blocking(move || {
+ reap_prior_or_park(
+ Some(wedged),
+ &orphans_for_reap,
+ Duration::from_millis(100),
+ "test-coordinator",
+ );
+ })
+ .await
+ .unwrap();
+
+ assert_eq!(
+ orphans.lock().unwrap().len(),
+ 1,
+ "a prior thread wedged past the backstop must be parked, not dropped"
+ );
+
+ // Cleanup: release + join the parked thread so none leaks.
+ release_tx.send(()).unwrap();
+ let parked = orphans.lock().unwrap().pop().unwrap();
+ tokio::task::spawn_blocking(move || {
+ let _ = parked.join();
+ })
+ .await
+ .unwrap();
+ }
+
+ /// `join_detached_orphans` classifies the parked threads: empty list →
+ /// `Ok`; a survivor at the deadline → `Detached` (re-parked for a later
+ /// retry); once the survivor exits, a fresh join reports `Ok` and
+ /// drains the list.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+ async fn join_detached_orphans_reports_detached_then_ok() {
+ let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
+
+ // Nothing parked → clean.
+ assert_eq!(
+ join_detached_orphans(&orphans, std::time::Instant::now()).await,
+ CoordinatorThreadStatus::Ok
+ );
+
+ // Park a still-live thread; a short deadline elapses with it alive.
+ let (release_tx, wedged) = spawn_wedged_thread();
+ orphans.lock().unwrap().push(wedged);
+ let status = join_detached_orphans(
+ &orphans,
+ std::time::Instant::now() + Duration::from_millis(50),
+ )
+ .await;
+ assert_eq!(
+ status,
+ CoordinatorThreadStatus::Detached,
+ "a survivor at the deadline must report Detached"
+ );
+ assert_eq!(
+ orphans.lock().unwrap().len(),
+ 1,
+ "a survivor must be re-parked so an idempotent re-shutdown retries"
+ );
+
+ // Release it; the next join reaps it cleanly and empties the list.
+ release_tx.send(()).unwrap();
+ let status = tokio::time::timeout(
+ Duration::from_secs(5),
+ join_detached_orphans(&orphans, std::time::Instant::now() + Duration::from_secs(5)),
+ )
+ .await
+ .expect("orphan join must complete once the thread is released");
+ assert_eq!(status, CoordinatorThreadStatus::Ok);
+ assert!(
+ orphans.lock().unwrap().is_empty(),
+ "a joined orphan must be drained from the list"
+ );
+ }
+
+ /// Headline regression: a coordinator thread detached past the reap
+ /// backstop and parked in the orphans list makes a subsequent
+ /// `shutdown()` report the result as **non-clean** — so the FFI
+ /// `destroy` returns `ErrorShutdownIncomplete` and the host delays
+ /// freeing the callback context the still-live thread may touch.
+ ///
+ /// Non-vacuous: if `join_detached_orphans` ignored the list (or the
+ /// orphan were dropped at reap instead of parked), `detached_threads`
+ /// would be `Ok` and `all_clean()` would be `true`, failing both
+ /// assertions.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn shutdown_reports_detached_orphan_as_non_clean() {
+ let manager = make_manager();
+
+ // Stand in for the genuine-wedge outcome: an earlier tight
+ // stop()→start() reap had to detach a still-live coordinator thread
+ // past its 1 s backstop, so `reap_prior_or_park` parked it here.
+ let (release_tx, wedged) = spawn_wedged_thread();
+ manager.coordinator_orphans.lock().unwrap().push(wedged);
+
+ let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown())
+ .await
+ .expect("shutdown must complete within bound");
+
+ assert_eq!(
+ status.detached_threads,
+ CoordinatorThreadStatus::Detached,
+ "a still-live detached orphan must surface as Detached"
+ );
+ assert!(
+ !status.all_clean(),
+ "all_clean() must be false while a detached coordinator thread is \
+ still alive: {status:?}"
+ );
+
+ // Cleanup: shutdown() re-parked the survivor; release + join it so
+ // no live thread leaks past the test. Pop into a local first so the
+ // std MutexGuard is not held across the await below.
+ release_tx.send(()).unwrap();
+ let parked = manager.coordinator_orphans.lock().unwrap().pop();
+ if let Some(parked) = parked {
+ tokio::task::spawn_blocking(move || {
+ let _ = parked.join();
+ })
+ .await
+ .unwrap();
+ }
+ }
}
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 094ae1a25b..40457c4a87 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -104,6 +104,12 @@ pub struct PlatformAddressSyncManager {
/// confirm the `!Send` loop fully exited before the host drops the
/// runtime.
background_join: StdMutex>>,
+ /// Manager-owned orphans list (shared `Arc`). On a tight
+ /// `stop()`→`start()` where the prior thread is wedged past the 1 s
+ /// reap backstop, [`start`](Self::start) parks the still-live handle
+ /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
+ /// instead of dropping it, so manager `shutdown()` accounts for it.
+ coordinator_orphans: super::CoordinatorOrphans,
/// Monotonically increasing generation counter. Bumped on every
/// `start()` so the exiting thread can tell whether its generation is
/// still the active one before clearing `background_cancel`. Without
@@ -135,12 +141,14 @@ impl PlatformAddressSyncManager {
pub fn new(
wallets: Arc>>>,
event_manager: Arc,
+ coordinator_orphans: super::CoordinatorOrphans,
) -> Self {
Self {
wallets,
event_manager,
background_cancel: StdMutex::new(None),
background_join: StdMutex::new(None),
+ coordinator_orphans,
background_generation: AtomicU64::new(0),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
@@ -313,23 +321,15 @@ impl PlatformAddressSyncManager {
// so is_finished() trips within a few milliseconds and the join is
// near-instant. The 1 s deadline survives only as a genuine-wedge
// backstop (e.g. a pass wedged in a Drop that never yields); if it
- // fires we detach the already-cancelled thread to unblock start().
- if let Some(h) = prior {
- let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
- while !h.is_finished() {
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- "platform-address-sync prior thread did not finish within 1 s \
- after cancellation; detaching to unblock start()"
- );
- break; // Drop h — detaches; thread was already cancelled.
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
- if h.is_finished() {
- let _ = h.join(); // Reap resources; near-instant since finished.
- }
- }
+ // fires `reap_prior_or_park` parks the still-live, already-cancelled
+ // thread in the manager orphans list so `shutdown()` joins it and
+ // reports it non-clean rather than dropping it (residual UAF).
+ super::reap_prior_or_park(
+ prior,
+ &self.coordinator_orphans,
+ std::time::Duration::from_secs(1),
+ "platform-address-sync",
+ );
}
/// Stop the background sync loop. No-op if not running.
@@ -543,8 +543,13 @@ mod tests {
let event_manager = Arc::new(PlatformEventManager::new(vec![
Arc::clone(&counter) as Arc
]));
+ let orphans = Arc::new(StdMutex::new(Vec::new()));
(
- Arc::new(PlatformAddressSyncManager::new(wallets, event_manager)),
+ Arc::new(PlatformAddressSyncManager::new(
+ wallets,
+ event_manager,
+ orphans,
+ )),
counter,
)
}
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index ba7b752315..3c84bd7071 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -148,6 +148,12 @@ pub struct ShieldedSyncManager {
/// confirm the `!Send` loop fully exited before the host drops the
/// runtime.
background_join: StdMutex>>,
+ /// Manager-owned orphans list (shared `Arc`). On a tight
+ /// `stop()`→`start()` where the prior thread is wedged past the 1 s
+ /// reap backstop, [`start`](Self::start) parks the still-live handle
+ /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
+ /// instead of dropping it, so manager `shutdown()` accounts for it.
+ coordinator_orphans: super::CoordinatorOrphans,
/// Monotonically increasing generation counter. Bumped on every
/// `start()` so the exiting thread can tell whether its
/// generation is still the active one before clearing
@@ -173,12 +179,14 @@ impl ShieldedSyncManager {
pub fn new(
event_manager: Arc,
coordinator_slot: Arc>>>,
+ coordinator_orphans: super::CoordinatorOrphans,
) -> Self {
Self {
event_manager,
coordinator_slot,
background_cancel: StdMutex::new(None),
background_join: StdMutex::new(None),
+ coordinator_orphans,
background_generation: AtomicU64::new(0),
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
@@ -346,23 +354,15 @@ impl ShieldedSyncManager {
// so is_finished() trips within a few milliseconds and the join is
// near-instant. The 1 s deadline survives only as a genuine-wedge
// backstop (e.g. a pass wedged in a Drop that never yields); if it
- // fires we detach the already-cancelled thread to unblock start().
- if let Some(h) = prior {
- let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
- while !h.is_finished() {
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- "shielded-sync prior thread did not finish within 1 s \
- after cancellation; detaching to unblock start()"
- );
- break; // Drop h — detaches; thread was already cancelled.
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
- if h.is_finished() {
- let _ = h.join(); // Reap resources; near-instant since finished.
- }
- }
+ // fires `reap_prior_or_park` parks the still-live, already-cancelled
+ // thread in the manager orphans list so `shutdown()` joins it and
+ // reports it non-clean rather than dropping it (residual UAF).
+ super::reap_prior_or_park(
+ prior,
+ &self.coordinator_orphans,
+ std::time::Duration::from_secs(1),
+ "shielded-sync",
+ );
}
/// Stop the background sync loop. No-op if not running.
@@ -601,7 +601,12 @@ mod tests {
fn make_manager() -> Arc {
let coordinator_slot = Arc::new(RwLock::new(None));
let event_manager = Arc::new(PlatformEventManager::new(vec![]));
- Arc::new(ShieldedSyncManager::new(event_manager, coordinator_slot))
+ let orphans = Arc::new(StdMutex::new(Vec::new()));
+ Arc::new(ShieldedSyncManager::new(
+ event_manager,
+ coordinator_slot,
+ orphans,
+ ))
}
/// Regression: a tight `stop()` → `start()` must reap the prior loop's
From 3cca1cf833e1a2aaf7dadd9df722323634678cae Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:36:19 +0200
Subject: [PATCH 16/29] perf(platform-wallet): drain coordinators concurrently
in shutdown() via tokio::join!
The three periodic coordinators (platform-address, identity, shielded)
were quiesced sequentially in shutdown(), making the worst-case wait
additive (~3 x SHUTDOWN_JOIN_TIMEOUT_SECS). Each quiesce() touches only
its own state (its quiescing/is_syncing atomics and its own
background_cancel/background_join mutexes) and joins its own OS thread,
sharing no lock, so racing them is sound. Drain them concurrently via
tokio::join!, collapsing the worst case to ~max(timeouts).
Each join! arm keeps its OWN inner tokio::time::timeout, so every
coordinator still yields its own per-coordinator CoordinatorThreadStatus
(a single outer timeout would flatten all three to Timeout). The event
adapter teardown and join_detached_orphans stay sequential and ordered
strictly AFTER the coordinator join!, since the adapter is the sink the
coordinators' stores feed into. The multi-thread runtime assert is
unchanged.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
.../rs-platform-wallet/src/manager/mod.rs | 57 +++++++++++++------
1 file changed, 41 insertions(+), 16 deletions(-)
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 0e02d430b7..7e9690d066 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -691,7 +691,13 @@ impl PlatformWalletManager {
/// context — a use-after-free. So we `quiesce()` the sync managers
/// FIRST (so no further persister store or host callback can start),
/// and only THEN cancel + join the event adapter, which is the sink
- /// those stores feed into.
+ /// those stores feed into. The three coordinators are independent —
+ /// each `quiesce()` touches only its own state (its `quiescing` /
+ /// `is_syncing` atomics and its own `background_cancel` /
+ /// `background_join` mutexes) and joins its own OS thread, sharing no
+ /// lock — so they are drained *concurrently* via `tokio::join!`; only
+ /// the event-adapter teardown stays ordered strictly after them,
+ /// because it is the sink the coordinators store into.
///
/// After each coordinator's `quiesce()` drains its in-flight pass,
/// this also **joins** the loop's OS thread, so when `shutdown()`
@@ -708,8 +714,10 @@ impl PlatformWalletManager {
/// service one `block_on` at a time, so the join would deadlock. This
/// is asserted in both debug and release builds.
///
- /// Each coordinator quiesce+join is bounded by
- /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] as a backstop. `quiesce()` cancels
+ /// Each coordinator quiesce+join is bounded by its own
+ /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] backstop. Because the three drain
+ /// concurrently, the worst-case wait collapses to ~that single
+ /// backstop instead of the sum of all three. `quiesce()` cancels
/// the loop, which aborts any in-flight pass at its `.await` point, so
/// the `is_syncing` drain clears promptly and the join normally lands
/// far inside the window — the deadline fires only if a pass's *drop*
@@ -735,25 +743,42 @@ impl PlatformWalletManager {
let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
- // Each quiesce() drains any in-flight pass AND joins the thread.
- let platform_address_sync =
+ // Drain the three independent periodic coordinators *concurrently*.
+ // Each quiesce() drains any in-flight pass AND joins its own OS
+ // thread, touching only that coordinator's own state (no shared
+ // lock), so racing them is sound and collapses the worst case from
+ // the sum of the three backstops to ~max(...). Each drain keeps its
+ // OWN inner `tokio::time::timeout`, so it still yields its own
+ // per-coordinator `CoordinatorThreadStatus` — a single outer timeout
+ // around the whole join! would flatten all three to `Timeout` and
+ // lose that detail.
+ let drain_platform_address = async {
tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
.await
- .unwrap_or(CoordinatorThreadStatus::Timeout);
-
- let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
- .await
- .unwrap_or(CoordinatorThreadStatus::Timeout);
-
+ .unwrap_or(CoordinatorThreadStatus::Timeout)
+ };
+ let drain_identity = async {
+ tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
+ .await
+ .unwrap_or(CoordinatorThreadStatus::Timeout)
+ };
#[cfg(feature = "shielded")]
- let shielded_sync = {
- let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
+ let drain_shielded = async {
+ tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
.await
- .unwrap_or(CoordinatorThreadStatus::Timeout);
- Some(r)
+ .unwrap_or(CoordinatorThreadStatus::Timeout)
+ };
+
+ #[cfg(feature = "shielded")]
+ let (platform_address_sync, identity_sync, shielded_sync) = {
+ let (p, i, s) = tokio::join!(drain_platform_address, drain_identity, drain_shielded);
+ (p, i, Some(s))
};
#[cfg(not(feature = "shielded"))]
- let shielded_sync = None;
+ let (platform_address_sync, identity_sync, shielded_sync) = {
+ let (p, i) = tokio::join!(drain_platform_address, drain_identity);
+ (p, i, None)
+ };
// The event adapter is a tokio task (it sinks the coordinators'
// stores), so cancel + join it last — after the loops feeding it
From 8c528116a30ac0cb5173236c0fd83d12a2cabc41 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:22:19 +0200
Subject: [PATCH 17/29] feat(dash-async): add shared ThreadRegistry
worker-lifecycle engine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Centralizes the previously-triplicated background-worker lifecycle —
generation-match exit epilogue, restart reap-or-park, orphan drain — into
one tested engine in the shared dash-async crate, generic over a worker
key and supporting both OS-thread (`!Send` block_on loops) and tokio-task
workers.
Makes two confirmed bugs impossible by construction:
- F1: quiesce/join paths take `&self`; the live JoinHandle stays owned by
the slot and is never moved into a cancellable future's frame. A
dropped/timed-out quiesce re-parks the handle into orphans (Timeout),
never drop-and-detach to a clean NotRunning.
- F2: any_alive() is the single liveness gate spanning live slots AND
parked orphans, so store-wiping paths can refuse while a prior thread
is alive.
Weight-ordered shutdown drains tiers ascending, concurrently within a
tier. WorkerStatus variants are byte-identical to the wallet's
CoordinatorThreadStatus for a stable FFI mapping.
Adds the full registry test suite (TC-001/001b/003-014, F1 shutdown-path
GAP-006, compile-fail DrainHook Send check, default-config and idempotent-
shutdown gaps). 22 tests + 1 doctest green.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
Cargo.lock | 2 +
packages/rs-dash-async/Cargo.toml | 4 +-
packages/rs-dash-async/src/lib.rs | 11 +-
packages/rs-dash-async/src/registry.rs | 1257 ++++++++++++++++++++++++
4 files changed, 1272 insertions(+), 2 deletions(-)
create mode 100644 packages/rs-dash-async/src/registry.rs
diff --git a/Cargo.lock b/Cargo.lock
index 1faa308a83..2108bed826 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1617,8 +1617,10 @@ dependencies = [
name = "dash-async"
version = "4.0.0-rc.2"
dependencies = [
+ "futures",
"thiserror 2.0.18",
"tokio",
+ "tokio-util",
"tracing",
]
diff --git a/packages/rs-dash-async/Cargo.toml b/packages/rs-dash-async/Cargo.toml
index 26e2c8fdeb..69d180e568 100644
--- a/packages/rs-dash-async/Cargo.toml
+++ b/packages/rs-dash-async/Cargo.toml
@@ -13,6 +13,8 @@ tracing = "0.1.41"
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
tokio = { version = "1.40", features = ["rt", "rt-multi-thread", "time", "net"] }
+tokio-util = { version = "0.7.12" }
+futures = { version = "0.3.30" }
[dev-dependencies]
-tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync"] }
+tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync", "time"] }
diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs
index 3edcf00daa..1ce0820359 100644
--- a/packages/rs-dash-async/src/lib.rs
+++ b/packages/rs-dash-async/src/lib.rs
@@ -3,10 +3,19 @@
//! Provides [`block_on`] -- a function that bridges async futures into sync code,
//! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread).
//!
-//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets.
+//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets,
+//! and [`ThreadRegistry`] — a shared lifecycle engine for background OS-thread / tokio-task
+//! workers (start, cancel, weight-ordered quiesce + join, orphan reap).
mod atomic;
mod block_on;
+#[cfg(not(target_arch = "wasm32"))]
+mod registry;
pub use atomic::AtomicFlagGuard;
pub use block_on::{block_on, AsyncError};
+#[cfg(not(target_arch = "wasm32"))]
+pub use registry::{
+ DrainHook, RegistryKey, ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig,
+ WorkerStatus, DEFAULT_JOIN_BUDGET, DEFAULT_REAP_BACKSTOP,
+};
diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
new file mode 100644
index 0000000000..e7cd835cd8
--- /dev/null
+++ b/packages/rs-dash-async/src/registry.rs
@@ -0,0 +1,1257 @@
+//! Shared lifecycle engine for background workers (`ThreadRegistry`).
+//!
+//! Centralizes the dangerous, previously-triplicated 80% of a background
+//! worker's lifecycle — the generation-match exit epilogue, the
+//! reap-or-park of a restarted worker's prior thread, and the orphan
+//! drain — into one tested place, while deliberately leaving the
+//! domain-specific 20% (the "is a pass in flight?" drain barrier) to the
+//! consumer as a [`DrainHook`].
+//!
+//! Two worker kinds are supported:
+//! - [`start_thread`](ThreadRegistry::start_thread) — a dedicated OS
+//! thread, for loops that `block_on` `!Send` futures internally (the
+//! `!Send` value never crosses the spawn boundary; the body itself is
+//! `Send`).
+//! - [`start_task`](ThreadRegistry::start_task) — a tokio task, for
+//! `Send` futures.
+//!
+//! # Why F1 and F2 cannot recur
+//!
+//! - **F1** (timeout-dropped quiesce detaches a live thread): every join
+//! path takes `&self`; the live join handle stays owned by the slot
+//! and is never moved into a cancellable future's frame. A
+//! dropped/timed-out [`quiesce`](ThreadRegistry::quiesce) therefore
+//! cannot drop-and-detach the handle — on timeout (or on an external
+//! drop) the handle is deterministically re-parked into the orphan
+//! list, and the slot reports [`WorkerStatus::Timeout`], never a clean
+//! `NotRunning`.
+//! - **F2** (store wipe races a parked prior-generation thread):
+//! orphans live in the registry and [`any_alive`](ThreadRegistry::any_alive)
+//! is the single liveness gate spanning live slots **and** parked
+//! orphans. Every store-wiping path consults it, so a parked
+//! still-live thread blocks the wipe.
+
+use std::collections::BTreeMap;
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+use futures::future::FutureExt;
+use tokio::runtime::RuntimeFlavor;
+use tokio_util::sync::CancellationToken;
+
+// ---------------------------------------------------------------------
+// Key & weight
+// ---------------------------------------------------------------------
+
+/// Worker identity. A wallet supplies a fixed enum; rs-dapi a generated
+/// id. Blanket-implemented — consumers just derive the listed bounds on
+/// their own key type.
+pub trait RegistryKey: Copy + Ord + Eq + std::fmt::Debug + Send + Sync + 'static {}
+impl<T: Copy + Ord + Eq + std::fmt::Debug + Send + Sync + 'static> RegistryKey for T {}
+
+/// Teardown order. Lower weights drain first; equal weights drain
+/// concurrently within a tier. Default `0`.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default)]
+pub struct ShutdownWeight(pub i32);
+
+// ---------------------------------------------------------------------
+// Status
+// ---------------------------------------------------------------------
+
+/// Terminal status of one worker. Variant set and payloads are
+/// byte-identical to the wallet's `CoordinatorThreadStatus`, which is
+/// constructed from this via `From` so the FFI surface stays stable.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum WorkerStatus {
+ /// The loop exited and its thread/task joined cleanly.
+ Ok,
+ /// A tokio task ended for a non-panic, non-clean reason (cancelled /
+ /// aborted at the runtime level). Carries a reason when available.
+ /// Only the `Task` kind can produce this; an OS thread never does.
+ Stopped(Option<String>),
+ /// The thread/task panicked; carries the best-effort panic message.
+ Panicked(String),
+ /// The managed join exceeded this worker's `join_budget`. The live
+ /// handle was re-parked into the orphan list — UAF-safe, non-clean.
+ Timeout,
+ /// A parked orphan was still alive after the reap grace — UAF-safe,
+ /// non-clean.
+ Detached,
+ /// No thread/task was running to join — never started, or already
+ /// joined by a prior teardown.
+ NotRunning,
+ /// Infrastructural join failure that is neither a timeout nor a
+ /// panic (unreachable in normal operation).
+ Error(String),
+}
+
+impl WorkerStatus {
+ /// `true` only for a fully clean outcome: joined normally (`Ok`) or
+ /// never ran (`NotRunning`).
+ pub fn is_clean(&self) -> bool {
+ matches!(self, Self::Ok | Self::NotRunning)
+ }
+}
+
+/// Aggregate result of [`ThreadRegistry::shutdown`].
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct ShutdownReport<K: RegistryKey> {
+ /// Per-worker terminal status, keyed by worker id.
+ pub per_worker: BTreeMap<K, WorkerStatus>,
+ /// Number of parked orphans still alive at the reap deadline.
+ pub detached: usize,
+}
+
+impl<K: RegistryKey> ShutdownReport<K> {
+ /// `true` only when every per-worker status is clean and no orphan
+ /// survived the reap.
+ pub fn all_clean(&self) -> bool {
+ self.detached == 0 && self.per_worker.values().all(WorkerStatus::is_clean)
+ }
+}
+
+// ---------------------------------------------------------------------
+// Per-worker registration options
+// ---------------------------------------------------------------------
+
+/// Async drain hook the registry awaits **before** cancelling a worker,
+/// in weight order. The domain barrier (raise a `quiescing` gate, wait
+/// out an in-flight pass) lives here, supplied by the consumer — the
+/// registry never owns domain semantics.
+///
+/// The captured state must be `Send + Sync`; a `!Send` capture does not
+/// compile as a `DrainHook`:
+///
+/// ```compile_fail
+/// use std::rc::Rc;
+/// use std::sync::Arc;
+/// use dash_async::DrainHook;
+/// let rc = Rc::new(42u32); // !Send
+/// let _hook: DrainHook =
+/// Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) });
+/// ```
+pub type DrainHook =
+ Arc<dyn Fn() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send + Sync>;
+
+/// Default managed-join budget when a [`WorkerConfig`] does not override
+/// it. Pinned so an accidental change surfaces in tests.
+pub const DEFAULT_JOIN_BUDGET: Duration = Duration::from_secs(30);
+
+/// Default orphan reap backstop (start-time reap and shutdown grace).
+pub const DEFAULT_REAP_BACKSTOP: Duration = Duration::from_secs(1);
+
+/// Per-worker registration options.
+pub struct WorkerConfig {
+ /// Teardown tier; lower drains first, equal weights concurrently.
+ pub weight: ShutdownWeight,
+ /// Optional drain barrier awaited before cancellation.
+ pub drain: Option<DrainHook>,
+ /// Managed-join timeout for this worker.
+ pub join_budget: Duration,
+}
+
+impl Default for WorkerConfig {
+ fn default() -> Self {
+ Self {
+ weight: ShutdownWeight::default(),
+ drain: None,
+ join_budget: DEFAULT_JOIN_BUDGET,
+ }
+ }
+}
+
+// ---------------------------------------------------------------------
+// Internal handle + slot state
+// ---------------------------------------------------------------------
+
+/// A live worker's join handle. Kept owned by its slot so a cancellable
+/// caller can never move it into a future frame and detach it on drop.
+enum WorkerHandle {
+ OsThread(std::thread::JoinHandle<()>),
+ Task(tokio::task::JoinHandle<()>),
+}
+
+impl WorkerHandle {
+ fn is_finished(&self) -> bool {
+ match self {
+ WorkerHandle::OsThread(h) => h.is_finished(),
+ WorkerHandle::Task(h) => h.is_finished(),
+ }
+ }
+
+ /// Classify a **finished** handle. Kind-dispatched (R3): an OS thread
+ /// yields only `Ok` / `Panicked`; a task can also yield `Stopped`
+ /// (cancelled / aborted at the runtime level).
+ fn classify(self) -> WorkerStatus {
+ match self {
+ WorkerHandle::OsThread(j) => match j.join() {
+ Ok(()) => WorkerStatus::Ok,
+ Err(payload) => WorkerStatus::Panicked(panic_message(payload)),
+ },
+ WorkerHandle::Task(j) => match j.now_or_never() {
+ Some(Ok(())) => WorkerStatus::Ok,
+ Some(Err(e)) if e.is_panic() => {
+ WorkerStatus::Panicked(panic_message(e.into_panic()))
+ }
+ Some(Err(e)) => WorkerStatus::Stopped(Some(e.to_string())),
+ // Only ever called on a finished handle, so a finished
+ // task is always ready; this arm is defensive.
+ None => WorkerStatus::Error("task handle not ready at join".to_string()),
+ },
+ }
+ }
+}
+
+/// Best-effort extraction of a panic message (`&str` / `String` cases).
+fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
+ if let Some(s) = payload.downcast_ref::<&str>() {
+ (*s).to_string()
+ } else if let Some(s) = payload.downcast_ref::<String>() {
+ s.clone()
+ } else {
+ "".to_string()
+ }
+}
+
+/// One key's slot. The entry is created on first start and never removed,
+/// so `generation` stays monotonic across the key's whole lifetime — a
+/// parked prior-generation thread can therefore always tell that its
+/// generation is stale. `cancel.is_some()` is the running indicator;
+/// `handle` is the join handle, reaped by the next start or by quiesce.
+struct SlotState {
+ generation: u64,
+ cancel: Option<CancellationToken>,
+ handle: Option<WorkerHandle>,
+ weight: ShutdownWeight,
+ drain: Option<DrainHook>,
+ join_budget: Duration,
+}
+
+impl SlotState {
+ fn dormant() -> Self {
+ Self {
+ generation: 0,
+ cancel: None,
+ handle: None,
+ weight: ShutdownWeight::default(),
+ drain: None,
+ join_budget: DEFAULT_JOIN_BUDGET,
+ }
+ }
+}
+
+// ---------------------------------------------------------------------
+// The registry
+// ---------------------------------------------------------------------
+
+/// Shared lifecycle engine for background workers. See the module docs.
+pub struct ThreadRegistry<K: RegistryKey> {
+ slots: Mutex<BTreeMap<K, SlotState>>,
+ orphans: Mutex<Vec<WorkerHandle>>,
+ reap_backstop: Duration,
+}
+
+impl<K: RegistryKey> ThreadRegistry<K> {
+ /// New registry with the default reap backstop ([`DEFAULT_REAP_BACKSTOP`]).
+ pub fn new() -> Arc<Self> {
+ Self::with_reap_backstop(DEFAULT_REAP_BACKSTOP)
+ }
+
+ /// New registry with an explicit orphan reap backstop (the wallet
+ /// uses 1s — the same grace separates "finishing" from "wedged").
+ pub fn with_reap_backstop(backstop: Duration) -> Arc<Self> {
+ Arc::new(Self {
+ slots: Mutex::new(BTreeMap::new()),
+ orphans: Mutex::new(Vec::new()),
+ reap_backstop: backstop,
+ })
+ }
+
+ /// Start an OS-thread worker for `!Send` loops. `body` runs on a
+ /// fresh `std::thread` and may build and `block_on` `!Send` futures
+ /// internally — the `!Send` value never crosses the spawn boundary
+ /// (`body` itself is `Send`). Starting a key that already has a live
+ /// worker is a no-op; a key whose prior thread has not been reaped is
+ /// reaped-or-parked first (the restart-reap path).
+ ///
+ /// **Requires a multi-thread runtime**: the worker drives its loop
+ /// via `Handle::block_on` and needs the shared timer/IO driver.
+ pub fn start_thread<F>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
+ where
+ F: FnOnce(CancellationToken) + Send + 'static,
+ {
+ Self::assert_multi_thread("start_thread");
+ let prior = {
+ let mut slots = self.lock_slots();
+ let slot = slots.entry(key).or_insert_with(SlotState::dormant);
+ if slot.cancel.is_some() {
+ return;
+ }
+ // Take the prior handle to reap below; bump generation and
+ // install the new token under this one lock so a prior
+ // thread's epilogue observes the post-swap generation.
+ let prior = slot.handle.take();
+ let token = CancellationToken::new();
+ slot.cancel = Some(token.clone());
+ slot.generation += 1;
+ let my_gen = slot.generation;
+ slot.weight = cfg.weight;
+ slot.drain = cfg.drain;
+ slot.join_budget = cfg.join_budget;
+
+ let reg = Arc::clone(self);
+ let body_token = token;
+ let join = std::thread::Builder::new()
+ .name(format!("tr-worker-{key:?}"))
+ .spawn(move || {
+ body(body_token);
+ reg.run_epilogue(key, my_gen);
+ })
+ .expect("failed to spawn registry worker thread");
+ // Store the handle while still under the slot lock; the guard
+ // is released at the end of this block, BEFORE the reap below
+ // (R1: store handle -> drop guard -> THEN reap-or-park).
+ slot.handle = Some(WorkerHandle::OsThread(join));
+ prior
+ };
+
+ // The prior thread was cancellation-signalled by a preceding
+ // cancel(); with the slot lock released its epilogue completes
+ // promptly and the join lands in milliseconds. The backstop fires
+ // only on a genuine wedge, in which case the still-live handle is
+ // parked (not dropped) so teardown can account for it.
+ self.reap_prior_or_park(prior, key);
+ }
+
+ /// Start a tokio-task worker for `Send` futures. Same restart-reap
+ /// semantics as [`start_thread`](Self::start_thread); does not require
+ /// a multi-thread runtime.
+ pub fn start_task<F, Fut>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
+ where
+ F: FnOnce(CancellationToken) -> Fut + Send + 'static,
+ Fut: Future<Output = ()> + Send + 'static,
+ {
+ let prior = {
+ let mut slots = self.lock_slots();
+ let slot = slots.entry(key).or_insert_with(SlotState::dormant);
+ if slot.cancel.is_some() {
+ return;
+ }
+ let prior = slot.handle.take();
+ let token = CancellationToken::new();
+ slot.cancel = Some(token.clone());
+ slot.generation += 1;
+ let my_gen = slot.generation;
+ slot.weight = cfg.weight;
+ slot.drain = cfg.drain;
+ slot.join_budget = cfg.join_budget;
+
+ let reg = Arc::clone(self);
+ let body_token = token;
+ let join = tokio::spawn(async move {
+ body(body_token).await;
+ reg.run_epilogue(key, my_gen);
+ });
+ slot.handle = Some(WorkerHandle::Task(join));
+ prior
+ };
+ self.reap_prior_or_park(prior, key);
+ }
+
+ /// Whether a worker is currently registered and running for `key`.
+ pub fn is_running(&self, key: K) -> bool {
+ self.lock_slots()
+ .get(&key)
+ .map(|s| s.cancel.is_some())
+ .unwrap_or(false)
+ }
+
+ /// Signal-only cancellation of one worker (was `stop()`).
+ pub fn cancel(&self, key: K) {
+ if let Some(slot) = self.lock_slots().get_mut(&key) {
+ if let Some(token) = slot.cancel.take() {
+ token.cancel();
+ }
+ }
+ }
+
+ /// Signal-only cancellation of every registered worker.
+ pub fn cancel_all(&self) {
+ for slot in self.lock_slots().values_mut() {
+ if let Some(token) = slot.cancel.take() {
+ token.cancel();
+ }
+ }
+ }
+
+ /// Await this worker's drain hook, cancel it, then join within its
+ /// budget. The live handle is owned by the slot and is **never** moved
+ /// into this future's frame, so a dropped/timed-out call cannot detach
+ /// it; on the managed timeout — or if this future is dropped
+ /// mid-poll — the handle is re-parked into the orphan list. [F1 FIX]
+ pub async fn quiesce(&self, key: K) -> WorkerStatus {
+ // Snapshot the drain hook + budget, and bail early if nothing is
+ // registered for this key.
+ let (drain, budget) = {
+ let slots = self.lock_slots();
+ match slots.get(&key) {
+ Some(s) if s.cancel.is_some() || s.handle.is_some() => {
+ (s.drain.clone(), s.join_budget)
+ }
+ _ => return WorkerStatus::NotRunning,
+ }
+ };
+
+ // R2: gate-before-cancel — fully await the drain hook before the
+ // cancel signal is observed.
+ if let Some(drain) = drain {
+ drain().await;
+ }
+
+ // Signal-only cancel.
+ if let Some(slot) = self.lock_slots().get_mut(&key) {
+ if let Some(token) = slot.cancel.take() {
+ token.cancel();
+ }
+ }
+
+ // Poll-join within budget. The re-park guard moves the slot's
+ // still-live handle into orphans if this future is dropped before
+ // the loop finishes — the handle is never owned by this frame.
+ let _repark = Repark { reg: self, key };
+ let deadline = Instant::now() + budget;
+ loop {
+ enum Step {
+ Classify(WorkerHandle),
+ Park(WorkerHandle),
+ NotRunning,
+ Wait,
+ }
+ let step = {
+ let mut slots = self.lock_slots();
+ match slots.get_mut(&key) {
+ None => Step::NotRunning,
+ Some(slot) => match slot.handle.take_if(|h| h.is_finished()) {
+ Some(h) => Step::Classify(h),
+ None if slot.handle.is_none() => Step::NotRunning,
+ None if Instant::now() >= deadline => {
+ Step::Park(slot.handle.take().expect("handle present"))
+ }
+ None => Step::Wait,
+ },
+ }
+ };
+ match step {
+ Step::Classify(h) => return h.classify(),
+ Step::Park(h) => {
+ self.lock_orphans().push(h);
+ return WorkerStatus::Timeout;
+ }
+ Step::NotRunning => return WorkerStatus::NotRunning,
+ Step::Wait => tokio::time::sleep(Duration::from_millis(5)).await,
+ }
+ }
+ }
+
+ /// Is any registered worker **or** parked orphan still alive?
+ /// Store-wiping paths must gate on this returning `false` before
+ /// destroying shared state. [F2 FIX]
+ pub fn any_alive(&self) -> bool {
+ {
+ let slots = self.lock_slots();
+ for slot in slots.values() {
+ if slot.cancel.is_some() {
+ return true;
+ }
+ if let Some(handle) = &slot.handle {
+ if !handle.is_finished() {
+ return true;
+ }
+ }
+ }
+ }
+ self.lock_orphans().iter().any(|h| !h.is_finished())
+ }
+
+ /// Reap parked orphans with a short grace; survivors are re-parked and
+ /// reported as [`WorkerStatus::Detached`] (idempotent retry).
+ pub async fn reap_orphans(&self, grace: Duration) -> WorkerStatus {
+ self.reap_orphans_impl(grace).await.0
+ }
+
+ /// Weight-ordered teardown: ascending tier by tier, each worker's
+ /// (drain-hook -> cancel -> join) run concurrently within a tier;
+ /// orphan reap runs last. **Requires a multi-thread runtime.**
+ pub async fn shutdown(&self) -> ShutdownReport {
+ Self::assert_multi_thread("shutdown");
+
+ // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in
+ // ascending weight order, giving the lower-first drain.
+ let tiers: BTreeMap<ShutdownWeight, Vec<K>> = {
+ let slots = self.lock_slots();
+ let mut tiers: BTreeMap<ShutdownWeight, Vec<K>> = BTreeMap::new();
+ for (key, slot) in slots.iter() {
+ tiers.entry(slot.weight).or_default().push(*key);
+ }
+ tiers
+ };
+
+ let mut per_worker = BTreeMap::new();
+ for (_weight, keys) in tiers {
+ // Drain every worker in this tier concurrently: each
+ // quiesce() drives its own drain-hook -> cancel -> join, and
+ // `join_all` polls them on one task so their drain hooks
+ // interleave (equal-weight concurrency).
+ let drained = keys.into_iter().map(|key| async move { (key, self.quiesce(key).await) });
+ for (key, status) in futures::future::join_all(drained).await {
+ per_worker.insert(key, status);
+ }
+ }
+
+ // Account for parked orphans last.
+ let (_status, detached) = self.reap_orphans_impl(self.reap_backstop).await;
+ ShutdownReport {
+ per_worker,
+ detached,
+ }
+ }
+
+ // -----------------------------------------------------------------
+ // Internal helpers
+ // -----------------------------------------------------------------
+
+ fn lock_slots(&self) -> std::sync::MutexGuard<'_, BTreeMap<K, SlotState>> {
+ self.slots.lock().unwrap_or_else(|e| e.into_inner())
+ }
+
+ fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<WorkerHandle>> {
+ self.orphans.lock().unwrap_or_else(|e| e.into_inner())
+ }
+
+ fn assert_multi_thread(ctx: &str) {
+ assert!(
+ matches!(
+ tokio::runtime::Handle::current().runtime_flavor(),
+ RuntimeFlavor::MultiThread
+ ),
+ "ThreadRegistry::{ctx}() requires a multi-thread Tokio runtime: an \
+ OS-thread worker drives its loop via Handle::block_on and needs the \
+ runtime's timer/IO driver, but a current_thread runtime can only \
+ drive one block_on at a time"
+ );
+ }
+
+ /// Gen-gated exit epilogue, run on the worker after its body returns:
+ /// clear this slot's running flag only if a newer start has not since
+ /// installed a replacement.
+ fn run_epilogue(&self, key: K, my_gen: u64) {
+ if let Some(slot) = self.lock_slots().get_mut(&key) {
+ if slot.generation == my_gen {
+ slot.cancel = None;
+ }
+ }
+ }
+
+ /// Reap a restarted key's prior worker — or park it if it is genuinely
+ /// wedged past the reap backstop. Must be called with no registry lock
+ /// held (it spins synchronously for an OS thread).
+ fn reap_prior_or_park(&self, prior: Option<WorkerHandle>, key: K) {
+ let Some(handle) = prior else {
+ return;
+ };
+ match handle {
+ WorkerHandle::OsThread(h) => {
+ let deadline = Instant::now() + self.reap_backstop;
+ loop {
+ if h.is_finished() {
+ let _ = h.join();
+ return;
+ }
+ if Instant::now() >= deadline {
+ tracing::warn!(
+ ?key,
+ backstop = ?self.reap_backstop,
+ "prior worker thread did not finish within the reap \
+ backstop after cancellation; parking it as an orphan \
+ for teardown to join rather than detaching it"
+ );
+ self.lock_orphans().push(WorkerHandle::OsThread(h));
+ return;
+ }
+ std::thread::sleep(Duration::from_millis(5));
+ }
+ }
+ // A task can't be joined synchronously here; park a still-live
+ // one for async reap. A finished one is dropped (detaching a
+ // finished task is a no-op).
+ task => {
+ if !task.is_finished() {
+ self.lock_orphans().push(task);
+ }
+ }
+ }
+ }
+
+ /// Drain the orphan list, polling until `grace`. Returns the terminal
+ /// status and the number of survivors re-parked for an idempotent
+ /// retry.
+ async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) {
+ let mut pending: Vec<WorkerHandle> = {
+ let mut guard = self.lock_orphans();
+ std::mem::take(&mut *guard)
+ };
+ if pending.is_empty() {
+ return (WorkerStatus::Ok, 0);
+ }
+
+ let deadline = Instant::now() + grace;
+ // Keep the first non-clean terminal status; a live survivor still
+ // takes precedence at the deadline.
+ let mut non_clean: Option<WorkerStatus> = None;
+ loop {
+ let mut still_live = Vec::with_capacity(pending.len());
+ for handle in pending.drain(..) {
+ if handle.is_finished() {
+ let status = handle.classify();
+ if !status.is_clean() {
+ non_clean.get_or_insert(status);
+ }
+ } else {
+ still_live.push(handle);
+ }
+ }
+ pending = still_live;
+
+ if pending.is_empty() {
+ return (non_clean.unwrap_or(WorkerStatus::Ok), 0);
+ }
+ if Instant::now() >= deadline {
+ let survivors = pending.len();
+ self.lock_orphans().extend(pending);
+ return (WorkerStatus::Detached, survivors);
+ }
+ tokio::time::sleep(Duration::from_millis(5)).await;
+ }
+ }
+
+ /// Test-only seam: park a raw thread handle as an orphan. Used by
+ /// cross-crate regression tests (e.g. the wallet's F2 gate) that must
+ /// inject a wedged prior-generation thread without driving the full
+ /// restart-reap path.
+ #[doc(hidden)]
+ pub fn park_orphan_for_test(&self, handle: std::thread::JoinHandle<()>) {
+ self.lock_orphans().push(WorkerHandle::OsThread(handle));
+ }
+}
+
+/// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future
+/// is dropped before it finishes (e.g. an outer timeout fires), this moves
+/// the slot's still-live handle into the orphan list instead of letting it
+/// be dropped-and-detached. On normal completion the handle has already
+/// been taken from the slot, so this is a no-op.
+struct Repark<'a, K: RegistryKey> {
+ reg: &'a ThreadRegistry<K>,
+ key: K,
+}
+
+impl<K: RegistryKey> Drop for Repark<'_, K> {
+ fn drop(&mut self) {
+ // Take the handle under the slot lock, release it, then push to
+ // orphans — never nest the two locks.
+ let handle = self
+ .reg
+ .lock_slots()
+ .get_mut(&self.key)
+ .and_then(|slot| slot.handle.take());
+ if let Some(handle) = handle {
+ self.reg.lock_orphans().push(handle);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use std::panic::{catch_unwind, AssertUnwindSafe};
+ use std::sync::atomic::{AtomicBool, Ordering};
+ use std::sync::mpsc;
+ use tokio::runtime::{Builder, Handle};
+ use tokio::sync::Barrier;
+
+ type Reg = Arc<ThreadRegistry<&'static str>>;
+
+ /// Start an OS-thread worker that exits cleanly when cancelled. The
+ /// runtime handle is captured from the caller's context (the worker
+ /// thread is not itself a tokio worker, so it can't fetch its own).
+ fn start_clean(reg: &Reg, key: &'static str, cfg: WorkerConfig) {
+ let handle = Handle::current();
+ reg.start_thread(key, cfg, move |cancel| {
+ handle.block_on(async move { cancel.cancelled().await });
+ });
+ }
+
+ /// Body for a worker wedged in a non-yielding section: blocks on a
+ /// channel and ignores its cancellation token (stands in for a thread
+ /// stuck in a `Drop` that never observes cancel).
+ fn wedged_body(rx: mpsc::Receiver<()>) -> impl FnOnce(CancellationToken) + Send + 'static {
+ move |_cancel| {
+ let _ = rx.recv();
+ }
+ }
+
+ fn orphan_len(reg: &Reg) -> usize {
+ reg.lock_orphans().len()
+ }
+
+ // ----- Group 1: F1 regression -------------------------------------
+
+ /// TC-001 — a `quiesce` whose outer future is dropped (a tiny enclosing
+ /// timeout) must re-park the live handle, never drop-and-detach it. The
+ /// slot is cleared (`is_running == false`) but the handle lives in
+ /// orphans and `any_alive()` stays true.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc001_quiesce_drop_reparks_handle_not_detach() {
+ let reg = ThreadRegistry::<&str>::new();
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+ reg.start_thread("alpha", WorkerConfig::default(), wedged_body(release_rx));
+ assert!(reg.is_running("alpha"));
+
+ // The wedged worker never observes cancel, so the internal 30s
+ // budget can't fire here; the tiny outer timeout drops the quiesce
+ // future mid-poll. A naive by-value-into-future impl would detach
+ // the handle (orphans empty, any_alive false); the fix re-parks it.
+ let result =
+ tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await;
+ assert!(result.is_err(), "outer timeout must fire on the wedged worker");
+
+ assert!(reg.any_alive(), "re-parked handle keeps any_alive true");
+ assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)");
+ assert_eq!(orphan_len(&reg), 1, "handle was re-parked, not detached");
+ assert!(!WorkerStatus::Timeout.is_clean());
+
+ // Release + reap: the orphan joins cleanly and liveness clears.
+ release_tx.send(()).unwrap();
+ assert_eq!(
+ reg.reap_orphans(Duration::from_secs(2)).await,
+ WorkerStatus::Ok
+ );
+ assert!(!reg.any_alive());
+ }
+
+ /// TC-001b — internal-budget variant: a wedged worker with a tiny
+ /// `join_budget` makes `quiesce` itself time out, re-park, and return
+ /// `Timeout` (no outer drop involved).
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc001b_quiesce_internal_budget_timeout_reparks() {
+ let reg = ThreadRegistry::<&str>::new();
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+ let cfg = WorkerConfig {
+ join_budget: Duration::from_millis(50),
+ ..WorkerConfig::default()
+ };
+ reg.start_thread("alpha", cfg, wedged_body(release_rx));
+
+ let status = reg.quiesce("alpha").await;
+ assert_eq!(status, WorkerStatus::Timeout);
+ assert_eq!(orphan_len(&reg), 1);
+ assert!(reg.any_alive());
+ assert!(!reg.is_running("alpha"));
+
+ release_tx.send(()).unwrap();
+ assert_eq!(
+ reg.reap_orphans(Duration::from_secs(2)).await,
+ WorkerStatus::Ok
+ );
+ assert!(!reg.any_alive());
+ }
+
+ /// GAP-006 — the F1 scenario via the `shutdown()` path: a wedged worker
+ /// with a tiny budget surfaces as `Timeout` in the report, its handle
+ /// is re-parked (`detached == 1`, `any_alive`), and the result is
+ /// non-clean — never a clean detach.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn gap006_shutdown_path_reparks_wedged_worker() {
+ let reg = ThreadRegistry::<&str>::new();
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+ let cfg = WorkerConfig {
+ join_budget: Duration::from_millis(50),
+ ..WorkerConfig::default()
+ };
+ reg.start_thread("alpha", cfg, wedged_body(release_rx));
+
+ let report = tokio::time::timeout(Duration::from_secs(10), reg.shutdown())
+ .await
+ .expect("shutdown must complete within bound");
+ assert_eq!(report.per_worker.get("alpha"), Some(&WorkerStatus::Timeout));
+ assert_eq!(report.detached, 1, "wedged handle re-parked, survived reap");
+ assert!(!report.all_clean());
+ assert!(reg.any_alive());
+
+ // Cleanup.
+ release_tx.send(()).unwrap();
+ let _ = reg.reap_orphans(Duration::from_secs(5)).await;
+ assert!(!reg.any_alive());
+ }
+
+ // ----- Group 3: registry unit suite -------------------------------
+
+ /// TC-003 — a slow prior-generation thread's epilogue must NOT clear a
+ /// newer generation's token. Restarting reaps the prior generation
+ /// fully (its epilogue runs); the new generation stays tracked.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc003_generation_match_epilogue_preserves_new_token() {
+ let reg = ThreadRegistry::<&str>::new();
+ start_clean(&reg, "beta", WorkerConfig::default()); // gen 1
+ assert!(reg.is_running("beta"));
+
+ // Cancel gen 1, then restart. start_thread's reap joins gen 1
+ // (running its gen-gated epilogue) before returning, so this is
+ // deterministic: if the epilogue ignored generation it would have
+ // cleared gen 2's token during that join.
+ reg.cancel("beta");
+ start_clean(&reg, "beta", WorkerConfig::default()); // gen 2
+
+ assert!(
+ reg.is_running("beta"),
+ "gen-2 token must survive gen-1's epilogue"
+ );
+ assert_eq!(reg.quiesce("beta").await, WorkerStatus::Ok);
+ }
+
+ /// TC-004 — a naturally-finished prior thread is joined cleanly on
+ /// restart, with no parking.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc004_restart_reaps_finished_prior_without_parking() {
+ let reg = ThreadRegistry::<&str>::new();
+ start_clean(&reg, "gamma", WorkerConfig::default());
+ // Cancel so the prior exits, then restart: the reap must join it,
+ // not park it.
+ reg.cancel("gamma");
+ start_clean(&reg, "gamma", WorkerConfig::default());
+ assert_eq!(orphan_len(&reg), 0, "finished prior was joined, not parked");
+ assert!(reg.is_running("gamma"));
+ assert_eq!(reg.quiesce("gamma").await, WorkerStatus::Ok);
+ }
+
+ /// TC-005 — a prior thread wedged past the reap backstop is parked in
+ /// orphans (not dropped), then drained after release.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc005_restart_parks_wedged_prior() {
+ let reg = ThreadRegistry::with_reap_backstop(Duration::from_millis(100));
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+
+ // gen 1: wedged (ignores cancel).
+ reg.start_thread("delta", WorkerConfig::default(), wedged_body(release_rx));
+ reg.cancel("delta");
+
+ // gen 2: clean. The restart reaps gen 1 — wedged past the 100ms
+ // backstop, so it is parked. Run off the runtime workers since the
+ // reap spins synchronously.
+ let reg_for_start = Arc::clone(&reg);
+ let parent = Handle::current();
+ tokio::task::spawn_blocking(move || {
+ let handle = parent.clone();
+ reg_for_start.start_thread("delta", WorkerConfig::default(), move |cancel| {
+ handle.block_on(async move { cancel.cancelled().await });
+ });
+ })
+ .await
+ .unwrap();
+
+ assert_eq!(orphan_len(&reg), 1, "wedged prior parked, not dropped");
+ assert!(reg.any_alive());
+ assert!(reg.is_running("delta"), "gen-2 loop started");
+
+ // Release the wedged prior; reap drains it.
+ release_tx.send(()).unwrap();
+ assert_eq!(
+ reg.reap_orphans(Duration::from_secs(2)).await,
+ WorkerStatus::Ok
+ );
+ assert_eq!(orphan_len(&reg), 0);
+
+ // Cleanup gen 2.
+ assert_eq!(reg.quiesce("delta").await, WorkerStatus::Ok);
+ }
+
+ /// TC-006 — orphan drain: a survivor at the grace deadline is reported
+ /// `Detached` and re-parked; once released it reaps `Ok`.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc006_orphan_drain_detached_then_ok() {
+ let reg = ThreadRegistry::<&str>::new();
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+ let wedged = std::thread::spawn(move || {
+ let _ = release_rx.recv();
+ });
+ reg.park_orphan_for_test(wedged);
+
+ assert_eq!(
+ reg.reap_orphans(Duration::from_millis(50)).await,
+ WorkerStatus::Detached
+ );
+ assert_eq!(orphan_len(&reg), 1, "survivor re-parked for retry");
+ assert!(reg.any_alive());
+
+ release_tx.send(()).unwrap();
+ assert_eq!(
+ reg.reap_orphans(Duration::from_secs(2)).await,
+ WorkerStatus::Ok
+ );
+ assert_eq!(orphan_len(&reg), 0);
+ assert!(!reg.any_alive());
+ }
+
+ /// TC-007 — weight-ordered shutdown drains a lower tier before a higher
+ /// one.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc007_weight_ordered_shutdown_drains_low_first() {
+ let reg = ThreadRegistry::<&str>::new();
+ let log = Arc::new(Mutex::new(Vec::<&'static str>::new()));
+
+ let mk_hook = |tag: &'static str, log: Arc<Mutex<Vec<&'static str>>>| -> DrainHook {
+ Arc::new(move || {
+ let log = Arc::clone(&log);
+ Box::pin(async move {
+ log.lock().unwrap().push(tag);
+ })
+ })
+ };
+
+ start_clean(
+ &reg,
+ "w0",
+ WorkerConfig {
+ weight: ShutdownWeight(0),
+ drain: Some(mk_hook("w0", Arc::clone(&log))),
+ ..WorkerConfig::default()
+ },
+ );
+ start_clean(
+ &reg,
+ "w5",
+ WorkerConfig {
+ weight: ShutdownWeight(5),
+ drain: Some(mk_hook("w5", Arc::clone(&log))),
+ ..WorkerConfig::default()
+ },
+ );
+ start_clean(
+ &reg,
+ "w10",
+ WorkerConfig {
+ weight: ShutdownWeight(10),
+ drain: Some(mk_hook("w10", Arc::clone(&log))),
+ ..WorkerConfig::default()
+ },
+ );
+
+ let report = reg.shutdown().await;
+ assert!(report.all_clean());
+
+ let log = log.lock().unwrap();
+ let pos = |tag| log.iter().position(|t| *t == tag).unwrap();
+ assert!(pos("w0") < pos("w5"));
+ assert!(pos("w5") < pos("w10"));
+ }
+
+ /// TC-008 — equal-weight workers drain concurrently. A shared
+ /// `Barrier(2)` in both drain hooks would deadlock under sequential
+ /// draining (caught by the enclosing timeout); the event log proves
+ /// both arrived before either passed.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc008_equal_weight_drains_concurrently() {
+ let reg = ThreadRegistry::<&str>::new();
+ let log = Arc::new(Mutex::new(Vec::<&'static str>::new()));
+ let barrier = Arc::new(Barrier::new(2));
+
+ let mk_hook = |arrived: &'static str,
+ passed: &'static str,
+ log: Arc<Mutex<Vec<&'static str>>>,
+ barrier: Arc<Barrier>|
+ -> DrainHook {
+ Arc::new(move || {
+ let log = Arc::clone(&log);
+ let barrier = Arc::clone(&barrier);
+ Box::pin(async move {
+ log.lock().unwrap().push(arrived);
+ barrier.wait().await;
+ log.lock().unwrap().push(passed);
+ })
+ })
+ };
+
+ start_clean(
+ &reg,
+ "a",
+ WorkerConfig {
+ weight: ShutdownWeight(0),
+ drain: Some(mk_hook("a_arrived", "a_passed", Arc::clone(&log), Arc::clone(&barrier))),
+ ..WorkerConfig::default()
+ },
+ );
+ start_clean(
+ &reg,
+ "b",
+ WorkerConfig {
+ weight: ShutdownWeight(0),
+ drain: Some(mk_hook("b_arrived", "b_passed", Arc::clone(&log), Arc::clone(&barrier))),
+ ..WorkerConfig::default()
+ },
+ );
+
+ let report = tokio::time::timeout(Duration::from_secs(5), reg.shutdown())
+ .await
+ .expect("equal-weight drain must not deadlock (proves concurrency)");
+ assert!(report.all_clean());
+
+ let log = log.lock().unwrap();
+ let pos = |tag| log.iter().position(|t| *t == tag).unwrap();
+ let last_arrived = pos("a_arrived").max(pos("b_arrived"));
+ let first_passed = pos("a_passed").min(pos("b_passed"));
+ assert!(
+ last_arrived < first_passed,
+ "both hooks must reach the barrier before either passes: {log:?}"
+ );
+ }
+
+ /// TC-009 — `any_alive()` accounts for both live slots and orphans.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc009_any_alive_spans_slots_and_orphans() {
+ let reg = ThreadRegistry::<&str>::new();
+ start_clean(&reg, "alpha", WorkerConfig::default());
+ assert!(reg.any_alive());
+
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+ let wedged = std::thread::spawn(move || {
+ let _ = release_rx.recv();
+ });
+ reg.park_orphan_for_test(wedged);
+ assert!(reg.any_alive());
+
+ assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok);
+ assert!(reg.any_alive(), "orphan still contributes after slot drains");
+ assert!(!reg.is_running("alpha"));
+
+ release_tx.send(()).unwrap();
+ let _ = reg.reap_orphans(Duration::from_secs(2)).await;
+ assert!(!reg.any_alive());
+ }
+
+ /// TC-010 — `shutdown()` panics with a documented message on a
+ /// current-thread runtime (R4, variant B).
+ #[test]
+ fn tc010_shutdown_asserts_multi_thread_runtime() {
+ let rt = Builder::new_current_thread().enable_all().build().unwrap();
+ let reg = ThreadRegistry::<&str>::new();
+ let result = catch_unwind(AssertUnwindSafe(|| {
+ rt.block_on(async { reg.shutdown().await });
+ }));
+ let payload = result.expect_err("shutdown must panic on current_thread");
+ let msg = payload
+ .downcast_ref::<String>()
+ .map(String::as_str)
+ .or_else(|| payload.downcast_ref::<&str>().copied())
+ .unwrap_or("");
+ assert!(
+ msg.contains("multi-thread"),
+ "panic must name the runtime constraint, got: {msg}"
+ );
+ }
+
+ // ----- Group 4: DrainHook ordering --------------------------------
+
+ /// TC-011 — the drain hook is fully awaited before the cancel signal is
+ /// observed by the worker.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc011_drain_hook_completes_before_cancel() {
+ let reg = ThreadRegistry::<&str>::new();
+ let log = Arc::new(Mutex::new(Vec::<&'static str>::new()));
+
+ let log_hook = Arc::clone(&log);
+ let drain: DrainHook = Arc::new(move || {
+ let log = Arc::clone(&log_hook);
+ Box::pin(async move {
+ log.lock().unwrap().push("drain_hook_start");
+ tokio::time::sleep(Duration::from_millis(10)).await;
+ log.lock().unwrap().push("drain_hook_complete");
+ })
+ });
+
+ let log_worker = Arc::clone(&log);
+ let handle = Handle::current();
+ reg.start_thread(
+ "epsilon",
+ WorkerConfig {
+ drain: Some(drain),
+ ..WorkerConfig::default()
+ },
+ move |cancel| {
+ handle.block_on(async move {
+ cancel.cancelled().await;
+ log_worker.lock().unwrap().push("cancel_observed");
+ });
+ },
+ );
+
+ assert_eq!(reg.quiesce("epsilon").await, WorkerStatus::Ok);
+ assert!(!reg.is_running("epsilon"));
+
+ let log = log.lock().unwrap();
+ let pos = |tag| log.iter().position(|t| *t == tag).unwrap();
+ assert!(pos("drain_hook_start") < pos("drain_hook_complete"));
+ assert!(pos("drain_hook_complete") < pos("cancel_observed"));
+ }
+
+ /// TC-012 — a `quiesce` blocks in the drain hook until an `is_syncing`
+ /// barrier the hook polls falls, and only then cancels + joins.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc012_drain_hook_observes_barrier_before_join() {
+ let reg = ThreadRegistry::<&str>::new();
+ let is_syncing = Arc::new(AtomicBool::new(true));
+
+ let gate = Arc::clone(&is_syncing);
+ let drain: DrainHook = Arc::new(move || {
+ let gate = Arc::clone(&gate);
+ Box::pin(async move {
+ while gate.load(Ordering::Acquire) {
+ tokio::time::sleep(Duration::from_millis(5)).await;
+ }
+ })
+ });
+ start_clean(
+ &reg,
+ "zeta",
+ WorkerConfig {
+ drain: Some(drain),
+ ..WorkerConfig::default()
+ },
+ );
+
+ let quiesce_completed = Arc::new(AtomicBool::new(false));
+ let reg_q = Arc::clone(&reg);
+ let done = Arc::clone(&quiesce_completed);
+ let quiesce_task = tokio::spawn(async move {
+ let status = reg_q.quiesce("zeta").await;
+ done.store(true, Ordering::Release);
+ status
+ });
+
+ // While the barrier is held, quiesce must stay pending.
+ tokio::time::sleep(Duration::from_millis(50)).await;
+ assert!(
+ !quiesce_completed.load(Ordering::Acquire),
+ "quiesce must block while is_syncing is held"
+ );
+
+ // Release the barrier; quiesce drains, cancels, joins.
+ is_syncing.store(false, Ordering::Release);
+ let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task)
+ .await
+ .expect("quiesce must complete once the barrier falls")
+ .unwrap();
+ assert_eq!(status, WorkerStatus::Ok);
+ assert!(quiesce_completed.load(Ordering::Acquire));
+ }
+
+ // ----- Group 5: status classification -----------------------------
+
+ /// TC-013 — only the `Task` kind can classify as `Stopped` (from a
+ /// runtime-level cancel/abort JoinError); a cooperatively token-
+ /// cancelled task exits normally as `Ok`. Verifies the kind-dispatch
+ /// at the classification boundary.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc013_task_kind_classifies_stopped_and_ok() {
+ // Stopped: an aborted task yields a cancelled JoinError.
+ let aborted = tokio::spawn(std::future::pending::<()>());
+ aborted.abort();
+ while !aborted.is_finished() {
+ tokio::time::sleep(Duration::from_millis(1)).await;
+ }
+ let status = WorkerHandle::Task(aborted).classify();
+ assert!(matches!(status, WorkerStatus::Stopped(_)), "got {status:?}");
+ assert!(!status.is_clean());
+
+ // Ok: a cooperatively token-cancelled task returns normally.
+ let reg = ThreadRegistry::<&str>::new();
+ reg.start_task("task_a", WorkerConfig::default(), |cancel| async move {
+ cancel.cancelled().await;
+ });
+ assert_eq!(reg.quiesce("task_a").await, WorkerStatus::Ok);
+ assert!(!reg.is_running("task_a"));
+ }
+
+ /// TC-014 — an `OsThread` worker yields `Ok` (clean) or `Panicked`
+ /// (`&str` and `String` payloads), never `Stopped`.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn tc014_os_thread_ok_and_panicked_never_stopped() {
+ let reg = ThreadRegistry::<&str>::new();
+ start_clean(&reg, "os_clean", WorkerConfig::default());
+ let ok = reg.quiesce("os_clean").await;
+ assert_eq!(ok, WorkerStatus::Ok);
+ assert!(ok.is_clean());
+
+ // &str panic payload.
+ reg.start_thread("os_panic_str", WorkerConfig::default(), |_cancel| {
+ panic!("deliberate test panic");
+ });
+ match reg.quiesce("os_panic_str").await {
+ WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate test panic")),
+ other => panic!("expected Panicked, got {other:?}"),
+ }
+
+ // String panic payload.
+ reg.start_thread("os_panic_string", WorkerConfig::default(), |_cancel| {
+ std::panic::panic_any(String::from("deliberate string panic"));
+ });
+ match reg.quiesce("os_panic_string").await {
+ WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate string panic")),
+ other => panic!("expected Panicked, got {other:?}"),
+ }
+ }
+
+ // ----- Gaps -------------------------------------------------------
+
+ /// GAP-003 — `shutdown()` is idempotent: a second call finds every slot
+ /// already joined and reports `NotRunning`, still clean.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn gap003_shutdown_is_idempotent() {
+ let reg = ThreadRegistry::<&str>::new();
+ start_clean(&reg, "alpha", WorkerConfig::default());
+
+ let first = reg.shutdown().await;
+ assert_eq!(first.per_worker.get("alpha"), Some(&WorkerStatus::Ok));
+ assert!(first.all_clean());
+
+ let second = reg.shutdown().await;
+ assert_eq!(
+ second.per_worker.get("alpha"),
+ Some(&WorkerStatus::NotRunning)
+ );
+ assert!(second.all_clean());
+ }
+
+ /// GAP-004 — `cancel(key)` is selective: cancelling A does not touch B.
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn gap004_cancel_is_selective() {
+ let reg = ThreadRegistry::<&str>::new();
+ start_clean(&reg, "a", WorkerConfig::default());
+ start_clean(&reg, "b", WorkerConfig::default());
+
+ reg.cancel("a");
+ assert!(reg.is_running("b"), "cancel(a) must not cancel b");
+ assert_eq!(reg.quiesce("a").await, WorkerStatus::Ok);
+ assert!(reg.is_running("b"), "b still running after a drains");
+ assert_eq!(reg.quiesce("b").await, WorkerStatus::Ok);
+ }
+
+ /// GAP-005 — `WorkerConfig::default()` values are pinned.
+ #[test]
+ fn gap005_worker_config_defaults_pinned() {
+ let cfg = WorkerConfig::default();
+ assert_eq!(cfg.weight, ShutdownWeight(0));
+ assert!(cfg.drain.is_none());
+ assert_eq!(cfg.join_budget, DEFAULT_JOIN_BUDGET);
+ }
+}
From ac9a51a7c70f25cc307ef076cb0e9498c6a67f9b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:27:04 +0200
Subject: [PATCH 18/29] feat(dash-async): key-scope parked orphans for
any_alive_for()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Tag each parked orphan with its originating worker key and add
any_alive_for(key), so a store-wiping path scoped to one worker (the
wallet's clear_shielded F2 gate) can refuse only while that worker — its
slot or a parked prior-generation thread under its key — is alive, without
being blocked by unrelated workers that are legitimately running (e.g. the
always-on event-adapter task). Registry-wide any_alive() is retained.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
packages/rs-dash-async/src/registry.rs | 108 ++++++++++++++++++-------
1 file changed, 81 insertions(+), 27 deletions(-)
diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index e7cd835cd8..d04d8cbbbc 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -247,9 +247,13 @@ impl SlotState {
// ---------------------------------------------------------------------
/// Shared lifecycle engine for background workers. See the module docs.
+///
+/// Parked orphans carry their originating key so a store-wiping path for
+/// one worker can gate on [`any_alive_for`](Self::any_alive_for) without
+/// being blocked by an unrelated worker still legitimately running.
pub struct ThreadRegistry {
slots: Mutex>,
- orphans: Mutex<Vec<WorkerHandle>>,
+ orphans: Mutex<Vec<(K, WorkerHandle)>>,
reap_backstop: Duration,
}
@@ -446,7 +450,7 @@ impl ThreadRegistry {
match step {
Step::Classify(h) => return h.classify(),
Step::Park(h) => {
- self.lock_orphans().push(h);
+ self.lock_orphans().push((key, h));
return WorkerStatus::Timeout;
}
Step::NotRunning => return WorkerStatus::NotRunning,
@@ -455,24 +459,34 @@ impl ThreadRegistry {
}
}
- /// Is any registered worker **or** parked orphan still alive?
- /// Store-wiping paths must gate on this returning `false` before
- /// destroying shared state. [F2 FIX]
+ /// Is any registered worker **or** parked orphan still alive across
+ /// the whole registry?
pub fn any_alive(&self) -> bool {
{
let slots = self.lock_slots();
for slot in slots.values() {
- if slot.cancel.is_some() {
+ if slot_alive(slot) {
return true;
}
- if let Some(handle) = &slot.handle {
- if !handle.is_finished() {
- return true;
- }
- }
}
}
- self.lock_orphans().iter().any(|h| !h.is_finished())
+ self.lock_orphans().iter().any(|(_, h)| !h.is_finished())
+ }
+
+ /// Is the worker for `key` — its live slot **or** any orphan parked
+ /// under that key — still alive? A store-wiping path scoped to one
+ /// worker must gate on this (rather than the registry-wide
+ /// [`any_alive`](Self::any_alive)) so an unrelated worker that is
+ /// legitimately running does not block the wipe. [F2 FIX]
+ pub fn any_alive_for(&self, key: K) -> bool {
+ if let Some(slot) = self.lock_slots().get(&key) {
+ if slot_alive(slot) {
+ return true;
+ }
+ }
+ self.lock_orphans()
+ .iter()
+ .any(|(k, h)| *k == key && !h.is_finished())
}
/// Reap parked orphans with a short grace; survivors are re-parked and
@@ -526,7 +540,7 @@ impl ThreadRegistry {
self.slots.lock().unwrap_or_else(|e| e.into_inner())
}
- fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<WorkerHandle>> {
+ fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<(K, WorkerHandle)>> {
self.orphans.lock().unwrap_or_else(|e| e.into_inner())
}
@@ -577,7 +591,7 @@ impl ThreadRegistry {
backstop after cancellation; parking it as an orphan \
for teardown to join rather than detaching it"
);
- self.lock_orphans().push(WorkerHandle::OsThread(h));
+ self.lock_orphans().push((key, WorkerHandle::OsThread(h)));
return;
}
std::thread::sleep(Duration::from_millis(5));
@@ -588,7 +602,7 @@ impl ThreadRegistry {
// finished task is a no-op).
task => {
if !task.is_finished() {
- self.lock_orphans().push(task);
+ self.lock_orphans().push((key, task));
}
}
}
@@ -598,7 +612,7 @@ impl ThreadRegistry {
/// status and the number of survivors re-parked for an idempotent
/// retry.
async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) {
- let mut pending: Vec<WorkerHandle> = {
+ let mut pending: Vec<(K, WorkerHandle)> = {
let mut guard = self.lock_orphans();
std::mem::take(&mut *guard)
};
@@ -612,14 +626,14 @@ impl ThreadRegistry {
let mut non_clean: Option<WorkerStatus> = None;
loop {
let mut still_live = Vec::with_capacity(pending.len());
- for handle in pending.drain(..) {
+ for (key, handle) in pending.drain(..) {
if handle.is_finished() {
let status = handle.classify();
if !status.is_clean() {
non_clean.get_or_insert(status);
}
} else {
- still_live.push(handle);
+ still_live.push((key, handle));
}
}
pending = still_live;
@@ -636,16 +650,21 @@ impl ThreadRegistry {
}
}
- /// Test-only seam: park a raw thread handle as an orphan. Used by
- /// cross-crate regression tests (e.g. the wallet's F2 gate) that must
- /// inject a wedged prior-generation thread without driving the full
- /// restart-reap path.
+ /// Test-only seam: park a raw thread handle as an orphan under `key`.
+ /// Used by cross-crate regression tests (e.g. the wallet's F2 gate)
+ /// that must inject a wedged prior-generation thread without driving
+ /// the full restart-reap path.
#[doc(hidden)]
- pub fn park_orphan_for_test(&self, handle: std::thread::JoinHandle<()>) {
- self.lock_orphans().push(WorkerHandle::OsThread(handle));
+ pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) {
+ self.lock_orphans().push((key, WorkerHandle::OsThread(handle)));
}
}
+/// `true` if a slot is running or holds an unfinished handle.
+fn slot_alive(slot: &SlotState) -> bool {
+ slot.cancel.is_some() || slot.handle.as_ref().is_some_and(|h| !h.is_finished())
+}
+
/// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future
/// is dropped before it finishes (e.g. an outer timeout fires), this moves
/// the slot's still-live handle into the orphan list instead of letting it
@@ -666,7 +685,7 @@ impl Drop for Repark<'_, K> {
.get_mut(&self.key)
.and_then(|slot| slot.handle.take());
if let Some(handle) = handle {
- self.reg.lock_orphans().push(handle);
+ self.reg.lock_orphans().push((self.key, handle));
}
}
}
@@ -885,7 +904,7 @@ mod tests {
let wedged = std::thread::spawn(move || {
let _ = release_rx.recv();
});
- reg.park_orphan_for_test(wedged);
+ reg.park_orphan_for_test("orphan", wedged);
assert_eq!(
reg.reap_orphans(Duration::from_millis(50)).await,
@@ -1027,7 +1046,7 @@ mod tests {
let wedged = std::thread::spawn(move || {
let _ = release_rx.recv();
});
- reg.park_orphan_for_test(wedged);
+ reg.park_orphan_for_test("orphan", wedged);
assert!(reg.any_alive());
assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok);
@@ -1039,6 +1058,41 @@ mod tests {
assert!(!reg.any_alive());
}
+ /// `any_alive_for(key)` is scoped: an orphan parked under one key does
+ /// not make a different key look alive (the F2 gate must not be
+ /// blocked by unrelated workers).
+ #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+ async fn any_alive_for_is_key_scoped() {
+ let reg = ThreadRegistry::<&str>::new();
+ let (release_tx, release_rx) = mpsc::channel::<()>();
+ let wedged = std::thread::spawn(move || {
+ let _ = release_rx.recv();
+ });
+ reg.park_orphan_for_test("shielded", wedged);
+
+ // A live, unrelated worker.
+ start_clean(&reg, "identity", WorkerConfig::default());
+
+ assert!(reg.any_alive(), "registry-wide liveness sees both");
+ assert!(reg.any_alive_for("shielded"), "shielded orphan is alive");
+ assert!(
+ !reg.any_alive_for("address"),
+ "an unrelated key with no slot/orphan is not alive"
+ );
+
+ // The running 'identity' worker must not make 'shielded' look alive
+ // beyond its own orphan, and vice versa.
+ assert!(reg.any_alive_for("identity"), "running identity is alive");
+
+ release_tx.send(()).unwrap();
+ let _ = reg.reap_orphans(Duration::from_secs(2)).await;
+ assert!(
+ !reg.any_alive_for("shielded"),
+ "shielded clear once its orphan is reaped"
+ );
+ assert_eq!(reg.quiesce("identity").await, WorkerStatus::Ok);
+ }
+
/// TC-010 — `shutdown()` panics with a documented message on a
/// current-thread runtime (R4, variant B).
#[test]
From d20aed0027929efc32c6a33ddff9461dc812b8e5 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:55:04 +0200
Subject: [PATCH 19/29] refactor(platform-wallet): migrate sync coordinators
onto shared ThreadRegistry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Replace the triplicated per-coordinator lifecycle machinery
(background_cancel/join/generation mutexes, the cloned coordinator_orphans
list, and the mod.rs free fns join_coordinator_thread / reap_prior_or_park
/ join_detached_orphans / panic_message) with the shared
dash-async ThreadRegistry. The manager holds one
Arc<ThreadRegistry<WalletWorker>>; each coordinator's start/stop/
is_running/quiesce now delegate to it under a fixed WalletWorker key, and
each exposes its quiescing-gate raise as a registry DrainHook. The
wallet-event adapter becomes a registry start_task worker (weight 10,
draining after the coordinators it sinks at weight 0).
Fixes the two confirmed bugs structurally:
- F1: shutdown() is now CoordinatorExitStatus::from_report(registry
.shutdown()); each worker's join is bounded by its own join_budget
inside the registry, where the live handle stays owned by the slot. A
dropped/timed-out join can no longer detach a live thread — it re-parks
to Timeout.
- F2: clear_shielded() gates the store wipe on
registry.any_alive_for(ShieldedSync) (shielded-scoped, so the always-on
event adapter and the other coordinators running normally do not block
Clear), refusing while a prior-generation shielded thread is parked
alive.
CoordinatorThreadStatus / CoordinatorExitStatus / all_clean() are
byte-stable (FFI destroy maps !all_clean -> ErrorShutdownIncomplete);
WorkerStatus maps onto them 1:1 via From<WorkerStatus>. The three wall-clock
restart-reap regression tests and the relocated free-fn tests are deleted
(subsumed by the dash-async registry suite); F2 (clear_shielded) and R5
(from_report) gain wallet-level tests.
Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
packages/rs-dash-async/src/registry.rs | 37 +-
.../src/changeset/core_bridge.rs | 122 ++-
.../rs-platform-wallet/src/changeset/mod.rs | 2 +-
.../src/manager/identity_sync.rs | 319 ++-----
.../rs-platform-wallet/src/manager/mod.rs | 898 ++++++------------
.../src/manager/platform_address_sync.rs | 304 +-----
.../src/manager/shielded_sync.rs | 297 +-----
7 files changed, 537 insertions(+), 1442 deletions(-)
diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index d04d8cbbbc..802ca3598c 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -132,8 +132,7 @@ impl ShutdownReport {
/// let _hook: DrainHook =
/// Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) });
/// ```
-pub type DrainHook =
- Arc<dyn Fn() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send + Sync>;
+pub type DrainHook = Arc<dyn Fn() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send + Sync>;
/// Default managed-join budget when a [`WorkerConfig`] does not override
/// it. Pinned so an accidental change surfaces in tests.
@@ -518,7 +517,9 @@ impl ThreadRegistry {
// quiesce() drives its own drain-hook -> cancel -> join, and
// `join_all` polls them on one task so their drain hooks
// interleave (equal-weight concurrency).
- let drained = keys.into_iter().map(|key| async move { (key, self.quiesce(key).await) });
+ let drained = keys
+ .into_iter()
+ .map(|key| async move { (key, self.quiesce(key).await) });
for (key, status) in futures::future::join_all(drained).await {
per_worker.insert(key, status);
}
@@ -656,7 +657,8 @@ impl ThreadRegistry {
/// the full restart-reap path.
#[doc(hidden)]
pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) {
- self.lock_orphans().push((key, WorkerHandle::OsThread(handle)));
+ self.lock_orphans()
+ .push((key, WorkerHandle::OsThread(handle)));
}
}
@@ -741,9 +743,11 @@ mod tests {
// budget can't fire here; the tiny outer timeout drops the quiesce
// future mid-poll. A naive by-value-into-future impl would detach
// the handle (orphans empty, any_alive false); the fix re-parks it.
- let result =
- tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await;
- assert!(result.is_err(), "outer timeout must fire on the wedged worker");
+ let result = tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await;
+ assert!(
+ result.is_err(),
+ "outer timeout must fire on the wedged worker"
+ );
assert!(reg.any_alive(), "re-parked handle keeps any_alive true");
assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)");
@@ -1006,7 +1010,12 @@ mod tests {
"a",
WorkerConfig {
weight: ShutdownWeight(0),
- drain: Some(mk_hook("a_arrived", "a_passed", Arc::clone(&log), Arc::clone(&barrier))),
+ drain: Some(mk_hook(
+ "a_arrived",
+ "a_passed",
+ Arc::clone(&log),
+ Arc::clone(&barrier),
+ )),
..WorkerConfig::default()
},
);
@@ -1015,7 +1024,12 @@ mod tests {
"b",
WorkerConfig {
weight: ShutdownWeight(0),
- drain: Some(mk_hook("b_arrived", "b_passed", Arc::clone(&log), Arc::clone(&barrier))),
+ drain: Some(mk_hook(
+ "b_arrived",
+ "b_passed",
+ Arc::clone(&log),
+ Arc::clone(&barrier),
+ )),
..WorkerConfig::default()
},
);
@@ -1050,7 +1064,10 @@ mod tests {
assert!(reg.any_alive());
assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok);
- assert!(reg.any_alive(), "orphan still contributes after slot drains");
+ assert!(
+ reg.any_alive(),
+ "orphan still contributes after slot drains"
+ );
assert!(!reg.is_running("alpha"));
release_tx.send(()).unwrap();
diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
index 46945667ef..9e22d9e6f2 100644
--- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs
+++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
@@ -19,10 +19,11 @@
//!
//! # Lifetime
//!
-//! [`spawn_wallet_event_adapter`] returns a [`JoinHandle`]. The caller
-//! (typically `PlatformWalletManager`) keeps the handle for the
-//! manager's lifetime; on shutdown, fire the [`CancellationToken`] to
-//! make the task exit cleanly.
+//! [`wallet_event_adapter_loop`] is the task body. The caller (typically
+//! `PlatformWalletManager`) registers it on the shared `ThreadRegistry`
+//! via `start_task`, which owns its [`JoinHandle`] and cancellation; on
+//! shutdown the registry fires the [`CancellationToken`] to make the task
+//! exit cleanly and joins it.
use std::sync::Arc;
@@ -34,87 +35,82 @@ use key_wallet::Utxo;
use key_wallet_manager::{WalletEvent, WalletId, WalletManager};
use tokio::sync::broadcast::error::RecvError;
use tokio::sync::RwLock;
-use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use crate::changeset::changeset::{CoreChangeSet, PlatformWalletChangeSet};
use crate::changeset::traits::PlatformWalletPersistence;
use crate::wallet::platform_wallet::PlatformWalletInfo;
-/// Spawn the wallet-event subscriber task.
+/// The wallet-event subscriber loop (the task body owned by the registry).
///
-/// Subscribes to `wallet_manager.subscribe_events()` from inside the
-/// spawned task (so the call-site doesn't need to be on a tokio
-/// runtime), then loops dispatching events to the persister via
-/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires
-/// or the upstream broadcast channel closes.
+/// Subscribes to `wallet_manager.subscribe_events()` from inside the task
+/// (so the call-site doesn't need to be on a tokio runtime), then loops
+/// dispatching events to the persister via
+/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires or the
+/// upstream broadcast channel closes.
///
-/// Generic over `P` so the spawned task gets static-dispatch on
-/// every `persister.store(...)` call. Pass the manager's own
-/// `Arc<P>` (not the `Arc<dyn PlatformWalletPersistence>`
-/// coercion) to actually realize the static-dispatch win.
-pub fn spawn_wallet_event_adapter(
+/// Generic over `P` so the task gets static-dispatch on every
+/// `persister.store(...)` call. Pass the manager's own `Arc<P>` (not the
+/// `Arc<dyn PlatformWalletPersistence>` coercion) to realize that win.
+pub async fn wallet_event_adapter_loop<P>(
wallet_manager: Arc<RwLock<WalletManager<PlatformWalletInfo>>>,
persister: Arc<P>,
cancel: CancellationToken,
-) -> JoinHandle<()>
-where
+) where
P: PlatformWalletPersistence + 'static,
{
- tokio::spawn(async move {
- let mut receiver = {
- let guard = wallet_manager.read().await;
- guard.subscribe_events()
- };
- tracing::debug!("wallet-event adapter task started");
+ let mut receiver = {
+ let guard = wallet_manager.read().await;
+ guard.subscribe_events()
+ };
+ tracing::debug!("wallet-event adapter task started");
- loop {
- tokio::select! {
- recv = receiver.recv() => {
- match recv {
- Ok(event) => {
- let wallet_id = event.wallet_id();
- // For events that need to consult per-wallet
- // state (today only `TransactionInstantLocked`,
- // which checks finality before recording the IS
- // lock), grab a brief read lock on the manager.
- let core = build_core_changeset(&wallet_manager, &event).await;
- if core.is_empty_no_records() {
- // SyncHeightAdvanced for an unknown wallet,
- // empty BlockProcessed, etc. — nothing to
- // persist. Skip the round-trip.
- continue;
- }
- let cs = PlatformWalletChangeSet {
- core: Some(core),
- ..PlatformWalletChangeSet::default()
- };
- if let Err(e) = persister.store(wallet_id, cs) {
- tracing::warn!(
- wallet_id = %hex::encode(wallet_id),
- error = %e,
- "Persister rejected core changeset; state will be re-emitted on next sync round"
- );
- }
- }
- Err(RecvError::Closed) if cancel.is_cancelled() => break,
- Err(RecvError::Closed) => {
- tracing::error!("WalletEvent broadcast closed unexpectedly");
- break;
+ loop {
+ tokio::select! {
+ recv = receiver.recv() => {
+ match recv {
+ Ok(event) => {
+ let wallet_id = event.wallet_id();
+ // For events that need to consult per-wallet
+ // state (today only `TransactionInstantLocked`,
+ // which checks finality before recording the IS
+ // lock), grab a brief read lock on the manager.
+ let core = build_core_changeset(&wallet_manager, &event).await;
+ if core.is_empty_no_records() {
+ // SyncHeightAdvanced for an unknown wallet,
+ // empty BlockProcessed, etc. — nothing to
+ // persist. Skip the round-trip.
+ continue;
}
- Err(RecvError::Lagged(n)) => {
+ let cs = PlatformWalletChangeSet {
+ core: Some(core),
+ ..PlatformWalletChangeSet::default()
+ };
+ if let Err(e) = persister.store(wallet_id, cs) {
tracing::warn!(
- missed = n,
- "wallet-event adapter lagged on broadcast channel; some events were dropped"
+ wallet_id = %hex::encode(wallet_id),
+ error = %e,
+ "Persister rejected core changeset; state will be re-emitted on next sync round"
);
}
}
+ Err(RecvError::Closed) if cancel.is_cancelled() => break,
+ Err(RecvError::Closed) => {
+ tracing::error!("WalletEvent broadcast closed unexpectedly");
+ break;
+ }
+ Err(RecvError::Lagged(n)) => {
+ tracing::warn!(
+ missed = n,
+ "wallet-event adapter lagged on broadcast channel; some events were dropped"
+ );
+ }
}
- _ = cancel.cancelled() => break,
}
+ _ = cancel.cancelled() => break,
}
- tracing::debug!("wallet-event adapter task exiting");
- })
+ }
+ tracing::debug!("wallet-event adapter task exiting");
}
/// Project an upstream [`WalletEvent`] into a [`CoreChangeSet`] suitable
diff --git a/packages/rs-platform-wallet/src/changeset/mod.rs b/packages/rs-platform-wallet/src/changeset/mod.rs
index dc76ddd39a..208c132e87 100644
--- a/packages/rs-platform-wallet/src/changeset/mod.rs
+++ b/packages/rs-platform-wallet/src/changeset/mod.rs
@@ -33,7 +33,7 @@ pub use changeset::{
};
pub use client_start_state::ClientStartState;
pub use client_wallet_start_state::ClientWalletStartState;
-pub use core_bridge::spawn_wallet_event_adapter;
+pub use core_bridge::wallet_event_adapter_loop;
pub use identity_manager_start_state::IdentityManagerStartState;
pub use merge::Merge;
pub use platform_address_sync_start_state::PlatformAddressSyncStartState;
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 40329bad74..8dfe83eede 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -49,16 +49,17 @@
use std::collections::BTreeMap;
use std::sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
- Arc, Mutex as StdMutex,
+ Arc,
};
-use dash_async::AtomicFlagGuard;
+use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use super::WalletWorker;
+
use dpp::balances::credits::TokenAmount;
use dpp::prelude::Identifier;
use tokio::sync::RwLock;
-use tokio_util::sync::CancellationToken;
use dash_sdk::platform::tokens::identity_token_balances::{
IdentityTokenBalances, IdentityTokenBalancesQuery,
@@ -160,23 +161,11 @@ where
/// over `P` so every `persister.store(...)` call on the hot sync
/// loop dispatches statically.
persister: Arc<P>,
- /// Cancel token for the background loop, if running.
- background_cancel: StdMutex>,
- /// Join handle for the background loop's OS thread, if running.
- /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
- /// confirm the `!Send` loop fully exited before the host drops the
- /// runtime.
- background_join: StdMutex >>,
- /// Manager-owned orphans list (shared `Arc`). On a tight
- /// `stop()`→`start()` where the prior thread is wedged past the 1 s
- /// reap backstop, [`start`](Self::start) parks the still-live handle
- /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
- /// instead of dropping it, so manager `shutdown()` accounts for it.
- coordinator_orphans: super::CoordinatorOrphans,
- /// Monotonically increasing generation counter. Incremented each
- /// time `start()` installs a new cancel token so the exiting
- /// thread can tell whether its token is still current.
- background_generation: AtomicU64,
+ /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
+ /// `quiesce` delegate to it under the [`WalletWorker::IdentitySync`]
+ /// key; it owns the loop's cancel token, OS-thread join handle, the
+ /// restart reap-or-park, and the orphan list.
+ registry: Arc<ThreadRegistry<WalletWorker>>,
interval_secs: AtomicU64,
is_syncing: AtomicBool,
/// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -215,15 +204,12 @@ where
pub fn new(
sdk: Arc,
persister: Arc,
- coordinator_orphans: super::CoordinatorOrphans,
+ registry: Arc<ThreadRegistry<WalletWorker>>,
) -> Self {
Self {
sdk,
persister,
- background_cancel: StdMutex::new(None),
- background_join: StdMutex::new(None),
- coordinator_orphans,
- background_generation: AtomicU64::new(0),
+ registry,
interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
is_syncing: AtomicBool::new(false),
quiescing: AtomicBool::new(false),
@@ -339,10 +325,22 @@ where
/// Whether the background loop is currently running.
pub fn is_running(&self) -> bool {
- self.background_cancel
- .lock()
- .map(|g| g.is_some())
- .unwrap_or(false)
+ self.registry.is_running(WalletWorker::IdentitySync)
+ }
+
+ /// The drain barrier handed to the registry: raise the `quiescing`
+ /// gate so any pass past its `is_syncing` CAS bails. The registry then
+ /// cancels the loop and joins the thread (the join waits for the
+ /// in-flight pass to drop and `is_syncing` to clear), so the barrier
+ /// itself is instant and never blocks teardown.
+ fn drain_hook(self: &Arc<Self>) -> DrainHook {
+ let this = Arc::clone(self);
+ Arc::new(move || {
+ let this = Arc::clone(&this);
+ Box::pin(async move {
+ this.quiescing.store(true, Ordering::Release);
+ })
+ })
}
/// Whether a sync pass is in flight right now.
@@ -414,57 +412,32 @@ where
/// The first pass runs immediately; subsequent passes fire every
/// [`interval`](Self::interval).
pub fn start(self: Arc<Self>) {
- let mut cancel_guard = self
- .background_cancel
- .lock()
- .unwrap_or_else(|e| e.into_inner());
- if cancel_guard.is_some() {
- return;
- }
-
- // Take any handle left by a prior stop() call so we can reap it — but
- // DON'T join it here, while we still hold background_cancel. stop()
- // takes-and-cancels the token but never touches background_join, so a
- // stop()→start() sequence would otherwise overwrite (detach) the old
- // handle and shutdown() would miss that thread. Joining it under
- // background_cancel would DEADLOCK the reap into its 1 s backstop: the
- // exiting prior thread's epilogue also locks background_cancel (to
- // clear its slot), so it would block on the lock we hold → never
- // finish → get detached on the exact stop()→start() path the reap
- // exists for. We install the new token + bump the generation below,
- // release the lock, and only THEN reap (after this fn's tail).
- let prior = self
- .background_join
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .take();
-
- let cancel = CancellationToken::new();
- *cancel_guard = Some(cancel.clone());
- let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
+ // Reopen the quiescing gate so this (re)start's passes can run; a
+ // prior quiesce raised it via the drain hook.
+ self.quiescing.store(false, Ordering::Release);
+
+ let cfg = WorkerConfig {
+ weight: super::COORDINATOR_WEIGHT,
+ join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
+ drain: Some(self.drain_hook()),
+ };
+ // The loop drives `!Send` SDK futures via `Handle::block_on` on a
+ // dedicated OS thread (the registry spawns it). The handle is
+ // captured from this tokio context; the new thread is not itself a
+ // tokio worker. `biased` polls the cancel arm first, so a pass
+ // stalled on a hung SDK fetch is dropped at its `.await` the
+ // instant the registry cancels — clearing `is_syncing` promptly so
+ // the join lands inside the budget.
let handle = tokio::runtime::Handle::current();
let this = Arc::clone(&self);
- let join = std::thread::Builder::new()
- .name("identity-sync".into())
- .spawn(move || {
+ self.registry
+ .start_thread(WalletWorker::IdentitySync, cfg, move |cancel| {
handle.block_on(async move {
loop {
if cancel.is_cancelled() {
break;
}
-
- // Race the in-flight pass against cancellation.
- // `stop()` / `quiesce()` cancel the token; with
- // `biased` the cancel arm is polled first, so a
- // pass stalled on a hung SDK fetch is dropped at
- // its `.await` the instant we cancel. Dropping the
- // `sync_now` future unwinds to the `is_syncing`
- // `AtomicFlagGuard` it holds, clearing the flag
- // promptly — so `quiesce()`'s drain loop frees and
- // the join lands well inside `shutdown()`'s
- // timeout. A stalled pass can no longer strand a
- // live `!Send` thread past `shutdown()`.
tokio::select! {
biased;
_ = cancel.cancelled() => break,
@@ -477,47 +450,8 @@ where
_ = cancel.cancelled() => break,
}
}
-
- // Only clear the slot if no newer start() has
- // installed a replacement token since we launched.
- if let Ok(mut guard) = this.background_cancel.lock() {
- if this.background_generation.load(Ordering::Acquire) == my_gen {
- *guard = None;
- }
- }
});
- })
- .expect("failed to spawn identity-sync thread");
- // Store the join handle while still holding cancel_guard — a
- // concurrent quiesce() must wait for this lock before calling
- // stop(), so the handle is always stored before it can be taken.
- *self
- .background_join
- .lock()
- .unwrap_or_else(|e| e.into_inner()) = Some(join);
-
- // Release background_cancel BEFORE reaping the prior thread, so its
- // epilogue can acquire the lock, observe the bumped generation, skip
- // clearing our freshly-installed token, and return. Holding the lock
- // across the join below is what would block the prior thread, spin
- // the full 1 s deadline, and detach — the very stall this ordering
- // removes.
- drop(cancel_guard);
-
- // Now reap the prior thread. It was already cancellation-signalled by
- // stop(), and with the lock released its epilogue completes promptly,
- // so is_finished() trips within a few milliseconds and the join is
- // near-instant. The 1 s deadline survives only as a genuine-wedge
- // backstop (e.g. a pass wedged in a Drop that never yields); if it
- // fires `reap_prior_or_park` parks the still-live, already-cancelled
- // thread in the manager orphans list so `shutdown()` joins it and
- // reports it non-clean rather than dropping it (residual UAF).
- super::reap_prior_or_park(
- prior,
- &self.coordinator_orphans,
- std::time::Duration::from_secs(1),
- "identity-sync",
- );
+ });
}
/// Stop the background sync loop. No-op if not running.
@@ -529,14 +463,7 @@ where
/// by manager shutdown so the host can free the persister context —
/// use [`quiesce`](Self::quiesce).
pub fn stop(&self) {
- if let Some(token) = self
- .background_cancel
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .take()
- {
- token.cancel();
- }
+ self.registry.cancel(WalletWorker::IdentitySync);
}
/// Cancel the background loop **and wait for any in-flight sync pass
@@ -564,24 +491,17 @@ where
/// the `!Send` loop has stopped touching `tokio::time` before a
/// one-shot host drops the runtime.
pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
- self.quiescing.store(true, Ordering::Release);
- // RAII gate: resets `quiescing` on *every* exit path — a normal
- // return, a timed-out `shutdown()` dropping this future, or a
- // panic. Without it a quiesce that doesn't run to completion
- // leaves the gate latched `true`, silently bailing every future
- // pass. Reopening on drop is safe because `stop()` (below) has
- // already cancelled the loop, so no new pass can start.
+ // RAII gate: reopen `quiescing` on *every* exit path — normal
+ // return, a dropped future, or a panic. The registry's drain hook
+ // raises it inside `quiesce` below; without this reset a quiesce
+ // that doesn't complete would leave the gate latched and silently
+ // bail every future pass. Reopening is safe because the loop has
+ // been cancelled, so no new pass can start.
let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
- self.stop();
- while self.is_syncing.load(Ordering::Acquire) {
- tokio::time::sleep(Duration::from_millis(20)).await;
- }
- let handle = self
- .background_join
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .take();
- super::join_coordinator_thread(handle).await
+ self.registry
+ .quiesce(WalletWorker::IdentitySync)
+ .await
+ .into()
}
/// Run one sync pass across every registered identity.
@@ -856,8 +776,8 @@ mod tests {
fn make_manager() -> Arc<IdentitySyncManager<NoopPersister>> {
let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
let persister = Arc::new(NoopPersister);
- let orphans = Arc::new(StdMutex::new(Vec::new()));
- Arc::new(IdentitySyncManager::new(sdk, persister, orphans))
+ let registry = ThreadRegistry::new();
+ Arc::new(IdentitySyncManager::new(sdk, persister, registry))
}
fn make_recording_manager() -> (
@@ -866,12 +786,12 @@ mod tests {
) {
let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
let persister = Arc::new(RecordingPersister::new());
- let orphans = Arc::new(StdMutex::new(Vec::new()));
+ let registry = ThreadRegistry::new();
(
Arc::new(IdentitySyncManager::new(
sdk,
Arc::clone(&persister),
- orphans,
+ registry,
)),
persister,
)
@@ -993,123 +913,6 @@ mod tests {
assert_eq!(mgr.interval(), Duration::from_secs(120));
}
- /// `quiesce()` must not return while a pass is in flight, and must
- /// return promptly once the pass drains.
- ///
- /// Drives the real `is_syncing` lifecycle: a background task takes
- /// the slot via the same `compare_exchange` the real `sync_now`
- /// uses, holds it across a sleep (standing in for the pass body +
- /// persister fan-out, which `sync_now` keeps the flag set across),
- /// then clears it. We assert `quiesce()` is still pending while the
- /// flag is held and completes after it falls — i.e. the falling edge
- /// of `is_syncing` is what unblocks the barrier.
- #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
- async fn quiesce_blocks_until_in_flight_pass_drains() {
- let mgr = make_manager();
-
- // Stand in for an in-flight `sync_now`: take the `is_syncing`
- // slot exactly as the real pass does, hold it, then release.
- let holder = Arc::clone(&mgr);
- let pass = tokio::spawn(async move {
- assert!(
- holder
- .is_syncing
- .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
- .is_ok(),
- "test should own the is_syncing slot"
- );
- tokio::time::sleep(Duration::from_millis(200)).await;
- holder.is_syncing.store(false, Ordering::Release);
- });
-
- // Give the holder task a chance to take the slot before we
- // start draining.
- while !mgr.is_syncing() {
- tokio::time::sleep(Duration::from_millis(5)).await;
- }
-
- let quiesce_fut = mgr.quiesce();
- tokio::pin!(quiesce_fut);
-
- // While the pass holds the flag, quiesce must stay pending.
- tokio::select! {
- _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"),
- _ = tokio::time::sleep(Duration::from_millis(50)) => {}
- }
- assert!(mgr.is_syncing(), "pass should still be in flight");
-
- // Once the pass drains, quiesce must return (well within a
- // generous bound — it polls every 20ms).
- tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut)
- .await
- .expect("quiesce did not return after the pass drained");
-
- // The gate is reopened before quiesce returns.
- assert!(!mgr.quiescing.load(Ordering::Acquire));
- assert!(!mgr.is_syncing());
- pass.await.unwrap();
- }
-
- /// Regression: a tight `stop()` → `start()` must reap the prior loop's
- /// OS thread promptly, NOT stall on the 1 s detach backstop.
- ///
- /// The prior thread's exit epilogue locks `background_cancel` to
- /// conditionally clear its slot. The earlier ordering held
- /// `background_cancel` across the prior-handle join inside `start()`, so
- /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
- /// that lock, never finished, and the reap spin-waited the full second
- /// before detaching — a 1 s stall plus a transient untracked thread. The
- /// fix installs the new token + generation, releases `background_cancel`,
- /// and only then reaps, so the prior thread's epilogue runs and the join
- /// lands in milliseconds.
- ///
- /// `stop()` and `start()` run back-to-back in one blocking closure
- /// (mirroring the real call site) so `start()` re-acquires the lock
- /// microseconds after `stop()` frees it — before the async-woken prior
- /// thread can reach its epilogue. Against the old lock-held ordering this
- /// reliably stalls ~1 s and fails the bound below.
- #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
- async fn restart_after_stop_reaps_prior_thread() {
- let mgr = make_manager();
-
- // Launch the first loop and let its immediate (no-op, nothing
- // registered) pass complete so the thread parks in the interval
- // sleep, where cancellation lands cleanly.
- Arc::clone(&mgr).start();
- assert!(mgr.is_running());
- tokio::time::sleep(Duration::from_millis(50)).await;
-
- // Back-to-back cancel-only stop + restart, off the runtime so the
- // synchronous reap can't starve a worker. `start()` re-grabs
- // background_cancel right after `stop()` frees it.
- let restart = Arc::clone(&mgr);
- let elapsed = tokio::task::spawn_blocking(move || {
- restart.stop();
- let started = std::time::Instant::now();
- Arc::clone(&restart).start();
- started.elapsed()
- })
- .await
- .unwrap();
-
- assert!(
- elapsed < Duration::from_millis(500),
- "stop()→start() stalled for {elapsed:?}: prior thread was not \
- reaped promptly (background_cancel held across the join?)"
- );
- assert!(mgr.is_running(), "restart must leave the new loop tracked");
-
- // Wind the new loop down so the test leaves no live !Send thread.
- let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
- .await
- .expect("cleanup quiesce did not complete within 2s after restart");
- assert!(
- status.is_clean(),
- "cleanup quiesce ended non-cleanly: {status:?}"
- );
- assert!(!mgr.is_running());
- }
-
/// A `sync_now()` invoked while `quiescing` is set must bail without
/// running the pass — in particular, without calling
/// `persister.store(...)`. This is the gate that prevents a pass
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 7e9690d066..d03dcccf7b 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -10,13 +10,12 @@ mod wallet_lifecycle;
use std::sync::Arc;
+use dash_async::{ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig};
use tokio::sync::{Notify, RwLock};
-use tokio::task::JoinHandle;
-use tokio_util::sync::CancellationToken;
use key_wallet_manager::WalletManager;
-use crate::changeset::{spawn_wallet_event_adapter, PlatformWalletPersistence};
+use crate::changeset::{wallet_event_adapter_loop, PlatformWalletPersistence};
use crate::events::{PlatformEventHandler, PlatformEventManager};
use crate::manager::identity_sync::IdentitySyncManager;
use crate::manager::platform_address_sync::PlatformAddressSyncManager;
@@ -28,21 +27,29 @@ use crate::wallet::core::BalanceUpdateHandler;
use crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId};
use crate::wallet::PlatformWallet;
-/// Shared list of coordinator OS threads that a tight `stop()`→`start()`
-/// reap had to detach past its 1 s wedge-backstop.
-///
-/// A coordinator's `start()` reap normally joins the prior thread within
-/// a few milliseconds. If that thread is genuinely wedged in a
-/// non-yielding `Drop` (vanishingly rare — the loop exits via a
-/// cancellable `select!`), [`reap_prior_or_park`] parks its still-live
-/// `JoinHandle` here instead of dropping it. The manager owns this list
-/// and shares a clone (`Arc`) with every coordinator, so
-/// [`PlatformWalletManager::shutdown`] can join everything parked here
-/// within its timeout and report
-/// [`CoordinatorThreadStatus::Detached`] if any thread is still alive —
-/// telling the host NOT to free a callback context the thread may still
-/// touch (closing a residual use-after-free window).
-pub(crate) type CoordinatorOrphans = Arc>>>;
+/// Identity of a background worker on the manager's shared
+/// [`ThreadRegistry`]. The three periodic sync coordinators run as
+/// OS-thread workers (their SDK futures are `!Send`); the wallet-event
+/// adapter runs as a tokio task. Drained in ascending weight order on
+/// [`shutdown`](PlatformWalletManager::shutdown): the coordinators
+/// (weight 0) first, then the event adapter (weight 10) they store into.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
+pub enum WalletWorker {
+ /// Platform-address (BLAST) balance sync loop.
+ PlatformAddressSync,
+ /// Per-identity token-state sync loop.
+ IdentitySync,
+ /// Shielded (Orchard) note sync loop.
+ ShieldedSync,
+ /// Wallet-event adapter task (sinks coordinator stores).
+ EventAdapter,
+}
+
+/// Teardown weight of the periodic sync coordinators — drained first.
+pub(crate) const COORDINATOR_WEIGHT: ShutdownWeight = ShutdownWeight(0);
+/// Teardown weight of the wallet-event adapter — drained after the
+/// coordinators that feed it.
+pub(crate) const EVENT_ADAPTER_WEIGHT: ShutdownWeight = ShutdownWeight(10);
/// Multi-wallet coordinator with SPV sync and event handling.
///
@@ -98,16 +105,12 @@ pub struct PlatformWalletManager {
#[cfg(feature = "shielded")]
pub(super) event_manager: Arc,
pub(super) persister: Arc,
- /// Cancellation token + join handle for the wallet-event adapter
- /// task. Held so [`shutdown`] can stop it cleanly when the manager
- /// is torn down.
- pub(super) event_adapter_cancel: CancellationToken,
- pub(super) event_adapter_join: tokio::sync::Mutex>>,
- /// Coordinator OS threads detached by a tight `stop()`→`start()`
- /// reap (see [`CoordinatorOrphans`]). Shared (cloned `Arc`) with
- /// every coordinator so their `start()` reaps can park a wedged
- /// prior thread here, and drained/joined by [`shutdown`](Self::shutdown).
- pub(super) coordinator_orphans: CoordinatorOrphans,
+ /// Shared worker-lifecycle engine. Owns every background worker's
+ /// cancellation token + join handle, the restart reap-or-park, and the
+ /// orphan list. The coordinators hold a clone and register their loops
+ /// on it; the event adapter runs here as a tokio task. [`shutdown`]
+ /// drains it in weight order and joins every worker before returning.
+ pub(super) registry: Arc<ThreadRegistry<WalletWorker>>,
}
/// How one background coordinator thread terminated.
@@ -161,6 +164,25 @@ impl CoordinatorThreadStatus {
}
}
+/// Relocate a registry [`WorkerStatus`](dash_async::WorkerStatus) into the
+/// FFI-stable `CoordinatorThreadStatus`. The variant set and payloads are
+/// identical by construction, so this is a byte-stable 1:1 mapping — the
+/// FFI `destroy` / shielded-stop adapters keep reading the same shape.
+impl From<dash_async::WorkerStatus> for CoordinatorThreadStatus {
+ fn from(status: dash_async::WorkerStatus) -> Self {
+ use dash_async::WorkerStatus as W;
+ match status {
+ W::Ok => Self::Ok,
+ W::Stopped(reason) => Self::Stopped(reason),
+ W::Panicked(msg) => Self::Panicked(msg),
+ W::Timeout => Self::Timeout,
+ W::Detached => Self::Detached,
+ W::NotRunning => Self::NotRunning,
+ W::Error(msg) => Self::Error(msg),
+ }
+ }
+}
+
/// Per-thread terminal status of every background worker, returned by
/// [`PlatformWalletManager::shutdown`].
///
@@ -211,196 +233,36 @@ impl CoordinatorExitStatus {
&& self.event_adapter.is_clean()
&& self.detached_threads.is_clean()
}
-}
-/// Join a coordinator's background OS thread and classify how it ended.
-///
-/// Called from each coordinator's `quiesce()` after cancelling the
-/// loop and draining any in-flight pass, so the thread is already on
-/// its way out and the join is near-instant. Joining while the runtime
-/// is still alive guarantees the `!Send` loop has stopped touching
-/// `tokio::time` before the host drops the runtime.
-///
-/// **Polling approach**: we poll [`JoinHandle::is_finished`] in 5 ms
-/// steps rather than wrapping `handle.join()` in
-/// [`spawn_blocking`](tokio::task::spawn_blocking). The
-/// `spawn_blocking` approach spawns a blocking-pool task that cannot be
-/// cancelled once started — so dropping the timeout future that wraps
-/// `quiesce()` would leave the blocking task alive and `handle.join()`
-/// still running, defeating the timeout boundary. Polling lets the
-/// executor yield at each `.await` step so `tokio::time::timeout`
-/// wrapping `quiesce()` can truly interrupt this call.
-///
-/// **Requires a multi-thread runtime.** Each coordinator's OS thread
-/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on)
-/// and needs the runtime's timer/IO driver; a `current_thread` runtime
-/// can only service one `block_on` at a time, so joining one coordinator
-/// while the others (and `shutdown()` itself) are mid-`block_on` would
-/// deadlock. `shutdown()` asserts the multi-thread flavor up front.
-pub(crate) async fn join_coordinator_thread(
- handle: Option>,
-) -> CoordinatorThreadStatus {
- let Some(handle) = handle else {
- return CoordinatorThreadStatus::NotRunning;
- };
- // Poll until the thread exits. The coordinator was already cancelled
- // (stop() fires before quiesce() calls us), so is_finished() becomes
- // true nearly immediately — typically within a single 5 ms step.
- loop {
- if handle.is_finished() {
- return match handle.join() {
- Ok(()) => CoordinatorThreadStatus::Ok,
- Err(payload) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
- };
- }
- // Yield to the executor so the outer tokio::time::timeout wrapping
- // quiesce() can fire if the deadline has passed. Without this yield
- // the loop would busy-spin and block the task.
- tokio::time::sleep(std::time::Duration::from_millis(5)).await;
- }
-}
-
-/// Best-effort extraction of a panic message from a joined thread/task
-/// payload (`&str` and `String` are the common cases).
-fn panic_message(payload: Box) -> String {
- if let Some(s) = payload.downcast_ref::<&str>() {
- (*s).to_string()
- } else if let Some(s) = payload.downcast_ref::() {
- s.clone()
- } else {
- "".to_string()
- }
-}
-
-/// Reap a coordinator's prior OS thread after a `stop()`→`start()`
-/// reschedule — or park it for [`PlatformWalletManager::shutdown`] if it
-/// is genuinely wedged.
-///
-/// Shared by all three coordinators' `start()` (identity / platform-
-/// address / shielded), called at the tail of `start()` *after* the
-/// `background_cancel` lock has been released, so the exiting prior
-/// thread's epilogue (which also takes that lock) can complete and the
-/// join lands in milliseconds.
-///
-/// `prior` was cancellation-signalled by the preceding `stop()`, so its
-/// `select!` loop exits and the thread finishes almost immediately. The
-/// `backstop` deadline fires only if the thread is wedged in a
-/// non-yielding `Drop` that never observes the cancellation (vanishingly
-/// rare). On that wedge we must NOT silently drop the still-live handle:
-/// the thread still holds an `Arc` to the host event handler and could
-/// fire a callback, so a later `destroy` that freed the host context
-/// would hit a use-after-free. Instead we park the handle in `orphans`
-/// so `shutdown()` joins it within its own timeout and reports
-/// [`CoordinatorThreadStatus::Detached`] if it is still alive — keeping
-/// [`CoordinatorExitStatus::all_clean`] honest.
-pub(crate) fn reap_prior_or_park(
- prior: Option>,
- orphans: &CoordinatorOrphans,
- backstop: std::time::Duration,
- coordinator: &str,
-) {
- let Some(handle) = prior else {
- return;
- };
- let deadline = std::time::Instant::now() + backstop;
- loop {
- if handle.is_finished() {
- // Near-instant since finished; reaps the thread's resources.
- let _ = handle.join();
- return;
- }
- if std::time::Instant::now() >= deadline {
- tracing::warn!(
- coordinator,
- ?backstop,
- "prior sync thread did not finish within the backstop after \
- cancellation; parking it in the manager orphans list for \
- shutdown() to join rather than detaching it"
- );
- // Park the still-live (but already-cancelled) handle so a
- // later shutdown() can join it and report it non-clean,
- // instead of dropping it and leaving a UAF window where the
- // host frees a callback context the thread may still touch.
- orphans
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .push(handle);
- return;
- }
- std::thread::sleep(std::time::Duration::from_millis(5));
- }
-}
-
-/// Drain the manager's [`CoordinatorOrphans`] list and classify how the
-/// parked threads ended, polling until `deadline`.
-///
-/// Threads land in the list only when a tight `stop()`→`start()` reap had
-/// to detach a prior coordinator thread past its 1 s wedge-backstop (see
-/// [`reap_prior_or_park`]). They were parked rather than dropped so this
-/// final teardown can account for them: a still-live detached thread
-/// keeps an `Arc` to the host event handler and could fire one last
-/// callback, so the host must not free its context until every such
-/// thread has exited.
-///
-/// Polls [`JoinHandle::is_finished`](std::thread::JoinHandle::is_finished)
-/// in 5 ms steps, yielding at each `.await` so a wrapping
-/// `tokio::time::timeout` can still interrupt it (no uncancellable
-/// blocking join — `join()` is only ever called on an already-finished
-/// handle). Returns:
-/// - [`Ok`](CoordinatorThreadStatus::Ok) — the list was empty, or every
-/// parked thread joined cleanly;
-/// - [`Panicked`](CoordinatorThreadStatus::Panicked) — a parked thread
-/// had panicked (and none were left alive at the deadline);
-/// - [`Detached`](CoordinatorThreadStatus::Detached) — at least one
-/// parked thread was still alive at `deadline`. Any still-live handles
-/// are re-parked so a later (idempotent) `shutdown()` can retry.
-pub(crate) async fn join_detached_orphans(
- orphans: &CoordinatorOrphans,
- deadline: std::time::Instant,
-) -> CoordinatorThreadStatus {
- // Take the whole list out under the lock; we re-park any survivors
- // at the deadline, but never hold the lock across an `.await`.
- let mut pending: Vec> = {
- let mut guard = orphans.lock().unwrap_or_else(|e| e.into_inner());
- std::mem::take(&mut *guard)
- };
- if pending.is_empty() {
- return CoordinatorThreadStatus::Ok;
- }
-
- let mut panicked: Option = None;
- loop {
- // Reap every thread that has finished this pass; retain the rest.
- let mut still_live = Vec::with_capacity(pending.len());
- for handle in pending.drain(..) {
- if handle.is_finished() {
- if let Err(payload) = handle.join() {
- // Keep the first panic message; a live `Detached`
- // thread still takes precedence at the deadline below.
- panicked.get_or_insert_with(|| panic_message(payload));
- }
+ /// Build the FFI-stable exit status from the registry's weight-ordered
+ /// [`ShutdownReport`]. A worker absent from the report never ran, so it
+ /// maps to [`NotRunning`](CoordinatorThreadStatus::NotRunning); a
+ /// non-zero orphan-survivor count surfaces as
+ /// [`Detached`](CoordinatorThreadStatus::Detached), keeping
+ /// [`all_clean`](Self::all_clean) honest for a still-live wedged thread.
+ pub(crate) fn from_report(report: ShutdownReport) -> Self {
+ let worker = |key: WalletWorker| -> CoordinatorThreadStatus {
+ report
+ .per_worker
+ .get(&key)
+ .cloned()
+ .map(CoordinatorThreadStatus::from)
+ .unwrap_or(CoordinatorThreadStatus::NotRunning)
+ };
+ Self {
+ platform_address_sync: worker(WalletWorker::PlatformAddressSync),
+ identity_sync: worker(WalletWorker::IdentitySync),
+ #[cfg(feature = "shielded")]
+ shielded_sync: Some(worker(WalletWorker::ShieldedSync)),
+ #[cfg(not(feature = "shielded"))]
+ shielded_sync: None,
+ event_adapter: worker(WalletWorker::EventAdapter),
+ detached_threads: if report.detached > 0 {
+ CoordinatorThreadStatus::Detached
} else {
- still_live.push(handle);
- }
- }
- pending = still_live;
-
- if pending.is_empty() {
- return match panicked {
- Some(msg) => CoordinatorThreadStatus::Panicked(msg),
- None => CoordinatorThreadStatus::Ok,
- };
- }
- if std::time::Instant::now() >= deadline {
- // Re-park survivors so an idempotent re-`shutdown()` retries
- // rather than losing track of a still-live thread.
- orphans
- .lock()
- .unwrap_or_else(|e| e.into_inner())
- .extend(pending);
- return CoordinatorThreadStatus::Detached;
+ CoordinatorThreadStatus::Ok
+ },
}
- tokio::time::sleep(std::time::Duration::from_millis(5)).await;
}
}
@@ -449,14 +311,28 @@ impl PlatformWalletManager {
let wallets = Arc::new(RwLock::new(std::collections::BTreeMap::new()));
let lock_notify = Arc::new(Notify::new());
- // Spawn the wallet-event adapter that translates upstream
- // `WalletEvent`s into `CoreChangeSet`s and forwards them to
- // the persister.
- let event_adapter_cancel = CancellationToken::new();
- let event_adapter_join = spawn_wallet_event_adapter(
- Arc::clone(&wallet_manager),
- Arc::clone(&persister),
- event_adapter_cancel.clone(),
+ // Shared worker-lifecycle engine. The 1 s reap backstop (separate
+ // from the 30 s managed-join budget) is the grace a wedged prior
+ // thread gets before it is reported `Detached`.
+ let registry = ThreadRegistry::with_reap_backstop(std::time::Duration::from_secs(
+ SHUTDOWN_ORPHAN_GRACE_SECS,
+ ));
+
+ // Register the wallet-event adapter as a tokio task on the
+ // registry. It is the downstream sink for the coordinators' stores, so
+ // it drains AFTER them (weight 10 vs the coordinators' 0).
+ let adapter_wallet_manager = Arc::clone(&wallet_manager);
+ let adapter_persister = Arc::clone(&persister);
+ registry.start_task(
+ WalletWorker::EventAdapter,
+ WorkerConfig {
+ weight: EVENT_ADAPTER_WEIGHT,
+ join_budget: std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
+ drain: None,
+ },
+ move |cancel| {
+ wallet_event_adapter_loop(adapter_wallet_manager, adapter_persister, cancel)
+ },
);
// Build handler list: app handler + internal handlers.
@@ -473,13 +349,6 @@ impl PlatformWalletManager {
balance_handler,
]));
- // Shared orphans list: a coordinator's `start()` reap parks here
- // any prior thread it had to detach past its 1 s wedge-backstop,
- // and `shutdown()` joins them. Every coordinator gets a clone of
- // this same `Arc` so they all park into the one list the manager
- // drains.
- let coordinator_orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
-
let spv = Arc::new(SpvRuntime::new(
Arc::clone(&wallet_manager),
Arc::clone(&event_manager),
@@ -487,12 +356,12 @@ impl