From f3354f6617a9407cf4c630ade2a30a87db4f9680 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Mon, 22 Jun 2026 21:46:48 +0200
Subject: [PATCH 01/29] feat(platform-wallet)!: shutdown() joins coordinator
 threads and returns CoordinatorExitStatus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The three periodic sync coordinators (platform-address, identity,
shielded) run their `!Send` loops on detached OS threads via
`Handle::block_on`. `shutdown()`/`quiesce()` previously only drained the
in-flight pass (the `is_syncing` barrier) and never joined the threads,
so a consumer that drops the tokio runtime right after `shutdown()`
(one-shot / headless / stdio) could race a coordinator still polling
`tokio::time` on a shutting-down runtime and panic with
"A Tokio 1.x context was found, but it is being shutdown".

Each coordinator now stores its OS-thread `JoinHandle`; `quiesce()` joins
it (via `spawn_blocking`, after the existing drain) and returns a
`CoordinatorThreadStatus` (NotRunning / Ok / Panicked / Error). Joining
while the runtime is still alive guarantees the loop has stopped touching
`tokio::time` before the host drops the runtime. `shutdown()` aggregates
the three into `CoordinatorExitStatus`, so a panicked loop surfaces in
the status instead of being silently dropped.

`JoinHandle::join` was chosen over a oneshot/`Notify` signal because it
natively distinguishes a clean return from a panic and waits for the
actual OS thread to terminate (not just a signal fired mid-teardown),
yielding the per-thread status for free. The generation-guard reschedule
and quiesce-drain behavior are preserved.

BREAKING CHANGE: `PlatformWalletManager::shutdown()` now returns
`CoordinatorExitStatus` instead of `()`.

FFI: the internal `shutdown()` call logs the new status; the `extern "C"`
`platform_wallet_manager_destroy` signature and C ABI are unchanged.

<sub>🤖 Co-authored by [Claudius the Magnificent](https://github.com/lklimek/claudius) AI Agent</sub>
---
 .../rs-platform-wallet-ffi/src/manager.rs     |  13 +-
 .../src/manager/identity_sync.rs              |  26 +-
 .../rs-platform-wallet/src/manager/mod.rs     | 277 +++++++++++++++++-
 .../src/manager/platform_address_sync.rs      |  26 +-
 .../src/manager/shielded_sync.rs              |  26 +-
 5 files changed, 354 insertions(+), 14 deletions(-)
diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs
index 5930c1c4db..d09d98a1e8 100644
--- a/packages/rs-platform-wallet-ffi/src/manager.rs
+++ b/packages/rs-platform-wallet-ffi/src/manager.rs
@@ -360,7 +360,18 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy(
         // left alive to fire a callback against freed memory.
         // `shutdown()` is idempotent, so this is safe even if the host
         // already stopped some sync managers before calling destroy.
-        runtime().block_on(manager.shutdown());
+        // It now joins the coordinator OS threads and returns their
+        // per-thread exit status; the C ABI exposes none of that, so we
+        // just log it (a panicked loop is worth surfacing) and drop it.
+        let status = runtime().block_on(manager.shutdown());
+        if !status.all_clean() {
+            tracing::warn!(
+                ?status,
+                "platform wallet coordinator(s) did not exit cleanly"
+            );
+        } else {
+            tracing::debug!(?status, "platform wallet coordinators joined cleanly");
+        }
     }
     PlatformWalletFFIResult::ok()
 }
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 8730398f97..ae5ae879f7 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -160,6 +160,11 @@ where
     persister: Arc<P>,
     /// Cancel token for the background loop, if running.
     background_cancel: StdMutex<Option<CancellationToken>>,
+    /// Join handle for the background loop's OS thread, if running.
+    /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
+    /// confirm the `!Send` loop fully exited before the host drops the
+    /// runtime.
+    background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
     /// Monotonically increasing generation counter. Incremented each
     /// time `start()` installs a new cancel token so the exiting
     /// thread can tell whether its token is still current.
@@ -204,6 +209,7 @@ where
             sdk,
             persister,
             background_cancel: StdMutex::new(None),
+            background_join: StdMutex::new(None),
             background_generation: AtomicU64::new(0),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
@@ -405,8 +411,8 @@ where
         drop(guard);
 
         let handle = tokio::runtime::Handle::current();
-        let this = self;
-        std::thread::Builder::new()
+        let this = Arc::clone(&self);
+        let join = std::thread::Builder::new()
             .name("identity-sync".into())
             .spawn(move || {
                 handle.block_on(async move {
@@ -434,6 +440,8 @@ where
                 });
             })
             .expect("failed to spawn identity-sync thread");
+        // Store the handle so `quiesce` can join the OS thread.
+        *self.background_join.lock().expect("bg_join poisoned") = Some(join);
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -473,13 +481,25 @@ where
     /// so its falling edge (with the gate up) is a sound "fully drained"
     /// signal. The gate is reopened before returning so a later
     /// start/sync works normally.
-    pub async fn quiesce(&self) {
+    ///
+    /// Finally **joins** the loop's OS thread (after the drain, so the
+    /// thread is on its way out) and returns its terminal status. Joining
+    /// while the runtime is still alive is what lets the manager promise
+    /// the `!Send` loop has stopped touching `tokio::time` before a
+    /// one-shot host drops the runtime.
+    pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
         self.quiescing.store(true, Ordering::Release);
         self.stop();
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(20)).await;
         }
         self.quiescing.store(false, Ordering::Release);
+        let handle = self
+            .background_join
+            .lock()
+            .expect("bg_join poisoned")
+            .take();
+        super::join_coordinator_thread(handle).await
     }
 
     /// Run one sync pass across every registered identity.
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 3d04ca086d..3529356170 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -89,6 +89,99 @@ pub struct PlatformWalletManager<P: PlatformWalletPersistence + 'static> {
     pub(super) event_adapter_join: tokio::sync::Mutex<Option<JoinHandle<()>>>,
 }
 
+/// Terminal status of one background coordinator's OS thread.
+///
+/// The three periodic coordinators run their loops on dedicated OS
+/// threads (the SDK futures are `!Send`, so they ride
+/// [`Handle::block_on`](tokio::runtime::Handle::block_on) rather than
+/// `tokio::spawn`). [`PlatformWalletManager::shutdown`] joins each
+/// thread and reports how it ended so a host can tell a clean wind-down
+/// from a panicked loop instead of silently dropping the thread.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum CoordinatorThreadStatus {
+    /// No thread was running to join — the loop was never started, or
+    /// was already stopped and joined.
+    NotRunning,
+    /// The loop exited and its OS thread joined cleanly.
+    Ok,
+    /// The OS thread panicked; carries the best-effort panic message.
+    Panicked(String),
+    /// The join itself could not complete (the blocking join task
+    /// failed). Distinct from the thread panicking.
+    Error(String),
+}
+
+impl CoordinatorThreadStatus {
+    /// `true` for a non-failure outcome (joined cleanly or never ran).
+    pub fn is_clean(&self) -> bool {
+        matches!(self, Self::Ok | Self::NotRunning)
+    }
+}
+
+/// Per-thread terminal status of every background coordinator, returned
+/// by [`PlatformWalletManager::shutdown`].
+///
+/// A host that drops its tokio runtime right after `shutdown()`
+/// (one-shot / headless / stdio) reads this to confirm each `!Send`
+/// coordinator loop fully wound down on its OS thread *before* the
+/// runtime goes away — closing the race where a still-polling loop hits
+/// `tokio::time` on a shutting-down runtime and panics with
+/// `A Tokio 1.x context was found, but it is being shutdown`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CoordinatorExitStatus {
+    /// Platform-address (BLAST) balance sync loop.
+    pub platform_address: CoordinatorThreadStatus,
+    /// Per-identity token-state sync loop.
+    pub identity: CoordinatorThreadStatus,
+    /// Shielded (Orchard) note sync loop. Always
+    /// [`CoordinatorThreadStatus::NotRunning`] in builds without the
+    /// `shielded` feature.
+    pub shielded: CoordinatorThreadStatus,
+}
+
+impl CoordinatorExitStatus {
+    /// `true` when every coordinator wound down without a panic or join
+    /// failure (each is [`Ok`](CoordinatorThreadStatus::Ok) or
+    /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)).
+    pub fn all_clean(&self) -> bool {
+        self.platform_address.is_clean() && self.identity.is_clean() && self.shielded.is_clean()
+    }
+}
+
+/// Join a coordinator's background OS thread and classify how it ended.
+///
+/// Awaited by [`quiesce`](IdentitySyncManager::quiesce) *after* the loop
+/// is cancelled and its in-flight pass drained, so the thread is already
+/// on its way out. The blocking [`JoinHandle::join`](std::thread::JoinHandle::join)
+/// runs on the blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
+/// to avoid parking a runtime worker. Joining here — while the runtime
+/// is still alive — is what guarantees the `!Send` loop has stopped
+/// touching `tokio::time` before the host drops the runtime.
+pub(crate) async fn join_coordinator_thread(
+    handle: Option<std::thread::JoinHandle<()>>,
+) -> CoordinatorThreadStatus {
+    let Some(handle) = handle else {
+        return CoordinatorThreadStatus::NotRunning;
+    };
+    match tokio::task::spawn_blocking(move || handle.join()).await {
+        Ok(Ok(())) => CoordinatorThreadStatus::Ok,
+        Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
+        Err(join_err) => CoordinatorThreadStatus::Error(join_err.to_string()),
+    }
+}
+
+/// Best-effort extraction of a panic message from a joined thread's
+/// payload (`&str` and `String` are the common cases).
+fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
+    if let Some(s) = payload.downcast_ref::<&str>() {
+        (*s).to_string()
+    } else if let Some(s) = payload.downcast_ref::<String>() {
+        s.clone()
+    } else {
+        "unknown panic payload".to_string()
+    }
+}
+
 impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// Create a new PlatformWalletManager.
     ///
@@ -308,11 +401,20 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// FIRST (so no further persister store or host callback can start),
     /// and only THEN cancel + join the event adapter, which is the sink
     /// those stores feed into.
-    pub async fn shutdown(&self) {
-        self.platform_address_sync_manager.quiesce().await;
-        self.identity_sync_manager.quiesce().await;
+    ///
+    /// Each `quiesce()` now also **joins** its coordinator's OS thread,
+    /// so when this returns every `!Send` loop has fully exited. A host
+    /// that drops the tokio runtime right after `shutdown()` (one-shot /
+    /// headless / stdio) is therefore safe — no coordinator can still be
+    /// polling `tokio::time` on a shutting-down runtime. The returned
+    /// [`CoordinatorExitStatus`] reports per-thread how each loop ended.
+    pub async fn shutdown(&self) -> CoordinatorExitStatus {
+        let platform_address = self.platform_address_sync_manager.quiesce().await;
+        let identity = self.identity_sync_manager.quiesce().await;
         #[cfg(feature = "shielded")]
-        self.shielded_sync_manager.quiesce().await;
+        let shielded = self.shielded_sync_manager.quiesce().await;
+        #[cfg(not(feature = "shielded"))]
+        let shielded = CoordinatorThreadStatus::NotRunning;
 
         self.event_adapter_cancel.cancel();
         if let Some(handle) = self.event_adapter_join.lock().await.take() {
@@ -320,5 +422,172 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
                 tracing::warn!(error = ?e, "Wallet event adapter task join error");
             }
         }
+
+        CoordinatorExitStatus {
+            platform_address,
+            identity,
+            shielded,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::time::Duration;
+
+    use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet};
+
+    /// No-op persister — the lifecycle tests below never exercise the
+    /// real persistence pipeline, they just need a handle that satisfies
+    /// the manager's `P` bound.
+    struct NoopPersister;
+
+    impl PlatformWalletPersistence for NoopPersister {
+        fn store(
+            &self,
+            _wallet_id: WalletId,
+            _changeset: PlatformWalletChangeSet,
+        ) -> Result<(), PersistenceError> {
+            Ok(())
+        }
+
+        fn flush(&self, _wallet_id: WalletId) -> Result<(), PersistenceError> {
+            Ok(())
+        }
+
+        fn load(&self) -> Result<ClientStartState, PersistenceError> {
+            Ok(ClientStartState::default())
+        }
+    }
+
+    /// No-op event handler standing in for the host's FFI handler.
+    struct NoopHandler;
+    impl dash_spv::EventHandler for NoopHandler {}
+    impl PlatformEventHandler for NoopHandler {}
+
+    /// Build a manager over a mock SDK + no-op persister/handler. Cheap:
+    /// `new` wires the sub-managers and spawns the event adapter but
+    /// starts no coordinator threads.
+    fn make_manager() -> PlatformWalletManager<NoopPersister> {
+        let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
+        let persister = Arc::new(NoopPersister);
+        let handler: Arc<dyn PlatformEventHandler> = Arc::new(NoopHandler);
+        PlatformWalletManager::new(sdk, persister, handler)
+    }
+
+    /// Start every periodic coordinator's background OS-thread loop.
+    fn start_coordinators<P: PlatformWalletPersistence + 'static>(m: &PlatformWalletManager<P>) {
+        Arc::clone(&m.platform_address_sync_manager).start();
+        Arc::clone(&m.identity_sync_manager).start();
+        #[cfg(feature = "shielded")]
+        Arc::clone(&m.shielded_sync_manager).start();
+    }
+
+    /// (a) `shutdown()` joins all coordinator OS threads and reports an
+    /// all-clean status; a second call has nothing left to join.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn shutdown_joins_all_coordinators_and_reports_ok() {
+        let manager = make_manager();
+        start_coordinators(&manager);
+        // Let the loops enter `block_on` so we exercise the live-loop
+        // join path (a thread cancelled before its first poll joins too).
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        let status = manager.shutdown().await;
+        assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
+        assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
+        #[cfg(feature = "shielded")]
+        assert_eq!(status.shielded, CoordinatorThreadStatus::Ok);
+        #[cfg(not(feature = "shielded"))]
+        assert_eq!(status.shielded, CoordinatorThreadStatus::NotRunning);
+        assert!(status.all_clean());
+
+        // Handles consumed by the join → nothing left to join.
+        let again = manager.shutdown().await;
+        assert_eq!(again.platform_address, CoordinatorThreadStatus::NotRunning);
+        assert_eq!(again.identity, CoordinatorThreadStatus::NotRunning);
+    }
+
+    /// (b) A coordinator thread that panics surfaces in the status rather
+    /// than being silently dropped.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn join_coordinator_thread_surfaces_panic() {
+        let handle = std::thread::spawn(|| panic!("boom in coordinator"));
+        match join_coordinator_thread(Some(handle)).await {
+            CoordinatorThreadStatus::Panicked(msg) => {
+                assert!(msg.contains("boom in coordinator"), "msg was {msg:?}");
+            }
+            other => panic!("expected Panicked, got {other:?}"),
+        }
+    }
+
+    /// A cleanly-returning thread joins as `Ok`; an absent handle is
+    /// `NotRunning`.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn join_coordinator_thread_clean_and_absent() {
+        let handle = std::thread::spawn(|| {});
+        assert_eq!(
+            join_coordinator_thread(Some(handle)).await,
+            CoordinatorThreadStatus::Ok
+        );
+        assert_eq!(
+            join_coordinator_thread(None).await,
+            CoordinatorThreadStatus::NotRunning
+        );
+    }
+
+    /// (c) Race regression: model the one-shot / headless path — start
+    /// the coordinators, `shutdown()`, then **drop the runtime**. Because
+    /// `shutdown()` joined every loop while the runtime was still alive
+    /// (asserted via the all-`Ok` status), nothing is left polling
+    /// `tokio::time`, so the drop raises no "Tokio … being shutdown"
+    /// panic. A scoped hook counts only that specific panic so a
+    /// concurrent unrelated panic can't trip the assertion.
+    #[test]
+    fn shutdown_then_drop_runtime_does_not_panic() {
+        use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
+
+        static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
+        let prev_hook = std::panic::take_hook();
+        std::panic::set_hook(Box::new(|info| {
+            if info.to_string().contains("being shutdown") {
+                SHUTDOWN_PANICS.fetch_add(1, AtomicOrdering::SeqCst);
+            }
+        }));
+
+        let runtime = tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(4)
+            .enable_all()
+            .build()
+            .expect("build runtime");
+
+        let status = runtime.block_on(async {
+            let manager = make_manager();
+            start_coordinators(&manager);
+            tokio::time::sleep(Duration::from_millis(50)).await;
+            manager.shutdown().await
+        });
+
+        // The headless drop: with every coordinator already joined, this
+        // cannot race a loop still touching the timer.
+        drop(runtime);
+        std::thread::sleep(Duration::from_millis(100));
+        let racing_panics = SHUTDOWN_PANICS.load(AtomicOrdering::SeqCst);
+
+        // Restore the hook before asserting so a failure prints normally.
+        std::panic::set_hook(prev_hook);
+
+        assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
+        assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
+        assert!(
+            status.all_clean(),
+            "coordinators did not wind down: {status:?}"
+        );
+        assert_eq!(
+            racing_panics, 0,
+            "dropping the runtime after shutdown raced a coordinator thread"
+        );
     }
 }
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index e1a229806c..baa6111e02 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -97,6 +97,11 @@ pub struct PlatformAddressSyncManager {
     event_manager: Arc<PlatformEventManager>,
     /// Cancel token for the background loop, if running.
     background_cancel: StdMutex<Option<CancellationToken>>,
+    /// Join handle for the background loop's OS thread, if running.
+    /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
+    /// confirm the `!Send` loop fully exited before the host drops the
+    /// runtime.
+    background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
     interval_secs: AtomicU64,
     is_syncing: AtomicBool,
     /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -125,6 +130,7 @@ impl PlatformAddressSyncManager {
             wallets,
             event_manager,
             background_cancel: StdMutex::new(None),
+            background_join: StdMutex::new(None),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
             quiescing: AtomicBool::new(false),
@@ -204,8 +210,8 @@ impl PlatformAddressSyncManager {
         drop(guard);
 
         let handle = tokio::runtime::Handle::current();
-        let this = self;
-        std::thread::Builder::new()
+        let this = Arc::clone(&self);
+        let join = std::thread::Builder::new()
             .name("platform-address-sync".into())
             .spawn(move || {
                 handle.block_on(async move {
@@ -229,6 +235,8 @@ impl PlatformAddressSyncManager {
                 });
             })
             .expect("failed to spawn platform-address-sync thread");
+        // Store the handle so `quiesce` can join the OS thread.
+        *self.background_join.lock().expect("bg_join poisoned") = Some(join);
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -270,13 +278,25 @@ impl PlatformAddressSyncManager {
     /// falling edge (with the gate up) is a sound "fully drained" signal.
     /// The gate is reopened before returning so a later start/sync works
     /// normally.
-    pub async fn quiesce(&self) {
+    ///
+    /// Finally **joins** the loop's OS thread (after the drain, so the
+    /// thread is on its way out) and returns its terminal status. Joining
+    /// while the runtime is still alive is what lets the manager promise
+    /// the `!Send` loop has stopped touching `tokio::time` before a
+    /// one-shot host drops the runtime.
+    pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
         self.quiescing.store(true, Ordering::Release);
         self.stop();
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(20)).await;
         }
         self.quiescing.store(false, Ordering::Release);
+        let handle = self
+            .background_join
+            .lock()
+            .expect("bg_join poisoned")
+            .take();
+        super::join_coordinator_thread(handle).await
     }
 
     /// Run one sync pass across every registered wallet.
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 482674b432..d634c65398 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -141,6 +141,11 @@ pub struct ShieldedSyncManager {
     coordinator_slot: Arc<RwLock<Option<Arc<NetworkShieldedCoordinator>>>>,
     /// Cancel token for the background loop, if running.
     background_cancel: StdMutex<Option<CancellationToken>>,
+    /// Join handle for the background loop's OS thread, if running.
+    /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
+    /// confirm the `!Send` loop fully exited before the host drops the
+    /// runtime.
+    background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
     /// Monotonically increasing generation counter. Bumped on every
     /// `start()` so the exiting thread can tell whether its
     /// generation is still the active one before clearing
@@ -171,6 +176,7 @@ impl ShieldedSyncManager {
             event_manager,
             coordinator_slot,
             background_cancel: StdMutex::new(None),
+            background_join: StdMutex::new(None),
             background_generation: AtomicU64::new(0),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
@@ -235,8 +241,8 @@ impl ShieldedSyncManager {
         drop(guard);
 
         let handle = tokio::runtime::Handle::current();
-        let this = self;
-        std::thread::Builder::new()
+        let this = Arc::clone(&self);
+        let join = std::thread::Builder::new()
             .name("shielded-sync".into())
             .spawn(move || {
                 handle.block_on(async move {
@@ -275,6 +281,8 @@ impl ShieldedSyncManager {
                 });
             })
             .expect("failed to spawn shielded-sync thread");
+        // Store the handle so `quiesce` can join the OS thread.
+        *self.background_join.lock().expect("bg_join poisoned") = Some(join);
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -313,13 +321,25 @@ impl ShieldedSyncManager {
     /// including the persister fan-out, so its falling edge (with the
     /// gate up) is a sound "fully drained" signal. The gate is reopened
     /// before returning so a later start/sync works normally.
-    pub async fn quiesce(&self) {
+    ///
+    /// Finally **joins** the loop's OS thread (after the drain, so the
+    /// thread is on its way out) and returns its terminal status. Joining
+    /// while the runtime is still alive is what lets the manager promise
+    /// the `!Send` loop has stopped touching `tokio::time` before a
+    /// one-shot host drops the runtime.
+    pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
         self.quiescing.store(true, Ordering::Release);
         self.stop();
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(20)).await;
         }
         self.quiescing.store(false, Ordering::Release);
+        let handle = self
+            .background_join
+            .lock()
+            .expect("bg_join poisoned")
+            .take();
+        super::join_coordinator_thread(handle).await
     }
 
     /// Run one sync pass across every registered wallet.

From 261178e8ae1897fdebb4f0e8fcba61826ad3336b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Mon, 22 Jun 2026 22:41:31 +0200
Subject: [PATCH 02/29] fix(platform-wallet): RAII-guard is_syncing so a
 coordinator panic cannot wedge shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEC-001: Add an `IsSyncingGuard` RAII struct to all three coordinators'
`sync_now` (and shielded `sync_wallet`) implementations. The guard
resets `is_syncing` to `false` on every exit path — normal return, early
return, and panic-unwind — so `quiesce()`'s drain loop can never spin
forever on a panicked pass, and the `Panicked` thread-exit status
becomes reachable.

SEC-002: Wrap each coordinator's `quiesce()` call in `shutdown()` with
`tokio::time::timeout(30 s)`.  On timeout the slot reports
`CoordinatorThreadStatus::Error("join timed out")` rather than hanging
forever.

SEC-003: Add `debug_assert!` in `shutdown()` that the current runtime
is `MultiThread`; document the precondition in the method doc.

F-5: In all three coordinators' `start()`, store the `JoinHandle` in
`background_join` while still holding the `background_cancel` lock. This
eliminates the theoretical window in which a concurrent `quiesce()`,
running after the thread was spawned but before its handle was stored,
could take a `None` handle and skip the join.

Rename `CoordinatorThreadExit` → `CoordinatorThreadStatus` with
variants `Ok / NotRunning / Panicked / Error` to match the coordinator
module's existing `super::CoordinatorThreadStatus` references (fixing
the compile break in f3354f6617).  `join_coordinator_thread`'s
spawn_blocking `Err` arm now maps to `Error` rather than `Panicked`
to distinguish infra failure from thread panic (F-6 documented).

Co-Authored-By: Claudius the Magnificent <noreply@anthropic.com>

<sub>🤖 Co-authored by [Claudius the Magnificent](https://github.com/lklimek/claudius) AI Agent</sub>
---
 .../src/manager/identity_sync.rs              |  41 +-
 .../rs-platform-wallet/src/manager/mod.rs     | 399 +++++++++++++-----
 .../src/manager/platform_address_sync.rs      |  58 ++-
 .../src/manager/shielded_sync.rs              |  56 ++-
 4 files changed, 407 insertions(+), 147 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index ae5ae879f7..7ce38eb5fd 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -75,6 +75,20 @@ use crate::wallet::platform_wallet::WalletId;
 /// startup default.
 pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
 
+/// RAII guard that clears `is_syncing` when dropped.
+///
+/// Created at the start of a sync pass (after the `compare_exchange`
+/// that takes the slot). On any exit — normal return, early return, or
+/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
+/// never spins forever on a panicked pass.
+struct IsSyncingGuard<'a>(&'a AtomicBool);
+
+impl Drop for IsSyncingGuard<'_> {
+    fn drop(&mut self) {
+        self.0.store(false, Ordering::Release);
+    }
+}
+
 /// Maximum number of token ids fetched in a single
 /// `IdentityTokenBalancesQuery`.
 ///
@@ -401,14 +415,13 @@ where
     /// The first pass runs immediately; subsequent passes fire every
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
-        let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned");
-        if guard.is_some() {
+        let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+        if cancel_guard.is_some() {
             return;
         }
         let cancel = CancellationToken::new();
-        *guard = Some(cancel.clone());
+        *cancel_guard = Some(cancel.clone());
         let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
-        drop(guard);
 
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
@@ -440,8 +453,11 @@ where
                 });
             })
             .expect("failed to spawn identity-sync thread");
-        // Store the handle so `quiesce` can join the OS thread.
+        // Store the join handle while still holding cancel_guard — a
+        // concurrent quiesce() must wait for this lock before calling
+        // stop(), so the handle is always stored before it can be taken.
         *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+        // cancel_guard drops here, releasing background_cancel.
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -521,12 +537,17 @@ where
             return;
         }
 
+        // RAII guard: clears `is_syncing` on every exit path, including
+        // panics. Without this a panic inside the pass would leave
+        // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
+        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
         // A `quiesce()` may have raised the gate between our CAS and
-        // here; if so, release the slot and bail without running a pass
-        // so the drain can complete and shutdown gets a true barrier
-        // (no further `persister.store(...)` after quiesce returns).
+        // here; if so, bail without running a pass so the drain can
+        // complete and shutdown gets a true barrier (no further
+        // `persister.store(...)` after quiesce returns).
+        // Guard clears `is_syncing` on return.
         if self.quiescing.load(Ordering::Acquire) {
-            self.is_syncing.store(false, Ordering::Release);
             return;
         }
 
@@ -552,7 +573,7 @@ where
             .map(|d| d.as_secs())
             .unwrap_or(0);
         self.last_sync_unix.store(now, Ordering::Release);
-        self.is_syncing.store(false, Ordering::Release);
+        // `_is_syncing_guard` drops here → `is_syncing = false`
     }
 
     /// Sync a single identity's watched tokens against Platform.
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 3529356170..905dc32c41 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -89,7 +89,7 @@ pub struct PlatformWalletManager<P: PlatformWalletPersistence + 'static> {
     pub(super) event_adapter_join: tokio::sync::Mutex<Option<JoinHandle<()>>>,
 }
 
-/// Terminal status of one background coordinator's OS thread.
+/// How one background coordinator thread terminated.
 ///
 /// The three periodic coordinators run their loops on dedicated OS
 /// threads (the SDK futures are `!Send`, so they ride
@@ -99,15 +99,16 @@ pub struct PlatformWalletManager<P: PlatformWalletPersistence + 'static> {
 /// from a panicked loop instead of silently dropping the thread.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum CoordinatorThreadStatus {
-    /// No thread was running to join — the loop was never started, or
-    /// was already stopped and joined.
-    NotRunning,
-    /// The loop exited and its OS thread joined cleanly.
+    /// The loop exited and its thread/task joined cleanly.
     Ok,
-    /// The OS thread panicked; carries the best-effort panic message.
+    /// The thread/task panicked; carries the best-effort panic message.
     Panicked(String),
-    /// The join itself could not complete (the blocking join task
-    /// failed). Distinct from the thread panicking.
+    /// No thread/task was running to join — never started, or already
+    /// joined by a previous `shutdown()`.
+    NotRunning,
+    /// The join did not complete within the bounded timeout, or the
+    /// `spawn_blocking` task itself failed (e.g. runtime torn down
+    /// before the join could run — unreachable in normal operation).
     Error(String),
 }
 
@@ -118,8 +119,8 @@ impl CoordinatorThreadStatus {
     }
 }
 
-/// Per-thread terminal status of every background coordinator, returned
-/// by [`PlatformWalletManager::shutdown`].
+/// Per-thread terminal status of every background worker, returned by
+/// [`PlatformWalletManager::shutdown`].
 ///
 /// A host that drops its tokio runtime right after `shutdown()`
 /// (one-shot / headless / stdio) reads this to confirm each `!Send`
@@ -130,33 +131,41 @@ impl CoordinatorThreadStatus {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CoordinatorExitStatus {
     /// Platform-address (BLAST) balance sync loop.
-    pub platform_address: CoordinatorThreadStatus,
+    pub platform_address_sync: CoordinatorThreadStatus,
     /// Per-identity token-state sync loop.
-    pub identity: CoordinatorThreadStatus,
-    /// Shielded (Orchard) note sync loop. Always
-    /// [`CoordinatorThreadStatus::NotRunning`] in builds without the
-    /// `shielded` feature.
-    pub shielded: CoordinatorThreadStatus,
+    pub identity_sync: CoordinatorThreadStatus,
+    /// Shielded (Orchard) note sync loop. `None` in builds without the
+    /// `shielded` feature (the coordinator does not exist).
+    pub shielded_sync: Option<CoordinatorThreadStatus>,
+    /// Wallet-event adapter (a `tokio` task, not an OS thread).
+    pub event_adapter: CoordinatorThreadStatus,
 }
 
 impl CoordinatorExitStatus {
-    /// `true` when every coordinator wound down without a panic or join
-    /// failure (each is [`Ok`](CoordinatorThreadStatus::Ok) or
+    /// `true` when no worker panicked, failed to join, or timed out (each
+    /// present slot is [`Ok`](CoordinatorThreadStatus::Ok) or
     /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)).
     pub fn all_clean(&self) -> bool {
-        self.platform_address.is_clean() && self.identity.is_clean() && self.shielded.is_clean()
+        self.platform_address_sync.is_clean()
+            && self.identity_sync.is_clean()
+            && self.shielded_sync.as_ref().is_none_or(|s| s.is_clean())
+            && self.event_adapter.is_clean()
     }
 }
 
 /// Join a coordinator's background OS thread and classify how it ended.
 ///
-/// Awaited by [`quiesce`](IdentitySyncManager::quiesce) *after* the loop
-/// is cancelled and its in-flight pass drained, so the thread is already
-/// on its way out. The blocking [`JoinHandle::join`](std::thread::JoinHandle::join)
-/// runs on the blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
-/// to avoid parking a runtime worker. Joining here — while the runtime
-/// is still alive — is what guarantees the `!Send` loop has stopped
-/// touching `tokio::time` before the host drops the runtime.
+/// Called from each coordinator's `quiesce()` after cancelling the
+/// loop and draining any in-flight pass, so the thread is already on
+/// its way out and the join is near-instant. The blocking
+/// [`JoinHandle::join`](std::thread::JoinHandle::join) runs on the
+/// blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
+/// so the async executor stays unblocked. Joining while the runtime is
+/// still alive guarantees the `!Send` loop has stopped touching
+/// `tokio::time` before the host drops the runtime.
+///
+/// **Expects a multi-thread runtime** — `shutdown()` debug-asserts that
+/// flavor; use on a `current_thread` runtime is unsupported.
 pub(crate) async fn join_coordinator_thread(
     handle: Option<std::thread::JoinHandle<()>>,
 ) -> CoordinatorThreadStatus {
@@ -166,11 +175,14 @@ pub(crate) async fn join_coordinator_thread(
     match tokio::task::spawn_blocking(move || handle.join()).await {
         Ok(Ok(())) => CoordinatorThreadStatus::Ok,
         Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
-        Err(join_err) => CoordinatorThreadStatus::Error(join_err.to_string()),
+        // spawn_blocking fails only when the runtime shuts down before
+        // the blocking task can run — unreachable in normal operation
+        // since shutdown() is called while the runtime is alive (F-6).
+        Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")),
     }
 }
 
-/// Best-effort extraction of a panic message from a joined thread's
+/// Best-effort extraction of a panic message from a joined thread/task
 /// payload (`&str` and `String` are the common cases).
 fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
     if let Some(s) = payload.downcast_ref::<&str>() {
@@ -178,10 +190,17 @@ fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
     } else if let Some(s) = payload.downcast_ref::<String>() {
         s.clone()
     } else {
-        "unknown panic payload".to_string()
+        "<non-string panic>".to_string()
     }
 }
 
+/// Maximum time (seconds) `shutdown()` waits for each worker — one
+/// coordinator's quiesce+join, or the event-adapter task join. Under
+/// normal operation this deadline is never reached (the RAII
+/// `is_syncing` guard ensures the drain exits even on panic). On timeout
+/// the slot reports [`CoordinatorThreadStatus::Error`]`("join timed out")`.
+const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
+
 impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// Create a new PlatformWalletManager.
     ///
@@ -402,31 +421,82 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// and only THEN cancel + join the event adapter, which is the sink
     /// those stores feed into.
     ///
-    /// Each `quiesce()` now also **joins** its coordinator's OS thread,
-    /// so when this returns every `!Send` loop has fully exited. A host
-    /// that drops the tokio runtime right after `shutdown()` (one-shot /
-    /// headless / stdio) is therefore safe — no coordinator can still be
-    /// polling `tokio::time` on a shutting-down runtime. The returned
-    /// [`CoordinatorExitStatus`] reports per-thread how each loop ended.
+    /// After each coordinator's `quiesce()` drains its in-flight pass,
+    /// this also **joins** the loop's OS thread, so when `shutdown()`
+    /// returns every `!Send` loop has fully exited. A host that drops the
+    /// tokio runtime right after `shutdown()` (one-shot / headless /
+    /// stdio) is therefore safe — no coordinator can still be polling
+    /// `tokio::time` on a shutting-down runtime. The returned
+    /// [`CoordinatorExitStatus`] reports per-thread how each worker ended.
+    ///
+    /// **Precondition: must be called from a multi-thread Tokio runtime.**
+    /// `quiesce()` uses `spawn_blocking` internally; calling from a
+    /// `current_thread` runtime trips a `debug_assert!` in debug builds
+    /// and is unsupported (it may hang) in release builds.
+    ///
+    /// Each coordinator quiesce+join, and the event-adapter join, is
+    /// bounded by `SHUTDOWN_JOIN_TIMEOUT_SECS` (a private constant). If a
+    /// worker does not exit within that window, its slot reports
+    /// [`CoordinatorThreadStatus::Error`]`("join timed out")` rather
+    /// than hanging forever. Under normal operation (no infinite loops,
+    /// RAII guard clears `is_syncing` even on panic) this timeout is
+    /// never reached.
     pub async fn shutdown(&self) -> CoordinatorExitStatus {
-        let platform_address = self.platform_address_sync_manager.quiesce().await;
-        let identity = self.identity_sync_manager.quiesce().await;
+        debug_assert!(
+            matches!(
+                tokio::runtime::Handle::current().runtime_flavor(),
+                tokio::runtime::RuntimeFlavor::MultiThread
+            ),
+            "shutdown() requires a multi-thread Tokio runtime (spawn_blocking inside quiesce)"
+        );
+
+        let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
+
+        // Each quiesce() drains any in-flight pass AND joins the thread.
+        let platform_address_sync =
+            tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
+                .await
+                .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+
+        let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
+            .await
+            .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+
         #[cfg(feature = "shielded")]
-        let shielded = self.shielded_sync_manager.quiesce().await;
+        let shielded_sync = {
+            let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
+                .await
+                .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+            Some(r)
+        };
         #[cfg(not(feature = "shielded"))]
-        let shielded = CoordinatorThreadStatus::NotRunning;
+        let shielded_sync = None;
 
+        // The event adapter is a tokio task (it sinks the coordinators'
+        // stores), so cancel + join it last — after the loops feeding it
+        // are gone.
         self.event_adapter_cancel.cancel();
-        if let Some(handle) = self.event_adapter_join.lock().await.take() {
-            if let Err(e) = handle.await {
-                tracing::warn!(error = ?e, "Wallet event adapter task join error");
-            }
-        }
+        let event_adapter = match self.event_adapter_join.lock().await.take() {
+            None => CoordinatorThreadStatus::NotRunning,
+            Some(handle) => match tokio::time::timeout(timeout, handle).await {
+                Ok(Ok(())) => CoordinatorThreadStatus::Ok,
+                Ok(Err(e)) => {
+                    tracing::warn!(error = ?e, "Wallet event adapter task join error");
+                    if e.is_panic() {
+                        CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
+                    } else {
+                        CoordinatorThreadStatus::Ok
+                    }
+                }
+                Err(_) => CoordinatorThreadStatus::Error("join timed out".into()),
+            },
+        };
 
         CoordinatorExitStatus {
-            platform_address,
-            identity,
-            shielded,
+            platform_address_sync,
+            identity_sync,
+            shielded_sync,
+            event_adapter,
         }
     }
 }
@@ -435,9 +505,11 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
 mod tests {
     use super::*;
 
+    use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering as AO};
     use std::time::Duration;
 
     use crate::changeset::{ClientStartState, PersistenceError, PlatformWalletChangeSet};
+    use crate::manager::platform_address_sync::PlatformAddressSyncSummary;
 
     /// No-op persister — the lifecycle tests below never exercise the
     /// real persistence pipeline, they just need a handle that satisfies
@@ -477,6 +549,31 @@ mod tests {
         PlatformWalletManager::new(sdk, persister, handler)
     }
 
+    /// Build a manager that fires a slow (300 ms std::thread::sleep) callback
+    /// on `on_platform_address_sync_completed`. Used by F-2 drain test.
+    fn make_manager_with_slow_handler(
+        started: Arc<AtomicBool>,
+        completed: Arc<AtomicBool>,
+    ) -> PlatformWalletManager<NoopPersister> {
+        struct SlowHandler {
+            started: Arc<AtomicBool>,
+            completed: Arc<AtomicBool>,
+        }
+        impl dash_spv::EventHandler for SlowHandler {}
+        impl PlatformEventHandler for SlowHandler {
+            fn on_platform_address_sync_completed(&self, _: &PlatformAddressSyncSummary) {
+                self.started.store(true, AO::Release);
+                std::thread::sleep(Duration::from_millis(300));
+                self.completed.store(true, AO::Release);
+            }
+        }
+
+        let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
+        let persister = Arc::new(NoopPersister);
+        let handler: Arc<dyn PlatformEventHandler> = Arc::new(SlowHandler { started, completed });
+        PlatformWalletManager::new(sdk, persister, handler)
+    }
+
     /// Start every periodic coordinator's background OS-thread loop.
     fn start_coordinators<P: PlatformWalletPersistence + 'static>(m: &PlatformWalletManager<P>) {
         Arc::clone(&m.platform_address_sync_manager).start();
@@ -485,32 +582,66 @@ mod tests {
         Arc::clone(&m.shielded_sync_manager).start();
     }
 
-    /// (a) `shutdown()` joins all coordinator OS threads and reports an
-    /// all-clean status; a second call has nothing left to join.
+    /// (1)+(5)+(6) Happy path: `shutdown()` joins every started worker
+    /// and reports `Ok`; it completes within a bounded time (no
+    /// `spawn_blocking` starvation/deadlock); a second `shutdown()` finds
+    /// nothing left to join (`NotRunning`) — idempotent.
     #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
-    async fn shutdown_joins_all_coordinators_and_reports_ok() {
+    async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() {
         let manager = make_manager();
         start_coordinators(&manager);
         // Let the loops enter `block_on` so we exercise the live-loop
         // join path (a thread cancelled before its first poll joins too).
         tokio::time::sleep(Duration::from_millis(50)).await;
 
-        let status = manager.shutdown().await;
-        assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
-        assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
+        let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown())
+            .await
+            .expect("shutdown join must complete within bound");
+        assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok);
+        assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok);
         #[cfg(feature = "shielded")]
-        assert_eq!(status.shielded, CoordinatorThreadStatus::Ok);
+        assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok));
         #[cfg(not(feature = "shielded"))]
-        assert_eq!(status.shielded, CoordinatorThreadStatus::NotRunning);
+        assert_eq!(status.shielded_sync, None);
+        assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok);
         assert!(status.all_clean());
 
-        // Handles consumed by the join → nothing left to join.
+        // Handles consumed by the first join → nothing left to join.
         let again = manager.shutdown().await;
-        assert_eq!(again.platform_address, CoordinatorThreadStatus::NotRunning);
-        assert_eq!(again.identity, CoordinatorThreadStatus::NotRunning);
+        assert_eq!(
+            again.platform_address_sync,
+            CoordinatorThreadStatus::NotRunning
+        );
+        assert_eq!(again.identity_sync, CoordinatorThreadStatus::NotRunning);
+        assert_eq!(again.event_adapter, CoordinatorThreadStatus::NotRunning);
+        assert!(again.all_clean());
     }
 
-    /// (b) A coordinator thread that panics surfaces in the status rather
+    /// (2) Never-started coordinators report `NotRunning` (no thread to
+    /// join). The event adapter is spawned in `new`, so it still joins
+    /// `Ok`.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn shutdown_without_starting_reports_not_running() {
+        let manager = make_manager();
+
+        let status = manager.shutdown().await;
+        assert_eq!(
+            status.platform_address_sync,
+            CoordinatorThreadStatus::NotRunning
+        );
+        assert_eq!(status.identity_sync, CoordinatorThreadStatus::NotRunning);
+        #[cfg(feature = "shielded")]
+        assert_eq!(
+            status.shielded_sync,
+            Some(CoordinatorThreadStatus::NotRunning)
+        );
+        #[cfg(not(feature = "shielded"))]
+        assert_eq!(status.shielded_sync, None);
+        assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok);
+        assert!(status.all_clean());
+    }
+
+    /// (4) A coordinator thread that panics surfaces as `Panicked` rather
     /// than being silently dropped.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
     async fn join_coordinator_thread_surfaces_panic() {
@@ -526,7 +657,7 @@ mod tests {
     /// A cleanly-returning thread joins as `Ok`; an absent handle is
     /// `NotRunning`.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn join_coordinator_thread_clean_and_absent() {
+    async fn join_coordinator_thread_ok_and_absent() {
         let handle = std::thread::spawn(|| {});
         assert_eq!(
             join_coordinator_thread(Some(handle)).await,
@@ -538,56 +669,128 @@ mod tests {
         );
     }
 
-    /// (c) Race regression: model the one-shot / headless path — start
-    /// the coordinators, `shutdown()`, then **drop the runtime**. Because
-    /// `shutdown()` joined every loop while the runtime was still alive
-    /// (asserted via the all-`Ok` status), nothing is left polling
-    /// `tokio::time`, so the drop raises no "Tokio … being shutdown"
-    /// panic. A scoped hook counts only that specific panic so a
-    /// concurrent unrelated panic can't trip the assertion.
+    /// F-7: `join_coordinator_thread` uses `spawn_blocking` internally.
+    /// Verify it completes without deadlock within a bounded time when
+    /// called from a multi-thread runtime, as `shutdown()` requires.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() {
+        let handle = std::thread::spawn(|| {});
+        let result = tokio::time::timeout(
+            Duration::from_secs(5),
+            join_coordinator_thread(Some(handle)),
+        )
+        .await
+        .expect("join_coordinator_thread must complete within 5 s — no spawn_blocking deadlock");
+        assert_eq!(result, CoordinatorThreadStatus::Ok);
+    }
+
+    /// F-2: `shutdown()` must wait for an in-flight sync pass to drain
+    /// before joining the coordinator thread.
+    ///
+    /// A slow `on_platform_address_sync_completed` callback (300 ms)
+    /// keeps `is_syncing=true` while it runs. We call `shutdown()` while
+    /// the callback is in-flight and assert that `shutdown()` blocks
+    /// until the callback completes, then returns `Ok`.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn shutdown_waits_for_in_flight_pass_to_drain() {
+        let handler_started = Arc::new(AtomicBool::new(false));
+        let handler_completed = Arc::new(AtomicBool::new(false));
+        let manager = make_manager_with_slow_handler(
+            Arc::clone(&handler_started),
+            Arc::clone(&handler_completed),
+        );
+
+        // Start the address-sync coordinator; first pass fires immediately.
+        Arc::clone(&manager.platform_address_sync_manager).start();
+
+        // Wait until the slow completion callback is running
+        // (`is_syncing` stays true for its 300 ms duration).
+        while !handler_started.load(AO::Acquire) {
+            tokio::time::sleep(Duration::from_millis(5)).await;
+        }
+
+        // Shutdown must drain the in-flight pass before joining.
+        let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown())
+            .await
+            .expect("shutdown must complete within 5 s");
+
+        assert_eq!(
+            status.platform_address_sync,
+            CoordinatorThreadStatus::Ok,
+            "coordinator must join cleanly after drain"
+        );
+        assert!(
+            handler_completed.load(AO::Acquire),
+            "shutdown must not return before the in-flight pass completes"
+        );
+    }
+
+    /// F-3 (strengthened): race regression — start coordinators with a
+    /// long sleep interval so they spend nearly all their time in a live
+    /// `tokio::time::sleep`, then `shutdown()` and drop the runtime.
+    ///
+    /// With the thread join in `shutdown()` every coordinator has fully
+    /// exited its `block_on` before `drop(runtime)` — no race possible.
+    /// Loops 10 times to widen the window: without the join, the
+    /// coordinator's `select!` wakeup would race the runtime teardown and
+    /// raise the "Tokio … being shutdown" panic. NOTE(review): the in-loop
+    /// asserts run under the counting hook, so their output is swallowed.
     #[test]
     fn shutdown_then_drop_runtime_does_not_panic() {
-        use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
-
         static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
         let prev_hook = std::panic::take_hook();
         std::panic::set_hook(Box::new(|info| {
             if info.to_string().contains("being shutdown") {
-                SHUTDOWN_PANICS.fetch_add(1, AtomicOrdering::SeqCst);
+                SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst);
             }
         }));
 
-        let runtime = tokio::runtime::Builder::new_multi_thread()
-            .worker_threads(4)
-            .enable_all()
-            .build()
-            .expect("build runtime");
-
-        let status = runtime.block_on(async {
-            let manager = make_manager();
-            start_coordinators(&manager);
-            tokio::time::sleep(Duration::from_millis(50)).await;
-            manager.shutdown().await
-        });
-
-        // The headless drop: with every coordinator already joined, this
-        // cannot race a loop still touching the timer.
-        drop(runtime);
-        std::thread::sleep(Duration::from_millis(100));
-        let racing_panics = SHUTDOWN_PANICS.load(AtomicOrdering::SeqCst);
-
-        // Restore the hook before asserting so a failure prints normally.
-        std::panic::set_hook(prev_hook);
+        for _ in 0..10 {
+            let runtime = tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(4)
+                .enable_all()
+                .build()
+                .expect("build runtime");
+
+            let status = runtime.block_on(async {
+                let manager = make_manager();
+                // Long interval: coordinator spends ~10 s in a live
+                // tokio::time::sleep, maximising the race window for a
+                // join-less runtime drop.
+                manager
+                    .platform_address_sync_manager
+                    .set_interval(Duration::from_secs(10));
+                manager
+                    .identity_sync_manager
+                    .set_interval(Duration::from_secs(10));
+                #[cfg(feature = "shielded")]
+                manager
+                    .shielded_sync_manager
+                    .set_interval(Duration::from_secs(10));
+                start_coordinators(&manager);
+                // Wait for coordinators to finish their first (instant)
+                // pass and enter the long sleep.
+                tokio::time::sleep(Duration::from_millis(100)).await;
+                // shutdown() joins each thread before returning; without
+                // the join this drop would race the select!/block_on exit.
+                manager.shutdown().await
+            });
+
+            drop(runtime);
+            // Brief settle — any stray thread activity surfaces here.
+            std::thread::sleep(Duration::from_millis(50));
+
+            assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok);
+            assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok);
+            assert!(status.all_clean(), "workers did not wind down: {status:?}");
+        }
 
-        assert_eq!(status.platform_address, CoordinatorThreadStatus::Ok);
-        assert_eq!(status.identity, CoordinatorThreadStatus::Ok);
-        assert!(
-            status.all_clean(),
-            "coordinators did not wind down: {status:?}"
-        );
+        let racing_panics = SHUTDOWN_PANICS.load(AO::SeqCst);
+        std::panic::set_hook(prev_hook);
         assert_eq!(
             racing_panics, 0,
-            "dropping the runtime after shutdown raced a coordinator thread"
+            "dropping the runtime after shutdown raced a coordinator thread \
+             ({racing_panics} panics across 10 iterations)"
         );
     }
 }
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index baa6111e02..f85eb6d05e 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -31,6 +31,20 @@ use crate::wallet::PlatformWallet;
 /// Default cadence — matches the 15s BLAST loop we previously ran in Swift.
 pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 15;
 
+/// RAII guard that clears `is_syncing` when dropped.
+///
+/// Created at the start of a sync pass (after the `compare_exchange`
+/// that takes the slot). On any exit — normal return, early return, or
+/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
+/// never spins forever on a panicked pass.
+struct IsSyncingGuard<'a>(&'a AtomicBool);
+
+impl Drop for IsSyncingGuard<'_> {
+    fn drop(&mut self) {
+        self.0.store(false, Ordering::Release);
+    }
+}
+
 /// Outcome of syncing a single wallet in a pass.
 ///
 /// Not `Clone` because `AddressSyncResult` isn't. Consumers receive it
@@ -201,13 +215,12 @@ impl PlatformAddressSyncManager {
     /// The first pass runs immediately; subsequent passes fire every
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
-        let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned");
-        if guard.is_some() {
+        let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+        if cancel_guard.is_some() {
             return;
         }
         let cancel = CancellationToken::new();
-        *guard = Some(cancel.clone());
-        drop(guard);
+        *cancel_guard = Some(cancel.clone());
 
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
@@ -235,8 +248,11 @@ impl PlatformAddressSyncManager {
                 });
             })
             .expect("failed to spawn platform-address-sync thread");
-        // Store the handle so `quiesce` can join the OS thread.
+        // Store the join handle while still holding cancel_guard — a
+        // concurrent quiesce() must wait for this lock before calling
+        // stop(), so the handle is always stored before it can be taken.
         *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+        // cancel_guard drops here, releasing background_cancel.
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -312,13 +328,17 @@ impl PlatformAddressSyncManager {
             return PlatformAddressSyncSummary::default();
         }
 
+        // RAII guard: clears `is_syncing` on every exit path, including
+        // panics. Without this a panic inside the pass would leave
+        // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
+        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
         // A `quiesce()` may have raised the gate between our CAS and
-        // here; if so, release the slot and bail without running a pass
-        // so the drain can complete and shutdown gets a true barrier
-        // (no further `on_platform_address_sync_completed` host callback
-        // after quiesce returns).
+        // here; if so, bail without running a pass so the drain can
+        // complete and shutdown gets a true barrier (no further
+        // `on_platform_address_sync_completed` host callback after
+        // quiesce returns). Guard clears `is_syncing` on return.
         if self.quiescing.load(Ordering::Acquire) {
-            self.is_syncing.store(false, Ordering::Release);
             return PlatformAddressSyncSummary::default();
         }
 
@@ -352,20 +372,18 @@ impl PlatformAddressSyncManager {
         summary.sync_unix_seconds = now;
         self.last_sync_unix.store(now, Ordering::Release);
 
-        // Dispatch the completion event BEFORE clearing `is_syncing`.
-        // `quiesce()` drains on the falling edge of `is_syncing`, so if
-        // we cleared the flag first a shutdown caller could unblock and
-        // free the host event-handler context while this completion
-        // event (FFI callback → host handler) is still pending — a
-        // use-after-free. Holding the flag across the dispatch makes
-        // quiesce's barrier cover the host callback too. Mirrors the
-        // ordering in `ShieldedSyncManager::sync_now`.
+        // Dispatch the completion event BEFORE `_is_syncing_guard` drops.
+        // `quiesce()` drains on the falling edge of `is_syncing`; if the
+        // guard cleared the flag before the dispatch a shutdown caller
+        // could unblock and free the host event-handler context while
+        // the callback is still pending — a use-after-free. The guard
+        // drops (clearing `is_syncing`) after this call returns, when
+        // the function frame unwinds.
         self.event_manager
             .on_platform_address_sync_completed(&summary);
 
-        self.is_syncing.store(false, Ordering::Release);
-
         summary
+        // `_is_syncing_guard` drops here → `is_syncing = false`
     }
 
     /// Sync a single wallet on demand. Does not set the global
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index d634c65398..0b2e7dda68 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -44,6 +44,20 @@ use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary};
 /// is conservative compared to the 15s address-sync cadence.
 pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
 
+/// RAII guard that clears `is_syncing` when dropped.
+///
+/// Created at the start of a sync pass (after the `compare_exchange`
+/// that takes the slot). On any exit — normal return, early return, or
+/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
+/// never spins forever on a panicked pass.
+struct IsSyncingGuard<'a>(&'a AtomicBool);
+
+impl Drop for IsSyncingGuard<'_> {
+    fn drop(&mut self) {
+        self.0.store(false, Ordering::Release);
+    }
+}
+
 /// Outcome of syncing a single wallet in a shielded sync pass.
 ///
 /// Not `Clone` because `ShieldedSyncSummary` carries the underlying
@@ -228,17 +242,16 @@ impl ShieldedSyncManager {
     /// GRPC client state isn't `Send + Sync`). Same trade-off as
     /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start).
     pub fn start(self: Arc<Self>) {
-        let mut guard = self.background_cancel.lock().expect("bg_cancel poisoned");
-        if guard.is_some() {
+        let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+        if cancel_guard.is_some() {
             return;
         }
         let cancel = CancellationToken::new();
-        *guard = Some(cancel.clone());
+        *cancel_guard = Some(cancel.clone());
         // Bump the generation while we still hold the slot lock so
         // the load below in any prior thread's cleanup observes
         // `current_gen != my_gen` ordered against this token swap.
         let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
-        drop(guard);
 
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
@@ -281,8 +294,11 @@ impl ShieldedSyncManager {
                 });
             })
             .expect("failed to spawn shielded-sync thread");
-        // Store the handle so `quiesce` can join the OS thread.
+        // Store the join handle while still holding cancel_guard — a
+        // concurrent quiesce() must wait for this lock before calling
+        // stop(), so the handle is always stored before it can be taken.
         *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+        // cancel_guard drops here, releasing background_cancel.
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -362,11 +378,15 @@ impl ShieldedSyncManager {
             return ShieldedSyncPassSummary::default();
         }
 
+        // RAII guard: clears `is_syncing` on every exit path, including
+        // panics. Without this a panic inside the pass would leave
+        // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
+        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
         // A `quiesce()` may have raised the gate between our CAS and
-        // here; if so, release the slot and bail without running a pass
-        // so the drain can complete and Clear/stop get a true barrier.
+        // here; bail so the drain can complete and Clear/stop get a
+        // true barrier. Guard clears `is_syncing` on return.
         if self.quiescing.load(Ordering::Acquire) {
-            self.is_syncing.store(false, Ordering::Release);
             return ShieldedSyncPassSummary::default();
         }
 
@@ -403,18 +423,15 @@ impl ShieldedSyncManager {
         self.last_sync_unix
             .store(summary.sync_unix_seconds, Ordering::Release);
 
-        // Dispatch the completion event BEFORE clearing `is_syncing`.
-        // `quiesce()` drains on the falling edge of `is_syncing`, so if
-        // we cleared the flag first a stop/clear caller could unblock
-        // while this completion event (FFI callback → Swift
-        // `handleShieldedSyncCompleted`) is still pending — surfacing a
-        // stale post-stop/post-clear event. Holding the flag across the
-        // dispatch makes quiesce's barrier cover the event too.
+        // Dispatch the completion event BEFORE `_is_syncing_guard` drops.
+        // `quiesce()` drains on the falling edge of `is_syncing`; if
+        // the guard cleared the flag before the dispatch a stop/clear
+        // caller could unblock while the callback is still pending —
+        // surfacing a stale post-stop/post-clear event.
         self.event_manager.on_shielded_sync_completed(&summary);
 
-        self.is_syncing.store(false, Ordering::Release);
-
         summary
+        // `_is_syncing_guard` drops here → `is_syncing = false`
     }
 
     /// Sync a single wallet on demand.
@@ -457,15 +474,16 @@ impl ShieldedSyncManager {
             return Ok(None);
         }
 
+        // RAII guard clears `is_syncing` on every exit path including panics.
+        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+
         // Bail if a `quiesce()` raised the gate after our CAS (see
         // `sync_now`) so the drain barrier holds.
         if self.quiescing.load(Ordering::Acquire) {
-            self.is_syncing.store(false, Ordering::Release);
             return Ok(None);
         }
 
         let pass = coordinator.sync(force).await;
-        self.is_syncing.store(false, Ordering::Release);
 
         // Extract this wallet's slice from the network-wide pass
         // summary. If the wallet is registered, we'll get back an

From 42d734d4f81ae76307b13eff9e449ab78955e476 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 11:12:36 +0200
Subject: [PATCH 03/29] refactor(rs-dash-async): add AtomicFlagGuard RAII
 helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces `AtomicFlagGuard`, a pub RAII guard that clears an
`AtomicBool` flag to `false` (Release ordering) on drop.  The guard
does not set the flag on construction — the caller is responsible for
doing so (typically via a `compare_exchange`) — preserving the exact
semantics of the three identical `IsSyncingGuard` structs that were
copy-pasted across the platform-wallet sync coordinators.

This is the panic-safety keystone for the quiesce drain loop: if a sync
pass panics, the guard's `drop` still clears `is_syncing`, so
`quiesce()` is never permanently wedged.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 packages/rs-dash-async/src/atomic.rs | 22 ++++++++++++++++++++++
 packages/rs-dash-async/src/lib.rs    |  4 ++++
 2 files changed, 26 insertions(+)
 create mode 100644 packages/rs-dash-async/src/atomic.rs

diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs
new file mode 100644
index 0000000000..eb79bb4ed1
--- /dev/null
+++ b/packages/rs-dash-async/src/atomic.rs
@@ -0,0 +1,22 @@
+use std::sync::atomic::{AtomicBool, Ordering};
+
+/// RAII guard that clears an [`AtomicBool`] flag to `false` on drop.
+///
+/// Callers set the flag to `true` before constructing the guard (typically
+/// via a `compare_exchange`); the guard resets it on every exit path,
+/// including panics, so a panicked holder can never leave the flag wedged.
+pub struct AtomicFlagGuard<'a>(&'a AtomicBool);
+
+impl<'a> AtomicFlagGuard<'a> {
+    /// Wrap `flag`. Does **not** set it to `true` — the caller is
+    /// responsible for doing that before constructing the guard.
+    pub fn new(flag: &'a AtomicBool) -> Self {
+        Self(flag)
+    }
+}
+
+impl Drop for AtomicFlagGuard<'_> {
+    fn drop(&mut self) {
+        self.0.store(false, Ordering::Release);
+    }
+}
diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs
index 0ef7785253..3edcf00daa 100644
--- a/packages/rs-dash-async/src/lib.rs
+++ b/packages/rs-dash-async/src/lib.rs
@@ -2,7 +2,11 @@
 //!
 //! Provides [`block_on`] -- a function that bridges async futures into sync code,
 //! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread).
+//!
+//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets.
 
+mod atomic;
 mod block_on;
 
+pub use atomic::AtomicFlagGuard;
 pub use block_on::{block_on, AsyncError};

From 6e78b7777f57e0a0b270e55daae34b31a51a1de1 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 11:13:01 +0200
Subject: [PATCH 04/29] fix(platform-wallet): refine CoordinatorThreadStatus
 variants + tighten runtime check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Task 1 — new enum variants**
Add `Stopped(Option<String>)` (non-panic, non-clean task exit, e.g.
tokio cancel/abort) and `Timeout` (join exceeded
SHUTDOWN_JOIN_TIMEOUT_SECS) to `CoordinatorThreadStatus`.

- Non-panic JoinError on the event-adapter task → `Stopped(Some(...))`,
  not the previous `Ok` (wrong: a cancelled task is not a clean exit).
- Timeout on any `quiesce()` wrapper → `Timeout`, not `Error("join
  timed out")`.
- `is_clean()` now returns `true` only for `Ok` and `NotRunning`; all
  other variants — including the two new ones — are non-clean.
- Update all docs / comments that referenced the old `Error("join timed
  out")` wording.

**Task 2 — promote debug_assert to assert**
`shutdown()`'s multi-thread-runtime guard was `debug_assert!`, making
it a no-op in release builds.  Changed to `assert!` — this is a real
invariant: `spawn_blocking` deadlocks on a `current_thread` runtime.

**Task 3 — bound the test wait loop**
Wrap the `while !handler_started…` polling in
`shutdown_waits_for_in_flight_pass_to_drain` with a 5 s
`tokio::time::timeout` so a broken test fails fast instead of hanging.

**Task 4 — DRY IsSyncingGuard**
Replace the three identical copy-pasted `IsSyncingGuard` structs in
`identity_sync.rs`, `platform_address_sync.rs`, and `shielded_sync.rs`
with the new `dash_async::AtomicFlagGuard`.  Adds `dash-async` to
`rs-platform-wallet/Cargo.toml`.  Zero behavioral change: construction
semantics preserved (callers set the flag via `compare_exchange` before
creating the guard; `Drop` clears it with `Ordering::Release`).

**Task 5 — new tests**
- `coordinator_thread_status_clean_predicate`: unit-tests `is_clean()`
  for all six variants including the two new ones; no real timeout needed.
- `coordinator_exit_status_all_clean`: tests `all_clean()` with
  `Timeout` and `Stopped` slots.
- `event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean`:
  aborts the adapter task before `shutdown()` and expects the result to
  be `Stopped` (covers the non-panic JoinError path); `Ok` is also
  accepted because `abort()` races the task's own completion.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 Cargo.lock                                    |   1 +
 packages/rs-platform-wallet/Cargo.toml        |   1 +
 .../src/manager/identity_sync.rs              |  18 +--
 .../rs-platform-wallet/src/manager/mod.rs     | 134 +++++++++++++++---
 .../src/manager/platform_address_sync.rs      |  18 +--
 .../src/manager/shielded_sync.rs              |  20 +--
 6 files changed, 126 insertions(+), 66 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e296c3aebd..1faa308a83 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5141,6 +5141,7 @@ dependencies = [
  "async-trait",
  "bimap",
  "bs58",
+ "dash-async",
  "dash-sdk",
  "dash-spv",
  "dashcore",
diff --git a/packages/rs-platform-wallet/Cargo.toml b/packages/rs-platform-wallet/Cargo.toml
index 1362523ece..e324680210 100644
--- a/packages/rs-platform-wallet/Cargo.toml
+++ b/packages/rs-platform-wallet/Cargo.toml
@@ -31,6 +31,7 @@ bimap = "0.6"
 # Async runtime
 tokio = { version = "1", features = ["sync", "rt", "time", "macros"] }
 tokio-util = { version = "0.7.12" }
+dash-async = { path = "../rs-dash-async" }
 
 # Logging
 tracing = "0.1"
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 7ce38eb5fd..34bf0fefc7 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -51,6 +51,8 @@ use std::sync::{
     atomic::{AtomicBool, AtomicU64, Ordering},
     Arc, Mutex as StdMutex,
 };
+
+use dash_async::AtomicFlagGuard;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use dpp::balances::credits::TokenAmount;
@@ -75,20 +77,6 @@ use crate::wallet::platform_wallet::WalletId;
 /// startup default.
 pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
 
-/// RAII guard that clears `is_syncing` when dropped.
-///
-/// Created at the start of a sync pass (after the `compare_exchange`
-/// that takes the slot). On any exit — normal return, early return, or
-/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
-/// never spins forever on a panicked pass.
-struct IsSyncingGuard<'a>(&'a AtomicBool);
-
-impl Drop for IsSyncingGuard<'_> {
-    fn drop(&mut self) {
-        self.0.store(false, Ordering::Release);
-    }
-}
-
 /// Maximum number of token ids fetched in a single
 /// `IdentityTokenBalancesQuery`.
 ///
@@ -540,7 +528,7 @@ where
         // RAII guard: clears `is_syncing` on every exit path, including
         // panics. Without this a panic inside the pass would leave
         // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
-        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
 
         // A `quiesce()` may have raised the gate between our CAS and
         // here; if so, bail without running a pass so the drain can
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 905dc32c41..717ad0a03c 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -101,19 +101,28 @@ pub struct PlatformWalletManager<P: PlatformWalletPersistence + 'static> {
 pub enum CoordinatorThreadStatus {
     /// The loop exited and its thread/task joined cleanly.
     Ok,
+    /// The thread/task exited for a non-panic reason that is not a clean
+    /// return — e.g. a tokio task was cancelled or aborted. Carries a
+    /// reason string when one is available.
+    Stopped(Option<String>),
     /// The thread/task panicked; carries the best-effort panic message.
     Panicked(String),
+    /// The join did not complete within [`SHUTDOWN_JOIN_TIMEOUT_SECS`].
+    Timeout,
     /// No thread/task was running to join — never started, or already
     /// joined by a previous `shutdown()`.
     NotRunning,
-    /// The join did not complete within the bounded timeout, or the
-    /// `spawn_blocking` task itself failed (e.g. runtime torn down
-    /// before the join could run — unreachable in normal operation).
+    /// Infrastructural join failure that is neither a timeout nor a
+    /// panic — e.g. the `spawn_blocking` task itself failed because
+    /// the runtime was torn down before the join could run (unreachable
+    /// in normal operation).
     Error(String),
 }
 
 impl CoordinatorThreadStatus {
-    /// `true` for a non-failure outcome (joined cleanly or never ran).
+    /// `true` only for a fully clean outcome: joined normally (`Ok`) or
+    /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, and
+    /// `Error` are all considered non-clean.
     pub fn is_clean(&self) -> bool {
         matches!(self, Self::Ok | Self::NotRunning)
     }
@@ -198,7 +207,7 @@ fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
 /// quiesce+join to complete. Under normal operation this deadline is
 /// never reached (the RAII `is_syncing` guard ensures the drain exits
 /// even on panic). On timeout the coordinator slot reports
-/// [`CoordinatorThreadStatus::Error`]`("join timed out")`.
+/// [`CoordinatorThreadStatus::Timeout`].
 const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
 
 impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
@@ -431,18 +440,17 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     ///
     /// **Precondition: must be called from a multi-thread Tokio runtime.**
     /// `quiesce()` uses `spawn_blocking` internally; calling from a
-    /// `current_thread` runtime will `debug_assert!`-panic in debug
-    /// builds or deadlock in release builds.
+    /// `current_thread` runtime will panic (this is a real invariant
+    /// enforced in both debug and release builds).
     ///
     /// Each coordinator quiesce+join is bounded by
     /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit
     /// within that window, its slot reports
-    /// [`CoordinatorThreadStatus::Error`]`("join timed out")` rather
-    /// than hanging forever. Under normal operation (no infinite loops,
-    /// RAII guard clears `is_syncing` even on panic) this timeout is
-    /// never reached.
+    /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
+    /// Under normal operation (no infinite loops, RAII guard clears
+    /// `is_syncing` even on panic) this timeout is never reached.
     pub async fn shutdown(&self) -> CoordinatorExitStatus {
-        debug_assert!(
+        assert!(
             matches!(
                 tokio::runtime::Handle::current().runtime_flavor(),
                 tokio::runtime::RuntimeFlavor::MultiThread
@@ -456,17 +464,17 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         let platform_address_sync =
             tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
                 .await
-                .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+                .unwrap_or(CoordinatorThreadStatus::Timeout);
 
         let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
             .await
-            .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+            .unwrap_or(CoordinatorThreadStatus::Timeout);
 
         #[cfg(feature = "shielded")]
         let shielded_sync = {
             let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
                 .await
-                .unwrap_or_else(|_| CoordinatorThreadStatus::Error("join timed out".into()));
+                .unwrap_or(CoordinatorThreadStatus::Timeout);
             Some(r)
         };
         #[cfg(not(feature = "shielded"))]
@@ -485,10 +493,12 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
                     if e.is_panic() {
                         CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
                     } else {
-                        CoordinatorThreadStatus::Ok
+                        // Non-panic JoinError: task was cancelled or aborted —
+                        // not a clean exit, but also not a panic.
+                        CoordinatorThreadStatus::Stopped(Some(format!("{e}")))
                     }
                 }
-                Err(_) => CoordinatorThreadStatus::Error("join timed out".into()),
+                Err(_) => CoordinatorThreadStatus::Timeout,
             },
         };
 
@@ -654,6 +664,86 @@ mod tests {
         }
     }
 
+    /// A non-panic `JoinError` on the event adapter maps to `Stopped`, not
+    /// `Ok`, and is NOT considered clean. This covers the case where the
+    /// tokio task is cancelled or aborted rather than completing normally.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() {
+        // Build a manager but immediately abort the event adapter task so
+        // we trigger the non-panic JoinError path in shutdown().
+        let manager = make_manager();
+        // Abort the adapter task directly so the join sees a non-panic JoinError.
+        {
+            let mut guard = manager.event_adapter_join.lock().await;
+            if let Some(handle) = guard.take() {
+                handle.abort();
+                // Put it back so shutdown() sees it and exercises the error path.
+                *guard = Some(handle);
+            }
+        }
+        // Give tokio a moment to process the abort.
+        tokio::time::sleep(Duration::from_millis(10)).await;
+
+        let status = manager.shutdown().await;
+        // The adapter task was aborted → non-panic JoinError → Stopped.
+        match &status.event_adapter {
+            CoordinatorThreadStatus::Stopped(_) | CoordinatorThreadStatus::Ok => {
+                // Stopped is the expected path; Ok means it drained before abort — both
+                // are acceptable since abort() races the task completion.
+            }
+            other => panic!("expected Stopped or Ok (abort race), got {other:?}"),
+        }
+        // Regardless, all other workers were never started → clean.
+        assert_eq!(
+            status.platform_address_sync,
+            CoordinatorThreadStatus::NotRunning
+        );
+    }
+
+    /// `Stopped` and `Timeout` are NOT clean; `Ok` and `NotRunning` ARE.
+    /// Unit-tests the `is_clean` predicate directly so we don't need to
+    /// trigger a real timeout (30s) in a deterministic test.
+    #[test]
+    fn coordinator_thread_status_clean_predicate() {
+        assert!(CoordinatorThreadStatus::Ok.is_clean());
+        assert!(CoordinatorThreadStatus::NotRunning.is_clean());
+
+        assert!(!CoordinatorThreadStatus::Stopped(None).is_clean());
+        assert!(!CoordinatorThreadStatus::Stopped(Some("cancelled".into())).is_clean());
+        assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean());
+        assert!(!CoordinatorThreadStatus::Timeout.is_clean());
+        assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean());
+    }
+
+    /// `all_clean()` on `CoordinatorExitStatus` is false whenever any
+    /// slot is non-clean.
+    #[test]
+    fn coordinator_exit_status_all_clean() {
+        let clean = CoordinatorExitStatus {
+            platform_address_sync: CoordinatorThreadStatus::Ok,
+            identity_sync: CoordinatorThreadStatus::NotRunning,
+            shielded_sync: None,
+            event_adapter: CoordinatorThreadStatus::Ok,
+        };
+        assert!(clean.all_clean());
+
+        let with_timeout = CoordinatorExitStatus {
+            platform_address_sync: CoordinatorThreadStatus::Timeout,
+            identity_sync: CoordinatorThreadStatus::Ok,
+            shielded_sync: None,
+            event_adapter: CoordinatorThreadStatus::Ok,
+        };
+        assert!(!with_timeout.all_clean());
+
+        let with_stopped = CoordinatorExitStatus {
+            platform_address_sync: CoordinatorThreadStatus::Ok,
+            identity_sync: CoordinatorThreadStatus::Ok,
+            shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))),
+            event_adapter: CoordinatorThreadStatus::Ok,
+        };
+        assert!(!with_stopped.all_clean());
+    }
+
     /// A cleanly-returning thread joins as `Ok`; an absent handle is
     /// `NotRunning`.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
@@ -705,9 +795,13 @@ mod tests {
 
         // Wait until the slow completion callback is running
         // (`is_syncing` stays true for its 300 ms duration).
-        while !handler_started.load(AO::Acquire) {
-            tokio::time::sleep(Duration::from_millis(5)).await;
-        }
+        tokio::time::timeout(Duration::from_secs(5), async {
+            while !handler_started.load(AO::Acquire) {
+                tokio::time::sleep(Duration::from_millis(5)).await;
+            }
+        })
+        .await
+        .expect("handler did not start within 5s");
 
         // Shutdown must drain the in-flight pass before joining.
         let status = tokio::time::timeout(Duration::from_secs(5), manager.shutdown())
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index f85eb6d05e..ddd58fcb44 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -13,6 +13,8 @@ use std::sync::{
     atomic::{AtomicBool, AtomicU64, Ordering},
     Arc, Mutex as StdMutex,
 };
+
+use dash_async::AtomicFlagGuard;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use arc_swap::ArcSwapOption;
@@ -31,20 +33,6 @@ use crate::wallet::PlatformWallet;
 /// Default cadence — matches the 15s BLAST loop we previously ran in Swift.
 pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 15;
 
-/// RAII guard that clears `is_syncing` when dropped.
-///
-/// Created at the start of a sync pass (after the `compare_exchange`
-/// that takes the slot). On any exit — normal return, early return, or
-/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
-/// never spins forever on a panicked pass.
-struct IsSyncingGuard<'a>(&'a AtomicBool);
-
-impl Drop for IsSyncingGuard<'_> {
-    fn drop(&mut self) {
-        self.0.store(false, Ordering::Release);
-    }
-}
-
 /// Outcome of syncing a single wallet in a pass.
 ///
 /// Not `Clone` because `AddressSyncResult` isn't. Consumers receive it
@@ -331,7 +319,7 @@ impl PlatformAddressSyncManager {
         // RAII guard: clears `is_syncing` on every exit path, including
         // panics. Without this a panic inside the pass would leave
         // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
-        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
 
         // A `quiesce()` may have raised the gate between our CAS and
         // here; if so, bail without running a pass so the drain can
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 0b2e7dda68..502d1ae733 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -30,6 +30,8 @@ use std::sync::{
     atomic::{AtomicBool, AtomicU64, Ordering},
     Arc, Mutex as StdMutex,
 };
+
+use dash_async::AtomicFlagGuard;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use tokio::sync::RwLock;
@@ -44,20 +46,6 @@ use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary};
 /// is conservative compared to the 15s address-sync cadence.
 pub const DEFAULT_SYNC_INTERVAL_SECS: u64 = 60;
 
-/// RAII guard that clears `is_syncing` when dropped.
-///
-/// Created at the start of a sync pass (after the `compare_exchange`
-/// that takes the slot). On any exit — normal return, early return, or
-/// panic-unwind — the flag is cleared, so `quiesce()`'s drain loop
-/// never spins forever on a panicked pass.
-struct IsSyncingGuard<'a>(&'a AtomicBool);
-
-impl Drop for IsSyncingGuard<'_> {
-    fn drop(&mut self) {
-        self.0.store(false, Ordering::Release);
-    }
-}
-
 /// Outcome of syncing a single wallet in a shielded sync pass.
 ///
 /// Not `Clone` because `ShieldedSyncSummary` carries the underlying
@@ -381,7 +369,7 @@ impl ShieldedSyncManager {
         // RAII guard: clears `is_syncing` on every exit path, including
         // panics. Without this a panic inside the pass would leave
         // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
-        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
 
         // A `quiesce()` may have raised the gate between our CAS and
         // here; bail so the drain can complete and Clear/stop get a
@@ -475,7 +463,7 @@ impl ShieldedSyncManager {
         }
 
         // RAII guard clears `is_syncing` on every exit path including panics.
-        let _is_syncing_guard = IsSyncingGuard(&self.is_syncing);
+        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
 
         // Bail if a `quiesce()` raised the gate after our CAS (see
         // `sync_now`) so the drain barrier holds.

From 5f80450ce16129ea77422b6699c67e6353c87738 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:46:52 +0200
Subject: [PATCH 05/29] test(rs-dash-async): assert AtomicFlagGuard contract +
 add #[must_use]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

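RUST-001: tag `AtomicFlagGuard` `#[must_use]` so a bare-statement
construction (which would drop the guard *immediately* and clear the flag
right back) trips the `unused_must_use` lint instead of silently
un-gating the very flag it was meant to hold. A stray `let _ = ..` drops
the guard just as early but is NOT flagged by the lint (it is the idiom
for discarding a must-use value), so the attribute's message spells that
pitfall out as well.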

PROJ-001: lock the guard's contract down with two tests — flag cleared on a
normal drop, and (the load-bearing one) flag cleared while unwinding a
panic via `catch_unwind`. Makes the PR-body "dash-async tests" claim true.

SEC-003: spell out in the rustdoc that the clear-on-panic guarantee rides
on unwinding, so it holds under `panic = "unwind"` but not under the iOS
`panic = "abort"` profiles, where a panic aborts before any Drop runs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 packages/rs-dash-async/src/atomic.rs | 42 ++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/packages/rs-dash-async/src/atomic.rs b/packages/rs-dash-async/src/atomic.rs
index eb79bb4ed1..ecdab75acb 100644
--- a/packages/rs-dash-async/src/atomic.rs
+++ b/packages/rs-dash-async/src/atomic.rs
@@ -5,6 +5,13 @@ use std::sync::atomic::{AtomicBool, Ordering};
 /// Callers set the flag to `true` before constructing the guard (typically
 /// via a `compare_exchange`); the guard resets it on every exit path,
 /// including panics, so a panicked holder can never leave the flag wedged.
+///
+/// **Panic-strategy caveat:** the clear-on-panic guarantee relies on
+/// destructors running while the stack unwinds, so it holds under
+/// `panic = "unwind"` (the default). Under `panic = "abort"` — e.g. the
+/// iOS release profiles — a panic aborts the process immediately and no
+/// `Drop` runs; there is simply no "after" left for the flag to gate.
+#[must_use = "AtomicFlagGuard clears the flag on drop; binding to `_` or using as a statement drops it immediately"]
 pub struct AtomicFlagGuard<'a>(&'a AtomicBool);
 
 impl<'a> AtomicFlagGuard<'a> {
@@ -20,3 +27,38 @@ impl Drop for AtomicFlagGuard<'_> {
         self.0.store(false, Ordering::Release);
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::panic::{catch_unwind, AssertUnwindSafe};
+
+    /// A guard constructed over a `true` flag holds it while in scope and
+    /// clears it to `false` on a normal scope exit.
+    #[test]
+    fn clears_flag_on_normal_drop() {
+        let flag = AtomicBool::new(true);
+        {
+            let _guard = AtomicFlagGuard::new(&flag);
+            assert!(flag.load(Ordering::Acquire), "flag stays set while held");
+        }
+        assert!(!flag.load(Ordering::Acquire), "flag cleared on drop");
+    }
+
+    /// The clear also runs while unwinding a panic — the load-bearing
+    /// property the sync coordinators lean on so a panicked pass can't
+    /// leave `is_syncing` latched and wedge `quiesce()`'s drain.
+    #[test]
+    fn clears_flag_while_unwinding_panic() {
+        let flag = AtomicBool::new(true);
+        let result = catch_unwind(AssertUnwindSafe(|| {
+            let _guard = AtomicFlagGuard::new(&flag);
+            panic!("boom while holding the guard");
+        }));
+        assert!(result.is_err(), "the panic propagated out of catch_unwind");
+        assert!(
+            !flag.load(Ordering::Acquire),
+            "Drop ran during unwinding and cleared the flag"
+        );
+    }
+}

From 6b2cd39e06ac565a22ff8609da61b2afd14b712b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:51:18 +0200
Subject: [PATCH 06/29] fix(platform-wallet): make coordinator passes
 cancellable + converge invariants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEC-001 (the big one): a `shutdown()` quiesce could time out solely because
a stalled in-flight pass pinned `is_syncing`: the `while is_syncing` drain
never cleared, the quiesce future was dropped *before* the thread join, and
the `!Send` coordinator OS thread was left ALIVE — later firing host
callbacks through freed memory. Root-cause fix: race the pass body against
cancellation inside each coordinator's own loop:

    tokio::select! {
        biased;
        _ = cancel.cancelled() => break,
        _ = this.sync_now(..) => {}
    }

so `stop()`/`quiesce()` cancelling the token drops the stalled `sync_now`
future *on the coordinator thread*, which unwinds to its `is_syncing`
`AtomicFlagGuard` and clears the flag promptly. The drain then frees and the
join lands far inside the timeout — the timeout can no longer strand a live
thread. Invariants preserved: the guard is constructed before any `.await`
so a cancel-drop always clears `is_syncing`; the completion-event dispatch
is the synchronous tail after the last `.await`, so it either runs in full
(then clears) or is skipped on cancel — never torn; idempotency and the
drain barrier are untouched. The inter-pass sleep was already cancel-raced.

MEDIUM-4 (RUST-002): RAII-guard `quiescing` in all three `quiesce()`
implementations via `AtomicFlagGuard`, dropping the manual `store(false)`.
A timed-out quiesce no longer latches the gate `true` and silently bails
every future pass. Reopening on drop is safe because `stop()` has already
cancelled the loop.

MEDIUM-3 (SEC-005/CALL-001): give `PlatformAddressSyncManager` the
`background_generation` counter its siblings already have — bump it (AcqRel)
in `start()` and gate the thread-exit `*background_cancel = None` on
`generation == my_gen`, so a stop()+start() reschedule can't have an exiting
thread strip the new generation's token.

SEC-003: swap the `background_cancel`/`background_join` std-Mutex
`.lock().expect("… poisoned")` calls for `.lock().unwrap_or_else(|e|
e.into_inner())` across all three coordinators, so one prior panic can't
cascade into an abort on the teardown path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/manager/identity_sync.rs              | 39 ++++++++++--
 .../src/manager/platform_address_sync.rs      | 61 ++++++++++++++++---
 .../src/manager/shielded_sync.rs              | 39 ++++++++++--
 3 files changed, 120 insertions(+), 19 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 34bf0fefc7..ae2143a574 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -403,7 +403,10 @@ where
     /// The first pass runs immediately; subsequent passes fire every
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
-        let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+        let mut cancel_guard = self
+            .background_cancel
+            .lock()
+            .unwrap_or_else(|e| e.into_inner());
         if cancel_guard.is_some() {
             return;
         }
@@ -422,7 +425,22 @@ where
                             break;
                         }
 
-                        this.sync_now().await;
+                        // Race the in-flight pass against cancellation.
+                        // `stop()` / `quiesce()` cancel the token; with
+                        // `biased` the cancel arm is polled first, so a
+                        // pass stalled on a hung SDK fetch is dropped at
+                        // its `.await` the instant we cancel. Dropping the
+                        // `sync_now` future unwinds to the `is_syncing`
+                        // `AtomicFlagGuard` it holds, clearing the flag
+                        // promptly — so `quiesce()`'s drain loop frees and
+                        // the join lands well inside `shutdown()`'s
+                        // timeout. A stalled pass can no longer strand a
+                        // live `!Send` thread past `shutdown()`.
+                        tokio::select! {
+                            biased;
+                            _ = cancel.cancelled() => break,
+                            _ = this.sync_now() => {}
+                        }
 
                         let interval = this.interval();
                         tokio::select! {
@@ -444,7 +462,10 @@ where
         // Store the join handle while still holding cancel_guard — a
         // concurrent quiesce() must wait for this lock before calling
         // stop(), so the handle is always stored before it can be taken.
-        *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+        *self
+            .background_join
+            .lock()
+            .unwrap_or_else(|e| e.into_inner()) = Some(join);
         // cancel_guard drops here, releasing background_cancel.
     }
 
@@ -460,7 +481,7 @@ where
         if let Some(token) = self
             .background_cancel
             .lock()
-            .expect("bg_cancel poisoned")
+            .unwrap_or_else(|e| e.into_inner())
             .take()
         {
             token.cancel();
@@ -493,15 +514,21 @@ where
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
         self.quiescing.store(true, Ordering::Release);
+        // RAII gate: resets `quiescing` on *every* exit path — a normal
+        // return, a timed-out `shutdown()` dropping this future, or a
+        // panic. Without it a quiesce that doesn't run to completion
+        // leaves the gate latched `true`, silently bailing every future
+        // pass. Reopening on drop is safe because `stop()` (below) has
+        // already cancelled the loop, so no new pass can start.
+        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
         self.stop();
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(20)).await;
         }
-        self.quiescing.store(false, Ordering::Release);
         let handle = self
             .background_join
             .lock()
-            .expect("bg_join poisoned")
+            .unwrap_or_else(|e| e.into_inner())
             .take();
         super::join_coordinator_thread(handle).await
     }
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index ddd58fcb44..28987bd9c5 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -104,6 +104,14 @@ pub struct PlatformAddressSyncManager {
     /// confirm the `!Send` loop fully exited before the host drops the
     /// runtime.
     background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
+    /// Monotonically increasing generation counter. Bumped on every
+    /// `start()` so the exiting thread can tell whether its generation is
+    /// still the active one before clearing `background_cancel`. Without
+    /// this guard a tight `stop()` → `start()` reschedule lets the prior
+    /// thread's cleanup strip the *new* generation's token, leaving the
+    /// new loop running but untrackable via `is_running()` / `stop()`.
+    /// Mirrors the identity / shielded coordinators.
+    background_generation: AtomicU64,
     interval_secs: AtomicU64,
     is_syncing: AtomicBool,
     /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -133,6 +141,7 @@ impl PlatformAddressSyncManager {
             event_manager,
             background_cancel: StdMutex::new(None),
             background_join: StdMutex::new(None),
+            background_generation: AtomicU64::new(0),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
             quiescing: AtomicBool::new(false),
@@ -203,12 +212,19 @@ impl PlatformAddressSyncManager {
     /// The first pass runs immediately; subsequent passes fire every
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
-        let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+        let mut cancel_guard = self
+            .background_cancel
+            .lock()
+            .unwrap_or_else(|e| e.into_inner());
         if cancel_guard.is_some() {
             return;
         }
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
+        // Bump the generation while we still hold the slot lock so any
+        // prior thread's cleanup observes `current_gen != my_gen` ordered
+        // against this token swap.
+        let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
 
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
@@ -221,7 +237,22 @@ impl PlatformAddressSyncManager {
                             break;
                         }
 
-                        this.sync_now().await;
+                        // Race the in-flight pass against cancellation.
+                        // `stop()` / `quiesce()` cancel the token; with
+                        // `biased` the cancel arm is polled first, so a
+                        // pass stalled on a hung SDK fetch is dropped at
+                        // its `.await` the instant we cancel. Dropping the
+                        // `sync_now` future unwinds to the `is_syncing`
+                        // `AtomicFlagGuard` it holds, clearing the flag
+                        // promptly — so `quiesce()`'s drain loop frees and
+                        // the join lands well inside `shutdown()`'s
+                        // timeout. A stalled pass can no longer strand a
+                        // live `!Send` thread past `shutdown()`.
+                        tokio::select! {
+                            biased;
+                            _ = cancel.cancelled() => break,
+                            _ = this.sync_now() => {}
+                        }
 
                         let interval = this.interval();
                         tokio::select! {
@@ -230,8 +261,15 @@ impl PlatformAddressSyncManager {
                         }
                     }
 
+                    // Only clear the slot if no newer start() has
+                    // installed a replacement token since we launched —
+                    // mirrors the identity / shielded coordinators so a
+                    // stop() → start() reschedule can't have this exiting
+                    // thread strip the new generation's cancel token.
                     if let Ok(mut guard) = this.background_cancel.lock() {
-                        *guard = None;
+                        if this.background_generation.load(Ordering::Acquire) == my_gen {
+                            *guard = None;
+                        }
                     }
                 });
             })
@@ -239,7 +277,10 @@ impl PlatformAddressSyncManager {
         // Store the join handle while still holding cancel_guard — a
         // concurrent quiesce() must wait for this lock before calling
         // stop(), so the handle is always stored before it can be taken.
-        *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+        *self
+            .background_join
+            .lock()
+            .unwrap_or_else(|e| e.into_inner()) = Some(join);
         // cancel_guard drops here, releasing background_cancel.
     }
 
@@ -256,7 +297,7 @@ impl PlatformAddressSyncManager {
         if let Some(token) = self
             .background_cancel
             .lock()
-            .expect("bg_cancel poisoned")
+            .unwrap_or_else(|e| e.into_inner())
             .take()
         {
             token.cancel();
@@ -290,15 +331,21 @@ impl PlatformAddressSyncManager {
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
         self.quiescing.store(true, Ordering::Release);
+        // RAII gate: resets `quiescing` on *every* exit path — a normal
+        // return, a timed-out `shutdown()` dropping this future, or a
+        // panic. Without it a quiesce that doesn't run to completion
+        // leaves the gate latched `true`, silently bailing every future
+        // pass. Reopening on drop is safe because `stop()` (below) has
+        // already cancelled the loop, so no new pass can start.
+        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
         self.stop();
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(20)).await;
         }
-        self.quiescing.store(false, Ordering::Release);
         let handle = self
             .background_join
             .lock()
-            .expect("bg_join poisoned")
+            .unwrap_or_else(|e| e.into_inner())
             .take();
         super::join_coordinator_thread(handle).await
     }
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 502d1ae733..accaca69d0 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -230,7 +230,10 @@ impl ShieldedSyncManager {
     /// GRPC client state isn't `Send + Sync`). Same trade-off as
     /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start).
     pub fn start(self: Arc<Self>) {
-        let mut cancel_guard = self.background_cancel.lock().expect("bg_cancel poisoned");
+        let mut cancel_guard = self
+            .background_cancel
+            .lock()
+            .unwrap_or_else(|e| e.into_inner());
         if cancel_guard.is_some() {
             return;
         }
@@ -259,7 +262,22 @@ impl ShieldedSyncManager {
                         // chunk every interval. User-initiated
                         // syncs pass `force=true` to the FFI
                         // entry point below and bypass this.
-                        this.sync_now(false).await;
+                        //
+                        // Race the pass against cancellation. `stop()` /
+                        // `quiesce()` cancel the token; with `biased` the
+                        // cancel arm is polled first, so a pass stalled on
+                        // a hung SDK fetch is dropped at its `.await` the
+                        // instant we cancel. Dropping the `sync_now` future
+                        // unwinds to the `is_syncing` `AtomicFlagGuard` it
+                        // holds, clearing the flag promptly — so the drain
+                        // loop in `quiesce()` frees and the join lands well
+                        // inside `shutdown()`'s timeout. A stalled pass can
+                        // no longer strand a live `!Send` thread.
+                        tokio::select! {
+                            biased;
+                            _ = cancel.cancelled() => break,
+                            _ = this.sync_now(false) => {}
+                        }
 
                         let interval = this.interval();
                         tokio::select! {
@@ -285,7 +303,10 @@ impl ShieldedSyncManager {
         // Store the join handle while still holding cancel_guard — a
         // concurrent quiesce() must wait for this lock before calling
         // stop(), so the handle is always stored before it can be taken.
-        *self.background_join.lock().expect("bg_join poisoned") = Some(join);
+        *self
+            .background_join
+            .lock()
+            .unwrap_or_else(|e| e.into_inner()) = Some(join);
         // cancel_guard drops here, releasing background_cancel.
     }
 
@@ -301,7 +322,7 @@ impl ShieldedSyncManager {
         if let Some(token) = self
             .background_cancel
             .lock()
-            .expect("bg_cancel poisoned")
+            .unwrap_or_else(|e| e.into_inner())
             .take()
         {
             token.cancel();
@@ -333,15 +354,21 @@ impl ShieldedSyncManager {
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
         self.quiescing.store(true, Ordering::Release);
+        // RAII gate: resets `quiescing` on *every* exit path — a normal
+        // return, a timed-out `shutdown()` / Clear dropping this future,
+        // or a panic. Without it a quiesce that doesn't run to completion
+        // leaves the gate latched `true`, silently bailing every future
+        // pass. Reopening on drop is safe because `stop()` (below) has
+        // already cancelled the loop, so no new pass can start.
+        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
         self.stop();
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(20)).await;
         }
-        self.quiescing.store(false, Ordering::Release);
         let handle = self
             .background_join
             .lock()
-            .expect("bg_join poisoned")
+            .unwrap_or_else(|e| e.into_inner())
             .take();
         super::join_coordinator_thread(handle).await
     }

From 13a22dd7ca65a885d1eb1d0fa38acd5b91684920 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:54:38 +0200
Subject: [PATCH 07/29] fix(platform-wallet): bound clear_shielded + tidy
 shutdown docs/logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEC-002: `clear_shielded()` now wraps its `quiesce()` in the same
`SHUTDOWN_JOIN_TIMEOUT_SECS` backstop `shutdown()` uses, so a stalled
in-flight pass can't hang Clear forever. The const is now `pub` (and
re-exported from the crate root) so the FFI shielded-stop bridge can reuse
it; its doc + the `shutdown()` doc now describe it as a backstop and note
that cancellation is what makes the drain prompt.

SEC-004: bind the event-adapter join handle to a local before the join
`.await`, so the `tokio::Mutex` guard (previously a match-scrutinee
temporary) isn't held across the up-to-30s join.

PROJ-004: drop the lone `tracing::warn!` for the adapter join error inside
`shutdown()` — the returned status already carries it and the FFI `destroy`
adapter logs the aggregate once, so all four workers are now uniform.

RUST-004: rewrite the `shutdown()` `assert!` message (and the matching
docs) to name the real constraint — the coordinator OS threads each run
`Handle::block_on` and need the multi-thread runtime's timer/IO driver —
instead of blaming `spawn_blocking`, which works fine on current_thread.

PROJ-006: fix the `all_clean()` rustdoc (Stopped/Timeout/Error also make it
false, not just panics). PROJ-003: drop the dangling ephemeral `(F-6)` and
`F-2`/`F-3`/`F-7` + `(1)/(2)/(4)/(5)/(6)` markers, replacing with
self-describing prose. SEC-003: note the unwind-vs-abort caveat on the
`shutdown()` panic-safety guarantee.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 packages/rs-platform-wallet/src/lib.rs        |   2 +-
 .../rs-platform-wallet/src/manager/mod.rs     | 116 ++++++++++++------
 2 files changed, 80 insertions(+), 38 deletions(-)

diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs
index 289a71378f..dd12883fc7 100644
--- a/packages/rs-platform-wallet/src/lib.rs
+++ b/packages/rs-platform-wallet/src/lib.rs
@@ -44,7 +44,7 @@ pub use manager::platform_address_sync::{
     PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome,
     DEFAULT_SYNC_INTERVAL_SECS,
 };
-pub use manager::PlatformWalletManager;
+pub use manager::{PlatformWalletManager, SHUTDOWN_JOIN_TIMEOUT_SECS};
 pub use spv::SpvRuntime;
 pub use wallet::asset_lock::manager::AssetLockManager;
 pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock};
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 717ad0a03c..6fa26902f8 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -151,9 +151,11 @@ pub struct CoordinatorExitStatus {
 }
 
 impl CoordinatorExitStatus {
-    /// `true` when every worker wound down without a panic (each is
+    /// `true` only when every worker is
     /// [`Ok`](CoordinatorThreadStatus::Ok) or
-    /// [`NotRunning`](CoordinatorThreadStatus::NotRunning)).
+    /// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any
+    /// `Stopped`, `Panicked`, `Timeout`, or `Error` slot makes it
+    /// `false`.
     pub fn all_clean(&self) -> bool {
         self.platform_address_sync.is_clean()
             && self.identity_sync.is_clean()
@@ -173,8 +175,12 @@ impl CoordinatorExitStatus {
 /// still alive guarantees the `!Send` loop has stopped touching
 /// `tokio::time` before the host drops the runtime.
 ///
-/// **Requires a multi-thread runtime** — `spawn_blocking` is not
-/// available on `current_thread` runtimes and will panic there.
+/// **Requires a multi-thread runtime.** Each coordinator's OS thread
+/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on)
+/// and needs the runtime's timer/IO driver; a `current_thread` runtime
+/// can only service one `block_on` at a time, so joining one coordinator
+/// while the others (and `shutdown()` itself) are mid-`block_on` would
+/// deadlock. `shutdown()` asserts the multi-thread flavor up front.
 pub(crate) async fn join_coordinator_thread(
     handle: Option<std::thread::JoinHandle<()>>,
 ) -> CoordinatorThreadStatus {
@@ -186,7 +192,7 @@ pub(crate) async fn join_coordinator_thread(
         Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
         // spawn_blocking fails only when the runtime shuts down before
         // the blocking task can run — unreachable in normal operation
-        // since shutdown() is called while the runtime is alive (F-6).
+        // since shutdown() is called while the runtime is alive.
         Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")),
     }
 }
@@ -203,12 +209,18 @@ fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
     }
 }
 
-/// Maximum time (seconds) `shutdown()` waits for one coordinator's
-/// quiesce+join to complete. Under normal operation this deadline is
-/// never reached (the RAII `is_syncing` guard ensures the drain exits
-/// even on panic). On timeout the coordinator slot reports
-/// [`CoordinatorThreadStatus::Timeout`].
-const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
+/// Maximum time (seconds) the teardown paths — `shutdown()`,
+/// `clear_shielded`, and the FFI shielded-stop bridge — wait for one
+/// coordinator's quiesce+join to complete.
+///
+/// This is a backstop, not the primary stop mechanism. `quiesce()`
+/// cancels the loop, which aborts any in-flight pass at its `.await`
+/// point (see each coordinator's `start()` select), so the `is_syncing`
+/// drain clears promptly and the join normally lands far inside this
+/// window. The deadline fires only if a pass's *drop* itself wedges
+/// (e.g. a blocking destructor); on timeout the coordinator slot reports
+/// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
+pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
 
 impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// Create a new PlatformWalletManager.
@@ -403,7 +415,17 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// must not commit its own persistence wipe in that case.
     #[cfg(feature = "shielded")]
     pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
-        self.shielded_sync_manager.quiesce().await;
+        // Bound the quiesce with the same backstop `shutdown()` uses so a
+        // stalled in-flight pass can't hang Clear forever — cancellation
+        // makes the drain prompt; this timeout only matters if a pass's
+        // drop wedges. The terminal status isn't surfaced on the Clear
+        // path (the coordinator reset below is what can fail), so the
+        // timeout result is intentionally discarded.
+        let _ = tokio::time::timeout(
+            std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
+            self.shielded_sync_manager.quiesce(),
+        )
+        .await;
         if let Some(coord) = self.shielded_coordinator().await {
             coord.clear().await?;
         }
@@ -439,23 +461,35 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// [`CoordinatorExitStatus`] reports per-thread how each worker ended.
     ///
     /// **Precondition: must be called from a multi-thread Tokio runtime.**
-    /// `quiesce()` uses `spawn_blocking` internally; calling from a
-    /// `current_thread` runtime will panic (this is a real invariant
-    /// enforced in both debug and release builds).
+    /// Each coordinator's OS thread drives its loop via
+    /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs
+    /// the runtime's timer/IO driver; a `current_thread` runtime can only
+    /// service one `block_on` at a time, so the join would deadlock. This
+    /// is asserted in both debug and release builds.
     ///
     /// Each coordinator quiesce+join is bounded by
-    /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`]. If a coordinator does not exit
-    /// within that window, its slot reports
+    /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] as a backstop. `quiesce()` cancels
+    /// the loop, which aborts any in-flight pass at its `.await` point, so
+    /// the `is_syncing` drain clears promptly and the join normally lands
+    /// far inside the window — the deadline fires only if a pass's *drop*
+    /// itself wedges. On timeout the coordinator slot reports
     /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
-    /// Under normal operation (no infinite loops, RAII guard clears
-    /// `is_syncing` even on panic) this timeout is never reached.
+    ///
+    /// The clear-on-panic half of that guarantee rides on unwinding, so
+    /// it holds under `panic = "unwind"`. Under the iOS `panic = "abort"`
+    /// release profiles a pass panic aborts the process outright (no
+    /// `Drop`, no status) — there is no live manager left to read a
+    /// status from.
     pub async fn shutdown(&self) -> CoordinatorExitStatus {
         assert!(
             matches!(
                 tokio::runtime::Handle::current().runtime_flavor(),
                 tokio::runtime::RuntimeFlavor::MultiThread
             ),
-            "shutdown() requires a multi-thread Tokio runtime (spawn_blocking inside quiesce)"
+            "shutdown() requires a multi-thread Tokio runtime: each \
+             coordinator's OS thread drives its sync loop via \
+             Handle::block_on and needs the runtime's timer/IO driver, but \
+             a current_thread runtime can only drive one block_on at a time"
         );
 
         let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
@@ -484,12 +518,19 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         // stores), so cancel + join it last — after the loops feeding it
         // are gone.
         self.event_adapter_cancel.cancel();
-        let event_adapter = match self.event_adapter_join.lock().await.take() {
+        // Take the handle out into a local first so the `tokio::Mutex`
+        // guard doesn't stay held across the (up-to-30s) join `.await`
+        // below — a match scrutinee temporary would otherwise keep the
+        // guard alive for the whole match.
+        let event_adapter_handle = self.event_adapter_join.lock().await.take();
+        let event_adapter = match event_adapter_handle {
             None => CoordinatorThreadStatus::NotRunning,
             Some(handle) => match tokio::time::timeout(timeout, handle).await {
                 Ok(Ok(())) => CoordinatorThreadStatus::Ok,
+                // The returned status already carries this failure, and the
+                // FFI `destroy` adapter logs the aggregate once at the host
+                // layer — so don't double-log here.
                 Ok(Err(e)) => {
-                    tracing::warn!(error = ?e, "Wallet event adapter task join error");
                     if e.is_panic() {
                         CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
                     } else {
@@ -560,7 +601,8 @@ mod tests {
     }
 
     /// Build a manager that fires a slow (300 ms std::thread::sleep) callback
-    /// on `on_platform_address_sync_completed`. Used by F-2 drain test.
+    /// on `on_platform_address_sync_completed`. Used by the in-flight-pass
+    /// drain test.
     fn make_manager_with_slow_handler(
         started: Arc<AtomicBool>,
         completed: Arc<AtomicBool>,
@@ -592,10 +634,10 @@ mod tests {
         Arc::clone(&m.shielded_sync_manager).start();
     }
 
-    /// (1)+(5)+(6) Happy path: `shutdown()` joins every started worker
-    /// and reports `Ok`; it completes within a bounded time (no
-    /// `spawn_blocking` starvation/deadlock); a second `shutdown()` finds
-    /// nothing left to join (`NotRunning`) — idempotent.
+    /// Happy path: `shutdown()` joins every started worker and reports
+    /// `Ok`; it completes within a bounded time (no `spawn_blocking`
+    /// starvation/deadlock); a second `shutdown()` finds nothing left to
+    /// join (`NotRunning`) — idempotent.
     #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
     async fn shutdown_joins_all_workers_reports_ok_and_is_idempotent() {
         let manager = make_manager();
@@ -627,7 +669,7 @@ mod tests {
         assert!(again.all_clean());
     }
 
-    /// (2) Never-started coordinators report `NotRunning` (no thread to
+    /// Never-started coordinators report `NotRunning` (no thread to
     /// join). The event adapter is spawned in `new`, so it still joins
     /// `Ok`.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
@@ -651,7 +693,7 @@ mod tests {
         assert!(status.all_clean());
     }
 
-    /// (4) A coordinator thread that panics surfaces as `Panicked` rather
+    /// A coordinator thread that panics surfaces as `Panicked` rather
     /// than being silently dropped.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
     async fn join_coordinator_thread_surfaces_panic() {
@@ -759,9 +801,9 @@ mod tests {
         );
     }
 
-    /// F-7: `join_coordinator_thread` uses `spawn_blocking` internally.
-    /// Verify it completes without deadlock within a bounded time when
-    /// called from a multi-thread runtime, as `shutdown()` requires.
+    /// `join_coordinator_thread` uses `spawn_blocking` internally. Verify
+    /// it completes without deadlock within a bounded time when called
+    /// from a multi-thread runtime, as `shutdown()` requires.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
     async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() {
         let handle = std::thread::spawn(|| {});
@@ -774,8 +816,8 @@ mod tests {
         assert_eq!(result, CoordinatorThreadStatus::Ok);
     }
 
-    /// F-2: `shutdown()` must wait for an in-flight sync pass to drain
-    /// before joining the coordinator thread.
+    /// `shutdown()` must wait for an in-flight sync pass to drain before
+    /// joining the coordinator thread.
     ///
     /// A slow `on_platform_address_sync_completed` callback (300 ms)
     /// keeps `is_syncing=true` while it runs. We call `shutdown()` while
@@ -819,9 +861,9 @@ mod tests {
         );
     }
 
-    /// F-3 (strengthened): race regression — start coordinators with a
-    /// long sleep interval so they spend nearly all their time in a live
-    /// `tokio::time::sleep`, then `shutdown()` and drop the runtime.
+    /// Race regression — start coordinators with a long sleep interval so
+    /// they spend nearly all their time in a live `tokio::time::sleep`,
+    /// then `shutdown()` and drop the runtime.
     ///
     /// With the thread join in `shutdown()` every coordinator has fully
     /// exited its `block_on` before `drop(runtime)` — no race possible.

From 93b89546ed7fac0964d7ae6e7dd3fa12b931b944 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 12:56:01 +0200
Subject: [PATCH 08/29] fix(platform-wallet-ffi): timeout-bound the shielded
 sync stop bridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEC-002: `platform_wallet_manager_shielded_sync_stop` blocked on a bare
`quiesce()`, so a stalled in-flight pass could hang the host's stop call
forever. Wrap the quiesce in `tokio::time::timeout` reusing the library's
`SHUTDOWN_JOIN_TIMEOUT_SECS` backstop — same guarantee as `shutdown()`.
Cancellation makes the drain prompt; the timeout only matters if a pass's
drop wedges. The C signature is unchanged and the quiesce/timeout outcome
is still discarded (the function returns `ok` as before) — we only need the
call not to hang.

Add `tokio/time` to the crate's direct features rather than leaning on
`platform-wallet` pulling it in transitively (the crate now calls
`tokio::time::timeout` directly).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 packages/rs-platform-wallet-ffi/Cargo.toml         |  2 +-
 .../rs-platform-wallet-ffi/src/shielded_sync.rs    | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/packages/rs-platform-wallet-ffi/Cargo.toml b/packages/rs-platform-wallet-ffi/Cargo.toml
index 8a2bd4ef2b..7e60b05d69 100644
--- a/packages/rs-platform-wallet-ffi/Cargo.toml
+++ b/packages/rs-platform-wallet-ffi/Cargo.toml
@@ -22,7 +22,7 @@ rs-sdk-ffi = { path = "../rs-sdk-ffi" }
 once_cell = "1.19"
 parking_lot = { version = "0.12", features = ["send_guard"] }
 lazy_static = "1.4"
-tokio = { version = "1", features = ["rt-multi-thread"] }
+tokio = { version = "1", features = ["rt-multi-thread", "time"] }
 tokio-metrics = { workspace = true, optional = true }
 
 # Core dependencies (for Network type)
diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
index 2d58d8165f..da285e422e 100644
--- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
+++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
@@ -88,7 +88,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
     handle: Handle,
 ) -> PlatformWalletFFIResult {
     let option = PLATFORM_WALLET_MANAGER_STORAGE.with_item(handle, |manager| {
-        runtime().block_on(manager.shielded_sync().quiesce());
+        runtime().block_on(async {
+            // Bound the quiesce with the same backstop `shutdown()` uses so
+            // a stalled in-flight pass can't hang the host's stop call
+            // forever. Cancellation makes the drain prompt; this only
+            // matters if a pass's drop wedges. The terminal status is
+            // discarded — the C ABI exposes none of it, we only need the
+            // drain not to wedge.
+            let _ = tokio::time::timeout(
+                Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS),
+                manager.shielded_sync().quiesce(),
+            )
+            .await;
+        });
     });
     unwrap_option_or_return!(option);
     PlatformWalletFFIResult::ok()

From 2bd9501a0edde17c2c1bc3c6d8f6844eca46a973 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:13:38 +0200
Subject: [PATCH 09/29] fix(platform-wallet)!: close residual
 coordinator-thread UAF on shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the spawn_blocking-based join in join_coordinator_thread with an
is_finished() poll loop that awaits a 5ms sleep each step. spawn_blocking
tasks cannot be cancelled once started, so the prior approach left the
blocking join alive past the tokio::time::timeout wrapping quiesce() —
defeating the timeout boundary. Polling yields at each .await so the
external timeout is truly binding (threads are confirmed-exited or the
caller times out).

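Each coordinator's start() now drains any handle left by a prior stop()
(is_finished spin-wait, 1s bound) before overwriting background_join, so a
stop()->start() reschedule no longer silently detaches a live, untracked
thread that shutdown() would miss. If the prior thread has not exited within
the 1s bound, start() logs a warning and detaches it so the restart is not
blocked.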

FFI platform_wallet_manager_destroy now returns the new
ErrorShutdownIncomplete (19) when shutdown is not all-clean, signalling the
host must not immediately free the callback context — a lingering
coordinator may still fire one final callback. The C ABI is unchanged
(additive enum variant + degraded-path return code).

Tests: deterministic Stopped path via spawn(pending).abort() -> asserts
Stopped(_) and !is_clean(); race test uses per-iteration catch_unwind
instead of a process-global panic hook.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-platform-wallet-ffi/src/error.rs  |   9 ++
 .../rs-platform-wallet-ffi/src/manager.rs     |  13 +-
 .../src/manager/identity_sync.rs              |  32 ++++
 .../rs-platform-wallet/src/manager/mod.rs     | 142 ++++++++++++------
 .../src/manager/platform_address_sync.rs      |  32 ++++
 .../src/manager/shielded_sync.rs              |  32 ++++
 6 files changed, 211 insertions(+), 49 deletions(-)

diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs
index de1a6cb944..b50b5d79c5 100644
--- a/packages/rs-platform-wallet-ffi/src/error.rs
+++ b/packages/rs-platform-wallet-ffi/src/error.rs
@@ -125,6 +125,15 @@ pub enum PlatformWalletFFIResultCode {
     /// and could double-send if the original spend landed.
     ErrorShieldedSpendUnconfirmed = 18,
 
+    /// One or more background coordinator threads did not exit cleanly before
+    /// the 30 s join deadline. The host **must not** free the callback context
+    /// immediately — a lingering thread may still hold a reference to it and
+    /// fire one final callback. Either keep the context alive for a further
+    /// grace period, or accept the potential (but statistically tiny) race.
+    /// This is distinct from a normal operation error; the manager IS torn
+    /// down; the host should not retry `destroy`.
+    ErrorShutdownIncomplete = 19,
+
     NotFound = 98, // Used exclusively for all the Option that are retuned as errors
     ErrorUnknown = 99,
 }
diff --git a/packages/rs-platform-wallet-ffi/src/manager.rs b/packages/rs-platform-wallet-ffi/src/manager.rs
index d09d98a1e8..986103ab47 100644
--- a/packages/rs-platform-wallet-ffi/src/manager.rs
+++ b/packages/rs-platform-wallet-ffi/src/manager.rs
@@ -367,7 +367,18 @@ pub unsafe extern "C" fn platform_wallet_manager_destroy(
         if !status.all_clean() {
             tracing::warn!(
                 ?status,
-                "platform wallet coordinator(s) did not exit cleanly"
+                "platform wallet coordinator(s) did not exit cleanly; \
+                 host must not free the callback context immediately"
+            );
+            // Return a distinct non-ok code so the host can delay freeing
+            // its callback context. A lingering coordinator thread (e.g. one
+            // that timed out) still holds an Arc to the event handler and may
+            // fire one final callback through the host-owned context pointer;
+            // returning ok() here would signal that the context is safe to
+            // free when it may not be yet.
+            return PlatformWalletFFIResult::err(
+                PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+                format!("coordinator(s) did not exit cleanly: {status:?}"),
             );
         } else {
             tracing::debug!(?status, "platform wallet coordinators joined cleanly");
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index ae2143a574..6e87261e0a 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -410,6 +410,38 @@ where
         if cancel_guard.is_some() {
             return;
         }
+
+        // Drain any handle left by a prior stop() call. stop() takes-and-cancels
+        // the token but never touches background_join, so a stop()→start()
+        // sequence would otherwise overwrite (detach) the old handle —
+        // shutdown() would then miss that thread and join() only the new one.
+        // The old thread was already cancellation-signalled, so is_finished()
+        // becomes true within a few milliseconds; we spin-wait to guarantee
+        // no detached thread can fire callbacks after destroy() returns.
+        {
+            let prior = self
+                .background_join
+                .lock()
+                .unwrap_or_else(|e| e.into_inner())
+                .take();
+            if let Some(h) = prior {
+                let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+                while !h.is_finished() {
+                    if std::time::Instant::now() >= deadline {
+                        tracing::warn!(
+                            "identity-sync prior thread did not finish within 1 s \
+                             after cancellation; detaching to unblock start()"
+                        );
+                        break; // Drop h — detaches; thread was already cancelled.
+                    }
+                    std::thread::sleep(std::time::Duration::from_millis(5));
+                }
+                if h.is_finished() {
+                    let _ = h.join(); // Reap resources; near-instant since finished.
+                }
+            }
+        }
+
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
         let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 6fa26902f8..a9569dd00e 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -168,13 +168,20 @@ impl CoordinatorExitStatus {
 ///
 /// Called from each coordinator's `quiesce()` after cancelling the
 /// loop and draining any in-flight pass, so the thread is already on
-/// its way out and the join is near-instant. The blocking
-/// [`JoinHandle::join`](std::thread::JoinHandle::join) runs on the
-/// blocking pool (via [`spawn_blocking`](tokio::task::spawn_blocking))
-/// so the async executor stays unblocked. Joining while the runtime is
-/// still alive guarantees the `!Send` loop has stopped touching
+/// its way out and the join is near-instant. Joining while the runtime
+/// is still alive guarantees the `!Send` loop has stopped touching
 /// `tokio::time` before the host drops the runtime.
 ///
+/// **Polling approach**: we poll [`JoinHandle::is_finished`] in 5 ms
+/// steps rather than wrapping `handle.join()` in
+/// [`spawn_blocking`](tokio::task::spawn_blocking). The
+/// `spawn_blocking` approach spawns a blocking-pool task that cannot be
+/// cancelled once started — so dropping the timeout future that wraps
+/// `quiesce()` would leave the blocking task alive and `handle.join()`
+/// still running, defeating the timeout boundary. Polling lets the
+/// executor yield at each `.await` step so `tokio::time::timeout`
+/// wrapping `quiesce()` can truly interrupt this call.
+///
 /// **Requires a multi-thread runtime.** Each coordinator's OS thread
 /// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on)
 /// and needs the runtime's timer/IO driver; a `current_thread` runtime
@@ -187,13 +194,20 @@ pub(crate) async fn join_coordinator_thread(
     let Some(handle) = handle else {
         return CoordinatorThreadStatus::NotRunning;
     };
-    match tokio::task::spawn_blocking(move || handle.join()).await {
-        Ok(Ok(())) => CoordinatorThreadStatus::Ok,
-        Ok(Err(payload)) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
-        // spawn_blocking fails only when the runtime shuts down before
-        // the blocking task can run — unreachable in normal operation
-        // since shutdown() is called while the runtime is alive.
-        Err(join_err) => CoordinatorThreadStatus::Error(format!("join task failed: {join_err}")),
+    // Poll until the thread exits. The coordinator was already cancelled
+    // (stop() fires before quiesce() calls us), so is_finished() becomes
+    // true nearly immediately — typically within a single 5 ms step.
+    loop {
+        if handle.is_finished() {
+            return match handle.join() {
+                Ok(()) => CoordinatorThreadStatus::Ok,
+                Err(payload) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
+            };
+        }
+        // Yield to the executor so the outer tokio::time::timeout wrapping
+        // quiesce() can fire if the deadline has passed. Without this yield
+        // the loop would busy-spin and block the task.
+        tokio::time::sleep(std::time::Duration::from_millis(5)).await;
     }
 }
 
@@ -711,31 +725,44 @@ mod tests {
     /// tokio task is cancelled or aborted rather than completing normally.
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
     async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() {
-        // Build a manager but immediately abort the event adapter task so
-        // we trigger the non-panic JoinError path in shutdown().
+        // Replace the real adapter handle with a guaranteed-pending task, then
+        // abort it. A `pending::<()>()` future can never complete on its own,
+        // so abort() always produces a non-panic JoinError — deterministically
+        // exercising the Stopped branch regardless of scheduler timing.
+        // (The original approach aborted the real adapter handle, which could
+        // race the task's own completion and silently yield `Ok` instead.)
         let manager = make_manager();
-        // Abort the adapter task directly so the join sees a non-panic JoinError.
-        {
+
+        // Drain and discard the real adapter (may already be finished).
+        let original = {
             let mut guard = manager.event_adapter_join.lock().await;
-            if let Some(handle) = guard.take() {
-                handle.abort();
-                // Put it back so shutdown() sees it and exercises the error path.
-                *guard = Some(handle);
-            }
+            guard.take()
+        };
+        if let Some(h) = original {
+            h.abort();
+            let _ = h.await;
         }
-        // Give tokio a moment to process the abort.
-        tokio::time::sleep(Duration::from_millis(10)).await;
+
+        // Install a permanently-pending task and abort it so the JoinError
+        // path in shutdown() is 100 % deterministic.
+        let pending = tokio::spawn(std::future::pending::<()>());
+        pending.abort();
+        *manager.event_adapter_join.lock().await = Some(pending);
 
         let status = manager.shutdown().await;
-        // The adapter task was aborted → non-panic JoinError → Stopped.
-        match &status.event_adapter {
-            CoordinatorThreadStatus::Stopped(_) | CoordinatorThreadStatus::Ok => {
-                // Stopped is the expected path; Ok means it drained before abort — both
-                // are acceptable since abort() races the task completion.
-            }
-            other => panic!("expected Stopped or Ok (abort race), got {other:?}"),
-        }
-        // Regardless, all other workers were never started → clean.
+
+        // The aborted pending task always yields a non-panic JoinError →
+        // shutdown() maps it to Stopped.
+        assert!(
+            matches!(status.event_adapter, CoordinatorThreadStatus::Stopped(_)),
+            "expected Stopped from a non-panic JoinError, got {:?}",
+            status.event_adapter
+        );
+        assert!(
+            !status.event_adapter.is_clean(),
+            "Stopped must not count as clean"
+        );
+        // Coordinators were never started → their slots are clean.
         assert_eq!(
             status.platform_address_sync,
             CoordinatorThreadStatus::NotRunning
@@ -801,18 +828,18 @@ mod tests {
         );
     }
 
-    /// `join_coordinator_thread` uses `spawn_blocking` internally. Verify
-    /// it completes without deadlock within a bounded time when called
-    /// from a multi-thread runtime, as `shutdown()` requires.
+    /// `join_coordinator_thread` uses `is_finished()` polling. Verify
+    /// it completes within a bounded time on a multi-thread runtime, as
+    /// `shutdown()` requires (and that it doesn't busy-spin indefinitely).
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn join_coordinator_thread_no_deadlock_with_spawn_blocking() {
+    async fn join_coordinator_thread_completes_within_deadline() {
         let handle = std::thread::spawn(|| {});
         let result = tokio::time::timeout(
             Duration::from_secs(5),
             join_coordinator_thread(Some(handle)),
         )
         .await
-        .expect("join_coordinator_thread must complete within 5 s — no spawn_blocking deadlock");
+        .expect("join_coordinator_thread must complete within 5 s");
         assert_eq!(result, CoordinatorThreadStatus::Ok);
     }
 
@@ -871,15 +898,14 @@ mod tests {
     /// the join, the coordinator's `select!` wakeup (via tokio) would
     /// race the runtime teardown and reliably trigger the
     /// "Tokio … being shutdown" panic across the 10 iterations.
+    ///
+    /// Uses `std::panic::catch_unwind` around `drop(runtime)` rather than
+    /// a process-global panic hook; the hook would be live for seconds and
+    /// could swallow diagnostics from concurrently-running tests (e.g.
+    /// `join_coordinator_thread_surfaces_panic`).
     #[test]
     fn shutdown_then_drop_runtime_does_not_panic() {
         static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
-        let prev_hook = std::panic::take_hook();
-        std::panic::set_hook(Box::new(|info| {
-            if info.to_string().contains("being shutdown") {
-                SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst);
-            }
-        }));
 
         for _ in 0..10 {
             let runtime = tokio::runtime::Builder::new_multi_thread()
@@ -912,7 +938,27 @@ mod tests {
                 manager.shutdown().await
             });
 
-            drop(runtime);
+            // Wrap the runtime drop in catch_unwind to intercept the specific
+            // "A Tokio 1.x context ... being shutdown" panic without installing
+            // a process-wide hook that would suppress diagnostics from other
+            // concurrently running tests.
+            let drop_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+                drop(runtime);
+            }));
+            if let Err(payload) = drop_result {
+                let msg = payload
+                    .downcast_ref::<String>()
+                    .map(String::as_str)
+                    .or_else(|| payload.downcast_ref::<&str>().copied())
+                    .unwrap_or("");
+                if msg.contains("being shutdown") {
+                    SHUTDOWN_PANICS.fetch_add(1, AO::SeqCst);
+                } else {
+                    // Unexpected panic — propagate so the test fails loudly.
+                    std::panic::resume_unwind(payload);
+                }
+            }
+
             // Brief settle — any stray thread activity surfaces here.
             std::thread::sleep(Duration::from_millis(50));
 
@@ -921,12 +967,12 @@ mod tests {
             assert!(status.all_clean(), "workers did not wind down: {status:?}");
         }
 
-        let racing_panics = SHUTDOWN_PANICS.load(AO::SeqCst);
-        std::panic::set_hook(prev_hook);
         assert_eq!(
-            racing_panics, 0,
+            SHUTDOWN_PANICS.load(AO::SeqCst),
+            0,
             "dropping the runtime after shutdown raced a coordinator thread \
-             ({racing_panics} panics across 10 iterations)"
+             ({} panics across 10 iterations)",
+            SHUTDOWN_PANICS.load(AO::SeqCst)
         );
     }
 }
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 28987bd9c5..7e72f2fe74 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -219,6 +219,38 @@ impl PlatformAddressSyncManager {
         if cancel_guard.is_some() {
             return;
         }
+
+        // Drain any handle left by a prior stop() call. stop() takes-and-cancels
+        // the token but never touches background_join, so a stop()→start()
+        // sequence would otherwise overwrite (detach) the old handle —
+        // shutdown() would then miss that thread and join() only the new one.
+        // The old thread was already cancellation-signalled, so is_finished()
+        // becomes true within a few milliseconds; we spin-wait to guarantee
+        // no detached thread can fire callbacks after destroy() returns.
+        {
+            let prior = self
+                .background_join
+                .lock()
+                .unwrap_or_else(|e| e.into_inner())
+                .take();
+            if let Some(h) = prior {
+                let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+                while !h.is_finished() {
+                    if std::time::Instant::now() >= deadline {
+                        tracing::warn!(
+                            "platform-address-sync prior thread did not finish within 1 s \
+                             after cancellation; detaching to unblock start()"
+                        );
+                        break; // Drop h — detaches; thread was already cancelled.
+                    }
+                    std::thread::sleep(std::time::Duration::from_millis(5));
+                }
+                if h.is_finished() {
+                    let _ = h.join(); // Reap resources; near-instant since finished.
+                }
+            }
+        }
+
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
         // Bump the generation while we still hold the slot lock so any
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index accaca69d0..365b0be17b 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -237,6 +237,38 @@ impl ShieldedSyncManager {
         if cancel_guard.is_some() {
             return;
         }
+
+        // Drain any handle left by a prior stop() call. stop() takes-and-cancels
+        // the token but never touches background_join, so a stop()→start()
+        // sequence would otherwise overwrite (detach) the old handle —
+        // shutdown() would then miss that thread and join() only the new one.
+        // The old thread was already cancellation-signalled, so is_finished()
+        // becomes true within a few milliseconds; we spin-wait to guarantee
+        // no detached thread can fire callbacks after destroy() returns.
+        {
+            let prior = self
+                .background_join
+                .lock()
+                .unwrap_or_else(|e| e.into_inner())
+                .take();
+            if let Some(h) = prior {
+                let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+                while !h.is_finished() {
+                    if std::time::Instant::now() >= deadline {
+                        tracing::warn!(
+                            "shielded-sync prior thread did not finish within 1 s \
+                             after cancellation; detaching to unblock start()"
+                        );
+                        break; // Drop h — detaches; thread was already cancelled.
+                    }
+                    std::thread::sleep(std::time::Duration::from_millis(5));
+                }
+                if h.is_finished() {
+                    let _ = h.join(); // Reap resources; near-instant since finished.
+                }
+            }
+        }
+
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
         // Bump the generation while we still hold the slot lock so

From 7c975ed5a632c5de60eb32761748dc93f8a35416 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:18:21 +0200
Subject: [PATCH 10/29] fix(platform-wallet)!: surface non-clean shielded drain
 on clear/stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the destroy UAF-surfacing discipline (which already returns
ErrorShutdownIncomplete=19 on a non-clean shutdown) to the shielded
clear/stop paths, so a partial/timed-out coordinator drain can no
longer be silently swallowed.

- clear_shielded now captures the quiesce result instead of discarding
  it: on a timed-out or non-clean drain it returns the new typed
  PlatformWalletError::ShieldedShutdownIncomplete (carrying the terminal
  CoordinatorThreadStatus) and leaves the commitment-tree store INTACT,
  rather than unconditionally wiping a store an in-flight pass may still
  write into. The store is wiped only on a clean drain.
- FFI shielded_sync_stop now returns ErrorShutdownIncomplete (with the
  status rendered into the message) on a non-clean/timed-out drain,
  instead of always returning ok() — symmetric with destroy. A timeout
  is reported as the Timeout status.
- FFI shielded_clear maps the new ShieldedShutdownIncomplete variant to
  ErrorShutdownIncomplete (store-reset failures still map to
  ErrorWalletOperation); the blanket From<PlatformWalletError> gains the
  same arm, pinned by a unit test.
- Swift mirror gains errorShutdownIncomplete=19 plus a richer
  PlatformWalletError.shutdownIncomplete case, wired through both the
  init(ffi:) and init(result:) switches.
- Re-export CoordinatorThreadStatus / CoordinatorExitStatus from the
  crate root so the FFI can name the status type.

BREAKING CHANGE: clear_shielded / shielded_sync_stop / shielded_clear
now report a non-clean coordinator drain instead of succeeding silently;
hosts must defer freeing their callback context and must not commit their
own persistence wipe on ErrorShutdownIncomplete.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-platform-wallet-ffi/src/error.rs  | 31 +++++++++
 .../src/shielded_sync.rs                      | 66 +++++++++++++++----
 packages/rs-platform-wallet/src/error.rs      | 21 ++++++
 packages/rs-platform-wallet/src/lib.rs        |  5 +-
 .../rs-platform-wallet/src/manager/mod.rs     | 32 +++++++--
 .../PlatformWallet/PlatformWalletResult.swift | 16 +++++
 6 files changed, 149 insertions(+), 22 deletions(-)

diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs
index b50b5d79c5..5769ffcc43 100644
--- a/packages/rs-platform-wallet-ffi/src/error.rs
+++ b/packages/rs-platform-wallet-ffi/src/error.rs
@@ -246,6 +246,14 @@ impl From<PlatformWalletError> for PlatformWalletFFIResult {
             PlatformWalletError::ShieldedSpendUnconfirmed { .. } => {
                 PlatformWalletFFIResultCode::ErrorShieldedSpendUnconfirmed
             }
+            // A Clear that refused because the in-flight shielded pass didn't
+            // drain cleanly: surface it as ErrorShutdownIncomplete (symmetric
+            // with `platform_wallet_manager_destroy`) so the host defers
+            // freeing its callback context AND does not commit its own
+            // persistence wipe — the store was intentionally left intact.
+            PlatformWalletError::ShieldedShutdownIncomplete { .. } => {
+                PlatformWalletFFIResultCode::ErrorShutdownIncomplete
+            }
             _ => PlatformWalletFFIResultCode::ErrorUnknown,
         };
         PlatformWalletFFIResult::err(code, error.to_string())
@@ -604,6 +612,29 @@ mod tests {
         assert_eq!(msg, rendered, "Display payload must survive verbatim");
     }
 
+    /// A Clear that refused on a non-clean shielded drain must surface as
+    /// `ErrorShutdownIncomplete` (symmetric with `destroy`), not flatten to
+    /// `ErrorUnknown`, so the host knows to defer freeing its callback
+    /// context and to NOT commit its own persistence wipe. The typed Display
+    /// rendering (carrying the terminal coordinator status) survives verbatim.
+    #[test]
+    fn shielded_shutdown_incomplete_maps_to_dedicated_code() {
+        let err = PlatformWalletError::ShieldedShutdownIncomplete {
+            status: platform_wallet::CoordinatorThreadStatus::Timeout,
+        };
+        let rendered = err.to_string();
+        let result: PlatformWalletFFIResult = err.into();
+        assert_eq!(
+            result.code,
+            PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+            "ShieldedShutdownIncomplete should map to ErrorShutdownIncomplete (rendered: {rendered})"
+        );
+        let msg = unsafe { std::ffi::CStr::from_ptr(result.message) }
+            .to_string_lossy()
+            .into_owned();
+        assert_eq!(msg, rendered, "Display payload must survive verbatim");
+    }
+
     /// Other wallet-error variants without a dedicated FFI arm still
     /// fall through to `ErrorUnknown` while carrying the typed
     /// Display rendering as the message. Pin this so the catch-all
diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
index da285e422e..14082628e4 100644
--- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
+++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
@@ -68,12 +68,20 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_start(
 /// Stop the shielded sync manager and wait for any in-flight pass to
 /// drain before returning. No-op if not running.
 ///
-/// Uses `quiesce` rather than cancel-only stop, so on return: the loop
-/// is cancelled, no new pass will start, and any in-flight pass has
+/// Uses `quiesce` rather than cancel-only stop, so on a clean return: the
+/// loop is cancelled, no new pass will start, and any in-flight pass has
 /// fully drained — its **persistence callbacks have completed** (no
 /// note/sync-state row can be written after this returns) and its
 /// completion-event *dispatch* on the Rust side has run.
 ///
+/// Returns `ErrorShutdownIncomplete` instead of `Success` when that drain
+/// did **not** complete cleanly (the in-flight pass timed out on the join
+/// backstop, or the loop ended non-cleanly). The terminal coordinator
+/// status is rendered into the result message. On this code the host must
+/// **not** free the callback context immediately — a lingering pass may
+/// still fire one final callback through it (symmetric with
+/// `platform_wallet_manager_destroy`).
+///
 /// Caveat on host-observed events: a host that marshals the completion
 /// callback onto its own executor (e.g. the Swift trampoline hops it to
 /// the `@MainActor`) may still observe that final, already-dispatched
@@ -92,17 +100,36 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
             // Bound the quiesce with the same backstop `shutdown()` uses so
             // a stalled in-flight pass can't hang the host's stop call
             // forever. Cancellation makes the drain prompt; this only
-            // matters if a pass's drop wedges. The terminal status is
-            // discarded — the C ABI exposes none of it, we only need the
-            // drain not to wedge.
-            let _ = tokio::time::timeout(
+            // matters if a pass's drop wedges. A timeout (the future was
+            // dropped at the deadline) is reported as the non-clean
+            // `Timeout` status, matching `shutdown()`'s backstop
+            // substitution, so the host learns the drain may be incomplete.
+            match tokio::time::timeout(
                 Duration::from_secs(platform_wallet::SHUTDOWN_JOIN_TIMEOUT_SECS),
                 manager.shielded_sync().quiesce(),
             )
-            .await;
-        });
+            .await
+            {
+                Ok(status) => status,
+                Err(_elapsed) => platform_wallet::CoordinatorThreadStatus::Timeout,
+            }
+        })
     });
-    unwrap_option_or_return!(option);
+    let status = unwrap_option_or_return!(option);
+    // Symmetric with `platform_wallet_manager_destroy`: a non-clean drain
+    // means the shielded loop may still hold a reference to the host-owned
+    // event-handler / persister context and could fire one final callback,
+    // so signal the host to defer freeing that context rather than returning
+    // ok() and inviting a use-after-free.
+    if !status.is_clean() {
+        return PlatformWalletFFIResult::err(
+            PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+            format!(
+                "shielded sync stop did not drain cleanly ({status:?}); \
+                 host must not free the callback context immediately"
+            ),
+        );
+    }
     PlatformWalletFFIResult::ok()
 }
 
@@ -429,7 +456,9 @@ pub unsafe extern "C" fn platform_wallet_manager_configure_shielded(
 /// via the changeset path.
 ///
 /// Returns `ErrorWalletOperation` if the Rust-side store reset
-/// fails. The host **must** check this before wiping its own
+/// fails, or `ErrorShutdownIncomplete` if the in-flight sync pass
+/// did not drain cleanly first (in which case the store is left
+/// intact). The host **must** check this before wiping its own
 /// persistence: a silent failure would leave the shared tree
 /// populated while the host drops its rows, and the next cold
 /// resync would gate-skip every re-downloaded position against the
@@ -455,10 +484,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_clear(
     });
     let result = unwrap_option_or_return!(option);
     if let Err(e) = result {
-        return PlatformWalletFFIResult::err(
-            PlatformWalletFFIResultCode::ErrorWalletOperation,
-            format!("clear_shielded failed: {e}"),
-        );
+        // A non-clean / timed-out quiesce aborts the clear *before* the store
+        // is touched: surface it as ErrorShutdownIncomplete (symmetric with
+        // destroy / shielded_sync_stop) so the host defers freeing its
+        // callback context and does NOT commit its own persistence wipe — the
+        // store was intentionally left intact. Every other clear failure is a
+        // store-reset error → ErrorWalletOperation, as before.
+        let code = match &e {
+            platform_wallet::PlatformWalletError::ShieldedShutdownIncomplete { .. } => {
+                PlatformWalletFFIResultCode::ErrorShutdownIncomplete
+            }
+            _ => PlatformWalletFFIResultCode::ErrorWalletOperation,
+        };
+        return PlatformWalletFFIResult::err(code, format!("clear_shielded failed: {e}"));
     }
     PlatformWalletFFIResult::ok()
 }
diff --git a/packages/rs-platform-wallet/src/error.rs b/packages/rs-platform-wallet/src/error.rs
index c94cb7093d..196d2ee5b4 100644
--- a/packages/rs-platform-wallet/src/error.rs
+++ b/packages/rs-platform-wallet/src/error.rs
@@ -239,6 +239,27 @@ pub enum PlatformWalletError {
 
     #[error("Shielded sub-wallet not bound: call bind_shielded first")]
     ShieldedNotBound,
+
+    /// A Clear/wipe could not safely complete because the shielded sync
+    /// coordinator's in-flight pass did not drain cleanly first — it either
+    /// timed out on the join backstop or its loop ended non-cleanly
+    /// (cancelled / panicked). The shared commitment-tree store is therefore
+    /// **left intact** (not wiped): a still-running pass could re-persist
+    /// notes into the store immediately after a `clear()`, desyncing the
+    /// host's wiped rows from a repopulated tree and gate-skipping every
+    /// re-downloaded position on the next cold resync. The host **must not**
+    /// commit its own persistence wipe; retry Clear once the pass settles.
+    /// Carries the terminal [`CoordinatorThreadStatus`] for diagnostics.
+    ///
+    /// [`CoordinatorThreadStatus`]: crate::manager::CoordinatorThreadStatus
+    #[error(
+        "shielded clear aborted: sync coordinator did not drain cleanly \
+         ({status:?}); commitment-tree store left intact so an in-flight pass \
+         cannot re-persist into a wiped store — retry once the pass settles"
+    )]
+    ShieldedShutdownIncomplete {
+        status: crate::manager::CoordinatorThreadStatus,
+    },
 }
 
 /// Check whether an SDK error indicates that an InstantSend lock proof was
diff --git a/packages/rs-platform-wallet/src/lib.rs b/packages/rs-platform-wallet/src/lib.rs
index dd12883fc7..8b55948aa1 100644
--- a/packages/rs-platform-wallet/src/lib.rs
+++ b/packages/rs-platform-wallet/src/lib.rs
@@ -44,7 +44,10 @@ pub use manager::platform_address_sync::{
     PlatformAddressSyncManager, PlatformAddressSyncSummary, WalletSyncOutcome,
     DEFAULT_SYNC_INTERVAL_SECS,
 };
-pub use manager::{PlatformWalletManager, SHUTDOWN_JOIN_TIMEOUT_SECS};
+pub use manager::{
+    CoordinatorExitStatus, CoordinatorThreadStatus, PlatformWalletManager,
+    SHUTDOWN_JOIN_TIMEOUT_SECS,
+};
 pub use spv::SpvRuntime;
 pub use wallet::asset_lock::manager::AssetLockManager;
 pub use wallet::asset_lock::tracked::{AssetLockStatus, TrackedAssetLock};
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index a9569dd00e..2de6ad6d5a 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -425,21 +425,39 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// disk but its contents are reset to empty so the next bind cold-
     /// resyncs from index 0.
     ///
-    /// Returns an error if the coordinator's store reset fails; the host
-    /// must not commit its own persistence wipe in that case.
+    /// Returns an error — and leaves the store untouched — in two cases, so
+    /// the host knows **not** to commit its own persistence wipe:
+    /// - the in-flight sync pass did not drain cleanly (timed out on the join
+    ///   backstop, or its loop ended non-cleanly) →
+    ///   [`crate::error::PlatformWalletError::ShieldedShutdownIncomplete`]; or
+    /// - the coordinator's store reset itself fails.
     #[cfg(feature = "shielded")]
     pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
         // Bound the quiesce with the same backstop `shutdown()` uses so a
         // stalled in-flight pass can't hang Clear forever — cancellation
         // makes the drain prompt; this timeout only matters if a pass's
-        // drop wedges. The terminal status isn't surfaced on the Clear
-        // path (the coordinator reset below is what can fail), so the
-        // timeout result is intentionally discarded.
-        let _ = tokio::time::timeout(
+        // drop wedges. Unlike `shutdown()`, the terminal status is
+        // load-bearing HERE: a non-clean drain means the in-flight pass may
+        // still be running and could re-persist notes into the very store
+        // the `clear()` below is about to wipe. A timeout (the future was
+        // dropped at the deadline) is treated as the non-clean `Timeout`
+        // status, matching `shutdown()`'s backstop substitution.
+        let status = match tokio::time::timeout(
             std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
             self.shielded_sync_manager.quiesce(),
         )
-        .await;
+        .await
+        {
+            Ok(status) => status,
+            Err(_elapsed) => CoordinatorThreadStatus::Timeout,
+        };
+        // Only commit the store wipe once the in-flight pass has fully
+        // drained. Otherwise refuse: a partial/timed-out drain could let a
+        // surviving pass write into a store we just cleared, desyncing the
+        // host's own wipe from a repopulated tree.
+        if !status.is_clean() {
+            return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status });
+        }
         if let Some(coord) = self.shielded_coordinator().await {
             coord.clear().await?;
         }
diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
index 2c311f91e9..31ef07ad4a 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
@@ -39,6 +39,12 @@ public enum PlatformWalletResultCode: Int32, Sendable {
     /// outcome. Do NOT auto-retry — a retry would rebuild the bundle and
     /// could double-execute if the original landed.
     case errorShieldedSpendUnconfirmed = 18
+    /// A destroy/stop/clear call returned but a background coordinator did
+    /// not exit cleanly (timed out or ended non-cleanly). The host should defer
+    /// freeing its callback context — a lingering coordinator may still fire
+    /// one final callback through it — and, on the clear path, must NOT commit
+    /// its own persistence wipe (the clear was aborted; the Rust store is intact).
+    case errorShutdownIncomplete = 19
     case notFound = 98
     case errorUnknown = 99
 
@@ -82,6 +88,8 @@ public enum PlatformWalletResultCode: Int32, Sendable {
             self = .errorShieldedBroadcastUnconfirmed
         case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHIELDED_SPEND_UNCONFIRMED:
             self = .errorShieldedSpendUnconfirmed
+        case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_SHUTDOWN_INCOMPLETE:
+            self = .errorShutdownIncomplete
         case PLATFORM_WALLET_FFI_RESULT_CODE_NOT_FOUND:
             self = .notFound
         case PLATFORM_WALLET_FFI_RESULT_CODE_ERROR_UNKNOWN:
@@ -177,6 +185,12 @@ public enum PlatformWalletError: LocalizedError {
     /// notes reserved wallet-side (a shield reserves nothing) until the
     /// next sync reconciles the outcome. Do NOT auto-retry.
     case shieldedSpendUnconfirmed(String)
+    /// A destroy / stop / clear call returned but a background coordinator
+    /// did not exit cleanly. The host should defer freeing its callback context
+    /// (a lingering coordinator may still fire one final callback) and, on
+    /// the clear path, must NOT commit its own persistence wipe — the Rust
+    /// store was left intact so the clear can be retried once the pass settles.
+    case shutdownIncomplete(String)
     case notFound(String)
     case unknown(String)
 
@@ -192,6 +206,7 @@ public enum PlatformWalletError: LocalizedError {
              .arithmeticOverflow(let m), .noSelectableInputs(let m),
              .walletAlreadyExists(let m), .shieldedBroadcastFailed(let m),
              .shieldedBroadcastUnconfirmed(let m), .shieldedSpendUnconfirmed(let m),
+             .shutdownIncomplete(let m),
              .notFound(let m), .unknown(let m):
             return m
         }
@@ -222,6 +237,7 @@ public enum PlatformWalletError: LocalizedError {
         case .errorShieldedBroadcastFailed: self = .shieldedBroadcastFailed(detail)
         case .errorShieldedBroadcastUnconfirmed: self = .shieldedBroadcastUnconfirmed(detail)
         case .errorShieldedSpendUnconfirmed: self = .shieldedSpendUnconfirmed(detail)
+        case .errorShutdownIncomplete: self = .shutdownIncomplete(detail)
         case .notFound:               self = .notFound(detail)
         case .errorUnknown:           self = .unknown(detail)
         }

From 5f63c9544c84c44c7a62eeed14c73634a27e45e6 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:30:26 +0200
Subject: [PATCH 11/29] fix(platform-wallet): reap prior coordinator thread
 outside background_cancel lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All three coordinators (identity_sync, platform_address_sync,
shielded_sync) reaped the prior loop's OS thread inside start() WHILE
holding background_cancel. But the exiting prior thread's epilogue also
locks background_cancel to clear its slot, so a tight stop()→start()
deadlocked the reap: the prior thread blocked on the lock start() held,
never finished, and the is_finished() spin-wait burned the full 1 s
deadline then DETACHED the handle — a 1 s stall plus a transient
untracked thread, on the exact stop()→start() path the reap exists for.

Reorder start() to install the new cancel token + bump the generation
under the lock, then drop(cancel_guard) to release background_cancel,
and only THEN run the spin-wait + join. The prior thread's epilogue now
acquires the lock (or, for shielded, observes the bumped generation),
skips clearing the freshly-installed token, and returns, so is_finished()
trips in milliseconds and the join is near-instant. start() stays
synchronous; the 1 s deadline remains only as a genuine-wedge backstop.

Adds restart_after_stop_reaps_prior_thread regression tests to the
identity and platform-address coordinators: start → (stop+start
back-to-back) → assert the restart returns well under the 1 s deadline.
Verified non-vacuous — against the old lock-held ordering it stalls
~1.0 s and fails.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/manager/identity_sync.rs              | 132 ++++++++++++++----
 .../src/manager/platform_address_sync.rs      | 132 ++++++++++++++----
 .../src/manager/shielded_sync.rs              |  78 +++++++----
 3 files changed, 249 insertions(+), 93 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 6e87261e0a..9cc14ac831 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -411,36 +411,22 @@ where
             return;
         }
 
-        // Drain any handle left by a prior stop() call. stop() takes-and-cancels
-        // the token but never touches background_join, so a stop()→start()
-        // sequence would otherwise overwrite (detach) the old handle —
-        // shutdown() would then miss that thread and join() only the new one.
-        // The old thread was already cancellation-signalled, so is_finished()
-        // becomes true within a few milliseconds; we spin-wait to guarantee
-        // no detached thread can fire callbacks after destroy() returns.
-        {
-            let prior = self
-                .background_join
-                .lock()
-                .unwrap_or_else(|e| e.into_inner())
-                .take();
-            if let Some(h) = prior {
-                let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
-                while !h.is_finished() {
-                    if std::time::Instant::now() >= deadline {
-                        tracing::warn!(
-                            "identity-sync prior thread did not finish within 1 s \
-                             after cancellation; detaching to unblock start()"
-                        );
-                        break; // Drop h — detaches; thread was already cancelled.
-                    }
-                    std::thread::sleep(std::time::Duration::from_millis(5));
-                }
-                if h.is_finished() {
-                    let _ = h.join(); // Reap resources; near-instant since finished.
-                }
-            }
-        }
+        // Take any handle left by a prior stop() call so we can reap it — but
+        // DON'T join it here, while we still hold background_cancel. stop()
+        // takes-and-cancels the token but never touches background_join, so a
+        // stop()→start() sequence would otherwise overwrite (detach) the old
+        // handle and shutdown() would miss that thread. Joining it under
+        // background_cancel would DEADLOCK the reap into its 1 s backstop: the
+        // exiting prior thread's epilogue also locks background_cancel (to
+        // clear its slot), so it would block on the lock we hold → never
+        // finish → get detached on the exact stop()→start() path the reap
+        // exists for. We install the new token + bump the generation below,
+        // release the lock, and only THEN reap (at this fn's tail).
+        let prior = self
+            .background_join
+            .lock()
+            .unwrap_or_else(|e| e.into_inner())
+            .take();
 
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
@@ -498,7 +484,37 @@ where
             .background_join
             .lock()
             .unwrap_or_else(|e| e.into_inner()) = Some(join);
-        // cancel_guard drops here, releasing background_cancel.
+
+        // Release background_cancel BEFORE reaping the prior thread, so its
+        // epilogue can acquire the lock, observe the bumped generation, skip
+        // clearing our freshly-installed token, and return. Holding the lock
+        // across the join below is what would block the prior thread, spin
+        // the full 1 s deadline, and detach — the very stall this ordering
+        // removes.
+        drop(cancel_guard);
+
+        // Now reap the prior thread. It was already cancellation-signalled by
+        // stop(), and with the lock released its epilogue completes promptly,
+        // so is_finished() trips within a few milliseconds and the join is
+        // near-instant. The 1 s deadline survives only as a genuine-wedge
+        // backstop (e.g. a pass wedged in a Drop that never yields); if it
+        // fires we detach the already-cancelled thread to unblock start().
+        if let Some(h) = prior {
+            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+            while !h.is_finished() {
+                if std::time::Instant::now() >= deadline {
+                    tracing::warn!(
+                        "identity-sync prior thread did not finish within 1 s \
+                         after cancellation; detaching to unblock start()"
+                    );
+                    break; // Drop h — detaches; thread was already cancelled.
+                }
+                std::thread::sleep(std::time::Duration::from_millis(5));
+            }
+            if h.is_finished() {
+                let _ = h.join(); // Reap resources; near-instant since finished.
+            }
+        }
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -1025,6 +1041,60 @@ mod tests {
         pass.await.unwrap();
     }
 
+    /// Regression: a tight `stop()` → `start()` must reap the prior loop's
+    /// OS thread promptly, NOT stall on the 1 s detach backstop.
+    ///
+    /// The prior thread's exit epilogue locks `background_cancel` to
+    /// conditionally clear its slot. The earlier ordering held
+    /// `background_cancel` across the prior-handle join inside `start()`, so
+    /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
+    /// that lock, never finished, and the reap spin-waited the full second
+    /// before detaching — a 1 s stall plus a transient untracked thread. The
+    /// fix installs the new token + generation, releases `background_cancel`,
+    /// and only then reaps, so the prior thread's epilogue runs and the join
+    /// lands in milliseconds.
+    ///
+    /// `stop()` and `start()` run back-to-back in one blocking closure
+    /// (mirroring the real call site) so `start()` re-acquires the lock
+    /// microseconds after `stop()` frees it — before the async-woken prior
+    /// thread can reach its epilogue. Against the old lock-held ordering this
+    /// reliably stalls ~1 s and fails the bound below.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn restart_after_stop_reaps_prior_thread() {
+        let mgr = make_manager();
+
+        // Launch the first loop and let its immediate (no-op, nothing
+        // registered) pass complete so the thread parks in the interval
+        // sleep, where cancellation lands cleanly.
+        Arc::clone(&mgr).start();
+        assert!(mgr.is_running());
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        // Back-to-back cancel-only stop + restart, off the runtime so the
+        // synchronous reap can't starve a worker. `start()` re-grabs
+        // background_cancel right after `stop()` frees it.
+        let restart = Arc::clone(&mgr);
+        let elapsed = tokio::task::spawn_blocking(move || {
+            restart.stop();
+            let started = std::time::Instant::now();
+            Arc::clone(&restart).start();
+            started.elapsed()
+        })
+        .await
+        .unwrap();
+
+        assert!(
+            elapsed < Duration::from_millis(500),
+            "stop()→start() stalled for {elapsed:?}: prior thread was not \
+             reaped promptly (background_cancel held across the join?)"
+        );
+        assert!(mgr.is_running(), "restart must leave the new loop tracked");
+
+        // Wind the new loop down so the test leaves no live !Send thread.
+        mgr.quiesce().await;
+        assert!(!mgr.is_running());
+    }
+
     /// A `sync_now()` invoked while `quiescing` is set must bail without
     /// running the pass — in particular, without calling
     /// `persister.store(...)`. This is the gate that prevents a pass
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 7e72f2fe74..87b6595e53 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -220,36 +220,22 @@ impl PlatformAddressSyncManager {
             return;
         }
 
-        // Drain any handle left by a prior stop() call. stop() takes-and-cancels
-        // the token but never touches background_join, so a stop()→start()
-        // sequence would otherwise overwrite (detach) the old handle —
-        // shutdown() would then miss that thread and join() only the new one.
-        // The old thread was already cancellation-signalled, so is_finished()
-        // becomes true within a few milliseconds; we spin-wait to guarantee
-        // no detached thread can fire callbacks after destroy() returns.
-        {
-            let prior = self
-                .background_join
-                .lock()
-                .unwrap_or_else(|e| e.into_inner())
-                .take();
-            if let Some(h) = prior {
-                let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
-                while !h.is_finished() {
-                    if std::time::Instant::now() >= deadline {
-                        tracing::warn!(
-                            "platform-address-sync prior thread did not finish within 1 s \
-                             after cancellation; detaching to unblock start()"
-                        );
-                        break; // Drop h — detaches; thread was already cancelled.
-                    }
-                    std::thread::sleep(std::time::Duration::from_millis(5));
-                }
-                if h.is_finished() {
-                    let _ = h.join(); // Reap resources; near-instant since finished.
-                }
-            }
-        }
+        // Take any handle left by a prior stop() call so we can reap it — but
+        // DON'T join it here, while we still hold background_cancel. stop()
+        // takes-and-cancels the token but never touches background_join, so a
+        // stop()→start() sequence would otherwise overwrite (detach) the old
+        // handle and shutdown() would miss that thread. Joining it under
+        // background_cancel would DEADLOCK the reap into its 1 s backstop: the
+        // exiting prior thread's epilogue also locks background_cancel (to
+        // clear its slot), so it would block on the lock we hold → never
+        // finish → get detached on the exact stop()→start() path the reap
+        // exists for. We install the new token + bump the generation below,
+        // release the lock, and only THEN reap (at this fn's tail).
+        let prior = self
+            .background_join
+            .lock()
+            .unwrap_or_else(|e| e.into_inner())
+            .take();
 
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
@@ -313,7 +299,37 @@ impl PlatformAddressSyncManager {
             .background_join
             .lock()
             .unwrap_or_else(|e| e.into_inner()) = Some(join);
-        // cancel_guard drops here, releasing background_cancel.
+
+        // Release background_cancel BEFORE reaping the prior thread, so its
+        // epilogue can acquire the lock, observe the bumped generation, skip
+        // clearing our freshly-installed token, and return. Holding the lock
+        // across the join below is what would block the prior thread, spin
+        // the full 1 s deadline, and detach — the very stall this ordering
+        // removes.
+        drop(cancel_guard);
+
+        // Now reap the prior thread. It was already cancellation-signalled by
+        // stop(), and with the lock released its epilogue completes promptly,
+        // so is_finished() trips within a few milliseconds and the join is
+        // near-instant. The 1 s deadline survives only as a genuine-wedge
+        // backstop (e.g. a pass wedged in a Drop that never yields); if it
+        // fires we detach the already-cancelled thread to unblock start().
+        if let Some(h) = prior {
+            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+            while !h.is_finished() {
+                if std::time::Instant::now() >= deadline {
+                    tracing::warn!(
+                        "platform-address-sync prior thread did not finish within 1 s \
+                         after cancellation; detaching to unblock start()"
+                    );
+                    break; // Drop h — detaches; thread was already cancelled.
+                }
+                std::thread::sleep(std::time::Duration::from_millis(5));
+            }
+            if h.is_finished() {
+                let _ = h.join(); // Reap resources; near-instant since finished.
+            }
+        }
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -590,6 +606,60 @@ mod tests {
         pass.await.unwrap();
     }
 
+    /// Regression: a tight `stop()` → `start()` must reap the prior loop's
+    /// OS thread promptly, NOT stall on the 1 s detach backstop.
+    ///
+    /// The prior thread's exit epilogue locks `background_cancel` to
+    /// conditionally clear its slot. The earlier ordering held
+    /// `background_cancel` across the prior-handle join inside `start()`, so
+    /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
+    /// that lock, never finished, and the reap spin-waited the full second
+    /// before detaching — a 1 s stall plus a transient untracked thread. The
+    /// fix installs the new token + generation, releases `background_cancel`,
+    /// and only then reaps, so the prior thread's epilogue runs and the join
+    /// lands in milliseconds.
+    ///
+    /// `stop()` and `start()` run back-to-back in one blocking closure
+    /// (mirroring the real call site) so `start()` re-acquires the lock
+    /// microseconds after `stop()` frees it — before the async-woken prior
+    /// thread can reach its epilogue. Against the old lock-held ordering this
+    /// reliably stalls ~1 s and fails the bound below.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn restart_after_stop_reaps_prior_thread() {
+        let (mgr, _counter) = make_manager();
+
+        // Launch the first loop and let its immediate (no-op, empty wallet
+        // map) pass complete so the thread parks in the interval sleep, where
+        // cancellation lands cleanly.
+        Arc::clone(&mgr).start();
+        assert!(mgr.is_running());
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        // Back-to-back cancel-only stop + restart, off the runtime so the
+        // synchronous reap can't starve a worker. `start()` re-grabs
+        // background_cancel right after `stop()` frees it.
+        let restart = Arc::clone(&mgr);
+        let elapsed = tokio::task::spawn_blocking(move || {
+            restart.stop();
+            let started = std::time::Instant::now();
+            Arc::clone(&restart).start();
+            started.elapsed()
+        })
+        .await
+        .unwrap();
+
+        assert!(
+            elapsed < Duration::from_millis(500),
+            "stop()→start() stalled for {elapsed:?}: prior thread was not \
+             reaped promptly (background_cancel held across the join?)"
+        );
+        assert!(mgr.is_running(), "restart must leave the new loop tracked");
+
+        // Wind the new loop down so the test leaves no live !Send thread.
+        mgr.quiesce().await;
+        assert!(!mgr.is_running());
+    }
+
     /// A `sync_now()` invoked while `quiescing` is set must bail without
     /// running the pass — in particular, without firing the
     /// `on_platform_address_sync_completed` host callback. This is the
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 365b0be17b..d0aa75a843 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -238,36 +238,22 @@ impl ShieldedSyncManager {
             return;
         }
 
-        // Drain any handle left by a prior stop() call. stop() takes-and-cancels
-        // the token but never touches background_join, so a stop()→start()
-        // sequence would otherwise overwrite (detach) the old handle —
-        // shutdown() would then miss that thread and join() only the new one.
-        // The old thread was already cancellation-signalled, so is_finished()
-        // becomes true within a few milliseconds; we spin-wait to guarantee
-        // no detached thread can fire callbacks after destroy() returns.
-        {
-            let prior = self
-                .background_join
-                .lock()
-                .unwrap_or_else(|e| e.into_inner())
-                .take();
-            if let Some(h) = prior {
-                let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
-                while !h.is_finished() {
-                    if std::time::Instant::now() >= deadline {
-                        tracing::warn!(
-                            "shielded-sync prior thread did not finish within 1 s \
-                             after cancellation; detaching to unblock start()"
-                        );
-                        break; // Drop h — detaches; thread was already cancelled.
-                    }
-                    std::thread::sleep(std::time::Duration::from_millis(5));
-                }
-                if h.is_finished() {
-                    let _ = h.join(); // Reap resources; near-instant since finished.
-                }
-            }
-        }
+        // Take any handle left by a prior stop() call so we can reap it — but
+        // DON'T join it here, while we still hold background_cancel. stop()
+        // takes-and-cancels the token but never touches background_join, so a
+        // stop()→start() sequence would otherwise overwrite (detach) the old
+        // handle and shutdown() would miss that thread. Joining it under
+        // background_cancel would DEADLOCK the reap into its 1 s backstop: the
+        // exiting prior thread's epilogue also locks background_cancel (to
+        // clear its slot), so it would block on the lock we hold → never
+        // finish → get detached on the exact stop()→start() path the reap
+        // exists for. We install the new token + bump the generation below,
+        // release the lock, and only THEN reap (after this fn's tail).
+        let prior = self
+            .background_join
+            .lock()
+            .unwrap_or_else(|e| e.into_inner())
+            .take();
 
         let cancel = CancellationToken::new();
         *cancel_guard = Some(cancel.clone());
@@ -339,7 +325,37 @@ impl ShieldedSyncManager {
             .background_join
             .lock()
             .unwrap_or_else(|e| e.into_inner()) = Some(join);
-        // cancel_guard drops here, releasing background_cancel.
+
+        // Release background_cancel BEFORE reaping the prior thread, so its
+        // epilogue can observe the bumped generation (and skip clearing our
+        // freshly-installed token) without contending the lock we hold.
+        // Holding the lock across the join below is what would block the
+        // prior thread, spin the full 1 s deadline, and detach — the very
+        // stall this ordering removes.
+        drop(cancel_guard);
+
+        // Now reap the prior thread. It was already cancellation-signalled by
+        // stop(), and with the lock released its epilogue completes promptly,
+        // so is_finished() trips within a few milliseconds and the join is
+        // near-instant. The 1 s deadline survives only as a genuine-wedge
+        // backstop (e.g. a pass wedged in a Drop that never yields); if it
+        // fires we detach the already-cancelled thread to unblock start().
+        if let Some(h) = prior {
+            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
+            while !h.is_finished() {
+                if std::time::Instant::now() >= deadline {
+                    tracing::warn!(
+                        "shielded-sync prior thread did not finish within 1 s \
+                         after cancellation; detaching to unblock start()"
+                    );
+                    break; // Drop h — detaches; thread was already cancelled.
+                }
+                std::thread::sleep(std::time::Duration::from_millis(5));
+            }
+            if h.is_finished() {
+                let _ = h.join(); // Reap resources; near-instant since finished.
+            }
+        }
     }
 
     /// Stop the background sync loop. No-op if not running.

From 2b068ba57564e836f8b48f9fb6c643943b73336c Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:34:10 +0200
Subject: [PATCH 12/29] fix(platform-wallet): close shielded epilogue TOCTOU +
 pin restart reap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three shielded-sync hardening fixes, bringing it in line with its
identity-sync and platform-address-sync siblings.

- shielded_sync.rs exit epilogue read `background_generation` BEFORE
  acquiring `background_cancel` (load-then-lock). That stale-read TOCTOU let
  a prior thread observe a pre-bump generation, block on the lock until a
  concurrent start() released it, then null the freshly-installed token —
  leaving the new loop running but untracked via is_running()/stop(). Acquire
  the lock first and compare the generation under it, exactly like the
  siblings.

- Add the `restart_after_stop_reaps_prior_thread` regression test the
  siblings already carry. It pins the reap-after-drop(cancel_guard) reorder:
  a back-to-back stop()+start() must reap the prior OS thread in <500 ms, not
  stall ~1 s on the detach backstop. Confirmed non-vacuous — it fails at
  ~1.0 s with the reap moved back inside the lock.

- platform-wallet-ffi: the ErrorShutdownIncomplete doc only described
  destroy. It is now also returned by shielded_sync_stop and shielded_clear,
  where the manager is NOT torn down and the operation can be retried.
  Document all three callers and their differing retry semantics.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-platform-wallet-ffi/src/error.rs  | 32 ++++--
 .../src/manager/shielded_sync.rs              | 97 +++++++++++++++++--
 2 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/packages/rs-platform-wallet-ffi/src/error.rs b/packages/rs-platform-wallet-ffi/src/error.rs
index 5769ffcc43..e5b5184a82 100644
--- a/packages/rs-platform-wallet-ffi/src/error.rs
+++ b/packages/rs-platform-wallet-ffi/src/error.rs
@@ -125,13 +125,31 @@ pub enum PlatformWalletFFIResultCode {
     /// and could double-send if the original spend landed.
     ErrorShieldedSpendUnconfirmed = 18,
 
-    /// One or more background coordinator threads did not exit cleanly before
-    /// the 30 s join deadline. The host **must not** free the callback context
-    /// immediately — a lingering thread may still hold a reference to it and
-    /// fire one final callback. Either keep the context alive for a further
-    /// grace period, or accept the potential (but statistically tiny) race.
-    /// This is distinct from a normal operation error; the manager IS torn
-    /// down; the host should not retry `destroy`.
+    /// A background coordinator drain did not complete cleanly within the
+    /// join deadline — one or more `!Send` sync threads may still be alive
+    /// and still hold a reference to the host-owned callback context, so they
+    /// could fire one final callback through it. On this code the host **must
+    /// not** free the callback context immediately: either keep it alive for a
+    /// further grace period, or accept the (statistically tiny) race.
+    ///
+    /// Returned by three callers, which differ in whether the operation may
+    /// be **retried**:
+    /// - `platform_wallet_manager_destroy`: the manager **IS** torn down
+    ///   (removed from storage) regardless — do **not** retry `destroy`; the
+    ///   handle is already gone. Only the callback-context lifetime caveat
+    ///   above applies.
+    /// - `platform_wallet_manager_shielded_sync_stop`: the manager is **NOT**
+    ///   torn down — only the shielded loop's drain was non-clean. The host
+    ///   may retry the stop (or proceed to `destroy`); the handle stays valid.
+    /// - `platform_wallet_manager_shielded_clear`: the manager is **NOT** torn
+    ///   down and the store was left **intact** (Clear aborted before touching
+    ///   it). The host may retry the clear, and must **not** commit its own
+    ///   persistence wipe — doing so would desync the host's rows from the
+    ///   still-populated shared tree.
+    ///
+    /// Distinct from a normal operation error (the underlying operation may
+    /// well have made progress); the terminal coordinator status is rendered
+    /// into the result message.
     ErrorShutdownIncomplete = 19,
 
     NotFound = 98, // Used exclusively for all the Option that are retuned as errors
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index d0aa75a843..98e94035aa 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -305,13 +305,20 @@ impl ShieldedSyncManager {
                     }
 
                     // Only clear `background_cancel` if the active
-                    // generation is still ours. Without this guard a
-                    // tight `stop()` → `start()` reschedule has the
-                    // exiting thread overwrite the *new* generation's
-                    // token, leaving the new loop running but
-                    // unreflectable via `is_running()` / `stop()`.
-                    if this.background_generation.load(Ordering::Acquire) == my_gen {
-                        if let Ok(mut guard) = this.background_cancel.lock() {
+                    // generation is still ours. Acquire the lock FIRST,
+                    // then read/compare `background_generation` under it
+                    // (matching identity_sync / platform_address_sync).
+                    // Reading the generation BEFORE locking opens a
+                    // stale-read TOCTOU: this exiting thread could observe
+                    // a pre-bump generation, then block on the lock until a
+                    // concurrent `start()` released it, and null the
+                    // freshly-installed token — leaving the new loop
+                    // running but unreflectable via `is_running()` /
+                    // `stop()`. `start()` bumps the generation while it
+                    // holds this same lock, so comparing under the lock
+                    // guarantees we observe the post-swap value.
+                    if let Ok(mut guard) = this.background_cancel.lock() {
+                        if this.background_generation.load(Ordering::Acquire) == my_gen {
                             *guard = None;
                         }
                     }
@@ -576,3 +583,79 @@ impl std::fmt::Debug for ShieldedSyncManager {
             .finish()
     }
 }
+
+// The whole module is already `#[cfg(feature = "shielded")]`-gated at its
+// `mod` declaration (manager/mod.rs), so these tests compile only under that
+// feature — no extra per-test gate needed.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build a manager over an **empty** coordinator slot wired to a
+    /// handler-less event manager. An empty slot makes every `sync_now`
+    /// pass a no-op (empty-coordinator handling returns immediately), so
+    /// the background loop parks in its interval sleep — exactly where
+    /// cancellation lands cleanly — without needing a live SDK / network.
+    /// That is all the start/stop/restart thread-lifecycle tests below
+    /// exercise.
+    fn make_manager() -> Arc<ShieldedSyncManager> {
+        let coordinator_slot = Arc::new(RwLock::new(None));
+        let event_manager = Arc::new(PlatformEventManager::new(vec![]));
+        Arc::new(ShieldedSyncManager::new(event_manager, coordinator_slot))
+    }
+
+    /// Regression: a tight `stop()` → `start()` must reap the prior loop's
+    /// OS thread promptly, NOT stall on the 1 s detach backstop.
+    ///
+    /// The prior thread's exit epilogue locks `background_cancel` to
+    /// conditionally clear its slot. The earlier ordering held
+    /// `background_cancel` across the prior-handle join inside `start()`, so
+    /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
+    /// that lock, never finished, and the reap spin-waited the full second
+    /// before detaching — a 1 s stall plus a transient untracked thread. The
+    /// fix installs the new token + generation, releases `background_cancel`,
+    /// and only then reaps, so the prior thread's epilogue runs and the join
+    /// lands in milliseconds. Mirrors the identity-sync and
+    /// platform-address-sync siblings.
+    ///
+    /// `stop()` and `start()` run back-to-back in one blocking closure
+    /// (mirroring the real call site) so `start()` re-acquires the lock
+    /// microseconds after `stop()` frees it — before the async-woken prior
+    /// thread can reach its epilogue. Against the old lock-held ordering this
+    /// reliably stalls ~1 s and fails the bound below.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn restart_after_stop_reaps_prior_thread() {
+        let mgr = make_manager();
+
+        // Launch the first loop and let its immediate (no-op, empty
+        // coordinator) pass complete so the thread parks in the interval
+        // sleep, where cancellation lands cleanly.
+        Arc::clone(&mgr).start();
+        assert!(mgr.is_running());
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        // Back-to-back cancel-only stop + restart, off the runtime so the
+        // synchronous reap can't starve a worker. `start()` re-grabs
+        // background_cancel right after `stop()` frees it.
+        let restart = Arc::clone(&mgr);
+        let elapsed = tokio::task::spawn_blocking(move || {
+            restart.stop();
+            let started = std::time::Instant::now();
+            Arc::clone(&restart).start();
+            started.elapsed()
+        })
+        .await
+        .unwrap();
+
+        assert!(
+            elapsed < Duration::from_millis(500),
+            "stop()→start() stalled for {elapsed:?}: prior thread was not \
+             reaped promptly (background_cancel held across the join?)"
+        );
+        assert!(mgr.is_running(), "restart must leave the new loop tracked");
+
+        // Wind the new loop down so the test leaves no live !Send thread.
+        mgr.quiesce().await;
+        assert!(!mgr.is_running());
+    }
+}

From 5017ba13136e8a1f1d818ac6547290f716836e39 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:34:18 +0200
Subject: [PATCH 13/29] fix(swift-sdk): retain wallet callback context on
 incomplete shutdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PlatformWalletManager.deinit called .discard() on shielded_sync_stop and
destroy, tossing the result code. Rust now returns ErrorShutdownIncomplete
(19) on a non-clean drain, with the contract: a lingering coordinator thread
may still fire one final callback through the host-owned callback context.
But persistenceHandler/eventHandler are handed to Rust via
Unmanaged.passUnretained and kept alive only by this object's fields, so the
instant deinit returns ARC frees them — a use-after-free on that final
callback.

Capture the code via a new discardReturningCode() helper; on
ErrorShutdownIncomplete from shielded_sync_stop OR destroy, deliberately leak
one extra strong reference (an unbalanced passRetained, never released) to
each handler so it outlives any lingering thread. A clean shutdown — the
common case — takes neither branch and releases the handlers normally; we
never leak unconditionally.

UNVERIFIED locally: no Swift toolchain / xcframework on this host. Reasoned
correct-by-construction; needs an iOS-environment build to confirm.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../PlatformWalletManager.swift               | 43 +++++++++++++++++--
 .../PlatformWallet/PlatformWalletResult.swift | 12 ++++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
index 0e433d368e..36bafa37d1 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
@@ -152,10 +152,45 @@ public class PlatformWalletManager: ObservableObject {
 
     deinit {
         progressPollTask?.cancel()
-        if handle != NULL_HANDLE {
-            platform_wallet_manager_platform_address_sync_stop(handle).discard()
-            platform_wallet_manager_shielded_sync_stop(handle).discard()
-            platform_wallet_manager_destroy(handle).discard()
+        guard handle != NULL_HANDLE else { return }
+
+        // Tear down the Rust manager: cancel the address-sync loop, drain
+        // the shielded loop, then destroy. The first stop is cancel-only
+        // and never reports an incomplete drain, so we still `discard()` it.
+        platform_wallet_manager_platform_address_sync_stop(handle).discard()
+
+        // Capture the CODE (not just free the message) for the two calls
+        // that CAN report `.errorShutdownIncomplete`: `shielded_sync_stop`
+        // and `destroy`. Rust returns that code when a background
+        // coordinator did not drain within the join deadline — meaning a
+        // lingering `!Send` coordinator thread may still hold the
+        // `passUnretained` context pointers Rust was handed for our
+        // `persistenceHandler` / `eventHandler` and fire ONE final callback
+        // through them. The contract: on that code the host must NOT free
+        // the callback context immediately.
+        let shieldedStopCode =
+            platform_wallet_manager_shielded_sync_stop(handle).discardReturningCode()
+        let destroyCode =
+            platform_wallet_manager_destroy(handle).discardReturningCode()
+
+        // Both handlers are passed to Rust via `Unmanaged.passUnretained`
+        // (see `PlatformWalletPersistenceHandler`/`PlatformWalletEventHandler`
+        // `makeCallbacks()`), so Rust holds non-owning pointers and these
+        // objects are kept alive ONLY by the stored properties below. The
+        // instant this deinit returns, ARC releases them — which would be a
+        // use-after-free if a lingering coordinator then fires its final
+        // callback. So, ONLY on an incomplete shutdown, deliberately leak one
+        // extra strong reference to each (an unbalanced `passRetained` that is
+        // never released) so they outlive any lingering thread. A clean
+        // shutdown (the common case) takes neither branch and releases the
+        // handlers normally — we never leak unconditionally. The leak is
+        // bounded by how often a shutdown wedges (rare) and trades two small
+        // objects for guaranteed callback safety, since an incomplete drain
+        // gives no later signal that the lingering thread has finally exited.
+        if shieldedStopCode == .errorShutdownIncomplete
+            || destroyCode == .errorShutdownIncomplete {
+            if let persistenceHandler { _ = Unmanaged.passRetained(persistenceHandler) }
+            if let eventHandler { _ = Unmanaged.passRetained(eventHandler) }
         }
     }
 
diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
index 31ef07ad4a..c24f72fbf8 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletResult.swift
@@ -256,4 +256,16 @@ extension PlatformWalletFFIResult {
     func discard() {
         _ = PlatformWalletResult(self)
     }
+
+    /// Free the result's Rust-owned message and return its typed code.
+    ///
+    /// Like `discard()`, but hands back the code so the caller can branch
+    /// on it — used by `PlatformWalletManager.deinit`, which must detect
+    /// `.errorShutdownIncomplete` to decide whether to keep its callback
+    /// context alive. The message is still freed deterministically (the
+    /// temporary `PlatformWalletResult` frees it on drop).
+    @inline(__always)
+    func discardReturningCode() -> PlatformWalletResultCode {
+        PlatformWalletResult(self).code
+    }
 }

From b4917732a39eb3a86ec706c9ac115f2011c185f9 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:50:44 +0200
Subject: [PATCH 14/29] test(platform-wallet): bound cleanup quiesce in
 restart-reap regression tests

Wrap the cleanup `mgr.quiesce().await` in all three
`restart_after_stop_reaps_prior_thread` tests with a 2-second
`tokio::time::timeout`. An unbounded quiesce of the restarted loop
would hang CI forever if the loop wedges; now it fails fast with a clear
message. Also assert `status.is_clean()` on the returned
`CoordinatorThreadStatus`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-platform-wallet/src/manager/identity_sync.rs  | 8 +++++++-
 .../src/manager/platform_address_sync.rs                  | 8 +++++++-
 packages/rs-platform-wallet/src/manager/shielded_sync.rs  | 8 +++++++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 9cc14ac831..ab6fa6033e 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -1091,7 +1091,13 @@ mod tests {
         assert!(mgr.is_running(), "restart must leave the new loop tracked");
 
         // Wind the new loop down so the test leaves no live !Send thread.
-        mgr.quiesce().await;
+        let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
+            .await
+            .expect("cleanup quiesce did not complete within 2s after restart");
+        assert!(
+            status.is_clean(),
+            "cleanup quiesce ended non-cleanly: {status:?}"
+        );
         assert!(!mgr.is_running());
     }
 
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 87b6595e53..094ae1a25b 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -656,7 +656,13 @@ mod tests {
         assert!(mgr.is_running(), "restart must leave the new loop tracked");
 
         // Wind the new loop down so the test leaves no live !Send thread.
-        mgr.quiesce().await;
+        let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
+            .await
+            .expect("cleanup quiesce did not complete within 2s after restart");
+        assert!(
+            status.is_clean(),
+            "cleanup quiesce ended non-cleanly: {status:?}"
+        );
         assert!(!mgr.is_running());
     }
 
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 98e94035aa..ba7b752315 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -655,7 +655,13 @@ mod tests {
         assert!(mgr.is_running(), "restart must leave the new loop tracked");
 
         // Wind the new loop down so the test leaves no live !Send thread.
-        mgr.quiesce().await;
+        let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
+            .await
+            .expect("cleanup quiesce did not complete within 2s after restart");
+        assert!(
+            status.is_clean(),
+            "cleanup quiesce ended non-cleanly: {status:?}"
+        );
         assert!(!mgr.is_running());
     }
 }

From 76c8bee0060f2bd5b473c62ef633bcfc9bd69a81 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 12:16:40 +0200
Subject: [PATCH 15/29] fix(platform-wallet): track detached coordinator
 threads so shutdown() reports them as non-clean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the residual use-after-free window left by the coordinator
reap backstop. On a tight stop()->start(), each sync coordinator
waits up to ~1s for the prior OS thread to finish; if that thread is
genuinely wedged in a non-yielding Drop, the backstop previously
DROPPED the still-live JoinHandle (detaching it). A later shutdown()
joined only the current handle, all_clean() returned true, and the
FFI destroy returned ok() — at which point the host could free the
callback context the detached, still-running thread might still touch.

Fix (review option i): the manager now owns a shared CoordinatorOrphans
list (Arc<Mutex<Vec<JoinHandle>>>) cloned into every coordinator. The
duplicated reap blocks in identity/platform-address/shielded start()
are consolidated into reap_prior_or_park(), which PARKS a wedged prior
thread in that list instead of dropping it (lock-ordering preserved:
drop(cancel_guard) still happens before the reap). shutdown() then
drains the list via join_detached_orphans() within a bounded, yielding
is_finished() poll and reports a new CoordinatorThreadStatus::Detached
(non-clean) in CoordinatorExitStatus::detached_threads when any orphan
is still alive at the grace deadline. all_clean() folds it in, so the
FFI destroy correctly returns ErrorShutdownIncomplete and the host
delays freeing its context. The new Detached variant re-exports through
lib.rs with its sibling statuses.

Tests (manager/mod.rs): reap_prior_or_park parks a force-wedged thread;
join_detached_orphans reports Detached then Ok; and a manager shutdown()
with a parked still-live orphan reports non-clean. All proven
non-vacuous by neutering the park/join. Cleanup quiesce/join in tests is
bounded; a wedged stand-in thread is released and joined so none leak.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/manager/identity_sync.rs              |  49 ++-
 .../rs-platform-wallet/src/manager/mod.rs     | 405 +++++++++++++++++-
 .../src/manager/platform_address_sync.rs      |  41 +-
 .../src/manager/shielded_sync.rs              |  41 +-
 4 files changed, 475 insertions(+), 61 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index ab6fa6033e..40329bad74 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -167,6 +167,12 @@ where
     /// confirm the `!Send` loop fully exited before the host drops the
     /// runtime.
     background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
+    /// Manager-owned orphans list (shared `Arc`). On a tight
+    /// `stop()`→`start()` where the prior thread is wedged past the 1 s
+    /// reap backstop, [`start`](Self::start) parks the still-live handle
+    /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
+    /// instead of dropping it, so manager `shutdown()` accounts for it.
+    coordinator_orphans: super::CoordinatorOrphans,
     /// Monotonically increasing generation counter. Incremented each
     /// time `start()` installs a new cancel token so the exiting
     /// thread can tell whether its token is still current.
@@ -206,12 +212,17 @@ where
     /// writes). The registry starts empty — call
     /// [`register_identity`](Self::register_identity) before
     /// [`start`](Self::start).
-    pub fn new(sdk: Arc<dash_sdk::Sdk>, persister: Arc<P>) -> Self {
+    pub fn new(
+        sdk: Arc<dash_sdk::Sdk>,
+        persister: Arc<P>,
+        coordinator_orphans: super::CoordinatorOrphans,
+    ) -> Self {
         Self {
             sdk,
             persister,
             background_cancel: StdMutex::new(None),
             background_join: StdMutex::new(None),
+            coordinator_orphans,
             background_generation: AtomicU64::new(0),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
@@ -498,23 +509,15 @@ where
         // so is_finished() trips within a few milliseconds and the join is
         // near-instant. The 1 s deadline survives only as a genuine-wedge
         // backstop (e.g. a pass wedged in a Drop that never yields); if it
-        // fires we detach the already-cancelled thread to unblock start().
-        if let Some(h) = prior {
-            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
-            while !h.is_finished() {
-                if std::time::Instant::now() >= deadline {
-                    tracing::warn!(
-                        "identity-sync prior thread did not finish within 1 s \
-                         after cancellation; detaching to unblock start()"
-                    );
-                    break; // Drop h — detaches; thread was already cancelled.
-                }
-                std::thread::sleep(std::time::Duration::from_millis(5));
-            }
-            if h.is_finished() {
-                let _ = h.join(); // Reap resources; near-instant since finished.
-            }
-        }
+        // fires `reap_prior_or_park` parks the still-live, already-cancelled
+        // thread in the manager orphans list so `shutdown()` joins it and
+        // reports it non-clean rather than dropping it (residual UAF).
+        super::reap_prior_or_park(
+            prior,
+            &self.coordinator_orphans,
+            std::time::Duration::from_secs(1),
+            "identity-sync",
+        );
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -853,7 +856,8 @@ mod tests {
     fn make_manager() -> Arc<IdentitySyncManager<NoopPersister>> {
         let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
         let persister = Arc::new(NoopPersister);
-        Arc::new(IdentitySyncManager::new(sdk, persister))
+        let orphans = Arc::new(StdMutex::new(Vec::new()));
+        Arc::new(IdentitySyncManager::new(sdk, persister, orphans))
     }
 
     fn make_recording_manager() -> (
@@ -862,8 +866,13 @@ mod tests {
     ) {
         let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
         let persister = Arc::new(RecordingPersister::new());
+        let orphans = Arc::new(StdMutex::new(Vec::new()));
         (
-            Arc::new(IdentitySyncManager::new(sdk, Arc::clone(&persister))),
+            Arc::new(IdentitySyncManager::new(
+                sdk,
+                Arc::clone(&persister),
+                orphans,
+            )),
             persister,
         )
     }
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 2de6ad6d5a..0e02d430b7 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -28,6 +28,22 @@ use crate::wallet::core::BalanceUpdateHandler;
 use crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId};
 use crate::wallet::PlatformWallet;
 
+/// Shared list of coordinator OS threads that a tight `stop()`→`start()`
+/// reap could not join within its 1 s wedge-backstop.
+///
+/// A coordinator's `start()` reap normally joins the prior thread within
+/// a few milliseconds. If that thread is genuinely wedged in a
+/// non-yielding `Drop` (vanishingly rare — the loop exits via a
+/// cancellable `select!`), [`reap_prior_or_park`] parks its still-live
+/// `JoinHandle` here instead of dropping it. The manager owns this list
+/// and shares a clone (`Arc`) with every coordinator, so
+/// [`PlatformWalletManager::shutdown`] can poll everything parked here
+/// (via [`join_detached_orphans`]) and report
+/// [`CoordinatorThreadStatus::Detached`] if any thread is still alive —
+/// telling the host NOT to free a callback context the thread may still
+/// touch (closing a residual use-after-free window).
+pub(crate) type CoordinatorOrphans = Arc<std::sync::Mutex<Vec<std::thread::JoinHandle<()>>>>;
+
 /// Multi-wallet coordinator with SPV sync and event handling.
 ///
 /// Events are dispatched through [`PlatformEventManager`] to all registered
@@ -87,6 +103,11 @@ pub struct PlatformWalletManager<P: PlatformWalletPersistence + 'static> {
     /// is torn down.
     pub(super) event_adapter_cancel: CancellationToken,
     pub(super) event_adapter_join: tokio::sync::Mutex<Option<JoinHandle<()>>>,
+    /// Coordinator OS threads detached by a tight `stop()`→`start()`
+    /// reap (see [`CoordinatorOrphans`]). Shared (cloned `Arc`) with
+    /// every coordinator so their `start()` reaps can park a wedged
+    /// prior thread here, and drained/joined by [`shutdown`](Self::shutdown).
+    pub(super) coordinator_orphans: CoordinatorOrphans,
 }
 
 /// How one background coordinator thread terminated.
@@ -117,12 +138,24 @@ pub enum CoordinatorThreadStatus {
     /// the runtime was torn down before the join could run (unreachable
     /// in normal operation).
     Error(String),
+    /// At least one coordinator OS thread that an earlier tight
+    /// `stop()`→`start()` reap had to detach past its 1 s wedge-backstop
+    /// was still alive at the shutdown deadline.
+    ///
+    /// Such a thread was parked in the manager's [`CoordinatorOrphans`]
+    /// list (not silently dropped) precisely so this case is visible.
+    /// A still-live detached thread keeps an `Arc` to the host event
+    /// handler and may fire one final callback, so the host must NOT
+    /// free the callback context yet — this status keeps
+    /// [`is_clean`](Self::is_clean) `false` so the FFI `destroy` returns
+    /// `ErrorShutdownIncomplete` instead of `ok()`.
+    Detached,
 }
 
 impl CoordinatorThreadStatus {
     /// `true` only for a fully clean outcome: joined normally (`Ok`) or
-    /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`, and
-    /// `Error` are all considered non-clean.
+    /// never ran (`NotRunning`). `Stopped`, `Panicked`, `Timeout`,
+    /// `Error`, and `Detached` are all considered non-clean.
     pub fn is_clean(&self) -> bool {
         matches!(self, Self::Ok | Self::NotRunning)
     }
@@ -148,19 +181,35 @@ pub struct CoordinatorExitStatus {
     pub shielded_sync: Option<CoordinatorThreadStatus>,
     /// Wallet-event adapter (a `tokio` task, not an OS thread).
     pub event_adapter: CoordinatorThreadStatus,
+    /// Aggregate status of any coordinator OS threads that an earlier
+    /// tight `stop()`→`start()` reap had to detach past its 1 s
+    /// wedge-backstop and park in the manager's [`CoordinatorOrphans`]
+    /// list.
+    ///
+    /// [`Ok`](CoordinatorThreadStatus::Ok) when none were detached (or
+    /// every detached thread has since joined cleanly);
+    /// [`Detached`](CoordinatorThreadStatus::Detached) when at least one
+    /// is still alive at the shutdown deadline. This is what keeps
+    /// [`all_clean`](Self::all_clean) honest for the wedge case the rest
+    /// of the teardown can't see — without it a detached-but-still-live
+    /// thread would let the host free a callback context the thread may
+    /// still touch (a residual use-after-free).
+    pub detached_threads: CoordinatorThreadStatus,
 }
 
 impl CoordinatorExitStatus {
-    /// `true` only when every worker is
+    /// `true` only when every worker — including any parked
+    /// [`detached_threads`](Self::detached_threads) — is
     /// [`Ok`](CoordinatorThreadStatus::Ok) or
     /// [`NotRunning`](CoordinatorThreadStatus::NotRunning); any
-    /// `Stopped`, `Panicked`, `Timeout`, or `Error` slot makes it
-    /// `false`.
+    /// `Stopped`, `Panicked`, `Timeout`, `Error`, or `Detached` slot
+    /// makes it `false`.
     pub fn all_clean(&self) -> bool {
         self.platform_address_sync.is_clean()
             && self.identity_sync.is_clean()
             && self.shielded_sync.as_ref().is_none_or(|s| s.is_clean())
             && self.event_adapter.is_clean()
+            && self.detached_threads.is_clean()
     }
 }
 
@@ -223,6 +272,138 @@ fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
     }
 }
 
+/// Join a coordinator's previous OS thread after a `stop()`→`start()`
+/// reschedule, or hand it over to [`PlatformWalletManager::shutdown`]
+/// when it is genuinely wedged.
+///
+/// All three coordinators (identity / platform-address / shielded)
+/// call this at the tail of `start()`, *after* releasing the
+/// `background_cancel` lock: the exiting thread's epilogue takes that
+/// same lock, so releasing it first lets the thread finish and the
+/// join land within milliseconds.
+///
+/// The preceding `stop()` already signalled cancellation to `prior`,
+/// so its `select!` loop normally exits almost at once. `backstop`
+/// only elapses when the thread is wedged in a non-yielding `Drop`
+/// that never observes the cancellation (vanishingly rare). Dropping
+/// the handle then would be unsafe: the live thread still holds an
+/// `Arc` to the host event handler and may fire a callback, so a later
+/// `destroy` that freed the host context would be a use-after-free.
+/// The handle is pushed onto `orphans` instead, where `shutdown()`
+/// polls it within its own grace period and reports
+/// [`CoordinatorThreadStatus::Detached`] if it is still alive —
+/// keeping [`CoordinatorExitStatus::all_clean`] honest.
+pub(crate) fn reap_prior_or_park(
+    prior: Option<std::thread::JoinHandle<()>>,
+    orphans: &CoordinatorOrphans,
+    backstop: std::time::Duration,
+    coordinator: &str,
+) {
+    let handle = match prior {
+        Some(handle) => handle,
+        None => return,
+    };
+    let poll_step = std::time::Duration::from_millis(5);
+    let give_up_at = std::time::Instant::now() + backstop;
+    // Poll until the thread reports finished or the backstop elapses.
+    while !handle.is_finished() {
+        if std::time::Instant::now() < give_up_at {
+            std::thread::sleep(poll_step);
+            continue;
+        }
+        tracing::warn!(
+            coordinator,
+            ?backstop,
+            "prior sync thread did not finish within the backstop after \
+             cancellation; parking it in the manager orphans list for \
+             shutdown() to join rather than detaching it"
+        );
+        // Still alive but already cancelled: keep the handle reachable
+        // so shutdown() can join it and report it non-clean, instead of
+        // dropping it and letting the host free a callback context the
+        // thread may still touch.
+        let mut parked = orphans.lock().unwrap_or_else(|e| e.into_inner());
+        parked.push(handle);
+        return;
+    }
+
+    // Finished: the join is near-instant and reaps the thread's resources.
+    let _ = handle.join();
+}
+
+/// Reap the manager's [`CoordinatorOrphans`] list and classify how the
+/// parked threads ended, polling until `deadline`.
+///
+/// Threads land in the list only when a tight `stop()`→`start()` reap had
+/// to give up on a prior coordinator thread past its 1 s wedge-backstop
+/// (see [`reap_prior_or_park`]). They were parked rather than dropped so
+/// this final teardown can account for them: a still-live thread keeps
+/// an `Arc` to the host event handler and could fire one last callback,
+/// so the host must not free its context until every such thread has
+/// exited.
+///
+/// Polls [`JoinHandle::is_finished`](std::thread::JoinHandle::is_finished)
+/// in 5 ms steps, yielding at each `.await` so a wrapping
+/// `tokio::time::timeout` can still interrupt it (no uncancellable
+/// blocking join — `join()` is only ever called on an already-finished
+/// handle).
+///
+/// Cancel-safe: each pass reaps *in place* under the list lock and a
+/// live handle never leaves the shared list across an `.await`, so
+/// dropping this future mid-poll (e.g. by that wrapping timeout) cannot
+/// lose track of a still-running thread. Threads parked while this is
+/// polling are picked up by the next pass as well.
+///
+/// Returns:
+/// - [`Ok`](CoordinatorThreadStatus::Ok) — the list was empty, or every
+///   parked thread joined cleanly;
+/// - [`Panicked`](CoordinatorThreadStatus::Panicked) — a parked thread
+///   had panicked (and none were left alive at the deadline);
+/// - [`Detached`](CoordinatorThreadStatus::Detached) — at least one
+///   parked thread was still alive at `deadline`; it stays parked so a
+///   later (idempotent) `shutdown()` can retry.
+pub(crate) async fn join_detached_orphans(
+    orphans: &CoordinatorOrphans,
+    deadline: std::time::Instant,
+) -> CoordinatorThreadStatus {
+    // First panic message seen across all passes, if any.
+    let mut panicked: Option<String> = None;
+    loop {
+        // One reap pass, entirely under the lock and without awaiting:
+        // join whatever has finished and put every live handle straight
+        // back. The guard is dropped at the end of this block, i.e.
+        // before the `.await` below.
+        let any_alive = {
+            let mut guard = orphans.lock().unwrap_or_else(|e| e.into_inner());
+            let parked = std::mem::take(&mut *guard);
+            for handle in parked {
+                if !handle.is_finished() {
+                    guard.push(handle);
+                } else if let Err(payload) = handle.join() {
+                    // Keep the first panic message; a live `Detached`
+                    // thread still takes precedence at the deadline below.
+                    panicked.get_or_insert_with(|| panic_message(payload));
+                }
+            }
+            !guard.is_empty()
+        };
+
+        // Nothing left alive: report the worst outcome seen so far.
+        if !any_alive {
+            return match panicked {
+                Some(msg) => CoordinatorThreadStatus::Panicked(msg),
+                None => CoordinatorThreadStatus::Ok,
+            };
+        }
+        if std::time::Instant::now() >= deadline {
+            // Survivors never left `orphans`, so an idempotent
+            // re-`shutdown()` retries them.
+            return CoordinatorThreadStatus::Detached;
+        }
+        tokio::time::sleep(std::time::Duration::from_millis(5)).await;
+    }
+}
+
 /// Maximum time (seconds) the teardown paths — `shutdown()`,
 /// `clear_shielded`, and the FFI shielded-stop bridge — wait for one
 /// coordinator's quiesce+join to complete.
@@ -236,6 +417,23 @@ fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
 /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
 pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
 
+/// Grace period (seconds) [`PlatformWalletManager::shutdown`] spends
+/// polling any parked [`CoordinatorOrphans`] before declaring a survivor
+/// [`Detached`](CoordinatorThreadStatus::Detached). `shutdown()` adds it
+/// to "now" to form the deadline passed to [`join_detached_orphans`].
+///
+/// Unlike a live coordinator — whose `quiesce()` may legitimately spend
+/// seconds draining an in-flight pass, hence the 30 s
+/// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] — an orphan is a thread an earlier reap
+/// already gave up on *because it was wedged past its 1 s backstop*.
+/// A healthy orphan finishes within milliseconds of the cancellation it
+/// long ago received, so `is_finished()` is usually true on the first
+/// poll; one still alive after this grace is wedged in a non-yielding
+/// `Drop` and will not finish however long we wait. A short grace thus
+/// separates "finishing" from "wedged" without stretching teardown, and
+/// `Detached` is the UAF-safe outcome (the host delays freeing its context).
+pub(crate) const SHUTDOWN_ORPHAN_GRACE_SECS: u64 = 1;
+
 impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// Create a new PlatformWalletManager.
     ///
@@ -275,6 +473,13 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
             balance_handler,
         ]));
 
+        // Shared orphans list: a coordinator's `start()` reap parks here
+        // any prior thread it had to detach past its 1 s wedge-backstop,
+        // and `shutdown()` joins them. Every coordinator gets a clone of
+        // this same `Arc` so they all park into the one list the manager
+        // drains.
+        let coordinator_orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
+
         let spv = Arc::new(SpvRuntime::new(
             Arc::clone(&wallet_manager),
             Arc::clone(&event_manager),
@@ -282,10 +487,12 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         let platform_address_sync = Arc::new(PlatformAddressSyncManager::new(
             Arc::clone(&wallets),
             Arc::clone(&event_manager),
+            Arc::clone(&coordinator_orphans),
         ));
         let identity_sync = Arc::new(IdentitySyncManager::new(
             Arc::clone(&sdk),
             Arc::clone(&persister),
+            Arc::clone(&coordinator_orphans),
         ));
         #[cfg(feature = "shielded")]
         let shielded_coordinator: Arc<
@@ -295,6 +502,7 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         let shielded_sync = Arc::new(ShieldedSyncManager::new(
             Arc::clone(&event_manager),
             Arc::clone(&shielded_coordinator),
+            Arc::clone(&coordinator_orphans),
         ));
         Self {
             sdk,
@@ -313,6 +521,7 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
             persister,
             event_adapter_cancel,
             event_adapter_join: tokio::sync::Mutex::new(Some(event_adapter_join)),
+            coordinator_orphans,
         }
     }
 
@@ -575,11 +784,26 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
             },
         };
 
+        // Finally, account for any coordinator threads an earlier tight
+        // stop()→start() reap had to detach past its 1 s wedge-backstop.
+        // They were parked in `coordinator_orphans` (not dropped) so we
+        // can join them here; a survivor at the grace deadline reports
+        // `Detached`, which keeps `all_clean()` false so the FFI `destroy`
+        // returns `ErrorShutdownIncomplete` rather than letting the host
+        // free a callback context the live thread may still touch. The
+        // grace poll yields, so it never blocks teardown uncancellably.
+        let detached_threads = join_detached_orphans(
+            &self.coordinator_orphans,
+            std::time::Instant::now() + std::time::Duration::from_secs(SHUTDOWN_ORPHAN_GRACE_SECS),
+        )
+        .await;
+
         CoordinatorExitStatus {
             platform_address_sync,
             identity_sync,
             shielded_sync,
             event_adapter,
+            detached_threads,
         }
     }
 }
@@ -800,6 +1024,9 @@ mod tests {
         assert!(!CoordinatorThreadStatus::Panicked("boom".into()).is_clean());
         assert!(!CoordinatorThreadStatus::Timeout.is_clean());
         assert!(!CoordinatorThreadStatus::Error("infra".into()).is_clean());
+        // A detached-but-still-live coordinator thread is non-clean: the
+        // host must not free its callback context yet.
+        assert!(!CoordinatorThreadStatus::Detached.is_clean());
     }
 
     /// `all_clean()` on `CoordinatorExitStatus` is false whenever any
@@ -811,6 +1038,7 @@ mod tests {
             identity_sync: CoordinatorThreadStatus::NotRunning,
             shielded_sync: None,
             event_adapter: CoordinatorThreadStatus::Ok,
+            detached_threads: CoordinatorThreadStatus::Ok,
         };
         assert!(clean.all_clean());
 
@@ -819,6 +1047,7 @@ mod tests {
             identity_sync: CoordinatorThreadStatus::Ok,
             shielded_sync: None,
             event_adapter: CoordinatorThreadStatus::Ok,
+            detached_threads: CoordinatorThreadStatus::Ok,
         };
         assert!(!with_timeout.all_clean());
 
@@ -827,8 +1056,20 @@ mod tests {
             identity_sync: CoordinatorThreadStatus::Ok,
             shielded_sync: Some(CoordinatorThreadStatus::Stopped(Some("aborted".into()))),
             event_adapter: CoordinatorThreadStatus::Ok,
+            detached_threads: CoordinatorThreadStatus::Ok,
         };
         assert!(!with_stopped.all_clean());
+
+        // A still-live detached orphan alone makes the aggregate
+        // non-clean — the slot the rest of the teardown can't see.
+        let with_detached = CoordinatorExitStatus {
+            platform_address_sync: CoordinatorThreadStatus::Ok,
+            identity_sync: CoordinatorThreadStatus::Ok,
+            shielded_sync: None,
+            event_adapter: CoordinatorThreadStatus::Ok,
+            detached_threads: CoordinatorThreadStatus::Detached,
+        };
+        assert!(!with_detached.all_clean());
     }
 
     /// A cleanly-returning thread joins as `Ok`; an absent handle is
@@ -993,4 +1234,158 @@ mod tests {
             SHUTDOWN_PANICS.load(AO::SeqCst)
         );
     }
+
+    /// Test double for a coordinator thread wedged in a non-yielding
+    /// `Drop`: the spawned thread blocks until the returned sender is
+    /// used (or dropped) and ignores everything else.
+    fn spawn_wedged_thread() -> (std::sync::mpsc::Sender<()>, std::thread::JoinHandle<()>) {
+        let (unblock, gate) = std::sync::mpsc::channel::<()>();
+        // `recv()` returns on a message or once the sender is gone; in
+        // both cases the thread simply exits, so the result is ignored.
+        let wedged = std::thread::spawn(move || {
+            let _ = gate.recv();
+        });
+        (unblock, wedged)
+    }
+
+    /// When the prior coordinator thread outlives the reap backstop,
+    /// `reap_prior_or_park` must leave its handle **in the orphans
+    /// list** rather than drop it — a dropped handle is invisible to
+    /// `shutdown()`, which could then let the host free a callback
+    /// context the live thread still touches.
+    ///
+    /// Non-vacuous: dropping the wedged handle (the old behavior) leaves
+    /// the list empty, which fails the length assertion below.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn reap_prior_or_park_parks_wedged_thread() {
+        let (release_tx, wedged) = spawn_wedged_thread();
+        let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
+        let reap_list = Arc::clone(&orphans);
+
+        // The reap is synchronous and std-sleeps until its backstop, so
+        // keep it off the runtime workers. A 100 ms backstop (real
+        // `start()` uses 1 s) keeps the test fast.
+        let reap = tokio::task::spawn_blocking(move || {
+            reap_prior_or_park(
+                Some(wedged),
+                &reap_list,
+                Duration::from_millis(100),
+                "test-coordinator",
+            );
+        });
+        reap.await.unwrap();
+
+        // The still-live handle must now sit in the shared list.
+        let parked_count = orphans.lock().unwrap().len();
+        assert_eq!(
+            parked_count, 1,
+            "a prior thread wedged past the backstop must be parked, not dropped"
+        );
+
+        // Cleanup: unblock the parked thread, then join it off the
+        // runtime workers so nothing outlives the test.
+        release_tx.send(()).unwrap();
+        let parked = orphans.lock().unwrap().pop().unwrap();
+        let reaper = tokio::task::spawn_blocking(move || {
+            let _ = parked.join();
+        });
+        reaper.await.unwrap();
+    }
+
+    /// Walks `join_detached_orphans` through its outcomes: an empty
+    /// list is `Ok`; a thread still alive at the deadline is `Detached`
+    /// and remains parked for a later retry; once that thread has
+    /// exited, another call reports `Ok` and leaves the list empty.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn join_detached_orphans_reports_detached_then_ok() {
+        let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
+        let now = std::time::Instant::now;
+
+        // Empty list: clean, even with an already-expired deadline.
+        let when_empty = join_detached_orphans(&orphans, now()).await;
+        assert_eq!(when_empty, CoordinatorThreadStatus::Ok);
+
+        // One live thread parked; the 50 ms deadline passes with it alive.
+        let (release_tx, wedged) = spawn_wedged_thread();
+        orphans.lock().unwrap().push(wedged);
+        let short_deadline = now() + Duration::from_millis(50);
+        let while_wedged = join_detached_orphans(&orphans, short_deadline).await;
+        assert_eq!(
+            while_wedged,
+            CoordinatorThreadStatus::Detached,
+            "a survivor at the deadline must report Detached"
+        );
+
+        // ...and its handle must still be tracked afterwards.
+        let still_parked = orphans.lock().unwrap().len();
+        assert_eq!(
+            still_parked, 1,
+            "a survivor must be re-parked so an idempotent re-shutdown retries"
+        );
+
+        // Unblock it; the next call must reap it and empty the list. The
+        // outer timeout only guards the test itself against a hang.
+        release_tx.send(()).unwrap();
+
+        let bound = Duration::from_secs(5);
+        let after_release = tokio::time::timeout(
+            bound,
+            join_detached_orphans(&orphans, now() + bound),
+        )
+        .await
+        .expect("orphan join must complete once the thread is released");
+        assert_eq!(after_release, CoordinatorThreadStatus::Ok);
+
+        let drained = orphans.lock().unwrap().is_empty();
+        assert!(drained, "a joined orphan must be drained from the list");
+    }
+
+    /// Headline regression: once a coordinator thread has outlived the
+    /// reap backstop and been parked in the orphans list, the next
+    /// `shutdown()` must come back **non-clean** — that is what makes
+    /// the FFI `destroy` return `ErrorShutdownIncomplete`, so the host
+    /// delays freeing the callback context the still-live thread may
+    /// touch.
+    ///
+    /// Non-vacuous: were the list ignored by `join_detached_orphans` (or
+    /// the orphan dropped at reap time instead of parked),
+    /// `detached_threads` would be `Ok` and `all_clean()` `true`,
+    /// failing both assertions.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn shutdown_reports_detached_orphan_as_non_clean() {
+        let manager = make_manager();
+        let (release_tx, wedged) = spawn_wedged_thread();
+
+        // Reproduce the genuine-wedge outcome directly: park a
+        // still-live thread exactly where `reap_prior_or_park` would
+        // have put it after giving up at its 1 s backstop.
+        manager.coordinator_orphans.lock().unwrap().push(wedged);
+
+        let bounded_shutdown = tokio::time::timeout(Duration::from_secs(10), manager.shutdown());
+        let status = bounded_shutdown.await.expect("shutdown must complete within bound");
+
+        assert_eq!(
+            status.detached_threads,
+            CoordinatorThreadStatus::Detached,
+            "a still-live detached orphan must surface as Detached"
+        );
+        let clean = status.all_clean();
+        assert!(
+            !clean,
+            "all_clean() must be false while a detached coordinator thread is \
+             still alive: {status:?}"
+        );
+
+        // Cleanup: shutdown() left the survivor parked; unblock and join
+        // it so no live thread leaks past the test. Popping into a local
+        // first keeps the std MutexGuard from being held across the await.
+        release_tx.send(()).unwrap();
+        let parked = manager.coordinator_orphans.lock().unwrap().pop();
+        if let Some(handle) = parked {
+            let reaper = tokio::task::spawn_blocking(move || {
+                let _ = handle.join();
+            });
+            reaper.await.unwrap();
+        }
+    }
 }
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 094ae1a25b..40457c4a87 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -104,6 +104,12 @@ pub struct PlatformAddressSyncManager {
     /// confirm the `!Send` loop fully exited before the host drops the
     /// runtime.
     background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
+    /// Manager-owned orphans list (shared `Arc`). On a tight
+    /// `stop()`→`start()` where the prior thread is wedged past the 1 s
+    /// reap backstop, [`start`](Self::start) parks the still-live handle
+    /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
+    /// instead of dropping it, so manager `shutdown()` accounts for it.
+    coordinator_orphans: super::CoordinatorOrphans,
     /// Monotonically increasing generation counter. Bumped on every
     /// `start()` so the exiting thread can tell whether its generation is
     /// still the active one before clearing `background_cancel`. Without
@@ -135,12 +141,14 @@ impl PlatformAddressSyncManager {
     pub fn new(
         wallets: Arc<RwLock<BTreeMap<WalletId, Arc<PlatformWallet>>>>,
         event_manager: Arc<PlatformEventManager>,
+        coordinator_orphans: super::CoordinatorOrphans,
     ) -> Self {
         Self {
             wallets,
             event_manager,
             background_cancel: StdMutex::new(None),
             background_join: StdMutex::new(None),
+            coordinator_orphans,
             background_generation: AtomicU64::new(0),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
@@ -313,23 +321,15 @@ impl PlatformAddressSyncManager {
         // so is_finished() trips within a few milliseconds and the join is
         // near-instant. The 1 s deadline survives only as a genuine-wedge
         // backstop (e.g. a pass wedged in a Drop that never yields); if it
-        // fires we detach the already-cancelled thread to unblock start().
-        if let Some(h) = prior {
-            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
-            while !h.is_finished() {
-                if std::time::Instant::now() >= deadline {
-                    tracing::warn!(
-                        "platform-address-sync prior thread did not finish within 1 s \
-                         after cancellation; detaching to unblock start()"
-                    );
-                    break; // Drop h — detaches; thread was already cancelled.
-                }
-                std::thread::sleep(std::time::Duration::from_millis(5));
-            }
-            if h.is_finished() {
-                let _ = h.join(); // Reap resources; near-instant since finished.
-            }
-        }
+        // fires `reap_prior_or_park` parks the still-live, already-cancelled
+        // thread in the manager orphans list so `shutdown()` joins it and
+        // reports it non-clean rather than dropping it (residual UAF).
+        super::reap_prior_or_park(
+            prior,
+            &self.coordinator_orphans,
+            std::time::Duration::from_secs(1),
+            "platform-address-sync",
+        );
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -543,8 +543,13 @@ mod tests {
         let event_manager = Arc::new(PlatformEventManager::new(vec![
             Arc::clone(&counter) as Arc<dyn PlatformEventHandler>
         ]));
+        let orphans = Arc::new(StdMutex::new(Vec::new()));
         (
-            Arc::new(PlatformAddressSyncManager::new(wallets, event_manager)),
+            Arc::new(PlatformAddressSyncManager::new(
+                wallets,
+                event_manager,
+                orphans,
+            )),
             counter,
         )
     }
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index ba7b752315..3c84bd7071 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -148,6 +148,12 @@ pub struct ShieldedSyncManager {
     /// confirm the `!Send` loop fully exited before the host drops the
     /// runtime.
     background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
+    /// Manager-owned orphans list (shared `Arc`). On a tight
+    /// `stop()`→`start()` where the prior thread is wedged past the 1 s
+    /// reap backstop, [`start`](Self::start) parks the still-live handle
+    /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
+    /// instead of dropping it, so manager `shutdown()` accounts for it.
+    coordinator_orphans: super::CoordinatorOrphans,
     /// Monotonically increasing generation counter. Bumped on every
     /// `start()` so the exiting thread can tell whether its
     /// generation is still the active one before clearing
@@ -173,12 +179,14 @@ impl ShieldedSyncManager {
     pub fn new(
         event_manager: Arc<PlatformEventManager>,
         coordinator_slot: Arc<RwLock<Option<Arc<NetworkShieldedCoordinator>>>>,
+        coordinator_orphans: super::CoordinatorOrphans,
     ) -> Self {
         Self {
             event_manager,
             coordinator_slot,
             background_cancel: StdMutex::new(None),
             background_join: StdMutex::new(None),
+            coordinator_orphans,
             background_generation: AtomicU64::new(0),
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
@@ -346,23 +354,15 @@ impl ShieldedSyncManager {
         // so is_finished() trips within a few milliseconds and the join is
         // near-instant. The 1 s deadline survives only as a genuine-wedge
         // backstop (e.g. a pass wedged in a Drop that never yields); if it
-        // fires we detach the already-cancelled thread to unblock start().
-        if let Some(h) = prior {
-            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(1);
-            while !h.is_finished() {
-                if std::time::Instant::now() >= deadline {
-                    tracing::warn!(
-                        "shielded-sync prior thread did not finish within 1 s \
-                         after cancellation; detaching to unblock start()"
-                    );
-                    break; // Drop h — detaches; thread was already cancelled.
-                }
-                std::thread::sleep(std::time::Duration::from_millis(5));
-            }
-            if h.is_finished() {
-                let _ = h.join(); // Reap resources; near-instant since finished.
-            }
-        }
+        // fires `reap_prior_or_park` parks the still-live, already-cancelled
+        // thread in the manager orphans list so `shutdown()` joins it and
+        // reports it non-clean rather than dropping it (residual UAF).
+        super::reap_prior_or_park(
+            prior,
+            &self.coordinator_orphans,
+            std::time::Duration::from_secs(1),
+            "shielded-sync",
+        );
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -601,7 +601,12 @@ mod tests {
     fn make_manager() -> Arc<ShieldedSyncManager> {
         let coordinator_slot = Arc::new(RwLock::new(None));
         let event_manager = Arc::new(PlatformEventManager::new(vec![]));
-        Arc::new(ShieldedSyncManager::new(event_manager, coordinator_slot))
+        let orphans = Arc::new(StdMutex::new(Vec::new()));
+        Arc::new(ShieldedSyncManager::new(
+            event_manager,
+            coordinator_slot,
+            orphans,
+        ))
     }
 
     /// Regression: a tight `stop()` → `start()` must reap the prior loop's

From 3cca1cf833e1a2aaf7dadd9df722323634678cae Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:36:19 +0200
Subject: [PATCH 16/29] perf(platform-wallet): drain coordinators concurrently
 in shutdown() via tokio::join!

The three periodic coordinators (platform-address, identity, shielded)
were quiesced sequentially in shutdown(), making the worst-case wait
additive (~3 x SHUTDOWN_JOIN_TIMEOUT_SECS). Each quiesce() touches only
its own state (its quiescing/is_syncing atomics and its own
background_cancel/background_join mutexes) and joins its own OS thread,
sharing no lock, so racing them is sound. Drain them concurrently via
tokio::join!, collapsing the worst case to ~max(timeouts).

Each join! arm keeps its OWN inner tokio::time::timeout, so every
coordinator still yields its own per-coordinator CoordinatorThreadStatus
(a single outer timeout would flatten all three to Timeout). The event
adapter teardown and join_detached_orphans stay sequential and ordered
strictly AFTER the coordinator join!, since the adapter is the sink the
coordinators store into. The multi-thread runtime assert is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../rs-platform-wallet/src/manager/mod.rs     | 57 +++++++++++++------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 0e02d430b7..7e9690d066 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -691,7 +691,13 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// context — a use-after-free. So we `quiesce()` the sync managers
     /// FIRST (so no further persister store or host callback can start),
     /// and only THEN cancel + join the event adapter, which is the sink
-    /// those stores feed into.
+    /// those stores feed into. The three coordinators are independent —
+    /// each `quiesce()` touches only its own state (its `quiescing` /
+    /// `is_syncing` atomics and its own `background_cancel` /
+    /// `background_join` mutexes) and joins its own OS thread, sharing no
+    /// lock — so they are drained *concurrently* via `tokio::join!`; only
+    /// the event-adapter teardown stays ordered strictly after them,
+    /// because it is the sink the coordinators store into.
     ///
     /// After each coordinator's `quiesce()` drains its in-flight pass,
     /// this also **joins** the loop's OS thread, so when `shutdown()`
@@ -708,8 +714,10 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// service one `block_on` at a time, so the join would deadlock. This
     /// is asserted in both debug and release builds.
     ///
-    /// Each coordinator quiesce+join is bounded by
-    /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] as a backstop. `quiesce()` cancels
+    /// Each coordinator quiesce+join is bounded by its own
+    /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] backstop. Because the three drain
+    /// concurrently, the worst-case wait collapses to ~that single
+    /// backstop instead of the sum of all three. `quiesce()` cancels
     /// the loop, which aborts any in-flight pass at its `.await` point, so
     /// the `is_syncing` drain clears promptly and the join normally lands
     /// far inside the window — the deadline fires only if a pass's *drop*
@@ -735,25 +743,42 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
 
         let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
 
-        // Each quiesce() drains any in-flight pass AND joins the thread.
-        let platform_address_sync =
+        // Drain the three independent periodic coordinators *concurrently*.
+        // Each quiesce() drains any in-flight pass AND joins its own OS
+        // thread, touching only that coordinator's own state (no shared
+        // lock), so racing them is sound and collapses the worst case from
+        // the sum of the three backstops to ~max(...). Each drain keeps its
+        // OWN inner `tokio::time::timeout`, so it still yields its own
+        // per-coordinator `CoordinatorThreadStatus` — a single outer timeout
+        // around the whole join! would flatten all three to `Timeout` and
+        // lose that detail.
+        let drain_platform_address = async {
             tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
                 .await
-                .unwrap_or(CoordinatorThreadStatus::Timeout);
-
-        let identity_sync = tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
-            .await
-            .unwrap_or(CoordinatorThreadStatus::Timeout);
-
+                .unwrap_or(CoordinatorThreadStatus::Timeout)
+        };
+        let drain_identity = async {
+            tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
+                .await
+                .unwrap_or(CoordinatorThreadStatus::Timeout)
+        };
         #[cfg(feature = "shielded")]
-        let shielded_sync = {
-            let r = tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
+        let drain_shielded = async {
+            tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
                 .await
-                .unwrap_or(CoordinatorThreadStatus::Timeout);
-            Some(r)
+                .unwrap_or(CoordinatorThreadStatus::Timeout)
+        };
+
+        #[cfg(feature = "shielded")]
+        let (platform_address_sync, identity_sync, shielded_sync) = {
+            let (p, i, s) = tokio::join!(drain_platform_address, drain_identity, drain_shielded);
+            (p, i, Some(s))
         };
         #[cfg(not(feature = "shielded"))]
-        let shielded_sync = None;
+        let (platform_address_sync, identity_sync, shielded_sync) = {
+            let (p, i) = tokio::join!(drain_platform_address, drain_identity);
+            (p, i, None)
+        };
 
         // The event adapter is a tokio task (it sinks the coordinators'
         // stores), so cancel + join it last — after the loops feeding it

From 8c528116a30ac0cb5173236c0fd83d12a2cabc41 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:22:19 +0200
Subject: [PATCH 17/29] feat(dash-async): add shared ThreadRegistry
 worker-lifecycle engine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Centralizes the previously-triplicated background-worker lifecycle —
generation-match exit epilogue, restart reap-or-park, orphan drain — into
one tested engine in the shared dash-async crate, generic over a worker
key and supporting both OS-thread (`!Send` block_on loops) and tokio-task
workers.

Makes two confirmed bugs impossible by construction:
- F1: quiesce/join paths take `&self`; the live JoinHandle stays owned by
  the slot and is never moved into a cancellable future's frame. A
  dropped/timed-out quiesce re-parks the handle into orphans (Timeout)
  instead of dropping and detaching it behind a clean NotRunning.
- F2: any_alive() is the single liveness gate spanning live slots AND
  parked orphans, so store-wiping paths can refuse while a prior thread
  is alive.

Weight-ordered shutdown drains tiers ascending, concurrently within a
tier. WorkerStatus variants are byte-identical to the wallet's
CoordinatorThreadStatus for a stable FFI mapping.

Adds the full registry test suite (TC-001/001b/003-014, F1 shutdown-path
GAP-006, compile-fail DrainHook Send check, default-config and
idempotent-shutdown gaps). 22 tests + 1 doctest green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 Cargo.lock                             |    2 +
 packages/rs-dash-async/Cargo.toml      |    4 +-
 packages/rs-dash-async/src/lib.rs      |   11 +-
 packages/rs-dash-async/src/registry.rs | 1257 ++++++++++++++++++++++++
 4 files changed, 1272 insertions(+), 2 deletions(-)
 create mode 100644 packages/rs-dash-async/src/registry.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1faa308a83..2108bed826 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1617,8 +1617,10 @@ dependencies = [
 name = "dash-async"
 version = "4.0.0-rc.2"
 dependencies = [
+ "futures",
  "thiserror 2.0.18",
  "tokio",
+ "tokio-util",
  "tracing",
 ]
 
diff --git a/packages/rs-dash-async/Cargo.toml b/packages/rs-dash-async/Cargo.toml
index 26e2c8fdeb..69d180e568 100644
--- a/packages/rs-dash-async/Cargo.toml
+++ b/packages/rs-dash-async/Cargo.toml
@@ -13,6 +13,8 @@ tracing = "0.1.41"
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 tokio = { version = "1.40", features = ["rt", "rt-multi-thread", "time", "net"] }
+tokio-util = { version = "0.7.12" }
+futures = { version = "0.3.30" }
 
 [dev-dependencies]
-tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync"] }
+tokio = { version = "1.40", features = ["macros", "rt-multi-thread", "sync", "time"] }
diff --git a/packages/rs-dash-async/src/lib.rs b/packages/rs-dash-async/src/lib.rs
index 3edcf00daa..1ce0820359 100644
--- a/packages/rs-dash-async/src/lib.rs
+++ b/packages/rs-dash-async/src/lib.rs
@@ -3,10 +3,19 @@
 //! Provides [`block_on`] -- a function that bridges async futures into sync code,
 //! handling multiple tokio runtime flavors (no runtime, current-thread, multi-thread).
 //!
-//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets.
+//! Also provides [`AtomicFlagGuard`] — a RAII guard for panic-safe `AtomicBool` flag resets,
+//! and [`ThreadRegistry`] — a shared lifecycle engine for background OS-thread / tokio-task
+//! workers (start, cancel, weight-ordered quiesce + join, orphan reap).
 
 mod atomic;
 mod block_on;
+#[cfg(not(target_arch = "wasm32"))]
+mod registry;
 
 pub use atomic::AtomicFlagGuard;
 pub use block_on::{block_on, AsyncError};
+#[cfg(not(target_arch = "wasm32"))]
+pub use registry::{
+    DrainHook, RegistryKey, ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig,
+    WorkerStatus, DEFAULT_JOIN_BUDGET, DEFAULT_REAP_BACKSTOP,
+};
diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
new file mode 100644
index 0000000000..e7cd835cd8
--- /dev/null
+++ b/packages/rs-dash-async/src/registry.rs
@@ -0,0 +1,1257 @@
+//! Shared lifecycle engine for background workers (`ThreadRegistry`).
+//!
+//! Centralizes the dangerous, previously-triplicated 80% of a background
+//! worker's lifecycle — the generation-match exit epilogue, the
+//! reap-or-park of a restarted worker's prior thread, and the orphan
+//! drain — into one tested place, while deliberately leaving the
+//! domain-specific 20% (the "is a pass in flight?" drain barrier) to the
+//! consumer as a [`DrainHook`].
+//!
+//! Two worker kinds are supported:
+//! - [`start_thread`](ThreadRegistry::start_thread) — a dedicated OS
+//!   thread, for loops that `block_on` `!Send` futures internally (the
+//!   `!Send` value never crosses the spawn boundary; the body itself is
+//!   `Send`).
+//! - [`start_task`](ThreadRegistry::start_task) — a tokio task, for
+//!   `Send` futures.
+//!
+//! # Why F1 and F2 cannot recur
+//!
+//! - **F1** (timeout-dropped quiesce detaches a live thread): every join
+//!   path takes `&self`; the live join handle stays owned by the slot
+//!   and is never moved into a cancellable future's frame. A
+//!   dropped/timed-out [`quiesce`](ThreadRegistry::quiesce) therefore
+//!   cannot drop-and-detach the handle — on timeout (or on an external
+//!   drop) the handle is deterministically re-parked into the orphan
+//!   list, and the slot reports [`WorkerStatus::Timeout`], never a clean
+//!   `NotRunning`.
+//! - **F2** (store wipe races a parked prior-generation thread):
+//!   orphans live in the registry and [`any_alive`](ThreadRegistry::any_alive)
+//!   is the single liveness gate spanning live slots **and** parked
+//!   orphans. Every store-wiping path consults it, so a parked
+//!   still-live thread blocks the wipe.
+
+use std::collections::BTreeMap;
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+use futures::future::FutureExt;
+use tokio::runtime::RuntimeFlavor;
+use tokio_util::sync::CancellationToken;
+
+// ---------------------------------------------------------------------
+// Key & weight
+// ---------------------------------------------------------------------
+
+/// Worker identity. A wallet supplies a fixed enum; rs-dapi a generated
+/// id. Blanket-implemented — consumers just derive the listed bounds on
+/// their own key type.
+pub trait RegistryKey: Copy + Ord + Eq + std::fmt::Debug + Send + Sync + 'static {}
+impl<T: Copy + Ord + Eq + std::fmt::Debug + Send + Sync + 'static> RegistryKey for T {}
+
+/// Teardown order. Lower weights drain first; equal weights drain
+/// concurrently within a tier. Default `0`.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default)]
+pub struct ShutdownWeight(pub i32);
+
+// ---------------------------------------------------------------------
+// Status
+// ---------------------------------------------------------------------
+
+/// Terminal status of one worker. Variant set and payloads are
+/// byte-identical to the wallet's `CoordinatorThreadStatus`, which is
+/// constructed from this via `From` so the FFI surface stays stable.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum WorkerStatus {
+    /// The loop exited and its thread/task joined cleanly.
+    Ok,
+    /// A tokio task ended for a non-panic, non-clean reason (cancelled /
+    /// aborted at the runtime level). Carries a reason when available.
+    /// Only the `Task` kind can produce this; an OS thread never does.
+    Stopped(Option<String>),
+    /// The thread/task panicked; carries the best-effort panic message.
+    Panicked(String),
+    /// The managed join exceeded this worker's `join_budget`. The live
+    /// handle was re-parked into the orphan list — UAF-safe, non-clean.
+    Timeout,
+    /// A parked orphan was still alive after the reap grace — UAF-safe,
+    /// non-clean.
+    Detached,
+    /// No thread/task was running to join — never started, or already
+    /// joined by a prior teardown.
+    NotRunning,
+    /// Infrastructural join failure that is neither a timeout nor a
+    /// panic (unreachable in normal operation).
+    Error(String),
+}
+
+impl WorkerStatus {
+    /// `true` only for a fully clean outcome: joined normally (`Ok`) or
+    /// never ran (`NotRunning`).
+    pub fn is_clean(&self) -> bool {
+        matches!(self, Self::Ok | Self::NotRunning)
+    }
+}
+
+/// Aggregate result of [`ThreadRegistry::shutdown`].
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct ShutdownReport<K: RegistryKey> {
+    /// Per-worker terminal status, keyed by worker id.
+    pub per_worker: BTreeMap<K, WorkerStatus>,
+    /// Number of parked orphans still alive at the reap deadline.
+    pub detached: usize,
+}
+
+impl<K: RegistryKey> ShutdownReport<K> {
+    /// `true` only when every per-worker status is clean and no orphan
+    /// survived the reap.
+    pub fn all_clean(&self) -> bool {
+        self.detached == 0 && self.per_worker.values().all(WorkerStatus::is_clean)
+    }
+}
+
+// ---------------------------------------------------------------------
+// Per-worker registration options
+// ---------------------------------------------------------------------
+
+/// Async drain hook the registry awaits **before** cancelling a worker,
+/// in weight order. The domain barrier (raise a `quiescing` gate, wait
+/// out an in-flight pass) lives here, supplied by the consumer — the
+/// registry never owns domain semantics.
+///
+/// The captured state must be `Send + Sync`; a `!Send` capture does not
+/// compile as a `DrainHook`:
+///
+/// ```compile_fail
+/// use std::rc::Rc;
+/// use std::sync::Arc;
+/// use dash_async::DrainHook;
+/// let rc = Rc::new(42u32); // !Send
+/// let _hook: DrainHook =
+///     Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) });
+/// ```
+pub type DrainHook =
+    Arc<dyn Fn() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send + Sync>;
+
+/// Default managed-join budget when a [`WorkerConfig`] does not override
+/// it. Pinned so an accidental change surfaces in tests.
+pub const DEFAULT_JOIN_BUDGET: Duration = Duration::from_secs(30);
+
+/// Default orphan reap backstop (start-time reap and shutdown grace).
+pub const DEFAULT_REAP_BACKSTOP: Duration = Duration::from_secs(1);
+
+/// Per-worker registration options.
+pub struct WorkerConfig {
+    /// Teardown tier; lower drains first, equal weights concurrently.
+    pub weight: ShutdownWeight,
+    /// Optional drain barrier awaited before cancellation.
+    pub drain: Option<DrainHook>,
+    /// Managed-join timeout for this worker.
+    pub join_budget: Duration,
+}
+
+impl Default for WorkerConfig {
+    fn default() -> Self {
+        Self {
+            weight: ShutdownWeight::default(),
+            drain: None,
+            join_budget: DEFAULT_JOIN_BUDGET,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------
+// Internal handle + slot state
+// ---------------------------------------------------------------------
+
+/// A live worker's join handle. Kept owned by its slot so a cancellable
+/// caller can never move it into a future frame and detach it on drop.
+enum WorkerHandle {
+    OsThread(std::thread::JoinHandle<()>),
+    Task(tokio::task::JoinHandle<()>),
+}
+
+impl WorkerHandle {
+    fn is_finished(&self) -> bool {
+        match self {
+            WorkerHandle::OsThread(h) => h.is_finished(),
+            WorkerHandle::Task(h) => h.is_finished(),
+        }
+    }
+
+    /// Classify a **finished** handle. Kind-dispatched (R3): an OS thread
+    /// yields only `Ok` / `Panicked`; a task can also yield `Stopped`
+    /// (cancelled / aborted at the runtime level).
+    fn classify(self) -> WorkerStatus {
+        match self {
+            WorkerHandle::OsThread(j) => match j.join() {
+                Ok(()) => WorkerStatus::Ok,
+                Err(payload) => WorkerStatus::Panicked(panic_message(payload)),
+            },
+            WorkerHandle::Task(j) => match j.now_or_never() {
+                Some(Ok(())) => WorkerStatus::Ok,
+                Some(Err(e)) if e.is_panic() => {
+                    WorkerStatus::Panicked(panic_message(e.into_panic()))
+                }
+                Some(Err(e)) => WorkerStatus::Stopped(Some(e.to_string())),
+                // Only ever called on a finished handle, so a finished
+                // task is always ready; this arm is defensive.
+                None => WorkerStatus::Error("task handle not ready at join".to_string()),
+            },
+        }
+    }
+}
+
+/// Best-effort extraction of a panic message (`&str` / `String` cases).
+fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
+    if let Some(s) = payload.downcast_ref::<&str>() {
+        (*s).to_string()
+    } else if let Some(s) = payload.downcast_ref::<String>() {
+        s.clone()
+    } else {
+        "<non-string panic>".to_string()
+    }
+}
+
+/// One key's slot. The entry is created on first start and never removed,
+/// so `generation` stays monotonic across the key's whole lifetime — a
+/// parked prior-generation thread can therefore always tell that its
+/// generation is stale. `cancel.is_some()` is the running indicator;
+/// `handle` is the join handle, reaped by the next start or by quiesce.
+struct SlotState {
+    generation: u64,
+    cancel: Option<CancellationToken>,
+    handle: Option<WorkerHandle>,
+    weight: ShutdownWeight,
+    drain: Option<DrainHook>,
+    join_budget: Duration,
+}
+
+impl SlotState {
+    fn dormant() -> Self {
+        Self {
+            generation: 0,
+            cancel: None,
+            handle: None,
+            weight: ShutdownWeight::default(),
+            drain: None,
+            join_budget: DEFAULT_JOIN_BUDGET,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------
+// The registry
+// ---------------------------------------------------------------------
+
+/// Shared lifecycle engine for background workers. See the module docs.
+pub struct ThreadRegistry<K: RegistryKey> {
+    slots: Mutex<BTreeMap<K, SlotState>>,
+    orphans: Mutex<Vec<WorkerHandle>>,
+    reap_backstop: Duration,
+}
+
+impl<K: RegistryKey> ThreadRegistry<K> {
+    /// New registry with the default reap backstop ([`DEFAULT_REAP_BACKSTOP`]).
+    pub fn new() -> Arc<Self> {
+        Self::with_reap_backstop(DEFAULT_REAP_BACKSTOP)
+    }
+
+    /// New registry with an explicit orphan reap backstop (the wallet
+    /// uses 1s — the same grace separates "finishing" from "wedged").
+    pub fn with_reap_backstop(backstop: Duration) -> Arc<Self> {
+        Arc::new(Self {
+            slots: Mutex::new(BTreeMap::new()),
+            orphans: Mutex::new(Vec::new()),
+            reap_backstop: backstop,
+        })
+    }
+
+    /// Start an OS-thread worker for `!Send` loops. `body` runs on a
+    /// fresh `std::thread` and may build and `block_on` `!Send` futures
+    /// internally — the `!Send` value never crosses the spawn boundary
+    /// (`body` itself is `Send`). Starting a key that already has a live
+    /// worker is a no-op; a key whose prior thread has not been reaped is
+    /// reaped-or-parked first (the restart-reap path).
+    ///
+    /// **Requires a multi-thread runtime**: the worker drives its loop
+    /// via `Handle::block_on` and needs the shared timer/IO driver.
+    pub fn start_thread<F>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
+    where
+        F: FnOnce(CancellationToken) + Send + 'static,
+    {
+        Self::assert_multi_thread("start_thread");
+        let prior = {
+            let mut slots = self.lock_slots();
+            let slot = slots.entry(key).or_insert_with(SlotState::dormant);
+            if slot.cancel.is_some() {
+                return;
+            }
+            // Take the prior handle to reap below; bump generation and
+            // install the new token under this one lock so a prior
+            // thread's epilogue observes the post-swap generation.
+            let prior = slot.handle.take();
+            let token = CancellationToken::new();
+            slot.cancel = Some(token.clone());
+            slot.generation += 1;
+            let my_gen = slot.generation;
+            slot.weight = cfg.weight;
+            slot.drain = cfg.drain;
+            slot.join_budget = cfg.join_budget;
+
+            let reg = Arc::clone(self);
+            let body_token = token;
+            let join = std::thread::Builder::new()
+                .name(format!("tr-worker-{key:?}"))
+                .spawn(move || {
+                    body(body_token);
+                    reg.run_epilogue(key, my_gen);
+                })
+                .expect("failed to spawn registry worker thread");
+            // Store the handle while still under the slot lock; the guard
+            // is released at the end of this block, BEFORE the reap below
+            // (R1: store handle -> drop guard -> THEN reap-or-park).
+            slot.handle = Some(WorkerHandle::OsThread(join));
+            prior
+        };
+
+        // The prior thread was cancellation-signalled by a preceding
+        // cancel(); with the slot lock released its epilogue completes
+        // promptly and the join lands in milliseconds. The backstop fires
+        // only on a genuine wedge, in which case the still-live handle is
+        // parked (not dropped) so teardown can account for it.
+        self.reap_prior_or_park(prior, key);
+    }
+
+    /// Start a tokio-task worker for `Send` futures. Same restart-reap
+    /// semantics as [`start_thread`](Self::start_thread); does not require
+    /// a multi-thread runtime.
+    pub fn start_task<F, Fut>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
+    where
+        F: FnOnce(CancellationToken) -> Fut + Send + 'static,
+        Fut: Future<Output = ()> + Send + 'static,
+    {
+        let prior = {
+            let mut slots = self.lock_slots();
+            let slot = slots.entry(key).or_insert_with(SlotState::dormant);
+            if slot.cancel.is_some() {
+                return;
+            }
+            let prior = slot.handle.take();
+            let token = CancellationToken::new();
+            slot.cancel = Some(token.clone());
+            slot.generation += 1;
+            let my_gen = slot.generation;
+            slot.weight = cfg.weight;
+            slot.drain = cfg.drain;
+            slot.join_budget = cfg.join_budget;
+
+            let reg = Arc::clone(self);
+            let body_token = token;
+            let join = tokio::spawn(async move {
+                body(body_token).await;
+                reg.run_epilogue(key, my_gen);
+            });
+            slot.handle = Some(WorkerHandle::Task(join));
+            prior
+        };
+        self.reap_prior_or_park(prior, key);
+    }
+
+    /// Whether a worker is currently registered and running for `key`.
+    pub fn is_running(&self, key: K) -> bool {
+        self.lock_slots()
+            .get(&key)
+            .map(|s| s.cancel.is_some())
+            .unwrap_or(false)
+    }
+
+    /// Signal-only cancellation of one worker (was `stop()`).
+    pub fn cancel(&self, key: K) {
+        if let Some(slot) = self.lock_slots().get_mut(&key) {
+            if let Some(token) = slot.cancel.take() {
+                token.cancel();
+            }
+        }
+    }
+
+    /// Signal-only cancellation of every registered worker.
+    pub fn cancel_all(&self) {
+        for slot in self.lock_slots().values_mut() {
+            if let Some(token) = slot.cancel.take() {
+                token.cancel();
+            }
+        }
+    }
+
+    /// Await this worker's drain hook, cancel it, then join within its
+    /// budget. The live handle is owned by the slot and is **never** moved
+    /// into this future's frame, so a dropped/timed-out call cannot detach
+    /// it; on the managed timeout — or if this future is dropped
+    /// mid-poll — the handle is re-parked into the orphan list. [F1 FIX]
+    pub async fn quiesce(&self, key: K) -> WorkerStatus {
+        // Snapshot the drain hook + budget, and bail early if nothing is
+        // registered for this key.
+        let (drain, budget) = {
+            let slots = self.lock_slots();
+            match slots.get(&key) {
+                Some(s) if s.cancel.is_some() || s.handle.is_some() => {
+                    (s.drain.clone(), s.join_budget)
+                }
+                _ => return WorkerStatus::NotRunning,
+            }
+        };
+
+        // R2: gate-before-cancel — fully await the drain hook before the
+        // cancel signal is observed.
+        if let Some(drain) = drain {
+            drain().await;
+        }
+
+        // Signal-only cancel.
+        if let Some(slot) = self.lock_slots().get_mut(&key) {
+            if let Some(token) = slot.cancel.take() {
+                token.cancel();
+            }
+        }
+
+        // Poll-join within budget. The re-park guard moves the slot's
+        // still-live handle into orphans if this future is dropped before
+        // the loop finishes — the handle is never owned by this frame.
+        let _repark = Repark { reg: self, key };
+        let deadline = Instant::now() + budget;
+        loop {
+            enum Step {
+                Classify(WorkerHandle),
+                Park(WorkerHandle),
+                NotRunning,
+                Wait,
+            }
+            let step = {
+                let mut slots = self.lock_slots();
+                match slots.get_mut(&key) {
+                    None => Step::NotRunning,
+                    Some(slot) => match slot.handle.take_if(|h| h.is_finished()) {
+                        Some(h) => Step::Classify(h),
+                        None if slot.handle.is_none() => Step::NotRunning,
+                        None if Instant::now() >= deadline => {
+                            Step::Park(slot.handle.take().expect("handle present"))
+                        }
+                        None => Step::Wait,
+                    },
+                }
+            };
+            match step {
+                Step::Classify(h) => return h.classify(),
+                Step::Park(h) => {
+                    self.lock_orphans().push(h);
+                    return WorkerStatus::Timeout;
+                }
+                Step::NotRunning => return WorkerStatus::NotRunning,
+                Step::Wait => tokio::time::sleep(Duration::from_millis(5)).await,
+            }
+        }
+    }
+
+    /// Is any registered worker **or** parked orphan still alive?
+    /// Store-wiping paths must gate on this returning `false` before
+    /// destroying shared state. [F2 FIX]
+    pub fn any_alive(&self) -> bool {
+        {
+            let slots = self.lock_slots();
+            for slot in slots.values() {
+                if slot.cancel.is_some() {
+                    return true;
+                }
+                if let Some(handle) = &slot.handle {
+                    if !handle.is_finished() {
+                        return true;
+                    }
+                }
+            }
+        }
+        self.lock_orphans().iter().any(|h| !h.is_finished())
+    }
+
+    /// Reap parked orphans with a short grace; survivors are re-parked and
+    /// reported as [`WorkerStatus::Detached`] (idempotent retry).
+    pub async fn reap_orphans(&self, grace: Duration) -> WorkerStatus {
+        self.reap_orphans_impl(grace).await.0
+    }
+
+    /// Weight-ordered teardown: ascending tier by tier, each worker's
+    /// (drain-hook -> cancel -> join) run concurrently within a tier;
+    /// orphan reap runs last. **Requires a multi-thread runtime.**
+    pub async fn shutdown(&self) -> ShutdownReport<K> {
+        Self::assert_multi_thread("shutdown");
+
+        // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in
+        // ascending weight order, giving the lower-first drain.
+        let tiers: BTreeMap<ShutdownWeight, Vec<K>> = {
+            let slots = self.lock_slots();
+            let mut tiers: BTreeMap<ShutdownWeight, Vec<K>> = BTreeMap::new();
+            for (key, slot) in slots.iter() {
+                tiers.entry(slot.weight).or_default().push(*key);
+            }
+            tiers
+        };
+
+        let mut per_worker = BTreeMap::new();
+        for (_weight, keys) in tiers {
+            // Drain every worker in this tier concurrently: each
+            // quiesce() drives its own drain-hook -> cancel -> join, and
+            // `join_all` polls them on one task so their drain hooks
+            // interleave (equal-weight concurrency).
+            let drained = keys.into_iter().map(|key| async move { (key, self.quiesce(key).await) });
+            for (key, status) in futures::future::join_all(drained).await {
+                per_worker.insert(key, status);
+            }
+        }
+
+        // Account for parked orphans last.
+        let (_status, detached) = self.reap_orphans_impl(self.reap_backstop).await;
+        ShutdownReport {
+            per_worker,
+            detached,
+        }
+    }
+
+    // -----------------------------------------------------------------
+    // Internal helpers
+    // -----------------------------------------------------------------
+
+    fn lock_slots(&self) -> std::sync::MutexGuard<'_, BTreeMap<K, SlotState>> {
+        self.slots.lock().unwrap_or_else(|e| e.into_inner())
+    }
+
+    fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<WorkerHandle>> {
+        self.orphans.lock().unwrap_or_else(|e| e.into_inner())
+    }
+
+    fn assert_multi_thread(ctx: &str) {
+        assert!(
+            matches!(
+                tokio::runtime::Handle::current().runtime_flavor(),
+                RuntimeFlavor::MultiThread
+            ),
+            "ThreadRegistry::{ctx}() requires a multi-thread Tokio runtime: an \
+             OS-thread worker drives its loop via Handle::block_on and needs the \
+             runtime's timer/IO driver, but a current_thread runtime can only \
+             drive one block_on at a time"
+        );
+    }
+
+    /// Gen-gated exit epilogue, run on the worker after its body returns:
+    /// clear this slot's running flag only if a newer start has not since
+    /// installed a replacement.
+    fn run_epilogue(&self, key: K, my_gen: u64) {
+        if let Some(slot) = self.lock_slots().get_mut(&key) {
+            if slot.generation == my_gen {
+                slot.cancel = None;
+            }
+        }
+    }
+
+    /// Reap a restarted key's prior worker — or park it if it is genuinely
+    /// wedged past the reap backstop. Must be called with no registry lock
+    /// held (it spins synchronously for an OS thread).
+    fn reap_prior_or_park(&self, prior: Option<WorkerHandle>, key: K) {
+        let Some(handle) = prior else {
+            return;
+        };
+        match handle {
+            WorkerHandle::OsThread(h) => {
+                let deadline = Instant::now() + self.reap_backstop;
+                loop {
+                    if h.is_finished() {
+                        let _ = h.join();
+                        return;
+                    }
+                    if Instant::now() >= deadline {
+                        tracing::warn!(
+                            ?key,
+                            backstop = ?self.reap_backstop,
+                            "prior worker thread did not finish within the reap \
+                             backstop after cancellation; parking it as an orphan \
+                             for teardown to join rather than detaching it"
+                        );
+                        self.lock_orphans().push(WorkerHandle::OsThread(h));
+                        return;
+                    }
+                    std::thread::sleep(Duration::from_millis(5));
+                }
+            }
+            // A task can't be joined synchronously here; park a still-live
+            // one for async reap. A finished one is dropped (detaching a
+            // finished task is a no-op).
+            task => {
+                if !task.is_finished() {
+                    self.lock_orphans().push(task);
+                }
+            }
+        }
+    }
+
+    /// Drain the orphan list, polling until `grace`. Returns the terminal
+    /// status and the number of survivors re-parked for an idempotent
+    /// retry.
+    async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) {
+        let mut pending: Vec<WorkerHandle> = {
+            let mut guard = self.lock_orphans();
+            std::mem::take(&mut *guard)
+        };
+        if pending.is_empty() {
+            return (WorkerStatus::Ok, 0);
+        }
+
+        let deadline = Instant::now() + grace;
+        // Keep the first non-clean terminal status; a live survivor still
+        // takes precedence at the deadline.
+        let mut non_clean: Option<WorkerStatus> = None;
+        loop {
+            let mut still_live = Vec::with_capacity(pending.len());
+            for handle in pending.drain(..) {
+                if handle.is_finished() {
+                    let status = handle.classify();
+                    if !status.is_clean() {
+                        non_clean.get_or_insert(status);
+                    }
+                } else {
+                    still_live.push(handle);
+                }
+            }
+            pending = still_live;
+
+            if pending.is_empty() {
+                return (non_clean.unwrap_or(WorkerStatus::Ok), 0);
+            }
+            if Instant::now() >= deadline {
+                let survivors = pending.len();
+                self.lock_orphans().extend(pending);
+                return (WorkerStatus::Detached, survivors);
+            }
+            tokio::time::sleep(Duration::from_millis(5)).await;
+        }
+    }
+
+    /// Test-only seam: park a raw thread handle as an orphan. Used by
+    /// cross-crate regression tests (e.g. the wallet's F2 gate) that must
+    /// inject a wedged prior-generation thread without driving the full
+    /// restart-reap path.
+    #[doc(hidden)]
+    pub fn park_orphan_for_test(&self, handle: std::thread::JoinHandle<()>) {
+        self.lock_orphans().push(WorkerHandle::OsThread(handle));
+    }
+}
+
+/// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future
+/// is dropped before it finishes (e.g. an outer timeout fires), this moves
+/// the slot's still-live handle into the orphan list instead of letting it
+/// be dropped-and-detached. On normal completion the handle has already
+/// been taken from the slot, so this is a no-op.
+struct Repark<'a, K: RegistryKey> {
+    reg: &'a ThreadRegistry<K>,
+    key: K,
+}
+
+impl<K: RegistryKey> Drop for Repark<'_, K> {
+    fn drop(&mut self) {
+        // Take the handle under the slot lock, release it, then push to
+        // orphans — never nest the two locks.
+        let handle = self
+            .reg
+            .lock_slots()
+            .get_mut(&self.key)
+            .and_then(|slot| slot.handle.take());
+        if let Some(handle) = handle {
+            self.reg.lock_orphans().push(handle);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::panic::{catch_unwind, AssertUnwindSafe};
+    use std::sync::atomic::{AtomicBool, Ordering};
+    use std::sync::mpsc;
+    use tokio::runtime::{Builder, Handle};
+    use tokio::sync::Barrier;
+
+    type Reg = Arc<ThreadRegistry<&'static str>>;
+
+    /// Start an OS-thread worker that exits cleanly when cancelled. The
+    /// runtime handle is captured from the caller's context (the worker
+    /// thread is not itself a tokio worker, so it can't fetch its own).
+    fn start_clean(reg: &Reg, key: &'static str, cfg: WorkerConfig) {
+        let handle = Handle::current();
+        reg.start_thread(key, cfg, move |cancel| {
+            handle.block_on(async move { cancel.cancelled().await });
+        });
+    }
+
+    /// Body for a worker wedged in a non-yielding section: blocks on a
+    /// channel and ignores its cancellation token (stands in for a thread
+    /// stuck in a `Drop` that never observes cancel).
+    fn wedged_body(rx: mpsc::Receiver<()>) -> impl FnOnce(CancellationToken) + Send + 'static {
+        move |_cancel| {
+            let _ = rx.recv();
+        }
+    }
+
+    fn orphan_len(reg: &Reg) -> usize {
+        reg.lock_orphans().len()
+    }
+
+    // ----- Group 1: F1 regression -------------------------------------
+
+    /// TC-001 — a `quiesce` whose outer future is dropped (a tiny enclosing
+    /// timeout) must re-park the live handle, never drop-and-detach it. The
+    /// slot is cleared (`is_running == false`) but the handle lives in
+    /// orphans and `any_alive()` stays true.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc001_quiesce_drop_reparks_handle_not_detach() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        reg.start_thread("alpha", WorkerConfig::default(), wedged_body(release_rx));
+        assert!(reg.is_running("alpha"));
+
+        // The wedged worker never observes cancel, so the internal 30s
+        // budget can't fire here; the tiny outer timeout drops the quiesce
+        // future mid-poll. A naive by-value-into-future impl would detach
+        // the handle (orphans empty, any_alive false); the fix re-parks it.
+        let result =
+            tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await;
+        assert!(result.is_err(), "outer timeout must fire on the wedged worker");
+
+        assert!(reg.any_alive(), "re-parked handle keeps any_alive true");
+        assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)");
+        assert_eq!(orphan_len(&reg), 1, "handle was re-parked, not detached");
+        assert!(!WorkerStatus::Timeout.is_clean());
+
+        // Release + reap: the orphan joins cleanly and liveness clears.
+        release_tx.send(()).unwrap();
+        assert_eq!(
+            reg.reap_orphans(Duration::from_secs(2)).await,
+            WorkerStatus::Ok
+        );
+        assert!(!reg.any_alive());
+    }
+
+    /// TC-001b — internal-budget variant: a wedged worker with a tiny
+    /// `join_budget` makes `quiesce` itself time out, re-park, and return
+    /// `Timeout` (no outer drop involved).
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc001b_quiesce_internal_budget_timeout_reparks() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        let cfg = WorkerConfig {
+            join_budget: Duration::from_millis(50),
+            ..WorkerConfig::default()
+        };
+        reg.start_thread("alpha", cfg, wedged_body(release_rx));
+
+        let status = reg.quiesce("alpha").await;
+        assert_eq!(status, WorkerStatus::Timeout);
+        assert_eq!(orphan_len(&reg), 1);
+        assert!(reg.any_alive());
+        assert!(!reg.is_running("alpha"));
+
+        release_tx.send(()).unwrap();
+        assert_eq!(
+            reg.reap_orphans(Duration::from_secs(2)).await,
+            WorkerStatus::Ok
+        );
+        assert!(!reg.any_alive());
+    }
+
+    /// GAP-006 — the F1 scenario via the `shutdown()` path: a wedged worker
+    /// with a tiny budget surfaces as `Timeout` in the report, its handle
+    /// is re-parked (`detached == 1`, `any_alive`), and the result is
+    /// non-clean — never a clean detach.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn gap006_shutdown_path_reparks_wedged_worker() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        let cfg = WorkerConfig {
+            join_budget: Duration::from_millis(50),
+            ..WorkerConfig::default()
+        };
+        reg.start_thread("alpha", cfg, wedged_body(release_rx));
+
+        let report = tokio::time::timeout(Duration::from_secs(10), reg.shutdown())
+            .await
+            .expect("shutdown must complete within bound");
+        assert_eq!(report.per_worker.get("alpha"), Some(&WorkerStatus::Timeout));
+        assert_eq!(report.detached, 1, "wedged handle re-parked, survived reap");
+        assert!(!report.all_clean());
+        assert!(reg.any_alive());
+
+        // Cleanup.
+        release_tx.send(()).unwrap();
+        let _ = reg.reap_orphans(Duration::from_secs(5)).await;
+        assert!(!reg.any_alive());
+    }
+
+    // ----- Group 3: registry unit suite -------------------------------
+
+    /// TC-003 — a slow prior-generation thread's epilogue must NOT clear a
+    /// newer generation's token. Restarting reaps the prior generation
+    /// fully (its epilogue runs); the new generation stays tracked.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc003_generation_match_epilogue_preserves_new_token() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "beta", WorkerConfig::default()); // gen 1
+        assert!(reg.is_running("beta"));
+
+        // Cancel gen 1, then restart. start_thread's reap joins gen 1
+        // (running its gen-gated epilogue) before returning, so this is
+        // deterministic: if the epilogue ignored generation it would have
+        // cleared gen 2's token during that join.
+        reg.cancel("beta");
+        start_clean(&reg, "beta", WorkerConfig::default()); // gen 2
+
+        assert!(
+            reg.is_running("beta"),
+            "gen-2 token must survive gen-1's epilogue"
+        );
+        assert_eq!(reg.quiesce("beta").await, WorkerStatus::Ok);
+    }
+
+    /// TC-004 — a naturally-finished prior thread is joined cleanly on
+    /// restart, with no parking.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc004_restart_reaps_finished_prior_without_parking() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "gamma", WorkerConfig::default());
+        // Cancel so the prior exits, then restart: the reap must join it,
+        // not park it.
+        reg.cancel("gamma");
+        start_clean(&reg, "gamma", WorkerConfig::default());
+        assert_eq!(orphan_len(&reg), 0, "finished prior was joined, not parked");
+        assert!(reg.is_running("gamma"));
+        assert_eq!(reg.quiesce("gamma").await, WorkerStatus::Ok);
+    }
+
+    /// TC-005 — a prior thread wedged past the reap backstop is parked in
+    /// orphans (not dropped), then drained after release.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc005_restart_parks_wedged_prior() {
+        let reg = ThreadRegistry::with_reap_backstop(Duration::from_millis(100));
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+
+        // gen 1: wedged (ignores cancel).
+        reg.start_thread("delta", WorkerConfig::default(), wedged_body(release_rx));
+        reg.cancel("delta");
+
+        // gen 2: clean. The restart reaps gen 1 — wedged past the 100ms
+        // backstop, so it is parked. Run off the runtime workers since the
+        // reap spins synchronously.
+        let reg_for_start = Arc::clone(&reg);
+        let parent = Handle::current();
+        tokio::task::spawn_blocking(move || {
+            let handle = parent.clone();
+            reg_for_start.start_thread("delta", WorkerConfig::default(), move |cancel| {
+                handle.block_on(async move { cancel.cancelled().await });
+            });
+        })
+        .await
+        .unwrap();
+
+        assert_eq!(orphan_len(&reg), 1, "wedged prior parked, not dropped");
+        assert!(reg.any_alive());
+        assert!(reg.is_running("delta"), "gen-2 loop started");
+
+        // Release the wedged prior; reap drains it.
+        release_tx.send(()).unwrap();
+        assert_eq!(
+            reg.reap_orphans(Duration::from_secs(2)).await,
+            WorkerStatus::Ok
+        );
+        assert_eq!(orphan_len(&reg), 0);
+
+        // Cleanup gen 2.
+        assert_eq!(reg.quiesce("delta").await, WorkerStatus::Ok);
+    }
+
+    /// TC-006 — orphan drain: a survivor at the grace deadline is reported
+    /// `Detached` and re-parked; once released it reaps `Ok`.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc006_orphan_drain_detached_then_ok() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        let wedged = std::thread::spawn(move || {
+            let _ = release_rx.recv();
+        });
+        reg.park_orphan_for_test(wedged);
+
+        assert_eq!(
+            reg.reap_orphans(Duration::from_millis(50)).await,
+            WorkerStatus::Detached
+        );
+        assert_eq!(orphan_len(&reg), 1, "survivor re-parked for retry");
+        assert!(reg.any_alive());
+
+        release_tx.send(()).unwrap();
+        assert_eq!(
+            reg.reap_orphans(Duration::from_secs(2)).await,
+            WorkerStatus::Ok
+        );
+        assert_eq!(orphan_len(&reg), 0);
+        assert!(!reg.any_alive());
+    }
+
+    /// TC-007 — weight-ordered shutdown drains a lower tier before a higher
+    /// one.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc007_weight_ordered_shutdown_drains_low_first() {
+        let reg = ThreadRegistry::<&str>::new();
+        let log = Arc::new(Mutex::new(Vec::<&'static str>::new()));
+
+        let mk_hook = |tag: &'static str, log: Arc<Mutex<Vec<&'static str>>>| -> DrainHook {
+            Arc::new(move || {
+                let log = Arc::clone(&log);
+                Box::pin(async move {
+                    log.lock().unwrap().push(tag);
+                })
+            })
+        };
+
+        start_clean(
+            &reg,
+            "w0",
+            WorkerConfig {
+                weight: ShutdownWeight(0),
+                drain: Some(mk_hook("w0", Arc::clone(&log))),
+                ..WorkerConfig::default()
+            },
+        );
+        start_clean(
+            &reg,
+            "w5",
+            WorkerConfig {
+                weight: ShutdownWeight(5),
+                drain: Some(mk_hook("w5", Arc::clone(&log))),
+                ..WorkerConfig::default()
+            },
+        );
+        start_clean(
+            &reg,
+            "w10",
+            WorkerConfig {
+                weight: ShutdownWeight(10),
+                drain: Some(mk_hook("w10", Arc::clone(&log))),
+                ..WorkerConfig::default()
+            },
+        );
+
+        let report = reg.shutdown().await;
+        assert!(report.all_clean());
+
+        let log = log.lock().unwrap();
+        let pos = |tag| log.iter().position(|t| *t == tag).unwrap();
+        assert!(pos("w0") < pos("w5"));
+        assert!(pos("w5") < pos("w10"));
+    }
+
+    /// TC-008 — equal-weight workers drain concurrently. A shared
+    /// `Barrier(2)` in both drain hooks would deadlock under sequential
+    /// draining (caught by the enclosing timeout); the event log proves
+    /// both arrived before either passed.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc008_equal_weight_drains_concurrently() {
+        let reg = ThreadRegistry::<&str>::new();
+        let log = Arc::new(Mutex::new(Vec::<&'static str>::new()));
+        let barrier = Arc::new(Barrier::new(2));
+
+        let mk_hook = |arrived: &'static str,
+                       passed: &'static str,
+                       log: Arc<Mutex<Vec<&'static str>>>,
+                       barrier: Arc<Barrier>|
+         -> DrainHook {
+            Arc::new(move || {
+                let log = Arc::clone(&log);
+                let barrier = Arc::clone(&barrier);
+                Box::pin(async move {
+                    log.lock().unwrap().push(arrived);
+                    barrier.wait().await;
+                    log.lock().unwrap().push(passed);
+                })
+            })
+        };
+
+        start_clean(
+            &reg,
+            "a",
+            WorkerConfig {
+                weight: ShutdownWeight(0),
+                drain: Some(mk_hook("a_arrived", "a_passed", Arc::clone(&log), Arc::clone(&barrier))),
+                ..WorkerConfig::default()
+            },
+        );
+        start_clean(
+            &reg,
+            "b",
+            WorkerConfig {
+                weight: ShutdownWeight(0),
+                drain: Some(mk_hook("b_arrived", "b_passed", Arc::clone(&log), Arc::clone(&barrier))),
+                ..WorkerConfig::default()
+            },
+        );
+
+        let report = tokio::time::timeout(Duration::from_secs(5), reg.shutdown())
+            .await
+            .expect("equal-weight drain must not deadlock (proves concurrency)");
+        assert!(report.all_clean());
+
+        let log = log.lock().unwrap();
+        let pos = |tag| log.iter().position(|t| *t == tag).unwrap();
+        let last_arrived = pos("a_arrived").max(pos("b_arrived"));
+        let first_passed = pos("a_passed").min(pos("b_passed"));
+        assert!(
+            last_arrived < first_passed,
+            "both hooks must reach the barrier before either passes: {log:?}"
+        );
+    }
+
+    /// TC-009 — `any_alive()` accounts for both live slots and orphans.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc009_any_alive_spans_slots_and_orphans() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "alpha", WorkerConfig::default());
+        assert!(reg.any_alive());
+
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        let wedged = std::thread::spawn(move || {
+            let _ = release_rx.recv();
+        });
+        reg.park_orphan_for_test(wedged);
+        assert!(reg.any_alive());
+
+        assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok);
+        assert!(reg.any_alive(), "orphan still contributes after slot drains");
+        assert!(!reg.is_running("alpha"));
+
+        release_tx.send(()).unwrap();
+        let _ = reg.reap_orphans(Duration::from_secs(2)).await;
+        assert!(!reg.any_alive());
+    }
+
+    /// TC-010 — `shutdown()` panics with a documented message on a
+    /// current-thread runtime (R4, variant B).
+    #[test]
+    fn tc010_shutdown_asserts_multi_thread_runtime() {
+        let rt = Builder::new_current_thread().enable_all().build().unwrap();
+        let reg = ThreadRegistry::<&str>::new();
+        let result = catch_unwind(AssertUnwindSafe(|| {
+            rt.block_on(async { reg.shutdown().await });
+        }));
+        let payload = result.expect_err("shutdown must panic on current_thread");
+        let msg = payload
+            .downcast_ref::<String>()
+            .map(String::as_str)
+            .or_else(|| payload.downcast_ref::<&str>().copied())
+            .unwrap_or("");
+        assert!(
+            msg.contains("multi-thread"),
+            "panic must name the runtime constraint, got: {msg}"
+        );
+    }
+
+    // ----- Group 4: DrainHook ordering --------------------------------
+
+    /// TC-011 — the drain hook is fully awaited before the cancel signal is
+    /// observed by the worker.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc011_drain_hook_completes_before_cancel() {
+        let reg = ThreadRegistry::<&str>::new();
+        let log = Arc::new(Mutex::new(Vec::<&'static str>::new()));
+
+        let log_hook = Arc::clone(&log);
+        let drain: DrainHook = Arc::new(move || {
+            let log = Arc::clone(&log_hook);
+            Box::pin(async move {
+                log.lock().unwrap().push("drain_hook_start");
+                tokio::time::sleep(Duration::from_millis(10)).await;
+                log.lock().unwrap().push("drain_hook_complete");
+            })
+        });
+
+        let log_worker = Arc::clone(&log);
+        let handle = Handle::current();
+        reg.start_thread(
+            "epsilon",
+            WorkerConfig {
+                drain: Some(drain),
+                ..WorkerConfig::default()
+            },
+            move |cancel| {
+                handle.block_on(async move {
+                    cancel.cancelled().await;
+                    log_worker.lock().unwrap().push("cancel_observed");
+                });
+            },
+        );
+
+        assert_eq!(reg.quiesce("epsilon").await, WorkerStatus::Ok);
+        assert!(!reg.is_running("epsilon"));
+
+        let log = log.lock().unwrap();
+        let pos = |tag| log.iter().position(|t| *t == tag).unwrap();
+        assert!(pos("drain_hook_start") < pos("drain_hook_complete"));
+        assert!(pos("drain_hook_complete") < pos("cancel_observed"));
+    }
+
+    /// TC-012 — a `quiesce` blocks in the drain hook until an `is_syncing`
+    /// barrier the hook polls falls, and only then cancels + joins.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc012_drain_hook_observes_barrier_before_join() {
+        let reg = ThreadRegistry::<&str>::new();
+        let is_syncing = Arc::new(AtomicBool::new(true));
+
+        let gate = Arc::clone(&is_syncing);
+        let drain: DrainHook = Arc::new(move || {
+            let gate = Arc::clone(&gate);
+            Box::pin(async move {
+                while gate.load(Ordering::Acquire) {
+                    tokio::time::sleep(Duration::from_millis(5)).await;
+                }
+            })
+        });
+        start_clean(
+            &reg,
+            "zeta",
+            WorkerConfig {
+                drain: Some(drain),
+                ..WorkerConfig::default()
+            },
+        );
+
+        let quiesce_completed = Arc::new(AtomicBool::new(false));
+        let reg_q = Arc::clone(&reg);
+        let done = Arc::clone(&quiesce_completed);
+        let quiesce_task = tokio::spawn(async move {
+            let status = reg_q.quiesce("zeta").await;
+            done.store(true, Ordering::Release);
+            status
+        });
+
+        // While the barrier is held, quiesce must stay pending.
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        assert!(
+            !quiesce_completed.load(Ordering::Acquire),
+            "quiesce must block while is_syncing is held"
+        );
+
+        // Release the barrier; quiesce drains, cancels, joins.
+        is_syncing.store(false, Ordering::Release);
+        let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task)
+            .await
+            .expect("quiesce must complete once the barrier falls")
+            .unwrap();
+        assert_eq!(status, WorkerStatus::Ok);
+        assert!(quiesce_completed.load(Ordering::Acquire));
+    }
+
+    // ----- Group 5: status classification -----------------------------
+
+    /// TC-013 — only the `Task` kind can classify as `Stopped` (from a
+    /// runtime-level cancel/abort JoinError); a cooperatively token-
+    /// cancelled task exits normally as `Ok`. Verifies the kind-dispatch
+    /// at the classification boundary.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc013_task_kind_classifies_stopped_and_ok() {
+        // Stopped: an aborted task yields a cancelled JoinError.
+        let aborted = tokio::spawn(std::future::pending::<()>());
+        aborted.abort();
+        while !aborted.is_finished() {
+            tokio::time::sleep(Duration::from_millis(1)).await;
+        }
+        let status = WorkerHandle::Task(aborted).classify();
+        assert!(matches!(status, WorkerStatus::Stopped(_)), "got {status:?}");
+        assert!(!status.is_clean());
+
+        // Ok: a cooperatively token-cancelled task returns normally.
+        let reg = ThreadRegistry::<&str>::new();
+        reg.start_task("task_a", WorkerConfig::default(), |cancel| async move {
+            cancel.cancelled().await;
+        });
+        assert_eq!(reg.quiesce("task_a").await, WorkerStatus::Ok);
+        assert!(!reg.is_running("task_a"));
+    }
+
+    /// TC-014 — an `OsThread` worker yields `Ok` (clean) or `Panicked`
+    /// (`&str` and `String` payloads), never `Stopped`.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn tc014_os_thread_ok_and_panicked_never_stopped() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "os_clean", WorkerConfig::default());
+        let ok = reg.quiesce("os_clean").await;
+        assert_eq!(ok, WorkerStatus::Ok);
+        assert!(ok.is_clean());
+
+        // &str panic payload.
+        reg.start_thread("os_panic_str", WorkerConfig::default(), |_cancel| {
+            panic!("deliberate test panic");
+        });
+        match reg.quiesce("os_panic_str").await {
+            WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate test panic")),
+            other => panic!("expected Panicked, got {other:?}"),
+        }
+
+        // String panic payload.
+        reg.start_thread("os_panic_string", WorkerConfig::default(), |_cancel| {
+            std::panic::panic_any(String::from("deliberate string panic"));
+        });
+        match reg.quiesce("os_panic_string").await {
+            WorkerStatus::Panicked(msg) => assert!(msg.contains("deliberate string panic")),
+            other => panic!("expected Panicked, got {other:?}"),
+        }
+    }
+
+    // ----- Gaps -------------------------------------------------------
+
+    /// GAP-003 — `shutdown()` is idempotent: a second call finds every slot
+    /// already joined and reports `NotRunning`, still clean.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn gap003_shutdown_is_idempotent() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "alpha", WorkerConfig::default());
+
+        let first = reg.shutdown().await;
+        assert_eq!(first.per_worker.get("alpha"), Some(&WorkerStatus::Ok));
+        assert!(first.all_clean());
+
+        let second = reg.shutdown().await;
+        assert_eq!(
+            second.per_worker.get("alpha"),
+            Some(&WorkerStatus::NotRunning)
+        );
+        assert!(second.all_clean());
+    }
+
+    /// GAP-004 — `cancel(key)` is selective: cancelling A does not touch B.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn gap004_cancel_is_selective() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "a", WorkerConfig::default());
+        start_clean(&reg, "b", WorkerConfig::default());
+
+        reg.cancel("a");
+        assert!(reg.is_running("b"), "cancel(a) must not cancel b");
+        assert_eq!(reg.quiesce("a").await, WorkerStatus::Ok);
+        assert!(reg.is_running("b"), "b still running after a drains");
+        assert_eq!(reg.quiesce("b").await, WorkerStatus::Ok);
+    }
+
+    /// GAP-005 — `WorkerConfig::default()` values are pinned.
+    #[test]
+    fn gap005_worker_config_defaults_pinned() {
+        let cfg = WorkerConfig::default();
+        assert_eq!(cfg.weight, ShutdownWeight(0));
+        assert!(cfg.drain.is_none());
+        assert_eq!(cfg.join_budget, DEFAULT_JOIN_BUDGET);
+    }
+}

From ac9a51a7c70f25cc307ef076cb0e9498c6a67f9b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:27:04 +0200
Subject: [PATCH 18/29] feat(dash-async): key-scope parked orphans for
 any_alive_for()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tag each parked orphan with its originating worker key and add
any_alive_for(key), so a store-wiping path scoped to one worker (the
wallet's clear_shielded F2 gate) can refuse only while that worker — its
slot or a parked prior-generation thread under its key — is alive, without
being blocked by unrelated workers that are legitimately running (e.g. the
always-on event-adapter task). Registry-wide any_alive() is retained.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-dash-async/src/registry.rs | 108 ++++++++++++++++++-------
 1 file changed, 81 insertions(+), 27 deletions(-)

diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index e7cd835cd8..d04d8cbbbc 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -247,9 +247,13 @@ impl SlotState {
 // ---------------------------------------------------------------------
 
 /// Shared lifecycle engine for background workers. See the module docs.
+///
+/// Parked orphans carry their originating key so a store-wiping path for
+/// one worker can gate on [`any_alive_for`](Self::any_alive_for) without
+/// being blocked by an unrelated worker still legitimately running.
 pub struct ThreadRegistry<K: RegistryKey> {
     slots: Mutex<BTreeMap<K, SlotState>>,
-    orphans: Mutex<Vec<WorkerHandle>>,
+    orphans: Mutex<Vec<(K, WorkerHandle)>>,
     reap_backstop: Duration,
 }
 
@@ -446,7 +450,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             match step {
                 Step::Classify(h) => return h.classify(),
                 Step::Park(h) => {
-                    self.lock_orphans().push(h);
+                    self.lock_orphans().push((key, h));
                     return WorkerStatus::Timeout;
                 }
                 Step::NotRunning => return WorkerStatus::NotRunning,
@@ -455,24 +459,34 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         }
     }
 
-    /// Is any registered worker **or** parked orphan still alive?
-    /// Store-wiping paths must gate on this returning `false` before
-    /// destroying shared state. [F2 FIX]
+    /// Is any registered worker **or** parked orphan still alive across
+    /// the whole registry?
     pub fn any_alive(&self) -> bool {
         {
             let slots = self.lock_slots();
             for slot in slots.values() {
-                if slot.cancel.is_some() {
+                if slot_alive(slot) {
                     return true;
                 }
-                if let Some(handle) = &slot.handle {
-                    if !handle.is_finished() {
-                        return true;
-                    }
-                }
             }
         }
-        self.lock_orphans().iter().any(|h| !h.is_finished())
+        self.lock_orphans().iter().any(|(_, h)| !h.is_finished())
+    }
+
+    /// Is the worker for `key` — its live slot **or** any orphan parked
+    /// under that key — still alive? A store-wiping path scoped to one
+    /// worker must gate on this (rather than the registry-wide
+    /// [`any_alive`](Self::any_alive)) so an unrelated worker that is
+    /// legitimately running does not block the wipe. [F2 FIX]
+    pub fn any_alive_for(&self, key: K) -> bool {
+        if let Some(slot) = self.lock_slots().get(&key) {
+            if slot_alive(slot) {
+                return true;
+            }
+        }
+        self.lock_orphans()
+            .iter()
+            .any(|(k, h)| *k == key && !h.is_finished())
     }
 
     /// Reap parked orphans with a short grace; survivors are re-parked and
@@ -526,7 +540,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         self.slots.lock().unwrap_or_else(|e| e.into_inner())
     }
 
-    fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<WorkerHandle>> {
+    fn lock_orphans(&self) -> std::sync::MutexGuard<'_, Vec<(K, WorkerHandle)>> {
         self.orphans.lock().unwrap_or_else(|e| e.into_inner())
     }
 
@@ -577,7 +591,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                              backstop after cancellation; parking it as an orphan \
                              for teardown to join rather than detaching it"
                         );
-                        self.lock_orphans().push(WorkerHandle::OsThread(h));
+                        self.lock_orphans().push((key, WorkerHandle::OsThread(h)));
                         return;
                     }
                     std::thread::sleep(Duration::from_millis(5));
@@ -588,7 +602,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             // finished task is a no-op).
             task => {
                 if !task.is_finished() {
-                    self.lock_orphans().push(task);
+                    self.lock_orphans().push((key, task));
                 }
             }
         }
@@ -598,7 +612,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// status and the number of survivors re-parked for an idempotent
     /// retry.
     async fn reap_orphans_impl(&self, grace: Duration) -> (WorkerStatus, usize) {
-        let mut pending: Vec<WorkerHandle> = {
+        let mut pending: Vec<(K, WorkerHandle)> = {
             let mut guard = self.lock_orphans();
             std::mem::take(&mut *guard)
         };
@@ -612,14 +626,14 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         let mut non_clean: Option<WorkerStatus> = None;
         loop {
             let mut still_live = Vec::with_capacity(pending.len());
-            for handle in pending.drain(..) {
+            for (key, handle) in pending.drain(..) {
                 if handle.is_finished() {
                     let status = handle.classify();
                     if !status.is_clean() {
                         non_clean.get_or_insert(status);
                     }
                 } else {
-                    still_live.push(handle);
+                    still_live.push((key, handle));
                 }
             }
             pending = still_live;
@@ -636,16 +650,21 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         }
     }
 
-    /// Test-only seam: park a raw thread handle as an orphan. Used by
-    /// cross-crate regression tests (e.g. the wallet's F2 gate) that must
-    /// inject a wedged prior-generation thread without driving the full
-    /// restart-reap path.
+    /// Test-only seam: park a raw thread handle as an orphan under `key`.
+    /// Used by cross-crate regression tests (e.g. the wallet's F2 gate)
+    /// that must inject a wedged prior-generation thread without driving
+    /// the full restart-reap path.
     #[doc(hidden)]
-    pub fn park_orphan_for_test(&self, handle: std::thread::JoinHandle<()>) {
-        self.lock_orphans().push(WorkerHandle::OsThread(handle));
+    pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) {
+        self.lock_orphans().push((key, WorkerHandle::OsThread(handle)));
     }
 }
 
+/// `true` if a slot is running or holds an unfinished handle.
+fn slot_alive(slot: &SlotState) -> bool {
+    slot.cancel.is_some() || slot.handle.as_ref().is_some_and(|h| !h.is_finished())
+}
+
 /// Re-park guard for [`ThreadRegistry::quiesce`]. If the poll-join future
 /// is dropped before it finishes (e.g. an outer timeout fires), this moves
 /// the slot's still-live handle into the orphan list instead of letting it
@@ -666,7 +685,7 @@ impl<K: RegistryKey> Drop for Repark<'_, K> {
             .get_mut(&self.key)
             .and_then(|slot| slot.handle.take());
         if let Some(handle) = handle {
-            self.reg.lock_orphans().push(handle);
+            self.reg.lock_orphans().push((self.key, handle));
         }
     }
 }
@@ -885,7 +904,7 @@ mod tests {
         let wedged = std::thread::spawn(move || {
             let _ = release_rx.recv();
         });
-        reg.park_orphan_for_test(wedged);
+        reg.park_orphan_for_test("orphan", wedged);
 
         assert_eq!(
             reg.reap_orphans(Duration::from_millis(50)).await,
@@ -1027,7 +1046,7 @@ mod tests {
         let wedged = std::thread::spawn(move || {
             let _ = release_rx.recv();
         });
-        reg.park_orphan_for_test(wedged);
+        reg.park_orphan_for_test("orphan", wedged);
         assert!(reg.any_alive());
 
         assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok);
@@ -1039,6 +1058,41 @@ mod tests {
         assert!(!reg.any_alive());
     }
 
+    /// `any_alive_for(key)` is scoped: an orphan parked under one key does
+    /// not make a different key look alive (the F2 gate must not be
+    /// blocked by unrelated workers).
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn any_alive_for_is_key_scoped() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        let wedged = std::thread::spawn(move || {
+            let _ = release_rx.recv();
+        });
+        reg.park_orphan_for_test("shielded", wedged);
+
+        // A live, unrelated worker.
+        start_clean(&reg, "identity", WorkerConfig::default());
+
+        assert!(reg.any_alive(), "registry-wide liveness sees both");
+        assert!(reg.any_alive_for("shielded"), "shielded orphan is alive");
+        assert!(
+            !reg.any_alive_for("address"),
+            "an unrelated key with no slot/orphan is not alive"
+        );
+
+        // The running 'identity' worker is alive under its own key; that it
+        // does not leak into 'shielded' is checked after the reap below.
+        assert!(reg.any_alive_for("identity"), "running identity is alive");
+
+        release_tx.send(()).unwrap();
+        let _ = reg.reap_orphans(Duration::from_secs(2)).await;
+        assert!(
+            !reg.any_alive_for("shielded"),
+            "shielded clear once its orphan is reaped"
+        );
+        assert_eq!(reg.quiesce("identity").await, WorkerStatus::Ok);
+    }
+
     /// TC-010 — `shutdown()` panics with a documented message on a
     /// current-thread runtime (R4, variant B).
     #[test]

From d20aed0027929efc32c6a33ddff9461dc812b8e5 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:55:04 +0200
Subject: [PATCH 19/29] refactor(platform-wallet): migrate sync coordinators
 onto shared ThreadRegistry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the triplicated per-coordinator lifecycle machinery
(background_cancel/join/generation mutexes, the cloned coordinator_orphans
list, and the mod.rs free fns join_coordinator_thread / reap_prior_or_park
/ join_detached_orphans / panic_message) with the shared
dash-async ThreadRegistry. The manager holds one
Arc<ThreadRegistry<WalletWorker>>; each coordinator's start/stop/
is_running/quiesce now delegate to it under a fixed WalletWorker key, and
each exposes its quiescing-gate raise as a registry DrainHook. The
wallet-event adapter becomes a registry start_task worker (weight 10, so
it drains after the weight-0 coordinators it sinks).

Fixes the two confirmed bugs structurally:
- F1: shutdown() is now CoordinatorExitStatus::from_report(registry
  .shutdown()); each worker's join is bounded by its own join_budget
  inside the registry, where the live handle stays owned by the slot. A
  dropped/timed-out join can no longer detach a live thread — it re-parks
  to Timeout.
- F2: clear_shielded() gates the store wipe on
  registry.any_alive_for(ShieldedSync) (shielded-scoped, so the always-on
  event adapter and the other coordinators running normally do not block
  Clear), refusing while a prior-generation shielded thread is parked
  alive.

CoordinatorThreadStatus / CoordinatorExitStatus / all_clean() are
byte-stable (FFI destroy maps !all_clean -> ErrorShutdownIncomplete);
WorkerStatus maps onto them 1:1 via From. The three wall-clock
restart-reap regression tests and the relocated free-fn tests are deleted
(subsumed by the dash-async registry suite); F2 (clear_shielded) and R5
(from_report) gain wallet-level tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-dash-async/src/registry.rs        |  37 +-
 .../src/changeset/core_bridge.rs              | 122 ++-
 .../rs-platform-wallet/src/changeset/mod.rs   |   2 +-
 .../src/manager/identity_sync.rs              | 319 ++-----
 .../rs-platform-wallet/src/manager/mod.rs     | 898 ++++++------------
 .../src/manager/platform_address_sync.rs      | 304 +-----
 .../src/manager/shielded_sync.rs              | 297 +-----
 7 files changed, 537 insertions(+), 1442 deletions(-)

diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index d04d8cbbbc..802ca3598c 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -132,8 +132,7 @@ impl<K: RegistryKey> ShutdownReport<K> {
 /// let _hook: DrainHook =
 ///     Arc::new(move || { let r = Rc::clone(&rc); Box::pin(async move { let _ = &r; }) });
 /// ```
-pub type DrainHook =
-    Arc<dyn Fn() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send + Sync>;
+pub type DrainHook = Arc<dyn Fn() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send + Sync>;
 
 /// Default managed-join budget when a [`WorkerConfig`] does not override
 /// it. Pinned so an accidental change surfaces in tests.
@@ -518,7 +517,9 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             // quiesce() drives its own drain-hook -> cancel -> join, and
             // `join_all` polls them on one task so their drain hooks
             // interleave (equal-weight concurrency).
-            let drained = keys.into_iter().map(|key| async move { (key, self.quiesce(key).await) });
+            let drained = keys
+                .into_iter()
+                .map(|key| async move { (key, self.quiesce(key).await) });
             for (key, status) in futures::future::join_all(drained).await {
                 per_worker.insert(key, status);
             }
@@ -656,7 +657,8 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// the full restart-reap path.
     #[doc(hidden)]
     pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) {
-        self.lock_orphans().push((key, WorkerHandle::OsThread(handle)));
+        self.lock_orphans()
+            .push((key, WorkerHandle::OsThread(handle)));
     }
 }
 
@@ -741,9 +743,11 @@ mod tests {
         // budget can't fire here; the tiny outer timeout drops the quiesce
         // future mid-poll. A naive by-value-into-future impl would detach
         // the handle (orphans empty, any_alive false); the fix re-parks it.
-        let result =
-            tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await;
-        assert!(result.is_err(), "outer timeout must fire on the wedged worker");
+        let result = tokio::time::timeout(Duration::from_millis(100), reg.quiesce("alpha")).await;
+        assert!(
+            result.is_err(),
+            "outer timeout must fire on the wedged worker"
+        );
 
         assert!(reg.any_alive(), "re-parked handle keeps any_alive true");
         assert!(!reg.is_running("alpha"), "slot cleared (cancel taken)");
@@ -1006,7 +1010,12 @@ mod tests {
             "a",
             WorkerConfig {
                 weight: ShutdownWeight(0),
-                drain: Some(mk_hook("a_arrived", "a_passed", Arc::clone(&log), Arc::clone(&barrier))),
+                drain: Some(mk_hook(
+                    "a_arrived",
+                    "a_passed",
+                    Arc::clone(&log),
+                    Arc::clone(&barrier),
+                )),
                 ..WorkerConfig::default()
             },
         );
@@ -1015,7 +1024,12 @@ mod tests {
             "b",
             WorkerConfig {
                 weight: ShutdownWeight(0),
-                drain: Some(mk_hook("b_arrived", "b_passed", Arc::clone(&log), Arc::clone(&barrier))),
+                drain: Some(mk_hook(
+                    "b_arrived",
+                    "b_passed",
+                    Arc::clone(&log),
+                    Arc::clone(&barrier),
+                )),
                 ..WorkerConfig::default()
             },
         );
@@ -1050,7 +1064,10 @@ mod tests {
         assert!(reg.any_alive());
 
         assert_eq!(reg.quiesce("alpha").await, WorkerStatus::Ok);
-        assert!(reg.any_alive(), "orphan still contributes after slot drains");
+        assert!(
+            reg.any_alive(),
+            "orphan still contributes after slot drains"
+        );
         assert!(!reg.is_running("alpha"));
 
         release_tx.send(()).unwrap();
diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
index 46945667ef..9e22d9e6f2 100644
--- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs
+++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
@@ -19,10 +19,11 @@
 //!
 //! # Lifetime
 //!
-//! [`spawn_wallet_event_adapter`] returns a [`JoinHandle`]. The caller
-//! (typically `PlatformWalletManager`) keeps the handle for the
-//! manager's lifetime; on shutdown, fire the [`CancellationToken`] to
-//! make the task exit cleanly.
+//! [`wallet_event_adapter_loop`] is the task body. The caller (typically
+//! `PlatformWalletManager`) registers it on the shared `ThreadRegistry`
+//! via `start_task`, which owns its [`JoinHandle`](tokio::task::JoinHandle) and cancellation; on
+//! shutdown the registry fires the [`CancellationToken`] to make the task
+//! exit cleanly and joins it.
 
 use std::sync::Arc;
 
@@ -34,87 +35,82 @@ use key_wallet::Utxo;
 use key_wallet_manager::{WalletEvent, WalletId, WalletManager};
 use tokio::sync::broadcast::error::RecvError;
 use tokio::sync::RwLock;
-use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 
 use crate::changeset::changeset::{CoreChangeSet, PlatformWalletChangeSet};
 use crate::changeset::traits::PlatformWalletPersistence;
 use crate::wallet::platform_wallet::PlatformWalletInfo;
 
-/// Spawn the wallet-event subscriber task.
+/// The wallet-event subscriber loop (the task body owned by the registry).
 ///
-/// Subscribes to `wallet_manager.subscribe_events()` from inside the
-/// spawned task (so the call-site doesn't need to be on a tokio
-/// runtime), then loops dispatching events to the persister via
-/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires
-/// or the upstream broadcast channel closes.
+/// Subscribes to `wallet_manager.subscribe_events()` from inside the task
+/// (so the call-site doesn't need to be on a tokio runtime), then loops
+/// dispatching events to the persister via
+/// [`PlatformWalletPersistence::store`]. Exits when `cancel` fires or the
+/// upstream broadcast channel closes.
 ///
-/// Generic over `P` so the spawned task gets static-dispatch on
-/// every `persister.store(...)` call. Pass the manager's own
-/// `Arc<P>` (not the `Arc<dyn PlatformWalletPersistence>`
-/// coercion) to actually realize the static-dispatch win.
-pub fn spawn_wallet_event_adapter<P>(
+/// Generic over `P` so the task gets static-dispatch on every
+/// `persister.store(...)` call. Pass the manager's own `Arc<P>` (not the
+/// `Arc<dyn PlatformWalletPersistence>` coercion) to realize that win.
+pub async fn wallet_event_adapter_loop<P>(
     wallet_manager: Arc<RwLock<WalletManager<PlatformWalletInfo>>>,
     persister: Arc<P>,
     cancel: CancellationToken,
-) -> JoinHandle<()>
-where
+) where
     P: PlatformWalletPersistence + 'static,
 {
-    tokio::spawn(async move {
-        let mut receiver = {
-            let guard = wallet_manager.read().await;
-            guard.subscribe_events()
-        };
-        tracing::debug!("wallet-event adapter task started");
+    let mut receiver = {
+        let guard = wallet_manager.read().await;
+        guard.subscribe_events()
+    };
+    tracing::debug!("wallet-event adapter task started");
 
-        loop {
-            tokio::select! {
-                recv = receiver.recv() => {
-                    match recv {
-                        Ok(event) => {
-                            let wallet_id = event.wallet_id();
-                            // For events that need to consult per-wallet
-                            // state (today only `TransactionInstantLocked`,
-                            // which checks finality before recording the IS
-                            // lock), grab a brief read lock on the manager.
-                            let core = build_core_changeset(&wallet_manager, &event).await;
-                            if core.is_empty_no_records() {
-                                // SyncHeightAdvanced for an unknown wallet,
-                                // empty BlockProcessed, etc. — nothing to
-                                // persist. Skip the round-trip.
-                                continue;
-                            }
-                            let cs = PlatformWalletChangeSet {
-                                core: Some(core),
-                                ..PlatformWalletChangeSet::default()
-                            };
-                            if let Err(e) = persister.store(wallet_id, cs) {
-                                tracing::warn!(
-                                    wallet_id = %hex::encode(wallet_id),
-                                    error = %e,
-                                    "Persister rejected core changeset; state will be re-emitted on next sync round"
-                                );
-                            }
-                        }
-                        Err(RecvError::Closed) if cancel.is_cancelled() => break,
-                        Err(RecvError::Closed) => {
-                            tracing::error!("WalletEvent broadcast closed unexpectedly");
-                            break;
+    loop {
+        tokio::select! {
+            recv = receiver.recv() => {
+                match recv {
+                    Ok(event) => {
+                        let wallet_id = event.wallet_id();
+                        // For events that need to consult per-wallet
+                        // state (today only `TransactionInstantLocked`,
+                        // which checks finality before recording the IS
+                        // lock), grab a brief read lock on the manager.
+                        let core = build_core_changeset(&wallet_manager, &event).await;
+                        if core.is_empty_no_records() {
+                            // SyncHeightAdvanced for an unknown wallet,
+                            // empty BlockProcessed, etc. — nothing to
+                            // persist. Skip the round-trip.
+                            continue;
                         }
-                        Err(RecvError::Lagged(n)) => {
+                        let cs = PlatformWalletChangeSet {
+                            core: Some(core),
+                            ..PlatformWalletChangeSet::default()
+                        };
+                        if let Err(e) = persister.store(wallet_id, cs) {
                             tracing::warn!(
-                                missed = n,
-                                "wallet-event adapter lagged on broadcast channel; some events were dropped"
+                                wallet_id = %hex::encode(wallet_id),
+                                error = %e,
+                                "Persister rejected core changeset; state will be re-emitted on next sync round"
                             );
                         }
                     }
+                    Err(RecvError::Closed) if cancel.is_cancelled() => break,
+                    Err(RecvError::Closed) => {
+                        tracing::error!("WalletEvent broadcast closed unexpectedly");
+                        break;
+                    }
+                    Err(RecvError::Lagged(n)) => {
+                        tracing::warn!(
+                            missed = n,
+                            "wallet-event adapter lagged on broadcast channel; some events were dropped"
+                        );
+                    }
                 }
-                _ = cancel.cancelled() => break,
             }
+            _ = cancel.cancelled() => break,
         }
-        tracing::debug!("wallet-event adapter task exiting");
-    })
+    }
+    tracing::debug!("wallet-event adapter task exiting");
 }
 
 /// Project an upstream [`WalletEvent`] into a [`CoreChangeSet`] suitable
diff --git a/packages/rs-platform-wallet/src/changeset/mod.rs b/packages/rs-platform-wallet/src/changeset/mod.rs
index dc76ddd39a..208c132e87 100644
--- a/packages/rs-platform-wallet/src/changeset/mod.rs
+++ b/packages/rs-platform-wallet/src/changeset/mod.rs
@@ -33,7 +33,7 @@ pub use changeset::{
 };
 pub use client_start_state::ClientStartState;
 pub use client_wallet_start_state::ClientWalletStartState;
-pub use core_bridge::spawn_wallet_event_adapter;
+pub use core_bridge::wallet_event_adapter_loop;
 pub use identity_manager_start_state::IdentityManagerStartState;
 pub use merge::Merge;
 pub use platform_address_sync_start_state::PlatformAddressSyncStartState;
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 40329bad74..8dfe83eede 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -49,16 +49,17 @@
 use std::collections::BTreeMap;
 use std::sync::{
     atomic::{AtomicBool, AtomicU64, Ordering},
-    Arc, Mutex as StdMutex,
+    Arc,
 };
 
-use dash_async::AtomicFlagGuard;
+use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
+use super::WalletWorker;
+
 use dpp::balances::credits::TokenAmount;
 use dpp::prelude::Identifier;
 use tokio::sync::RwLock;
-use tokio_util::sync::CancellationToken;
 
 use dash_sdk::platform::tokens::identity_token_balances::{
     IdentityTokenBalances, IdentityTokenBalancesQuery,
@@ -160,23 +161,11 @@ where
     /// over `P` so every `persister.store(...)` call on the hot sync
     /// loop dispatches statically.
     persister: Arc<P>,
-    /// Cancel token for the background loop, if running.
-    background_cancel: StdMutex<Option<CancellationToken>>,
-    /// Join handle for the background loop's OS thread, if running.
-    /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
-    /// confirm the `!Send` loop fully exited before the host drops the
-    /// runtime.
-    background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
-    /// Manager-owned orphans list (shared `Arc`). On a tight
-    /// `stop()`→`start()` where the prior thread is wedged past the 1 s
-    /// reap backstop, [`start`](Self::start) parks the still-live handle
-    /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
-    /// instead of dropping it, so manager `shutdown()` accounts for it.
-    coordinator_orphans: super::CoordinatorOrphans,
-    /// Monotonically increasing generation counter. Incremented each
-    /// time `start()` installs a new cancel token so the exiting
-    /// thread can tell whether its token is still current.
-    background_generation: AtomicU64,
+    /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
+    /// `quiesce` delegate to it under the [`WalletWorker::IdentitySync`]
+    /// key; it owns the loop's cancel token, OS-thread join handle, the
+    /// restart reap-or-park, and the orphan list.
+    registry: Arc<ThreadRegistry<WalletWorker>>,
     interval_secs: AtomicU64,
     is_syncing: AtomicBool,
     /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -215,15 +204,12 @@ where
     pub fn new(
         sdk: Arc<dash_sdk::Sdk>,
         persister: Arc<P>,
-        coordinator_orphans: super::CoordinatorOrphans,
+        registry: Arc<ThreadRegistry<WalletWorker>>,
     ) -> Self {
         Self {
             sdk,
             persister,
-            background_cancel: StdMutex::new(None),
-            background_join: StdMutex::new(None),
-            coordinator_orphans,
-            background_generation: AtomicU64::new(0),
+            registry,
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
             quiescing: AtomicBool::new(false),
@@ -339,10 +325,22 @@ where
 
     /// Whether the background loop is currently running.
     pub fn is_running(&self) -> bool {
-        self.background_cancel
-            .lock()
-            .map(|g| g.is_some())
-            .unwrap_or(false)
+        self.registry.is_running(WalletWorker::IdentitySync)
+    }
+
+    /// The drain barrier handed to the registry: raise the `quiescing`
+    /// gate so any pass past its `is_syncing` CAS bails. The registry then
+    /// cancels the loop and joins the thread (the join waits for the
+    /// in-flight pass to drop and `is_syncing` to clear), so the barrier
+    /// itself is instant and never blocks teardown.
+    fn drain_hook(self: &Arc<Self>) -> DrainHook {
+        let this = Arc::clone(self);
+        Arc::new(move || {
+            let this = Arc::clone(&this);
+            Box::pin(async move {
+                this.quiescing.store(true, Ordering::Release);
+            })
+        })
     }
 
     /// Whether a sync pass is in flight right now.
@@ -414,57 +412,32 @@ where
     /// The first pass runs immediately; subsequent passes fire every
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
-        let mut cancel_guard = self
-            .background_cancel
-            .lock()
-            .unwrap_or_else(|e| e.into_inner());
-        if cancel_guard.is_some() {
-            return;
-        }
-
-        // Take any handle left by a prior stop() call so we can reap it — but
-        // DON'T join it here, while we still hold background_cancel. stop()
-        // takes-and-cancels the token but never touches background_join, so a
-        // stop()→start() sequence would otherwise overwrite (detach) the old
-        // handle and shutdown() would miss that thread. Joining it under
-        // background_cancel would DEADLOCK the reap into its 1 s backstop: the
-        // exiting prior thread's epilogue also locks background_cancel (to
-        // clear its slot), so it would block on the lock we hold → never
-        // finish → get detached on the exact stop()→start() path the reap
-        // exists for. We install the new token + bump the generation below,
-        // release the lock, and only THEN reap (after this fn's tail).
-        let prior = self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take();
-
-        let cancel = CancellationToken::new();
-        *cancel_guard = Some(cancel.clone());
-        let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
+        // Reopen the quiescing gate so this (re)start's passes can run; a
+        // prior quiesce raised it via the drain hook.
+        self.quiescing.store(false, Ordering::Release);
+
+        let cfg = WorkerConfig {
+            weight: super::COORDINATOR_WEIGHT,
+            join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
+            drain: Some(self.drain_hook()),
+        };
 
+        // The loop drives `!Send` SDK futures via `Handle::block_on` on a
+        // dedicated OS thread (the registry spawns it). The handle is
+        // captured from this tokio context; the new thread is not itself a
+        // tokio worker. `biased` polls the cancel arm first, so a pass
+        // stalled on a hung SDK fetch is dropped at its `.await` the
+        // instant the registry cancels — clearing `is_syncing` promptly so
+        // the join lands inside the budget.
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
-        let join = std::thread::Builder::new()
-            .name("identity-sync".into())
-            .spawn(move || {
+        self.registry
+            .start_thread(WalletWorker::IdentitySync, cfg, move |cancel| {
                 handle.block_on(async move {
                     loop {
                         if cancel.is_cancelled() {
                             break;
                         }
-
-                        // Race the in-flight pass against cancellation.
-                        // `stop()` / `quiesce()` cancel the token; with
-                        // `biased` the cancel arm is polled first, so a
-                        // pass stalled on a hung SDK fetch is dropped at
-                        // its `.await` the instant we cancel. Dropping the
-                        // `sync_now` future unwinds to the `is_syncing`
-                        // `AtomicFlagGuard` it holds, clearing the flag
-                        // promptly — so `quiesce()`'s drain loop frees and
-                        // the join lands well inside `shutdown()`'s
-                        // timeout. A stalled pass can no longer strand a
-                        // live `!Send` thread past `shutdown()`.
                         tokio::select! {
                             biased;
                             _ = cancel.cancelled() => break,
@@ -477,47 +450,8 @@ where
                             _ = cancel.cancelled() => break,
                         }
                     }
-
-                    // Only clear the slot if no newer start() has
-                    // installed a replacement token since we launched.
-                    if let Ok(mut guard) = this.background_cancel.lock() {
-                        if this.background_generation.load(Ordering::Acquire) == my_gen {
-                            *guard = None;
-                        }
-                    }
                 });
-            })
-            .expect("failed to spawn identity-sync thread");
-        // Store the join handle while still holding cancel_guard — a
-        // concurrent quiesce() must wait for this lock before calling
-        // stop(), so the handle is always stored before it can be taken.
-        *self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner()) = Some(join);
-
-        // Release background_cancel BEFORE reaping the prior thread, so its
-        // epilogue can acquire the lock, observe the bumped generation, skip
-        // clearing our freshly-installed token, and return. Holding the lock
-        // across the join below is what would block the prior thread, spin
-        // the full 1 s deadline, and detach — the very stall this ordering
-        // removes.
-        drop(cancel_guard);
-
-        // Now reap the prior thread. It was already cancellation-signalled by
-        // stop(), and with the lock released its epilogue completes promptly,
-        // so is_finished() trips within a few milliseconds and the join is
-        // near-instant. The 1 s deadline survives only as a genuine-wedge
-        // backstop (e.g. a pass wedged in a Drop that never yields); if it
-        // fires `reap_prior_or_park` parks the still-live, already-cancelled
-        // thread in the manager orphans list so `shutdown()` joins it and
-        // reports it non-clean rather than dropping it (residual UAF).
-        super::reap_prior_or_park(
-            prior,
-            &self.coordinator_orphans,
-            std::time::Duration::from_secs(1),
-            "identity-sync",
-        );
+            });
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -529,14 +463,7 @@ where
     /// by manager shutdown so the host can free the persister context —
     /// use [`quiesce`](Self::quiesce).
     pub fn stop(&self) {
-        if let Some(token) = self
-            .background_cancel
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take()
-        {
-            token.cancel();
-        }
+        self.registry.cancel(WalletWorker::IdentitySync);
     }
 
     /// Cancel the background loop **and wait for any in-flight sync pass
@@ -564,24 +491,17 @@ where
     /// the `!Send` loop has stopped touching `tokio::time` before a
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
-        self.quiescing.store(true, Ordering::Release);
-        // RAII gate: resets `quiescing` on *every* exit path — a normal
-        // return, a timed-out `shutdown()` dropping this future, or a
-        // panic. Without it a quiesce that doesn't run to completion
-        // leaves the gate latched `true`, silently bailing every future
-        // pass. Reopening on drop is safe because `stop()` (below) has
-        // already cancelled the loop, so no new pass can start.
+        // RAII gate: reopen `quiescing` on *every* exit path — normal
+        // return, a dropped future, or a panic. The registry's drain hook
+        // raises it inside `registry.quiesce` below; without this reset a quiesce
+        // that doesn't complete would leave the gate latched and silently
+        // bail every future pass. Reopening is safe because the loop has
+        // been cancelled, so no new pass can start.
         let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.stop();
-        while self.is_syncing.load(Ordering::Acquire) {
-            tokio::time::sleep(Duration::from_millis(20)).await;
-        }
-        let handle = self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take();
-        super::join_coordinator_thread(handle).await
+        self.registry
+            .quiesce(WalletWorker::IdentitySync)
+            .await
+            .into()
     }
 
     /// Run one sync pass across every registered identity.
@@ -856,8 +776,8 @@ mod tests {
     fn make_manager() -> Arc<IdentitySyncManager<NoopPersister>> {
         let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
         let persister = Arc::new(NoopPersister);
-        let orphans = Arc::new(StdMutex::new(Vec::new()));
-        Arc::new(IdentitySyncManager::new(sdk, persister, orphans))
+        let registry = ThreadRegistry::new();
+        Arc::new(IdentitySyncManager::new(sdk, persister, registry))
     }
 
     fn make_recording_manager() -> (
@@ -866,12 +786,12 @@ mod tests {
     ) {
         let sdk = Arc::new(dash_sdk::SdkBuilder::new_mock().build().expect("mock sdk"));
         let persister = Arc::new(RecordingPersister::new());
-        let orphans = Arc::new(StdMutex::new(Vec::new()));
+        let registry = ThreadRegistry::new();
         (
             Arc::new(IdentitySyncManager::new(
                 sdk,
                 Arc::clone(&persister),
-                orphans,
+                registry,
             )),
             persister,
         )
@@ -993,123 +913,6 @@ mod tests {
         assert_eq!(mgr.interval(), Duration::from_secs(120));
     }
 
-    /// `quiesce()` must not return while a pass is in flight, and must
-    /// return promptly once the pass drains.
-    ///
-    /// Drives the real `is_syncing` lifecycle: a background task takes
-    /// the slot via the same `compare_exchange` the real `sync_now`
-    /// uses, holds it across a sleep (standing in for the pass body +
-    /// persister fan-out, which `sync_now` keeps the flag set across),
-    /// then clears it. We assert `quiesce()` is still pending while the
-    /// flag is held and completes after it falls — i.e. the falling edge
-    /// of `is_syncing` is what unblocks the barrier.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn quiesce_blocks_until_in_flight_pass_drains() {
-        let mgr = make_manager();
-
-        // Stand in for an in-flight `sync_now`: take the `is_syncing`
-        // slot exactly as the real pass does, hold it, then release.
-        let holder = Arc::clone(&mgr);
-        let pass = tokio::spawn(async move {
-            assert!(
-                holder
-                    .is_syncing
-                    .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
-                    .is_ok(),
-                "test should own the is_syncing slot"
-            );
-            tokio::time::sleep(Duration::from_millis(200)).await;
-            holder.is_syncing.store(false, Ordering::Release);
-        });
-
-        // Give the holder task a chance to take the slot before we
-        // start draining.
-        while !mgr.is_syncing() {
-            tokio::time::sleep(Duration::from_millis(5)).await;
-        }
-
-        let quiesce_fut = mgr.quiesce();
-        tokio::pin!(quiesce_fut);
-
-        // While the pass holds the flag, quiesce must stay pending.
-        tokio::select! {
-            _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"),
-            _ = tokio::time::sleep(Duration::from_millis(50)) => {}
-        }
-        assert!(mgr.is_syncing(), "pass should still be in flight");
-
-        // Once the pass drains, quiesce must return (well within a
-        // generous bound — it polls every 20ms).
-        tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut)
-            .await
-            .expect("quiesce did not return after the pass drained");
-
-        // The gate is reopened before quiesce returns.
-        assert!(!mgr.quiescing.load(Ordering::Acquire));
-        assert!(!mgr.is_syncing());
-        pass.await.unwrap();
-    }
-
-    /// Regression: a tight `stop()` → `start()` must reap the prior loop's
-    /// OS thread promptly, NOT stall on the 1 s detach backstop.
-    ///
-    /// The prior thread's exit epilogue locks `background_cancel` to
-    /// conditionally clear its slot. The earlier ordering held
-    /// `background_cancel` across the prior-handle join inside `start()`, so
-    /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
-    /// that lock, never finished, and the reap spin-waited the full second
-    /// before detaching — a 1 s stall plus a transient untracked thread. The
-    /// fix installs the new token + generation, releases `background_cancel`,
-    /// and only then reaps, so the prior thread's epilogue runs and the join
-    /// lands in milliseconds.
-    ///
-    /// `stop()` and `start()` run back-to-back in one blocking closure
-    /// (mirroring the real call site) so `start()` re-acquires the lock
-    /// microseconds after `stop()` frees it — before the async-woken prior
-    /// thread can reach its epilogue. Against the old lock-held ordering this
-    /// reliably stalls ~1 s and fails the bound below.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
-    async fn restart_after_stop_reaps_prior_thread() {
-        let mgr = make_manager();
-
-        // Launch the first loop and let its immediate (no-op, nothing
-        // registered) pass complete so the thread parks in the interval
-        // sleep, where cancellation lands cleanly.
-        Arc::clone(&mgr).start();
-        assert!(mgr.is_running());
-        tokio::time::sleep(Duration::from_millis(50)).await;
-
-        // Back-to-back cancel-only stop + restart, off the runtime so the
-        // synchronous reap can't starve a worker. `start()` re-grabs
-        // background_cancel right after `stop()` frees it.
-        let restart = Arc::clone(&mgr);
-        let elapsed = tokio::task::spawn_blocking(move || {
-            restart.stop();
-            let started = std::time::Instant::now();
-            Arc::clone(&restart).start();
-            started.elapsed()
-        })
-        .await
-        .unwrap();
-
-        assert!(
-            elapsed < Duration::from_millis(500),
-            "stop()→start() stalled for {elapsed:?}: prior thread was not \
-             reaped promptly (background_cancel held across the join?)"
-        );
-        assert!(mgr.is_running(), "restart must leave the new loop tracked");
-
-        // Wind the new loop down so the test leaves no live !Send thread.
-        let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
-            .await
-            .expect("cleanup quiesce did not complete within 2s after restart");
-        assert!(
-            status.is_clean(),
-            "cleanup quiesce ended non-cleanly: {status:?}"
-        );
-        assert!(!mgr.is_running());
-    }
-
     /// A `sync_now()` invoked while `quiescing` is set must bail without
     /// running the pass — in particular, without calling
     /// `persister.store(...)`. This is the gate that prevents a pass
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 7e9690d066..d03dcccf7b 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -10,13 +10,12 @@ mod wallet_lifecycle;
 
 use std::sync::Arc;
 
+use dash_async::{ShutdownReport, ShutdownWeight, ThreadRegistry, WorkerConfig};
 use tokio::sync::{Notify, RwLock};
-use tokio::task::JoinHandle;
-use tokio_util::sync::CancellationToken;
 
 use key_wallet_manager::WalletManager;
 
-use crate::changeset::{spawn_wallet_event_adapter, PlatformWalletPersistence};
+use crate::changeset::{wallet_event_adapter_loop, PlatformWalletPersistence};
 use crate::events::{PlatformEventHandler, PlatformEventManager};
 use crate::manager::identity_sync::IdentitySyncManager;
 use crate::manager::platform_address_sync::PlatformAddressSyncManager;
@@ -28,21 +27,29 @@ use crate::wallet::core::BalanceUpdateHandler;
 use crate::wallet::platform_wallet::{PlatformWalletInfo, WalletId};
 use crate::wallet::PlatformWallet;
 
-/// Shared list of coordinator OS threads that a tight `stop()`→`start()`
-/// reap had to detach past its 1 s wedge-backstop.
-///
-/// A coordinator's `start()` reap normally joins the prior thread within
-/// a few milliseconds. If that thread is genuinely wedged in a
-/// non-yielding `Drop` (vanishingly rare — the loop exits via a
-/// cancellable `select!`), [`reap_prior_or_park`] parks its still-live
-/// `JoinHandle` here instead of dropping it. The manager owns this list
-/// and shares a clone (`Arc`) with every coordinator, so
-/// [`PlatformWalletManager::shutdown`] can join everything parked here
-/// within its timeout and report
-/// [`CoordinatorThreadStatus::Detached`] if any thread is still alive —
-/// telling the host NOT to free a callback context the thread may still
-/// touch (closing a residual use-after-free window).
-pub(crate) type CoordinatorOrphans = Arc<std::sync::Mutex<Vec<std::thread::JoinHandle<()>>>>;
+/// Identity of a background worker on the manager's shared
+/// [`ThreadRegistry`]. The three periodic sync coordinators run as
+/// OS-thread workers (their SDK futures are `!Send`); the wallet-event
+/// adapter runs as a tokio task. Drained in ascending weight order on
+/// [`shutdown`](PlatformWalletManager::shutdown): the coordinators
+/// (weight 0) first, then the event adapter (weight 10) they store into.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
+pub enum WalletWorker {
+    /// Platform-address (BLAST) balance sync loop.
+    PlatformAddressSync,
+    /// Per-identity token-state sync loop.
+    IdentitySync,
+    /// Shielded (Orchard) note sync loop.
+    ShieldedSync,
+    /// Wallet-event adapter task (sinks coordinator stores).
+    EventAdapter,
+}
+
+/// Teardown weight of the periodic sync coordinators — drained first.
+pub(crate) const COORDINATOR_WEIGHT: ShutdownWeight = ShutdownWeight(0);
+/// Teardown weight of the wallet-event adapter — drained after the
+/// coordinators that feed it.
+pub(crate) const EVENT_ADAPTER_WEIGHT: ShutdownWeight = ShutdownWeight(10);
 
 /// Multi-wallet coordinator with SPV sync and event handling.
 ///
@@ -98,16 +105,12 @@ pub struct PlatformWalletManager<P: PlatformWalletPersistence + 'static> {
     #[cfg(feature = "shielded")]
     pub(super) event_manager: Arc<PlatformEventManager>,
     pub(super) persister: Arc<P>,
-    /// Cancellation token + join handle for the wallet-event adapter
-    /// task. Held so [`shutdown`] can stop it cleanly when the manager
-    /// is torn down.
-    pub(super) event_adapter_cancel: CancellationToken,
-    pub(super) event_adapter_join: tokio::sync::Mutex<Option<JoinHandle<()>>>,
-    /// Coordinator OS threads detached by a tight `stop()`→`start()`
-    /// reap (see [`CoordinatorOrphans`]). Shared (cloned `Arc`) with
-    /// every coordinator so their `start()` reaps can park a wedged
-    /// prior thread here, and drained/joined by [`shutdown`](Self::shutdown).
-    pub(super) coordinator_orphans: CoordinatorOrphans,
+    /// Shared worker-lifecycle engine. Owns every background worker's
+    /// cancellation token + join handle, the restart reap-or-park, and the
+    /// orphan list. The coordinators hold a clone and register their loops
+    /// on it; the event adapter runs here as a tokio task. [`shutdown`](Self::shutdown)
+    /// drains it in weight order and joins every worker before returning.
+    pub(super) registry: Arc<ThreadRegistry<WalletWorker>>,
 }
 
 /// How one background coordinator thread terminated.
@@ -161,6 +164,25 @@ impl CoordinatorThreadStatus {
     }
 }
 
+/// Convert a registry [`WorkerStatus`](dash_async::WorkerStatus) into the
+/// FFI-stable `CoordinatorThreadStatus`. The variant set and payloads are
+/// identical by construction, so this is a lossless 1:1 mapping — the
+/// FFI `destroy` / shielded-stop adapters keep reading the same shape.
+impl From<dash_async::WorkerStatus> for CoordinatorThreadStatus {
+    fn from(status: dash_async::WorkerStatus) -> Self {
+        use dash_async::WorkerStatus as W;
+        match status {
+            W::Ok => Self::Ok,
+            W::Stopped(reason) => Self::Stopped(reason),
+            W::Panicked(msg) => Self::Panicked(msg),
+            W::Timeout => Self::Timeout,
+            W::Detached => Self::Detached,
+            W::NotRunning => Self::NotRunning,
+            W::Error(msg) => Self::Error(msg),
+        }
+    }
+}
+
 /// Per-thread terminal status of every background worker, returned by
 /// [`PlatformWalletManager::shutdown`].
 ///
@@ -211,196 +233,36 @@ impl CoordinatorExitStatus {
             && self.event_adapter.is_clean()
             && self.detached_threads.is_clean()
     }
-}
 
-/// Join a coordinator's background OS thread and classify how it ended.
-///
-/// Called from each coordinator's `quiesce()` after cancelling the
-/// loop and draining any in-flight pass, so the thread is already on
-/// its way out and the join is near-instant. Joining while the runtime
-/// is still alive guarantees the `!Send` loop has stopped touching
-/// `tokio::time` before the host drops the runtime.
-///
-/// **Polling approach**: we poll [`JoinHandle::is_finished`] in 5 ms
-/// steps rather than wrapping `handle.join()` in
-/// [`spawn_blocking`](tokio::task::spawn_blocking). The
-/// `spawn_blocking` approach spawns a blocking-pool task that cannot be
-/// cancelled once started — so dropping the timeout future that wraps
-/// `quiesce()` would leave the blocking task alive and `handle.join()`
-/// still running, defeating the timeout boundary. Polling lets the
-/// executor yield at each `.await` step so `tokio::time::timeout`
-/// wrapping `quiesce()` can truly interrupt this call.
-///
-/// **Requires a multi-thread runtime.** Each coordinator's OS thread
-/// drives its loop via [`Handle::block_on`](tokio::runtime::Handle::block_on)
-/// and needs the runtime's timer/IO driver; a `current_thread` runtime
-/// can only service one `block_on` at a time, so joining one coordinator
-/// while the others (and `shutdown()` itself) are mid-`block_on` would
-/// deadlock. `shutdown()` asserts the multi-thread flavor up front.
-pub(crate) async fn join_coordinator_thread(
-    handle: Option<std::thread::JoinHandle<()>>,
-) -> CoordinatorThreadStatus {
-    let Some(handle) = handle else {
-        return CoordinatorThreadStatus::NotRunning;
-    };
-    // Poll until the thread exits. The coordinator was already cancelled
-    // (stop() fires before quiesce() calls us), so is_finished() becomes
-    // true nearly immediately — typically within a single 5 ms step.
-    loop {
-        if handle.is_finished() {
-            return match handle.join() {
-                Ok(()) => CoordinatorThreadStatus::Ok,
-                Err(payload) => CoordinatorThreadStatus::Panicked(panic_message(payload)),
-            };
-        }
-        // Yield to the executor so the outer tokio::time::timeout wrapping
-        // quiesce() can fire if the deadline has passed. Without this yield
-        // the loop would busy-spin and block the task.
-        tokio::time::sleep(std::time::Duration::from_millis(5)).await;
-    }
-}
-
-/// Best-effort extraction of a panic message from a joined thread/task
-/// payload (`&str` and `String` are the common cases).
-fn panic_message(payload: Box<dyn std::any::Any + Send>) -> String {
-    if let Some(s) = payload.downcast_ref::<&str>() {
-        (*s).to_string()
-    } else if let Some(s) = payload.downcast_ref::<String>() {
-        s.clone()
-    } else {
-        "<non-string panic>".to_string()
-    }
-}
-
-/// Reap a coordinator's prior OS thread after a `stop()`→`start()`
-/// reschedule — or park it for [`PlatformWalletManager::shutdown`] if it
-/// is genuinely wedged.
-///
-/// Shared by all three coordinators' `start()` (identity / platform-
-/// address / shielded), called at the tail of `start()` *after* the
-/// `background_cancel` lock has been released, so the exiting prior
-/// thread's epilogue (which also takes that lock) can complete and the
-/// join lands in milliseconds.
-///
-/// `prior` was cancellation-signalled by the preceding `stop()`, so its
-/// `select!` loop exits and the thread finishes almost immediately. The
-/// `backstop` deadline fires only if the thread is wedged in a
-/// non-yielding `Drop` that never observes the cancellation (vanishingly
-/// rare). On that wedge we must NOT silently drop the still-live handle:
-/// the thread still holds an `Arc` to the host event handler and could
-/// fire a callback, so a later `destroy` that freed the host context
-/// would hit a use-after-free. Instead we park the handle in `orphans`
-/// so `shutdown()` joins it within its own timeout and reports
-/// [`CoordinatorThreadStatus::Detached`] if it is still alive — keeping
-/// [`CoordinatorExitStatus::all_clean`] honest.
-pub(crate) fn reap_prior_or_park(
-    prior: Option<std::thread::JoinHandle<()>>,
-    orphans: &CoordinatorOrphans,
-    backstop: std::time::Duration,
-    coordinator: &str,
-) {
-    let Some(handle) = prior else {
-        return;
-    };
-    let deadline = std::time::Instant::now() + backstop;
-    loop {
-        if handle.is_finished() {
-            // Near-instant since finished; reaps the thread's resources.
-            let _ = handle.join();
-            return;
-        }
-        if std::time::Instant::now() >= deadline {
-            tracing::warn!(
-                coordinator,
-                ?backstop,
-                "prior sync thread did not finish within the backstop after \
-                 cancellation; parking it in the manager orphans list for \
-                 shutdown() to join rather than detaching it"
-            );
-            // Park the still-live (but already-cancelled) handle so a
-            // later shutdown() can join it and report it non-clean,
-            // instead of dropping it and leaving a UAF window where the
-            // host frees a callback context the thread may still touch.
-            orphans
-                .lock()
-                .unwrap_or_else(|e| e.into_inner())
-                .push(handle);
-            return;
-        }
-        std::thread::sleep(std::time::Duration::from_millis(5));
-    }
-}
-
-/// Drain the manager's [`CoordinatorOrphans`] list and classify how the
-/// parked threads ended, polling until `deadline`.
-///
-/// Threads land in the list only when a tight `stop()`→`start()` reap had
-/// to detach a prior coordinator thread past its 1 s wedge-backstop (see
-/// [`reap_prior_or_park`]). They were parked rather than dropped so this
-/// final teardown can account for them: a still-live detached thread
-/// keeps an `Arc` to the host event handler and could fire one last
-/// callback, so the host must not free its context until every such
-/// thread has exited.
-///
-/// Polls [`JoinHandle::is_finished`](std::thread::JoinHandle::is_finished)
-/// in 5 ms steps, yielding at each `.await` so a wrapping
-/// `tokio::time::timeout` can still interrupt it (no uncancellable
-/// blocking join — `join()` is only ever called on an already-finished
-/// handle). Returns:
-/// - [`Ok`](CoordinatorThreadStatus::Ok) — the list was empty, or every
-///   parked thread joined cleanly;
-/// - [`Panicked`](CoordinatorThreadStatus::Panicked) — a parked thread
-///   had panicked (and none were left alive at the deadline);
-/// - [`Detached`](CoordinatorThreadStatus::Detached) — at least one
-///   parked thread was still alive at `deadline`. Any still-live handles
-///   are re-parked so a later (idempotent) `shutdown()` can retry.
-pub(crate) async fn join_detached_orphans(
-    orphans: &CoordinatorOrphans,
-    deadline: std::time::Instant,
-) -> CoordinatorThreadStatus {
-    // Take the whole list out under the lock; we re-park any survivors
-    // at the deadline, but never hold the lock across an `.await`.
-    let mut pending: Vec<std::thread::JoinHandle<()>> = {
-        let mut guard = orphans.lock().unwrap_or_else(|e| e.into_inner());
-        std::mem::take(&mut *guard)
-    };
-    if pending.is_empty() {
-        return CoordinatorThreadStatus::Ok;
-    }
-
-    let mut panicked: Option<String> = None;
-    loop {
-        // Reap every thread that has finished this pass; retain the rest.
-        let mut still_live = Vec::with_capacity(pending.len());
-        for handle in pending.drain(..) {
-            if handle.is_finished() {
-                if let Err(payload) = handle.join() {
-                    // Keep the first panic message; a live `Detached`
-                    // thread still takes precedence at the deadline below.
-                    panicked.get_or_insert_with(|| panic_message(payload));
-                }
+    /// Build the FFI-stable exit status from the registry's weight-ordered
+    /// [`ShutdownReport`]. A worker absent from the report never ran, so it
+    /// maps to [`NotRunning`](CoordinatorThreadStatus::NotRunning); a
+    /// non-zero orphan-survivor count surfaces as
+    /// [`Detached`](CoordinatorThreadStatus::Detached), keeping
+    /// [`all_clean`](Self::all_clean) honest for a still-live wedged thread.
+    pub(crate) fn from_report(report: ShutdownReport<WalletWorker>) -> Self {
+        let worker = |key: WalletWorker| -> CoordinatorThreadStatus {
+            report
+                .per_worker
+                .get(&key)
+                .cloned()
+                .map(CoordinatorThreadStatus::from)
+                .unwrap_or(CoordinatorThreadStatus::NotRunning)
+        };
+        Self {
+            platform_address_sync: worker(WalletWorker::PlatformAddressSync),
+            identity_sync: worker(WalletWorker::IdentitySync),
+            #[cfg(feature = "shielded")]
+            shielded_sync: Some(worker(WalletWorker::ShieldedSync)),
+            #[cfg(not(feature = "shielded"))]
+            shielded_sync: None,
+            event_adapter: worker(WalletWorker::EventAdapter),
+            detached_threads: if report.detached > 0 {
+                CoordinatorThreadStatus::Detached
             } else {
-                still_live.push(handle);
-            }
-        }
-        pending = still_live;
-
-        if pending.is_empty() {
-            return match panicked {
-                Some(msg) => CoordinatorThreadStatus::Panicked(msg),
-                None => CoordinatorThreadStatus::Ok,
-            };
-        }
-        if std::time::Instant::now() >= deadline {
-            // Re-park survivors so an idempotent re-`shutdown()` retries
-            // rather than losing track of a still-live thread.
-            orphans
-                .lock()
-                .unwrap_or_else(|e| e.into_inner())
-                .extend(pending);
-            return CoordinatorThreadStatus::Detached;
+                CoordinatorThreadStatus::Ok
+            },
         }
-        tokio::time::sleep(std::time::Duration::from_millis(5)).await;
     }
 }
 
@@ -449,14 +311,28 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         let wallets = Arc::new(RwLock::new(std::collections::BTreeMap::new()));
         let lock_notify = Arc::new(Notify::new());
 
-        // Spawn the wallet-event adapter that translates upstream
-        // `WalletEvent`s into `CoreChangeSet`s and forwards them to
-        // the persister.
-        let event_adapter_cancel = CancellationToken::new();
-        let event_adapter_join = spawn_wallet_event_adapter(
-            Arc::clone(&wallet_manager),
-            Arc::clone(&persister),
-            event_adapter_cancel.clone(),
+        // Shared worker-lifecycle engine. The 1 s reap backstop (separate
+        // from the 30 s managed-join budget) is the grace a wedged prior
+        // thread gets before it is reported `Detached`.
+        let registry = ThreadRegistry::with_reap_backstop(std::time::Duration::from_secs(
+            SHUTDOWN_ORPHAN_GRACE_SECS,
+        ));
+
+        // Register the wallet-event adapter as a tokio task on the
+        // registry. It sinks the coordinators' stores, so it drains AFTER
+        // them (weight 10 vs the coordinators' 0).
+        let adapter_wallet_manager = Arc::clone(&wallet_manager);
+        let adapter_persister = Arc::clone(&persister);
+        registry.start_task(
+            WalletWorker::EventAdapter,
+            WorkerConfig {
+                weight: EVENT_ADAPTER_WEIGHT,
+                join_budget: std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
+                drain: None,
+            },
+            move |cancel| {
+                wallet_event_adapter_loop(adapter_wallet_manager, adapter_persister, cancel)
+            },
         );
 
         // Build handler list: app handler + internal handlers.
@@ -473,13 +349,6 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
             balance_handler,
         ]));
 
-        // Shared orphans list: a coordinator's `start()` reap parks here
-        // any prior thread it had to detach past its 1 s wedge-backstop,
-        // and `shutdown()` joins them. Every coordinator gets a clone of
-        // this same `Arc` so they all park into the one list the manager
-        // drains.
-        let coordinator_orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
-
         let spv = Arc::new(SpvRuntime::new(
             Arc::clone(&wallet_manager),
             Arc::clone(&event_manager),
@@ -487,12 +356,12 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         let platform_address_sync = Arc::new(PlatformAddressSyncManager::new(
             Arc::clone(&wallets),
             Arc::clone(&event_manager),
-            Arc::clone(&coordinator_orphans),
+            Arc::clone(&registry),
         ));
         let identity_sync = Arc::new(IdentitySyncManager::new(
             Arc::clone(&sdk),
             Arc::clone(&persister),
-            Arc::clone(&coordinator_orphans),
+            Arc::clone(&registry),
         ));
         #[cfg(feature = "shielded")]
         let shielded_coordinator: Arc<
@@ -502,7 +371,7 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         let shielded_sync = Arc::new(ShieldedSyncManager::new(
             Arc::clone(&event_manager),
             Arc::clone(&shielded_coordinator),
-            Arc::clone(&coordinator_orphans),
+            Arc::clone(&registry),
         ));
         Self {
             sdk,
@@ -519,9 +388,7 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
             #[cfg(feature = "shielded")]
             event_manager,
             persister,
-            event_adapter_cancel,
-            event_adapter_join: tokio::sync::Mutex::new(Some(event_adapter_join)),
-            coordinator_orphans,
+            registry,
         }
     }
 
@@ -642,194 +509,79 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     /// - the coordinator's store reset itself fails.
     #[cfg(feature = "shielded")]
     pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
-        // Bound the quiesce with the same backstop `shutdown()` uses so a
-        // stalled in-flight pass can't hang Clear forever — cancellation
-        // makes the drain prompt; this timeout only matters if a pass's
-        // drop wedges. Unlike `shutdown()`, the terminal status is
-        // load-bearing HERE: a non-clean drain means the in-flight pass may
-        // still be running and could re-persist notes into the very store
-        // the `clear()` below is about to wipe. A timeout (the future was
-        // dropped at the deadline) is treated as the non-clean `Timeout`
-        // status, matching `shutdown()`'s backstop substitution.
-        let status = match tokio::time::timeout(
-            std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
-            self.shielded_sync_manager.quiesce(),
-        )
-        .await
-        {
-            Ok(status) => status,
-            Err(_elapsed) => CoordinatorThreadStatus::Timeout,
-        };
+        // Quiesce the shielded loop: cancel it, drain any in-flight pass
+        // (incl. its persister fan-out), and join its OS thread. The
+        // registry bounds the join by the coordinator's own
+        // `SHUTDOWN_JOIN_TIMEOUT_SECS` budget — returning `Timeout` rather
+        // than hanging if a pass's drop wedges — so no outer timeout is
+        // needed here.
+        let status = self.shielded_sync_manager.quiesce().await;
+
         // Only commit the store wipe once the in-flight pass has fully
-        // drained. Otherwise refuse: a partial/timed-out drain could let a
-        // surviving pass write into a store we just cleared, desyncing the
-        // host's own wipe from a repopulated tree.
+        // drained. A partial/timed-out drain could let a surviving pass
+        // write into a store we just cleared, desyncing the host's own
+        // wipe from a repopulated tree.
         if !status.is_clean() {
             return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status });
         }
+        // [F2 FIX] Also refuse if a prior-generation shielded thread is
+        // still parked alive: it holds an `Arc` to the persister/store and
+        // could re-persist notes into the store we are about to wipe. The
+        // check is shielded-scoped, so the other coordinators / the
+        // always-on event adapter running normally do not block Clear.
+        if self.registry.any_alive_for(WalletWorker::ShieldedSync) {
+            return Err(
+                crate::error::PlatformWalletError::ShieldedShutdownIncomplete {
+                    status: CoordinatorThreadStatus::Detached,
+                },
+            );
+        }
         if let Some(coord) = self.shielded_coordinator().await {
             coord.clear().await?;
         }
         Ok(())
     }
 
-    /// Stop all background tasks and wait for them to exit.
+    /// Stop all background workers and wait for them to exit.
     ///
-    /// **Quiesces** the periodic coordinators
-    /// (`PlatformAddressSyncManager`, `IdentitySyncManager`,
-    /// `ShieldedSyncManager`) — cancelling each loop *and draining any
-    /// in-flight pass to completion*, including its persister /
-    /// host-callback fan-out — then drains the wallet-event adapter task.
-    /// Idempotent. Call before dropping the manager when a clean
-    /// shutdown is required (e.g. on app termination); a dirty drop
-    /// simply leaks the tasks until the runtime exits.
+    /// Delegates to the shared [`ThreadRegistry::shutdown`], which drains
+    /// in ascending weight order: the periodic coordinators (weight 0)
+    /// first — concurrently, since they share no lock — then the
+    /// wallet-event adapter (weight 10) that sinks their stores, then any
+    /// parked orphans. Each worker's drain raises its `quiescing` gate,
+    /// cancels the loop, and **joins** its OS thread / task, so when this
+    /// returns every `!Send` loop has fully exited. Idempotent.
     ///
     /// Ordering matters: cancel-only `stop()` would let a pass already
     /// inside `sync_now` keep running and call `persister.store(...)` /
-    /// fire a host completion callback after the FFI's `destroy`
-    /// returned and the host freed the persister / event-handler
-    /// context — a use-after-free. So we `quiesce()` the sync managers
-    /// FIRST (so no further persister store or host callback can start),
-    /// and only THEN cancel + join the event adapter, which is the sink
-    /// those stores feed into. The three coordinators are independent —
-    /// each `quiesce()` touches only its own state (its `quiescing` /
-    /// `is_syncing` atomics and its own `background_cancel` /
-    /// `background_join` mutexes) and joins its own OS thread, sharing no
-    /// lock — so they are drained *concurrently* via `tokio::join!`; only
-    /// the event-adapter teardown stays ordered strictly after them,
-    /// because it is the sink the coordinators store into.
+    /// fire a host completion callback after the FFI's `destroy` returned
+    /// and the host freed the persister / event-handler context — a
+    /// use-after-free. Quiescing the coordinators (weight 0) before the
+    /// event adapter (weight 10) closes that window: no store can start,
+    /// or still be in flight, once its sink is being torn down.
     ///
-    /// After each coordinator's `quiesce()` drains its in-flight pass,
-    /// this also **joins** the loop's OS thread, so when `shutdown()`
-    /// returns every `!Send` loop has fully exited. A host that drops the
-    /// tokio runtime right after `shutdown()` (one-shot / headless /
-    /// stdio) is therefore safe — no coordinator can still be polling
-    /// `tokio::time` on a shutting-down runtime. The returned
-    /// [`CoordinatorExitStatus`] reports per-thread how each worker ended.
+    /// A host that drops the tokio runtime right after `shutdown()`
+    /// (one-shot / headless / stdio) is therefore safe — no coordinator
+    /// can still be polling `tokio::time` on a shutting-down runtime. The
+    /// returned [`CoordinatorExitStatus`] reports per-worker how each ended.
     ///
     /// **Precondition: must be called from a multi-thread Tokio runtime.**
     /// Each coordinator's OS thread drives its loop via
-    /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs
-    /// the runtime's timer/IO driver; a `current_thread` runtime can only
-    /// service one `block_on` at a time, so the join would deadlock. This
-    /// is asserted in both debug and release builds.
-    ///
-    /// Each coordinator quiesce+join is bounded by its own
-    /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] backstop. Because the three drain
-    /// concurrently, the worst-case wait collapses to ~that single
-    /// backstop instead of the sum of all three. `quiesce()` cancels
-    /// the loop, which aborts any in-flight pass at its `.await` point, so
-    /// the `is_syncing` drain clears promptly and the join normally lands
-    /// far inside the window — the deadline fires only if a pass's *drop*
-    /// itself wedges. On timeout the coordinator slot reports
-    /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever.
+    /// [`Handle::block_on`](tokio::runtime::Handle::block_on) and needs the
+    /// runtime's timer/IO driver; a `current_thread` runtime can only
+    /// service one `block_on` at a time, so the join would deadlock.
+    /// [`ThreadRegistry::shutdown`] asserts this in both debug and release.
     ///
-    /// The clear-on-panic half of that guarantee rides on unwinding, so
-    /// it holds under `panic = "unwind"`. Under the iOS `panic = "abort"`
-    /// release profiles a pass panic aborts the process outright (no
-    /// `Drop`, no status) — there is no live manager left to read a
-    /// status from.
+    /// Each worker's join is bounded by its own
+    /// [`SHUTDOWN_JOIN_TIMEOUT_SECS`] budget; on timeout its handle is
+    /// re-parked and the slot reports
+    /// [`CoordinatorThreadStatus::Timeout`] rather than hanging forever
+    /// (the F1 fix — a dropped/timed-out join can never detach a live
+    /// thread). The clear-on-panic half of that guarantee rides on
+    /// unwinding, so it holds under `panic = "unwind"`; under the iOS
+    /// `panic = "abort"` profiles a pass panic aborts the process outright.
     pub async fn shutdown(&self) -> CoordinatorExitStatus {
-        assert!(
-            matches!(
-                tokio::runtime::Handle::current().runtime_flavor(),
-                tokio::runtime::RuntimeFlavor::MultiThread
-            ),
-            "shutdown() requires a multi-thread Tokio runtime: each \
-             coordinator's OS thread drives its sync loop via \
-             Handle::block_on and needs the runtime's timer/IO driver, but \
-             a current_thread runtime can only drive one block_on at a time"
-        );
-
-        let timeout = std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS);
-
-        // Drain the three independent periodic coordinators *concurrently*.
-        // Each quiesce() drains any in-flight pass AND joins its own OS
-        // thread, touching only that coordinator's own state (no shared
-        // lock), so racing them is sound and collapses the worst case from
-        // the sum of the three backstops to ~max(...). Each drain keeps its
-        // OWN inner `tokio::time::timeout`, so it still yields its own
-        // per-coordinator `CoordinatorThreadStatus` — a single outer timeout
-        // around the whole join! would flatten all three to `Timeout` and
-        // lose that detail.
-        let drain_platform_address = async {
-            tokio::time::timeout(timeout, self.platform_address_sync_manager.quiesce())
-                .await
-                .unwrap_or(CoordinatorThreadStatus::Timeout)
-        };
-        let drain_identity = async {
-            tokio::time::timeout(timeout, self.identity_sync_manager.quiesce())
-                .await
-                .unwrap_or(CoordinatorThreadStatus::Timeout)
-        };
-        #[cfg(feature = "shielded")]
-        let drain_shielded = async {
-            tokio::time::timeout(timeout, self.shielded_sync_manager.quiesce())
-                .await
-                .unwrap_or(CoordinatorThreadStatus::Timeout)
-        };
-
-        #[cfg(feature = "shielded")]
-        let (platform_address_sync, identity_sync, shielded_sync) = {
-            let (p, i, s) = tokio::join!(drain_platform_address, drain_identity, drain_shielded);
-            (p, i, Some(s))
-        };
-        #[cfg(not(feature = "shielded"))]
-        let (platform_address_sync, identity_sync, shielded_sync) = {
-            let (p, i) = tokio::join!(drain_platform_address, drain_identity);
-            (p, i, None)
-        };
-
-        // The event adapter is a tokio task (it sinks the coordinators'
-        // stores), so cancel + join it last — after the loops feeding it
-        // are gone.
-        self.event_adapter_cancel.cancel();
-        // Take the handle out into a local first so the `tokio::Mutex`
-        // guard doesn't stay held across the (up-to-30s) join `.await`
-        // below — a match scrutinee temporary would otherwise keep the
-        // guard alive for the whole match.
-        let event_adapter_handle = self.event_adapter_join.lock().await.take();
-        let event_adapter = match event_adapter_handle {
-            None => CoordinatorThreadStatus::NotRunning,
-            Some(handle) => match tokio::time::timeout(timeout, handle).await {
-                Ok(Ok(())) => CoordinatorThreadStatus::Ok,
-                // The returned status already carries this failure, and the
-                // FFI `destroy` adapter logs the aggregate once at the host
-                // layer — so don't double-log here.
-                Ok(Err(e)) => {
-                    if e.is_panic() {
-                        CoordinatorThreadStatus::Panicked(panic_message(e.into_panic()))
-                    } else {
-                        // Non-panic JoinError: task was cancelled or aborted —
-                        // not a clean exit, but also not a panic.
-                        CoordinatorThreadStatus::Stopped(Some(format!("{e}")))
-                    }
-                }
-                Err(_) => CoordinatorThreadStatus::Timeout,
-            },
-        };
-
-        // Finally, account for any coordinator threads an earlier tight
-        // stop()→start() reap had to detach past its 1 s wedge-backstop.
-        // They were parked in `coordinator_orphans` (not dropped) so we
-        // can join them here; a survivor at the grace deadline reports
-        // `Detached`, which keeps `all_clean()` false so the FFI `destroy`
-        // returns `ErrorShutdownIncomplete` rather than letting the host
-        // free a callback context the live thread may still touch. The
-        // grace poll yields, so it never blocks teardown uncancellably.
-        let detached_threads = join_detached_orphans(
-            &self.coordinator_orphans,
-            std::time::Instant::now() + std::time::Duration::from_secs(SHUTDOWN_ORPHAN_GRACE_SECS),
-        )
-        .await;
-
-        CoordinatorExitStatus {
-            platform_address_sync,
-            identity_sync,
-            shielded_sync,
-            event_adapter,
-            detached_threads,
-        }
+        CoordinatorExitStatus::from_report(self.registry.shutdown().await)
     }
 }
 
@@ -974,68 +726,6 @@ mod tests {
         assert!(status.all_clean());
     }
 
-    /// A coordinator thread that panics surfaces as `Panicked` rather
-    /// than being silently dropped.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn join_coordinator_thread_surfaces_panic() {
-        let handle = std::thread::spawn(|| panic!("boom in coordinator"));
-        match join_coordinator_thread(Some(handle)).await {
-            CoordinatorThreadStatus::Panicked(msg) => {
-                assert!(msg.contains("boom in coordinator"), "msg was {msg:?}");
-            }
-            other => panic!("expected Panicked, got {other:?}"),
-        }
-    }
-
-    /// A non-panic `JoinError` on the event adapter maps to `Stopped`, not
-    /// `Ok`, and is NOT considered clean. This covers the case where the
-    /// tokio task is cancelled or aborted rather than completing normally.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn event_adapter_non_panic_join_error_maps_to_stopped_and_is_not_clean() {
-        // Replace the real adapter handle with a guaranteed-pending task, then
-        // abort it. A `pending::<()>()` future can never complete on its own,
-        // so abort() always produces a non-panic JoinError — deterministically
-        // exercising the Stopped branch regardless of scheduler timing.
-        // (The original approach aborted the real adapter handle, which could
-        // race the task's own completion and silently yield `Ok` instead.)
-        let manager = make_manager();
-
-        // Drain and discard the real adapter (may already be finished).
-        let original = {
-            let mut guard = manager.event_adapter_join.lock().await;
-            guard.take()
-        };
-        if let Some(h) = original {
-            h.abort();
-            let _ = h.await;
-        }
-
-        // Install a permanently-pending task and abort it so the JoinError
-        // path in shutdown() is 100 % deterministic.
-        let pending = tokio::spawn(std::future::pending::<()>());
-        pending.abort();
-        *manager.event_adapter_join.lock().await = Some(pending);
-
-        let status = manager.shutdown().await;
-
-        // The aborted pending task always yields a non-panic JoinError →
-        // shutdown() maps it to Stopped.
-        assert!(
-            matches!(status.event_adapter, CoordinatorThreadStatus::Stopped(_)),
-            "expected Stopped from a non-panic JoinError, got {:?}",
-            status.event_adapter
-        );
-        assert!(
-            !status.event_adapter.is_clean(),
-            "Stopped must not count as clean"
-        );
-        // Coordinators were never started → their slots are clean.
-        assert_eq!(
-            status.platform_address_sync,
-            CoordinatorThreadStatus::NotRunning
-        );
-    }
-
     /// `Stopped` and `Timeout` are NOT clean; `Ok` and `NotRunning` ARE.
     /// Unit-tests the `is_clean` predicate directly so we don't need to
     /// trigger a real timeout (30s) in a deterministic test.
@@ -1097,36 +787,6 @@ mod tests {
         assert!(!with_detached.all_clean());
     }
 
-    /// A cleanly-returning thread joins as `Ok`; an absent handle is
-    /// `NotRunning`.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn join_coordinator_thread_ok_and_absent() {
-        let handle = std::thread::spawn(|| {});
-        assert_eq!(
-            join_coordinator_thread(Some(handle)).await,
-            CoordinatorThreadStatus::Ok
-        );
-        assert_eq!(
-            join_coordinator_thread(None).await,
-            CoordinatorThreadStatus::NotRunning
-        );
-    }
-
-    /// `join_coordinator_thread` uses `is_finished()` polling. Verify
-    /// it completes within a bounded time on a multi-thread runtime, as
-    /// `shutdown()` requires (and that it doesn't busy-spin indefinitely).
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn join_coordinator_thread_completes_within_deadline() {
-        let handle = std::thread::spawn(|| {});
-        let result = tokio::time::timeout(
-            Duration::from_secs(5),
-            join_coordinator_thread(Some(handle)),
-        )
-        .await
-        .expect("join_coordinator_thread must complete within 5 s");
-        assert_eq!(result, CoordinatorThreadStatus::Ok);
-    }
-
     /// `shutdown()` must wait for an in-flight sync pass to drain before
     /// joining the coordinator thread.
     ///
@@ -1185,8 +845,7 @@ mod tests {
     ///
     /// Uses `std::panic::catch_unwind` around `drop(runtime)` rather than
     /// a process-global panic hook; the hook would be live for seconds and
-    /// could swallow diagnostics from concurrently-running tests (e.g.
-    /// `join_coordinator_thread_surfaces_panic`).
+    /// could swallow diagnostics from other concurrently-running tests.
     #[test]
     fn shutdown_then_drop_runtime_does_not_panic() {
         static SHUTDOWN_PANICS: AtomicUsize = AtomicUsize::new(0);
@@ -1273,117 +932,26 @@ mod tests {
         (release_tx, handle)
     }
 
-    /// A prior coordinator thread that is still alive past the reap
-    /// backstop must be **parked in the orphans list**, not dropped —
-    /// otherwise `shutdown()` would never know it exists and could let the
-    /// host free a callback context the live thread still touches.
-    ///
-    /// Non-vacuous: if `reap_prior_or_park` dropped the wedged handle
-    /// (the old behavior) the list would stay empty and the length
-    /// assertion below would fail.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn reap_prior_or_park_parks_wedged_thread() {
-        let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
-        let (release_tx, wedged) = spawn_wedged_thread();
-
-        // `reap_prior_or_park` is synchronous and spins a std sleep until
-        // its backstop, so run it off the runtime workers. A short backstop
-        // (real `start()` uses 1 s) keeps the test fast.
-        let orphans_for_reap = Arc::clone(&orphans);
-        tokio::task::spawn_blocking(move || {
-            reap_prior_or_park(
-                Some(wedged),
-                &orphans_for_reap,
-                Duration::from_millis(100),
-                "test-coordinator",
-            );
-        })
-        .await
-        .unwrap();
-
-        assert_eq!(
-            orphans.lock().unwrap().len(),
-            1,
-            "a prior thread wedged past the backstop must be parked, not dropped"
-        );
-
-        // Cleanup: release + join the parked thread so none leaks.
-        release_tx.send(()).unwrap();
-        let parked = orphans.lock().unwrap().pop().unwrap();
-        tokio::task::spawn_blocking(move || {
-            let _ = parked.join();
-        })
-        .await
-        .unwrap();
-    }
-
-    /// `join_detached_orphans` classifies the parked threads: empty list →
-    /// `Ok`; a survivor at the deadline → `Detached` (re-parked for a later
-    /// retry); once the survivor exits, a fresh join reports `Ok` and
-    /// drains the list.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn join_detached_orphans_reports_detached_then_ok() {
-        let orphans: CoordinatorOrphans = Arc::new(std::sync::Mutex::new(Vec::new()));
-
-        // Nothing parked → clean.
-        assert_eq!(
-            join_detached_orphans(&orphans, std::time::Instant::now()).await,
-            CoordinatorThreadStatus::Ok
-        );
-
-        // Park a still-live thread; a short deadline elapses with it alive.
-        let (release_tx, wedged) = spawn_wedged_thread();
-        orphans.lock().unwrap().push(wedged);
-        let status = join_detached_orphans(
-            &orphans,
-            std::time::Instant::now() + Duration::from_millis(50),
-        )
-        .await;
-        assert_eq!(
-            status,
-            CoordinatorThreadStatus::Detached,
-            "a survivor at the deadline must report Detached"
-        );
-        assert_eq!(
-            orphans.lock().unwrap().len(),
-            1,
-            "a survivor must be re-parked so an idempotent re-shutdown retries"
-        );
-
-        // Release it; the next join reaps it cleanly and empties the list.
-        release_tx.send(()).unwrap();
-        let status = tokio::time::timeout(
-            Duration::from_secs(5),
-            join_detached_orphans(&orphans, std::time::Instant::now() + Duration::from_secs(5)),
-        )
-        .await
-        .expect("orphan join must complete once the thread is released");
-        assert_eq!(status, CoordinatorThreadStatus::Ok);
-        assert!(
-            orphans.lock().unwrap().is_empty(),
-            "a joined orphan must be drained from the list"
-        );
-    }
-
     /// Headline regression: a coordinator thread detached past the reap
     /// backstop and parked in the orphans list makes a subsequent
     /// `shutdown()` report the result as **non-clean** — so the FFI
     /// `destroy` returns `ErrorShutdownIncomplete` and the host delays
     /// freeing the callback context the still-live thread may touch.
     ///
-    /// Non-vacuous: if `join_detached_orphans` ignored the list (or the
-    /// orphan were dropped at reap instead of parked), `detached_threads`
-    /// would be `Ok` and `all_clean()` would be `true`, failing both
-    /// assertions.
+    /// Non-vacuous: if the registry dropped the orphan at reap instead of
+    /// parking it, `detached_threads` would be `Ok` and `all_clean()` would
+    /// be `true`, failing both assertions.
     #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
     async fn shutdown_reports_detached_orphan_as_non_clean() {
         let manager = make_manager();
 
         // Stand in for the genuine-wedge outcome: an earlier tight
-        // stop()→start() reap had to detach a still-live coordinator thread
-        // past its 1 s backstop, so `reap_prior_or_park` parked it here.
+        // stop()->start() reap had to detach a still-live coordinator thread
+        // past its backstop, so the registry parked it as an orphan.
         let (release_tx, wedged) = spawn_wedged_thread();
-        manager.coordinator_orphans.lock().unwrap().push(wedged);
+        manager
+            .registry
+            .park_orphan_for_test(WalletWorker::ShieldedSync, wedged);
 
         let status = tokio::time::timeout(Duration::from_secs(10), manager.shutdown())
             .await
@@ -1400,17 +968,133 @@ mod tests {
              still alive: {status:?}"
         );
 
-        // Cleanup: shutdown() re-parked the survivor; release + join it so
-        // no live thread leaks past the test. Pop into a local first so the
-        // std MutexGuard is not held across the await below.
+        // Cleanup: shutdown() re-parked the survivor; release it and reap so
+        // no live thread leaks past the test.
         release_tx.send(()).unwrap();
-        let parked = manager.coordinator_orphans.lock().unwrap().pop();
-        if let Some(parked) = parked {
-            tokio::task::spawn_blocking(move || {
-                let _ = parked.join();
-            })
+        let _ = manager.registry.reap_orphans(Duration::from_secs(5)).await;
+    }
+
+    /// TC-002 (F2): `clear_shielded` must refuse while a prior-generation
+    /// shielded thread is parked alive — even though the current shielded
+    /// quiesce is clean and the other coordinators / the always-on event
+    /// adapter are legitimately running. Releasing + reaping the orphan
+    /// lets a retry succeed.
+    ///
+    /// Non-vacuous: against the pre-fix gate (only `!status.is_clean()`),
+    /// the clean `NotRunning` quiesce would pass the guard and wipe the
+    /// store under the live orphan — `clear_shielded` would return `Ok`.
+    #[cfg(feature = "shielded")]
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn clear_shielded_refuses_while_shielded_orphan_alive() {
+        let manager = make_manager();
+
+        // Park a wedged thread under the ShieldedSync key: a prior-
+        // generation shielded thread an earlier reap could not join.
+        let (release_tx, wedged) = spawn_wedged_thread();
+        manager
+            .registry
+            .park_orphan_for_test(WalletWorker::ShieldedSync, wedged);
+
+        assert!(manager.registry.any_alive_for(WalletWorker::ShieldedSync));
+        assert!(!manager.shielded_sync_manager.is_running());
+
+        // Refuses: the live shielded orphan could re-persist into the store
+        // the wipe is about to clear.
+        let err = manager
+            .clear_shielded()
             .await
-            .unwrap();
-        }
+            .expect_err("clear_shielded must refuse while a shielded orphan is alive");
+        assert!(
+            matches!(
+                err,
+                crate::error::PlatformWalletError::ShieldedShutdownIncomplete { .. }
+            ),
+            "expected ShieldedShutdownIncomplete, got {err:?}"
+        );
+
+        // Release + reap the orphan; the shielded-scoped gate now clears and
+        // a retry succeeds (no shielded store configured → clear is a no-op).
+        release_tx.send(()).unwrap();
+        let _ = manager.registry.reap_orphans(Duration::from_secs(5)).await;
+        assert!(!manager.registry.any_alive_for(WalletWorker::ShieldedSync));
+        manager
+            .clear_shielded()
+            .await
+            .expect("clear_shielded must succeed once the orphan is reaped");
+    }
+
+    /// TC-015 (R5): `from_report` maps the registry's [`ShutdownReport`]
+    /// onto the FFI-stable `CoordinatorExitStatus` with identical field /
+    /// variant shape and `all_clean()` semantics. The full `WorkerStatus`
+    /// -> `CoordinatorThreadStatus` variant table is exercised.
+    #[test]
+    fn from_report_maps_to_ffi_stable_exit_status() {
+        use dash_async::WorkerStatus;
+        use std::collections::BTreeMap;
+
+        // All Ok, no orphans.
+        let per = BTreeMap::from([
+            (WalletWorker::PlatformAddressSync, WorkerStatus::Ok),
+            (WalletWorker::IdentitySync, WorkerStatus::Ok),
+            (WalletWorker::ShieldedSync, WorkerStatus::Ok),
+            (WalletWorker::EventAdapter, WorkerStatus::Ok),
+        ]);
+        let status = CoordinatorExitStatus::from_report(ShutdownReport {
+            per_worker: per,
+            detached: 0,
+        });
+        assert_eq!(status.platform_address_sync, CoordinatorThreadStatus::Ok);
+        assert_eq!(status.identity_sync, CoordinatorThreadStatus::Ok);
+        #[cfg(feature = "shielded")]
+        assert_eq!(status.shielded_sync, Some(CoordinatorThreadStatus::Ok));
+        #[cfg(not(feature = "shielded"))]
+        assert_eq!(status.shielded_sync, None);
+        assert_eq!(status.event_adapter, CoordinatorThreadStatus::Ok);
+        assert_eq!(status.detached_threads, CoordinatorThreadStatus::Ok);
+        assert!(status.all_clean());
+
+        // A surviving orphan -> Detached -> non-clean; absent workers ->
+        // NotRunning.
+        let status = CoordinatorExitStatus::from_report(ShutdownReport {
+            per_worker: BTreeMap::new(),
+            detached: 1,
+        });
+        assert_eq!(status.detached_threads, CoordinatorThreadStatus::Detached);
+        assert_eq!(
+            status.platform_address_sync,
+            CoordinatorThreadStatus::NotRunning
+        );
+        assert!(!status.all_clean());
+
+        // A per-worker Timeout propagates and is non-clean.
+        let per = BTreeMap::from([(WalletWorker::IdentitySync, WorkerStatus::Timeout)]);
+        let status = CoordinatorExitStatus::from_report(ShutdownReport {
+            per_worker: per,
+            detached: 0,
+        });
+        assert_eq!(status.identity_sync, CoordinatorThreadStatus::Timeout);
+        assert!(!status.all_clean());
+
+        // Full variant mapping table.
+        assert_eq!(
+            CoordinatorThreadStatus::from(WorkerStatus::Stopped(Some("x".into()))),
+            CoordinatorThreadStatus::Stopped(Some("x".into()))
+        );
+        assert_eq!(
+            CoordinatorThreadStatus::from(WorkerStatus::Panicked("p".into())),
+            CoordinatorThreadStatus::Panicked("p".into())
+        );
+        assert_eq!(
+            CoordinatorThreadStatus::from(WorkerStatus::Error("e".into())),
+            CoordinatorThreadStatus::Error("e".into())
+        );
+        assert_eq!(
+            CoordinatorThreadStatus::from(WorkerStatus::NotRunning),
+            CoordinatorThreadStatus::NotRunning
+        );
+        assert_eq!(
+            CoordinatorThreadStatus::from(WorkerStatus::Detached),
+            CoordinatorThreadStatus::Detached
+        );
     }
 }
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index 40457c4a87..e68fcfef7c 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -11,19 +11,19 @@
 use std::collections::BTreeMap;
 use std::sync::{
     atomic::{AtomicBool, AtomicU64, Ordering},
-    Arc, Mutex as StdMutex,
+    Arc,
 };
 
-use dash_async::AtomicFlagGuard;
+use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use arc_swap::ArcSwapOption;
 use dash_sdk::platform::address_sync::{AddressSyncConfig, AddressSyncResult};
 use key_wallet::PlatformP2PKHAddress;
 
+use super::WalletWorker;
 use crate::wallet::PlatformAddressTag;
 use tokio::sync::RwLock;
-use tokio_util::sync::CancellationToken;
 
 use crate::error::PlatformWalletError;
 use crate::events::PlatformEventManager;
@@ -97,27 +97,10 @@ impl PlatformAddressSyncSummary {
 pub struct PlatformAddressSyncManager {
     wallets: Arc<RwLock<BTreeMap<WalletId, Arc<PlatformWallet>>>>,
     event_manager: Arc<PlatformEventManager>,
-    /// Cancel token for the background loop, if running.
-    background_cancel: StdMutex<Option<CancellationToken>>,
-    /// Join handle for the background loop's OS thread, if running.
-    /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
-    /// confirm the `!Send` loop fully exited before the host drops the
-    /// runtime.
-    background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
-    /// Manager-owned orphans list (shared `Arc`). On a tight
-    /// `stop()`→`start()` where the prior thread is wedged past the 1 s
-    /// reap backstop, [`start`](Self::start) parks the still-live handle
-    /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
-    /// instead of dropping it, so manager `shutdown()` accounts for it.
-    coordinator_orphans: super::CoordinatorOrphans,
-    /// Monotonically increasing generation counter. Bumped on every
-    /// `start()` so the exiting thread can tell whether its generation is
-    /// still the active one before clearing `background_cancel`. Without
-    /// this guard a tight `stop()` → `start()` reschedule lets the prior
-    /// thread's cleanup strip the *new* generation's token, leaving the
-    /// new loop running but untrackable via `is_running()` / `stop()`.
-    /// Mirrors the identity / shielded coordinators.
-    background_generation: AtomicU64,
+    /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
+    /// `quiesce` delegate to it under the
+    /// [`WalletWorker::PlatformAddressSync`] key.
+    registry: Arc<ThreadRegistry<WalletWorker>>,
     interval_secs: AtomicU64,
     is_syncing: AtomicBool,
     /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -141,15 +124,12 @@ impl PlatformAddressSyncManager {
     pub fn new(
         wallets: Arc<RwLock<BTreeMap<WalletId, Arc<PlatformWallet>>>>,
         event_manager: Arc<PlatformEventManager>,
-        coordinator_orphans: super::CoordinatorOrphans,
+        registry: Arc<ThreadRegistry<WalletWorker>>,
     ) -> Self {
         Self {
             wallets,
             event_manager,
-            background_cancel: StdMutex::new(None),
-            background_join: StdMutex::new(None),
-            coordinator_orphans,
-            background_generation: AtomicU64::new(0),
+            registry,
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
             quiescing: AtomicBool::new(false),
@@ -185,10 +165,22 @@ impl PlatformAddressSyncManager {
 
     /// Whether the background loop is currently running.
     pub fn is_running(&self) -> bool {
-        self.background_cancel
-            .lock()
-            .map(|g| g.is_some())
-            .unwrap_or(false)
+        self.registry.is_running(WalletWorker::PlatformAddressSync)
+    }
+
+    /// Builds the drain barrier handed to the registry. The hook only raises
+    /// the `quiescing` gate, so any pass past its `is_syncing` CAS bails, and
+    /// returns at once. The registry is expected to then cancel the loop and
+    /// join its thread; that join waits for the in-flight pass (including its
+    /// completion-event dispatch) to drop and for `is_syncing` to clear.
+    fn drain_hook(self: &Arc<Self>) -> DrainHook {
+        let this = Arc::clone(self); // owned by the hook closure
+        Arc::new(move || {
+            let this = Arc::clone(&this); // per-call clone, moved into the future
+            Box::pin(async move {
+                this.quiescing.store(true, Ordering::Release);
+            })
+        })
     }
 
     /// Whether a sync pass is in flight right now.
@@ -220,60 +212,28 @@ impl PlatformAddressSyncManager {
     /// The first pass runs immediately; subsequent passes fire every
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
-        let mut cancel_guard = self
-            .background_cancel
-            .lock()
-            .unwrap_or_else(|e| e.into_inner());
-        if cancel_guard.is_some() {
-            return;
-        }
+        // Reopen the quiescing gate so this (re)start's passes can run.
+        self.quiescing.store(false, Ordering::Release);
 
-        // Take any handle left by a prior stop() call so we can reap it — but
-        // DON'T join it here, while we still hold background_cancel. stop()
-        // takes-and-cancels the token but never touches background_join, so a
-        // stop()→start() sequence would otherwise overwrite (detach) the old
-        // handle and shutdown() would miss that thread. Joining it under
-        // background_cancel would DEADLOCK the reap into its 1 s backstop: the
-        // exiting prior thread's epilogue also locks background_cancel (to
-        // clear its slot), so it would block on the lock we hold → never
-        // finish → get detached on the exact stop()→start() path the reap
-        // exists for. We install the new token + bump the generation below,
-        // release the lock, and only THEN reap (after this fn's tail).
-        let prior = self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take();
-
-        let cancel = CancellationToken::new();
-        *cancel_guard = Some(cancel.clone());
-        // Bump the generation while we still hold the slot lock so any
-        // prior thread's cleanup observes `current_gen != my_gen` ordered
-        // against this token swap.
-        let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
+        let cfg = WorkerConfig {
+            weight: super::COORDINATOR_WEIGHT,
+            join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
+            drain: Some(self.drain_hook()),
+        };
 
+        // The loop drives `!Send` SDK futures via `Handle::block_on` on a
+        // dedicated OS thread (spawned by the registry). `biased` polls the
+        // cancel arm first so a pass stalled on a hung SDK fetch is dropped
+        // at its `.await` the instant the registry cancels.
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
-        let join = std::thread::Builder::new()
-            .name("platform-address-sync".into())
-            .spawn(move || {
+        self.registry
+            .start_thread(WalletWorker::PlatformAddressSync, cfg, move |cancel| {
                 handle.block_on(async move {
                     loop {
                         if cancel.is_cancelled() {
                             break;
                         }
-
-                        // Race the in-flight pass against cancellation.
-                        // `stop()` / `quiesce()` cancel the token; with
-                        // `biased` the cancel arm is polled first, so a
-                        // pass stalled on a hung SDK fetch is dropped at
-                        // its `.await` the instant we cancel. Dropping the
-                        // `sync_now` future unwinds to the `is_syncing`
-                        // `AtomicFlagGuard` it holds, clearing the flag
-                        // promptly — so `quiesce()`'s drain loop frees and
-                        // the join lands well inside `shutdown()`'s
-                        // timeout. A stalled pass can no longer strand a
-                        // live `!Send` thread past `shutdown()`.
                         tokio::select! {
                             biased;
                             _ = cancel.cancelled() => break,
@@ -286,50 +246,8 @@ impl PlatformAddressSyncManager {
                             _ = cancel.cancelled() => break,
                         }
                     }
-
-                    // Only clear the slot if no newer start() has
-                    // installed a replacement token since we launched —
-                    // mirrors the identity / shielded coordinators so a
-                    // stop() → start() reschedule can't have this exiting
-                    // thread strip the new generation's cancel token.
-                    if let Ok(mut guard) = this.background_cancel.lock() {
-                        if this.background_generation.load(Ordering::Acquire) == my_gen {
-                            *guard = None;
-                        }
-                    }
                 });
-            })
-            .expect("failed to spawn platform-address-sync thread");
-        // Store the join handle while still holding cancel_guard — a
-        // concurrent quiesce() must wait for this lock before calling
-        // stop(), so the handle is always stored before it can be taken.
-        *self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner()) = Some(join);
-
-        // Release background_cancel BEFORE reaping the prior thread, so its
-        // epilogue can acquire the lock, observe the bumped generation, skip
-        // clearing our freshly-installed token, and return. Holding the lock
-        // across the join below is what would block the prior thread, spin
-        // the full 1 s deadline, and detach — the very stall this ordering
-        // removes.
-        drop(cancel_guard);
-
-        // Now reap the prior thread. It was already cancellation-signalled by
-        // stop(), and with the lock released its epilogue completes promptly,
-        // so is_finished() trips within a few milliseconds and the join is
-        // near-instant. The 1 s deadline survives only as a genuine-wedge
-        // backstop (e.g. a pass wedged in a Drop that never yields); if it
-        // fires `reap_prior_or_park` parks the still-live, already-cancelled
-        // thread in the manager orphans list so `shutdown()` joins it and
-        // reports it non-clean rather than dropping it (residual UAF).
-        super::reap_prior_or_park(
-            prior,
-            &self.coordinator_orphans,
-            std::time::Duration::from_secs(1),
-            "platform-address-sync",
-        );
+            });
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -342,14 +260,7 @@ impl PlatformAddressSyncManager {
     /// the host can free the event-handler context — use
     /// [`quiesce`](Self::quiesce).
     pub fn stop(&self) {
-        if let Some(token) = self
-            .background_cancel
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take()
-        {
-            token.cancel();
-        }
+        self.registry.cancel(WalletWorker::PlatformAddressSync);
     }
 
     /// Cancel the background loop **and wait for any in-flight sync pass
@@ -378,24 +289,14 @@ impl PlatformAddressSyncManager {
     /// the `!Send` loop has stopped touching `tokio::time` before a
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
-        self.quiescing.store(true, Ordering::Release);
-        // RAII gate: resets `quiescing` on *every* exit path — a normal
-        // return, a timed-out `shutdown()` dropping this future, or a
-        // panic. Without it a quiesce that doesn't run to completion
-        // leaves the gate latched `true`, silently bailing every future
-        // pass. Reopening on drop is safe because `stop()` (below) has
-        // already cancelled the loop, so no new pass can start.
+        // RAII gate: reopen `quiescing` on every exit path. The registry's drain hook raises it
+        // during `registry.quiesce` below; the loop is cancelled by then, so no new pass starts.
+        // NOTE(review): the old `is_syncing` wait is gone — confirm off-loop passes still drain.
         let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.stop();
-        while self.is_syncing.load(Ordering::Acquire) {
-            tokio::time::sleep(Duration::from_millis(20)).await;
-        }
-        let handle = self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take();
-        super::join_coordinator_thread(handle).await
+        self.registry
+            .quiesce(WalletWorker::PlatformAddressSync)
+            .await
+            .into()
     }
 
     /// Run one sync pass across every registered wallet.
@@ -543,12 +444,12 @@ mod tests {
         let event_manager = Arc::new(PlatformEventManager::new(vec![
             Arc::clone(&counter) as Arc<dyn PlatformEventHandler>
         ]));
-        let orphans = Arc::new(StdMutex::new(Vec::new()));
+        let registry = ThreadRegistry::new();
         (
             Arc::new(PlatformAddressSyncManager::new(
                 wallets,
                 event_manager,
-                orphans,
+                registry,
             )),
             counter,
         )
@@ -564,113 +465,6 @@ mod tests {
         assert!(!mgr.is_syncing());
     }
 
-    /// `quiesce()` must not return while a pass is in flight, and must
-    /// return promptly once the pass drains.
-    ///
-    /// Drives the real `is_syncing` lifecycle: a background task takes
-    /// the slot via the same `compare_exchange` the real `sync_now`
-    /// uses, holds it across a sleep (standing in for the pass body +
-    /// completion-event dispatch, which `sync_now` keeps the flag set
-    /// across), then clears it.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn quiesce_blocks_until_in_flight_pass_drains() {
-        let (mgr, _counter) = make_manager();
-
-        let holder = Arc::clone(&mgr);
-        let pass = tokio::spawn(async move {
-            assert!(
-                holder
-                    .is_syncing
-                    .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
-                    .is_ok(),
-                "test should own the is_syncing slot"
-            );
-            tokio::time::sleep(Duration::from_millis(200)).await;
-            holder.is_syncing.store(false, Ordering::Release);
-        });
-
-        while !mgr.is_syncing() {
-            tokio::time::sleep(Duration::from_millis(5)).await;
-        }
-
-        let quiesce_fut = mgr.quiesce();
-        tokio::pin!(quiesce_fut);
-
-        tokio::select! {
-            _ = &mut quiesce_fut => panic!("quiesce returned while a pass was in flight"),
-            _ = tokio::time::sleep(Duration::from_millis(50)) => {}
-        }
-        assert!(mgr.is_syncing(), "pass should still be in flight");
-
-        tokio::time::timeout(Duration::from_secs(2), &mut quiesce_fut)
-            .await
-            .expect("quiesce did not return after the pass drained");
-
-        assert!(!mgr.quiescing.load(Ordering::Acquire));
-        assert!(!mgr.is_syncing());
-        pass.await.unwrap();
-    }
-
-    /// Regression: a tight `stop()` → `start()` must reap the prior loop's
-    /// OS thread promptly, NOT stall on the 1 s detach backstop.
-    ///
-    /// The prior thread's exit epilogue locks `background_cancel` to
-    /// conditionally clear its slot. The earlier ordering held
-    /// `background_cancel` across the prior-handle join inside `start()`, so
-    /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
-    /// that lock, never finished, and the reap spin-waited the full second
-    /// before detaching — a 1 s stall plus a transient untracked thread. The
-    /// fix installs the new token + generation, releases `background_cancel`,
-    /// and only then reaps, so the prior thread's epilogue runs and the join
-    /// lands in milliseconds.
-    ///
-    /// `stop()` and `start()` run back-to-back in one blocking closure
-    /// (mirroring the real call site) so `start()` re-acquires the lock
-    /// microseconds after `stop()` frees it — before the async-woken prior
-    /// thread can reach its epilogue. Against the old lock-held ordering this
-    /// reliably stalls ~1 s and fails the bound below.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
-    async fn restart_after_stop_reaps_prior_thread() {
-        let (mgr, _counter) = make_manager();
-
-        // Launch the first loop and let its immediate (no-op, empty wallet
-        // map) pass complete so the thread parks in the interval sleep, where
-        // cancellation lands cleanly.
-        Arc::clone(&mgr).start();
-        assert!(mgr.is_running());
-        tokio::time::sleep(Duration::from_millis(50)).await;
-
-        // Back-to-back cancel-only stop + restart, off the runtime so the
-        // synchronous reap can't starve a worker. `start()` re-grabs
-        // background_cancel right after `stop()` frees it.
-        let restart = Arc::clone(&mgr);
-        let elapsed = tokio::task::spawn_blocking(move || {
-            restart.stop();
-            let started = std::time::Instant::now();
-            Arc::clone(&restart).start();
-            started.elapsed()
-        })
-        .await
-        .unwrap();
-
-        assert!(
-            elapsed < Duration::from_millis(500),
-            "stop()→start() stalled for {elapsed:?}: prior thread was not \
-             reaped promptly (background_cancel held across the join?)"
-        );
-        assert!(mgr.is_running(), "restart must leave the new loop tracked");
-
-        // Wind the new loop down so the test leaves no live !Send thread.
-        let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
-            .await
-            .expect("cleanup quiesce did not complete within 2s after restart");
-        assert!(
-            status.is_clean(),
-            "cleanup quiesce ended non-cleanly: {status:?}"
-        );
-        assert!(!mgr.is_running());
-    }
-
     /// A `sync_now()` invoked while `quiescing` is set must bail without
     /// running the pass — in particular, without firing the
     /// `on_platform_address_sync_completed` host callback. This is the
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 3c84bd7071..a930febdc7 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -28,15 +28,15 @@
 use std::collections::BTreeMap;
 use std::sync::{
     atomic::{AtomicBool, AtomicU64, Ordering},
-    Arc, Mutex as StdMutex,
+    Arc,
 };
 
-use dash_async::AtomicFlagGuard;
+use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use tokio::sync::RwLock;
-use tokio_util::sync::CancellationToken;
 
+use super::WalletWorker;
 use crate::events::PlatformEventManager;
 use crate::wallet::platform_wallet::WalletId;
 use crate::wallet::shielded::{NetworkShieldedCoordinator, ShieldedSyncSummary};
@@ -141,27 +141,10 @@ pub struct ShieldedSyncManager {
     /// run first, so an empty slot guarantees no shielded state
     /// exists).
     coordinator_slot: Arc<RwLock<Option<Arc<NetworkShieldedCoordinator>>>>,
-    /// Cancel token for the background loop, if running.
-    background_cancel: StdMutex<Option<CancellationToken>>,
-    /// Join handle for the background loop's OS thread, if running.
-    /// Taken and joined by [`quiesce`](Self::quiesce) so shutdown can
-    /// confirm the `!Send` loop fully exited before the host drops the
-    /// runtime.
-    background_join: StdMutex<Option<std::thread::JoinHandle<()>>>,
-    /// Manager-owned orphans list (shared `Arc`). On a tight
-    /// `stop()`→`start()` where the prior thread is wedged past the 1 s
-    /// reap backstop, [`start`](Self::start) parks the still-live handle
-    /// here (via [`reap_prior_or_park`](super::reap_prior_or_park))
-    /// instead of dropping it, so manager `shutdown()` accounts for it.
-    coordinator_orphans: super::CoordinatorOrphans,
-    /// Monotonically increasing generation counter. Bumped on every
-    /// `start()` so the exiting thread can tell whether its
-    /// generation is still the active one before clearing
-    /// `background_cancel`. Without this, a `stop()` → `start()`
-    /// overlap lets the prior thread's cleanup strip the new
-    /// generation's token, leaving the new loop running but
-    /// untrackable via `is_running()`.
-    background_generation: AtomicU64,
+    /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
+    /// `quiesce` delegate to it under the [`WalletWorker::ShieldedSync`]
+    /// key.
+    registry: Arc<ThreadRegistry<WalletWorker>>,
     interval_secs: AtomicU64,
     is_syncing: AtomicBool,
     /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
@@ -179,15 +162,12 @@ impl ShieldedSyncManager {
     pub fn new(
         event_manager: Arc<PlatformEventManager>,
         coordinator_slot: Arc<RwLock<Option<Arc<NetworkShieldedCoordinator>>>>,
-        coordinator_orphans: super::CoordinatorOrphans,
+        registry: Arc<ThreadRegistry<WalletWorker>>,
     ) -> Self {
         Self {
             event_manager,
             coordinator_slot,
-            background_cancel: StdMutex::new(None),
-            background_join: StdMutex::new(None),
-            coordinator_orphans,
-            background_generation: AtomicU64::new(0),
+            registry,
             interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
             is_syncing: AtomicBool::new(false),
             quiescing: AtomicBool::new(false),
@@ -210,10 +190,22 @@ impl ShieldedSyncManager {
 
     /// Whether the background loop is currently running.
     pub fn is_running(&self) -> bool {
-        self.background_cancel
-            .lock()
-            .map(|g| g.is_some())
-            .unwrap_or(false)
+        self.registry.is_running(WalletWorker::ShieldedSync)
+    }
+
+    /// Builds the drain barrier handed to the registry. The hook only raises
+    /// the `quiescing` gate, so any pass past its `is_syncing` CAS bails, and
+    /// returns at once. The registry is expected to then cancel the loop and
+    /// join its thread; that join waits for the in-flight pass (including its
+    /// persister fan-out) to drop and for `is_syncing` to clear.
+    fn drain_hook(self: &Arc<Self>) -> DrainHook {
+        let this = Arc::clone(self); // owned by the hook closure
+        Arc::new(move || {
+            let this = Arc::clone(&this); // per-call clone, moved into the future
+            Box::pin(async move {
+                this.quiescing.store(true, Ordering::Release);
+            })
+        })
     }
 
     /// Whether a sync pass is in flight right now.
@@ -238,67 +230,30 @@ impl ShieldedSyncManager {
     /// GRPC client state isn't `Send + Sync`). Same trade-off as
     /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start).
     pub fn start(self: Arc<Self>) {
-        let mut cancel_guard = self
-            .background_cancel
-            .lock()
-            .unwrap_or_else(|e| e.into_inner());
-        if cancel_guard.is_some() {
-            return;
-        }
+        // Reopen the quiescing gate so this (re)start's passes can run.
+        self.quiescing.store(false, Ordering::Release);
 
-        // Take any handle left by a prior stop() call so we can reap it — but
-        // DON'T join it here, while we still hold background_cancel. stop()
-        // takes-and-cancels the token but never touches background_join, so a
-        // stop()→start() sequence would otherwise overwrite (detach) the old
-        // handle and shutdown() would miss that thread. Joining it under
-        // background_cancel would DEADLOCK the reap into its 1 s backstop: the
-        // exiting prior thread's epilogue also locks background_cancel (to
-        // clear its slot), so it would block on the lock we hold → never
-        // finish → get detached on the exact stop()→start() path the reap
-        // exists for. We install the new token + bump the generation below,
-        // release the lock, and only THEN reap (after this fn's tail).
-        let prior = self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take();
-
-        let cancel = CancellationToken::new();
-        *cancel_guard = Some(cancel.clone());
-        // Bump the generation while we still hold the slot lock so
-        // the load below in any prior thread's cleanup observes
-        // `current_gen != my_gen` ordered against this token swap.
-        let my_gen = self.background_generation.fetch_add(1, Ordering::AcqRel) + 1;
+        let cfg = WorkerConfig {
+            weight: super::COORDINATOR_WEIGHT,
+            join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
+            drain: Some(self.drain_hook()),
+        };
 
+        // The loop drives `!Send` SDK futures via `Handle::block_on` on a
+        // dedicated OS thread (spawned by the registry). The background
+        // cadence passes `force=false` to honor the per-wallet caught-up
+        // cooldown; user-initiated syncs pass `force=true` via the FFI.
+        // `biased` polls the cancel arm first so a pass stalled on a hung
+        // SDK fetch is dropped the instant the registry cancels.
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
-        let join = std::thread::Builder::new()
-            .name("shielded-sync".into())
-            .spawn(move || {
+        self.registry
+            .start_thread(WalletWorker::ShieldedSync, cfg, move |cancel| {
                 handle.block_on(async move {
                     loop {
                         if cancel.is_cancelled() {
                             break;
                         }
-
-                        // Background-loop cadence — honor the
-                        // per-wallet caught-up cooldown so a
-                        // sleepy network doesn't refetch +
-                        // re-trial-decrypt the partial buffer
-                        // chunk every interval. User-initiated
-                        // syncs pass `force=true` to the FFI
-                        // entry point below and bypass this.
-                        //
-                        // Race the pass against cancellation. `stop()` /
-                        // `quiesce()` cancel the token; with `biased` the
-                        // cancel arm is polled first, so a pass stalled on
-                        // a hung SDK fetch is dropped at its `.await` the
-                        // instant we cancel. Dropping the `sync_now` future
-                        // unwinds to the `is_syncing` `AtomicFlagGuard` it
-                        // holds, clearing the flag promptly — so the drain
-                        // loop in `quiesce()` frees and the join lands well
-                        // inside `shutdown()`'s timeout. A stalled pass can
-                        // no longer strand a live `!Send` thread.
                         tokio::select! {
                             biased;
                             _ = cancel.cancelled() => break,
@@ -311,58 +266,8 @@ impl ShieldedSyncManager {
                             _ = cancel.cancelled() => break,
                         }
                     }
-
-                    // Only clear `background_cancel` if the active
-                    // generation is still ours. Acquire the lock FIRST,
-                    // then read/compare `background_generation` under it
-                    // (matching identity_sync / platform_address_sync).
-                    // Reading the generation BEFORE locking opens a
-                    // stale-read TOCTOU: this exiting thread could observe
-                    // a pre-bump generation, then block on the lock until a
-                    // concurrent `start()` released it, and null the
-                    // freshly-installed token — leaving the new loop
-                    // running but unreflectable via `is_running()` /
-                    // `stop()`. `start()` bumps the generation while it
-                    // holds this same lock, so comparing under the lock
-                    // guarantees we observe the post-swap value.
-                    if let Ok(mut guard) = this.background_cancel.lock() {
-                        if this.background_generation.load(Ordering::Acquire) == my_gen {
-                            *guard = None;
-                        }
-                    }
                 });
-            })
-            .expect("failed to spawn shielded-sync thread");
-        // Store the join handle while still holding cancel_guard — a
-        // concurrent quiesce() must wait for this lock before calling
-        // stop(), so the handle is always stored before it can be taken.
-        *self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner()) = Some(join);
-
-        // Release background_cancel BEFORE reaping the prior thread, so its
-        // epilogue can observe the bumped generation (and skip clearing our
-        // freshly-installed token) without contending the lock we hold.
-        // Holding the lock across the join below is what would block the
-        // prior thread, spin the full 1 s deadline, and detach — the very
-        // stall this ordering removes.
-        drop(cancel_guard);
-
-        // Now reap the prior thread. It was already cancellation-signalled by
-        // stop(), and with the lock released its epilogue completes promptly,
-        // so is_finished() trips within a few milliseconds and the join is
-        // near-instant. The 1 s deadline survives only as a genuine-wedge
-        // backstop (e.g. a pass wedged in a Drop that never yields); if it
-        // fires `reap_prior_or_park` parks the still-live, already-cancelled
-        // thread in the manager orphans list so `shutdown()` joins it and
-        // reports it non-clean rather than dropping it (residual UAF).
-        super::reap_prior_or_park(
-            prior,
-            &self.coordinator_orphans,
-            std::time::Duration::from_secs(1),
-            "shielded-sync",
-        );
+            });
     }
 
     /// Stop the background sync loop. No-op if not running.
@@ -374,14 +279,7 @@ impl ShieldedSyncManager {
     /// nothing more will be persisted" barrier — required by Clear,
     /// unregister, and rebind — use [`quiesce`](Self::quiesce).
     pub fn stop(&self) {
-        if let Some(token) = self
-            .background_cancel
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take()
-        {
-            token.cancel();
-        }
+        self.registry.cancel(WalletWorker::ShieldedSync);
     }
 
     /// Cancel the background loop **and wait for any in-flight sync pass
@@ -408,24 +306,14 @@ impl ShieldedSyncManager {
     /// the `!Send` loop has stopped touching `tokio::time` before a
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
-        self.quiescing.store(true, Ordering::Release);
-        // RAII gate: resets `quiescing` on *every* exit path — a normal
-        // return, a timed-out `shutdown()` / Clear dropping this future,
-        // or a panic. Without it a quiesce that doesn't run to completion
-        // leaves the gate latched `true`, silently bailing every future
-        // pass. Reopening on drop is safe because `stop()` (below) has
-        // already cancelled the loop, so no new pass can start.
+        // RAII gate: reopen `quiescing` on every exit path. The registry's drain hook raises it
+        // during `registry.quiesce` below; the loop is cancelled by then, so no new pass starts.
+        // NOTE(review): the old `is_syncing` wait is gone — confirm FFI-driven passes still drain.
         let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.stop();
-        while self.is_syncing.load(Ordering::Acquire) {
-            tokio::time::sleep(Duration::from_millis(20)).await;
-        }
-        let handle = self
-            .background_join
-            .lock()
-            .unwrap_or_else(|e| e.into_inner())
-            .take();
-        super::join_coordinator_thread(handle).await
+        self.registry
+            .quiesce(WalletWorker::ShieldedSync)
+            .await
+            .into()
     }
 
     /// Run one sync pass across every registered wallet.
@@ -583,90 +471,3 @@ impl std::fmt::Debug for ShieldedSyncManager {
             .finish()
     }
 }
-
-// The whole module is already `#[cfg(feature = "shielded")]`-gated at its
-// `mod` declaration (manager/mod.rs), so these tests compile only under that
-// feature — no extra per-test gate needed.
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    /// Build a manager over an **empty** coordinator slot wired to a
-    /// handler-less event manager. An empty slot makes every `sync_now`
-    /// pass a no-op (empty-coordinator handling returns immediately), so
-    /// the background loop parks in its interval sleep — exactly where
-    /// cancellation lands cleanly — without needing a live SDK / network.
-    /// That is all the start/stop/restart thread-lifecycle tests below
-    /// exercise.
-    fn make_manager() -> Arc<ShieldedSyncManager> {
-        let coordinator_slot = Arc::new(RwLock::new(None));
-        let event_manager = Arc::new(PlatformEventManager::new(vec![]));
-        let orphans = Arc::new(StdMutex::new(Vec::new()));
-        Arc::new(ShieldedSyncManager::new(
-            event_manager,
-            coordinator_slot,
-            orphans,
-        ))
-    }
-
-    /// Regression: a tight `stop()` → `start()` must reap the prior loop's
-    /// OS thread promptly, NOT stall on the 1 s detach backstop.
-    ///
-    /// The prior thread's exit epilogue locks `background_cancel` to
-    /// conditionally clear its slot. The earlier ordering held
-    /// `background_cancel` across the prior-handle join inside `start()`, so
-    /// on a back-to-back `stop()` → `start()` the exiting thread blocked on
-    /// that lock, never finished, and the reap spin-waited the full second
-    /// before detaching — a 1 s stall plus a transient untracked thread. The
-    /// fix installs the new token + generation, releases `background_cancel`,
-    /// and only then reaps, so the prior thread's epilogue runs and the join
-    /// lands in milliseconds. Mirrors the identity-sync and
-    /// platform-address-sync siblings.
-    ///
-    /// `stop()` and `start()` run back-to-back in one blocking closure
-    /// (mirroring the real call site) so `start()` re-acquires the lock
-    /// microseconds after `stop()` frees it — before the async-woken prior
-    /// thread can reach its epilogue. Against the old lock-held ordering this
-    /// reliably stalls ~1 s and fails the bound below.
-    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
-    async fn restart_after_stop_reaps_prior_thread() {
-        let mgr = make_manager();
-
-        // Launch the first loop and let its immediate (no-op, empty
-        // coordinator) pass complete so the thread parks in the interval
-        // sleep, where cancellation lands cleanly.
-        Arc::clone(&mgr).start();
-        assert!(mgr.is_running());
-        tokio::time::sleep(Duration::from_millis(50)).await;
-
-        // Back-to-back cancel-only stop + restart, off the runtime so the
-        // synchronous reap can't starve a worker. `start()` re-grabs
-        // background_cancel right after `stop()` frees it.
-        let restart = Arc::clone(&mgr);
-        let elapsed = tokio::task::spawn_blocking(move || {
-            restart.stop();
-            let started = std::time::Instant::now();
-            Arc::clone(&restart).start();
-            started.elapsed()
-        })
-        .await
-        .unwrap();
-
-        assert!(
-            elapsed < Duration::from_millis(500),
-            "stop()→start() stalled for {elapsed:?}: prior thread was not \
-             reaped promptly (background_cancel held across the join?)"
-        );
-        assert!(mgr.is_running(), "restart must leave the new loop tracked");
-
-        // Wind the new loop down so the test leaves no live !Send thread.
-        let status = tokio::time::timeout(Duration::from_secs(2), mgr.quiesce())
-            .await
-            .expect("cleanup quiesce did not complete within 2s after restart");
-        assert!(
-            status.is_clean(),
-            "cleanup quiesce ended non-cleanly: {status:?}"
-        );
-        assert!(!mgr.is_running());
-    }
-}

From d190f298d1ead1f056e46a13550419d42f660a45 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 24 Jun 2026 23:29:50 +0200
Subject: [PATCH 20/29] test(dash-async): anchor DrainHook compile_fail doctest
 to E0277 + note assert asymmetry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

QA-002: anchor the DrainHook Send+Sync compile_fail doctest to E0277 so it
verifies the !Send capture is rejected for the right reason (unsatisfied
Send bound) and cannot pass vacuously on an unrelated compile error.

QA-001: document the runtime-flavor assert asymmetry — start_thread and
shutdown assert a multi-thread runtime but start_task does not, so a
task-only consumer (rs-dapi) on a current_thread runtime would panic late
at shutdown(). The wallet always uses start_thread, so it trips the assert
at start and is unaffected. Fix deferred to the rs-dapi adoption PR.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-dash-async/src/registry.rs | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index 802ca3598c..328ef419bc 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -122,9 +122,11 @@ impl<K: RegistryKey> ShutdownReport<K> {
 /// registry never owns domain semantics.
 ///
 /// The captured state must be `Send + Sync`; a `!Send` capture does not
-/// compile as a `DrainHook`:
+/// compile as a `DrainHook`. The fence is anchored to `E0277` (unsatisfied
+/// `Send` bound) so the test cannot pass vacuously on some unrelated
+/// compile error:
 ///
-/// ```compile_fail
+/// ```compile_fail,E0277
 /// use std::rc::Rc;
 /// use std::sync::Arc;
 /// use dash_async::DrainHook;
@@ -331,6 +333,18 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// Start a tokio-task worker for `Send` futures. Same restart-reap
     /// semantics as [`start_thread`](Self::start_thread); does not require
     /// a multi-thread runtime.
+    ///
+    // TODO(rs-dapi-adoption): runtime-flavor assert is asymmetric.
+    // `start_thread` and `shutdown` assert a multi-thread runtime (the
+    // OS-thread `block_on` needs the shared reactor), but `start_task` does
+    // not — a task only needs a runtime handle. So a TASK-ONLY consumer
+    // (rs-dapi, no `start_thread`) can register and run workers on a
+    // `current_thread` runtime, then panic LATE when it finally calls
+    // `shutdown()`. The wallet (which always uses `start_thread`) trips the
+    // assert at start, so it is unaffected. Fix when rs-dapi adopts the
+    // registry: either drop the assert from `shutdown` for all-task
+    // registries (track whether any OS-thread worker was ever started) or
+    // assert in `start_task` too and require multi-thread everywhere.
     pub fn start_task<F, Fut>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
     where
         F: FnOnce(CancellationToken) -> Fut + Send + 'static,
@@ -498,6 +512,8 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// (drain-hook -> cancel -> join) run concurrently within a tier;
     /// orphan reap runs last. **Requires a multi-thread runtime.**
     pub async fn shutdown(&self) -> ShutdownReport<K> {
+        // TODO(rs-dapi-adoption): see `start_task` — this assert is the late
+        // panic point for a task-only consumer on a current_thread runtime.
         Self::assert_multi_thread("shutdown");
 
         // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in

From 3e81fc1bb9c0ede295c0d865c98afe858ea0cb99 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 01:03:37 +0200
Subject: [PATCH 21/29] fix(dash-async,platform-wallet): harden ThreadRegistry
 lifecycle + doc accuracy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apply consolidated review findings to the shared ThreadRegistry and the
wallet's shutdown/clear paths.

Concurrency hardening (with regression tests):
- generation-guard quiesce(): a concurrent same-key restart can no longer
  have its fresh live handle parked or reported Timeout — the superseded
  quiesce returns NotRunning and the gen-scoped Repark leaves the new
  handle alone.
- graceful spawn-failure rollback: start_thread no longer .expect()s on
  thread spawn; a failure re-installs the prior handle (never detached),
  clears the running flag, and returns instead of panicking across the FFI
  boundary.
- panic-safe epilogue: an EpilogueGuard drop-guard runs the gen-gated
  epilogue even when a worker body unwinds, so is_running() reflects a
  crash and start() can relaunch it.
- closing latch: shutdown() latches the registry closed under the slot
  lock before snapshotting tiers; start_thread/start_task refuse new
  workers once teardown begins, so a start racing shutdown cannot leave an
  un-joined worker behind.
- clear_shielded holds the shielded quiescing gate across its liveness
  check and store wipe (closes the direct sync_now/sync_wallet TOCTOU) and
  documents the residual start-vs-clear host-serialization precondition.

Hygiene:
- feature-gate park_orphan_for_test behind a new test-util feature so the
  mutation seam never ships in a downstream production build.
- add Debug to WorkerConfig/ThreadRegistry; add # Panics docs to
  start_thread/start_task/shutdown; drop a tombstone comment.
- correct the WorkerStatus<->CoordinatorThreadStatus "byte-identical"
  claim to "exhaustive by-name match, never a layout cast".
- repoint dead [CoordinatorOrphans] and [spawn_wallet_event_adapter]
  intra-doc links at the surviving registry / renamed loop.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-dash-async/Cargo.toml             |   6 +
 packages/rs-dash-async/src/registry.rs        | 391 ++++++++++++++++--
 packages/rs-platform-wallet/Cargo.toml        |   3 +
 .../src/changeset/core_bridge.rs              |   2 +-
 .../rs-platform-wallet/src/manager/mod.rs     |  34 +-
 .../src/manager/shielded_sync.rs              |  10 +
 6 files changed, 404 insertions(+), 42 deletions(-)

diff --git a/packages/rs-dash-async/Cargo.toml b/packages/rs-dash-async/Cargo.toml
index 69d180e568..a567cc60ae 100644
--- a/packages/rs-dash-async/Cargo.toml
+++ b/packages/rs-dash-async/Cargo.toml
@@ -7,6 +7,12 @@ authors = ["Dash Core Team"]
 license = "MIT"
 description = "Async-sync bridging utilities for Dash Platform"
 
+[features]
+# Exposes cross-crate test seams (e.g. `ThreadRegistry::park_orphan_for_test`)
+# so downstream crates can drive registry regression tests without shipping
+# the seam in their production builds.
+test-util = []
+
 [dependencies]
 thiserror = "2.0"
 tracing = "0.1.41"
diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index 328ef419bc..982dd6b57c 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -34,6 +34,7 @@
 use std::collections::BTreeMap;
 use std::future::Future;
 use std::pin::Pin;
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 
@@ -60,9 +61,11 @@ pub struct ShutdownWeight(pub i32);
 // Status
 // ---------------------------------------------------------------------
 
-/// Terminal status of one worker. Variant set and payloads are
-/// byte-identical to the wallet's `CoordinatorThreadStatus`, which is
-/// constructed from this via `From` so the FFI surface stays stable.
+/// Terminal status of one worker. Its variant set and payloads correspond
+/// 1:1 to the wallet's `CoordinatorThreadStatus`, which is built from this
+/// via an exhaustive by-name `From` so the FFI surface stays stable. The
+/// two enums keep their own declaration order and carry no `#[repr]`, so
+/// the mapping is a match, never a layout-compatible cast.
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub enum WorkerStatus {
     /// The loop exited and its thread/task joined cleanly.
@@ -163,6 +166,18 @@ impl Default for WorkerConfig {
     }
 }
 
+impl std::fmt::Debug for WorkerConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // `drain` is a boxed closure with no useful `Debug`; render its
+        // presence instead.
+        f.debug_struct("WorkerConfig")
+            .field("weight", &self.weight)
+            .field("drain", &self.drain.is_some())
+            .field("join_budget", &self.join_budget)
+            .finish()
+    }
+}
+
 // ---------------------------------------------------------------------
 // Internal handle + slot state
 // ---------------------------------------------------------------------
@@ -256,6 +271,28 @@ pub struct ThreadRegistry<K: RegistryKey> {
     slots: Mutex<BTreeMap<K, SlotState>>,
     orphans: Mutex<Vec<(K, WorkerHandle)>>,
     reap_backstop: Duration,
+    /// One-way teardown latch. [`shutdown`](Self::shutdown) sets it under
+    /// the slot lock before snapshotting tiers; `start_thread`/`start_task`
+    /// honour it under the same lock and refuse to register a new worker
+    /// once teardown has begun, so a start racing shutdown can never leave
+    /// an un-joined worker behind.
+    closing: AtomicBool,
+    /// Test seam: while set, every OS-thread spawn returns an injected
+    /// `io::Error` instead of really spawning, so the spawn-failure
+    /// rollback path can be exercised deterministically.
+    #[cfg(test)]
+    force_spawn_failure: AtomicBool,
+}
+
+impl<K: RegistryKey> std::fmt::Debug for ThreadRegistry<K> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ThreadRegistry")
+            .field("live_slots", &self.lock_slots().len())
+            .field("orphans", &self.lock_orphans().len())
+            .field("reap_backstop", &self.reap_backstop)
+            .field("closing", &self.closing.load(Ordering::Acquire))
+            .finish()
+    }
 }
 
 impl<K: RegistryKey> ThreadRegistry<K> {
@@ -271,6 +308,9 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             slots: Mutex::new(BTreeMap::new()),
             orphans: Mutex::new(Vec::new()),
             reap_backstop: backstop,
+            closing: AtomicBool::new(false),
+            #[cfg(test)]
+            force_spawn_failure: AtomicBool::new(false),
         })
     }
 
@@ -279,10 +319,21 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// internally — the `!Send` value never crosses the spawn boundary
     /// (`body` itself is `Send`). Starting a key that already has a live
     /// worker is a no-op; a key whose prior thread has not been reaped is
-    /// reaped-or-parked first (the restart-reap path).
+    /// reaped-or-parked first (the restart-reap path). After
+    /// [`shutdown`](Self::shutdown) has begun the call is also a no-op (the
+    /// one-way closing latch).
     ///
     /// **Requires a multi-thread runtime**: the worker drives its loop
     /// via `Handle::block_on` and needs the shared timer/IO driver.
+    ///
+    /// # Panics
+    ///
+    /// Panics if called outside a multi-thread Tokio runtime (see
+    /// [`shutdown`](Self::shutdown)). It does **not** panic on thread-spawn
+    /// failure: a failed spawn (e.g. the OS thread-count limit) is rolled
+    /// back — the prior handle is re-installed rather than detached and the
+    /// slot returns to not-running — and the call simply does not start a
+    /// worker.
     pub fn start_thread<F>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
     where
         F: FnOnce(CancellationToken) + Send + 'static,
@@ -290,6 +341,11 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         Self::assert_multi_thread("start_thread");
         let prior = {
             let mut slots = self.lock_slots();
+            // One-way teardown latch: refuse new workers once shutdown has
+            // begun, under the same lock shutdown snapshots tiers with.
+            if self.closing.load(Ordering::Acquire) {
+                return;
+            }
             let slot = slots.entry(key).or_insert_with(SlotState::dormant);
             if slot.cancel.is_some() {
                 return;
@@ -308,18 +364,45 @@ impl<K: RegistryKey> ThreadRegistry<K> {
 
             let reg = Arc::clone(self);
             let body_token = token;
-            let join = std::thread::Builder::new()
-                .name(format!("tr-worker-{key:?}"))
-                .spawn(move || {
-                    body(body_token);
-                    reg.run_epilogue(key, my_gen);
-                })
-                .expect("failed to spawn registry worker thread");
-            // Store the handle while still under the slot lock; the guard
-            // is released at the end of this block, BEFORE the reap below
-            // (R1: store handle -> drop guard -> THEN reap-or-park).
-            slot.handle = Some(WorkerHandle::OsThread(join));
-            prior
+            // Build the epilogue drop-guard INSIDE the worker closure, not
+            // here: on a spawn failure the closure is dropped while we still
+            // hold the slot lock, and a guard constructed out here would run
+            // `run_epilogue` (which re-locks `slots`) on that drop and
+            // deadlock. Constructing it inside means it only exists once the
+            // thread is actually running. A panicking `body` then still
+            // clears this generation's running flag via the guard's Drop
+            // (under `panic = "unwind"`), and the panic keeps unwinding so
+            // the join handle still classifies as `Panicked`.
+            match self.spawn_os_thread(key, move || {
+                let _epilogue = EpilogueGuard { reg, key, my_gen };
+                body(body_token);
+            }) {
+                Ok(join) => {
+                    // Store the handle while still under the slot lock; the
+                    // guard is released at the end of this block, BEFORE the
+                    // reap below (R1: store handle -> drop guard -> THEN
+                    // reap-or-park).
+                    slot.handle = Some(WorkerHandle::OsThread(join));
+                    prior
+                }
+                Err(e) => {
+                    // Spawn failed (e.g. EAGAIN at the OS thread ceiling).
+                    // Roll back so the prior handle is never detached and
+                    // the slot is not left wedged "running": re-install
+                    // prior, clear the running flag. `generation` stays
+                    // bumped (it is only ever monotonic), which is harmless
+                    // — the next start reaps the re-installed prior.
+                    tracing::error!(
+                        ?key,
+                        error = %e,
+                        "failed to spawn registry worker thread; rolling back \
+                         start (prior handle re-installed, not detached)"
+                    );
+                    slot.cancel = None;
+                    slot.handle = prior;
+                    None
+                }
+            }
         };
 
         // The prior thread was cancellation-signalled by a preceding
@@ -345,6 +428,12 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     // registry: either drop the assert from `shutdown` for all-task
     // registries (track whether any OS-thread worker was ever started) or
     // assert in `start_task` too and require multi-thread everywhere.
+    ///
+    /// # Panics
+    ///
+    /// Panics if called outside a Tokio runtime context (`tokio::spawn`'s
+    /// own precondition). After [`shutdown`](Self::shutdown) has begun the
+    /// call is a no-op (the one-way closing latch).
     pub fn start_task<F, Fut>(self: &Arc<Self>, key: K, cfg: WorkerConfig, body: F)
     where
         F: FnOnce(CancellationToken) -> Fut + Send + 'static,
@@ -352,6 +441,10 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     {
         let prior = {
             let mut slots = self.lock_slots();
+            // One-way teardown latch — see `start_thread`.
+            if self.closing.load(Ordering::Acquire) {
+                return;
+            }
             let slot = slots.entry(key).or_insert_with(SlotState::dormant);
             if slot.cancel.is_some() {
                 return;
@@ -367,9 +460,12 @@ impl<K: RegistryKey> ThreadRegistry<K> {
 
             let reg = Arc::clone(self);
             let body_token = token;
+            // Drop-guard epilogue, same rationale as `start_thread`: a task
+            // whose future panics still clears its running flag via the
+            // guard's Drop during unwind.
             let join = tokio::spawn(async move {
+                let _epilogue = EpilogueGuard { reg, key, my_gen };
                 body(body_token).await;
-                reg.run_epilogue(key, my_gen);
             });
             slot.handle = Some(WorkerHandle::Task(join));
             prior
@@ -385,7 +481,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             .unwrap_or(false)
     }
 
-    /// Signal-only cancellation of one worker (was `stop()`).
+    /// Signal-only cancellation of one worker.
     pub fn cancel(&self, key: K) {
         if let Some(slot) = self.lock_slots().get_mut(&key) {
             if let Some(token) = slot.cancel.take() {
@@ -409,13 +505,14 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// it; on the managed timeout — or if this future is dropped
     /// mid-poll — the handle is re-parked into the orphan list. [F1 FIX]
     pub async fn quiesce(&self, key: K) -> WorkerStatus {
-        // Snapshot the drain hook + budget, and bail early if nothing is
-        // registered for this key.
-        let (drain, budget) = {
+        // Snapshot the drain hook + budget + generation, and bail early if
+        // nothing is registered for this key. The generation is the anchor
+        // for the supersede guard below.
+        let (drain, budget, my_gen) = {
             let slots = self.lock_slots();
             match slots.get(&key) {
                 Some(s) if s.cancel.is_some() || s.handle.is_some() => {
-                    (s.drain.clone(), s.join_budget)
+                    (s.drain.clone(), s.join_budget, s.generation)
                 }
                 _ => return WorkerStatus::NotRunning,
             }
@@ -427,29 +524,46 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             drain().await;
         }
 
-        // Signal-only cancel.
+        // Signal-only cancel — but only if this is still the generation we
+        // snapshotted. A concurrent restart (which can proceed the instant
+        // we take `cancel` below) bumps the generation; taking the new
+        // token here would silently un-track the fresh worker.
         if let Some(slot) = self.lock_slots().get_mut(&key) {
-            if let Some(token) = slot.cancel.take() {
-                token.cancel();
+            if slot.generation == my_gen {
+                if let Some(token) = slot.cancel.take() {
+                    token.cancel();
+                }
             }
         }
 
         // Poll-join within budget. The re-park guard moves the slot's
         // still-live handle into orphans if this future is dropped before
-        // the loop finishes — the handle is never owned by this frame.
-        let _repark = Repark { reg: self, key };
+        // the loop finishes — the handle is never owned by this frame. Both
+        // the guard and the loop are generation-scoped, so a concurrent
+        // same-key restart's live handle is never parked or classified by
+        // the quiesce that cancelled the *prior* generation.
+        let _repark = Repark {
+            reg: self,
+            key,
+            my_gen,
+        };
         let deadline = Instant::now() + budget;
         loop {
             enum Step {
                 Classify(WorkerHandle),
                 Park(WorkerHandle),
                 NotRunning,
+                Superseded,
                 Wait,
             }
             let step = {
                 let mut slots = self.lock_slots();
                 match slots.get_mut(&key) {
                     None => Step::NotRunning,
+                    // A restart replaced the generation we were draining:
+                    // the handle now in the slot belongs to a newer, live
+                    // worker the restart owns. Leave it untouched.
+                    Some(slot) if slot.generation != my_gen => Step::Superseded,
                     Some(slot) => match slot.handle.take_if(|h| h.is_finished()) {
                         Some(h) => Step::Classify(h),
                         None if slot.handle.is_none() => Step::NotRunning,
@@ -466,7 +580,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                     self.lock_orphans().push((key, h));
                     return WorkerStatus::Timeout;
                 }
-                Step::NotRunning => return WorkerStatus::NotRunning,
+                Step::NotRunning | Step::Superseded => return WorkerStatus::NotRunning,
                 Step::Wait => tokio::time::sleep(Duration::from_millis(5)).await,
             }
         }
@@ -511,15 +625,31 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// Weight-ordered teardown: ascending tier by tier, each worker's
     /// (drain-hook -> cancel -> join) run concurrently within a tier;
     /// orphan reap runs last. **Requires a multi-thread runtime.**
+    ///
+    /// Latches the registry closed first (under the slot lock, before the
+    /// tier snapshot), so any `start_thread`/`start_task` racing teardown is
+    /// either already in the snapshot or refused outright — shutdown is a
+    /// one-way door and never leaves a worker un-joined. Idempotent.
+    ///
+    /// # Panics
+    ///
+    /// Panics if called outside a multi-thread Tokio runtime: an OS-thread
+    /// worker drives its loop via `Handle::block_on` and needs the shared
+    /// timer/IO driver, so a `current_thread` runtime would deadlock the
+    /// join.
     pub async fn shutdown(&self) -> ShutdownReport<K> {
         // TODO(rs-dapi-adoption): see `start_task` — this assert is the late
         // panic point for a task-only consumer on a current_thread runtime.
         Self::assert_multi_thread("shutdown");
 
         // Snapshot keys grouped by weight. A `BTreeMap` iterates tiers in
-        // ascending weight order, giving the lower-first drain.
+        // ascending weight order, giving the lower-first drain. Latch the
+        // registry closed under the same lock and before the snapshot so a
+        // racing start is serialized: it either landed before this lock (and
+        // is in the snapshot) or sees `closing` and bails.
         let tiers: BTreeMap<ShutdownWeight, Vec<K>> = {
             let slots = self.lock_slots();
+            self.closing.store(true, Ordering::Release);
             let mut tiers: BTreeMap<ShutdownWeight, Vec<K>> = BTreeMap::new();
             for (key, slot) in slots.iter() {
                 tiers.entry(slot.weight).or_default().push(*key);
@@ -574,9 +704,9 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         );
     }
 
-    /// Gen-gated exit epilogue, run on the worker after its body returns:
-    /// clear this slot's running flag only if a newer start has not since
-    /// installed a replacement.
+    /// Gen-gated exit epilogue, run on the worker after its body returns
+    /// (or unwinds): clear this slot's running flag only if a newer start
+    /// has not since installed a replacement.
     fn run_epilogue(&self, key: K, my_gen: u64) {
         if let Some(slot) = self.lock_slots().get_mut(&key) {
             if slot.generation == my_gen {
@@ -585,6 +715,22 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         }
     }
 
+    /// Spawn the named OS worker thread, surfacing a spawn failure as
+    /// `io::Result` instead of panicking so the caller can roll back. The
+    /// `#[cfg(test)]` seam forces a synthetic failure to exercise that path.
+    fn spawn_os_thread<C>(&self, key: K, closure: C) -> std::io::Result<std::thread::JoinHandle<()>>
+    where
+        C: FnOnce() + Send + 'static,
+    {
+        #[cfg(test)]
+        if self.force_spawn_failure.load(Ordering::Acquire) {
+            return Err(std::io::Error::other("forced spawn failure (test seam)"));
+        }
+        std::thread::Builder::new()
+            .name(format!("tr-worker-{key:?}"))
+            .spawn(closure)
+    }
+
     /// Reap a restarted key's prior worker — or park it if it is genuinely
     /// wedged past the reap backstop. Must be called with no registry lock
     /// held (it spins synchronously for an OS thread).
@@ -670,7 +816,9 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// Test-only seam: park a raw thread handle as an orphan under `key`.
     /// Used by cross-crate regression tests (e.g. the wallet's F2 gate)
     /// that must inject a wedged prior-generation thread without driving
-    /// the full restart-reap path.
+    /// the full restart-reap path. Feature-gated behind `test-util` so it
+    /// never ships in a production build of a downstream consumer.
+    #[cfg(any(test, feature = "test-util"))]
     #[doc(hidden)]
     pub fn park_orphan_for_test(&self, key: K, handle: std::thread::JoinHandle<()>) {
         self.lock_orphans()
@@ -688,19 +836,27 @@ fn slot_alive(slot: &SlotState) -> bool {
 /// the slot's still-live handle into the orphan list instead of letting it
 /// be dropped-and-detached. On normal completion the handle has already
 /// been taken from the slot, so this is a no-op.
+///
+/// Generation-scoped: it only re-parks the handle if the slot still holds
+/// the generation `quiesce` was draining. A concurrent same-key restart
+/// bumps the generation and installs its own live handle; this guard leaves
+/// that fresh handle alone.
 struct Repark<'a, K: RegistryKey> {
     reg: &'a ThreadRegistry<K>,
     key: K,
+    my_gen: u64,
 }
 
 impl<K: RegistryKey> Drop for Repark<'_, K> {
     fn drop(&mut self) {
         // Take the handle under the slot lock, release it, then push to
-        // orphans — never nest the two locks.
+        // orphans — never nest the two locks. Skip if a restart superseded
+        // our generation (the handle is the new worker's, not ours).
         let handle = self
             .reg
             .lock_slots()
             .get_mut(&self.key)
+            .filter(|slot| slot.generation == self.my_gen)
             .and_then(|slot| slot.handle.take());
         if let Some(handle) = handle {
             self.reg.lock_orphans().push((self.key, handle));
@@ -708,6 +864,27 @@ impl<K: RegistryKey> Drop for Repark<'_, K> {
     }
 }
 
+/// Worker-side exit guard whose `Drop` runs the generation-gated
+/// [`ThreadRegistry::run_epilogue`], so a worker whose `body` returns
+/// normally **or** unwinds on panic still clears its running flag —
+/// `is_running()` reflects reality and `start()` can relaunch a crashed loop.
+///
+/// Panic-strategy caveat (same as `AtomicFlagGuard`): the clear-on-panic
+/// half relies on `Drop` running while the stack unwinds, so it holds under
+/// `panic = "unwind"`. Under `panic = "abort"` a worker panic aborts the
+/// process and there is no "after" to gate.
+struct EpilogueGuard<K: RegistryKey> {
+    reg: Arc<ThreadRegistry<K>>,
+    key: K,
+    my_gen: u64,
+}
+
+impl<K: RegistryKey> Drop for EpilogueGuard<K> {
+    fn drop(&mut self) {
+        self.reg.run_epilogue(self.key, self.my_gen);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1341,4 +1518,150 @@ mod tests {
         assert!(cfg.drain.is_none());
         assert_eq!(cfg.join_budget, DEFAULT_JOIN_BUDGET);
     }
+
+    // ----- Group 6: concurrency-hazard regressions --------------------
+
+    /// `quiesce` is generation-guarded. A same-key restart that lands after
+    /// quiesce takes the prior's cancel must not have its fresh, live handle
+    /// parked or reported `Timeout`: the superseded quiesce returns
+    /// `NotRunning` and the new generation survives.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn quiesce_generation_guard_spares_concurrent_restart() {
+        let reg = ThreadRegistry::<&str>::new();
+        // gen-1: a task that ignores cancellation (pending forever), with a
+        // tiny join budget so a non-guarded quiesce would Timeout quickly.
+        reg.start_task(
+            "k",
+            WorkerConfig {
+                join_budget: Duration::from_millis(150),
+                ..WorkerConfig::default()
+            },
+            |_cancel| async move { std::future::pending::<()>().await },
+        );
+
+        // Drive quiesce concurrently; it snapshots gen=1, cancels (ignored),
+        // and enters the poll loop with cancel already taken.
+        let reg_q = Arc::clone(&reg);
+        let q = tokio::spawn(async move { reg_q.quiesce("k").await });
+
+        // Let quiesce pass cancel.take() so a restart can proceed.
+        tokio::time::sleep(Duration::from_millis(40)).await;
+
+        // Restart: cancel is now None, so this proceeds — it takes gen-1's
+        // live handle as its prior (parked) and installs gen-2.
+        reg.start_task("k", WorkerConfig::default(), |cancel| async move {
+            cancel.cancelled().await;
+        });
+
+        // The superseded quiesce must NOT park gen-2 / report Timeout.
+        let status = q.await.unwrap();
+        assert_eq!(
+            status,
+            WorkerStatus::NotRunning,
+            "superseded quiesce returns NotRunning, never a spurious Timeout"
+        );
+        assert!(reg.is_running("k"), "gen-2 survives the racing quiesce");
+
+        // gen-2 quiesces cleanly.
+        assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok);
+    }
+
+    /// A thread-spawn failure must neither panic nor detach the live prior
+    /// handle: it rolls back (prior re-installed, running flag cleared) and
+    /// the slot stays usable / reapable.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn spawn_failure_reparks_live_prior_without_panic() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+        // gen-1: wedged (ignores cancel), stays live until released.
+        reg.start_thread("k", WorkerConfig::default(), wedged_body(release_rx));
+        // cancel() takes the token (slot.cancel = None) but the wedged thread
+        // keeps running — the slot now holds a LIVE prior handle with cancel
+        // cleared, the exact shape a racing restart would take as its prior.
+        reg.cancel("k");
+        assert!(!reg.is_running("k"));
+
+        // Force the restart's spawn to fail; it must not panic.
+        reg.force_spawn_failure.store(true, Ordering::Release);
+        reg.start_thread("k", WorkerConfig::default(), |_cancel| {});
+        assert!(
+            !reg.is_running("k"),
+            "failed spawn clears the running flag, never leaves it wedged"
+        );
+        assert!(reg.any_alive(), "live prior re-installed, never detached");
+
+        // Recover: release the prior; quiesce reaps the now-finished handle
+        // cleanly, proving it was owned (not leaked/detached) and the slot is
+        // not wedged.
+        reg.force_spawn_failure.store(false, Ordering::Release);
+        release_tx.send(()).unwrap();
+        assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok);
+        assert!(!reg.any_alive());
+    }
+
+    /// A panicking worker body still runs its epilogue (via the drop-guard),
+    /// so `is_running()` reflects the crash and `start()` can relaunch the
+    /// loop instead of silently no-op'ing.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn panicked_worker_clears_running_and_allows_restart() {
+        let reg = ThreadRegistry::<&str>::new();
+        // A worker whose body panics immediately.
+        reg.start_thread("k", WorkerConfig::default(), |_cancel| {
+            panic!("deliberate worker-body panic");
+        });
+
+        // The drop-guard epilogue clears the running flag despite the panic.
+        let mut waited = Duration::ZERO;
+        while reg.is_running("k") && waited < Duration::from_secs(2) {
+            tokio::time::sleep(Duration::from_millis(5)).await;
+            waited += Duration::from_millis(5);
+        }
+        assert!(
+            !reg.is_running("k"),
+            "panicked worker clears its running flag via the epilogue guard"
+        );
+
+        // start() can relaunch a crashed loop (no longer a silent no-op).
+        let ran = Arc::new(AtomicBool::new(false));
+        let ran_w = Arc::clone(&ran);
+        let handle = Handle::current();
+        reg.start_thread("k", WorkerConfig::default(), move |cancel| {
+            ran_w.store(true, Ordering::Release);
+            handle.block_on(async move { cancel.cancelled().await });
+        });
+        assert!(
+            reg.is_running("k"),
+            "start() relaunches a previously-panicked worker"
+        );
+        assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok);
+        assert!(
+            ran.load(Ordering::Acquire),
+            "restarted worker body executed"
+        );
+    }
+
+    /// `shutdown()` latches the registry closed: a start racing (or
+    /// following) teardown is refused, so no worker is left un-joined.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn shutdown_latches_closed_refusing_new_workers() {
+        let reg = ThreadRegistry::<&str>::new();
+        start_clean(&reg, "live", WorkerConfig::default());
+        let report = reg.shutdown().await;
+        assert!(report.all_clean());
+
+        // One-way door: both worker kinds are refused after shutdown.
+        start_clean(&reg, "late_thread", WorkerConfig::default());
+        assert!(
+            !reg.is_running("late_thread"),
+            "start_thread after shutdown is refused"
+        );
+        reg.start_task("late_task", WorkerConfig::default(), |cancel| async move {
+            cancel.cancelled().await;
+        });
+        assert!(
+            !reg.is_running("late_task"),
+            "start_task after shutdown is refused"
+        );
+        assert!(!reg.any_alive(), "nothing started post-shutdown");
+    }
 }
diff --git a/packages/rs-platform-wallet/Cargo.toml b/packages/rs-platform-wallet/Cargo.toml
index e324680210..5398e9c009 100644
--- a/packages/rs-platform-wallet/Cargo.toml
+++ b/packages/rs-platform-wallet/Cargo.toml
@@ -81,6 +81,9 @@ name = "shielded_chunk_timing_bench"
 required-features = ["shielded"]
 
 [dev-dependencies]
+# Enables `ThreadRegistry::park_orphan_for_test` for the manager's F2-gate
+# regression tests; the seam is feature-gated so it never ships in release.
+dash-async = { path = "../rs-dash-async", features = ["test-util"] }
 # Used by `examples/shielded_chunk_timing_bench.rs` and
 # `tests/shielded_decrypt_bench.rs` to assemble per-chunk wire
 # fixtures and decode the `ShieldedEncryptedNote` wire type.
diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
index 9e22d9e6f2..13a177cb47 100644
--- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs
+++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
@@ -3,7 +3,7 @@
 //! Upstream `key_wallet_manager::WalletManager` exposes a
 //! `broadcast::Sender<WalletEvent>` and a `subscribe_events()` accessor
 //! returning a `broadcast::Receiver<WalletEvent>`; consumers attach at
-//! startup and drain the stream. [`spawn_wallet_event_adapter`] is the
+//! startup and drain the stream. [`wallet_event_adapter_loop`] is the
 //! platform-wallet-side consumer: a tokio task that pulls events off
 //! that broadcast, projects each one into a
 //! [`CoreChangeSet`](crate::changeset::CoreChangeSet), wraps it in a
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index d03dcccf7b..51e14c2524 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -145,7 +145,7 @@ pub enum CoordinatorThreadStatus {
     /// `stop()`→`start()` reap had to detach past its 1 s wedge-backstop
     /// was still alive at the shutdown deadline.
     ///
-    /// Such a thread was parked in the manager's [`CoordinatorOrphans`]
+    /// Such a thread was parked in the shared [`ThreadRegistry`]'s orphan
     /// list (not silently dropped) precisely so this case is visible.
     /// A still-live detached thread keeps an `Arc` to the host event
     /// handler and may fire one final callback, so the host must NOT
@@ -165,9 +165,12 @@ impl CoordinatorThreadStatus {
 }
 
 /// Relocate a registry [`WorkerStatus`](dash_async::WorkerStatus) into the
-/// FFI-stable `CoordinatorThreadStatus`. The variant set and payloads are
-/// identical by construction, so this is a byte-stable 1:1 mapping — the
-/// FFI `destroy` / shielded-stop adapters keep reading the same shape.
+/// FFI-stable `CoordinatorThreadStatus`. The variant sets and payloads
+/// correspond 1:1, so the body is an exhaustive by-name `From` match that
+/// the compiler keeps total. The two enums intentionally keep their own
+/// declaration order and carry no `#[repr]`, so this is a match, never a
+/// layout-compatible cast — the FFI `destroy` / shielded-stop adapters keep
+/// reading the same logical shape.
 impl From<dash_async::WorkerStatus> for CoordinatorThreadStatus {
     fn from(status: dash_async::WorkerStatus) -> Self {
         use dash_async::WorkerStatus as W;
@@ -205,7 +208,7 @@ pub struct CoordinatorExitStatus {
     pub event_adapter: CoordinatorThreadStatus,
     /// Aggregate status of any coordinator OS threads that an earlier
     /// tight `stop()`→`start()` reap had to detach past its 1 s
-    /// wedge-backstop and park in the manager's [`CoordinatorOrphans`]
+    /// wedge-backstop and park in the shared [`ThreadRegistry`]'s orphan
     /// list.
     ///
     /// [`Ok`](CoordinatorThreadStatus::Ok) when none were detached (or
@@ -280,8 +283,8 @@ impl CoordinatorExitStatus {
 pub const SHUTDOWN_JOIN_TIMEOUT_SECS: u64 = 30;
 
 /// Grace period (seconds) [`PlatformWalletManager::shutdown`] spends
-/// polling any parked [`CoordinatorOrphans`] before declaring a survivor
-/// [`Detached`](CoordinatorThreadStatus::Detached).
+/// polling any orphans parked in the shared [`ThreadRegistry`] before
+/// declaring a survivor [`Detached`](CoordinatorThreadStatus::Detached).
 ///
 /// Unlike a live coordinator — whose `quiesce()` may legitimately spend
 /// seconds draining an in-flight pass, hence the 30 s
@@ -507,6 +510,15 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     ///   backstop, or its loop ended non-cleanly) →
     ///   [`crate::error::PlatformWalletError::ShieldedShutdownIncomplete`]; or
     /// - the coordinator's store reset itself fails.
+    ///
+    /// **Host-serialization precondition**: the caller must not invoke
+    /// `shielded_sync_start` for this manager concurrently with `clear`. A
+    /// concurrent direct `sync_now`/`sync_wallet` is held off (the quiescing
+    /// gate stays raised across the liveness check and the wipe), but a full
+    /// restart re-opens that gate as it spawns a fresh loop, so a `start`
+    /// racing `clear` can still re-persist into the wiped store. The wallet
+    /// UI drives these from one place; that ordering is the host's contract
+    /// until the registry grows a per-key clearing latch.
     #[cfg(feature = "shielded")]
     pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
         // Quiesce the shielded loop: cancel it, drain any in-flight pass
@@ -524,6 +536,14 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         if !status.is_clean() {
             return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status });
         }
+        // Hold the shielded quiescing gate raised across BOTH the liveness
+        // check below and the store wipe, so the gate guarding "no new pass"
+        // does not lapse between check and act: a direct `sync_now` /
+        // `sync_wallet` that lands here observes the gate and bails instead
+        // of writing into the store we are about to clear. The guard lowers
+        // the gate on return (every path), so a later start/sync works.
+        let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate();
+
         // [F2 FIX] Also refuse if a prior-generation shielded thread is
         // still parked alive: it holds an `Arc` to the persister/store and
         // could re-persist notes into the store we are about to wipe. The
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index a930febdc7..ea0a0566f9 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -316,6 +316,16 @@ impl ShieldedSyncManager {
             .into()
     }
 
+    /// Raise the `quiescing` gate and hold it raised until the returned
+    /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate as
+    /// soon as it returns, this lets a multi-step teardown (Clear) keep new
+    /// direct `sync_now` / `sync_wallet` passes off across a check-then-wipe
+    /// so the "no new pass" guarantee does not lapse between the two steps.
+    pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> {
+        self.quiescing.store(true, Ordering::Release);
+        AtomicFlagGuard::new(&self.quiescing)
+    }
+
     /// Run one sync pass across every registered wallet.
     ///
     /// `force` is propagated to each wallet's

From 911f99f7ce569ebe727c7439bf44fcb9358cc2b9 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 01:03:37 +0200
Subject: [PATCH 22/29] refactor(platform-wallet): extract CoordinatorLifecycle
 to dedup the three sync coordinators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The migration onto ThreadRegistry left the coordinator-side delegate +
pass-gating boilerplate copy-pasted across IdentitySyncManager,
PlatformAddressSyncManager, and ShieldedSyncManager: the same five
fields (registry, interval, is_syncing, quiescing, last_sync), byte-
identical interval/is_syncing/last_sync/drain_hook/stop/quiesce
delegations, and — critically — the subtle is_syncing-CAS + quiescing-gate
pass preamble reproduced four times.

Hoist all of it into one CoordinatorLifecycle helper that each coordinator
embeds and delegates to, so the teardown-critical handshake has a single
home. begin_pass() folds the CAS + guard + gate check into one RAII-guard-
returning call; hold_quiescing_gate() is the shared primitive the shielded
Clear flow holds across its check-then-wipe. Each coordinator now keeps
only its domain-specific pass body.

Behaviour-preserving: the full platform-wallet lib suite (307 tests, both
default and shielded) and the coordinator pass-gate tests are unchanged and
green. Also fixes the dangling [JoinHandle] intra-doc link in core_bridge
(its import was removed in the migration) by fully qualifying it.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/changeset/core_bridge.rs              |   6 +-
 .../src/manager/coordinator_lifecycle.rs      | 193 ++++++++++++++++++
 .../src/manager/identity_sync.rs              | 134 ++++--------
 .../rs-platform-wallet/src/manager/mod.rs     |   1 +
 .../src/manager/platform_address_sync.rs      | 132 ++++--------
 .../src/manager/shielded_sync.rs              | 157 +++++---------
 6 files changed, 322 insertions(+), 301 deletions(-)
 create mode 100644 packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs

diff --git a/packages/rs-platform-wallet/src/changeset/core_bridge.rs b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
index 13a177cb47..927cf8d000 100644
--- a/packages/rs-platform-wallet/src/changeset/core_bridge.rs
+++ b/packages/rs-platform-wallet/src/changeset/core_bridge.rs
@@ -21,9 +21,9 @@
 //!
 //! [`wallet_event_adapter_loop`] is the task body. The caller (typically
 //! `PlatformWalletManager`) registers it on the shared `ThreadRegistry`
-//! via `start_task`, which owns its [`JoinHandle`] and cancellation; on
-//! shutdown the registry fires the [`CancellationToken`] to make the task
-//! exit cleanly and joins it.
+//! via `start_task`, which owns its [`JoinHandle`](tokio::task::JoinHandle)
+//! and cancellation; on shutdown the registry fires the
+//! [`CancellationToken`] to make the task exit cleanly and joins it.
 
 use std::sync::Arc;
 
diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
new file mode 100644
index 0000000000..440c67c676
--- /dev/null
+++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
@@ -0,0 +1,193 @@
+//! Shared lifecycle state + pass protocol for the periodic sync
+//! coordinators.
+//!
+//! The three coordinators ([`IdentitySyncManager`], [`PlatformAddressSyncManager`],
+//! [`ShieldedSyncManager`]) each drive a background loop on the shared
+//! [`ThreadRegistry`] and gate passes through an `is_syncing` / `quiescing`
+//! handshake. That handshake, plus the interval and last-sync bookkeeping,
+//! is identical across all three; it lives here so the (subtle, teardown-
+//! critical) protocol has a single home and each coordinator keeps only its
+//! domain-specific pass body.
+//!
+//! [`IdentitySyncManager`]: super::identity_sync::IdentitySyncManager
+//! [`PlatformAddressSyncManager`]: super::platform_address_sync::PlatformAddressSyncManager
+//! [`ShieldedSyncManager`]: super::shielded_sync::ShieldedSyncManager
+
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::Duration;
+
+use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
+
+use super::{
+    CoordinatorThreadStatus, WalletWorker, COORDINATOR_WEIGHT, SHUTDOWN_JOIN_TIMEOUT_SECS,
+};
+
+/// Shared lifecycle state and pass-gating protocol for one periodic sync
+/// coordinator. Each coordinator embeds one of these and delegates its
+/// `start` / `stop` / `quiesce` / `is_running` / interval / pass-gate
+/// surface to it.
+pub(crate) struct CoordinatorLifecycle {
+    registry: Arc<ThreadRegistry<WalletWorker>>,
+    worker: WalletWorker,
+    interval_secs: AtomicU64,
+    is_syncing: AtomicBool,
+    /// `Arc` so the registry drain hook (a `'static` closure) can capture a
+    /// clone and raise the gate from inside `quiesce`.
+    quiescing: Arc<AtomicBool>,
+    last_sync_unix: AtomicU64,
+}
+
+impl CoordinatorLifecycle {
+    pub(crate) fn new(
+        registry: Arc<ThreadRegistry<WalletWorker>>,
+        worker: WalletWorker,
+        default_interval_secs: u64,
+    ) -> Self {
+        Self {
+            registry,
+            worker,
+            interval_secs: AtomicU64::new(default_interval_secs),
+            is_syncing: AtomicBool::new(false),
+            quiescing: Arc::new(AtomicBool::new(false)),
+            last_sync_unix: AtomicU64::new(0),
+        }
+    }
+
+    /// The shared worker-lifecycle engine this coordinator's loop runs on.
+    pub(crate) fn registry(&self) -> &Arc<ThreadRegistry<WalletWorker>> {
+        &self.registry
+    }
+
+    /// This coordinator's registry key.
+    pub(crate) fn worker(&self) -> WalletWorker {
+        self.worker
+    }
+
+    /// Set the polling interval. Clamped to a minimum of 1s.
+    pub(crate) fn set_interval(&self, interval: Duration) {
+        let secs = interval.as_secs().max(1);
+        self.interval_secs.store(secs, Ordering::Release);
+    }
+
+    /// Current polling interval.
+    pub(crate) fn interval(&self) -> Duration {
+        Duration::from_secs(self.interval_secs.load(Ordering::Acquire))
+    }
+
+    /// Current polling interval in whole seconds (for `Debug`).
+    pub(crate) fn interval_secs(&self) -> u64 {
+        self.interval_secs.load(Ordering::Acquire)
+    }
+
+    /// Whether the background loop is currently running.
+    pub(crate) fn is_running(&self) -> bool {
+        self.registry.is_running(self.worker)
+    }
+
+    /// Whether a sync pass is in flight right now.
+    pub(crate) fn is_syncing(&self) -> bool {
+        self.is_syncing.load(Ordering::Acquire)
+    }
+
+    /// Unix seconds of the last completed pass, or `None` if none has ever
+    /// completed.
+    pub(crate) fn last_sync_unix_seconds(&self) -> Option<u64> {
+        match self.last_sync_unix.load(Ordering::Acquire) {
+            0 => None,
+            n => Some(n),
+        }
+    }
+
+    /// Record the unix-seconds stamp of a just-completed pass.
+    pub(crate) fn store_last_sync_unix(&self, unix_secs: u64) {
+        self.last_sync_unix.store(unix_secs, Ordering::Release);
+    }
+
+    /// The registry config a coordinator starts its loop with: coordinator
+    /// teardown weight, the shared join budget, and the `quiescing`-raising
+    /// drain hook.
+    pub(crate) fn worker_config(&self) -> WorkerConfig {
+        WorkerConfig {
+            weight: COORDINATOR_WEIGHT,
+            join_budget: Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS),
+            drain: Some(self.drain_hook()),
+        }
+    }
+
+    /// Drain hook handed to the registry: raise the `quiescing` gate so any
+    /// pass past its `is_syncing` CAS bails. The registry then cancels the
+    /// loop and joins the thread, so the barrier itself is instant.
+    fn drain_hook(&self) -> DrainHook {
+        let quiescing = Arc::clone(&self.quiescing);
+        Arc::new(move || {
+            let quiescing = Arc::clone(&quiescing);
+            Box::pin(async move {
+                quiescing.store(true, Ordering::Release);
+            })
+        })
+    }
+
+    /// Reopen the `quiescing` gate so a (re)start's passes can run; a prior
+    /// quiesce raised it via the drain hook.
+    pub(crate) fn reopen_quiescing_gate(&self) {
+        self.quiescing.store(false, Ordering::Release);
+    }
+
+    /// Cancel-only stop: signal the loop and return immediately.
+    pub(crate) fn stop(&self) {
+        self.registry.cancel(self.worker);
+    }
+
+    /// Cancel the loop, drain any in-flight pass, and join the worker,
+    /// returning its terminal status. Reopens the `quiescing` gate on every
+    /// exit path (the registry's drain hook raised it). The cancelled loop
+    /// starts no new pass, but a direct pass is gated only until this returns.
+    pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus {
+        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
+        self.registry.quiesce(self.worker).await.into()
+    }
+
+    /// Raise the `quiescing` gate and hold it raised until the returned
+    /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate the
+    /// instant it returns, this lets a multi-step teardown (e.g. Clear)
+    /// keep new direct passes off across a check-then-wipe so the "no new
+    /// pass" guarantee does not lapse between the two steps. In production
+    /// only the shielded Clear flow needs this today; the coordinator pass-
+    /// gate tests also exercise it.
+    #[cfg(any(test, feature = "shielded"))]
+    pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> {
+        self.quiescing.store(true, Ordering::Release);
+        AtomicFlagGuard::new(&self.quiescing)
+    }
+
+    /// Enter a sync pass. Atomically claims the `is_syncing` slot, then
+    /// checks the `quiescing` gate. Returns the RAII guard that clears
+    /// `is_syncing` on drop, or `None` when the caller must bail without
+    /// doing work — because a pass is already in flight, or a teardown has
+    /// raised the gate. In the gated case the briefly-claimed slot is
+    /// released before returning (the guard drops), so a later post-quiesce
+    /// pass can still run.
+    pub(crate) fn begin_pass(&self) -> Option<AtomicFlagGuard<'_>> {
+        if self
+            .is_syncing
+            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
+            .is_err()
+        {
+            return None;
+        }
+
+        // RAII guard: clears `is_syncing` on every exit path, including
+        // panics. Without it a panic inside the pass would leave
+        // `is_syncing = true` forever and wedge `quiesce`'s drain loop.
+        let guard = AtomicFlagGuard::new(&self.is_syncing);
+
+        // A `quiesce` may have raised the gate between our CAS and here; if
+        // so, bail (dropping `guard`, which clears the slot) so the drain
+        // can complete and teardown gets a true "no further pass" barrier.
+        if self.quiescing.load(Ordering::Acquire) {
+            return None;
+        }
+        Some(guard)
+    }
+}
diff --git a/packages/rs-platform-wallet/src/manager/identity_sync.rs b/packages/rs-platform-wallet/src/manager/identity_sync.rs
index 8dfe83eede..165e4f4530 100644
--- a/packages/rs-platform-wallet/src/manager/identity_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/identity_sync.rs
@@ -47,14 +47,12 @@
 //! identities are registered and the SDK is connected.
 
 use std::collections::BTreeMap;
-use std::sync::{
-    atomic::{AtomicBool, AtomicU64, Ordering},
-    Arc,
-};
+use std::sync::Arc;
 
-use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
+use dash_async::ThreadRegistry;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
+use super::coordinator_lifecycle::CoordinatorLifecycle;
 use super::WalletWorker;
 
 use dpp::balances::credits::TokenAmount;
@@ -161,24 +159,14 @@ where
     /// over `P` so every `persister.store(...)` call on the hot sync
     /// loop dispatches statically.
     persister: Arc<P>,
-    /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
-    /// `quiesce` delegate to it under the [`WalletWorker::IdentitySync`]
-    /// key; it owns the loop's cancel token, OS-thread join handle, the
-    /// restart reap-or-park, and the orphan list.
-    registry: Arc<ThreadRegistry<WalletWorker>>,
-    interval_secs: AtomicU64,
-    is_syncing: AtomicBool,
-    /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
-    /// drains an in-flight one. `sync_now` bails (after taking the
-    /// `is_syncing` slot) when this is set, so once `quiesce` observes
-    /// `is_syncing == false` no further pass can start — giving shutdown
-    /// a real "no more host-visible persister stores" barrier that
+    /// Shared lifecycle state + pass-gating protocol under the
+    /// [`WalletWorker::IdentitySync`] key: the registry handle, polling
+    /// interval, the `is_syncing` / `quiescing` handshake, and the
+    /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the
+    /// `sync_now` pass gate delegate to it. The `quiescing` half gives
+    /// shutdown a real "no more host-visible persister stores" barrier that
     /// cancel-only [`stop`](Self::stop) does not provide.
-    quiescing: AtomicBool,
-    /// Unix seconds of the last completed pass across all identities.
-    /// `0` = never. Identity-level timestamps live on the per-identity
-    /// rows in [`IdentitySyncManager::state`].
-    last_sync_unix: AtomicU64,
+    lifecycle: CoordinatorLifecycle,
     /// Per-identity registry / cache. Keyed by identity id; each row
     /// carries the per-(identity, token) token rows plus the
     /// per-identity last-sync timestamp.
@@ -209,11 +197,11 @@ where
         Self {
             sdk,
             persister,
-            registry,
-            interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
-            is_syncing: AtomicBool::new(false),
-            quiescing: AtomicBool::new(false),
-            last_sync_unix: AtomicU64::new(0),
+            lifecycle: CoordinatorLifecycle::new(
+                registry,
+                WalletWorker::IdentitySync,
+                DEFAULT_SYNC_INTERVAL_SECS,
+            ),
             state: RwLock::new(BTreeMap::new()),
         }
     }
@@ -314,47 +302,28 @@ where
     ///
     /// The running loop picks this up on its next sleep.
     pub fn set_interval(&self, interval: Duration) {
-        let secs = interval.as_secs().max(1);
-        self.interval_secs.store(secs, Ordering::Release);
+        self.lifecycle.set_interval(interval);
     }
 
     /// Current polling interval.
     pub fn interval(&self) -> Duration {
-        Duration::from_secs(self.interval_secs.load(Ordering::Acquire))
+        self.lifecycle.interval()
     }
 
     /// Whether the background loop is currently running.
     pub fn is_running(&self) -> bool {
-        self.registry.is_running(WalletWorker::IdentitySync)
-    }
-
-    /// The drain barrier handed to the registry: raise the `quiescing`
-    /// gate so any pass past its `is_syncing` CAS bails. The registry then
-    /// cancels the loop and joins the thread (the join waits for the
-    /// in-flight pass to drop and `is_syncing` to clear), so the barrier
-    /// itself is instant and never blocks teardown.
-    fn drain_hook(self: &Arc<Self>) -> DrainHook {
-        let this = Arc::clone(self);
-        Arc::new(move || {
-            let this = Arc::clone(&this);
-            Box::pin(async move {
-                this.quiescing.store(true, Ordering::Release);
-            })
-        })
+        self.lifecycle.is_running()
     }
 
     /// Whether a sync pass is in flight right now.
     pub fn is_syncing(&self) -> bool {
-        self.is_syncing.load(Ordering::Acquire)
+        self.lifecycle.is_syncing()
     }
 
     /// Unix seconds of the last completed pass (across all identities),
     /// or `None` if no pass has ever completed.
     pub fn last_sync_unix_seconds(&self) -> Option<u64> {
-        match self.last_sync_unix.load(Ordering::Acquire) {
-            0 => None,
-            n => Some(n),
-        }
+        self.lifecycle.last_sync_unix_seconds()
     }
 
     /// Per-identity last-sync timestamp.
@@ -414,13 +383,9 @@ where
     pub fn start(self: Arc<Self>) {
         // Reopen the quiescing gate so this (re)start's passes can run; a
         // prior quiesce raised it via the drain hook.
-        self.quiescing.store(false, Ordering::Release);
+        self.lifecycle.reopen_quiescing_gate();
 
-        let cfg = WorkerConfig {
-            weight: super::COORDINATOR_WEIGHT,
-            join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
-            drain: Some(self.drain_hook()),
-        };
+        let cfg = self.lifecycle.worker_config();
 
         // The loop drives `!Send` SDK futures via `Handle::block_on` on a
         // dedicated OS thread (the registry spawns it). The handle is
@@ -431,8 +396,9 @@ where
         // the join lands inside the budget.
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
-        self.registry
-            .start_thread(WalletWorker::IdentitySync, cfg, move |cancel| {
+        self.lifecycle
+            .registry()
+            .start_thread(self.lifecycle.worker(), cfg, move |cancel| {
                 handle.block_on(async move {
                     loop {
                         if cancel.is_cancelled() {
@@ -463,7 +429,7 @@ where
     /// by manager shutdown so the host can free the persister context —
     /// use [`quiesce`](Self::quiesce).
     pub fn stop(&self) {
-        self.registry.cancel(WalletWorker::IdentitySync);
+        self.lifecycle.stop();
     }
 
     /// Cancel the background loop **and wait for any in-flight sync pass
@@ -491,17 +457,7 @@ where
     /// the `!Send` loop has stopped touching `tokio::time` before a
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
-        // RAII gate: reopen `quiescing` on *every* exit path — normal
-        // return, a dropped future, or a panic. The registry's drain hook
-        // raises it inside `quiesce` below; without this reset a quiesce
-        // that doesn't complete would leave the gate latched and silently
-        // bail every future pass. Reopening is safe because the loop has
-        // been cancelled, so no new pass can start.
-        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.registry
-            .quiesce(WalletWorker::IdentitySync)
-            .await
-            .into()
+        self.lifecycle.quiesce().await
     }
 
     /// Run one sync pass across every registered identity.
@@ -515,27 +471,13 @@ where
     /// `!Send` (no `tokio::spawn`) and because the design brief
     /// explicitly forbids it.
     pub async fn sync_now(&self) {
-        if self
-            .is_syncing
-            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
-            .is_err()
-        {
+        // Claim the pass slot and honour the quiescing gate; bail without
+        // work (and without a `persister.store(...)` after quiesce returns)
+        // if a pass is already in flight or a teardown raised the gate. The
+        // returned guard clears `is_syncing` on every exit path.
+        let Some(_pass) = self.lifecycle.begin_pass() else {
             return;
-        }
-
-        // RAII guard: clears `is_syncing` on every exit path, including
-        // panics. Without this a panic inside the pass would leave
-        // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
-        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
-
-        // A `quiesce()` may have raised the gate between our CAS and
-        // here; if so, bail without running a pass so the drain can
-        // complete and shutdown gets a true barrier (no further
-        // `persister.store(...)` after quiesce returns).
-        // Guard clears `is_syncing` on return.
-        if self.quiescing.load(Ordering::Acquire) {
-            return;
-        }
+        };
 
         // Snapshot the per-identity watch list under a short read
         // lock and release it before any network call. We keep
@@ -558,8 +500,8 @@ where
             .duration_since(UNIX_EPOCH)
             .map(|d| d.as_secs())
             .unwrap_or(0);
-        self.last_sync_unix.store(now, Ordering::Release);
-        // `_is_syncing_guard` drops here → `is_syncing = false`
+        self.lifecycle.store_last_sync_unix(now);
+        // `_pass` drops here → `is_syncing = false`
     }
 
     /// Sync a single identity's watched tokens against Platform.
@@ -700,7 +642,7 @@ where
         f.debug_struct("IdentitySyncManager")
             .field("is_running", &self.is_running())
             .field("is_syncing", &self.is_syncing())
-            .field("interval_secs", &self.interval_secs.load(Ordering::Acquire))
+            .field("interval_secs", &self.lifecycle.interval_secs())
             .field("last_sync_unix", &self.last_sync_unix_seconds())
             .finish()
     }
@@ -924,8 +866,8 @@ mod tests {
         let token_x = Identifier::from([10u8; 32]);
         mgr.register_identity(id_a, [token_x]).await;
 
-        // Raise the gate as `quiesce()` would.
-        mgr.quiescing.store(true, Ordering::Release);
+        // Raise the gate as `quiesce()` would, held across the pass.
+        let _gate = mgr.lifecycle.hold_quiescing_gate();
 
         mgr.sync_now().await;
 
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 51e14c2524..840dd13c7a 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -1,6 +1,7 @@
 //! Multi-wallet manager with SPV coordination.
 
 pub mod accessors;
+mod coordinator_lifecycle;
 pub mod identity_sync;
 mod load;
 pub mod platform_address_sync;
diff --git a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
index e68fcfef7c..5cb15b048e 100644
--- a/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/platform_address_sync.rs
@@ -9,18 +9,16 @@
 //! wallets are registered and the SPV runtime is up.
 
 use std::collections::BTreeMap;
-use std::sync::{
-    atomic::{AtomicBool, AtomicU64, Ordering},
-    Arc,
-};
+use std::sync::Arc;
 
-use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
+use dash_async::ThreadRegistry;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use arc_swap::ArcSwapOption;
 use dash_sdk::platform::address_sync::{AddressSyncConfig, AddressSyncResult};
 use key_wallet::PlatformP2PKHAddress;
 
+use super::coordinator_lifecycle::CoordinatorLifecycle;
 use super::WalletWorker;
 use crate::wallet::PlatformAddressTag;
 use tokio::sync::RwLock;
@@ -97,21 +95,14 @@ impl PlatformAddressSyncSummary {
 pub struct PlatformAddressSyncManager {
     wallets: Arc<RwLock<BTreeMap<WalletId, Arc<PlatformWallet>>>>,
     event_manager: Arc<PlatformEventManager>,
-    /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
-    /// `quiesce` delegate to it under the
-    /// [`WalletWorker::PlatformAddressSync`] key.
-    registry: Arc<ThreadRegistry<WalletWorker>>,
-    interval_secs: AtomicU64,
-    is_syncing: AtomicBool,
-    /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
-    /// drains an in-flight one. `sync_now` bails (after taking the
-    /// `is_syncing` slot) when this is set, so once `quiesce` observes
-    /// `is_syncing == false` no further pass can start — giving shutdown
-    /// a real "no more host-visible sync-completed callbacks" barrier
-    /// that cancel-only [`stop`](Self::stop) does not provide.
-    quiescing: AtomicBool,
-    /// Unix seconds of the last completed pass. `0` = never.
-    last_sync_unix: AtomicU64,
+    /// Shared lifecycle state + pass-gating protocol under the
+    /// [`WalletWorker::PlatformAddressSync`] key: registry handle, polling
+    /// interval, the `is_syncing` / `quiescing` handshake, and the
+    /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the
+    /// `sync_now` pass gate delegate to it. The `quiescing` half gives
+    /// shutdown a real "no more host-visible sync-completed callbacks"
+    /// barrier that cancel-only [`stop`](Self::stop) does not provide.
+    lifecycle: CoordinatorLifecycle,
     /// Shared config applied uniformly across wallets and accounts.
     ///
     /// `ArcSwapOption` instead of a mutex because writes are rare
@@ -129,11 +120,11 @@ impl PlatformAddressSyncManager {
         Self {
             wallets,
             event_manager,
-            registry,
-            interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
-            is_syncing: AtomicBool::new(false),
-            quiescing: AtomicBool::new(false),
-            last_sync_unix: AtomicU64::new(0),
+            lifecycle: CoordinatorLifecycle::new(
+                registry,
+                WalletWorker::PlatformAddressSync,
+                DEFAULT_SYNC_INTERVAL_SECS,
+            ),
             config: ArcSwapOption::empty(),
         }
     }
@@ -142,13 +133,12 @@ impl PlatformAddressSyncManager {
     ///
     /// The running loop picks this up on its next sleep.
     pub fn set_interval(&self, interval: Duration) {
-        let secs = interval.as_secs().max(1);
-        self.interval_secs.store(secs, Ordering::Release);
+        self.lifecycle.set_interval(interval);
     }
 
     /// Current polling interval.
     pub fn interval(&self) -> Duration {
-        Duration::from_secs(self.interval_secs.load(Ordering::Acquire))
+        self.lifecycle.interval()
     }
 
     /// Replace the shared [`AddressSyncConfig`] used on every pass.
@@ -165,36 +155,18 @@ impl PlatformAddressSyncManager {
 
     /// Whether the background loop is currently running.
     pub fn is_running(&self) -> bool {
-        self.registry.is_running(WalletWorker::PlatformAddressSync)
-    }
-
-    /// The drain barrier handed to the registry: raise the `quiescing`
-    /// gate so any pass past its `is_syncing` CAS bails. The registry then
-    /// cancels the loop and joins the thread (the join waits for the
-    /// in-flight pass — incl. its completion-event dispatch — to drop and
-    /// `is_syncing` to clear), so this barrier is instant.
-    fn drain_hook(self: &Arc<Self>) -> DrainHook {
-        let this = Arc::clone(self);
-        Arc::new(move || {
-            let this = Arc::clone(&this);
-            Box::pin(async move {
-                this.quiescing.store(true, Ordering::Release);
-            })
-        })
+        self.lifecycle.is_running()
     }
 
     /// Whether a sync pass is in flight right now.
     pub fn is_syncing(&self) -> bool {
-        self.is_syncing.load(Ordering::Acquire)
+        self.lifecycle.is_syncing()
     }
 
     /// Unix seconds of the last completed pass, or `None` if no pass
     /// has ever completed.
     pub fn last_sync_unix_seconds(&self) -> Option<u64> {
-        match self.last_sync_unix.load(Ordering::Acquire) {
-            0 => None,
-            n => Some(n),
-        }
+        self.lifecycle.last_sync_unix_seconds()
     }
 
     /// Start the background sync loop. Idempotent — calling while
@@ -213,13 +185,9 @@ impl PlatformAddressSyncManager {
     /// [`interval`](Self::interval).
     pub fn start(self: Arc<Self>) {
         // Reopen the quiescing gate so this (re)start's passes can run.
-        self.quiescing.store(false, Ordering::Release);
+        self.lifecycle.reopen_quiescing_gate();
 
-        let cfg = WorkerConfig {
-            weight: super::COORDINATOR_WEIGHT,
-            join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
-            drain: Some(self.drain_hook()),
-        };
+        let cfg = self.lifecycle.worker_config();
 
         // The loop drives `!Send` SDK futures via `Handle::block_on` on a
         // dedicated OS thread (spawned by the registry). `biased` polls the
@@ -227,8 +195,9 @@ impl PlatformAddressSyncManager {
         // at its `.await` the instant the registry cancels.
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
-        self.registry
-            .start_thread(WalletWorker::PlatformAddressSync, cfg, move |cancel| {
+        self.lifecycle
+            .registry()
+            .start_thread(self.lifecycle.worker(), cfg, move |cancel| {
                 handle.block_on(async move {
                     loop {
                         if cancel.is_cancelled() {
@@ -260,7 +229,7 @@ impl PlatformAddressSyncManager {
     /// the host can free the event-handler context — use
     /// [`quiesce`](Self::quiesce).
     pub fn stop(&self) {
-        self.registry.cancel(WalletWorker::PlatformAddressSync);
+        self.lifecycle.stop();
     }
 
     /// Cancel the background loop **and wait for any in-flight sync pass
@@ -289,14 +258,7 @@ impl PlatformAddressSyncManager {
     /// the `!Send` loop has stopped touching `tokio::time` before a
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
-        // RAII gate: reopen `quiescing` on every exit path. The registry's
-        // drain hook raises it inside `quiesce`; reopening on return is
-        // safe because the loop has been cancelled, so no new pass starts.
-        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.registry
-            .quiesce(WalletWorker::PlatformAddressSync)
-            .await
-            .into()
+        self.lifecycle.quiesce().await
     }
 
     /// Run one sync pass across every registered wallet.
@@ -304,27 +266,13 @@ impl PlatformAddressSyncManager {
     /// If a pass is already in flight, returns an empty summary and
     /// skips — the caller can inspect [`is_syncing`] to distinguish.
     pub async fn sync_now(&self) -> PlatformAddressSyncSummary {
-        if self
-            .is_syncing
-            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
-            .is_err()
-        {
+        // Claim the pass slot and honour the quiescing gate; bail with an
+        // empty summary (and without a host completion callback after
+        // quiesce returns) if a pass is already in flight or a teardown
+        // raised the gate. The guard clears `is_syncing` on every exit path.
+        let Some(_pass) = self.lifecycle.begin_pass() else {
             return PlatformAddressSyncSummary::default();
-        }
-
-        // RAII guard: clears `is_syncing` on every exit path, including
-        // panics. Without this a panic inside the pass would leave
-        // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
-        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
-
-        // A `quiesce()` may have raised the gate between our CAS and
-        // here; if so, bail without running a pass so the drain can
-        // complete and shutdown gets a true barrier (no further
-        // `on_platform_address_sync_completed` host callback after
-        // quiesce returns). Guard clears `is_syncing` on return.
-        if self.quiescing.load(Ordering::Acquire) {
-            return PlatformAddressSyncSummary::default();
-        }
+        };
 
         let snapshot: Vec<(WalletId, Arc<PlatformWallet>)> = {
             let wallets = self.wallets.read().await;
@@ -354,9 +302,9 @@ impl PlatformAddressSyncManager {
             .map(|d| d.as_secs())
             .unwrap_or(0);
         summary.sync_unix_seconds = now;
-        self.last_sync_unix.store(now, Ordering::Release);
+        self.lifecycle.store_last_sync_unix(now);
 
-        // Dispatch the completion event BEFORE `_is_syncing_guard` drops.
+        // Dispatch the completion event BEFORE the `_pass` guard drops.
         // `quiesce()` drains on the falling edge of `is_syncing`; if the
         // guard cleared the flag before the dispatch a shutdown caller
         // could unblock and free the host event-handler context while
@@ -367,7 +315,7 @@ impl PlatformAddressSyncManager {
             .on_platform_address_sync_completed(&summary);
 
         summary
-        // `_is_syncing_guard` drops here → `is_syncing = false`
+        // `_pass` drops here → `is_syncing = false`
     }
 
     /// Sync a single wallet on demand. Does not set the global
@@ -395,7 +343,7 @@ impl std::fmt::Debug for PlatformAddressSyncManager {
         f.debug_struct("PlatformAddressSyncManager")
             .field("is_running", &self.is_running())
             .field("is_syncing", &self.is_syncing())
-            .field("interval_secs", &self.interval_secs.load(Ordering::Acquire))
+            .field("interval_secs", &self.lifecycle.interval_secs())
             .field("last_sync_unix", &self.last_sync_unix_seconds())
             .finish()
     }
@@ -474,8 +422,8 @@ mod tests {
     async fn sync_now_bails_when_quiescing() {
         let (mgr, counter) = make_manager();
 
-        // Raise the gate as `quiesce()` would.
-        mgr.quiescing.store(true, Ordering::Release);
+        // Raise the gate as `quiesce()` would, held across the pass.
+        let _gate = mgr.lifecycle.hold_quiescing_gate();
 
         let summary = mgr.sync_now().await;
 
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index ea0a0566f9..6a66e30ba6 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -26,16 +26,14 @@
 //! [`configure_shielded`]: crate::manager::PlatformWalletManager::configure_shielded
 
 use std::collections::BTreeMap;
-use std::sync::{
-    atomic::{AtomicBool, AtomicU64, Ordering},
-    Arc,
-};
+use std::sync::Arc;
 
-use dash_async::{AtomicFlagGuard, DrainHook, ThreadRegistry, WorkerConfig};
+use dash_async::{AtomicFlagGuard, ThreadRegistry};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
 use tokio::sync::RwLock;
 
+use super::coordinator_lifecycle::CoordinatorLifecycle;
 use super::WalletWorker;
 use crate::events::PlatformEventManager;
 use crate::wallet::platform_wallet::WalletId;
@@ -141,21 +139,14 @@ pub struct ShieldedSyncManager {
     /// run first, so an empty slot guarantees no shielded state
     /// exists).
     coordinator_slot: Arc<RwLock<Option<Arc<NetworkShieldedCoordinator>>>>,
-    /// Shared worker-lifecycle engine. `start` / `stop` / `is_running` /
-    /// `quiesce` delegate to it under the [`WalletWorker::ShieldedSync`]
-    /// key.
-    registry: Arc<ThreadRegistry<WalletWorker>>,
-    interval_secs: AtomicU64,
-    is_syncing: AtomicBool,
-    /// Set by [`quiesce`](Self::quiesce) to gate new passes while it
-    /// drains an in-flight one. `sync_now` / `sync_wallet` bail (after
-    /// taking the `is_syncing` slot) when this is set, so once `quiesce`
-    /// observes `is_syncing == false` no further pass can start — giving
-    /// Clear / stop a real "no more host-visible mutations" barrier that
-    /// cancel-only [`stop`](Self::stop) does not provide.
-    quiescing: AtomicBool,
-    /// Unix seconds of the last completed pass. `0` = never.
-    last_sync_unix: AtomicU64,
+    /// Shared lifecycle state + pass-gating protocol under the
+    /// [`WalletWorker::ShieldedSync`] key: registry handle, polling
+    /// interval, the `is_syncing` / `quiescing` handshake, and the
+    /// last-sync stamp. `start` / `stop` / `is_running` / `quiesce` and the
+    /// `sync_now` / `sync_wallet` pass gate delegate to it. The `quiescing`
+    /// half gives Clear / stop a real "no more host-visible mutations"
+    /// barrier that cancel-only [`stop`](Self::stop) does not provide.
+    lifecycle: CoordinatorLifecycle,
 }
 
 impl ShieldedSyncManager {
@@ -167,11 +158,11 @@ impl ShieldedSyncManager {
         Self {
             event_manager,
             coordinator_slot,
-            registry,
-            interval_secs: AtomicU64::new(DEFAULT_SYNC_INTERVAL_SECS),
-            is_syncing: AtomicBool::new(false),
-            quiescing: AtomicBool::new(false),
-            last_sync_unix: AtomicU64::new(0),
+            lifecycle: CoordinatorLifecycle::new(
+                registry,
+                WalletWorker::ShieldedSync,
+                DEFAULT_SYNC_INTERVAL_SECS,
+            ),
         }
     }
 
@@ -179,47 +170,28 @@ impl ShieldedSyncManager {
     ///
     /// The running loop picks this up on its next sleep.
     pub fn set_interval(&self, interval: Duration) {
-        let secs = interval.as_secs().max(1);
-        self.interval_secs.store(secs, Ordering::Release);
+        self.lifecycle.set_interval(interval);
     }
 
     /// Current polling interval.
     pub fn interval(&self) -> Duration {
-        Duration::from_secs(self.interval_secs.load(Ordering::Acquire))
+        self.lifecycle.interval()
     }
 
     /// Whether the background loop is currently running.
     pub fn is_running(&self) -> bool {
-        self.registry.is_running(WalletWorker::ShieldedSync)
-    }
-
-    /// The drain barrier handed to the registry: raise the `quiescing`
-    /// gate so any pass past its `is_syncing` CAS bails. The registry then
-    /// cancels the loop and joins the thread (the join waits for the
-    /// in-flight pass — incl. its persister fan-out — to drop and
-    /// `is_syncing` to clear), so this barrier is instant.
-    fn drain_hook(self: &Arc<Self>) -> DrainHook {
-        let this = Arc::clone(self);
-        Arc::new(move || {
-            let this = Arc::clone(&this);
-            Box::pin(async move {
-                this.quiescing.store(true, Ordering::Release);
-            })
-        })
+        self.lifecycle.is_running()
     }
 
     /// Whether a sync pass is in flight right now.
     pub fn is_syncing(&self) -> bool {
-        self.is_syncing.load(Ordering::Acquire)
+        self.lifecycle.is_syncing()
     }
 
     /// Unix seconds of the last completed pass, or `None` if no pass
     /// has ever completed.
     pub fn last_sync_unix_seconds(&self) -> Option<u64> {
-        match self.last_sync_unix.load(Ordering::Acquire) {
-            0 => None,
-            n => Some(n),
-        }
+        self.lifecycle.last_sync_unix_seconds()
     }
 
     /// Start the background sync loop. Idempotent — calling while
@@ -231,13 +203,9 @@ impl ShieldedSyncManager {
     /// [`PlatformAddressSyncManager::start`](super::platform_address_sync::PlatformAddressSyncManager::start).
     pub fn start(self: Arc<Self>) {
         // Reopen the quiescing gate so this (re)start's passes can run.
-        self.quiescing.store(false, Ordering::Release);
+        self.lifecycle.reopen_quiescing_gate();
 
-        let cfg = WorkerConfig {
-            weight: super::COORDINATOR_WEIGHT,
-            join_budget: Duration::from_secs(super::SHUTDOWN_JOIN_TIMEOUT_SECS),
-            drain: Some(self.drain_hook()),
-        };
+        let cfg = self.lifecycle.worker_config();
 
         // The loop drives `!Send` SDK futures via `Handle::block_on` on a
         // dedicated OS thread (spawned by the registry). The background
@@ -247,8 +215,9 @@ impl ShieldedSyncManager {
         // SDK fetch is dropped the instant the registry cancels.
         let handle = tokio::runtime::Handle::current();
         let this = Arc::clone(&self);
-        self.registry
-            .start_thread(WalletWorker::ShieldedSync, cfg, move |cancel| {
+        self.lifecycle
+            .registry()
+            .start_thread(self.lifecycle.worker(), cfg, move |cancel| {
                 handle.block_on(async move {
                     loop {
                         if cancel.is_cancelled() {
@@ -279,7 +248,7 @@ impl ShieldedSyncManager {
     /// nothing more will be persisted" barrier — required by Clear,
     /// unregister, and rebind — use [`quiesce`](Self::quiesce).
     pub fn stop(&self) {
-        self.registry.cancel(WalletWorker::ShieldedSync);
+        self.lifecycle.stop();
     }
 
     /// Cancel the background loop **and wait for any in-flight sync pass
@@ -306,14 +275,7 @@ impl ShieldedSyncManager {
     /// the `!Send` loop has stopped touching `tokio::time` before a
     /// one-shot host drops the runtime.
     pub async fn quiesce(&self) -> super::CoordinatorThreadStatus {
-        // RAII gate: reopen `quiescing` on every exit path. The registry's
-        // drain hook raises it inside `quiesce`; reopening on return is
-        // safe because the loop has been cancelled, so no new pass starts.
-        let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.registry
-            .quiesce(WalletWorker::ShieldedSync)
-            .await
-            .into()
+        self.lifecycle.quiesce().await
     }
 
     /// Raise the `quiescing` gate and hold it raised until the returned
@@ -322,8 +284,7 @@ impl ShieldedSyncManager {
     /// direct `sync_now` / `sync_wallet` passes off across a check-then-wipe
     /// so the "no new pass" guarantee does not lapse between the two steps.
     pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> {
-        self.quiescing.store(true, Ordering::Release);
-        AtomicFlagGuard::new(&self.quiescing)
+        self.lifecycle.hold_quiescing_gate()
     }
 
     /// Run one sync pass across every registered wallet.
@@ -338,25 +299,13 @@ impl ShieldedSyncManager {
     /// If a pass is already in flight, returns an empty summary and
     /// skips — the caller can inspect [`is_syncing`] to distinguish.
     pub async fn sync_now(&self, force: bool) -> ShieldedSyncPassSummary {
-        if self
-            .is_syncing
-            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
-            .is_err()
-        {
-            return ShieldedSyncPassSummary::default();
-        }
-
-        // RAII guard: clears `is_syncing` on every exit path, including
-        // panics. Without this a panic inside the pass would leave
-        // `is_syncing=true` forever and wedge `quiesce()`'s drain loop.
-        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
-
-        // A `quiesce()` may have raised the gate between our CAS and
-        // here; bail so the drain can complete and Clear/stop get a
-        // true barrier. Guard clears `is_syncing` on return.
-        if self.quiescing.load(Ordering::Acquire) {
+        // Claim the pass slot and honour the quiescing gate; bail with an
+        // empty summary if a pass is already in flight or a teardown
+        // (Clear/stop) raised the gate. The guard clears `is_syncing` on
+        // every exit path.
+        let Some(_pass) = self.lifecycle.begin_pass() else {
             return ShieldedSyncPassSummary::default();
-        }
+        };
 
         // Snapshot the coordinator Arc and release the slot lock
         // before awaiting so a concurrent `configure_shielded`
@@ -388,10 +337,10 @@ impl ShieldedSyncManager {
         if summary.sync_unix_seconds == 0 {
             summary.sync_unix_seconds = now;
         }
-        self.last_sync_unix
-            .store(summary.sync_unix_seconds, Ordering::Release);
+        self.lifecycle
+            .store_last_sync_unix(summary.sync_unix_seconds);
 
-        // Dispatch the completion event BEFORE `_is_syncing_guard` drops.
+        // Dispatch the completion event BEFORE the `_pass` guard drops.
         // `quiesce()` drains on the falling edge of `is_syncing`; if
         // the guard cleared the flag before the dispatch a stop/clear
         // caller could unblock while the callback is still pending —
@@ -399,7 +348,7 @@ impl ShieldedSyncManager {
         self.event_manager.on_shielded_sync_completed(&summary);
 
         summary
-        // `_is_syncing_guard` drops here → `is_syncing = false`
+        // `_pass` drops here → `is_syncing = false`
     }
 
     /// Sync a single wallet on demand.
@@ -430,26 +379,14 @@ impl ShieldedSyncManager {
         };
 
         // Reuse the manager-wide `is_syncing` flag so a per-wallet
-        // `sync_wallet()` can't race the periodic `sync_now()`
-        // against the same store — both go through
-        // `coordinator.sync()`, which serializes per-coordinator
-        // but the manager flag is what the host UI watches.
-        if self
-            .is_syncing
-            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
-            .is_err()
-        {
+        // `sync_wallet()` can't race the periodic `sync_now()` against the
+        // same store — both go through `coordinator.sync()`, which
+        // serializes per-coordinator, but the manager flag is what the host
+        // UI watches. Bail (Ok(None)) if a pass is already in flight or a
+        // teardown raised the quiescing gate.
+        let Some(_pass) = self.lifecycle.begin_pass() else {
             return Ok(None);
-        }
-
-        // RAII guard clears `is_syncing` on every exit path including panics.
-        let _is_syncing_guard = AtomicFlagGuard::new(&self.is_syncing);
-
-        // Bail if a `quiesce()` raised the gate after our CAS (see
-        // `sync_now`) so the drain barrier holds.
-        if self.quiescing.load(Ordering::Acquire) {
-            return Ok(None);
-        }
+        };
 
         let pass = coordinator.sync(force).await;
 
@@ -476,7 +413,7 @@ impl std::fmt::Debug for ShieldedSyncManager {
         f.debug_struct("ShieldedSyncManager")
             .field("is_running", &self.is_running())
             .field("is_syncing", &self.is_syncing())
-            .field("interval_secs", &self.interval_secs.load(Ordering::Acquire))
+            .field("interval_secs", &self.lifecycle.interval_secs())
             .field("last_sync_unix", &self.last_sync_unix_seconds())
             .finish()
     }

From 22647a7fde77c5ed0c71d58d70971ecc96e3c801 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:34:47 +0200
Subject: [PATCH 23/29] fix(platform-wallet): raise quiescing gate in
 CoordinatorLifecycle::quiesce regardless of a running loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

registry.quiesce early-returns NotRunning WITHOUT running the DrainHook when no background-loop slot is registered, so CoordinatorLifecycle::quiesce left the quiescing gate down and never drained an in-flight pass. A concurrent direct sync_now/sync_wallet was therefore neither held off at begin_pass() nor, if it had already cleared begin_pass(), waited for — breaking the clear_shielded/stop contract.

Fix: quiesce now raises quiescing itself (gate-before-cancel preserved; the AtomicFlagGuard still reopens it on return), then — after the registry's bounded cancel+join — drains is_syncing when the status is clean. Draining only on a clean status keeps a wedged loop pass (reported Timeout, its thread orphaned) from reintroducing the shutdown stall the bounded join exists to prevent, while still covering the no-loop and idle-loop+direct-pass cases. TDD: new test fails against the pre-fix delegating quiesce.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/manager/coordinator_lifecycle.rs      | 135 +++++++++++++++++-
 1 file changed, 132 insertions(+), 3 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
index 440c67c676..84a6c02ed9 100644
--- a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
+++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
@@ -141,11 +141,57 @@ impl CoordinatorLifecycle {
 
     /// Cancel the loop, drain any in-flight pass, and join the worker,
     /// returning its terminal status. Reopens the `quiescing` gate on every
-    /// exit path (the registry's drain hook raised it; reopening is safe
-    /// because the loop has been cancelled, so no new pass starts).
+    /// exit path (the gate is reset by the guard; reopening is safe because
+    /// the loop has been cancelled, so no new pass starts).
+    ///
+    /// The gate is raised **here**, not left to the registry's drain hook:
+    /// `registry.quiesce` early-returns `NotRunning` without running the
+    /// hook when no background-loop slot is registered, so a coordinator
+    /// with only direct `sync_now`/`sync_wallet` traffic (no running loop)
+    /// would never see the gate go up — and a direct pass landing
+    /// concurrently would slip past the barrier `clear_shielded`/`stop`
+    /// promise. Raising it ourselves makes the "no new pass" gate hold
+    /// regardless of whether a loop is registered, and preserves
+    /// gate-before-cancel: it is up before `registry.quiesce` issues any
+    /// cancel.
     pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus {
+        // Gate up first (instant) and held until the guard drops on return.
+        self.quiescing.store(true, Ordering::Release);
         let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
-        self.registry.quiesce(self.worker).await.into()
+
+        // Cancel + bounded join of the background loop (if any). A wedged
+        // loop pass surfaces here as a non-clean `Timeout` rather than
+        // hanging — its orphaned thread is tracked by the registry for
+        // teardown, so we must not wait on it below.
+        let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into();
+
+        // Drain a *direct* in-flight pass the registry could not: with no
+        // loop slot, `registry.quiesce` returned `NotRunning` without
+        // joining anything; with an idle loop it joined a thread that was
+        // not the one holding `is_syncing`. Either way a `sync_now`/
+        // `sync_wallet` that entered before the gate rose may still be in
+        // flight. The gate keeps a new pass from starting, so this
+        // converges, and a panicked pass clears the flag via its own RAII
+        // guard. Only drain on a clean status: a non-clean one means a
+        // wedged loop pass is the `is_syncing` holder (its thread was
+        // orphaned, not joined), and waiting on it would reintroduce the
+        // shutdown stall the registry's bounded join exists to prevent.
+        if status.is_clean() {
+            self.drain_in_flight_pass().await;
+        }
+
+        status
+    }
+
+    /// Poll until no sync pass holds `is_syncing`. Only sound to call with
+    /// the `quiescing` gate already raised (so no new pass can start) and
+    /// after the background loop has been cancel-joined (so the only
+    /// possible holder is a direct, non-cancellable pass running to
+    /// completion). Mirrors the registry's 5ms poll cadence.
+    async fn drain_in_flight_pass(&self) {
+        while self.is_syncing.load(Ordering::Acquire) {
+            tokio::time::sleep(Duration::from_millis(5)).await;
+        }
     }
 
     /// Raise the `quiescing` gate and hold it raised until the returned
@@ -191,3 +237,86 @@ impl CoordinatorLifecycle {
         Some(guard)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tokio::sync::oneshot;
+
+    fn make_lifecycle() -> Arc<CoordinatorLifecycle> {
+        let registry = ThreadRegistry::<WalletWorker>::new();
+        Arc::new(CoordinatorLifecycle::new(
+            registry,
+            WalletWorker::IdentitySync,
+            60,
+        ))
+    }
+
+    /// With NO background loop registered, `quiesce` must still raise the
+    /// `quiescing` gate (so a direct `sync_now`/`sync_wallet` arriving once
+    /// the gate is up bails out) and drain an already-in-flight direct
+    /// pass before returning. The registry's drain hook cannot cover this:
+    /// `registry.quiesce` early-returns `NotRunning` WITHOUT running the
+    /// hook when no loop slot exists, so the gate would otherwise never go
+    /// up and the in-flight pass would not be drained. Regression for the
+    /// `clear_shielded`/`stop` contract ("a concurrent direct
+    /// sync_now/sync_wallet is held off"). Must fail against the pre-fix
+    /// `quiesce` that only delegated to the registry.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn quiesce_raises_gate_and_drains_direct_pass_without_background_loop() {
+        let lifecycle = make_lifecycle();
+        assert!(
+            !lifecycle.is_running(),
+            "precondition: no background loop registered"
+        );
+
+        // A direct sync_now/sync_wallet pass already past `begin_pass`, held
+        // in flight on a task until we release it.
+        let (ready_tx, ready_rx) = oneshot::channel::<()>();
+        let (release_tx, release_rx) = oneshot::channel::<()>();
+        let lc_pass = Arc::clone(&lifecycle);
+        let pass_task = tokio::spawn(async move {
+            let _pass = lc_pass.begin_pass().expect("first pass enters the slot");
+            ready_tx.send(()).expect("signal in-flight");
+            release_rx.await.expect("await release");
+            // `_pass` drops here → is_syncing = false
+        });
+
+        ready_rx.await.expect("pass reached in-flight");
+        assert!(lifecycle.is_syncing(), "direct pass holds is_syncing");
+
+        // Drive `quiesce` concurrently: it must raise the gate, then block
+        // draining the in-flight pass.
+        let lc_q = Arc::clone(&lifecycle);
+        let quiesce_task = tokio::spawn(async move { lc_q.quiesce().await });
+
+        // Give `quiesce` time to raise the gate and enter the drain.
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        assert!(
+            lifecycle.quiescing.load(Ordering::Acquire),
+            "quiesce must raise the gate even with no background loop registered"
+        );
+        assert!(
+            lifecycle.is_syncing(),
+            "in-flight direct pass still held; quiesce has not skipped the drain"
+        );
+        assert!(
+            !quiesce_task.is_finished(),
+            "quiesce must block until the in-flight pass drains"
+        );
+
+        // Release the pass; `quiesce` drains `is_syncing`, then returns.
+        release_tx.send(()).expect("release the pass");
+        let status = tokio::time::timeout(Duration::from_secs(2), quiesce_task)
+            .await
+            .expect("quiesce completes once the pass drains")
+            .expect("quiesce task joined");
+        assert_eq!(status, CoordinatorThreadStatus::NotRunning);
+        assert!(
+            !lifecycle.is_syncing(),
+            "is_syncing was drained before quiesce returned"
+        );
+
+        pass_task.await.expect("pass task joined");
+    }
+}

From 7f3aeb59f8e90085ed5f1921a60e8dc6b2664fd4 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:34:56 +0200
Subject: [PATCH 24/29] fix(dash-async): park a restarted worker's prior under
 the slot lock so shutdown can't miss it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

start_thread took the prior handle out of the slot and ran reap_prior_or_park (a ~1s backstop spin) OUT of the slot lock. A concurrent shutdown() could take the lock, latch closing, snapshot tiers seeing only the NEW handle, release, and reap an EMPTY orphan list — reporting clean while the wedged prior was still live and un-joined.

Fix: park the prior into orphans UNDER the slot lock (park_prior_locked), making take-prior + park-prior atomic from shutdown's under-lock view; the bounded join stays out of the lock (reap_parked_prior, which finds the prior by ThreadId, removes+joins it when finished, or leaves a genuine wedge parked). start_task parks under the lock too. This introduces the module's only slots->orphans nesting; it is deadlock-free since no path takes slots while holding orphans. TDD: long-backstop test asserts the prior is parked before the spin could elapse; fails pre-fix.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-dash-async/src/registry.rs | 221 ++++++++++++++++++++-----
 1 file changed, 179 insertions(+), 42 deletions(-)

diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index 982dd6b57c..7f103ffcae 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -339,7 +339,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         F: FnOnce(CancellationToken) + Send + 'static,
     {
         Self::assert_multi_thread("start_thread");
-        let prior = {
+        let prior_tid = {
             let mut slots = self.lock_slots();
             // One-way teardown latch: refuse new workers once shutdown has
             // begun, under the same lock shutdown snapshots tiers with.
@@ -378,12 +378,27 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                 body(body_token);
             }) {
                 Ok(join) => {
-                    // Store the handle while still under the slot lock; the
-                    // guard is released at the end of this block, BEFORE the
-                    // reap below (R1: store handle -> drop guard -> THEN
-                    // reap-or-park).
+                    // Store the new handle, then park the prior into orphans
+                    // — both while still under THIS slot lock (R1: store
+                    // handle -> park prior -> drop guard -> THEN bounded
+                    // reap below).
                     slot.handle = Some(WorkerHandle::OsThread(join));
-                    prior
+                    // [F3 FIX] Park the prior UNDER the slot lock, before
+                    // releasing it. `shutdown` latches `closing` and
+                    // snapshots tiers under this same lock; parking here
+                    // means the take-prior + park-prior is atomic from its
+                    // view, so it can never observe the new slot without
+                    // also seeing the prior accounted in orphans. (The old
+                    // out-of-lock reap left a window: the prior was moved out
+                    // of the slot but not yet parked, so a shutdown
+                    // snapshotting in that gap reaped an empty orphan list
+                    // and reported clean while a wedged prior was still
+                    // live.) The bounded join stays OUT of the lock —
+                    // `reap_parked_prior` below. The `slots`->`orphans`
+                    // nesting this introduces is the only such nesting in the
+                    // module and is deadlock-free: no path acquires `slots`
+                    // while holding `orphans`.
+                    self.park_prior_locked(key, prior)
                 }
                 Err(e) => {
                     // Spawn failed (e.g. EAGAIN at the OS thread ceiling).
@@ -391,7 +406,8 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                     // the slot is not left wedged "running": re-install
                     // prior, clear the running flag. `generation` stays
                     // bumped (it is only ever monotonic), which is harmless
-                    // — the next start reaps the re-installed prior.
+                    // — the next start reaps the re-installed prior. Nothing
+                    // was parked, so there is no prior to reap below.
                     tracing::error!(
                         ?key,
                         error = %e,
@@ -407,10 +423,11 @@ impl<K: RegistryKey> ThreadRegistry<K> {
 
         // The prior thread was cancellation-signalled by a preceding
         // cancel(); with the slot lock released its epilogue completes
-        // promptly and the join lands in milliseconds. The backstop fires
-        // only on a genuine wedge, in which case the still-live handle is
+        // promptly and the join lands in milliseconds — `reap_parked_prior`
+        // then removes it from orphans and joins it. The backstop fires only
+        // on a genuine wedge, in which case the still-live handle is left
         // parked (not dropped) so teardown can account for it.
-        self.reap_prior_or_park(prior, key);
+        self.reap_parked_prior(key, prior_tid);
     }
 
     /// Start a tokio-task worker for `Send` futures. Same restart-reap
@@ -439,7 +456,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
         F: FnOnce(CancellationToken) -> Fut + Send + 'static,
         Fut: Future<Output = ()> + Send + 'static,
     {
-        let prior = {
+        {
             let mut slots = self.lock_slots();
             // One-way teardown latch — see `start_thread`.
             if self.closing.load(Ordering::Acquire) {
@@ -468,9 +485,17 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                 body(body_token).await;
             });
             slot.handle = Some(WorkerHandle::Task(join));
-            prior
-        };
-        self.reap_prior_or_park(prior, key);
+            // [F3 FIX] Park the prior UNDER this slot lock, same rationale as
+            // `start_thread`: it keeps `shutdown`'s under-lock tier snapshot
+            // from ever missing the prior. A task cannot be joined
+            // synchronously, so there is no bounded reap here — a live prior
+            // is parked for the async orphan reap (`reap_orphans` /
+            // `shutdown`) and a finished one is dropped. The returned thread
+            // id is unused: a task prior has none, and a (mixed-usage)
+            // OS-thread prior is likewise left to the async reap rather than
+            // spun on synchronously from this (possibly async) caller.
+            let _ = self.park_prior_locked(key, prior);
+        }
     }
 
     /// Whether a worker is currently registered and running for `key`.
@@ -731,43 +756,86 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             .spawn(closure)
     }
 
-    /// Reap a restarted key's prior worker — or park it if it is genuinely
-    /// wedged past the reap backstop. Must be called with no registry lock
-    /// held (it spins synchronously for an OS thread).
-    fn reap_prior_or_park(&self, prior: Option<WorkerHandle>, key: K) {
-        let Some(handle) = prior else {
+    /// Park a restarted key's prior handle into orphans. **Must be called
+    /// while the slot lock is held** — the resulting `slots`->`orphans`
+    /// nesting is the only such nesting in this module and is deadlock-free
+    /// (no path ever acquires `slots` while holding `orphans`, so there is no
+    /// cycle). Parking the prior here, rather than after the slot lock is
+    /// released, is what lets `shutdown`'s under-lock tier snapshot never
+    /// miss it: the take-prior and the park-prior are then atomic from
+    /// `shutdown`'s view. A finished task is dropped (detaching a finished
+    /// task is a no-op); a live task and any OS thread are parked. Returns
+    /// the parked OS thread's id so [`reap_parked_prior`](Self::reap_parked_prior)
+    /// can find and bounded-join it; tasks (reaped asynchronously) return
+    /// `None`.
+    fn park_prior_locked(
+        &self,
+        key: K,
+        prior: Option<WorkerHandle>,
+    ) -> Option<std::thread::ThreadId> {
+        match prior {
+            Some(WorkerHandle::OsThread(h)) => {
+                let tid = h.thread().id();
+                self.lock_orphans().push((key, WorkerHandle::OsThread(h)));
+                Some(tid)
+            }
+            Some(task) => {
+                if !task.is_finished() {
+                    self.lock_orphans().push((key, task));
+                }
+                None
+            }
+            None => None,
+        }
+    }
+
+    /// Bounded reap of an OS-thread prior that [`park_prior_locked`](Self::park_prior_locked)
+    /// parked under `key` at restart. Must be called with no registry lock
+    /// held (it spins synchronously). The instant the parked thread finishes
+    /// it is removed from orphans and joined — the join itself stays OUT of
+    /// any lock (only the bookkeeping is taken under the orphans lock). A
+    /// genuine wedge past the reap backstop is left parked, so teardown can
+    /// still account for it. No-op when no OS thread was parked (`None`), or
+    /// when the orphan was already taken by a concurrent reaper / `shutdown`
+    /// (which then owns the join).
+    fn reap_parked_prior(&self, key: K, prior_tid: Option<std::thread::ThreadId>) {
+        let Some(tid) = prior_tid else {
             return;
         };
-        match handle {
-            WorkerHandle::OsThread(h) => {
-                let deadline = Instant::now() + self.reap_backstop;
-                loop {
-                    if h.is_finished() {
-                        let _ = h.join();
-                        return;
-                    }
-                    if Instant::now() >= deadline {
+        let deadline = Instant::now() + self.reap_backstop;
+        loop {
+            // Bookkeeping under the orphans lock only: locate our parked
+            // prior by thread id and, once it has finished, take it out to
+            // join after the lock is released. Never hold the lock across the
+            // join.
+            let taken = {
+                let mut orphans = self.lock_orphans();
+                let pos = orphans.iter().position(|(k, h)| {
+                    *k == key && matches!(h, WorkerHandle::OsThread(t) if t.thread().id() == tid)
+                });
+                match pos {
+                    // Already taken by a concurrent reaper / shutdown: it owns
+                    // the join now.
+                    None => return,
+                    Some(i) if orphans[i].1.is_finished() => Some(orphans.remove(i).1),
+                    Some(_) if Instant::now() >= deadline => {
                         tracing::warn!(
                             ?key,
                             backstop = ?self.reap_backstop,
                             "prior worker thread did not finish within the reap \
-                             backstop after cancellation; parking it as an orphan \
-                             for teardown to join rather than detaching it"
+                             backstop after cancellation; leaving it parked as an \
+                             orphan for teardown to join rather than detaching it"
                         );
-                        self.lock_orphans().push((key, WorkerHandle::OsThread(h)));
                         return;
                     }
-                    std::thread::sleep(Duration::from_millis(5));
-                }
-            }
-            // A task can't be joined synchronously here; park a still-live
-            // one for async reap. A finished one is dropped (detaching a
-            // finished task is a no-op).
-            task => {
-                if !task.is_finished() {
-                    self.lock_orphans().push((key, task));
+                    Some(_) => None,
                 }
+            };
+            if let Some(WorkerHandle::OsThread(h)) = taken {
+                let _ = h.join();
+                return;
             }
+            std::thread::sleep(Duration::from_millis(5));
         }
     }
 
@@ -850,8 +918,12 @@ struct Repark<'a, K: RegistryKey> {
 impl<K: RegistryKey> Drop for Repark<'_, K> {
     fn drop(&mut self) {
         // Take the handle under the slot lock, release it, then push to
-        // orphans — never nest the two locks. Skip if a restart superseded
-        // our generation (the handle is the new worker's, not ours).
+        // orphans. This path holds only one lock at a time; the single
+        // sanctioned nesting in the module is `slots`->`orphans` in
+        // `park_prior_locked`, and nothing ever takes `slots` while holding
+        // `orphans`, so the ordering stays acyclic. Skip if a restart
+        // superseded our generation (the handle is the new worker's, not
+        // ours).
         let handle = self
             .reg
             .lock_slots()
@@ -1664,4 +1736,69 @@ mod tests {
         );
         assert!(!reg.any_alive(), "nothing started post-shutdown");
     }
+
+    /// [F3 FIX] `start_thread` must park a restarted key's still-wedged prior
+    /// into the orphan list UNDER the slot lock — at the START of the
+    /// restart, not only after the out-of-lock reap backstop elapses.
+    /// Otherwise a `shutdown()` that snapshots tiers in the window between
+    /// "prior taken out of the slot" and "prior parked" sees neither the
+    /// prior (already moved out of the slot) nor an orphan, and reports
+    /// clean while the wedged prior is still live and un-joined.
+    ///
+    /// Deterministic via a long backstop: with the fix the prior is
+    /// observable in orphans well before the backstop could elapse; the
+    /// pre-fix code parks it only at the end of the out-of-lock spin, so the
+    /// early assertion fails.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn start_thread_parks_wedged_prior_under_slot_lock_at_restart() {
+        // Long backstop so the under-lock parking is observable well before
+        // it could possibly elapse.
+        let reg = ThreadRegistry::with_reap_backstop(Duration::from_secs(10));
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+
+        // gen-1: wedged (ignores cancel), stays live until released.
+        reg.start_thread("k", WorkerConfig::default(), wedged_body(release_rx));
+        reg.cancel("k");
+
+        // gen-2 restart on a blocking thread: its bounded reap of the wedged
+        // gen-1 spins the (long) backstop, so start_thread does not return
+        // promptly. The fix parks gen-1 under the slot lock at the start of
+        // this call, before that spin.
+        let reg2 = Arc::clone(&reg);
+        let parent = Handle::current();
+        let restart = tokio::task::spawn_blocking(move || {
+            let handle = parent.clone();
+            reg2.start_thread("k", WorkerConfig::default(), move |cancel| {
+                handle.block_on(async move { cancel.cancelled().await });
+            });
+        });
+
+        // The wedged prior must appear in orphans far sooner than the 10s
+        // backstop — it was parked under the slot lock at restart.
+        let mut waited = Duration::ZERO;
+        while orphan_len(&reg) == 0 && waited < Duration::from_secs(2) {
+            tokio::time::sleep(Duration::from_millis(10)).await;
+            waited += Duration::from_millis(10);
+        }
+        assert_eq!(
+            orphan_len(&reg),
+            1,
+            "wedged prior must be parked under the slot lock at restart, not \
+             only after the backstop spin"
+        );
+        assert!(reg.is_running("k"), "gen-2 installed under the same lock");
+
+        // Release the wedged prior: the restart's bounded reap then finds it
+        // finished, removes it from orphans, and joins it.
+        release_tx.send(()).unwrap();
+        restart.await.unwrap();
+        assert_eq!(
+            orphan_len(&reg),
+            0,
+            "finished prior removed from orphans by the bounded reap"
+        );
+
+        // gen-2 quiesces cleanly.
+        assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok);
+    }
 }

From 41791c06c512821e9fbd5354e064a165e76bc880 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:35:05 +0200
Subject: [PATCH 25/29] fix(platform-wallet-ffi): gate shielded_sync_stop
 success on orphan liveness, like clear_shielded/destroy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

shielded_sync_stop returned Success on status.is_clean() alone, ignoring a prior-generation shielded thread still parked alive as an orphan — asymmetric with clear_shielded/destroy and a misleading contract (the orphan still holds the host callback context). No live UAF today since Swift always does stop->destroy, but Success should imply no live shielded worker/orphan.

Add manager::shielded_worker_alive() (the same shielded-scoped any_alive_for gate clear_shielded consults) and have shielded_sync_stop return ErrorShutdownIncomplete when a parked orphan survives a clean drain. FFI ABI unchanged (same return-code semantics); docstring updated so Success accurately implies no live shielded worker/orphan.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/shielded_sync.rs                      | 45 ++++++++++++++-----
 .../src/manager/accessors.rs                  | 19 ++++++++
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
index 14082628e4..493f84aa0f 100644
--- a/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
+++ b/packages/rs-platform-wallet-ffi/src/shielded_sync.rs
@@ -74,13 +74,20 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_start(
 /// note/sync-state row can be written after this returns) and its
 /// completion-event *dispatch* on the Rust side has run.
 ///
-/// Returns `ErrorShutdownIncomplete` instead of `Success` when that drain
-/// did **not** complete cleanly (the in-flight pass timed out on the join
-/// backstop, or the loop ended non-cleanly). The terminal coordinator
-/// status is rendered into the result message. On this code the host must
-/// **not** free the callback context immediately — a lingering pass may
-/// still fire one final callback through it (symmetric with
-/// `platform_wallet_manager_destroy`).
+/// Returns `ErrorShutdownIncomplete` instead of `Success` in either of two
+/// cases, so `Success` accurately implies **no live shielded worker or
+/// orphan remains**:
+/// - the drain did not complete cleanly (the in-flight pass timed out on the
+///   join backstop, or the loop ended non-cleanly); or
+/// - the drain was clean but a prior-generation shielded thread is still
+///   parked alive as an orphan (a tight `stop()`->`start()` reap could not
+///   join it within the reap backstop and left it parked).
+///
+/// The terminal coordinator status is rendered into the result message. On
+/// this code the host must **not** free the callback context immediately — a
+/// lingering pass or parked orphan may still fire one final callback through
+/// it (symmetric with `platform_wallet_manager_destroy` and the shielded
+/// Clear flow).
 ///
 /// Caveat on host-observed events: a host that marshals the completion
 /// callback onto its own executor (e.g. the Swift trampoline hops it to
@@ -96,7 +103,7 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
     handle: Handle,
 ) -> PlatformWalletFFIResult {
     let option = PLATFORM_WALLET_MANAGER_STORAGE.with_item(handle, |manager| {
-        runtime().block_on(async {
+        let status = runtime().block_on(async {
             // Bound the quiesce with the same backstop `shutdown()` uses so
             // a stalled in-flight pass can't hang the host's stop call
             // forever. Cancellation makes the drain prompt; this only
@@ -113,9 +120,14 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
                 Ok(status) => status,
                 Err(_elapsed) => platform_wallet::CoordinatorThreadStatus::Timeout,
             }
-        })
+        });
+        // Capture orphan liveness while we still hold the manager: a clean
+        // quiesce drains the live slot but not a prior-generation thread
+        // parked as an orphan.
+        let shielded_alive = manager.shielded_worker_alive();
+        (status, shielded_alive)
     });
-    let status = unwrap_option_or_return!(option);
+    let (status, shielded_alive) = unwrap_option_or_return!(option);
     // Symmetric with `platform_wallet_manager_destroy`: a non-clean drain
     // means the shielded loop may still hold a reference to the host-owned
     // event-handler / persister context and could fire one final callback,
@@ -130,6 +142,19 @@ pub unsafe extern "C" fn platform_wallet_manager_shielded_sync_stop(
             ),
         );
     }
+    // Even on a clean drain, a parked prior-generation shielded thread may
+    // still be alive and holding the host's callback context — mirror
+    // `clear_shielded` / `destroy` and refuse the clean return so the host
+    // does not free that context out from under a lingering orphan.
+    if shielded_alive {
+        return PlatformWalletFFIResult::err(
+            PlatformWalletFFIResultCode::ErrorShutdownIncomplete,
+            "shielded sync stop drained cleanly but a prior-generation shielded \
+             worker is still parked alive; host must not free the callback \
+             context immediately"
+                .to_string(),
+        );
+    }
     PlatformWalletFFIResult::ok()
 }
 
diff --git a/packages/rs-platform-wallet/src/manager/accessors.rs b/packages/rs-platform-wallet/src/manager/accessors.rs
index 7bf901bccf..4ef045f906 100644
--- a/packages/rs-platform-wallet/src/manager/accessors.rs
+++ b/packages/rs-platform-wallet/src/manager/accessors.rs
@@ -299,6 +299,25 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         Arc::clone(&self.shielded_sync_manager)
     }
 
+    /// Whether a shielded-sync worker is still alive — either its live
+    /// registry slot or a prior-generation thread left parked as an orphan
+    /// when a tight `stop()`->`start()` reap could not join it within the
+    /// reap backstop. Such an orphan still holds an `Arc` to the persister /
+    /// event-handler context and may fire one final callback, so a clean
+    /// [`quiesce`](ShieldedSyncManager::quiesce) status alone does not prove
+    /// the shielded worker is gone.
+    ///
+    /// This is the same shielded-scoped liveness gate
+    /// [`clear_shielded`](Self::clear_shielded) consults; it is exposed so
+    /// the FFI `shielded_sync_stop` can refuse a misleading clean return
+    /// while a parked orphan lingers (symmetric with `clear_shielded` /
+    /// `destroy`).
+    #[cfg(feature = "shielded")]
+    pub fn shielded_worker_alive(&self) -> bool {
+        self.registry
+            .any_alive_for(super::WalletWorker::ShieldedSync)
+    }
+
     /// Get a clone of a wallet by its ID.
     pub async fn get_wallet(&self, wallet_id: &WalletId) -> Option<Arc<PlatformWallet>> {
         let wallets = self.wallets.read().await;

From 4b099a92dcd8a5a339b7e926a6e4b04008c107eb Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:18:57 +0200
Subject: [PATCH 26/29] fix(platform-wallet): bound clear_shielded's drain and
 hold its quiescing gate continuously
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEC-001: clear_shielded's in-flight-pass drain was unbounded and the FFI clear bridge a bare block_on, so a heavy direct pass could hang the host (ANR). Bound it with a SHUTDOWN_JOIN_TIMEOUT_SECS timeout (mirroring shielded_sync_stop); on timeout the clear reports Timeout and aborts BEFORE the wipe, leaving the store intact. Split out clear_shielded_inner(drain_timeout) so the timeout path is testable without the 30s budget.

SEC-002/RUST-002: the gate lapsed between quiesce() returning (its RAII guard lowers the shared flag) and the post-drain re-raise, letting a direct sync_now/sync_wallet slip past any_alive_for and re-persist into the wiped store. Fix: raise+HOLD the gate via clear's own guard BEFORE draining, and drain via a new gate-neutral quiesce_under_held_gate (extracted cancel_join_and_drain shared with quiesce, which stays byte-identical — Fix-1 invariant untouched). The gate now stays raised continuously across drain, liveness check, and wipe; doc softened to note the only residual is a full start() racing clear (per-key-latch follow-up). PROJ-004: clear now calls shielded_worker_alive() instead of re-inlining any_alive_for. Also clarifies the quiesce doc that only shielded gates sync_wallet (platform-address's is intentionally ungated). TDD: both SEC tests proven non-vacuous (revert->fail->restore).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/manager/coordinator_lifecycle.rs      | 123 ++++++++++++----
 .../rs-platform-wallet/src/manager/mod.rs     | 131 ++++++++++++++----
 .../src/manager/shielded_sync.rs              |  19 +++
 3 files changed, 222 insertions(+), 51 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
index 84a6c02ed9..ebcc73419c 100644
--- a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
+++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
@@ -147,39 +147,67 @@ impl CoordinatorLifecycle {
     /// The gate is raised **here**, not left to the registry's drain hook:
     /// `registry.quiesce` early-returns `NotRunning` without running the
     /// hook when no background-loop slot is registered, so a coordinator
-    /// with only direct `sync_now`/`sync_wallet` traffic (no running loop)
-    /// would never see the gate go up — and a direct pass landing
-    /// concurrently would slip past the barrier `clear_shielded`/`stop`
-    /// promise. Raising it ourselves makes the "no new pass" gate hold
-    /// regardless of whether a loop is registered, and preserves
-    /// gate-before-cancel: it is up before `registry.quiesce` issues any
-    /// cancel.
+    /// with only direct pass traffic (no running loop) would never see the
+    /// gate go up — and a direct pass landing concurrently would slip past
+    /// the barrier `clear_shielded`/`stop` promise. Raising it ourselves
+    /// makes the "no new pass" gate hold regardless of whether a loop is
+    /// registered, and preserves gate-before-cancel: it is up before
+    /// `registry.quiesce` issues any cancel.
+    ///
+    /// "Direct pass" here means the gated entry points that take the
+    /// `is_syncing` slot via [`begin_pass`](Self::begin_pass): every
+    /// coordinator's `sync_now`, plus the shielded coordinator's
+    /// `sync_wallet`. The platform-address coordinator's `sync_wallet` is
+    /// intentionally **ungated** (it never touches `is_syncing`; callers
+    /// that need exclusion gate themselves), so the gate/drain barrier does
+    /// not apply to it.
     pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus {
         // Gate up first (instant) and held until the guard drops on return.
         self.quiescing.store(true, Ordering::Release);
         let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
+        self.cancel_join_and_drain().await
+    }
 
-        // Cancel + bounded join of the background loop (if any). A wedged
-        // loop pass surfaces here as a non-clean `Timeout` rather than
-        // hanging — its orphaned thread is tracked by the registry for
-        // teardown, so we must not wait on it below.
-        let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into();
+    /// Like [`quiesce`](Self::quiesce) but for a caller that has **already**
+    /// raised the `quiescing` gate (via [`hold_quiescing_gate`](Self::hold_quiescing_gate))
+    /// and will keep holding it: this neither raises nor lowers the gate, so
+    /// a multi-step teardown (the shielded Clear flow) keeps the "no new
+    /// pass" barrier raised *continuously* across the drain, the orphan-
+    /// liveness check, and the store wipe — with no lapse for a direct
+    /// `sync_now`/`sync_wallet` to slip through and re-persist into the
+    /// store being cleared. (`quiesce`'s own RAII guard would lower the gate
+    /// the instant it returned, which is why Clear cannot just call it and
+    /// re-raise afterwards: a single shared `AtomicFlagGuard` always clears
+    /// the flag on drop, so the re-raise would leave a window.) Gate-before-
+    /// cancel still holds: the caller raised the gate before this runs.
+    #[cfg(any(test, feature = "shielded"))]
+    pub(crate) async fn quiesce_under_held_gate(&self) -> CoordinatorThreadStatus {
+        debug_assert!(
+            self.quiescing.load(Ordering::Acquire),
+            "quiesce_under_held_gate requires the caller to already hold the quiescing gate"
+        );
+        self.cancel_join_and_drain().await
+    }
 
-        // Drain a *direct* in-flight pass the registry could not: with no
-        // loop slot, `registry.quiesce` returned `NotRunning` without
-        // joining anything; with an idle loop it joined a thread that was
-        // not the one holding `is_syncing`. Either way a `sync_now`/
-        // `sync_wallet` that entered before the gate rose may still be in
-        // flight. The gate keeps a new pass from starting, so this
-        // converges, and a panicked pass clears the flag via its own RAII
-        // guard. Only drain on a clean status: a non-clean one means a
-        // wedged loop pass is the `is_syncing` holder (its thread was
-        // orphaned, not joined), and waiting on it would reintroduce the
-        // shutdown stall the registry's bounded join exists to prevent.
+    /// Cancel + bounded-join the background loop (if any), then drain a
+    /// direct in-flight pass on a clean status. Assumes the `quiescing` gate
+    /// is **already raised** (by [`quiesce`](Self::quiesce)'s own guard or a
+    /// caller's hold guard) and does not touch it.
+    ///
+    /// A wedged loop pass surfaces from `registry.quiesce` as a non-clean
+    /// `Timeout` rather than hanging — its orphaned thread is tracked by the
+    /// registry for teardown, so the drain below must not wait on it. On a
+    /// clean status the only possible `is_syncing` holder is a direct
+    /// `sync_now`/`sync_wallet` that entered before the gate rose (with no
+    /// loop slot `registry.quiesce` joined nothing; with an idle loop it
+    /// joined a thread that was not the one holding the flag). The raised
+    /// gate keeps a new pass from starting, so the drain converges, and a
+    /// panicked pass clears the flag via its own RAII guard.
+    async fn cancel_join_and_drain(&self) -> CoordinatorThreadStatus {
+        let status: CoordinatorThreadStatus = self.registry.quiesce(self.worker).await.into();
         if status.is_clean() {
             self.drain_in_flight_pass().await;
         }
-
         status
     }
 
@@ -187,7 +215,9 @@ impl CoordinatorLifecycle {
     /// the `quiescing` gate already raised (so no new pass can start) and
     /// after the background loop has been cancel-joined (so the only
     /// possible holder is a direct, non-cancellable pass running to
-    /// completion). Mirrors the registry's 5ms poll cadence.
+    /// completion). Mirrors the registry's 5ms poll cadence. Unbounded by
+    /// design — the caller bounds the whole teardown (the FFI `stop` /
+    /// `clear` bridges wrap it in a `SHUTDOWN_JOIN_TIMEOUT_SECS` timeout).
     async fn drain_in_flight_pass(&self) {
         while self.is_syncing.load(Ordering::Acquire) {
             tokio::time::sleep(Duration::from_millis(5)).await;
@@ -319,4 +349,47 @@ mod tests {
 
         pass_task.await.expect("pass task joined");
     }
+
+    /// `quiesce_under_held_gate` must NOT lower the `quiescing` gate the
+    /// caller is holding — the mechanism that lets the shielded Clear flow
+    /// keep the "no new pass" barrier raised *continuously* across the
+    /// drain, the liveness check, and the store wipe. The plain
+    /// [`quiesce`](CoordinatorLifecycle::quiesce)'s own RAII guard would
+    /// lower it on return, leaving a window a direct pass could slip into
+    /// before Clear re-raised it. Must fail against a variant that delegates
+    /// to `quiesce` (whose guard clears the shared flag on drop).
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn quiesce_under_held_gate_keeps_caller_gate_raised() {
+        let lifecycle = make_lifecycle();
+
+        // Caller (the Clear flow) raises and holds the gate before draining.
+        let hold = lifecycle.hold_quiescing_gate();
+        assert!(
+            lifecycle.quiescing.load(Ordering::Acquire),
+            "caller's hold raised the gate"
+        );
+
+        // Drain under the held gate (no loop registered → NotRunning); the
+        // gate must remain raised across the call.
+        let status = lifecycle.quiesce_under_held_gate().await;
+        assert_eq!(status, CoordinatorThreadStatus::NotRunning);
+        assert!(
+            lifecycle.quiescing.load(Ordering::Acquire),
+            "gate stays raised across the drain — no lapse for a direct pass"
+        );
+
+        // A direct pass attempting to begin during Clear (gate held) is
+        // refused: it bails after the CAS on the raised gate.
+        assert!(
+            lifecycle.begin_pass().is_none(),
+            "the continuously-held gate holds off a new direct pass"
+        );
+
+        // Once Clear's own guard drops, the gate reopens for later work.
+        drop(hold);
+        assert!(
+            !lifecycle.quiescing.load(Ordering::Acquire),
+            "gate reopens once the caller's hold guard drops"
+        );
+    }
 }
diff --git a/packages/rs-platform-wallet/src/manager/mod.rs b/packages/rs-platform-wallet/src/manager/mod.rs
index 840dd13c7a..fd22ec6d17 100644
--- a/packages/rs-platform-wallet/src/manager/mod.rs
+++ b/packages/rs-platform-wallet/src/manager/mod.rs
@@ -514,21 +514,53 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
     ///
     /// **Host-serialization precondition**: the caller must not invoke
     /// `shielded_sync_start` for this manager concurrently with `clear`. A
-    /// concurrent direct `sync_now`/`sync_wallet` is held off (the quiescing
-    /// gate stays raised across the liveness check and the wipe), but a full
-    /// restart re-opens that gate as it spawns a fresh loop, so a `start`
-    /// racing `clear` can still re-persist into the wiped store. The wallet
-    /// UI drives these from one place; that ordering is the host's contract
-    /// until the registry grows a per-key clearing latch.
+    /// concurrent direct `sync_now`/`sync_wallet` is held off — the quiescing
+    /// gate is raised *continuously* for the whole clear (from before the
+    /// drain, across the liveness check, through the wipe), so such a pass
+    /// observes the gate and bails with no lapse. The one remaining residual
+    /// is a full `shielded_sync_start` racing `clear`: a restart spawns a
+    /// fresh loop and reopens the gate, so it could re-persist into the wiped
+    /// store. The wallet UI drives these from one place; that ordering is the
+    /// host's contract until the registry grows a per-key clearing latch.
     #[cfg(feature = "shielded")]
     pub async fn clear_shielded(&self) -> Result<(), crate::error::PlatformWalletError> {
-        // Quiesce the shielded loop: cancel it, drain any in-flight pass
-        // (incl. its persister fan-out), and join its OS thread. The
-        // registry bounds the join by the coordinator's own
-        // `SHUTDOWN_JOIN_TIMEOUT_SECS` budget — returning `Timeout` rather
-        // than hanging if a pass's drop wedges — so no outer timeout is
-        // needed here.
-        let status = self.shielded_sync_manager.quiesce().await;
+        self.clear_shielded_inner(std::time::Duration::from_secs(SHUTDOWN_JOIN_TIMEOUT_SECS))
+            .await
+    }
+
+    /// [`clear_shielded`](Self::clear_shielded) with an explicit drain
+    /// deadline. Split out so tests can exercise the timeout path without
+    /// waiting the full production budget.
+    #[cfg(feature = "shielded")]
+    async fn clear_shielded_inner(
+        &self,
+        drain_timeout: std::time::Duration,
+    ) -> Result<(), crate::error::PlatformWalletError> {
+        // Raise and HOLD the shielded quiescing gate for the WHOLE clear,
+        // BEFORE quiescing — so the "no new pass" barrier never lapses
+        // between the drain, the liveness check, and the store wipe: a direct
+        // `sync_now`/`sync_wallet` landing anywhere in here observes the gate
+        // and bails instead of re-persisting into the store we are about to
+        // clear. `quiesce_under_held_gate` deliberately does NOT touch the
+        // gate (a single `AtomicFlagGuard` always clears the flag on drop, so
+        // letting `quiesce` manage it and re-raising afterwards would leave a
+        // window). The guard lowers the gate on return (every path).
+        let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate();
+
+        // Cancel the loop and drain any in-flight pass (incl. its persister
+        // fan-out). Bound the drain (mirroring `shielded_sync_stop`'s
+        // timeout) so a heavy direct pass cannot hang the host's Clear: on
+        // timeout the clear reports `Timeout` and aborts BEFORE the wipe,
+        // leaving the store intact.
+        let status = match tokio::time::timeout(
+            drain_timeout,
+            self.shielded_sync_manager.quiesce_under_held_gate(),
+        )
+        .await
+        {
+            Ok(status) => status,
+            Err(_elapsed) => CoordinatorThreadStatus::Timeout,
+        };
 
         // Only commit the store wipe once the in-flight pass has fully
         // drained. A partial/timed-out drain could let a surviving pass
@@ -537,20 +569,14 @@ impl<P: PlatformWalletPersistence + 'static> PlatformWalletManager<P> {
         if !status.is_clean() {
             return Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { status });
         }
-        // Hold the shielded quiescing gate raised across BOTH the liveness
-        // check below and the store wipe, so the gate guarding "no new pass"
-        // does not lapse between check and act: a direct `sync_now` /
-        // `sync_wallet` that lands here observes the gate and bails instead
-        // of writing into the store we are about to clear. The guard lowers
-        // the gate on return (every path), so a later start/sync works.
-        let _clearing_gate = self.shielded_sync_manager.hold_quiescing_gate();
 
-        // [F2 FIX] Also refuse if a prior-generation shielded thread is
-        // still parked alive: it holds an `Arc` to the persister/store and
-        // could re-persist notes into the store we are about to wipe. The
-        // check is shielded-scoped, so the other coordinators / the
-        // always-on event adapter running normally do not block Clear.
-        if self.registry.any_alive_for(WalletWorker::ShieldedSync) {
+        // Also refuse if a prior-generation shielded thread is still parked
+        // alive: it holds an `Arc` to the persister/store and could re-persist
+        // notes into the store we are about to wipe. The check is shielded-
+        // scoped (shares the `shielded_worker_alive` gate), so the other
+        // coordinators / the always-on event adapter running normally do not
+        // block Clear.
+        if self.shielded_worker_alive() {
             return Err(
                 crate::error::PlatformWalletError::ShieldedShutdownIncomplete {
                     status: CoordinatorThreadStatus::Detached,
@@ -1044,6 +1070,59 @@ mod tests {
             .expect("clear_shielded must succeed once the orphan is reaped");
     }
 
+    /// SEC-001: `clear_shielded` must BOUND its in-flight-pass drain so a
+    /// heavy direct `sync_now`/`sync_wallet` that won't drain in time cannot
+    /// hang the host's Clear. On the drain deadline the clear reports
+    /// `ShieldedShutdownIncomplete` and aborts BEFORE the store wipe, leaving
+    /// the store intact.
+    ///
+    /// Non-vacuous: against an unbounded drain the held pass keeps
+    /// `is_syncing` set forever and `clear_shielded_inner` never returns — the
+    /// test's outer timeout fires and the `expect` below panics.
+    #[cfg(feature = "shielded")]
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn clear_shielded_aborts_without_wiping_when_drain_times_out() {
+        let manager = Arc::new(make_manager());
+
+        // A direct sync pass already in flight (holds `is_syncing`); it never
+        // drains within the clear's drain budget.
+        let (ready_tx, ready_rx) = tokio::sync::oneshot::channel::<()>();
+        let (release_tx, release_rx) = tokio::sync::oneshot::channel::<()>();
+        let ssm = Arc::clone(&manager.shielded_sync_manager);
+        let pass_task = tokio::spawn(async move {
+            let _pass = ssm
+                .begin_pass_for_test()
+                .expect("direct pass enters the slot");
+            ready_tx.send(()).expect("signal in-flight");
+            release_rx.await.expect("await release");
+            // `_pass` drops here → is_syncing = false
+        });
+
+        ready_rx.await.expect("pass reached in-flight");
+        assert!(manager.shielded_sync_manager.is_syncing());
+
+        // Clear with a short drain budget: the held pass can't drain in time,
+        // so the clear must return ShieldedShutdownIncomplete — bounded, never
+        // hanging — and never reach the wipe.
+        let result = tokio::time::timeout(
+            Duration::from_secs(5),
+            manager.clear_shielded_inner(Duration::from_millis(100)),
+        )
+        .await
+        .expect("clear must return within its bounded drain, never hang");
+        assert!(
+            matches!(
+                result,
+                Err(crate::error::PlatformWalletError::ShieldedShutdownIncomplete { .. })
+            ),
+            "bounded drain timeout must surface as ShieldedShutdownIncomplete, got {result:?}"
+        );
+
+        // Release the held pass and join.
+        release_tx.send(()).expect("release the pass");
+        pass_task.await.expect("pass task joined");
+    }
+
     /// TC-015 (R5): `from_report` maps the registry's [`ShutdownReport`]
     /// onto the FFI-stable `CoordinatorExitStatus` with identical field /
     /// variant shape and `all_clean()` semantics. The full `WorkerStatus`
diff --git a/packages/rs-platform-wallet/src/manager/shielded_sync.rs b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
index 6a66e30ba6..f949b48dd1 100644
--- a/packages/rs-platform-wallet/src/manager/shielded_sync.rs
+++ b/packages/rs-platform-wallet/src/manager/shielded_sync.rs
@@ -278,6 +278,25 @@ impl ShieldedSyncManager {
         self.lifecycle.quiesce().await
     }
 
+    /// Drain + join **without touching the `quiescing` gate**, for a caller
+    /// (the Clear flow) that already holds it raised via
+    /// [`hold_quiescing_gate`](Self::hold_quiescing_gate) and keeps holding
+    /// it across the whole teardown. See
+    /// [`CoordinatorLifecycle::quiesce_under_held_gate`].
+    pub(crate) async fn quiesce_under_held_gate(&self) -> super::CoordinatorThreadStatus {
+        self.lifecycle.quiesce_under_held_gate().await
+    }
+
+    /// Test seam: enter a sync pass directly (claim `is_syncing` via the
+    /// pass gate) so a teardown test can stand in for a direct
+    /// `sync_now`/`sync_wallet` already in flight, without driving the real
+    /// (coordinator-backed) sync path. The returned guard clears the flag
+    /// on drop.
+    #[cfg(test)]
+    pub(crate) fn begin_pass_for_test(&self) -> Option<AtomicFlagGuard<'_>> {
+        self.lifecycle.begin_pass()
+    }
+
     /// Raise the `quiescing` gate and hold it raised until the returned
     /// guard drops. Where [`quiesce`](Self::quiesce) reopens the gate as
     /// soon as it returns, this lets a multi-step teardown (Clear) keep new

From 7be68c561846eb0114b449398ca83440d914603f Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:19:07 +0200
Subject: [PATCH 27/29] refactor(dash-async): full spawn-failure rollback +
 drop stale doc history-tags

RUST-004: a failed thread spawn left the slot carrying the FAILED start's weight/drain/join_budget (and a bumped generation). Now snapshot the pre-start config and restore ALL of it on the Err path, so the re-installed prior keeps its own teardown config; generation rolls back too (the +1 is only observed under the slot lock and the failed start spawns no thread, so the rollback is net-zero and the externally-visible generation stays monotonic). New regression test asserts the restored config.

RUST-005: trimmed the duplicated park-under-lock rustdoc block in start_thread that repeated park_prior_locked's doc.

RUST-003/PROJ hygiene: removed [F1/F2/F3 FIX] history-tags from committed comments, replaced the 'Why F1 and F2 cannot recur' module section with present-state invariant descriptions, and fixed the glossary to reference the key-scoped any_alive_for (the gate store-wiping paths actually consult) rather than the registry-wide any_alive.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 packages/rs-dash-async/src/registry.rs | 160 ++++++++++++++++++-------
 1 file changed, 119 insertions(+), 41 deletions(-)

diff --git a/packages/rs-dash-async/src/registry.rs b/packages/rs-dash-async/src/registry.rs
index 7f103ffcae..01795a5e73 100644
--- a/packages/rs-dash-async/src/registry.rs
+++ b/packages/rs-dash-async/src/registry.rs
@@ -15,21 +15,24 @@
 //! - [`start_task`](ThreadRegistry::start_task) — a tokio task, for
 //!   `Send` futures.
 //!
-//! # Why F1 and F2 cannot recur
+//! # Safety invariants
 //!
-//! - **F1** (timeout-dropped quiesce detaches a live thread): every join
-//!   path takes `&self`; the live join handle stays owned by the slot
-//!   and is never moved into a cancellable future's frame. A
+//! - **A timed-out or dropped quiesce never detaches a live thread.**
+//!   Every join path takes `&self`; the live join handle stays owned by
+//!   the slot and is never moved into a cancellable future's frame. A
 //!   dropped/timed-out [`quiesce`](ThreadRegistry::quiesce) therefore
 //!   cannot drop-and-detach the handle — on timeout (or on an external
 //!   drop) the handle is deterministically re-parked into the orphan
 //!   list, and the slot reports [`WorkerStatus::Timeout`], never a clean
 //!   `NotRunning`.
-//! - **F2** (store wipe races a parked prior-generation thread):
-//!   orphans live in the registry and [`any_alive`](ThreadRegistry::any_alive)
-//!   is the single liveness gate spanning live slots **and** parked
-//!   orphans. Every store-wiping path consults it, so a parked
-//!   still-live thread blocks the wipe.
+//! - **A store wipe cannot race a parked prior-generation thread.**
+//!   Orphans live in the registry and
+//!   [`any_alive_for`](ThreadRegistry::any_alive_for) is the key-scoped
+//!   liveness gate spanning a key's live slot **and** its parked orphans
+//!   (with [`any_alive`](ThreadRegistry::any_alive) the registry-wide
+//!   variant). A store-wiping path scoped to one worker consults the
+//!   key-scoped gate, so a parked still-live thread blocks the wipe of its
+//!   own worker's store without an unrelated worker blocking it.
 
 use std::collections::BTreeMap;
 use std::future::Future;
@@ -354,6 +357,17 @@ impl<K: RegistryKey> ThreadRegistry<K> {
             // install the new token under this one lock so a prior
             // thread's epilogue observes the post-swap generation.
             let prior = slot.handle.take();
+            // Snapshot the slot's pre-start config so a spawn failure can roll
+            // the slot back to exactly its prior state: a re-installed prior
+            // worker must keep its OWN teardown config, not inherit the failed
+            // start's weight/drain/join_budget. Generation is rolled back too —
+            // the bump is only ever observed under this lock and a failed start
+            // spawns no thread to reference it, so the rollback is net-zero and
+            // the externally-visible generation stays monotonic.
+            let prev_generation = slot.generation;
+            let prev_weight = slot.weight;
+            let prev_join_budget = slot.join_budget;
+            let prev_drain = slot.drain.take();
             let token = CancellationToken::new();
             slot.cancel = Some(token.clone());
             slot.generation += 1;
@@ -378,36 +392,27 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                 body(body_token);
             }) {
                 Ok(join) => {
-                    // Store the new handle, then park the prior into orphans
-                    // — both while still under THIS slot lock (R1: store
-                    // handle -> park prior -> drop guard -> THEN bounded
-                    // reap below).
+                    // Store the new handle, then park the prior into orphans —
+                    // both still under THIS slot lock, so `shutdown`'s
+                    // under-lock tier snapshot can never see the new slot
+                    // without also seeing the prior accounted (R1: store handle
+                    // -> park prior -> drop guard -> THEN bounded reap below).
+                    // See `park_prior_locked` for the lock-order rationale; the
+                    // bounded join stays out of the lock in `reap_parked_prior`.
                     slot.handle = Some(WorkerHandle::OsThread(join));
-                    // [F3 FIX] Park the prior UNDER the slot lock, before
-                    // releasing it. `shutdown` latches `closing` and
-                    // snapshots tiers under this same lock; parking here
-                    // means the take-prior + park-prior is atomic from its
-                    // view, so it can never observe the new slot without
-                    // also seeing the prior accounted in orphans. (The old
-                    // out-of-lock reap left a window: the prior was moved out
-                    // of the slot but not yet parked, so a shutdown
-                    // snapshotting in that gap reaped an empty orphan list
-                    // and reported clean while a wedged prior was still
-                    // live.) The bounded join stays OUT of the lock —
-                    // `reap_parked_prior` below. The `slots`->`orphans`
-                    // nesting this introduces is the only such nesting in the
-                    // module and is deadlock-free: no path acquires `slots`
-                    // while holding `orphans`.
                     self.park_prior_locked(key, prior)
                 }
                 Err(e) => {
-                    // Spawn failed (e.g. EAGAIN at the OS thread ceiling).
-                    // Roll back so the prior handle is never detached and
-                    // the slot is not left wedged "running": re-install
-                    // prior, clear the running flag. `generation` stays
-                    // bumped (it is only ever monotonic), which is harmless
-                    // — the next start reaps the re-installed prior. Nothing
-                    // was parked, so there is no prior to reap below.
+                    // Spawn failed (e.g. EAGAIN at the OS thread ceiling). Roll
+                    // the slot back to exactly its pre-start state: clear the
+                    // running flag, re-install the prior handle (never
+                    // detached), and restore the prior teardown config +
+                    // generation so nothing of the failed start lingers. The
+                    // re-installed prior keeps its own weight/drain/join_budget
+                    // for a later quiesce/shutdown, and generation returns to
+                    // its pre-bump value (the bump was never observed outside
+                    // this lock and spawned no thread). Nothing was parked, so
+                    // there is no prior to reap below.
                     tracing::error!(
                         ?key,
                         error = %e,
@@ -416,6 +421,10 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                     );
                     slot.cancel = None;
                     slot.handle = prior;
+                    slot.generation = prev_generation;
+                    slot.weight = prev_weight;
+                    slot.drain = prev_drain;
+                    slot.join_budget = prev_join_budget;
                     None
                 }
             }
@@ -485,7 +494,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
                 body(body_token).await;
             });
             slot.handle = Some(WorkerHandle::Task(join));
-            // [F3 FIX] Park the prior UNDER this slot lock, same rationale as
+            // Park the prior UNDER this slot lock, same rationale as
             // `start_thread`: it keeps `shutdown`'s under-lock tier snapshot
             // from ever missing the prior. A task cannot be joined
             // synchronously, so there is no bounded reap here — a live prior
@@ -528,7 +537,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// budget. The live handle is owned by the slot and is **never** moved
     /// into this future's frame, so a dropped/timed-out call cannot detach
     /// it; on the managed timeout — or if this future is dropped
-    /// mid-poll — the handle is re-parked into the orphan list. [F1 FIX]
+    /// mid-poll — the handle is re-parked into the orphan list.
     pub async fn quiesce(&self, key: K) -> WorkerStatus {
         // Snapshot the drain hook + budget + generation, and bail early if
         // nothing is registered for this key. The generation is the anchor
@@ -629,7 +638,7 @@ impl<K: RegistryKey> ThreadRegistry<K> {
     /// under that key — still alive? A store-wiping path scoped to one
     /// worker must gate on this (rather than the registry-wide
     /// [`any_alive`](Self::any_alive)) so an unrelated worker that is
-    /// legitimately running does not block the wipe. [F2 FIX]
+    /// legitimately running does not block the wipe.
     pub fn any_alive_for(&self, key: K) -> bool {
         if let Some(slot) = self.lock_slots().get(&key) {
             if slot_alive(slot) {
@@ -1671,6 +1680,75 @@ mod tests {
         assert!(!reg.any_alive());
     }
 
+    /// A thread-spawn failure must roll the slot back to its PRIOR config, not
+    /// leave the failed start's weight / drain / join_budget / generation
+    /// behind: the re-installed prior worker keeps its own teardown config for
+    /// a later quiesce/shutdown.
+    ///
+    /// Non-vacuous: against a partial rollback (only cancel/handle restored),
+    /// the slot would carry the failed start's weight/budget, a `None` drain,
+    /// and the bumped generation.
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn spawn_failure_restores_prior_slot_config() {
+        let reg = ThreadRegistry::<&str>::new();
+        let (release_tx, release_rx) = mpsc::channel::<()>();
+
+        // gen-1 with a DISTINCTIVE config (drain hook + non-default weight and
+        // join budget). Wedged so it stays the live prior after cancel.
+        let hook: DrainHook = Arc::new(|| Box::pin(async {}));
+        let cfg1 = WorkerConfig {
+            weight: ShutdownWeight(7),
+            join_budget: Duration::from_secs(11),
+            drain: Some(hook),
+        };
+        reg.start_thread("k", cfg1, wedged_body(release_rx));
+        reg.cancel("k");
+        let gen_after_gen1 = reg.lock_slots().get("k").unwrap().generation;
+
+        // Failed restart with a DIFFERENT config; the rollback must discard it.
+        reg.force_spawn_failure.store(true, Ordering::Release);
+        let cfg2 = WorkerConfig {
+            weight: ShutdownWeight(99),
+            join_budget: Duration::from_secs(99),
+            drain: None,
+        };
+        reg.start_thread("k", cfg2, |_cancel| {});
+        reg.force_spawn_failure.store(false, Ordering::Release);
+
+        {
+            let slots = reg.lock_slots();
+            let slot = slots.get("k").expect("slot present");
+            assert_eq!(slot.weight, ShutdownWeight(7), "weight restored to prior");
+            assert_eq!(
+                slot.join_budget,
+                Duration::from_secs(11),
+                "join_budget restored to prior"
+            );
+            assert!(
+                slot.drain.is_some(),
+                "prior drain hook restored, not the failed start's None"
+            );
+            assert_eq!(
+                slot.generation, gen_after_gen1,
+                "generation rolled back to its pre-bump value"
+            );
+            assert!(
+                slot.cancel.is_none(),
+                "running flag cleared after failed spawn"
+            );
+            assert!(
+                slot.handle.is_some(),
+                "prior handle re-installed (alive), not detached"
+            );
+        }
+        assert!(reg.any_alive(), "live prior still accounted for");
+
+        // Recover: release + quiesce reaps the prior cleanly.
+        release_tx.send(()).unwrap();
+        assert_eq!(reg.quiesce("k").await, WorkerStatus::Ok);
+        assert!(!reg.any_alive());
+    }
+
     /// A panicking worker body still runs its epilogue (via the drop-guard),
     /// so `is_running()` reflects the crash and `start()` can relaunch the
     /// loop instead of silently no-op'ing.
@@ -1737,9 +1815,9 @@ mod tests {
         assert!(!reg.any_alive(), "nothing started post-shutdown");
     }
 
-    /// [F3 FIX] `start_thread` must park a restarted key's still-wedged prior
-    /// into the orphan list UNDER the slot lock — at the START of the
-    /// restart, not only after the out-of-lock reap backstop elapses.
+    /// `start_thread` must park a restarted key's still-wedged prior into the
+    /// orphan list UNDER the slot lock — at the START of the restart, not only
+    /// after the out-of-lock reap backstop elapses.
     /// Otherwise a `shutdown()` that snapshots tiers in the window between
     /// "prior taken out of the slot" and "prior parked" sees neither the
     /// prior (already moved out of the slot) nor an orphan, and reports

From 3821389cfd5d603585bff5a52c3e25a0b91fa4d5 Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:19:18 +0200
Subject: [PATCH 28/29] docs(swift-sdk): broaden deinit comment for
 shielded_sync_stop's orphan-trigger contract

shielded_sync_stop now also returns .errorShutdownIncomplete when the drain was clean but a prior-generation shielded thread is still parked alive as an orphan (not only when the in-flight drain times out). Update the manager deinit comment to reflect both triggers; behaviour unchanged (the deinit already leaks one strong ref to the handlers on .errorShutdownIncomplete). Comment-only.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../PlatformWallet/PlatformWalletManager.swift    | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
index 36bafa37d1..34137d15e5 100644
--- a/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
+++ b/packages/swift-sdk/Sources/SwiftDashSDK/PlatformWallet/PlatformWalletManager.swift
@@ -162,12 +162,15 @@ public class PlatformWalletManager: ObservableObject {
         // Capture the CODE (not just free the message) for the two calls
         // that CAN report `.errorShutdownIncomplete`: `shielded_sync_stop`
         // and `destroy`. Rust returns that code when a background
-        // coordinator did not drain within the join deadline — meaning a
-        // lingering `!Send` coordinator thread may still hold the
-        // `passUnretained` context pointers Rust was handed for our
-        // `persistenceHandler` / `eventHandler` and fire ONE final callback
-        // through them. The contract: on that code the host must NOT free
-        // the callback context immediately.
+        // coordinator did not drain within the join deadline, OR — for
+        // `shielded_sync_stop` — when the drain was clean but a prior-
+        // generation shielded thread is still parked alive as an orphan
+        // (left by a tight `stop()`→`start()` reap that had to detach it past
+        // the wedge backstop). In either case a lingering `!Send` coordinator
+        // thread may still hold the `passUnretained` context pointers Rust
+        // was handed for our `persistenceHandler` / `eventHandler` and fire
+        // ONE final callback through them. The contract: on that code the
+        // host must NOT free the callback context immediately.
         let shieldedStopCode =
             platform_wallet_manager_shielded_sync_stop(handle).discardReturningCode()
         let destroyCode =

From 748c4f826c24c0093743d66d3dc5f9157448a07b Mon Sep 17 00:00:00 2001
From: Lukasz Klimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:28:19 +0200
Subject: [PATCH 29/29] fix(platform-wallet): make the quiescing<->is_syncing
 handshake self-fencing (SeqCst)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEC-003: the pass-gate handshake is a Dekker-style StoreLoad pair across two distinct atomics — quiesce does store(quiescing) … load(is_syncing); begin_pass does CAS(is_syncing) … load(quiescing). Release/Acquire do NOT order StoreLoad across separate locations, so by the annotations alone both sides could miss each other (begin_pass reads a stale quiescing==false and runs a pass past a raised gate while the drain reads a stale is_syncing==false and returns). It was sound only incidentally — registry.quiesce happens to take the slots Mutex (a fence) before returning; a lock-free refactor of that path would make the race live.

Promote the four handshake ops to SeqCst (a single total order guarantees at least one side observes the other): the gate-raise stores in quiesce, hold_quiescing_gate, and the registry drain hook; the is_syncing CAS (success) and quiescing load in begin_pass; and the is_syncing load in the drain. Gate lowering (reopen / RAII drop) and observational reads stay Release/Acquire — a stale-high gate read only makes a pass bail conservatively. Fix-1's gate-before-cancel + never-latched invariant is preserved (SeqCst is strictly stronger than the prior Release). Added a load-bearing-ordering comment at begin_pass. Not unit-testable (ordering); the existing gate/drain handshake tests still pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_0157yd3YvWeyckhfQivS9gf7
---
 .../src/manager/coordinator_lifecycle.rs      | 37 ++++++++++++++++---
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
index ebcc73419c..87e20fa6e5 100644
--- a/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
+++ b/packages/rs-platform-wallet/src/manager/coordinator_lifecycle.rs
@@ -123,7 +123,9 @@ impl CoordinatorLifecycle {
         Arc::new(move || {
             let quiescing = Arc::clone(&quiescing);
             Box::pin(async move {
-                quiescing.store(true, Ordering::Release);
+                // SeqCst: store-half of the `quiescing`<->`is_syncing`
+                // handshake (see `begin_pass`).
+                quiescing.store(true, Ordering::SeqCst);
             })
         })
     }
@@ -163,7 +165,9 @@ impl CoordinatorLifecycle {
     /// not apply to it.
     pub(crate) async fn quiesce(&self) -> CoordinatorThreadStatus {
         // Gate up first (instant) and held until the guard drops on return.
-        self.quiescing.store(true, Ordering::Release);
+        // SeqCst: store-half of the `quiescing`<->`is_syncing` handshake
+        // (see `begin_pass`).
+        self.quiescing.store(true, Ordering::SeqCst);
         let _quiescing_gate = AtomicFlagGuard::new(&self.quiescing);
         self.cancel_join_and_drain().await
     }
@@ -219,7 +223,11 @@ impl CoordinatorLifecycle {
     /// design — the caller bounds the whole teardown (the FFI `stop` /
     /// `clear` bridges wrap it in a `SHUTDOWN_JOIN_TIMEOUT_SECS` timeout).
     async fn drain_in_flight_pass(&self) {
-        while self.is_syncing.load(Ordering::Acquire) {
+        // SeqCst: load-half of the `quiescing`<->`is_syncing` handshake (see
+        // `begin_pass`). Pairs with `begin_pass`'s SeqCst CAS so a pass that
+        // claimed the slot just as the gate rose is observed here and waited
+        // out, rather than slipping past a stale read of `is_syncing`.
+        while self.is_syncing.load(Ordering::SeqCst) {
             tokio::time::sleep(Duration::from_millis(5)).await;
         }
     }
@@ -233,7 +241,10 @@ impl CoordinatorLifecycle {
     /// gate tests also exercise it.
     #[cfg(any(test, feature = "shielded"))]
     pub(crate) fn hold_quiescing_gate(&self) -> AtomicFlagGuard<'_> {
-        self.quiescing.store(true, Ordering::Release);
+        // SeqCst: store-half of the `quiescing`<->`is_syncing` handshake (see
+        // `begin_pass`). The Clear flow raises the gate through here, so this
+        // raise must be self-fencing just like `quiesce`'s.
+        self.quiescing.store(true, Ordering::SeqCst);
         AtomicFlagGuard::new(&self.quiescing)
     }
 
@@ -245,9 +256,22 @@ impl CoordinatorLifecycle {
     /// released before returning (the guard drops), so a later post-quiesce
     /// pass can still run.
     pub(crate) fn begin_pass(&self) -> Option<AtomicFlagGuard<'_>> {
+        // LOAD-BEARING MEMORY ORDERING: the `is_syncing` claim (this CAS) and
+        // the `quiescing` gate read below form a Dekker-style mutual-exclusion
+        // handshake with `quiesce`'s `store(quiescing) … load(is_syncing)`.
+        // The guarantee we need is that a teardown and a pass-entry can never
+        // BOTH miss each other — either this pass observes the raised gate and
+        // bails, or the drain observes our `is_syncing` claim and waits it
+        // out. That is a StoreLoad relationship across two distinct atomics,
+        // which Release/Acquire do NOT order; only SeqCst (a single total
+        // order over all four ops) does. So the CAS *store* here, the gate
+        // load here, and the matching `store(quiescing)` / `load(is_syncing)`
+        // on the teardown side are all `SeqCst`. (The slots `Mutex` taken by
+        // `registry.quiesce` also fences this today, but that is incidental —
+        // relying on it would leave this silently fragile to a lock-free refactor.)
         if self
             .is_syncing
-            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
+            .compare_exchange(false, true, Ordering::SeqCst, Ordering::Acquire)
             .is_err()
         {
             return None;
@@ -261,7 +285,8 @@ impl CoordinatorLifecycle {
         // A `quiesce` may have raised the gate between our CAS and here; if
         // so, bail (dropping `guard`, which clears the slot) so the drain
         // can complete and teardown gets a true "no further pass" barrier.
-        if self.quiescing.load(Ordering::Acquire) {
+        // SeqCst — load-half of the handshake described above.
+        if self.quiescing.load(Ordering::SeqCst) {
             return None;
         }
         Some(guard)