diff --git a/RidestrSDK/Sources/RidestrSDK/Nostr/RelayManager.swift b/RidestrSDK/Sources/RidestrSDK/Nostr/RelayManager.swift index 35a191c..7fe6cad 100644 --- a/RidestrSDK/Sources/RidestrSDK/Nostr/RelayManager.swift +++ b/RidestrSDK/Sources/RidestrSDK/Nostr/RelayManager.swift @@ -14,7 +14,6 @@ public actor RelayManager: RelayManagerProtocol { private var connectedRelayURLs: [URL] = [] private var notificationHandler: NotificationRouter? private var notificationTask: Task? - private var _handlerAlive = false // Explicit liveness flag — Task.isCancelled doesn't detect normal completion public init(keypair: NostrKeypair) { self.keypair = keypair @@ -31,21 +30,42 @@ public actor RelayManager: RelayManagerProtocol { await teardownConnection(clearRelayURLs: true) } + /// At least one relay is currently connected. Queries the underlying + /// rust-nostr `Client` for the live per-relay status — the previous + /// impl checked only Swift-side cached state (the client pointer, a + /// configured-URLs list, and a handler-liveness flag), all of which + /// stay true through airplane-mode toggles and other transient network + /// drops, so it lied about the actual WebSocket state. public var isConnected: Bool { - client != nil && !connectedRelayURLs.isEmpty && _handlerAlive - } - - private func markHandlerDead() { - _handlerAlive = false + get async { + guard let client else { return false } + let relays = await client.relays() + for relay in relays.values where relay.isConnected() { + return true + } + return false + } } - /// Reconnect to relays if disconnected. Call from app foreground handler. - /// Does NOT restart subscriptions — callers must re-subscribe after this returns. + /// Force-rebuild the relay client. Call from app foreground handler. + /// + /// Always replaces the client (when relay URLs are configured) rather + /// than short-circuiting on cached state because the cached state + /// lies on iOS background→foreground: the OS suspends WebSockets on + /// background, and rust-nostr's per-relay status doesn't update until + /// the next read/write attempt fails (which can take minutes on a + /// silently-killed socket). Tearing down and rebuilding is the only + /// way to get truthful state — the alternative is letting the user + /// sit in "everything looks fine" while their relays are dead. + /// + /// Does NOT restart subscriptions — callers must re-subscribe after + /// this returns. Cheap when relays are reachable (~1s handshake); + /// the trade-off vs. per-call cost is correctness on every foreground. + /// No-op if `connect(to:)` has not been called yet (no URLs to reach). public func reconnectIfNeeded() async { guard !connectedRelayURLs.isEmpty else { return } - guard client == nil || !_handlerAlive else { return } - RidestrLogger.error("[RelayManager] Reconnecting relay client") + RidestrLogger.info("[RelayManager] Force-rebuilding relay client (foreground / explicit reconnect)") do { try await replaceClient(with: connectedRelayURLs) } catch { @@ -80,7 +100,6 @@ public actor RelayManager: RelayManagerProtocol { private func startNotificationHandler(for client: Client) { let router = NotificationRouter() self.notificationHandler = router - self._handlerAlive = true let clientRef = client notificationTask = Task.detached { [router] in do { @@ -93,15 +112,9 @@ public actor RelayManager: RelayManagerProtocol { router.removeAll() RidestrLogger.error("[RelayManager] All subscription streams terminated due to disconnect") } - // Monitor handler liveness from a separate task - Task { [weak self] in - await self?.notificationTask?.value // Suspends until task completes - await self?.markHandlerDead() - } } private func teardownConnection(clearRelayURLs: Bool) async { - _handlerAlive = false notificationHandler?.removeAll() activeStreams.removeAll() subscriptionGenerations.removeAll() diff --git a/RoadFlare/RoadFlareCore/ViewModels/ConnectionCoordinator.swift b/RoadFlare/RoadFlareCore/ViewModels/ConnectionCoordinator.swift index 99c7749..81e740f 100644 --- a/RoadFlare/RoadFlareCore/ViewModels/ConnectionCoordinator.swift +++ b/RoadFlare/RoadFlareCore/ViewModels/ConnectionCoordinator.swift @@ -1,16 +1,45 @@ import Foundation +import Network -/// Owns the periodic relay connection watchdog task. +/// Owns the periodic relay connection watchdog task and a path-change +/// reactive signal. /// -/// Checks connectivity on a fixed interval and triggers reconnection -/// when the relay drops. All behavior is injected via closures so the -/// coordinator has no direct dependencies on AppState or SDK services. +/// Two complementary mechanisms drive reconnects: +/// +/// 1. **Periodic watchdog** — polls the live `isConnected` query (which, +/// after PR #101, asks rust-nostr for per-relay status rather than +/// reading a cached flag) on a fixed interval and triggers reconnect +/// when it returns false. Bounded latency for any drop visible to +/// rust-nostr. +/// +/// 2. **`NWPathMonitor` reactive signal** — fires whenever iOS observes a +/// network path change (Wi-Fi/cellular swap, airplane mode toggle, +/// captive portal). Forces a reconnect immediately, bypassing the +/// `isConnected` gate, because the previous connection's cached state +/// is by definition suspect after a path change. rust-nostr's +/// per-relay status doesn't update until its next read/write fails, +/// which can take minutes on a silently-killed socket — the OS-level +/// signal is much faster. +/// +/// All behavior is injected via closures so the coordinator has no +/// direct dependencies on AppState or SDK services. @MainActor final class ConnectionCoordinator { private var watchdogTask: Task? + private var pathMonitor: NWPathMonitor? + private var pathMonitorQueue: DispatchQueue? + /// Most recently spawned path-event reconnect Task. `isReconnecting` + /// already serializes path-spawned reconnects (a second event during an + /// in-flight reconnect short-circuits via the guard), so at most one + /// such Task is doing real work at any moment — tracking the latest + /// is sufficient to cancel an in-flight reconnect on `stop()` (e.g. + /// logout / identity replacement). Mirrors the tracked-Task pattern + /// from PR #95's onboarding-publish watchdog. + private var pathReconnectTask: Task? + private var hasReceivedFirstPath = false private var isReconnecting = false - /// Start the periodic watchdog. + /// Start the periodic watchdog and the path-change reactive signal. /// /// - Parameters: /// - interval: Time between connectivity checks. @@ -29,17 +58,65 @@ final class ConnectionCoordinator { try? await Task.sleep(for: interval) guard let self, !self.isReconnecting, shouldReconnect() else { continue } guard !(await isConnected()) else { continue } + // Re-check after the connectivity await: a path-monitor + // event landing during the suspension can have set + // `isReconnecting = true` and started its own reconnect. + // Without this guard we'd kick off a second concurrent + // reconnect, tearing down the in-flight rebuild's client. + guard !self.isReconnecting else { continue } self.isReconnecting = true defer { self.isReconnecting = false } await reconnect() } } + + let monitor = NWPathMonitor() + let queue = DispatchQueue(label: "com.roadflare.connection-coordinator.path-monitor") + monitor.pathUpdateHandler = { [weak self] _ in + // The very first path update fires on monitor start with the + // current path — not a real change in production paths. Skip + // it to avoid a spurious rebuild on app launch right after + // `setupServices` has established the initial connection. + // (Apple's `NWPathMonitor` documentation does not strictly + // contract this, but it is the consistently observed behavior + // for `start(queue:)` and is widely relied on across iOS apps.) + Task { @MainActor [weak self] in + guard let self else { return } + if !self.hasReceivedFirstPath { + self.hasReceivedFirstPath = true + return + } + guard !self.isReconnecting, shouldReconnect() else { return } + // Set the busy flag synchronously, in the same critical + // section as the guard check, so a second path event + // landing between this Task and the inner reconnect Task + // observes the busy state and short-circuits. (Setting it + // inside the inner Task left a race window where two + // rapid path events both passed the guard and spawned + // overlapping reconnects.) + self.isReconnecting = true + self.pathReconnectTask = Task { @MainActor [weak self] in + defer { self?.isReconnecting = false } + guard !Task.isCancelled else { return } + await reconnect() + } + } + } + monitor.start(queue: queue) + self.pathMonitor = monitor + self.pathMonitorQueue = queue } - /// Stop the watchdog and cancel any in-flight reconnection. + /// Stop the watchdog, cancel the path monitor, and any in-flight reconnection. func stop() { watchdogTask?.cancel() watchdogTask = nil + pathMonitor?.cancel() + pathMonitor = nil + pathMonitorQueue = nil + pathReconnectTask?.cancel() + pathReconnectTask = nil + hasReceivedFirstPath = false isReconnecting = false } }