Skip to content
47 changes: 30 additions & 17 deletions RidestrSDK/Sources/RidestrSDK/Nostr/RelayManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ public actor RelayManager: RelayManagerProtocol {
private var connectedRelayURLs: [URL] = []
private var notificationHandler: NotificationRouter?
private var notificationTask: Task<Void, Never>?
private var _handlerAlive = false // Explicit liveness flag — Task.isCancelled doesn't detect normal completion

public init(keypair: NostrKeypair) {
self.keypair = keypair
Expand All @@ -31,21 +30,42 @@ public actor RelayManager: RelayManagerProtocol {
await teardownConnection(clearRelayURLs: true)
}

/// At least one relay is currently connected. Queries the underlying
/// rust-nostr `Client` for the live per-relay status — the previous
/// impl checked only Swift-side cached state (the client pointer, a
/// configured-URLs list, and a handler-liveness flag), all of which
/// stay true through airplane-mode toggles and other transient network
/// drops, so it lied about the actual WebSocket state.
public var isConnected: Bool {
client != nil && !connectedRelayURLs.isEmpty && _handlerAlive
}

private func markHandlerDead() {
_handlerAlive = false
get async {
guard let client else { return false }
let relays = await client.relays()
for relay in relays.values where relay.isConnected() {
return true
}
return false
}
}

/// Reconnect to relays if disconnected. Call from app foreground handler.
/// Does NOT restart subscriptions — callers must re-subscribe after this returns.
/// Force-rebuild the relay client. Call from app foreground handler.
///
/// Always replaces the client (when relay URLs are configured) rather
/// than short-circuiting on cached state because the cached state
/// lies on iOS background→foreground: the OS suspends WebSockets on
/// background, and rust-nostr's per-relay status doesn't update until
/// the next read/write attempt fails (which can take minutes on a
/// silently-killed socket). Tearing down and rebuilding is the only
/// way to get truthful state — the alternative is letting the user
/// sit in "everything looks fine" while their relays are dead.
///
/// Does NOT restart subscriptions — callers must re-subscribe after
/// this returns. Rebuilding is cheap when relays are reachable
/// (~1s handshake); that per-call cost buys truthful connection
/// state on every foreground.
/// No-op if `connect(to:)` has not been called yet (no URLs to reach).
public func reconnectIfNeeded() async {
guard !connectedRelayURLs.isEmpty else { return }
guard client == nil || !_handlerAlive else { return }

RidestrLogger.error("[RelayManager] Reconnecting relay client")
RidestrLogger.info("[RelayManager] Force-rebuilding relay client (foreground / explicit reconnect)")
do {
try await replaceClient(with: connectedRelayURLs)
} catch {
Expand Down Expand Up @@ -80,7 +100,6 @@ public actor RelayManager: RelayManagerProtocol {
private func startNotificationHandler(for client: Client) {
let router = NotificationRouter()
self.notificationHandler = router
self._handlerAlive = true
let clientRef = client
notificationTask = Task.detached { [router] in
do {
Expand All @@ -93,15 +112,9 @@ public actor RelayManager: RelayManagerProtocol {
router.removeAll()
RidestrLogger.error("[RelayManager] All subscription streams terminated due to disconnect")
}
// Monitor handler liveness from a separate task
Task { [weak self] in
await self?.notificationTask?.value // Suspends until task completes
await self?.markHandlerDead()
}
}

private func teardownConnection(clearRelayURLs: Bool) async {
_handlerAlive = false
notificationHandler?.removeAll()
activeStreams.removeAll()
subscriptionGenerations.removeAll()
Expand Down
89 changes: 83 additions & 6 deletions RoadFlare/RoadFlareCore/ViewModels/ConnectionCoordinator.swift
Original file line number Diff line number Diff line change
@@ -1,16 +1,45 @@
import Foundation
import Network

/// Owns the periodic relay connection watchdog task.
/// Owns the periodic relay connection watchdog task and a path-change
/// reactive signal.
///
/// Checks connectivity on a fixed interval and triggers reconnection
/// when the relay drops. All behavior is injected via closures so the
/// coordinator has no direct dependencies on AppState or SDK services.
/// Two complementary mechanisms drive reconnects:
///
/// 1. **Periodic watchdog** — polls the live `isConnected` query (which,
/// after PR #101, asks rust-nostr for per-relay status rather than
/// reading a cached flag) on a fixed interval and triggers reconnect
/// when it returns false. This bounds the detection latency for any
/// drop that rust-nostr's per-relay status can observe.
///
/// 2. **`NWPathMonitor` reactive signal** — fires whenever iOS observes a
/// network path change (Wi-Fi/cellular swap, airplane mode toggle,
/// captive portal). Forces a reconnect immediately, bypassing the
/// `isConnected` gate, because the previous connection's cached state
/// is by definition suspect after a path change. rust-nostr's
/// per-relay status doesn't update until its next read/write fails,
/// which can take minutes on a silently-killed socket — the OS-level
/// signal is much faster.
///
/// All behavior is injected via closures so the coordinator has no
/// direct dependencies on AppState or SDK services.
@MainActor
final class ConnectionCoordinator {
private var watchdogTask: Task<Void, Never>?
private var pathMonitor: NWPathMonitor?
private var pathMonitorQueue: DispatchQueue?
/// Most recently spawned path-event reconnect Task. `isReconnecting`
/// already serializes path-spawned reconnects (a second event during an
/// in-flight reconnect short-circuits via the guard), so at most one
/// such Task is doing real work at any moment — tracking the latest
/// is sufficient to cancel an in-flight reconnect on `stop()` (e.g.
/// logout / identity replacement). Mirrors the tracked-Task pattern
/// from PR #95's onboarding-publish watchdog.
private var pathReconnectTask: Task<Void, Never>?
private var hasReceivedFirstPath = false
private var isReconnecting = false

/// Start the periodic watchdog.
/// Start the periodic watchdog and the path-change reactive signal.
///
/// - Parameters:
/// - interval: Time between connectivity checks.
Expand All @@ -29,17 +58,65 @@ final class ConnectionCoordinator {
try? await Task.sleep(for: interval)
guard let self, !self.isReconnecting, shouldReconnect() else { continue }
guard !(await isConnected()) else { continue }
// Re-check after the connectivity await: a path-monitor
// event landing during the suspension can have set
// `isReconnecting = true` and started its own reconnect.
// Without this guard we'd kick off a second concurrent
// reconnect, tearing down the in-flight rebuild's client.
guard !self.isReconnecting else { continue }
self.isReconnecting = true
defer { self.isReconnecting = false }
await reconnect()
}
}

let monitor = NWPathMonitor()
let queue = DispatchQueue(label: "com.roadflare.connection-coordinator.path-monitor")
monitor.pathUpdateHandler = { [weak self] _ in
// The very first path update fires on monitor start with the
// current path — not a real change in production paths. Skip
// it to avoid a spurious rebuild on app launch right after
// `setupServices` has established the initial connection.
// (Apple's `NWPathMonitor` documentation does not strictly
// contract this, but it is the consistently observed behavior
// for `start(queue:)` and is widely relied on across iOS apps.)
Task { @MainActor [weak self] in
guard let self else { return }
if !self.hasReceivedFirstPath {
self.hasReceivedFirstPath = true
return
}
guard !self.isReconnecting, shouldReconnect() else { return }
// Set the busy flag synchronously, in the same critical
// section as the guard check, so a second path event
// landing between this Task and the inner reconnect Task
// observes the busy state and short-circuits. (Setting it
// inside the inner Task left a race window where two
// rapid path events both passed the guard and spawned
// overlapping reconnects.)
self.isReconnecting = true
self.pathReconnectTask = Task { @MainActor [weak self] in
defer { self?.isReconnecting = false }
guard !Task.isCancelled else { return }
await reconnect()
}
}
}
monitor.start(queue: queue)
self.pathMonitor = monitor
self.pathMonitorQueue = queue
}

/// Stop the watchdog and cancel any in-flight reconnection.
/// Stop the watchdog, cancel the path monitor, and any in-flight reconnection.
func stop() {
    // Periodic watchdog: stop polling connectivity.
    watchdogTask?.cancel()
    watchdogTask = nil

    // Reactive path signal: tear down the NWPathMonitor and drop its queue.
    pathMonitor?.cancel()
    pathMonitorQueue = nil
    pathMonitor = nil

    // Abandon any reconnect a path event spawned (e.g. logout / identity swap).
    pathReconnectTask?.cancel()
    pathReconnectTask = nil

    // Reset serialization / one-shot state so a later start() begins fresh.
    isReconnecting = false
    hasReceivedFirstPath = false
}
}