diff --git a/CHANGELOG.md b/CHANGELOG.md index ec218ea..e8e19e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,42 @@ > **Note:** Versions 0.3.24 – 0.3.54 were released as git tags without changelog entries. Changelog resumes at 0.3.55 below. +## 0.3.81 + +### Fixed + +- **TCP keepalive on every NWConnection** — outbound sessions + (`SymPeerSession.init(outboundTo:)` / `init(remoteHost:port:)`) and + inbound connections accepted by `NWListener` (`SymDiscovery.startListener`) + now use `NWParameters` with `NWProtocolTCP.Options.enableKeepalive = true`, + `keepaliveIdle = 1`, `keepaliveInterval = 1`, `keepaliveCount = 3`. Dead + remote ends (peer process killed without graceful FIN — common on iOS app + suspension and Mac Catalyst rebuilds) are now reaped within ~4 seconds + instead of waiting for macOS default `TCP_KEEPALIVE = 7200s` (2 hours). + + Without this, a peer that crashes or restarts leaves the survivor with + an ESTABLISHED TCP socket that the OS doesn't reap for hours. The + `addPeer` dedup logic then keeps rejecting the live new dial against + the zombie entry, producing a permanent connection-flap loop visible in + the field as "peer joins, immediately drops, retries, repeat" — most + commonly hit on iPhone↔Mac-Catalyst pairs after either side rebuilds. + + Mirrors the fix shipped in `@sym-bot/sym` v0.5.3 on the Node side so + cross-runtime peers (sym-swift ↔ sym-node) recover symmetrically from + peer restarts. + +- **lastSeen-aware stale-prior detection in `addPeer` dedup.** A peer + entry whose `lastSeen` is older than `staleAfterSeconds` (10s, matching + Node SDK's `_heartbeatInterval` default) is now treated as stale and + the new dial replaces it, regardless of dual-dial tie-break or + same-direction-duplicate logic. The remote re-dialling is itself + evidence its prior is dead — rejecting blocks legitimate reconnects + after a peer restart for as long as the OS holds the dead socket. 
+ + Combined with the TCP keepalive above, recovery from peer restart is + now seconds, not hours. Mirrors `@sym-bot/sym` v0.5.3 dedup-path + staleness check. + ## 0.3.77 ### Added diff --git a/Sources/SYM/SymDiscovery.swift b/Sources/SYM/SymDiscovery.swift index 0909ce1..5e15340 100644 --- a/Sources/SYM/SymDiscovery.swift +++ b/Sources/SYM/SymDiscovery.swift @@ -88,7 +88,13 @@ final class SymDiscovery { private func startListener() { do { - let parameters = NWParameters.tcp + // Keepalive applied to the listener parameters so accepted inbound + // NWConnections inherit it. Without this, a peer that crashes leaves + // us with a dead-but-ESTABLISHED inbound socket that survives ~2h + // before macOS keepalive reaps it — the addPeer dedup logic then + // rejects every live redial against the zombie. See SymPeerSession + // tcpParametersWithKeepalive() for full rationale. + let parameters = SymPeerSession.tcpParametersWithKeepalive() parameters.includePeerToPeer = true let listener = try NWListener(using: parameters) diff --git a/Sources/SYM/SymNode.swift b/Sources/SYM/SymNode.swift index e64e076..aa1b9b1 100644 --- a/Sources/SYM/SymNode.swift +++ b/Sources/SYM/SymNode.swift @@ -974,6 +974,21 @@ public final class SymNode { case rejectedNew // simultaneous dial: existing wins } + /// Stale-prior detection threshold for dedup. If the existing peer entry + /// has not been touched within this many seconds, treat it as stale and + /// let the new connection replace it regardless of dedup tie-break. + /// Matches `@sym-bot/sym` v0.5.3 `_heartbeatInterval` default (10s) so + /// cross-runtime peers agree on the same staleness window. + /// + /// Without this, a peer process killed without graceful FIN (iOS app + /// suspension, Mac Catalyst rebuild, network drop) leaves the survivor + /// with a dead-but-ESTABLISHED TCP entry that the OS doesn't reap for + /// hours. The dedup logic then keeps rejecting the live new dial. 
+ /// TCP keepalive (set in SymPeerSession.tcpParametersWithKeepalive) + /// reaps an idle dead socket within ~4s; whenever it has not torn the session down yet, the lastSeen-age check is + /// the application-level guard. NOTE(review): the threshold equals the remote heartbeat interval and the comparison is a strict `>`, so a live but quiet peer may look stale just before its next heartbeat — confirm lastSeen is refreshed by other traffic too. + static let staleAfterSeconds: TimeInterval = 10 + private func addPeer(_ session: SymPeerSession, nodeId: String, peerName: String, isOutbound: Bool) { let outcome: AddPeerOutcome = peerQueue.sync { if var existing = self.peers[nodeId] { @@ -998,20 +1013,33 @@ public final class SymNode { // disconnect the wire pair on the remote side and trigger // a peer-left storm. Always reject the duplicate. // + // BUT: both cases assume the prior is alive. A prior that + // hasn't been touched within `staleAfterSeconds` is treated + // as stale and replaced — the remote re-dialling is itself + // strong evidence its prior is dead, and rejecting blocks + // legitimate reconnects after a peer restart for hours + // until OS keepalive reaps the zombie. + // // The losing session has its delegate detached before disconnect // (see fall-through below) so its teardown can't ripple through // removeTransport and clobber the surviving registered session. - let isDualDial = prev.isOutbound != session.isOutbound + let staleByLastSeen = Date().timeIntervalSince(existing.lastSeen) > Self.staleAfterSeconds let preferNew: Bool - if isDualDial { - preferNew = SymNode.preferNewSessionInDualDial( - localNodeId: self.identity.nodeId, - remoteNodeId: nodeId, - newIsOutbound: isOutbound - ) + if staleByLastSeen { + // Prior is stale — the new dial is the live one. Replace. + preferNew = true } else { - // Same direction → keep the established prior, reject duplicate. - preferNew = false + let isDualDial = prev.isOutbound != session.isOutbound + if isDualDial { + preferNew = SymNode.preferNewSessionInDualDial( + localNodeId: self.identity.nodeId, + remoteNodeId: nodeId, + newIsOutbound: isOutbound + ) + } else { + // Same direction → keep the established prior, reject duplicate. 
+ preferNew = false + } } if preferNew { existing.transports["bonjour"] = session diff --git a/Sources/SYM/SymPeerSession.swift b/Sources/SYM/SymPeerSession.swift index aef6ee7..45e2cff 100644 --- a/Sources/SYM/SymPeerSession.swift +++ b/Sources/SYM/SymPeerSession.swift @@ -89,10 +89,34 @@ final class SymPeerSession { // MARK: - Init + /// TCP parameters with aggressive keepalive. Default macOS TCP keepalive + /// is `TCP_KEEPALIVE = 7200s` (2 hours) before the first probe, which + /// means a dead-but-ESTABLISHED socket (peer process killed without + /// graceful FIN — common on iOS app suspension and Mac Catalyst rebuilds) + /// stays in ESTABLISHED state on the survivor side for hours. The + /// addPeer dedup logic then keeps rejecting the live new dial against + /// this zombie entry. + /// + /// Settings here mirror @sym-bot/sym v0.5.3 on the Node side: + /// 1s initial idle, 1s probe interval, 3 unanswered probes before the peer is declared dead → dead + /// sockets reaped in ~4 seconds instead of ~2 hours. Same fix shape + /// applied to both sides of the dual-runtime mesh so cross-runtime + /// peers (sym-swift ↔ sym-node) recover symmetrically from peer + /// restarts. If the transport protocol options are not `NWProtocolTCP.Options`, the parameters are returned unchanged (no keepalive). + static func tcpParametersWithKeepalive() -> NWParameters { + let params = NWParameters.tcp + if let tcp = params.defaultProtocolStack.transportProtocol as? NWProtocolTCP.Options { + tcp.enableKeepalive = true + tcp.keepaliveIdle = 1 + tcp.keepaliveInterval = 1 + tcp.keepaliveCount = 3 + } + return params + } + /// Outbound connection to a Bonjour endpoint. 
init(outboundTo endpoint: NWEndpoint, identity: SymIdentity) { - let parameters = NWParameters.tcp - self.connection = NWConnection(to: endpoint, using: parameters) + self.connection = NWConnection(to: endpoint, using: Self.tcpParametersWithKeepalive()) self.identity = identity self.isOutbound = true self.queue = DispatchQueue(label: "bot.sym.session.\(UUID().uuidString.prefix(8))", qos: .userInitiated) @@ -104,8 +128,7 @@ final class SymPeerSession { guard let nwPort = NWEndpoint.Port(rawValue: port) else { fatalError("[SYM] Invalid port: \(port)") } - let parameters = NWParameters.tcp - self.connection = NWConnection(host: nwHost, port: nwPort, using: parameters) + self.connection = NWConnection(host: nwHost, port: nwPort, using: Self.tcpParametersWithKeepalive()) self.identity = identity self.isOutbound = true self.queue = DispatchQueue(label: "bot.sym.session.\(UUID().uuidString.prefix(8))", qos: .userInitiated)