From 4d3b165945b7dde5e0a8eea6f8297deb47a64b2f Mon Sep 17 00:00:00 2001 From: "SYM.BOT" Date: Wed, 29 Apr 2026 13:54:27 +0100 Subject: [PATCH] fix: dedup simultaneous-dial sessions in addPeer to stop connection-flap loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two peers Bonjour-discover each other within ~50ms on a LAN and both initiate outbound TCP, each side ends up with two handshaked sessions for the same nodeId (one outbound, one inbound). Prior to this change the second handshake silently overwrote the first in `peers[nodeId].transports` without cancelling its NWConnection. The orphaned session kept a live read loop, eventually fired `didDisconnectWith`, and `removeTransport` stripped the surviving winner from the dict — killing the healthy connection ~1–2s after handshake. Symptom from the field: `[SYM] session: handshake complete` → `[SYM] peer: connected` → `cannot add handler to N from M - dropping` (libdispatch warning from the zombie's read loop) → `session: disconnected: Connection closed` → 6s pending → reconnect, repeat. No application payloads delivered across the affected connection during any cycle. Fix: `addPeer` now detects a prior bonjour transport and applies a deterministic tie-break — the lower nodeId acts as client and keeps its outbound; the higher nodeId keeps the matching inbound. Both peers compute the same physical-connection winner without exchanging coordination frames. The losing session has its delegate detached before `disconnect()` so its eventual cancellation cannot ripple back through `removeTransport` and clobber the winner. Tie-break extracted as `SymNode.preferNewSessionInDualDial(...)` static helper so it is unit-testable without network mocking. Two new tests: - `testBothPeersPickSamePhysicalConnection` — symmetry: A's outbound is the same socket as B's inbound; both peers must agree. - `testLowerNodeIdKeepsOutbound` — anchors the convention so future refactors don't silently flip it. 70/70 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- Sources/SYM/SymNode.swift | 95 +++++++++++++++++++++++++++++++++-- Tests/SYMTests/SYMTests.swift | 84 +++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 4 deletions(-) diff --git a/Sources/SYM/SymNode.swift b/Sources/SYM/SymNode.swift index c488e0e..8876e78 100644 --- a/Sources/SYM/SymNode.swift +++ b/Sources/SYM/SymNode.swift @@ -942,20 +942,100 @@ public final class SymNode { // MARK: - Multi-Transport Peer Management (MMP Section 4.6) + /// Deterministic tie-break for simultaneous-dial collisions. Both peers + /// observe the same `(localNodeId, remoteNodeId)` pair (with the local + /// and remote roles swapped on each side) and call this function with + /// their respective `newIsOutbound` value. The function picks the same + /// *physical* TCP connection on both sides — the outbound from the + /// lower nodeId, which is the inbound on the higher nodeId — so neither + /// peer needs to coordinate via a frame exchange. + /// + /// Returns `true` if the new session should replace the prior one, + /// `false` if the prior should be kept and the new session disconnected. + /// Internal-access for `@testable` unit tests. + static func preferNewSessionInDualDial( + localNodeId: String, + remoteNodeId: String, + newIsOutbound: Bool + ) -> Bool { + let localIsClient = localNodeId < remoteNodeId + let keepOutbound = localIsClient + return (keepOutbound == newIsOutbound) + } + + /// Outcome of the addPeer state-machine. Threading note: the dict + /// mutation happens inside `peerQueue.sync`; the cleanup actions + /// (cancelling losers, sending frames, emitting events) happen after + /// the lock is released so we don't hold `peerQueue` across I/O. + private enum AddPeerOutcome { + case firstTime // never seen before — new peer-joined + case secondaryTransport // existing peer, no prior bonjour + case replacedPriorBonjour(prev: SymPeerSession) // simultaneous dial: new wins + case rejectedNew // simultaneous dial: existing wins + } + private func addPeer(_ session: SymPeerSession, nodeId: String, peerName: String, isOutbound: Bool) { - let isNew = peerQueue.sync { () -> Bool in + let outcome: AddPeerOutcome = peerQueue.sync { if var existing = self.peers[nodeId] { - // Section 4.6: add Bonjour as secondary transport + if let prev = existing.transports["bonjour"] ?? nil { + // Simultaneous-dial collision: both peers Bonjour-discovered + // each other within ~50ms and both initiated outbound TCP. Each + // side now holds two handshaked sessions for the same nodeId + // (one outbound, one inbound). Without explicit dedup the new + // session silently overwrites the prior in the transports dict + // — the orphan keeps a live NWConnection + read loop, eventually + // fires didDisconnectWith, and `removeTransport` strips the + // surviving winner from the dict. Net effect: every connection + // dies within ~1–2s of handshake, application payloads never + // cross the wire. + // + // Tie-break deterministically by nodeId so both peers agree on + // the same physical connection without exchanging frames: the + // lower nodeId acts as client and keeps its outbound; the + // higher node keeps the matching inbound. The loser's delegate + // is detached before disconnect (see fall-through below) so its + // teardown can't ripple back through removeTransport. + let preferNew = SymNode.preferNewSessionInDualDial( + localNodeId: self.identity.nodeId, + remoteNodeId: nodeId, + newIsOutbound: isOutbound + ) + if preferNew { + existing.transports["bonjour"] = session + existing.lastSeen = Date() + self.peers[nodeId] = existing + return .replacedPriorBonjour(prev: prev) + } + return .rejectedNew + } + // Section 4.6: add Bonjour as secondary transport on top of relay existing.transports["bonjour"] = session existing.lastSeen = Date() self.peers[nodeId] = existing - return false + return .secondaryTransport } self.peers[nodeId] = PeerState( transports: ["bonjour": session], name: peerName, isOutbound: isOutbound, lastSeen: Date() ) - return true + return .firstTime + } + + switch outcome { + case .rejectedNew: + // Detach the delegate so this session's eventual cancellation does NOT + // fire didDisconnectWith → removeTransport, which would strip the + // still-registered winner from the transports dict. + logger.info("[SYM] peer: simultaneous-dial dedup — keeping prior, cancelling redundant \(isOutbound ? "outbound" : "inbound") to \(peerName)") + session.delegate = nil + session.disconnect() + return + case .replacedPriorBonjour(let prev): + logger.info("[SYM] peer: simultaneous-dial dedup — replacing prior with new \(isOutbound ? "outbound" : "inbound") to \(peerName)") + prev.delegate = nil + prev.disconnect() + case .secondaryTransport, .firstTime: + break } // Handshake was already sent in sessionDidBecomeReady (on TCP connect). @@ -964,6 +1044,13 @@ public final class SymNode { session.send(.wakeChannel(platform: wc.platform, token: wc.token, environment: wc.environment)) } + let isNew: Bool + switch outcome { + case .firstTime: isNew = true + case .secondaryTransport, .replacedPriorBonjour: isNew = false + case .rejectedNew: return // unreachable, returned above + } + if isNew { let knownPeers = buildPeerGossip(excluding: nodeId) if !knownPeers.isEmpty { session.send(.peerInfo(peers: knownPeers)) } diff --git a/Tests/SYMTests/SYMTests.swift b/Tests/SYMTests/SYMTests.swift index 7c178c5..724f139 100644 --- a/Tests/SYMTests/SYMTests.swift +++ b/Tests/SYMTests/SYMTests.swift @@ -450,3 +450,87 @@ final class NodeTests: XCTestCase { XCTAssertEqual(a.nodeId, b.nodeId, "same name should produce same nodeId") } } + +// MARK: - Simultaneous-Dial Dedup Tests +// +// Both peers Bonjour-discover each other within ~50ms on a LAN and both +// initiate outbound TCP. The handshakes complete on both sides, so each +// node holds two sessions for the same nodeId (one outbound, one inbound). +// `preferNewSessionInDualDial` is the tie-break that lets each peer +// independently agree on which physical TCP connection survives, without +// exchanging coordination frames. +// +// The critical correctness property: for the same (A, B) pair, peer A's +// decision and peer B's decision must select the SAME physical +// connection — A's outbound is also B's inbound, so if A picks "keep my +// outbound," B must pick "keep my inbound" for that same socket pair. +// +// If both peers picked their own outbound, both would tear down the +// same socket pair from opposite ends and the connection would die. +// If both peers picked their own inbound, the symmetric problem occurs. + +final class SimultaneousDialDedupTests: XCTestCase { + + /// Both peers MUST agree on the same physical connection. Models the + /// dual-dial scenario for an arbitrary (A, B) pair: A's outbound is + /// the same TCP socket as B's inbound. + func testBothPeersPickSamePhysicalConnection() { + let pairs: [(String, String)] = [ + ("aaaa", "bbbb"), // ascii lower + ("019dd87c", "019dd87d"), // realistic uuid7-prefix neighbors + ("zzzz", "aaaa"), // reverse ordered + ("00000000-0000-7000-8000-000000000001", + "00000000-0000-7000-8000-000000000002"), // full uuids + ] + + for (a, b) in pairs { + // From A's perspective: A's outbound is the same socket as B's inbound. + let aPrefersOwnOutbound = SymNode.preferNewSessionInDualDial( + localNodeId: a, remoteNodeId: b, newIsOutbound: true + ) + let aPrefersOwnInbound = SymNode.preferNewSessionInDualDial( + localNodeId: a, remoteNodeId: b, newIsOutbound: false + ) + // From B's perspective: B's inbound is A's outbound; B's outbound is A's inbound. + let bPrefersOwnOutbound = SymNode.preferNewSessionInDualDial( + localNodeId: b, remoteNodeId: a, newIsOutbound: true + ) + let bPrefersOwnInbound = SymNode.preferNewSessionInDualDial( + localNodeId: b, remoteNodeId: a, newIsOutbound: false + ) + + // Tie-break is total: each peer prefers exactly one direction. + XCTAssertNotEqual(aPrefersOwnOutbound, aPrefersOwnInbound, + "A's outbound vs inbound preference must be opposite for pair (\(a), \(b))") + XCTAssertNotEqual(bPrefersOwnOutbound, bPrefersOwnInbound, + "B's outbound vs inbound preference must be opposite for pair (\(a), \(b))") + + // A's outbound socket = B's inbound socket. If A keeps its outbound, + // B must keep its inbound (same socket); otherwise both peers tear + // down the same socket pair from opposite ends. + XCTAssertEqual(aPrefersOwnOutbound, bPrefersOwnInbound, + "A's outbound and B's inbound are the same socket — both peers must pick consistently for pair (\(a), \(b))") + XCTAssertEqual(aPrefersOwnInbound, bPrefersOwnOutbound, + "A's inbound and B's outbound are the same socket — both peers must pick consistently for pair (\(a), \(b))") + } + } + + /// The lower-nodeId-acts-as-client convention: the lower nodeId keeps + /// its outbound. Anchors the algorithm so future refactors don't + /// silently flip the convention (which would still be correct under + /// the symmetry test above, but would interop-break against any other + /// implementation that follows MMP convention). + func testLowerNodeIdKeepsOutbound() { + // a < b, so A is "client" and keeps its outbound. + let a = "00000000-0000-7000-8000-000000000001" + let b = "00000000-0000-7000-8000-000000000002" + XCTAssertTrue(SymNode.preferNewSessionInDualDial(localNodeId: a, remoteNodeId: b, newIsOutbound: true), + "lower nodeId should keep its outbound") + XCTAssertFalse(SymNode.preferNewSessionInDualDial(localNodeId: a, remoteNodeId: b, newIsOutbound: false), + "lower nodeId should reject its inbound") + XCTAssertFalse(SymNode.preferNewSessionInDualDial(localNodeId: b, remoteNodeId: a, newIsOutbound: true), + "higher nodeId should reject its outbound") + XCTAssertTrue(SymNode.preferNewSessionInDualDial(localNodeId: b, remoteNodeId: a, newIsOutbound: false), + "higher nodeId should keep its inbound") + } +}