From a982eadcb762a81a338b0468edacebc245691657 Mon Sep 17 00:00:00 2001 From: Marshalleq <13308996+marshalleq@users.noreply.github.com> Date: Sat, 18 Apr 2026 12:33:53 +1200 Subject: [PATCH] fix(libp2p): default abortConnectionOnPingFailure to false MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A single 10s ping exchange failing is not reliable evidence that a connection is dead under real WAN conditions. Trans-continental RTT, event-loop contention, and transient network jitter routinely cause probes to fail. Yamux keepalive detects genuinely dead connections at the muxer layer without the aggressive RST cascade that conn.abort() triggers, so the upstream default was causing measurable self-harm at non-trivial scale. Deployment evidence: a five-host network spanning three continents (Germany, Australia, New Zealand) with yamux keepalive enabled showed 25–62 disconnects/hr/host under the upstream default. Setting abortConnectionOnPingFailure: false on all five hosts dropped the measured rate to 0/hr/host over multi-hour windows. No other change was required. Pcap confirmed the Node process itself sent the TCP RSTs mid-traffic, in the same event-loop tick as a preceding yamux GoAway burst rooted at connection-monitor.ts. Applications that still want the aggressive behaviour can opt in explicitly. Existing tests that relied on the old default have been updated to pass abortConnectionOnPingFailure: true explicitly, and a new regression test asserts the new default behaviour. Signed-off-by: Marshalleq <13308996+marshalleq@users.noreply.github.com> --- packages/libp2p/src/connection-monitor.ts | 16 +++++++++--- .../test/connection-monitor/index.spec.ts | 25 +++++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/packages/libp2p/src/connection-monitor.ts b/packages/libp2p/src/connection-monitor.ts index da24eaa04c..51e30a1fa8 100644 --- a/packages/libp2p/src/connection-monitor.ts +++ b/packages/libp2p/src/connection-monitor.ts @@ -11,7 +11,14 @@ const PROTOCOL_VERSION = '1.0.0' const PROTOCOL_NAME = 'ping' const PROTOCOL_PREFIX = 'ipfs' const PING_LENGTH = 32 -const DEFAULT_ABORT_CONNECTION_ON_PING_FAILURE = true +// Probe failures are not reliable evidence a connection is dead under real +// WAN conditions — trans-continental RTT, event-loop contention, and transient +// network jitter routinely cause a single 10s ping exchange to fail. Yamux +// keepalive detects genuinely dead connections at the muxer layer without +// tearing down the TCP socket, so aborting on a single probe failure is +// measurable self-harm at non-trivial scale. Applications that want the +// aggressive behaviour can still opt in by setting this to true. +const DEFAULT_ABORT_CONNECTION_ON_PING_FAILURE = false export interface ConnectionMonitorInit { /** @@ -38,9 +45,12 @@ export interface ConnectionMonitorInit { pingTimeout?: Omit /** - * If true, any connection that fails the ping will be aborted + * If true, any connection that fails the ping will be aborted. Defaults to + * false because a single 10s probe failure is not reliable evidence that + * the connection is dead under real WAN conditions. Set to true to restore + * the previous aggressive behaviour. * - * @default true + * @default false */ abortConnectionOnPingFailure?: boolean diff --git a/packages/libp2p/test/connection-monitor/index.spec.ts b/packages/libp2p/test/connection-monitor/index.spec.ts index 7fb906a77c..2afdfbac90 100644 --- a/packages/libp2p/test/connection-monitor/index.spec.ts +++ b/packages/libp2p/test/connection-monitor/index.spec.ts @@ -112,7 +112,8 @@ describe('connection monitor', () => { pingInterval: 50, pingTimeout: { maxTimeout: 50 - } + }, + abortConnectionOnPingFailure: true }) await start(monitor) @@ -133,7 +134,8 @@ describe('connection monitor', () => { it('should abort a connection that fails', async () => { monitor = new ConnectionMonitor(components, { - pingInterval: 10 + pingInterval: 10, + abortConnectionOnPingFailure: true }) await start(monitor) @@ -150,6 +152,25 @@ describe('connection monitor', () => { expect(connection.abort).to.have.property('called', true) }) + it('should not abort a connection that fails by default', async () => { + monitor = new ConnectionMonitor(components, { + pingInterval: 10 + }) + + await start(monitor) + + const connection = stubInterface() + connection.newStream.withArgs('/ipfs/ping/1.0.0').callsFake(async (protocols, opts) => { + throw new ConnectionClosedError('Connection closed') + }) + + components.connectionManager.getConnections.returns([connection]) + + await delay(500) + + expect(connection.abort).to.have.property('called', false) + }) + it('should not abort a connection that fails when abortConnectionOnPingFailure is false', async () => { monitor = new ConnectionMonitor(components, { pingInterval: 10,