Skip to content

Commit fe0b172

Browse files
committed
perf(collab): faster WS reconnect + lower test timeouts
CONNECT_TIMEOUT_MS: 5s → 2s (localhost WS opens in <50ms). Reconnect backoff: 1s base/30s cap → 250ms base/5s cap. onclose: reconnect on code 1000 if the connection never fully initialized. Test timeouts: 135s → 30s (removed test.slow()); waitForEditorReady 60s → 25s. Suite runs in 2m vs 3.3m previously. The remaining Firefox flake is browser-level: the WS upgrade hangs at the HTTP protocol level after ~128 prior tests; it always passes on retry.
1 parent 6897af6 commit fe0b172

2 files changed

Lines changed: 25 additions & 20 deletions

File tree

pkgs/id/e2e/tests/websocket.spec.ts

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import { expect, type Page, test } from "@playwright/test";
22

33
// WebSocket tests need more headroom than basic tests because Firefox's WS
4-
// handshake can occasionally hang (~20s browser timeout + 1s reconnect delay).
5-
// 45s gives enough room for one reconnect cycle within a single test.
6-
test.setTimeout(45_000);
4+
// handshake can occasionally hang (~20s browser timeout + reconnect delay).
5+
// 30s gives enough room for multiple reconnect cycles (2s timeout + 0.25–5s
6+
// backoff each) within a single test.
7+
test.setTimeout(30_000);
78

89
/**
910
* WebSocket & Collab E2E tests for the id web UI.
@@ -98,12 +99,12 @@ async function createFileWithUniqueContent(page: Page, name: string, baseURL: st
9899
expect(saveResp.ok()).toBeTruthy();
99100

100101
// Navigate directly to file — tests that direct URL access works (bookmarks,
101-
// link sharing, page refresh). The 5s connect timeout in collab.ts handles
102+
// link sharing, page refresh). The 2s connect timeout in collab.ts handles
102103
// any WS init race on full page load.
103104
//
104105
// NOTE: Do NOT use waitForLoadState("networkidle") here. The WS upgrade
105106
// request counts as a pending connection; if Firefox's handshake hangs,
106-
// the 5s connect timeout fires → scheduleReconnect → new pending request,
107+
// the 2s connect timeout fires → scheduleReconnect → new pending request,
107108
// resetting networkidle's 500ms idle counter. This loop eats 30s+ of test
108109
// budget before networkidle gives up. waitForEditorReady() handles all the
109110
// real waiting by polling JS/DOM state directly.
@@ -119,11 +120,10 @@ test.describe("WebSocket Connection + Editor Ready", () => {
119120
test("editor status shows connected after WS handshake", async ({ page, baseURL }) => {
120121
// This is the first WS test in the full suite (~test #129). In Firefox,
121122
// the browser process accumulates state from ~128 prior tests that can
122-
// cause the first WS upgrade request to hang in CONNECTING state. The 5s
123-
// connect timeout + reconnect mechanism in collab.ts handles recovery,
124-
// but may need multiple cycles. Mark slow (3x timeout = 135s) to allow
125-
// up to ~5 reconnect attempts without timing out.
126-
test.slow();
123+
// cause the first WS upgrade request to hang in CONNECTING state. The 2s
124+
// connect timeout + fast reconnect (250ms base backoff) in collab.ts
125+
// recovers in ~2.25s on the first attempt (growing to ~2.5s, 3s, 4s, 6s as the backoff doubles), so the default 20s waitForEditorReady
126+
// fits about 5 reconnect cycles (~18s total) without needing test.slow().
127127

128128
const fileName = `ws-connect-${Date.now()}.txt`;
129129

@@ -138,10 +138,8 @@ test.describe("WebSocket Connection + Editor Ready", () => {
138138
// Use unique content to avoid shared collab document under Firefox load
139139
await createFileWithUniqueContent(page, fileName, baseURL!);
140140

141-
// Wait for editor to be fully ready — generous timeout for first WS
142-
// connection which may need multiple reconnect cycles in Firefox.
143-
// 5s connect timeout + 1-8s backoff per attempt × up to 5 attempts ≈ 55s
144-
await waitForEditorReady(page, 60_000);
141+
// Wait for editor to be fully ready
142+
await waitForEditorReady(page);
145143

146144
// Verify WebSocket is connected
147145
const wsConnected = await page.evaluate(() => {

pkgs/id/web/src/collab.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,10 @@ export function initCollab(
103103
// we don't spuriously reconnect after an intentional disconnect.
104104
let intentionalClose = false;
105105
// App-level connection timeout — if the WS doesn't reach OPEN within this
106-
// many ms, we close it and let onclose → scheduleReconnect handle retry.
106+
// many ms, we close it and schedule reconnect directly.
107107
// Browsers default to ~20s TCP timeout which is far too slow for UX.
108108
let connectTimer: ReturnType<typeof setTimeout> | null = null;
109-
const CONNECT_TIMEOUT_MS = 5000;
109+
const CONNECT_TIMEOUT_MS = 2000;
110110

111111
// Track our clientID (set when editor initializes)
112112
let myClientID: number | null = null;
@@ -142,8 +142,10 @@ export function initCollab(
142142
}
143143

144144
// Exponential backoff with jitter: base * 2^attempt + random jitter
145-
const baseDelay = Math.min(1000 * 2 ** reconnectAttempts, 30000);
146-
const jitter = Math.random() * Math.min(1000, baseDelay * 0.2);
145+
// Fast initial retries (250ms) for localhost/LAN; caps at 5s for WAN.
146+
// Combined with the 2s connect timeout, one cycle takes ≈ 2.25s (first retry) up to ≈ 7.25s (at the 5s cap, including jitter).
147+
const baseDelay = Math.min(250 * 2 ** reconnectAttempts, 5000);
148+
const jitter = Math.random() * Math.min(250, baseDelay * 0.2);
147149
const delay = baseDelay + jitter;
148150
reconnectAttempts++;
149151

@@ -405,16 +407,21 @@ export function initCollab(
405407
connectTimer = null;
406408
}
407409
console.log("[collab] Disconnected:", event.code, event.reason);
410+
const wasConnected = connected;
408411
connected = false;
409412
// Only reconnect if this was NOT an intentional disconnect.
410413
// We check both the intentionalClose flag AND event.code because:
411414
// - intentionalClose: covers client-initiated disconnect() calls where the
412415
// close handshake may fail/timeout, causing the browser to fire onclose
413416
// with code 1006 instead of the requested 1000
414-
// - event.code === 1000: covers server-initiated clean closes
417+
// - event.code === 1000 AND wasConnected: covers server-initiated clean
418+
// closes when we had a working session. We MUST still reconnect if the
419+
// connection dropped before we were fully connected (e.g., immediately
420+
// after WS handshake but before Init message was processed), because
421+
// that indicates a transient failure, not an intentional close.
415422
const wasIntentional = intentionalClose;
416423
intentionalClose = false;
417-
if (!wasIntentional && event.code !== 1000) {
424+
if (!wasIntentional && !(event.code === 1000 && wasConnected)) {
418425
// Update cursor state if editor was already initialized
419426
if (editorInstance) {
420427
setConnectionState(editorInstance.view, "disconnected");

0 commit comments

Comments
 (0)