From 696a3425d8f703bec6955e1bed59b5391de89542 Mon Sep 17 00:00:00 2001 From: Sam Xu Date: Tue, 26 May 2026 04:32:39 -0700 Subject: [PATCH 1/4] fix(db-pg): bump pool.max to 50 + add connectionTimeoutMillis=5s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend went unresponsive on app-dev 2026-05-26 — xcjsam couldn't load any pods, /api/pods + /api/messages hung indefinitely. Root cause: pg.Pool defaults are max=10 and connectionTimeoutMillis=0 (wait forever). The hourly summarizer fans out 60 summary.request events; with only 10 slots, the surge starves the pool and every subsequent pool.query() — including user-facing getAllPods — waits forever on connection acquire. UI shows perpetual loading with no diagnostic signal. - Default max=50 (Aiven dev plan supports 100+; 50 leaves headroom for the 60-event burst without claiming the entire connection budget). - Default connectionTimeoutMillis=5000ms — fails fast as a 5xx so the user sees an error rather than a perpetual "loading" state. Without this, an exhausted pool is indistinguishable from a slow query at the client. - Both tunable via env (PG_POOL_MAX, PG_POOL_CONNECT_TIMEOUT_MS); non-numeric / non-positive values fall through to the safe default rather than zeroing the pool. - Two new regression tests assert the default max is ≥50 and that connectionTimeoutMillis is finite — guards against a future zero-cap refactor regressing back to the hang-forever shape. Issue #454 (live incident triage). Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/__tests__/unit/config/db-pg.test.js | 19 +++++++++++++++ backend/config/db-pg.ts | 26 +++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/backend/__tests__/unit/config/db-pg.test.js b/backend/__tests__/unit/config/db-pg.test.js index 4aaf25490..370f7f15a 100644 --- a/backend/__tests__/unit/config/db-pg.test.js +++ b/backend/__tests__/unit/config/db-pg.test.js @@ -38,3 +38,22 @@ describe('connectPG', () => { expect(result).toBeNull(); }); }); + +describe('Pool config (#454 incident — pool exhaustion)', () => { + // Capture the constructor args passed to pg.Pool when db-pg.ts loads. + // The mock above intercepts the constructor; reading + // `require('pg').Pool.mock.calls[0][0]` gives us the config object the + // backend would have handed to a real Pool. + const getPoolArgs = () => require('pg').Pool.mock.calls[0][0]; + + it('sets a default pool max well above pg.Pool default of 10', () => { + const args = getPoolArgs(); + expect(args.max).toBeGreaterThanOrEqual(50); + }); + + it('sets a finite connectionTimeoutMillis (no infinite hang on saturation)', () => { + const args = getPoolArgs(); + expect(args.connectionTimeoutMillis).toBeGreaterThan(0); + expect(args.connectionTimeoutMillis).toBeLessThanOrEqual(60_000); + }); +}); diff --git a/backend/config/db-pg.ts b/backend/config/db-pg.ts index 316724a83..4f6565a83 100644 --- a/backend/config/db-pg.ts +++ b/backend/config/db-pg.ts @@ -12,14 +12,40 @@ interface PgConfig { port: number | string; database: string | undefined; ssl?: { rejectUnauthorized: boolean; ca: string } | false; + // Pool sizing — see #454 (2026-05-26 incident). pg.Pool defaults to + // max=10 and connectionTimeoutMillis=0 (wait forever). On any traffic + // surge — e.g. the hourly summarizer fanning out 60 summary.request + // events — concurrent pool.query() calls saturate the 10 slots and + // every subsequent caller hangs indefinitely instead of failing fast. + // User-facing endpoints (getAllPods, /api/messages) then appear to + // "freeze" with no diagnostic signal. Bumping max + adding an explicit + // connection-acquire timeout fixes both: more headroom for the burst, + // and a clear acquire-timeout error if it does saturate. + max: number; + connectionTimeoutMillis: number; } +// Defaults: max=50 (Aiven dev plan supports 100+ connections; 50 gives +// ample room for the 60-event summarizer burst without claiming the +// entire DB connection budget), connectionTimeoutMillis=5000ms (fail +// fast as a 5xx so the user sees an error rather than a perpetual +// "loading"). Operators can tune via env without rebuilding; +// non-numeric / non-positive env values fall through to the default +// rather than zeroing the pool. +const parsePoolInt = (raw: string | undefined, fallback: number): number => { + if (!raw) return fallback; + const n = Number(raw); + return Number.isFinite(n) && n > 0 ? n : fallback; +}; + const pgConfig: PgConfig = { user: process.env.PG_USER, password: process.env.PG_PASSWORD, host: process.env.PG_HOST, port: process.env.PG_PORT || 5432, database: process.env.PG_DATABASE, + max: parsePoolInt(process.env.PG_POOL_MAX, 50), + connectionTimeoutMillis: parsePoolInt(process.env.PG_POOL_CONNECT_TIMEOUT_MS, 5000), }; if (process.env.PG_SSL_CA_PATH) { From 6fb5da3a53604454c113f24bc00f97ec96098dec Mon Sep 17 00:00:00 2001 From: Sam Xu Date: Tue, 26 May 2026 18:00:37 -0700 Subject: [PATCH 2/4] docs(runbooks): pg-pool-exhaustion diagnosis + recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the 2026-05-26 incident: probe localhost → exec one-off node → read logs for surge trigger → kubectl rollout restart → structural fix in PR #455. Pairs with the bumped Pool defaults to make future repros fast to triage. Pointer added in commonly-skills/devops/SKILL.md (separate repo). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/runbooks/pg-pool-exhaustion.md | 71 +++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 docs/runbooks/pg-pool-exhaustion.md diff --git a/docs/runbooks/pg-pool-exhaustion.md b/docs/runbooks/pg-pool-exhaustion.md new file mode 100644 index 000000000..245cf9391 --- /dev/null +++ b/docs/runbooks/pg-pool-exhaustion.md @@ -0,0 +1,71 @@ +# PG connection pool exhaustion — diagnosis + recovery + +**Symptom**: User-facing PG-backed endpoints (`/api/pods`, `/api/messages/:podId`) hang indefinitely. UI shows perpetual loading. Backend CPU + memory normal. Other endpoints (mongo-backed: `/api/posts`, `/api/auth/me`) respond fast. + +## Diagnosis flow + +1. **Confirm reachability** — probe a mongo-backed endpoint from inside the cluster. If it responds, the backend is up and only PG-backed paths are affected. + + ```bash + TOKEN= + kubectl exec -n commonly-dev deploy/backend -- bash -c \ + "curl -sS -m 5 -H 'Authorization: Bearer $TOKEN' \ + 'http://localhost:5000/api/posts?limit=2' \ + -w 'status=%{http_code} ttfb=%{time_starttransfer}s\n' -o /dev/null" + # Expected: status=200 ttfb<200ms + ``` + +2. **Confirm PG-backed paths hang** — same probe against `/api/pods?limit=2`. If it returns `status=000 ttfb=0` (timeout, 0 bytes), the controller never completed. + +3. **Rule out resource pressure** — `kubectl top pod -n commonly-dev `. If CPU < 50% of limit and memory < 75% of limit, the hang is NOT load-related. + +4. **Rule out underlying DB slowness** — run the underlying queries directly from a one-off node process. If they return in well under a second, the DB is fine and the live pool is the bottleneck. + + ```bash + kubectl exec -n commonly-dev deploy/backend -- node -e " + const { pool } = require('./dist/config/db-pg'); + (async () => { + const t0 = Date.now(); + const r = await pool.query('SELECT 1'); + console.log('elapsed ms:', Date.now() - t0); + process.exit(0); + })(); + " + ``` + +5. **Inspect logs for the surge trigger** — `kubectl logs deploy/backend --tail=200`. Look for: + - `Pod summary requests enqueued: ` — the hourly summarizer fanout. N=60 has been observed to saturate a 10-slot pool. + - `Dispatching agent heartbeat events...` repeated rapidly — heartbeat dispatcher cycling. + +## Immediate recovery + +```bash +kubectl rollout restart deploy/backend -n commonly-dev +kubectl rollout status deploy/backend -n commonly-dev --timeout=120s +``` + +~20s downtime. Frees all PG connections instantly. Re-probe `/api/pods` should return in <1s. + +## Why this happens + +`pg.Pool` defaults are `max=10` and `connectionTimeoutMillis=0` (wait forever). On any traffic surge — most commonly the hourly summarizer fanning out N events that each query PG — the 10 slots saturate and every subsequent `pool.query()` waits forever on connection acquire. UI shows perpetual loading with no diagnostic signal because Express never times out the awaiting handler. + +## Structural fix + +Applied 2026-05-26 in PR #455 (`backend/config/db-pg.ts`): + +- `max: 50` (default; tunable via `PG_POOL_MAX`). +- `connectionTimeoutMillis: 5000` (default; tunable via `PG_POOL_CONNECT_TIMEOUT_MS`). + +With this, an exhausted pool fails fast as a 5xx instead of hanging — user sees an error, on-call sees an alert, response is actionable. + +## Still TODO (post-#455) + +- **Audit summarizer + heartbeat dispatch concurrency** — burst-rate-limit the per-event PG calls so a 60-pod fanout doesn't claim 60 slots simultaneously. Chunking by 10 with `await Promise.all` per batch is the natural shape. +- **Add `/api/health/db` probe** that returns `pool.idleCount` + `pool.waitingCount` and alerts when waiting > 5 for >30s. Would have caught this before user impact. + +## Related + +- Incident issue: [#454](https://github.com/Team-Commonly/commonly/issues/454) +- Fix PR: [#455](https://github.com/Team-Commonly/commonly/pull/455) +- Code: `backend/config/db-pg.ts` (pool config), `backend/controllers/podController.ts:199-227` (getAllPods PG call site), `backend/services/summarizerService.ts` (likely surge source) From 4c84ab7bc8ea3f2de054a810ff13bfc40c1ff1fd Mon Sep 17 00:00:00 2001 From: Sam Xu Date: Tue, 26 May 2026 18:02:26 -0700 Subject: [PATCH 3/4] fix(db-pg.test): set placeholder PG_HOST so Pool ctor runs in unit job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unit-test job in tests.yml doesn't set PG_HOST (only the Service Tests Tier 1 job boots real PG). Without it, `pgConfig.host ? new Pool(...) : null` returns null, so `require('pg').Pool.mock.calls` is empty and the new "Pool config" assertions throw "Cannot read properties of undefined" on `mock.calls[0][0]`. Sets `process.env.PG_HOST = 'localhost-test'` before requiring db-pg when the env is unset. Placeholder value is intentional — the real Pool is mocked anyway; we just need the ctor path to execute. Fixes the new tests added in this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/__tests__/unit/config/db-pg.test.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/__tests__/unit/config/db-pg.test.js b/backend/__tests__/unit/config/db-pg.test.js index 370f7f15a..9741f71e2 100644 --- a/backend/__tests__/unit/config/db-pg.test.js +++ b/backend/__tests__/unit/config/db-pg.test.js @@ -17,6 +17,13 @@ jest.mock('pg', () => ({ Pool: jest.fn(() => mockPool), })); +// db-pg.ts only constructs a Pool when PG_HOST is set; otherwise it +// returns null and `new Pool(...)` is never called. CI doesn't set +// PG_HOST in the unit-test job (only the Service Tests Tier 1 job +// boots real PG). Ensure a placeholder is present so the Pool ctor +// runs and our config assertions below have something to inspect. +process.env.PG_HOST = process.env.PG_HOST || 'localhost-test'; + delete require.cache[require.resolve('../../../config/db-pg')]; const { pool, connectPG } = require('../../../config/db-pg'); From 41746451cb01051e5a0379fb5279bdbff4a167c2 Mon Sep 17 00:00:00 2001 From: Sam Xu Date: Tue, 26 May 2026 18:08:12 -0700 Subject: [PATCH 4/4] fix(db-pg.test): capture Pool ctor args via factory closure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous attempt (set PG_HOST before require) still left `require('pg').Pool.mock.calls[0][0]` undefined in CI. Root cause is subtle: `require('pg')` inside this file can return a different module exports object than the one db-pg.ts saw, so the `.mock.calls` array we read may be empty even though Pool was actually constructed. Fix: have the jest.mock factory capture the ctor args into a module-scope `let capturedPoolArgs` directly. No require indirection. Also moves the `process.env.PG_HOST` placeholder above the const declarations (more defensive — jest.mock factory may run on a hoisted schedule that races the env assignment otherwise). Adds a "captures Pool ctor args (sanity guard)" assertion so the failure mode is explicit if this ever regresses again. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/__tests__/unit/config/db-pg.test.js | 47 +++++++++++++-------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/backend/__tests__/unit/config/db-pg.test.js b/backend/__tests__/unit/config/db-pg.test.js index 9741f71e2..9f8bcdcb0 100644 --- a/backend/__tests__/unit/config/db-pg.test.js +++ b/backend/__tests__/unit/config/db-pg.test.js @@ -1,5 +1,12 @@ jest.mock('fs'); +// db-pg.ts only constructs a Pool when PG_HOST is set; otherwise it +// returns null and `new Pool(...)` is never called. CI doesn't set +// PG_HOST in the unit-test job (only the Service Tests Tier 1 job +// boots real PG). Ensure a placeholder is present BEFORE jest.mock +// hoists/the module is required so the Pool ctor path runs. +process.env.PG_HOST = process.env.PG_HOST || 'localhost-test'; + const fs = require('fs'); const mockClient = { @@ -13,17 +20,22 @@ const mockPool = { on: jest.fn(), }; +// Capture the Pool constructor args module-side. Relying on +// `require('pg').Pool.mock.calls[0][0]` is fragile because each +// `require('pg')` inside this file goes through the jest.mock factory, +// which (on some jest versions) returns a fresh module exports object +// each time — `.mock.calls` on the version we look at can be empty +// while the version db-pg.ts saw recorded the call. A captured +// variable sidesteps the indirection entirely. +let capturedPoolArgs = null; + jest.mock('pg', () => ({ - Pool: jest.fn(() => mockPool), + Pool: jest.fn((args) => { + capturedPoolArgs = args; + return mockPool; + }), })); -// db-pg.ts only constructs a Pool when PG_HOST is set; otherwise it -// returns null and `new Pool(...)` is never called. CI doesn't set -// PG_HOST in the unit-test job (only the Service Tests Tier 1 job -// boots real PG). Ensure a placeholder is present so the Pool ctor -// runs and our config assertions below have something to inspect. -process.env.PG_HOST = process.env.PG_HOST || 'localhost-test'; - delete require.cache[require.resolve('../../../config/db-pg')]; const { pool, connectPG } = require('../../../config/db-pg'); @@ -47,20 +59,19 @@ describe('connectPG', () => { }); describe('Pool config (#454 incident — pool exhaustion)', () => { - // Capture the constructor args passed to pg.Pool when db-pg.ts loads. - // The mock above intercepts the constructor; reading - // `require('pg').Pool.mock.calls[0][0]` gives us the config object the - // backend would have handed to a real Pool. - const getPoolArgs = () => require('pg').Pool.mock.calls[0][0]; + // capturedPoolArgs (defined at the top of the file) is populated by + // the mocked pg.Pool factory when db-pg.ts constructs its pool. + it('captures the Pool ctor args (sanity guard)', () => { + expect(capturedPoolArgs).not.toBeNull(); + expect(typeof capturedPoolArgs).toBe('object'); + }); it('sets a default pool max well above pg.Pool default of 10', () => { - const args = getPoolArgs(); - expect(args.max).toBeGreaterThanOrEqual(50); + expect(capturedPoolArgs.max).toBeGreaterThanOrEqual(50); }); it('sets a finite connectionTimeoutMillis (no infinite hang on saturation)', () => { - const args = getPoolArgs(); - expect(args.connectionTimeoutMillis).toBeGreaterThan(0); - expect(args.connectionTimeoutMillis).toBeLessThanOrEqual(60_000); + expect(capturedPoolArgs.connectionTimeoutMillis).toBeGreaterThan(0); + expect(capturedPoolArgs.connectionTimeoutMillis).toBeLessThanOrEqual(60_000); }); });