diff --git a/backend/__tests__/unit/config/db-pg.test.js b/backend/__tests__/unit/config/db-pg.test.js index 4aaf25490..9f8bcdcb0 100644 --- a/backend/__tests__/unit/config/db-pg.test.js +++ b/backend/__tests__/unit/config/db-pg.test.js @@ -1,5 +1,12 @@ jest.mock('fs'); +// db-pg.ts only constructs a Pool when PG_HOST is set; otherwise it +// returns null and `new Pool(...)` is never called. CI doesn't set +// PG_HOST in the unit-test job (only the Service Tests Tier 1 job +// boots real PG). Ensure a placeholder is present BEFORE jest.mock +// hoists/the module is required so the Pool ctor path runs. +process.env.PG_HOST = process.env.PG_HOST || 'localhost-test'; + const fs = require('fs'); const mockClient = { @@ -13,8 +20,20 @@ const mockPool = { on: jest.fn(), }; +// Capture the Pool constructor args module-side. Relying on +// `require('pg').Pool.mock.calls[0][0]` is fragile because each +// `require('pg')` inside this file goes through the jest.mock factory, +// which (on some jest versions) returns a fresh module exports object +// each time — `.mock.calls` on the version we look at can be empty +// while the version db-pg.ts saw recorded the call. A captured +// variable sidesteps the indirection entirely. +let capturedPoolArgs = null; + jest.mock('pg', () => ({ - Pool: jest.fn(() => mockPool), + Pool: jest.fn((args) => { + capturedPoolArgs = args; + return mockPool; + }), })); delete require.cache[require.resolve('../../../config/db-pg')]; @@ -38,3 +57,21 @@ describe('connectPG', () => { expect(result).toBeNull(); }); }); + +describe('Pool config (#454 incident — pool exhaustion)', () => { + // capturedPoolArgs (defined at the top of the file) is populated by + // the mocked pg.Pool factory when db-pg.ts constructs its pool. + it('captures the Pool ctor args (sanity guard)', () => { + expect(capturedPoolArgs).not.toBeNull(); + expect(typeof capturedPoolArgs).toBe('object'); + }); + + it('sets a default pool max well above pg.Pool default of 10', () => { + expect(capturedPoolArgs.max).toBeGreaterThanOrEqual(50); + }); + + it('sets a finite connectionTimeoutMillis (no infinite hang on saturation)', () => { + expect(capturedPoolArgs.connectionTimeoutMillis).toBeGreaterThan(0); + expect(capturedPoolArgs.connectionTimeoutMillis).toBeLessThanOrEqual(60_000); + }); +}); diff --git a/backend/config/db-pg.ts b/backend/config/db-pg.ts index 316724a83..4f6565a83 100644 --- a/backend/config/db-pg.ts +++ b/backend/config/db-pg.ts @@ -12,14 +12,40 @@ interface PgConfig { port: number | string; database: string | undefined; ssl?: { rejectUnauthorized: boolean; ca: string } | false; + // Pool sizing — see #454 (2026-05-26 incident). pg.Pool defaults to + // max=10 and connectionTimeoutMillis=0 (wait forever). On any traffic + // surge — e.g. the hourly summarizer fanning out 60 summary.request + // events — concurrent pool.query() calls saturate the 10 slots and + // every subsequent caller hangs indefinitely instead of failing fast. + // User-facing endpoints (getAllPods, /api/messages) then appear to + // "freeze" with no diagnostic signal. Bumping max + adding an explicit + // connection-acquire timeout fixes both: more headroom for the burst, + // and a clear acquire-timeout error if it does saturate. + max: number; + connectionTimeoutMillis: number; } +// Defaults: max=50 (Aiven dev plan supports 100+ connections; 50 gives +// ample room for the 60-event summarizer burst without claiming the +// entire DB connection budget), connectionTimeoutMillis=5000ms (fail +// fast as a 5xx so the user sees an error rather than a perpetual +// "loading"). Operators can tune via env without rebuilding; +// non-numeric / non-positive env values fall through to the default +// rather than zeroing the pool. +const parsePoolInt = (raw: string | undefined, fallback: number): number => { + if (!raw) return fallback; + const n = Number(raw); + return Number.isFinite(n) && n > 0 ? n : fallback; +}; + const pgConfig: PgConfig = { user: process.env.PG_USER, password: process.env.PG_PASSWORD, host: process.env.PG_HOST, port: process.env.PG_PORT || 5432, database: process.env.PG_DATABASE, + max: parsePoolInt(process.env.PG_POOL_MAX, 50), + connectionTimeoutMillis: parsePoolInt(process.env.PG_POOL_CONNECT_TIMEOUT_MS, 5000), }; if (process.env.PG_SSL_CA_PATH) { diff --git a/docs/runbooks/pg-pool-exhaustion.md b/docs/runbooks/pg-pool-exhaustion.md new file mode 100644 index 000000000..245cf9391 --- /dev/null +++ b/docs/runbooks/pg-pool-exhaustion.md @@ -0,0 +1,71 @@ +# PG connection pool exhaustion — diagnosis + recovery + +**Symptom**: User-facing PG-backed endpoints (`/api/pods`, `/api/messages/:podId`) hang indefinitely. UI shows perpetual loading. Backend CPU + memory normal. Other endpoints (mongo-backed: `/api/posts`, `/api/auth/me`) respond fast. + +## Diagnosis flow + +1. **Confirm reachability** — probe a mongo-backed endpoint from inside the cluster. If it responds, the backend is up and only PG-backed paths are affected. + + ```bash + TOKEN= + kubectl exec -n commonly-dev deploy/backend -- bash -c \ + "curl -sS -m 5 -H 'Authorization: Bearer $TOKEN' \ + 'http://localhost:5000/api/posts?limit=2' \ + -w 'status=%{http_code} ttfb=%{time_starttransfer}s\n' -o /dev/null" + # Expected: status=200 ttfb<200ms + ``` + +2. **Confirm PG-backed paths hang** — same probe against `/api/pods?limit=2`. If it returns `status=000 ttfb=0` (timeout, 0 bytes), the controller never completed. + +3. **Rule out resource pressure** — `kubectl top pod -n commonly-dev `. If CPU < 50% of limit and memory < 75% of limit, the hang is NOT load-related. + +4. **Rule out underlying DB slowness** — run the underlying queries directly from a one-off node process. If they return in well under a second, the DB is fine and the live pool is the bottleneck. + + ```bash + kubectl exec -n commonly-dev deploy/backend -- node -e " + const { pool } = require('./dist/config/db-pg'); + (async () => { + const t0 = Date.now(); + const r = await pool.query('SELECT 1'); + console.log('elapsed ms:', Date.now() - t0); + process.exit(0); + })(); + " + ``` + +5. **Inspect logs for the surge trigger** — `kubectl logs deploy/backend --tail=200`. Look for: + - `Pod summary requests enqueued: ` — the hourly summarizer fanout. N=60 has been observed to saturate a 10-slot pool. + - `Dispatching agent heartbeat events...` repeated rapidly — heartbeat dispatcher cycling. + +## Immediate recovery + +```bash +kubectl rollout restart deploy/backend -n commonly-dev +kubectl rollout status deploy/backend -n commonly-dev --timeout=120s +``` + +~20s downtime. Frees all PG connections instantly. Re-probe `/api/pods` should return in <1s. + +## Why this happens + +`pg.Pool` defaults are `max=10` and `connectionTimeoutMillis=0` (wait forever). On any traffic surge — most commonly the hourly summarizer fanning out N events that each query PG — the 10 slots saturate and every subsequent `pool.query()` waits forever on connection acquire. UI shows perpetual loading with no diagnostic signal because Express never times out the awaiting handler. + +## Structural fix + +Applied 2026-05-26 in PR #455 (`backend/config/db-pg.ts`): + +- `max: 50` (default; tunable via `PG_POOL_MAX`). +- `connectionTimeoutMillis: 5000` (default; tunable via `PG_POOL_CONNECT_TIMEOUT_MS`). + +With this, an exhausted pool fails fast as a 5xx instead of hanging — user sees an error, on-call sees an alert, response is actionable. + +## Still TODO (post-#455) + +- **Audit summarizer + heartbeat dispatch concurrency** — burst-rate-limit the per-event PG calls so a 60-pod fanout doesn't claim 60 slots simultaneously. Chunking by 10 with `await Promise.all` per batch is the natural shape. +- **Add `/api/health/db` probe** that returns `pool.idleCount` + `pool.waitingCount` and alerts when waiting > 5 for >30s. Would have caught this before user impact. + +## Related + +- Incident issue: [#454](https://github.com/Team-Commonly/commonly/issues/454) +- Fix PR: [#455](https://github.com/Team-Commonly/commonly/pull/455) +- Code: `backend/config/db-pg.ts` (pool config), `backend/controllers/podController.ts:199-227` (getAllPods PG call site), `backend/services/summarizerService.ts` (likely surge source)