Team-Commonly · samxu01 · May 26, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/backend/__tests__/unit/config/db-pg.test.js b/backend/__tests__/unit/config/db-pg.test.js
@@ -1,5 +1,12 @@
 jest.mock('fs');
 
+// db-pg.ts only constructs a Pool when PG_HOST is set; otherwise it
+// returns null and `new Pool(...)` is never called. CI doesn't set
+// PG_HOST in the unit-test job (only the Service Tests Tier 1 job
+// boots real PG). Ensure a placeholder is present BEFORE jest.mock
+// hoists/the module is required so the Pool ctor path runs.
+process.env.PG_HOST = process.env.PG_HOST || 'localhost-test';
+
 const fs = require('fs');
 
 const mockClient = {
@@ -13,8 +20,20 @@ const mockPool = {
   on: jest.fn(),
 };
 
+// Capture the Pool constructor args module-side. Relying on
+// `require('pg').Pool.mock.calls[0][0]` is fragile because each
+// `require('pg')` inside this file goes through the jest.mock factory,
+// which (on some jest versions) returns a fresh module exports object
+// each time — `.mock.calls` on the version we look at can be empty
+// while the version db-pg.ts saw recorded the call. A captured
+// variable sidesteps the indirection entirely.
+let capturedPoolArgs = null;
+
 jest.mock('pg', () => ({
-  Pool: jest.fn(() => mockPool),
+  Pool: jest.fn((args) => {
+    capturedPoolArgs = args;
+    return mockPool;
+  }),
 }));
 
 delete require.cache[require.resolve('../../../config/db-pg')];
@@ -38,3 +57,21 @@ describe('connectPG', () => {
     expect(result).toBeNull();
   });
 });
+
+describe('Pool config (#454 incident — pool exhaustion)', () => {
+  // capturedPoolArgs (defined at the top of the file) is populated by
+  // the mocked pg.Pool factory when db-pg.ts constructs its pool.
+  it('captures the Pool ctor args (sanity guard)', () => {
+    expect(capturedPoolArgs).not.toBeNull();
+    expect(typeof capturedPoolArgs).toBe('object');
+  });
+
+  it('sets a default pool max well above pg.Pool default of 10', () => {
+    expect(capturedPoolArgs.max).toBeGreaterThanOrEqual(50);
+  });
+
+  it('sets a finite connectionTimeoutMillis (no infinite hang on saturation)', () => {
+    expect(capturedPoolArgs.connectionTimeoutMillis).toBeGreaterThan(0);
+    expect(capturedPoolArgs.connectionTimeoutMillis).toBeLessThanOrEqual(60_000);
+  });
+});
diff --git a/backend/config/db-pg.ts b/backend/config/db-pg.ts
@@ -12,14 +12,40 @@ interface PgConfig {
   port: number | string;
   database: string | undefined;
   ssl?: { rejectUnauthorized: boolean; ca: string } | false;
+  // Pool sizing — see #454 (2026-05-26 incident). pg.Pool defaults to
+  // max=10 and connectionTimeoutMillis=0 (wait forever). On any traffic
+  // surge — e.g. the hourly summarizer fanning out 60 summary.request
+  // events — concurrent pool.query() calls saturate the 10 slots and
+  // every subsequent caller hangs indefinitely instead of failing fast.
+  // User-facing endpoints (getAllPods, /api/messages) then appear to
+  // "freeze" with no diagnostic signal. Bumping max + adding an explicit
+  // connection-acquire timeout fixes both: more headroom for the burst,
+  // and a clear acquire-timeout error if it does saturate.
+  max: number;
+  connectionTimeoutMillis: number;
 }
 
+// Defaults: max=50 (Aiven dev plan supports 100+ connections; 50 gives
+// ample room for the 60-event summarizer burst without claiming the
+// entire DB connection budget), connectionTimeoutMillis=5000ms (fail
+// fast as a 5xx so the user sees an error rather than a perpetual
+// "loading"). Operators can tune via env without rebuilding;
+// non-numeric / non-positive env values fall through to the default
+// rather than zeroing the pool.
+const parsePoolInt = (raw: string | undefined, fallback: number): number => {
+  if (!raw) return fallback;
+  const n = Number(raw);
+  return Number.isFinite(n) && n > 0 ? n : fallback;
+};
+
 const pgConfig: PgConfig = {
   user: process.env.PG_USER,
   password: process.env.PG_PASSWORD,
   host: process.env.PG_HOST,
   port: process.env.PG_PORT || 5432,
   database: process.env.PG_DATABASE,
+  max: parsePoolInt(process.env.PG_POOL_MAX, 50),
+  connectionTimeoutMillis: parsePoolInt(process.env.PG_POOL_CONNECT_TIMEOUT_MS, 5000),
 };
 
 if (process.env.PG_SSL_CA_PATH) {

diff --git a/docs/runbooks/pg-pool-exhaustion.md b/docs/runbooks/pg-pool-exhaustion.md
@@ -0,0 +1,71 @@
+# PG connection pool exhaustion — diagnosis + recovery
+
+**Symptom**: User-facing PG-backed endpoints (`/api/pods`, `/api/messages/:podId`) hang indefinitely. UI shows perpetual loading. Backend CPU + memory normal. Other endpoints (mongo-backed: `/api/posts`, `/api/auth/me`) respond fast.
+
+## Diagnosis flow
+
+1. **Confirm reachability** — probe a mongo-backed endpoint from inside the cluster. If it responds, the backend is up and only PG-backed paths are affected.
+
+   ```bash
+   TOKEN=<admin jwt>
+   kubectl exec -n commonly-dev deploy/backend -- bash -c \
+     "curl -sS -m 5 -H 'Authorization: Bearer $TOKEN' \
+      'http://localhost:5000/api/posts?limit=2' \
+      -w 'status=%{http_code} ttfb=%{time_starttransfer}s\n' -o /dev/null"
+   # Expected: status=200 ttfb<200ms
+   ```
+
+2. **Confirm PG-backed paths hang** — same probe against `/api/pods?limit=2`. If it returns `status=000 ttfb=0` (timeout, 0 bytes), the controller never completed.
+
+3. **Rule out resource pressure** — `kubectl top pod -n commonly-dev <backend-pod>`. If CPU < 50% of limit and memory < 75% of limit, the hang is NOT load-related.
+
+4. **Rule out underlying DB slowness** — run the underlying queries directly from a one-off node process. If they return in well under a second, the DB is fine and the live pool is the bottleneck.
+
+   ```bash
+   kubectl exec -n commonly-dev deploy/backend -- node -e "
+     const { pool } = require('./dist/config/db-pg');
+     (async () => {
+       const t0 = Date.now();
+       const r = await pool.query('SELECT 1');
+       console.log('elapsed ms:', Date.now() - t0);
+       process.exit(0);
+     })();
+   "
+   ```
+
+5. **Inspect logs for the surge trigger** — `kubectl logs deploy/backend --tail=200`. Look for:
+   - `Pod summary requests enqueued: <N>` — the hourly summarizer fanout. N=60 has been observed to saturate a 10-slot pool.
+   - `Dispatching agent heartbeat events...` repeated rapidly — heartbeat dispatcher cycling.
+
+## Immediate recovery
+
+```bash
+kubectl rollout restart deploy/backend -n commonly-dev
+kubectl rollout status deploy/backend -n commonly-dev --timeout=120s
+```
+
+~20s downtime. Frees all PG connections instantly. Re-probe `/api/pods` should return in <1s.
+
+## Why this happens
+
+`pg.Pool` defaults are `max=10` and `connectionTimeoutMillis=0` (wait forever). On any traffic surge — most commonly the hourly summarizer fanning out N events that each query PG — the 10 slots saturate and every subsequent `pool.query()` waits forever on connection acquire. UI shows perpetual loading with no diagnostic signal because Express never times out the awaiting handler.
+
+## Structural fix
+
+Applied 2026-05-26 in PR #455 (`backend/config/db-pg.ts`):
+
+- `max: 50` (default; tunable via `PG_POOL_MAX`).
+- `connectionTimeoutMillis: 5000` (default; tunable via `PG_POOL_CONNECT_TIMEOUT_MS`).
+
+With this, an exhausted pool fails fast as a 5xx instead of hanging — user sees an error, on-call sees an alert, response is actionable.
+
+## Still TODO (post-#455)
+
+- **Audit summarizer + heartbeat dispatch concurrency** — burst-rate-limit the per-event PG calls so a 60-pod fanout doesn't claim 60 slots simultaneously. Chunking by 10 with `await Promise.all` per batch is the natural shape.
+- **Add `/api/health/db` probe** that returns `pool.idleCount` + `pool.waitingCount` and alerts when waiting > 5 for >30s. Would have caught this before user impact.
+
+## Related
+
+- Incident issue: [#454](https://github.com/Team-Commonly/commonly/issues/454)
+- Fix PR: [#455](https://github.com/Team-Commonly/commonly/pull/455)
+- Code: `backend/config/db-pg.ts` (pool config), `backend/controllers/podController.ts:199-227` (getAllPods PG call site), `backend/services/summarizerService.ts` (likely surge source)