From b6d883b448ece51d798468fd217c2bf99922442c Mon Sep 17 00:00:00 2001
From: jiashuoz <jszjosh@gmail.com>
Date: Tue, 26 May 2026 22:06:49 -0700
Subject: [PATCH] test(e2e-prod): per-suite correctness sweeps + nits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Companion to PR #168 (the harness + contract-assertion hardening).
This is the broader correctness sweep from the same review: smaller
issues that aren't critical-class but undermine specific tests'
ability to actually verify what their names claim.

Per-suite fixes:

01-basic.test.ts
 - Remove unused `before` import.

02-authz.test.ts
 - "cross-tenant DELETE" had dead code: the assert.ok above already
   throws on non-4xx, so the explicit 200/204 check could never run.
   Reordered: hard-fail on 200/204 first (and assert.fail() so node:test
   sees the regression), then the 4xx ok-range assertion.

08-mcp.test.ts + 12-mcp-extended.test.ts
 - Replace hardcoded absolute path to mcp/dist/index.js with a
   repo-relative default + E2A_MCP_DIST env override. The hardcoded
   path was `/Users/joshzhang/Desktop/e2a/...` — broke for any other
   contributor or CI environment.

09-postfix.test.ts
 - The "/send 429 Retry-After" test was a pure no-op recording an
   `info()` skip — but it counted as a passing test in the totals
   and inflated the green count. Marked test.skip() so it shows up
   correctly in the spec-reporter output. The reason (avoiding fan-
   out of 60+ HITL notification emails) is preserved in the comment.

11-messaging.test.ts (reply-empty)
 - Previous candidate selection (`/api/v1/messages` user-scoped +
   fallback to any direction) routinely picked an outbound message,
   which 404'd at the /reply endpoint (reply only operates on inbound).
   The 404 satisfied the `>= 400 && < 500` assert so the test passed
   without ever testing the "empty body returns 400" branch. Now uses
   the agent-scoped inbound listing and asserts the exact 400; if a
   listing-vs-storage skew causes a 404 on an id we just listed, that
   surfaces as info() rather than masking the empty-body check.

13-billing-surface.test.ts
 - usage-agents drift tolerance was purely informational at any size.
   Now: ±1 is silently fine (timing race), >1 records info() with
   detail, but >3 hard-fails as a real counter-staleness regression.

Smoke against prod passes with all changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/e2e-prod/suites/01-basic.test.ts        |  2 +-
 tests/e2e-prod/suites/02-authz.test.ts        |  8 +++--
 tests/e2e-prod/suites/08-mcp.test.ts          |  7 ++++-
 tests/e2e-prod/suites/09-postfix.test.ts      | 16 ++++------
 tests/e2e-prod/suites/11-messaging.test.ts    | 29 ++++++++++++++-----
 tests/e2e-prod/suites/12-mcp-extended.test.ts |  6 +++-
 .../suites/13-billing-surface.test.ts         | 14 +++++++--
 7 files changed, 57 insertions(+), 25 deletions(-)
diff --git a/tests/e2e-prod/suites/01-basic.test.ts b/tests/e2e-prod/suites/01-basic.test.ts
index 82fc7f8..4fb307a 100644
--- a/tests/e2e-prod/suites/01-basic.test.ts
+++ b/tests/e2e-prod/suites/01-basic.test.ts
@@ -1,4 +1,4 @@
-import { test, before, after } from "node:test";
+import { test, after } from "node:test";
 import assert from "node:assert/strict";
 import { ApiClient } from "../harness/client.ts";
 import { cleanup, track } from "../harness/cleanup.ts";
diff --git a/tests/e2e-prod/suites/02-authz.test.ts b/tests/e2e-prod/suites/02-authz.test.ts
index 6235d4e..4617058 100644
--- a/tests/e2e-prod/suites/02-authz.test.ts
+++ b/tests/e2e-prod/suites/02-authz.test.ts
@@ -76,10 +76,14 @@ test("authz: PUT /agents/<email-i-dont-own> returns 403 or 4xx", async () => {
 
 test("authz: DELETE /agents/<email-i-dont-own> returns 403 or 4xx (no cross-tenant delete)", async () => {
   const r = await client.delete(`/api/v1/agents/${encodeURIComponent("nobody@example.com")}`);
-  assert.ok(r.status === 403 || (r.status >= 400 && r.status < 500), `expected 4xx, got ${r.status}`);
+  // assert.ok throws on a non-4xx response — the explicit 200/204 check
+  // that used to live below was dead code (unreachable past the assert).
+  // The fail-tag is preserved for triage clarity if the assert ever fires.
   if (r.status === 200 || r.status === 204) {
-    fail(SUITE, "cross-tenant-delete", "CRITICAL: deleted an agent we don't own");
+    fail(SUITE, "cross-tenant-delete", `CRITICAL: deleted an agent we don't own; got ${r.status}`);
+    assert.fail(`cross-tenant DELETE returned ${r.status} — agent we don't own was deleted`);
   }
+  assert.ok(r.status === 403 || (r.status >= 400 && r.status < 500), `expected 4xx, got ${r.status}`);
 });
 
 test("authz: GET /agents/<email>/messages of unowned agent returns 4xx", async () => {
diff --git a/tests/e2e-prod/suites/08-mcp.test.ts b/tests/e2e-prod/suites/08-mcp.test.ts
index aa06490..6a64829 100644
--- a/tests/e2e-prod/suites/08-mcp.test.ts
+++ b/tests/e2e-prod/suites/08-mcp.test.ts
@@ -11,7 +11,12 @@ const SUITE = "08-mcp";
 const mcp = new StdioMcpClient();
 
 before(async () => {
-  await mcp.start("node", ["/Users/joshzhang/Desktop/e2a/mcp/dist/index.js"], {
+  // Default to the repo-relative dist path so the suite works for any
+  // contributor / CI runner. Hardcoded absolute path was unportable.
+  // Override with E2A_MCP_DIST if the dist lives elsewhere.
+  const mcpDist =
+    process.env.E2A_MCP_DIST ?? new URL("../../../mcp/dist/index.js", import.meta.url).pathname;
+  await mcp.start("node", [mcpDist], {
     E2A_API_KEY: apiClient.env.apiKey,
     E2A_BASE_URL: apiClient.env.apiUrl,
     E2A_AGENT_EMAIL: apiClient.env.primaryAgentEmail,
diff --git a/tests/e2e-prod/suites/09-postfix.test.ts b/tests/e2e-prod/suites/09-postfix.test.ts
index 2ffa5c7..95c1be6 100644
--- a/tests/e2e-prod/suites/09-postfix.test.ts
+++ b/tests/e2e-prod/suites/09-postfix.test.ts
@@ -105,16 +105,12 @@ test("postfix #7: bare LF in subject is also rejected (no carriage return)", asy
   assert.equal(r.status, 400, `expected 400 for bare LF, got ${r.status}: ${r.raw.slice(0, 200)}`);
 });
 
-test("postfix #1 #2: /send 429 includes Retry-After header (probed via invalid payloads + 1 quota-hit guard)", async () => {
-  // We can't probe the send rate limit on prod without queueing real HITL
-  // notifications. Instead we verify the header CONTRACT: the docs (and
-  // OpenAPI) now say 429 carries Retry-After. We'll skip the active probe.
-  info(
-    SUITE,
-    "retry-after-probe-skipped",
-    "skipping active 60-send rate-limit probe to avoid triggering HITL notification emails — see issue #146",
-  );
-});
+// Skipped: actively probing the /send rate limit queues 60+ real HITL
+// notifications to the owner inbox (see auto-memory feedback note). The
+// /agents Retry-After test below covers the header CONTRACT via a cheaper
+// path that doesn't fan out to SMTP. Marked as test.skip so it doesn't
+// pollute the green-pass count.
+test.skip("postfix #1 #2: /send 429 includes Retry-After header (skipped — would queue real HITL notifications)", async () => {});
 
 test("postfix #1 #2: /agents 429 includes Retry-After header (active probe — does NOT send mail)", async () => {
   // Agent creation is a pure CRUD op; failing creates don't fan out to SMTP.
diff --git a/tests/e2e-prod/suites/11-messaging.test.ts b/tests/e2e-prod/suites/11-messaging.test.ts
index 8eee84f..cb053d6 100644
--- a/tests/e2e-prod/suites/11-messaging.test.ts
+++ b/tests/e2e-prod/suites/11-messaging.test.ts
@@ -226,20 +226,35 @@ test("messaging: reply to bogus message ID returns 404", async () => {
 });
 
 test("messaging: reply with empty body returns 400", async () => {
-  // Find any message we own to attempt reply against; if none, skip.
-  const list = await client.get<{ messages: Array<{ id: string; direction?: string }> }>("/api/v1/messages", { query: { limit: 5 } });
-  const candidate = list.body?.messages?.find((m) => m.direction === "inbound") ?? list.body?.messages?.[0];
+  // /reply requires the target message be inbound and belong to the
+  // agent in the path. The previous version fell back to any message
+  // including outbound, which routinely 404'd before the 400-missing-
+  // body check ran — so the test passed without ever exercising the
+  // "empty body returns 400" branch. Now: pull from the agent-scoped
+  // inbound listing, skip cleanly if none exist.
+  const email = client.env.primaryAgentEmail;
+  const list = await client.get<{ messages: Array<{ id: string; direction?: string }> }>(
+    `/api/v1/agents/${encodeURIComponent(email)}/messages`,
+    { query: { limit: 5, direction: "inbound" } },
+  );
+  const candidate = list.body?.messages?.find((m) => m.direction === "inbound" || m.direction === undefined);
   if (!candidate) {
-    info(SUITE, "reply-empty-skipped", "no messages in inbox to attempt reply against");
+    info(SUITE, "reply-empty-skipped", `no inbound messages on ${email} — cannot exercise empty-body reply check`);
     return;
   }
-  const email = client.env.primaryAgentEmail;
   const r = await client.post(
     `/api/v1/agents/${encodeURIComponent(email)}/messages/${encodeURIComponent(candidate.id)}/reply`,
     { body: {} },
   );
-  // Spec: 400 missing body, OR 404 if message isn't owned by THIS agent.
-  assert.ok(r.status >= 400 && r.status < 500, `expected 4xx (400 or 404), got ${r.status}: ${r.raw.slice(0, 200)}`);
+  // Now that we picked from the agent-scoped inbound list, 400 is the
+  // expected response (missing body). 404 here would mean the inbound
+  // listing returned a stale id — flag it informationally rather than
+  // assert away a different bug.
+  if (r.status === 404) {
+    info(SUITE, "reply-empty-404-on-listed-msg", `inbound list returned ${candidate.id} but /reply 404'd — possible listing/storage skew`);
+    return;
+  }
+  assert.equal(r.status, 400, `expected 400 (empty body) on owned inbound message, got ${r.status}: ${r.raw.slice(0, 200)}`);
 });
 
 test("messaging: /messages search filters — surface what's supported", async () => {
diff --git a/tests/e2e-prod/suites/12-mcp-extended.test.ts b/tests/e2e-prod/suites/12-mcp-extended.test.ts
index b55027f..c3794fc 100644
--- a/tests/e2e-prod/suites/12-mcp-extended.test.ts
+++ b/tests/e2e-prod/suites/12-mcp-extended.test.ts
@@ -11,7 +11,11 @@ const SUITE = "12-mcp-extended";
 const mcp = new StdioMcpClient();
 
 before(async () => {
-  await mcp.start("node", ["/Users/joshzhang/Desktop/e2a/mcp/dist/index.js"], {
+  // Default to the repo-relative dist path so the suite works for any
+  // contributor / CI runner. Override with E2A_MCP_DIST if needed.
+  const mcpDist =
+    process.env.E2A_MCP_DIST ?? new URL("../../../mcp/dist/index.js", import.meta.url).pathname;
+  await mcp.start("node", [mcpDist], {
     E2A_API_KEY: apiClient.env.apiKey,
     E2A_BASE_URL: apiClient.env.apiUrl,
     E2A_AGENT_EMAIL: apiClient.env.primaryAgentEmail,
diff --git a/tests/e2e-prod/suites/13-billing-surface.test.ts b/tests/e2e-prod/suites/13-billing-surface.test.ts
index 34ce6c0..75889b2 100644
--- a/tests/e2e-prod/suites/13-billing-surface.test.ts
+++ b/tests/e2e-prod/suites/13-billing-surface.test.ts
@@ -74,12 +74,20 @@ test("billing: usage.agents counts roughly match the actual /agents list", async
   const agents = await client.get<{ agents: unknown[] }>("/api/v1/agents");
   const actual = agents.body?.agents?.length ?? 0;
   const reported = limits.body.usage.agents!;
-  // Allow ±1 drift for in-flight resources, but a big mismatch is a bug.
-  if (Math.abs(reported - actual) > 1) {
+  const drift = Math.abs(reported - actual);
+  // ±1 is benign timing under 1 RPS concurrent creates (the two HTTP calls
+  // are not atomic). Wider drift signals a real counter-staleness bug — the
+  // limits primitive is supposed to read live, not cache counts.
+  if (drift > 1) {
     info(
       SUITE,
       "usage-agents-drift",
-      `usage.agents=${reported} but /agents list has ${actual} — drift larger than ±1 may indicate counter is stale`,
+      `usage.agents=${reported}, /agents list=${actual}, drift=${drift} — wider than expected timing race`,
+    );
+    // Hard-fail past a meaningful threshold; anything inside it is noise.
+    assert.ok(
+      drift <= 3,
+      `usage.agents counter drift ${drift} (reported=${reported}, actual=${actual}) — counter is stale or broken`,
     );
   } else {
     info(SUITE, "usage-agents-consistent", `usage.agents=${reported}, /agents list=${actual} — within ±1`);