From b6d883b448ece51d798468fd217c2bf99922442c Mon Sep 17 00:00:00 2001 From: jiashuoz Date: Tue, 26 May 2026 22:06:49 -0700 Subject: [PATCH] test(e2e-prod): per-suite correctness sweeps + nits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to PR #168 (the harness + contract-assertion hardening). This is the broader correctness sweep from the same review: smaller issues that aren't critical-class but undermine specific tests' ability to actually verify what their names claim. Per-suite fixes: 01-basic.test.ts - Remove unused `before` import. 02-authz.test.ts - "cross-tenant DELETE" had dead code: the assert.ok above already throws on non-4xx, so the explicit 200/204 check could never run. Reordered: hard-fail on 200/204 first (and assert.fail() so node:test sees the regression), then the 4xx ok-range assertion. 08-mcp.test.ts + 12-mcp-extended.test.ts - Replace hardcoded absolute path to mcp/dist/index.js with a repo-relative default + E2A_MCP_DIST env override. The hardcoded path was `/Users/joshzhang/Desktop/e2a/...` — broke for any other contributor or CI environment. 09-postfix.test.ts - The "/send 429 Retry-After" test was a pure no-op recording an `info()` skip — but it counted as a passing test in the totals and inflated the green count. Marked test.skip() so it shows up correctly in the spec-reporter output. The reason (avoiding fan- out of 60+ HITL notification emails) is preserved in the comment. 11-messaging.test.ts (reply-empty) - Previous candidate selection (`/api/v1/messages` user-scoped + fallback to any direction) routinely picked an outbound message, which 404'd at the /reply endpoint (reply only operates on inbound). The 404 satisfied the `>= 400 && < 500` assert so the test passed without ever testing the "empty body returns 400" branch. Now uses the agent-scoped inbound listing and asserts the exact 400; if a listing-vs-storage skew causes a 404 on an id we just listed, that surfaces as info() rather than masking the empty-body check. 13-billing-surface.test.ts - usage-agents drift tolerance was purely informational at any size. Now: ±1 is silently fine (timing race), >1 records info() with detail, but >3 hard-fails as a real counter-staleness regression. Smoke against prod passes with all changes. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/e2e-prod/suites/01-basic.test.ts | 2 +- tests/e2e-prod/suites/02-authz.test.ts | 8 +++-- tests/e2e-prod/suites/08-mcp.test.ts | 7 ++++- tests/e2e-prod/suites/09-postfix.test.ts | 16 ++++------ tests/e2e-prod/suites/11-messaging.test.ts | 29 ++++++++++++++----- tests/e2e-prod/suites/12-mcp-extended.test.ts | 6 +++- .../suites/13-billing-surface.test.ts | 14 +++++++-- 7 files changed, 57 insertions(+), 25 deletions(-) diff --git a/tests/e2e-prod/suites/01-basic.test.ts b/tests/e2e-prod/suites/01-basic.test.ts index 82fc7f8..4fb307a 100644 --- a/tests/e2e-prod/suites/01-basic.test.ts +++ b/tests/e2e-prod/suites/01-basic.test.ts @@ -1,4 +1,4 @@ -import { test, before, after } from "node:test"; +import { test, after } from "node:test"; import assert from "node:assert/strict"; import { ApiClient } from "../harness/client.ts"; import { cleanup, track } from "../harness/cleanup.ts"; diff --git a/tests/e2e-prod/suites/02-authz.test.ts b/tests/e2e-prod/suites/02-authz.test.ts index 6235d4e..4617058 100644 --- a/tests/e2e-prod/suites/02-authz.test.ts +++ b/tests/e2e-prod/suites/02-authz.test.ts @@ -76,10 +76,14 @@ test("authz: PUT /agents/ returns 403 or 4xx", async () => { test("authz: DELETE /agents/ returns 403 or 4xx (no cross-tenant delete)", async () => { const r = await client.delete(`/api/v1/agents/${encodeURIComponent("nobody@example.com")}`); - assert.ok(r.status === 403 || (r.status >= 400 && r.status < 500), `expected 4xx, got ${r.status}`); + // assert.ok throws on a non-4xx response — the explicit 200/204 check + // that used to live below was dead code (unreachable past the assert). + // The fail-tag is preserved for triage clarity if the assert ever fires. if (r.status === 200 || r.status === 204) { - fail(SUITE, "cross-tenant-delete", "CRITICAL: deleted an agent we don't own"); + fail(SUITE, "cross-tenant-delete", `CRITICAL: deleted an agent we don't own; got ${r.status}`); + assert.fail(`cross-tenant DELETE returned ${r.status} — agent we don't own was deleted`); } + assert.ok(r.status === 403 || (r.status >= 400 && r.status < 500), `expected 4xx, got ${r.status}`); }); test("authz: GET /agents//messages of unowned agent returns 4xx", async () => { diff --git a/tests/e2e-prod/suites/08-mcp.test.ts b/tests/e2e-prod/suites/08-mcp.test.ts index aa06490..6a64829 100644 --- a/tests/e2e-prod/suites/08-mcp.test.ts +++ b/tests/e2e-prod/suites/08-mcp.test.ts @@ -11,7 +11,12 @@ const SUITE = "08-mcp"; const mcp = new StdioMcpClient(); before(async () => { - await mcp.start("node", ["/Users/joshzhang/Desktop/e2a/mcp/dist/index.js"], { + // Default to the repo-relative dist path so the suite works for any + // contributor / CI runner. Hardcoded absolute path was unportable. + // Override with E2A_MCP_DIST if the dist lives elsewhere. + const mcpDist = + process.env.E2A_MCP_DIST ?? new URL("../../../mcp/dist/index.js", import.meta.url).pathname; + await mcp.start("node", [mcpDist], { E2A_API_KEY: apiClient.env.apiKey, E2A_BASE_URL: apiClient.env.apiUrl, E2A_AGENT_EMAIL: apiClient.env.primaryAgentEmail, diff --git a/tests/e2e-prod/suites/09-postfix.test.ts b/tests/e2e-prod/suites/09-postfix.test.ts index 2ffa5c7..95c1be6 100644 --- a/tests/e2e-prod/suites/09-postfix.test.ts +++ b/tests/e2e-prod/suites/09-postfix.test.ts @@ -105,16 +105,12 @@ test("postfix #7: bare LF in subject is also rejected (no carriage return)", asy assert.equal(r.status, 400, `expected 400 for bare LF, got ${r.status}: ${r.raw.slice(0, 200)}`); }); -test("postfix #1 #2: /send 429 includes Retry-After header (probed via invalid payloads + 1 quota-hit guard)", async () => { - // We can't probe the send rate limit on prod without queueing real HITL - // notifications. Instead we verify the header CONTRACT: the docs (and - // OpenAPI) now say 429 carries Retry-After. We'll skip the active probe. - info( - SUITE, - "retry-after-probe-skipped", - "skipping active 60-send rate-limit probe to avoid triggering HITL notification emails — see issue #146", - ); -}); +// Skipped: actively probing the /send rate limit queues 60+ real HITL +// notifications to the owner inbox (see auto-memory feedback note). The +// /agents Retry-After test below covers the header CONTRACT via a cheaper +// path that doesn't fan out to SMTP. Marked as test.skip so it doesn't +// pollute the green-pass count. +test.skip("postfix #1 #2: /send 429 includes Retry-After header (skipped — would queue real HITL notifications)", async () => {}); test("postfix #1 #2: /agents 429 includes Retry-After header (active probe — does NOT send mail)", async () => { // Agent creation is a pure CRUD op; failing creates don't fan out to SMTP. diff --git a/tests/e2e-prod/suites/11-messaging.test.ts b/tests/e2e-prod/suites/11-messaging.test.ts index 8eee84f..cb053d6 100644 --- a/tests/e2e-prod/suites/11-messaging.test.ts +++ b/tests/e2e-prod/suites/11-messaging.test.ts @@ -226,20 +226,35 @@ test("messaging: reply to bogus message ID returns 404", async () => { }); test("messaging: reply with empty body returns 400", async () => { - // Find any message we own to attempt reply against; if none, skip. - const list = await client.get<{ messages: Array<{ id: string; direction?: string }> }>("/api/v1/messages", { query: { limit: 5 } }); - const candidate = list.body?.messages?.find((m) => m.direction === "inbound") ?? list.body?.messages?.[0]; + // /reply requires the target message be inbound and belong to the + // agent in the path. The previous version fell back to any message + // including outbound, which routinely 404'd before the 400-missing- + // body check ran — so the test passed without ever exercising the + // "empty body returns 400" branch. Now: pull from the agent-scoped + // inbound listing, skip cleanly if none exist. + const email = client.env.primaryAgentEmail; + const list = await client.get<{ messages: Array<{ id: string; direction?: string }> }>( + `/api/v1/agents/${encodeURIComponent(email)}/messages`, + { query: { limit: 5, direction: "inbound" } }, + ); + const candidate = list.body?.messages?.find((m) => m.direction === "inbound" || m.direction === undefined); if (!candidate) { - info(SUITE, "reply-empty-skipped", "no messages in inbox to attempt reply against"); + info(SUITE, "reply-empty-skipped", `no inbound messages on ${email} — cannot exercise empty-body reply check`); return; } - const email = client.env.primaryAgentEmail; const r = await client.post( `/api/v1/agents/${encodeURIComponent(email)}/messages/${encodeURIComponent(candidate.id)}/reply`, { body: {} }, ); - // Spec: 400 missing body, OR 404 if message isn't owned by THIS agent. - assert.ok(r.status >= 400 && r.status < 500, `expected 4xx (400 or 404), got ${r.status}: ${r.raw.slice(0, 200)}`); + // Now that we picked from the agent-scoped inbound list, 400 is the + // expected response (missing body). 404 here would mean the inbound + // listing returned a stale id — flag it informationally rather than + // assert away a different bug. + if (r.status === 404) { + info(SUITE, "reply-empty-404-on-listed-msg", `inbound list returned ${candidate.id} but /reply 404'd — possible listing/storage skew`); + return; + } + assert.equal(r.status, 400, `expected 400 (empty body) on owned inbound message, got ${r.status}: ${r.raw.slice(0, 200)}`); }); test("messaging: /messages search filters — surface what's supported", async () => { diff --git a/tests/e2e-prod/suites/12-mcp-extended.test.ts b/tests/e2e-prod/suites/12-mcp-extended.test.ts index b55027f..c3794fc 100644 --- a/tests/e2e-prod/suites/12-mcp-extended.test.ts +++ b/tests/e2e-prod/suites/12-mcp-extended.test.ts @@ -11,7 +11,11 @@ const SUITE = "12-mcp-extended"; const mcp = new StdioMcpClient(); before(async () => { - await mcp.start("node", ["/Users/joshzhang/Desktop/e2a/mcp/dist/index.js"], { + // Default to the repo-relative dist path so the suite works for any + // contributor / CI runner. Override with E2A_MCP_DIST if needed. + const mcpDist = + process.env.E2A_MCP_DIST ?? new URL("../../../mcp/dist/index.js", import.meta.url).pathname; + await mcp.start("node", [mcpDist], { E2A_API_KEY: apiClient.env.apiKey, E2A_BASE_URL: apiClient.env.apiUrl, E2A_AGENT_EMAIL: apiClient.env.primaryAgentEmail, diff --git a/tests/e2e-prod/suites/13-billing-surface.test.ts b/tests/e2e-prod/suites/13-billing-surface.test.ts index 34ce6c0..75889b2 100644 --- a/tests/e2e-prod/suites/13-billing-surface.test.ts +++ b/tests/e2e-prod/suites/13-billing-surface.test.ts @@ -74,12 +74,20 @@ test("billing: usage.agents counts roughly match the actual /agents list", async const agents = await client.get<{ agents: unknown[] }>("/api/v1/agents"); const actual = agents.body?.agents?.length ?? 0; const reported = limits.body.usage.agents!; - // Allow ±1 drift for in-flight resources, but a big mismatch is a bug. - if (Math.abs(reported - actual) > 1) { + const drift = Math.abs(reported - actual); + // ±1 is benign timing under 1 RPS concurrent creates (the two HTTP calls + // are not atomic). Wider drift signals a real counter-staleness bug — the + // limits primitive is supposed to read live, not cache counts. + if (drift > 1) { info( SUITE, "usage-agents-drift", - `usage.agents=${reported} but /agents list has ${actual} — drift larger than ±1 may indicate counter is stale`, + `usage.agents=${reported}, /agents list=${actual}, drift=${drift} — wider than expected timing race`, + ); + // Hard-fail past a meaningful threshold; anything inside it is noise. + assert.ok( + drift <= 3, + `usage.agents counter drift ${drift} (reported=${reported}, actual=${actual}) — counter is stale or broken`, ); } else { info(SUITE, "usage-agents-consistent", `usage.agents=${reported}, /agents list=${actual} — within ±1`);