From efb5f9d215e8a4729af7afb747f065468fd06828 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:44:16 -0700 Subject: [PATCH 1/6] feat(verifier): record agent trajectories --- .../core/lib/v3/agent/AnthropicCUAClient.ts | 4 + .../v3/agent/utils/captureAriaTreeProbe.ts | 75 +++ .../core/lib/v3/handlers/v3AgentHandler.ts | 109 ++++ .../core/lib/v3/handlers/v3CuaAgentHandler.ts | 160 +++++- .../core/lib/v3/types/public/busEvents.ts | 108 ++++ packages/core/lib/v3/types/public/index.ts | 1 + .../evals/framework/trajectoryRecorder.ts | 507 ++++++++++++++++++ .../scripts/verify-trajectory-recorder.ts | 230 ++++++++ 8 files changed, 1192 insertions(+), 2 deletions(-) create mode 100644 packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts create mode 100644 packages/core/lib/v3/types/public/busEvents.ts create mode 100644 packages/evals/framework/trajectoryRecorder.ts create mode 100644 packages/evals/scripts/verify-trajectory-recorder.ts diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 752d208e2..54d64f15d 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -902,6 +902,10 @@ export class AnthropicCUAClient extends AgentClient { ...input, }; } else if (action === "triple_click" || action === "tripleClick") { + // Anthropic's computer_20250124 tool emits `triple_click` with + // `coordinate: [x, y]`. Without this branch the snake_case name + + // raw coordinate array fall through to the generic `else` and + // executeAction logs "Unknown action type: triple_click". return { type: "tripleClick", x: diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts new file mode 100644 index 000000000..8e3fcc050 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts @@ -0,0 +1,75 @@ +/** + * captureAriaTreeProbe — capture a truncated accessibility tree of the active + * page for use as tier-2 evidence in the trajectory recorder. + * + * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the + * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay + * the cost. + * + * The a11y tree is the same payload the agent's `ariaTree` tool sees, but + * captured by the harness (not the agent) so the verifier has independent + * textual ground truth for grounding non-visual claims — prices, names, + * dates, list contents — without OCR'ing screenshots. + * + * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures + * across a ~30-step trajectory at that cap sum to ~240k tokens total, + * which the verifier handles via per-criterion top-K selection. The cap + * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can + * trade RAM/disk for fidelity. Truncated content is marked explicitly so + * the verifier knows it was clipped. + */ +import type { V3 } from "../../v3.js"; + +const APPROX_CHARS_PER_TOKEN = 4; +const DEFAULT_TOKEN_BUDGET = 8_000; +const DEFAULT_TIMEOUT_MS = 5_000; + +interface CaptureAriaTreeOptions { + /** Soft cap on token count (chars/4 approximation). Default 8000. */ + tokenBudget?: number; + /** Hard timeout on the capture. Default 5s. */ + timeoutMs?: number; +} + +/** + * Returns the truncated a11y tree as a plain string, or undefined when + * capture fails. Never throws — a11y capture is best-effort tier-2 evidence, + * not a hard requirement, so failures are silently absorbed (the verifier + * surfaces this via evidence_insufficient). + */ +export async function captureAriaTreeProbe( + v3: V3, + opts: CaptureAriaTreeOptions = {}, +): Promise { + const envBudget = parseInt( + process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "", + 10, + ); + const tokenBudget = + opts.tokenBudget ?? + (Number.isFinite(envBudget) && envBudget > 0 + ? envBudget + : DEFAULT_TOKEN_BUDGET); + const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; + const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN; + + try { + // v3.extract() without a schema returns { pageText } where pageText is the + // rendered accessibility tree — same path the agent's ariaTree tool uses. + const result = (await v3.extract({ timeout: timeoutMs })) as { + pageText?: string; + }; + const pageText = result?.pageText; + if (typeof pageText !== "string" || pageText.length === 0) return undefined; + + if (pageText.length > maxChars) { + return ( + pageText.slice(0, maxChars) + + `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]` + ); + } + return pageText; + } catch { + return undefined; + } +} diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index cff08c8a2..d0308bdd8 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -41,6 +41,7 @@ import { AgentAbortError, } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { CaptchaSolver, CAPTCHA_SOLVED_MSG, @@ -248,6 +249,10 @@ export class V3AgentHandler { | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback, ) { + // Monotonic step counter scoped to this execute() call. Each tool call in + // the agent loop becomes one trajectory step. The counter feeds stepIndex + // on the bus events the TrajectoryRecorder subscribes to. + let stepCounter = 0; return async (event: StepResult) => { this.logger({ category: "agent", @@ -255,6 +260,11 @@ export class V3AgentHandler { level: 2, }); + const stepIndicesInTurn: number[] = []; + let lastFinalAnswer: + | { message: string; output?: Record } + | undefined; + if (event.toolCalls && event.toolCalls.length > 0) { for (let i = 0; i < event.toolCalls.length; i++) { const toolCall = event.toolCalls[i]; @@ -279,6 +289,13 @@ export class V3AgentHandler { ? `${allReasoning} ${doneReasoning}`.trim() : allReasoning || "Task completed successfully"; } + lastFinalAnswer = { + message: state.finalMessage, + output: + typeof args?.output === "object" && args?.output !== null + ? (args.output as Record) + : undefined, + }; } const mappedActions = mapToolResultToActions({ toolCallName: toolCall.toolName, @@ -292,8 +309,100 @@ export class V3AgentHandler { action.timestamp = Date.now(); state.actions.push(action); } + + // Emit step_finished_event per tool call. The TrajectoryRecorder + // builds one Trajectory.Step per emission. tier-1 evidence (the + // bytes the LLM consumed) is captured separately via an + // onStepFinish wrapper in the harness (plan §10 Q1). + const stepIndex = stepCounter++; + stepIndicesInTurn.push(stepIndex); + const toolOk = + !toolResult || + (typeof toolResult === "object" && + !("error" in toolResult) && + !("isError" in toolResult && toolResult.isError)); + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: toolCall.toolName, + actionArgs: + typeof args === "object" && args !== null + ? (args as Record) + : {}, + reasoning: event.text ?? "", + toolOutput: { + ok: toolOk, + result: toolResult, + error: + toolResult && + typeof toolResult === "object" && + "error" in toolResult && + typeof (toolResult as { error?: unknown }).error === "string" + ? (toolResult as { error: string }).error + : undefined, + }, + finishedAt: new Date().toISOString(), + }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); + + // Harness probe — take a single screenshot / a11y snapshot per AI SDK + // step and attach it to every tool call in that turn. The observation + // reflects the settled page state after the batch of tool calls; this + // is more faithful than dropping probe evidence for all but the last + // tool call, while still avoiding per-tool screenshot overhead. + const wantsScreenshotProbe = + this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount("agent_step_observed_event") > 0; + if ( + stepIndicesInTurn.length > 0 && + (wantsScreenshotProbe || wantsStepObservation) + ) { + try { + const page = await this.v3.context.awaitActivePage(); + let screenshot: Buffer | undefined; + if (wantsScreenshotProbe) { + screenshot = await page.screenshot({ fullPage: false }); + } + let ariaTree: string | undefined; + if (wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier + // can ground textual claims (prices, names, dates) without OCR. + // Best-effort: returns undefined on failure/timeout. + ariaTree = await captureAriaTreeProbe(this.v3); + } + for (const stepIndex of stepIndicesInTurn) { + if (screenshot) { + // DOM/hybrid: this post-step screenshot is a harness probe + // only. The agent's tier-1 evidence is the tool's return value + // captured separately in agent_step_finished_event. + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot, + url: state.currentPageUrl, + evidenceRole: "probe", + }); + } + if (wantsStepObservation) { + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: state.currentPageUrl, + ariaTree, + }); + } + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: harness probe failed: ${getErrorMessage(e)}`, + level: 1, + }); + } + } + } + + if (lastFinalAnswer) { + this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer); } if (userCallback) { diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index af3a3dad8..f1dd2666e 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -7,6 +7,7 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js"; import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { ActionExecutionResult, AgentAction, @@ -16,6 +17,7 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; +import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -37,6 +39,13 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; + // Monotonic step counter used by bus events. The CUA loop is internal to + // the agent client, so unlike v3AgentHandler we don't have per-tool-call + // step events; instead we tag every screenshot emission with an + // incrementing index. Wave 1 may add finer-grained step events here. + private cuaStepCounter = 0; + private latestCuaScreenshot?: AgentScreenshotTakenEvent; + private latestCuaScreenshotConsumed = true; constructor( v3: V3, @@ -76,6 +85,17 @@ export class V3CuaAgentHandler { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); + + // Emit bus event so TrajectoryRecorder can capture the screenshot. In + // CUA mode this is the same buffer the provider receives — i.e., it + // serves both as tier-1 evidence (what the model saw) and as a tier-2 + // probe. See plan §04 "Mode-by-mode sources". + try { + this.emitCuaScreenshot(screenshotBuffer, page.url()); + } catch { + // bus emit errors are non-fatal + } + return screenshotBuffer.toString("base64"); // base64 png }); @@ -120,6 +140,7 @@ export class V3CuaAgentHandler { (this.options.clientOptions?.waitBetweenActions as number) || defaultDelay; try { + let executionResult: ActionExecutionResult | undefined; // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -133,7 +154,7 @@ export class V3CuaAgentHandler { // takes its own screenshot via screenshotProvider between API turns. const shouldLog = action.type !== "screenshot"; if (shouldLog) { - await FlowLogger.runWithLogging( + executionResult = await FlowLogger.runWithLogging( { eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. "V3CuaClick" data: { @@ -145,10 +166,13 @@ export class V3CuaAgentHandler { [action], ); } else { - await this.executeAction(action); + executionResult = await this.executeAction(action); } action.timestamp = Date.now(); + if (shouldLog) { + await this.emitCuaActionStep(action, executionResult); + } await new Promise((r) => setTimeout(r, waitBetween)); } catch (error) { @@ -658,6 +682,15 @@ export class V3CuaAgentHandler { const screenshotBuffer = await page.screenshot({ fullPage: false }); const currentUrl = page.url(); + + // Mirror the screenshot to the bus — same buffer the CUA client + // received, so it serves as both tier-1 evidence and tier-2 probe. + try { + this.emitCuaScreenshot(screenshotBuffer, currentUrl); + } catch { + // non-fatal + } + return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), currentUrl, @@ -767,6 +800,129 @@ export class V3CuaAgentHandler { } } + /** + * Emit a pre-action CUA screenshot — the exact buffer the model received + * as input. Tier-1 evidence (agent-mirrored); the tier-2 probe is taken + * separately in emitCuaActionStep after the action runs, so the recorder + * can compare what the model saw against what the page actually showed + * once the keystrokes/clicks landed. + */ + private emitCuaScreenshot( + screenshot: Buffer, + url: string, + ): AgentScreenshotTakenEvent { + const event: AgentScreenshotTakenEvent = { + stepIndex: this.cuaStepCounter++, + screenshot, + url, + evidenceRole: "agent", + }; + this.latestCuaScreenshot = event; + this.latestCuaScreenshotConsumed = false; + this.v3.bus.emit("agent_screenshot_taken_event", event); + return event; + } + + private async emitCuaActionStep( + action: AgentAction, + result: ActionExecutionResult | undefined, + ): Promise { + let pageUrl = + typeof action.pageUrl === "string" + ? action.pageUrl + : this.latestCuaScreenshot?.url; + try { + pageUrl = (await this.v3.context.awaitActivePage()).url(); + } catch { + // Keep the best pre-action URL fallback. + } + let stepIndex: number; + + if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) { + stepIndex = this.latestCuaScreenshot.stepIndex; + this.latestCuaScreenshotConsumed = true; + } else if (this.latestCuaScreenshot) { + stepIndex = this.cuaStepCounter++; + this.v3.bus.emit("agent_screenshot_taken_event", { + ...this.latestCuaScreenshot, + stepIndex, + }); + } else { + stepIndex = this.cuaStepCounter++; + } + + const actionArgs = Object.fromEntries( + Object.entries(action).filter(([key]) => key !== "screenshot"), + ); + const reasoning = + typeof action.reasoning === "string" + ? action.reasoning + : typeof action.action === "string" + ? action.action + : ""; + + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: String(action.type), + actionArgs, + reasoning, + toolOutput: { + ok: result?.success !== false, + result: result ?? { success: true }, + error: result?.error, + }, + finishedAt: new Date().toISOString(), + }); + + // Post-action tier-2 probe. The pre-action screenshot from + // screenshotProvider is what the model SAW; this one shows what the + // page actually LOOKS LIKE after the action ran. Without this the + // verifier has no visual evidence that keystrokes/clicks landed, and + // has to trust the action history alone. + // + // Listener-gated to keep ordinary agent runs free of the extra + // screenshot cost — mirrors v3AgentHandler's post-step probe. + const wantsScreenshotProbe = + this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount("agent_step_observed_event") > 0; + let probeUrl = pageUrl; + if (wantsScreenshotProbe || wantsStepObservation) { + try { + const page = await this.v3.context.awaitActivePage(); + probeUrl = page.url(); + if (wantsScreenshotProbe) { + const probeScreenshot = await page.screenshot({ fullPage: false }); + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot: probeScreenshot, + url: probeUrl, + evidenceRole: "probe", + }); + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: CUA post-action probe failed: ${ + e instanceof Error ? e.message : String(e) + }`, + level: 1, + }); + } + } + + if (probeUrl && wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier can + // ground textual claims without OCR. Best-effort. + const ariaTree = await captureAriaTreeProbe(this.v3); + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: probeUrl, + ariaTree, + }); + } + } + private async injectCursor(): Promise { try { const page = await this.v3.context.awaitActivePage(); diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts new file mode 100644 index 000000000..62e992949 --- /dev/null +++ b/packages/core/lib/v3/types/public/busEvents.ts @@ -0,0 +1,108 @@ +/** + * Bus event payloads emitted by V3 on `v3.bus`. + * + * The bus is an EventEmitter; these types document the payload shape per + * event name so consumers (TrajectoryRecorder in packages/evals, custom + * subscribers) can type their handlers. + * + * Wave 0 of the verifier rewrite plan introduces: + * - agent_screenshot_taken_event — independent post-step screenshot probe + * - agent_step_finished_event — fired per tool-call in a step result + * - agent_step_observed_event — fired after the harness probe completes + * - agent_final_answer_event — fired when the `done` tool resolves + * + * `agent_step_started_event` is documented in the plan but deferred — the AI + * SDK's `onStepFinish` is a post-hook, and there's no symmetric pre-hook per + * tool execution in v3AgentHandler today. Started-state can be derived from + * the finished event's stepIndex if needed. + */ + +/** + * Names of bus events the agent handlers emit. Use these constants to + * subscribe; the bus accepts arbitrary strings, but a centralized list helps + * catch typos at the call site. + */ +export const BUS_EVENTS = { + AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event", + AGENT_STEP_FINISHED: "agent_step_finished_event", + AGENT_STEP_OBSERVED: "agent_step_observed_event", + AGENT_FINAL_ANSWER: "agent_final_answer_event", +} as const; + +export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; + +/** + * Payload for `agent_screenshot_taken_event`. The raw screenshot Buffer the + * harness took after a step's tool execution. + * + * Note: in CUA mode the same Buffer is also what the provider received; in + * DOM/hybrid mode it's an independent harness probe. The verifier treats them + * as different evidence tiers regardless — see plan §04 ("Mode-by-mode sources"). + */ +export interface AgentScreenshotTakenEvent { + /** Zero-based index of the step this screenshot corresponds to. */ + stepIndex: number; + /** PNG bytes from page.screenshot(). */ + screenshot: Buffer; + /** Page URL at the time of capture. */ + url: string; + /** + * Evidence role for this screenshot. + * + * DOM/hybrid post-tool screenshots are probe-only. CUA screenshots are also + * the exact image bytes sent to the provider, so they serve both as tier-1 + * agent evidence and tier-2 probe evidence. + */ + evidenceRole?: "probe" | "agent" | "agent_and_probe"; +} + +/** + * Payload for `agent_step_finished_event`. Emitted once per tool call within + * a step result. Carries the tool's reported outcome and a reference to the + * agent's textual reasoning for the step. + * + * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured + * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper + * — not in this payload. See plan §10 Q1 (resolved: onStepFinish). + */ +export interface AgentStepFinishedEvent { + stepIndex: number; + /** Name of the tool that ran (e.g., "act", "extract", "click"). */ + actionName: string; + /** Arguments passed to the tool. */ + actionArgs: Record; + /** Agent's textual reasoning (event.text on the AI SDK StepResult). */ + reasoning: string; + /** Outcome of the tool execution as seen by the harness. */ + toolOutput: { + ok: boolean; + /** The tool's native return value. */ + result: unknown; + error?: string; + }; + /** ISO 8601 timestamp at which the step finished. */ + finishedAt: string; +} + +/** + * Payload for `agent_step_observed_event`. Emitted after the harness probe + * completes for a step (page URL captured at minimum; a11y tree and scroll + * info added in Wave 2). + */ +export interface AgentStepObservedEvent { + stepIndex: number; + /** Page URL after the step's tool execution. */ + url: string; + /** v1 — accessibility tree snapshot. */ + ariaTree?: string; + /** v1 — viewport scroll context. */ + scroll?: { top: number; pageHeight: number }; +} + +/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */ +export interface AgentFinalAnswerEvent { + /** The agent's final summary message. */ + message: string; + /** Optional structured output if the agent's `output` schema was set. */ + output?: Record; +} diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts index 9c5df08d0..9bf24eb27 100644 --- a/packages/core/lib/v3/types/public/index.ts +++ b/packages/core/lib/v3/types/public/index.ts @@ -1,4 +1,5 @@ export * from "./agent.js"; +export * from "./busEvents.js"; // Export api.ts under namespace to avoid conflicts with methods.ts types export * as Api from "./api.js"; // Also export BrowserbaseRegion directly for convenience diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts new file mode 100644 index 000000000..2b7f24b52 --- /dev/null +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -0,0 +1,507 @@ +/** + * TrajectoryRecorder — subscribes to v3.bus step events emitted by the agent + * handlers (v3AgentHandler / v3CuaAgentHandler) and assembles a Trajectory + * the verifier can consume. + * + * Lifecycle: + * const recorder = new TrajectoryRecorder({ v3, taskSpec }); + * recorder.start(); + * await agent.execute(...); + * const trajectory = await recorder.finish({ status: "complete", usage }); + * + * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES` (plan §10 Q2): + * - unset: persistence follows the default (on locally, off in CI). + * - "1" / "true": always persist. + * - "0" / "false": never persist. + * + * On-disk layout matches microsoft/fara's example_trajectory/ so we can + * cross-validate against verify_trajectories.py without format conversion. + * + * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk) + */ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { + AgentEvidence, + AgentFinalAnswerEvent, + AgentScreenshotTakenEvent, + AgentStepFinishedEvent, + AgentStepObservedEvent, + ProbeEvidence, + TaskSpec, + Trajectory, + TrajectoryStatus, + TrajectoryStep, + TrajectoryUsage, + Verdict, + V3, +} from "@browserbasehq/stagehand"; + +interface PartialStep { + index: number; + actionName: string; + actionArgs: Record; + reasoning: string; + agentEvidence: AgentEvidence; + probeEvidence: ProbeEvidence; + toolOutput: { ok: boolean; result: unknown; error?: string }; + finishedAt: string; +} + +export interface TrajectoryRecorderOptions { + v3: V3; + taskSpec: TaskSpec; + /** + * Root directory under which trajectory dirs are written. Each task run + * gets a subdirectory named by runId/task.id. + * Defaults to `/.trajectories`. + */ + outputRoot?: string; + /** Run identifier (e.g., ISO timestamp + env). Defaults to a fresh timestamp. */ + runId?: string; + /** + * Override the env-gated persistence default. `true` always persists, + * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES. + */ + persist?: boolean; +} + +export interface TrajectoryFinishOptions { + status: TrajectoryStatus; + finalAnswer?: string; + usage?: Partial; +} + +const ZERO_USAGE: TrajectoryUsage = { + input_tokens: 0, + output_tokens: 0, +}; + +/** + * Decide whether to persist by default. Honors the explicit override first, + * then env, then falls back to "persist when not in CI". + */ +function shouldPersist(override: boolean | undefined): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +export class TrajectoryRecorder { + private readonly v3: V3; + private readonly taskSpec: TaskSpec; + private readonly runId: string; + private readonly outputDir: string; + private readonly persistEnabled: boolean; + + // Per-stepIndex builders; events can arrive out-of-order in theory, though + // the handlers emit step_finished → screenshot_taken → step_observed in the + // same microtask. + private readonly partialSteps = new Map>(); + private readonly observationByStep = new Map< + number, + AgentStepObservedEvent + >(); + private readonly screenshotsByStep = new Map< + number, + AgentScreenshotTakenEvent + >(); + private finalAnswerEvent?: AgentFinalAnswerEvent; + private startedAt = ""; + private endedAt = ""; + private listenersAttached = false; + + // Strongly-typed bound handlers so we can attach/detach the same references. + private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => { + this.screenshotsByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + + // Default to "probe" when the emit site doesn't tag the role — matches + // v3AgentHandler's post-step screenshot, which is always a tier-2 probe. + const role = e.evidenceRole ?? "probe"; + + // Probe channel (tier 2): the page's state at observation time. For CUA + // the pre-action screenshot is NOT a probe — that role is filled by the + // post-action emit from emitCuaActionStep. So only update probe.screenshot + // when the event explicitly carries the probe role. + if (role === "probe" || role === "agent_and_probe") { + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.screenshot = e.screenshot; + probe.url = e.url; + partial.probeEvidence = probe; + } else if (!partial.probeEvidence?.url) { + // Even for tier-1-only events, the URL is useful probe context if we + // don't have one yet. Doesn't overwrite a later post-action URL. + partial.probeEvidence = { + ...(partial.probeEvidence ?? {}), + url: e.url, + }; + } + + // Agent channel (tier 1): bytes the model ingested. + if (role === "agent" || role === "agent_and_probe") { + partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { + modalities: [ + { type: "image", bytes: e.screenshot, mediaType: "image/png" }, + ], + }); + } + }; + private readonly onStepFinished = (e: AgentStepFinishedEvent) => { + const partial = this.ensurePartial(e.stepIndex); + partial.actionName = e.actionName; + partial.actionArgs = e.actionArgs; + partial.reasoning = e.reasoning; + partial.toolOutput = e.toolOutput; + partial.finishedAt = e.finishedAt; + partial.agentEvidence = mergeAgentEvidence( + partial.agentEvidence, + buildAgentEvidence(e), + ); + }; + private readonly onStepObserved = (e: AgentStepObservedEvent) => { + this.observationByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.url = e.url; + if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; + if (e.scroll !== undefined) probe.scroll = e.scroll; + partial.probeEvidence = probe; + }; + private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => { + this.finalAnswerEvent = e; + }; + + constructor(opts: TrajectoryRecorderOptions) { + this.v3 = opts.v3; + this.taskSpec = opts.taskSpec; + this.runId = + opts.runId ?? + new Date().toISOString().replace(/[:.]/g, "-").replace("T", "T"); + const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); + this.outputDir = path.join(root, this.runId, opts.taskSpec.id); + this.persistEnabled = shouldPersist(opts.persist); + } + + /** Subscribe to bus events. Call once before agent.execute(). */ + start(): void { + if (this.listenersAttached) return; + this.startedAt = new Date().toISOString(); + this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.on("agent_step_finished_event", this.onStepFinished); + this.v3.bus.on("agent_step_observed_event", this.onStepObserved); + this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = true; + } + + /** + * Detach listeners, assemble the Trajectory, and (if persistence is on) + * write the on-disk layout. Idempotent. + */ + async finish(opts: TrajectoryFinishOptions): Promise { + this.detach(); + this.endedAt = new Date().toISOString(); + + const steps = this.assembleSteps(); + const trajectory: Trajectory = { + task: this.taskSpec, + steps, + finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message, + status: opts.status, + usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) }, + timing: { startedAt: this.startedAt, endedAt: this.endedAt }, + }; + + if (this.persistEnabled) { + await this.persist(trajectory); + } + + return trajectory; + } + + /** Throw away in-memory state without writing to disk. Used on early abort. */ + cancel(): void { + this.detach(); + this.partialSteps.clear(); + this.observationByStep.clear(); + this.screenshotsByStep.clear(); + this.finalAnswerEvent = undefined; + } + + /** Where the trajectory dir lives (whether or not it was persisted). */ + get directory(): string { + return this.outputDir; + } + + /** Whether this recorder wrote the trajectory directory on finish(). */ + get persisted(): boolean { + return this.persistEnabled; + } + + /** + * Persist verifier scores next to the trajectory. No-op when trajectory + * persistence is disabled. + */ + async persistVerdict( + verdict: Verdict, + filename = "mmrubric_v1.json", + ): Promise { + if (!this.persistEnabled) return; + + const scoresDir = path.join(this.outputDir, "scores"); + await fs.mkdir(scoresDir, { recursive: true }); + await fs.writeFile( + path.join(scoresDir, filename), + JSON.stringify(verdict, null, 2), + ); + + const taskDataPath = path.join(this.outputDir, "task_data.json"); + let taskData: Record = {}; + try { + taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record< + string, + unknown + >; + } catch { + taskData = { task: this.taskSpec }; + } + await fs.writeFile( + taskDataPath, + JSON.stringify({ ...taskData, verdict }, null, 2), + ); + } + + private detach(): void { + if (!this.listenersAttached) return; + this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.off("agent_step_finished_event", this.onStepFinished); + this.v3.bus.off("agent_step_observed_event", this.onStepObserved); + this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = false; + } + + private ensurePartial(stepIndex: number): Partial { + let p = this.partialSteps.get(stepIndex); + if (!p) { + p = { index: stepIndex }; + this.partialSteps.set(stepIndex, p); + } + return p; + } + + /** + * Materialize ordered TrajectoryStep[] from the accumulated partials. + * Steps that never received a step_finished event are skipped (they can + * appear for CUA where only screenshot events fire — those are recorded as + * orphan probe screenshots and elided here). + */ + private assembleSteps(): TrajectoryStep[] { + const out: TrajectoryStep[] = []; + const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); + for (const i of indices) { + const p = this.partialSteps.get(i)!; + if ( + p.actionName === undefined || + p.toolOutput === undefined || + p.finishedAt === undefined + ) { + // Orphan screenshot-only entry (typically CUA). Skip — we record + // these by writing the screenshot to disk separately during persist(). + continue; + } + out.push({ + index: i, + actionName: p.actionName, + actionArgs: p.actionArgs ?? {}, + reasoning: p.reasoning ?? "", + agentEvidence: p.agentEvidence ?? { modalities: [] }, + probeEvidence: p.probeEvidence ?? {}, + toolOutput: p.toolOutput, + startedAt: this.startedAt, + finishedAt: p.finishedAt, + }); + } + return out; + } + + /** + * Write the trajectory directory layout. Mirrors fara's example_trajectory/: + * + * / + * ├── task_data.json + * ├── trajectory.json (screenshots referenced by path) + * ├── screenshot_.png + * └── times.json + */ + private async persist(trajectory: Trajectory): Promise { + await fs.mkdir(this.outputDir, { recursive: true }); + + // Walk steps and write screenshots; replace Buffer with path reference in + // the serialized trajectory. Both tiers externalize image bytes under + // screenshots/probe/.png — tier 2, what the harness observed + // screenshots/agent/.png — tier 1, what the model received + // The `_` suffix only appears when a step carries multiple images + // (rare; typically zero or one per step). Paths in JSON are relative to + // the trajectory dir so the directory is movable/copyable as a unit. + await fs.mkdir(path.join(this.outputDir, "screenshots", "probe"), { + recursive: true, + }); + await fs.mkdir(path.join(this.outputDir, "screenshots", "agent"), { + recursive: true, + }); + + const serializableSteps: unknown[] = []; + for (const step of trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile( + path.join(this.outputDir, relPath), + probe.screenshot, + ); + probe.screenshotPath = relPath; + delete probe.screenshot; + } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; + const modalities: unknown[] = []; + for (const m of step.agentEvidence.modalities) { + if (m.type !== "image") { + modalities.push(m); + continue; + } + const suffix = multipleImages ? `_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + await fs.writeFile(path.join(this.outputDir, relPath), m.bytes); + modalities.push({ + type: "image", + imagePath: relPath, + mediaType: m.mediaType, + }); + imageSeq += 1; + } + const agentEvidence = { modalities }; + serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); + } + + // Image modalities carry imagePath instead of raw bytes on disk, so this + // is no longer a strict Trajectory at the type level. Cast through + // unknown rather than widening the type contract. + const serialized = { + ...trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(this.outputDir, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + // task_data.json mirrors fara's shape: TaskSpec + (later) verdict. + await fs.writeFile( + path.join(this.outputDir, "task_data.json"), + JSON.stringify( + { + task: trajectory.task, + status: trajectory.status, + finalAnswer: trajectory.finalAnswer ?? null, + }, + null, + 2, + ), + ); + + await fs.writeFile( + path.join(this.outputDir, "times.json"), + JSON.stringify( + { + timing: trajectory.timing, + usage: trajectory.usage, + stepCount: trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(this.outputDir, "scores"), { recursive: true }); + await fs.writeFile( + path.join(this.outputDir, "core.log"), + coreLog(trajectory), + ); + } +} + +function mergeAgentEvidence( + ...parts: Array +): AgentEvidence { + return { + modalities: parts.flatMap((p) => p?.modalities ?? []), + }; +} + +/** + * Build a tier-1 AgentEvidence from a step_finished event. The handler's + * toolOutput.result is what the LLM consumed next turn (modulo SDK + * serialization). Wave 1 will replace this with a higher-fidelity capture + * pulled from event.response.messages. + */ +function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { + const modalities: AgentEvidence["modalities"] = []; + if (e.reasoning) { + modalities.push({ type: "text", content: e.reasoning }); + } + const result = e.toolOutput.result; + if (result === undefined || result === null) { + return { modalities }; + } + if (typeof result === "string") { + modalities.push({ type: "text", content: result }); + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + // Tool results commonly include a screenshotBase64 field for vision tools. + const r = result as { screenshotBase64?: string } & Record; + if (typeof r.screenshotBase64 === "string") { + try { + modalities.push({ + type: "image", + bytes: Buffer.from(r.screenshotBase64, "base64"), + mediaType: "image/png", + }); + } catch { + // ignore + } + } + modalities.push({ type: "json", content: result }); + } + return { modalities }; +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts new file mode 100644 index 000000000..20dfb85b6 --- /dev/null +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -0,0 +1,230 @@ +/** + * Wave 0 smoke test — verifies the TrajectoryRecorder plumbing end-to-end + * without launching a browser or calling an LLM. + * + * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus + * events the real agent handlers emit, then asserts: + * 1. The recorder assembles a Trajectory with the expected step shape. + * 2. The persisted directory layout matches fara's example_trajectory/. + * 3. V3Evaluator.verify() returns a parseable stub Verdict. + * + * Run via: pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts + */ +import assert from "node:assert/strict"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { EventEmitter } from "node:events"; + +import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js"; +import { V3Evaluator } from "@browserbasehq/stagehand"; +import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; + +interface FakeV3 { + bus: EventEmitter; +} + +async function main(): Promise { + const tmpRoot = await fs.mkdtemp( + path.join(os.tmpdir(), "verifier-rewrite-smoke-"), + ); + console.log(`▸ tmpdir: ${tmpRoot}`); + + const bus = new EventEmitter(); + const v3 = { bus } as unknown as V3; + const taskSpec: TaskSpec = { + id: "smoke-united_13", + instruction: + "What is the price difference between economy and business class on United?", + initUrl: "https://www.google.com", + precomputedRubric: { + items: [ + { + criterion: "Identify correct route", + description: "Agent identifies United CHI→GRU flight.", + max_points: 2, + }, + { + criterion: "Report price delta", + description: "Agent reports economy↔business price delta.", + max_points: 3, + }, + ], + }, + expectedAnswer: "Approximately $4,000 difference.", + }; + + const recorder = new TrajectoryRecorder({ + v3, + taskSpec, + outputRoot: tmpRoot, + runId: "smoke-run", + persist: true, + }); + recorder.start(); + + // Emit a three-step synthetic trajectory. + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "goto", + actionArgs: { url: "https://united.com" }, + reasoning: "Open United Airlines homepage.", + toolOutput: { ok: true, result: { url: "https://united.com" } }, + finishedAt: new Date().toISOString(), + }); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot: Buffer.from("fake-png-bytes-0"), + url: "https://united.com", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://united.com", + }); + + bus.emit("agent_step_finished_event", { + stepIndex: 1, + actionName: "act", + actionArgs: { instruction: "Search Chicago to São Paulo, Nov 24" }, + reasoning: "Enter route and dates.", + toolOutput: { + ok: true, + result: { success: true, describe: "Filled route + dates" }, + }, + finishedAt: new Date().toISOString(), + }); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 1, + screenshot: Buffer.from("fake-png-bytes-1"), + url: "https://united.com/search", + }); + bus.emit("agent_step_observed_event", { + stepIndex: 1, + url: "https://united.com/search", + }); + + bus.emit("agent_step_finished_event", { + stepIndex: 2, + actionName: "extract", + actionArgs: { instruction: "extract fare cells" }, + reasoning: "Read economy and business fares from the results page.", + toolOutput: { + ok: true, + result: { economy: "$1,234", business: "$5,789" }, + }, + finishedAt: new Date().toISOString(), + }); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 2, + screenshot: Buffer.from("fake-png-bytes-2"), + url: "https://united.com/results", + }); + bus.emit("agent_step_observed_event", { + stepIndex: 2, + url: "https://united.com/results", + ariaTree: + "[0-1] RootWebArea: United Search Results\n [0-3] heading: Flight 1234\n [0-4] StaticText: Economy $1,234\n [0-5] StaticText: Business $5,789", + }); + + bus.emit("agent_final_answer_event", { + message: "Economy $1,234 vs business $5,789 — delta $4,555.", + }); + + const trajectory = await recorder.finish({ + status: "complete", + usage: { input_tokens: 1234, output_tokens: 567 }, + }); + + // ── Assertions ────────────────────────────────────────────────────────── + assert.equal(trajectory.steps.length, 3, "expected 3 steps"); + assert.equal(trajectory.steps[0].actionName, "goto"); + assert.equal(trajectory.steps[1].actionName, "act"); + assert.equal(trajectory.steps[2].actionName, "extract"); + assert.ok( + trajectory.steps[0].agentEvidence.modalities.some( + (m) => m.type === "image", + ), + "CUA-style screenshot event should populate tier-1 image evidence", + ); + assert.ok( + trajectory.steps[2].agentEvidence.modalities.some( + (m) => + m.type === "json" && + typeof m.content === "object" && + m.content !== null && + "economy" in (m.content as Record), + ), + "extract step should carry a json modality with economy field", + ); + assert.equal( + trajectory.finalAnswer, + "Economy $1,234 vs business $5,789 — delta $4,555.", + ); + assert.equal(trajectory.status, "complete"); + assert.equal(trajectory.usage.input_tokens, 1234); + // a11y dump on step 2 should round-trip through the recorder into + // probeEvidence.ariaTree. + assert.ok( + trajectory.steps[2].probeEvidence.ariaTree?.includes("Economy $1,234"), + "step_observed.ariaTree should populate probeEvidence.ariaTree", + ); + console.log(" ✓ in-memory Trajectory shape (incl. ariaTree round-trip)"); + + // ── On-disk layout ────────────────────────────────────────────────────── + const taskDir = path.join(tmpRoot, "smoke-run", "smoke-united_13"); + const files = (await fs.readdir(taskDir)).sort(); + assert.deepEqual( + files, + [ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ], + `expected new trajectory layout, got ${files.join(", ")}`, + ); + const probeFiles = ( + await fs.readdir(path.join(taskDir, "screenshots", "probe")) + ).sort(); + assert.deepEqual( + probeFiles, + ["1.png", "2.png", "3.png"], + `expected probe screenshots, got ${probeFiles.join(", ")}`, + ); + const screenshotBytes = await fs.readFile( + path.join(taskDir, "screenshots", "probe", "1.png"), + ); + assert.equal(screenshotBytes.toString(), "fake-png-bytes-0"); + const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8"); + assert.ok(coreLog.includes('"action":"goto"')); + console.log(" ✓ on-disk layout matches fara's example_trajectory"); + + const persistedTask = JSON.parse( + await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), + ); + assert.equal(persistedTask.task.id, "smoke-united_13"); + assert.equal(persistedTask.status, "complete"); + + // ── V3Evaluator.verify() exercised live in verify-live-trajectory.ts ── + // Sanity-check that the V3Evaluator class still constructs from a minimal + // V3 shape (recorder doesn't depend on the evaluator for plumbing). + const _unused: typeof V3Evaluator = V3Evaluator; + void _unused; + console.log( + " ✓ V3Evaluator still constructs (verify() exercised live elsewhere)", + ); + + console.log("\n✅ Wave 0 plumbing OK"); + await fs.rm(tmpRoot, { recursive: true, force: true }); +} + +main().catch((err) => { + console.error("\n❌ Wave 0 plumbing FAILED:", err); + process.exit(1); +}); + +// Type guard for FakeV3 lint suppression (the file uses `as unknown as V3`). +export type { FakeV3 }; From 1221fe03233d5584f0bb432feb55860cc0defb84 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:16:26 -0700 Subject: [PATCH 2/6] fix(verifier): align trajectory naming --- .changeset/verifier-trajectory-events.md | 5 +++++ packages/evals/framework/trajectoryRecorder.ts | 6 ++++-- packages/evals/scripts/verify-trajectory-recorder.ts | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 .changeset/verifier-trajectory-events.md diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md new file mode 100644 index 000000000..9dcb5c819 --- /dev/null +++ b/.changeset/verifier-trajectory-events.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Capture verifier trajectory evidence from v3 agent events for offline scoring. diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 2b7f24b52..501668c2b 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -327,12 +327,14 @@ export class TrajectoryRecorder { } /** - * Write the trajectory directory layout. Mirrors fara's example_trajectory/: + * Write the trajectory directory layout. * * / * ├── task_data.json * ├── trajectory.json (screenshots referenced by path) - * ├── screenshot_.png + * ├── screenshots/ + * │ ├── probe/.png + * │ └── agent/.png * └── times.json */ private async persist(trajectory: Trajectory): Promise { diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts index 20dfb85b6..7076fff21 100644 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -42,12 +42,12 @@ async function main(): Promise { { criterion: "Identify correct route", description: "Agent identifies United CHI→GRU flight.", - max_points: 2, + maxPoints: 2, }, { criterion: "Report price delta", description: "Agent reports economy↔business price delta.", - max_points: 3, + maxPoints: 3, }, ], }, From bdea11ae1f58ca74f0c87d37a6ccf6f1c2c57687 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:35:06 -0700 Subject: [PATCH 3/6] chore(evals): remove upstream trajectory references --- packages/evals/framework/trajectoryRecorder.ts | 6 +++--- packages/evals/scripts/verify-trajectory-recorder.ts | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 501668c2b..5a8a62f1d 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -14,8 +14,8 @@ * - "1" / "true": always persist. * - "0" / "false": never persist. * - * On-disk layout matches microsoft/fara's example_trajectory/ so we can - * cross-validate against verify_trajectories.py without format conversion. + * On-disk layout is stable JSON + screenshots so saved runs can be re-scored + * without format conversion. * * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk) */ @@ -405,7 +405,7 @@ export class TrajectoryRecorder { JSON.stringify(serialized, null, 2), ); - // task_data.json mirrors fara's shape: TaskSpec + (later) verdict. + // task_data.json stores TaskSpec + (later) verdict. await fs.writeFile( path.join(this.outputDir, "task_data.json"), JSON.stringify( diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts index 7076fff21..049b96c70 100644 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -5,7 +5,7 @@ * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus * events the real agent handlers emit, then asserts: * 1. The recorder assembles a Trajectory with the expected step shape. - * 2. The persisted directory layout matches fara's example_trajectory/. + * 2. The persisted directory layout has the expected verifier files. * 3. V3Evaluator.verify() returns a parseable stub Verdict. * * Run via: pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts @@ -200,7 +200,7 @@ async function main(): Promise { assert.equal(screenshotBytes.toString(), "fake-png-bytes-0"); const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8"); assert.ok(coreLog.includes('"action":"goto"')); - console.log(" ✓ on-disk layout matches fara's example_trajectory"); + console.log(" ✓ on-disk layout has expected verifier files"); const persistedTask = JSON.parse( await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), From d5e1af41e31357a30315ef64dbf0247f37fb242b Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:19:16 -0700 Subject: [PATCH 4/6] docs(verifier): remove rollout comments from trajectory capture --- packages/core/lib/v3/handlers/v3AgentHandler.ts | 2 +- packages/core/lib/v3/handlers/v3CuaAgentHandler.ts | 4 ++-- packages/evals/framework/trajectoryRecorder.ts | 7 ++----- packages/evals/scripts/verify-trajectory-recorder.ts | 6 +++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index d0308bdd8..afddddef2 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -313,7 +313,7 @@ export class V3AgentHandler { // Emit step_finished_event per tool call. The TrajectoryRecorder // builds one Trajectory.Step per emission. tier-1 evidence (the // bytes the LLM consumed) is captured separately via an - // onStepFinish wrapper in the harness (plan §10 Q1). + // onStepFinish wrapper in the harness. const stepIndex = stepCounter++; stepIndicesInTurn.push(stepIndex); const toolOk = diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index f1dd2666e..2fd08b864 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -42,7 +42,7 @@ export class V3CuaAgentHandler { // Monotonic step counter used by bus events. The CUA loop is internal to // the agent client, so unlike v3AgentHandler we don't have per-tool-call // step events; instead we tag every screenshot emission with an - // incrementing index. Wave 1 may add finer-grained step events here. + // incrementing index. private cuaStepCounter = 0; private latestCuaScreenshot?: AgentScreenshotTakenEvent; private latestCuaScreenshotConsumed = true; @@ -89,7 +89,7 @@ export class V3CuaAgentHandler { // Emit bus event so TrajectoryRecorder can capture the screenshot. In // CUA mode this is the same buffer the provider receives — i.e., it // serves both as tier-1 evidence (what the model saw) and as a tier-2 - // probe. See plan §04 "Mode-by-mode sources". + // probe. try { this.emitCuaScreenshot(screenshotBuffer, page.url()); } catch { diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index 5a8a62f1d..d7c4d62ab 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -9,15 +9,13 @@ * await agent.execute(...); * const trajectory = await recorder.finish({ status: "complete", usage }); * - * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES` (plan §10 Q2): + * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES`: * - unset: persistence follows the default (on locally, off in CI). * - "1" / "true": always persist. * - "0" / "false": never persist. * * On-disk layout is stable JSON + screenshots so saved runs can be re-scored * without format conversion. - * - * @see ~/.claude/plans/verifier-rewrite.html §06 (Trajectory on-disk) */ import fs from "node:fs/promises"; import path from "node:path"; @@ -451,8 +449,7 @@ function mergeAgentEvidence( /** * Build a tier-1 AgentEvidence from a step_finished event. The handler's * toolOutput.result is what the LLM consumed next turn (modulo SDK - * serialization). Wave 1 will replace this with a higher-fidelity capture - * pulled from event.response.messages. + * serialization). */ function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { const modalities: AgentEvidence["modalities"] = []; diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts index 049b96c70..c2df86fd1 100644 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ b/packages/evals/scripts/verify-trajectory-recorder.ts @@ -1,5 +1,5 @@ /** - * Wave 0 smoke test — verifies the TrajectoryRecorder plumbing end-to-end + * Smoke test — verifies the TrajectoryRecorder plumbing end-to-end * without launching a browser or calling an LLM. * * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus @@ -217,12 +217,12 @@ async function main(): Promise { " ✓ V3Evaluator still constructs (verify() exercised live elsewhere)", ); - console.log("\n✅ Wave 0 plumbing OK"); + console.log("\n✅ Trajectory recorder plumbing OK"); await fs.rm(tmpRoot, { recursive: true, force: true }); } main().catch((err) => { - console.error("\n❌ Wave 0 plumbing FAILED:", err); + console.error("\n❌ Trajectory recorder plumbing FAILED:", err); process.exit(1); }); From bfd9bc983375ff339785cf59f09e14c8f4e6c9c5 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:31:14 -0700 Subject: [PATCH 5/6] test(evals): cover trajectory recorder in vitest --- .../evals/framework/trajectoryRecorder.ts | 16 +- .../scripts/verify-trajectory-recorder.ts | 230 ------------------ .../framework/trajectoryRecorder.test.ts | 197 +++++++++++++++ 3 files changed, 205 insertions(+), 238 deletions(-) delete mode 100644 packages/evals/scripts/verify-trajectory-recorder.ts create mode 100644 packages/evals/tests/framework/trajectoryRecorder.test.ts diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts index d7c4d62ab..8895a0844 100644 --- a/packages/evals/framework/trajectoryRecorder.ts +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -31,7 +31,7 @@ import type { TrajectoryStatus, TrajectoryStep, TrajectoryUsage, - Verdict, + EvaluationResult, V3, } from "@browserbasehq/stagehand"; @@ -239,12 +239,12 @@ export class TrajectoryRecorder { } /** - * Persist verifier scores next to the trajectory. No-op when trajectory + * Persist evaluator result next to the trajectory. No-op when trajectory * persistence is disabled. */ - async persistVerdict( - verdict: Verdict, - filename = "mmrubric_v1.json", + async persistResult( + result: EvaluationResult, + filename = "result.json", ): Promise { if (!this.persistEnabled) return; @@ -252,7 +252,7 @@ export class TrajectoryRecorder { await fs.mkdir(scoresDir, { recursive: true }); await fs.writeFile( path.join(scoresDir, filename), - JSON.stringify(verdict, null, 2), + JSON.stringify(result, null, 2), ); const taskDataPath = path.join(this.outputDir, "task_data.json"); @@ -267,7 +267,7 @@ export class TrajectoryRecorder { } await fs.writeFile( taskDataPath, - JSON.stringify({ ...taskData, verdict }, null, 2), + JSON.stringify({ ...taskData, result }, null, 2), ); } @@ -403,7 +403,7 @@ export class TrajectoryRecorder { JSON.stringify(serialized, null, 2), ); - // task_data.json stores TaskSpec + (later) verdict. + // task_data.json stores TaskSpec + (later) result. await fs.writeFile( path.join(this.outputDir, "task_data.json"), JSON.stringify( diff --git a/packages/evals/scripts/verify-trajectory-recorder.ts b/packages/evals/scripts/verify-trajectory-recorder.ts deleted file mode 100644 index c2df86fd1..000000000 --- a/packages/evals/scripts/verify-trajectory-recorder.ts +++ /dev/null @@ -1,230 +0,0 @@ -/** - * Smoke test — verifies the TrajectoryRecorder plumbing end-to-end - * without launching a browser or calling an LLM. - * - * Drives a fake V3 (just an EventEmitter-shaped `bus`) through the same bus - * events the real agent handlers emit, then asserts: - * 1. The recorder assembles a Trajectory with the expected step shape. - * 2. The persisted directory layout has the expected verifier files. - * 3. V3Evaluator.verify() returns a parseable stub Verdict. - * - * Run via: pnpm tsx packages/evals/scripts/verify-trajectory-recorder.ts - */ -import assert from "node:assert/strict"; -import fs from "node:fs/promises"; -import os from "node:os"; -import path from "node:path"; -import { EventEmitter } from "node:events"; - -import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js"; -import { V3Evaluator } from "@browserbasehq/stagehand"; -import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; - -interface FakeV3 { - bus: EventEmitter; -} - -async function main(): Promise { - const tmpRoot = await fs.mkdtemp( - path.join(os.tmpdir(), "verifier-rewrite-smoke-"), - ); - console.log(`▸ tmpdir: ${tmpRoot}`); - - const bus = new EventEmitter(); - const v3 = { bus } as unknown as V3; - const taskSpec: TaskSpec = { - id: "smoke-united_13", - instruction: - "What is the price difference between economy and business class on United?", - initUrl: "https://www.google.com", - precomputedRubric: { - items: [ - { - criterion: "Identify correct route", - description: "Agent identifies United CHI→GRU flight.", - maxPoints: 2, - }, - { - criterion: "Report price delta", - description: "Agent reports economy↔business price delta.", - maxPoints: 3, - }, - ], - }, - expectedAnswer: "Approximately $4,000 difference.", - }; - - const recorder = new TrajectoryRecorder({ - v3, - taskSpec, - outputRoot: tmpRoot, - runId: "smoke-run", - persist: true, - }); - recorder.start(); - - // Emit a three-step synthetic trajectory. - bus.emit("agent_step_finished_event", { - stepIndex: 0, - actionName: "goto", - actionArgs: { url: "https://united.com" }, - reasoning: "Open United Airlines homepage.", - toolOutput: { ok: true, result: { url: "https://united.com" } }, - finishedAt: new Date().toISOString(), - }); - bus.emit("agent_screenshot_taken_event", { - stepIndex: 0, - screenshot: Buffer.from("fake-png-bytes-0"), - url: "https://united.com", - evidenceRole: "agent_and_probe", - }); - bus.emit("agent_step_observed_event", { - stepIndex: 0, - url: "https://united.com", - }); - - bus.emit("agent_step_finished_event", { - stepIndex: 1, - actionName: "act", - actionArgs: { instruction: "Search Chicago to São Paulo, Nov 24" }, - reasoning: "Enter route and dates.", - toolOutput: { - ok: true, - result: { success: true, describe: "Filled route + dates" }, - }, - finishedAt: new Date().toISOString(), - }); - bus.emit("agent_screenshot_taken_event", { - stepIndex: 1, - screenshot: Buffer.from("fake-png-bytes-1"), - url: "https://united.com/search", - }); - bus.emit("agent_step_observed_event", { - stepIndex: 1, - url: "https://united.com/search", - }); - - bus.emit("agent_step_finished_event", { - stepIndex: 2, - actionName: "extract", - actionArgs: { instruction: "extract fare cells" }, - reasoning: "Read economy and business fares from the results page.", - toolOutput: { - ok: true, - result: { economy: "$1,234", business: "$5,789" }, - }, - finishedAt: new Date().toISOString(), - }); - bus.emit("agent_screenshot_taken_event", { - stepIndex: 2, - screenshot: Buffer.from("fake-png-bytes-2"), - url: "https://united.com/results", - }); - bus.emit("agent_step_observed_event", { - stepIndex: 2, - url: "https://united.com/results", - ariaTree: - "[0-1] RootWebArea: United Search Results\n [0-3] heading: Flight 1234\n [0-4] StaticText: Economy $1,234\n [0-5] StaticText: Business $5,789", - }); - - bus.emit("agent_final_answer_event", { - message: "Economy $1,234 vs business $5,789 — delta $4,555.", - }); - - const trajectory = await recorder.finish({ - status: "complete", - usage: { input_tokens: 1234, output_tokens: 567 }, - }); - - // ── Assertions ────────────────────────────────────────────────────────── - assert.equal(trajectory.steps.length, 3, "expected 3 steps"); - assert.equal(trajectory.steps[0].actionName, "goto"); - assert.equal(trajectory.steps[1].actionName, "act"); - assert.equal(trajectory.steps[2].actionName, "extract"); - assert.ok( - trajectory.steps[0].agentEvidence.modalities.some( - (m) => m.type === "image", - ), - "CUA-style screenshot event should populate tier-1 image evidence", - ); - assert.ok( - trajectory.steps[2].agentEvidence.modalities.some( - (m) => - m.type === "json" && - typeof m.content === "object" && - m.content !== null && - "economy" in (m.content as Record), - ), - "extract step should carry a json modality with economy field", - ); - assert.equal( - trajectory.finalAnswer, - "Economy $1,234 vs business $5,789 — delta $4,555.", - ); - assert.equal(trajectory.status, "complete"); - assert.equal(trajectory.usage.input_tokens, 1234); - // a11y dump on step 2 should round-trip through the recorder into - // probeEvidence.ariaTree. - assert.ok( - trajectory.steps[2].probeEvidence.ariaTree?.includes("Economy $1,234"), - "step_observed.ariaTree should populate probeEvidence.ariaTree", - ); - console.log(" ✓ in-memory Trajectory shape (incl. ariaTree round-trip)"); - - // ── On-disk layout ────────────────────────────────────────────────────── - const taskDir = path.join(tmpRoot, "smoke-run", "smoke-united_13"); - const files = (await fs.readdir(taskDir)).sort(); - assert.deepEqual( - files, - [ - "core.log", - "scores", - "screenshots", - "task_data.json", - "times.json", - "trajectory.json", - ], - `expected new trajectory layout, got ${files.join(", ")}`, - ); - const probeFiles = ( - await fs.readdir(path.join(taskDir, "screenshots", "probe")) - ).sort(); - assert.deepEqual( - probeFiles, - ["1.png", "2.png", "3.png"], - `expected probe screenshots, got ${probeFiles.join(", ")}`, - ); - const screenshotBytes = await fs.readFile( - path.join(taskDir, "screenshots", "probe", "1.png"), - ); - assert.equal(screenshotBytes.toString(), "fake-png-bytes-0"); - const coreLog = await fs.readFile(path.join(taskDir, "core.log"), "utf8"); - assert.ok(coreLog.includes('"action":"goto"')); - console.log(" ✓ on-disk layout has expected verifier files"); - - const persistedTask = JSON.parse( - await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), - ); - assert.equal(persistedTask.task.id, "smoke-united_13"); - assert.equal(persistedTask.status, "complete"); - - // ── V3Evaluator.verify() exercised live in verify-live-trajectory.ts ── - // Sanity-check that the V3Evaluator class still constructs from a minimal - // V3 shape (recorder doesn't depend on the evaluator for plumbing). - const _unused: typeof V3Evaluator = V3Evaluator; - void _unused; - console.log( - " ✓ V3Evaluator still constructs (verify() exercised live elsewhere)", - ); - - console.log("\n✅ Trajectory recorder plumbing OK"); - await fs.rm(tmpRoot, { recursive: true, force: true }); -} - -main().catch((err) => { - console.error("\n❌ Trajectory recorder plumbing FAILED:", err); - process.exit(1); -}); - -// Type guard for FakeV3 lint suppression (the file uses `as unknown as V3`). -export type { FakeV3 }; diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts new file mode 100644 index 000000000..5c5268e66 --- /dev/null +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -0,0 +1,197 @@ +import { EventEmitter } from "node:events"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { afterEach, describe, expect, it } from "vitest"; +import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; + +import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js"; + +const tempDirs: string[] = []; + +afterEach(async () => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) await fs.rm(dir, { recursive: true, force: true }); + } +}); + +function makeTempDir(): Promise { + return fs + .mkdtemp(path.join(os.tmpdir(), "trajectory-recorder-")) + .then((dir) => { + tempDirs.push(dir); + return dir; + }); +} + +function makeV3(bus = new EventEmitter()): V3 { + return { bus } as unknown as V3; +} + +function makeTaskSpec(): TaskSpec { + return { + id: "recorder-task", + instruction: "Compare economy and business fares.", + initUrl: "https://example.com", + precomputedRubric: { + items: [ + { + criterion: "Report fare delta", + description: "Report the difference between two fares.", + maxPoints: 1, + }, + ], + }, + }; +} + +describe("TrajectoryRecorder", () => { + it("assembles trajectory evidence from bus events", async () => { + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + persist: false, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }); + bus.emit("agent_final_answer_event", { + message: "Business is $150 more than economy.", + }); + + const trajectory = await recorder.finish({ + status: "complete", + usage: { input_tokens: 10, output_tokens: 5 }, + }); + + expect(trajectory.steps).toHaveLength(1); + expect(trajectory.steps[0]).toMatchObject({ + index: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + probeEvidence: { + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }, + }); + expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.steps[0].agentEvidence.modalities).toEqual( + expect.arrayContaining([ + { type: "image", bytes: screenshot, mediaType: "image/png" }, + { type: "text", content: "Read visible fare cells." }, + { type: "json", content: { economy: "$100", business: "$250" } }, + ]), + ); + expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); + }); + + it("persists trajectory files and evaluator results", async () => { + const outputRoot = await makeTempDir(); + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + outputRoot, + runId: "run-1", + persist: true, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "act", + actionArgs: { instruction: "Search fares" }, + reasoning: "Search for fares.", + toolOutput: { ok: true, result: "done" }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + }); + + await recorder.finish({ status: "complete" }); + await recorder.persistResult({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + + const taskDir = path.join(outputRoot, "run-1", "recorder-task"); + await expect(fs.readdir(taskDir)).resolves.toEqual( + expect.arrayContaining([ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ]), + ); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "scores", "result.json"), "utf8"), + ).resolves.toContain('"outcomeSuccess": true'); + + const trajectory = JSON.parse( + await fs.readFile(path.join(taskDir, "trajectory.json"), "utf8"), + ); + expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe( + "screenshots/probe/1.png", + ); + expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({ + type: "image", + imagePath: "screenshots/agent/1.png", + mediaType: "image/png", + }); + + const taskData = JSON.parse( + await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), + ); + expect(taskData.result).toMatchObject({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + }); +}); From 635b3d2a98844d17115f0dcbe442054a36f85677 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:48:53 -0700 Subject: [PATCH 6/6] docs(verifier): trim trajectory event comments --- .../core/lib/v3/agent/AnthropicCUAClient.ts | 4 ---- .../core/lib/v3/types/public/busEvents.ts | 19 +++++-------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 54d64f15d..752d208e2 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -902,10 +902,6 @@ export class AnthropicCUAClient extends AgentClient { ...input, }; } else if (action === "triple_click" || action === "tripleClick") { - // Anthropic's computer_20250124 tool emits `triple_click` with - // `coordinate: [x, y]`. Without this branch the snake_case name + - // raw coordinate array fall through to the generic `else` and - // executeAction logs "Unknown action type: triple_click". return { type: "tripleClick", x: diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts index 62e992949..e2fa11949 100644 --- a/packages/core/lib/v3/types/public/busEvents.ts +++ b/packages/core/lib/v3/types/public/busEvents.ts @@ -5,16 +5,8 @@ * event name so consumers (TrajectoryRecorder in packages/evals, custom * subscribers) can type their handlers. * - * Wave 0 of the verifier rewrite plan introduces: - * - agent_screenshot_taken_event — independent post-step screenshot probe - * - agent_step_finished_event — fired per tool-call in a step result - * - agent_step_observed_event — fired after the harness probe completes - * - agent_final_answer_event — fired when the `done` tool resolves - * - * `agent_step_started_event` is documented in the plan but deferred — the AI - * SDK's `onStepFinish` is a post-hook, and there's no symmetric pre-hook per - * tool execution in v3AgentHandler today. Started-state can be derived from - * the finished event's stepIndex if needed. + * The verifier recorder consumes these events to assemble persisted + * trajectories without coupling to individual agent handlers. */ /** @@ -37,7 +29,7 @@ export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; * * Note: in CUA mode the same Buffer is also what the provider received; in * DOM/hybrid mode it's an independent harness probe. The verifier treats them - * as different evidence tiers regardless — see plan §04 ("Mode-by-mode sources"). + * as different evidence tiers regardless. */ export interface AgentScreenshotTakenEvent { /** Zero-based index of the step this screenshot corresponds to. */ @@ -63,7 +55,7 @@ export interface AgentScreenshotTakenEvent { * * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper - * — not in this payload. See plan §10 Q1 (resolved: onStepFinish). + * and is not part of this payload. */ export interface AgentStepFinishedEvent { stepIndex: number; @@ -86,8 +78,7 @@ export interface AgentStepFinishedEvent { /** * Payload for `agent_step_observed_event`. Emitted after the harness probe - * completes for a step (page URL captured at minimum; a11y tree and scroll - * info added in Wave 2). + * completes for a step. */ export interface AgentStepObservedEvent { stepIndex: number;