diff --git a/.changeset/verifier-trajectory-events.md b/.changeset/verifier-trajectory-events.md new file mode 100644 index 000000000..9dcb5c819 --- /dev/null +++ b/.changeset/verifier-trajectory-events.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Capture verifier trajectory evidence from v3 agent events for offline scoring. diff --git a/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts new file mode 100644 index 000000000..8e3fcc050 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts @@ -0,0 +1,75 @@ +/** + * captureAriaTreeProbe — capture a truncated accessibility tree of the active + * page for use as tier-2 evidence in the trajectory recorder. + * + * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the + * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay + * the cost. + * + * The a11y tree is the same payload the agent's `ariaTree` tool sees, but + * captured by the harness (not the agent) so the verifier has independent + * textual ground truth for grounding non-visual claims — prices, names, + * dates, list contents — without OCR'ing screenshots. + * + * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures + * across a ~30-step trajectory at that cap sum to ~240k tokens total, + * which the verifier handles via per-criterion top-K selection. The cap + * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can + * trade RAM/disk for fidelity. Truncated content is marked explicitly so + * the verifier knows it was clipped. + */ +import type { V3 } from "../../v3.js"; + +const APPROX_CHARS_PER_TOKEN = 4; +const DEFAULT_TOKEN_BUDGET = 8_000; +const DEFAULT_TIMEOUT_MS = 5_000; + +interface CaptureAriaTreeOptions { + /** Soft cap on token count (chars/4 approximation). Default 8000. */ + tokenBudget?: number; + /** Hard timeout on the capture. Default 5s. */ + timeoutMs?: number; +} + +/** + * Returns the truncated a11y tree as a plain string, or undefined when + * capture fails. Never throws — a11y capture is best-effort tier-2 evidence, + * not a hard requirement, so failures are silently absorbed (the verifier + * surfaces this via evidence_insufficient). + */ +export async function captureAriaTreeProbe( + v3: V3, + opts: CaptureAriaTreeOptions = {}, +): Promise { + const envBudget = parseInt( + process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "", + 10, + ); + const tokenBudget = + opts.tokenBudget ?? + (Number.isFinite(envBudget) && envBudget > 0 + ? envBudget + : DEFAULT_TOKEN_BUDGET); + const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; + const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN; + + try { + // v3.extract() without a schema returns { pageText } where pageText is the + // rendered accessibility tree — same path the agent's ariaTree tool uses. + const result = (await v3.extract({ timeout: timeoutMs })) as { + pageText?: string; + }; + const pageText = result?.pageText; + if (typeof pageText !== "string" || pageText.length === 0) return undefined; + + if (pageText.length > maxChars) { + return ( + pageText.slice(0, maxChars) + + `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]` + ); + } + return pageText; + } catch { + return undefined; + } +} diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts index cff08c8a2..afddddef2 100644 --- a/packages/core/lib/v3/handlers/v3AgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3AgentHandler.ts @@ -41,6 +41,7 @@ import { AgentAbortError, } from "../types/public/sdkErrors.js"; import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { CaptchaSolver, CAPTCHA_SOLVED_MSG, @@ -248,6 +249,10 @@ export class V3AgentHandler { | GenerateTextOnStepFinishCallback | StreamTextOnStepFinishCallback, ) { + // Monotonic step counter scoped to this execute() call. Each tool call in + // the agent loop becomes one trajectory step. The counter feeds stepIndex + // on the bus events the TrajectoryRecorder subscribes to. + let stepCounter = 0; return async (event: StepResult) => { this.logger({ category: "agent", @@ -255,6 +260,11 @@ export class V3AgentHandler { level: 2, }); + const stepIndicesInTurn: number[] = []; + let lastFinalAnswer: + | { message: string; output?: Record } + | undefined; + if (event.toolCalls && event.toolCalls.length > 0) { for (let i = 0; i < event.toolCalls.length; i++) { const toolCall = event.toolCalls[i]; @@ -279,6 +289,13 @@ export class V3AgentHandler { ? `${allReasoning} ${doneReasoning}`.trim() : allReasoning || "Task completed successfully"; } + lastFinalAnswer = { + message: state.finalMessage, + output: + typeof args?.output === "object" && args?.output !== null + ? (args.output as Record) + : undefined, + }; } const mappedActions = mapToolResultToActions({ toolCallName: toolCall.toolName, @@ -292,8 +309,100 @@ export class V3AgentHandler { action.timestamp = Date.now(); state.actions.push(action); } + + // Emit step_finished_event per tool call. The TrajectoryRecorder + // builds one Trajectory.Step per emission. tier-1 evidence (the + // bytes the LLM consumed) is captured separately via an + // onStepFinish wrapper in the harness. + const stepIndex = stepCounter++; + stepIndicesInTurn.push(stepIndex); + const toolOk = + !toolResult || + (typeof toolResult === "object" && + !("error" in toolResult) && + !("isError" in toolResult && toolResult.isError)); + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: toolCall.toolName, + actionArgs: + typeof args === "object" && args !== null + ? (args as Record) + : {}, + reasoning: event.text ?? "", + toolOutput: { + ok: toolOk, + result: toolResult, + error: + toolResult && + typeof toolResult === "object" && + "error" in toolResult && + typeof (toolResult as { error?: unknown }).error === "string" + ? (toolResult as { error: string }).error + : undefined, + }, + finishedAt: new Date().toISOString(), + }); } state.currentPageUrl = (await this.v3.context.awaitActivePage()).url(); + + // Harness probe — take a single screenshot / a11y snapshot per AI SDK + // step and attach it to every tool call in that turn. The observation + // reflects the settled page state after the batch of tool calls; this + // is more faithful than dropping probe evidence for all but the last + // tool call, while still avoiding per-tool screenshot overhead. + const wantsScreenshotProbe = + this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount("agent_step_observed_event") > 0; + if ( + stepIndicesInTurn.length > 0 && + (wantsScreenshotProbe || wantsStepObservation) + ) { + try { + const page = await this.v3.context.awaitActivePage(); + let screenshot: Buffer | undefined; + if (wantsScreenshotProbe) { + screenshot = await page.screenshot({ fullPage: false }); + } + let ariaTree: string | undefined; + if (wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier + // can ground textual claims (prices, names, dates) without OCR. + // Best-effort: returns undefined on failure/timeout. + ariaTree = await captureAriaTreeProbe(this.v3); + } + for (const stepIndex of stepIndicesInTurn) { + if (screenshot) { + // DOM/hybrid: this post-step screenshot is a harness probe + // only. The agent's tier-1 evidence is the tool's return value + // captured separately in agent_step_finished_event. + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot, + url: state.currentPageUrl, + evidenceRole: "probe", + }); + } + if (wantsStepObservation) { + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: state.currentPageUrl, + ariaTree, + }); + } + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: harness probe failed: ${getErrorMessage(e)}`, + level: 1, + }); + } + } + } + + if (lastFinalAnswer) { + this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer); } if (userCallback) { diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index af3a3dad8..2fd08b864 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -7,6 +7,7 @@ import { GoogleCUAClient } from "../agent/GoogleCUAClient.js"; import { OpenAICUAClient } from "../agent/OpenAICUAClient.js"; import { mapKeyToPlaywright } from "../agent/utils/cuaKeyMapping.js"; import { ensureXPath } from "../agent/utils/xpath.js"; +import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js"; import { ActionExecutionResult, AgentAction, @@ -16,6 +17,7 @@ import { SafetyConfirmationHandler, } from "../types/public/agent.js"; import { LogLine } from "../types/public/logs.js"; +import type { AgentScreenshotTakenEvent } from "../types/public/busEvents.js"; import { type Action, V3FunctionName } from "../types/public/methods.js"; import { FlowLogger } from "../flowlogger/FlowLogger.js"; import { toTitleCase } from "../../utils.js"; @@ -37,6 +39,13 @@ export class V3CuaAgentHandler { private captchaSolver: CaptchaSolver | null = null; private captchaClickGuardRemaining = 0; private currentInstruction = ""; + // Monotonic step counter used by bus events. The CUA loop is internal to + // the agent client, so unlike v3AgentHandler we don't have per-tool-call + // step events; instead we tag every screenshot emission with an + // incrementing index. + private cuaStepCounter = 0; + private latestCuaScreenshot?: AgentScreenshotTakenEvent; + private latestCuaScreenshotConsumed = true; constructor( v3: V3, @@ -76,6 +85,17 @@ export class V3CuaAgentHandler { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); + + // Emit bus event so TrajectoryRecorder can capture the screenshot. In + // CUA mode this is the same buffer the provider receives — i.e., it + // serves both as tier-1 evidence (what the model saw) and as a tier-2 + // probe. + try { + this.emitCuaScreenshot(screenshotBuffer, page.url()); + } catch { + // bus emit errors are non-fatal + } + return screenshotBuffer.toString("base64"); // base64 png }); @@ -120,6 +140,7 @@ export class V3CuaAgentHandler { (this.options.clientOptions?.waitBetweenActions as number) || defaultDelay; try { + let executionResult: ActionExecutionResult | undefined; // Try to inject cursor before each action if enabled if (this.highlightCursor) { try { @@ -133,7 +154,7 @@ export class V3CuaAgentHandler { // takes its own screenshot via screenshotProvider between API turns. const shouldLog = action.type !== "screenshot"; if (shouldLog) { - await FlowLogger.runWithLogging( + executionResult = await FlowLogger.runWithLogging( { eventType: `V3Cua${toTitleCase(action.type)}`, // e.g. "V3CuaClick" data: { @@ -145,10 +166,13 @@ export class V3CuaAgentHandler { [action], ); } else { - await this.executeAction(action); + executionResult = await this.executeAction(action); } action.timestamp = Date.now(); + if (shouldLog) { + await this.emitCuaActionStep(action, executionResult); + } await new Promise((r) => setTimeout(r, waitBetween)); } catch (error) { @@ -658,6 +682,15 @@ export class V3CuaAgentHandler { const screenshotBuffer = await page.screenshot({ fullPage: false }); const currentUrl = page.url(); + + // Mirror the screenshot to the bus — same buffer the CUA client + // received, so it serves as both tier-1 evidence and tier-2 probe. + try { + this.emitCuaScreenshot(screenshotBuffer, currentUrl); + } catch { + // non-fatal + } + return await this.agentClient.captureScreenshot({ base64Image: screenshotBuffer.toString("base64"), currentUrl, @@ -767,6 +800,129 @@ export class V3CuaAgentHandler { } } + /** + * Emit a pre-action CUA screenshot — the exact buffer the model received + * as input. Tier-1 evidence (agent-mirrored); the tier-2 probe is taken + * separately in emitCuaActionStep after the action runs, so the recorder + * can compare what the model saw against what the page actually showed + * once the keystrokes/clicks landed. + */ + private emitCuaScreenshot( + screenshot: Buffer, + url: string, + ): AgentScreenshotTakenEvent { + const event: AgentScreenshotTakenEvent = { + stepIndex: this.cuaStepCounter++, + screenshot, + url, + evidenceRole: "agent", + }; + this.latestCuaScreenshot = event; + this.latestCuaScreenshotConsumed = false; + this.v3.bus.emit("agent_screenshot_taken_event", event); + return event; + } + + private async emitCuaActionStep( + action: AgentAction, + result: ActionExecutionResult | undefined, + ): Promise { + let pageUrl = + typeof action.pageUrl === "string" + ? action.pageUrl + : this.latestCuaScreenshot?.url; + try { + pageUrl = (await this.v3.context.awaitActivePage()).url(); + } catch { + // Keep the best pre-action URL fallback. + } + let stepIndex: number; + + if (this.latestCuaScreenshot && !this.latestCuaScreenshotConsumed) { + stepIndex = this.latestCuaScreenshot.stepIndex; + this.latestCuaScreenshotConsumed = true; + } else if (this.latestCuaScreenshot) { + stepIndex = this.cuaStepCounter++; + this.v3.bus.emit("agent_screenshot_taken_event", { + ...this.latestCuaScreenshot, + stepIndex, + }); + } else { + stepIndex = this.cuaStepCounter++; + } + + const actionArgs = Object.fromEntries( + Object.entries(action).filter(([key]) => key !== "screenshot"), + ); + const reasoning = + typeof action.reasoning === "string" + ? action.reasoning + : typeof action.action === "string" + ? action.action + : ""; + + this.v3.bus.emit("agent_step_finished_event", { + stepIndex, + actionName: String(action.type), + actionArgs, + reasoning, + toolOutput: { + ok: result?.success !== false, + result: result ?? { success: true }, + error: result?.error, + }, + finishedAt: new Date().toISOString(), + }); + + // Post-action tier-2 probe. The pre-action screenshot from + // screenshotProvider is what the model SAW; this one shows what the + // page actually LOOKS LIKE after the action ran. Without this the + // verifier has no visual evidence that keystrokes/clicks landed, and + // has to trust the action history alone. + // + // Listener-gated to keep ordinary agent runs free of the extra + // screenshot cost — mirrors v3AgentHandler's post-step probe. + const wantsScreenshotProbe = + this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0; + const wantsStepObservation = + this.v3.bus.listenerCount("agent_step_observed_event") > 0; + let probeUrl = pageUrl; + if (wantsScreenshotProbe || wantsStepObservation) { + try { + const page = await this.v3.context.awaitActivePage(); + probeUrl = page.url(); + if (wantsScreenshotProbe) { + const probeScreenshot = await page.screenshot({ fullPage: false }); + this.v3.bus.emit("agent_screenshot_taken_event", { + stepIndex, + screenshot: probeScreenshot, + url: probeUrl, + evidenceRole: "probe", + }); + } + } catch (e) { + this.logger({ + category: "agent", + message: `Warning: CUA post-action probe failed: ${ + e instanceof Error ? e.message : String(e) + }`, + level: 1, + }); + } + } + + if (probeUrl && wantsStepObservation) { + // Capture the a11y tree alongside the URL probe so the verifier can + // ground textual claims without OCR. Best-effort. + const ariaTree = await captureAriaTreeProbe(this.v3); + this.v3.bus.emit("agent_step_observed_event", { + stepIndex, + url: probeUrl, + ariaTree, + }); + } + } + private async injectCursor(): Promise { try { const page = await this.v3.context.awaitActivePage(); diff --git a/packages/core/lib/v3/types/public/busEvents.ts b/packages/core/lib/v3/types/public/busEvents.ts new file mode 100644 index 000000000..e2fa11949 --- /dev/null +++ b/packages/core/lib/v3/types/public/busEvents.ts @@ -0,0 +1,99 @@ +/** + * Bus event payloads emitted by V3 on `v3.bus`. + * + * The bus is an EventEmitter; these types document the payload shape per + * event name so consumers (TrajectoryRecorder in packages/evals, custom + * subscribers) can type their handlers. + * + * The verifier recorder consumes these events to assemble persisted + * trajectories without coupling to individual agent handlers. + */ + +/** + * Names of bus events the agent handlers emit. Use these constants to + * subscribe; the bus accepts arbitrary strings, but a centralized list helps + * catch typos at the call site. + */ +export const BUS_EVENTS = { + AGENT_SCREENSHOT_TAKEN: "agent_screenshot_taken_event", + AGENT_STEP_FINISHED: "agent_step_finished_event", + AGENT_STEP_OBSERVED: "agent_step_observed_event", + AGENT_FINAL_ANSWER: "agent_final_answer_event", +} as const; + +export type BusEventName = (typeof BUS_EVENTS)[keyof typeof BUS_EVENTS]; + +/** + * Payload for `agent_screenshot_taken_event`. The raw screenshot Buffer the + * harness took after a step's tool execution. + * + * Note: in CUA mode the same Buffer is also what the provider received; in + * DOM/hybrid mode it's an independent harness probe. The verifier treats them + * as different evidence tiers regardless. + */ +export interface AgentScreenshotTakenEvent { + /** Zero-based index of the step this screenshot corresponds to. */ + stepIndex: number; + /** PNG bytes from page.screenshot(). */ + screenshot: Buffer; + /** Page URL at the time of capture. */ + url: string; + /** + * Evidence role for this screenshot. + * + * DOM/hybrid post-tool screenshots are probe-only. CUA screenshots are also + * the exact image bytes sent to the provider, so they serve both as tier-1 + * agent evidence and tier-2 probe evidence. + */ + evidenceRole?: "probe" | "agent" | "agent_and_probe"; +} + +/** + * Payload for `agent_step_finished_event`. Emitted once per tool call within + * a step result. Carries the tool's reported outcome and a reference to the + * agent's textual reasoning for the step. + * + * Tier 1 evidence (the bytes the LLM consumed as the tool result) is captured + * separately by the harness via an AgentExecuteCallbacks.onStepFinish wrapper + * and is not part of this payload. + */ +export interface AgentStepFinishedEvent { + stepIndex: number; + /** Name of the tool that ran (e.g., "act", "extract", "click"). */ + actionName: string; + /** Arguments passed to the tool. */ + actionArgs: Record; + /** Agent's textual reasoning (event.text on the AI SDK StepResult). */ + reasoning: string; + /** Outcome of the tool execution as seen by the harness. */ + toolOutput: { + ok: boolean; + /** The tool's native return value. */ + result: unknown; + error?: string; + }; + /** ISO 8601 timestamp at which the step finished. */ + finishedAt: string; +} + +/** + * Payload for `agent_step_observed_event`. Emitted after the harness probe + * completes for a step. + */ +export interface AgentStepObservedEvent { + stepIndex: number; + /** Page URL after the step's tool execution. */ + url: string; + /** v1 — accessibility tree snapshot. */ + ariaTree?: string; + /** v1 — viewport scroll context. */ + scroll?: { top: number; pageHeight: number }; +} + +/** Payload for `agent_final_answer_event`. Emitted when the `done` tool resolves. */ +export interface AgentFinalAnswerEvent { + /** The agent's final summary message. */ + message: string; + /** Optional structured output if the agent's `output` schema was set. */ + output?: Record; +} diff --git a/packages/core/lib/v3/types/public/index.ts b/packages/core/lib/v3/types/public/index.ts index 9c5df08d0..9bf24eb27 100644 --- a/packages/core/lib/v3/types/public/index.ts +++ b/packages/core/lib/v3/types/public/index.ts @@ -1,4 +1,5 @@ export * from "./agent.js"; +export * from "./busEvents.js"; // Export api.ts under namespace to avoid conflicts with methods.ts types export * as Api from "./api.js"; // Also export BrowserbaseRegion directly for convenience diff --git a/packages/evals/framework/trajectoryRecorder.ts b/packages/evals/framework/trajectoryRecorder.ts new file mode 100644 index 000000000..8895a0844 --- /dev/null +++ b/packages/evals/framework/trajectoryRecorder.ts @@ -0,0 +1,506 @@ +/** + * TrajectoryRecorder — subscribes to v3.bus step events emitted by the agent + * handlers (v3AgentHandler / v3CuaAgentHandler) and assembles a Trajectory + * the verifier can consume. + * + * Lifecycle: + * const recorder = new TrajectoryRecorder({ v3, taskSpec }); + * recorder.start(); + * await agent.execute(...); + * const trajectory = await recorder.finish({ status: "complete", usage }); + * + * Persistence is env-gated by `VERIFIER_PERSIST_TRAJECTORIES`: + * - unset: persistence follows the default (on locally, off in CI). + * - "1" / "true": always persist. + * - "0" / "false": never persist. + * + * On-disk layout is stable JSON + screenshots so saved runs can be re-scored + * without format conversion. + */ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { + AgentEvidence, + AgentFinalAnswerEvent, + AgentScreenshotTakenEvent, + AgentStepFinishedEvent, + AgentStepObservedEvent, + ProbeEvidence, + TaskSpec, + Trajectory, + TrajectoryStatus, + TrajectoryStep, + TrajectoryUsage, + EvaluationResult, + V3, +} from "@browserbasehq/stagehand"; + +interface PartialStep { + index: number; + actionName: string; + actionArgs: Record; + reasoning: string; + agentEvidence: AgentEvidence; + probeEvidence: ProbeEvidence; + toolOutput: { ok: boolean; result: unknown; error?: string }; + finishedAt: string; +} + +export interface TrajectoryRecorderOptions { + v3: V3; + taskSpec: TaskSpec; + /** + * Root directory under which trajectory dirs are written. Each task run + * gets a subdirectory named by runId/task.id. + * Defaults to `/.trajectories`. + */ + outputRoot?: string; + /** Run identifier (e.g., ISO timestamp + env). Defaults to a fresh timestamp. */ + runId?: string; + /** + * Override the env-gated persistence default. `true` always persists, + * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES. + */ + persist?: boolean; +} + +export interface TrajectoryFinishOptions { + status: TrajectoryStatus; + finalAnswer?: string; + usage?: Partial; +} + +const ZERO_USAGE: TrajectoryUsage = { + input_tokens: 0, + output_tokens: 0, +}; + +/** + * Decide whether to persist by default. Honors the explicit override first, + * then env, then falls back to "persist when not in CI". + */ +function shouldPersist(override: boolean | undefined): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +export class TrajectoryRecorder { + private readonly v3: V3; + private readonly taskSpec: TaskSpec; + private readonly runId: string; + private readonly outputDir: string; + private readonly persistEnabled: boolean; + + // Per-stepIndex builders; events can arrive out-of-order in theory, though + // the handlers emit step_finished → screenshot_taken → step_observed in the + // same microtask. + private readonly partialSteps = new Map>(); + private readonly observationByStep = new Map< + number, + AgentStepObservedEvent + >(); + private readonly screenshotsByStep = new Map< + number, + AgentScreenshotTakenEvent + >(); + private finalAnswerEvent?: AgentFinalAnswerEvent; + private startedAt = ""; + private endedAt = ""; + private listenersAttached = false; + + // Strongly-typed bound handlers so we can attach/detach the same references. + private readonly onScreenshot = (e: AgentScreenshotTakenEvent) => { + this.screenshotsByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + + // Default to "probe" when the emit site doesn't tag the role — matches + // v3AgentHandler's post-step screenshot, which is always a tier-2 probe. + const role = e.evidenceRole ?? "probe"; + + // Probe channel (tier 2): the page's state at observation time. For CUA + // the pre-action screenshot is NOT a probe — that role is filled by the + // post-action emit from emitCuaActionStep. So only update probe.screenshot + // when the event explicitly carries the probe role. + if (role === "probe" || role === "agent_and_probe") { + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.screenshot = e.screenshot; + probe.url = e.url; + partial.probeEvidence = probe; + } else if (!partial.probeEvidence?.url) { + // Even for tier-1-only events, the URL is useful probe context if we + // don't have one yet. Doesn't overwrite a later post-action URL. + partial.probeEvidence = { + ...(partial.probeEvidence ?? {}), + url: e.url, + }; + } + + // Agent channel (tier 1): bytes the model ingested. + if (role === "agent" || role === "agent_and_probe") { + partial.agentEvidence = mergeAgentEvidence(partial.agentEvidence, { + modalities: [ + { type: "image", bytes: e.screenshot, mediaType: "image/png" }, + ], + }); + } + }; + private readonly onStepFinished = (e: AgentStepFinishedEvent) => { + const partial = this.ensurePartial(e.stepIndex); + partial.actionName = e.actionName; + partial.actionArgs = e.actionArgs; + partial.reasoning = e.reasoning; + partial.toolOutput = e.toolOutput; + partial.finishedAt = e.finishedAt; + partial.agentEvidence = mergeAgentEvidence( + partial.agentEvidence, + buildAgentEvidence(e), + ); + }; + private readonly onStepObserved = (e: AgentStepObservedEvent) => { + this.observationByStep.set(e.stepIndex, e); + const partial = this.ensurePartial(e.stepIndex); + const probe: ProbeEvidence = { ...(partial.probeEvidence ?? {}) }; + probe.url = e.url; + if (e.ariaTree !== undefined) probe.ariaTree = e.ariaTree; + if (e.scroll !== undefined) probe.scroll = e.scroll; + partial.probeEvidence = probe; + }; + private readonly onFinalAnswer = (e: AgentFinalAnswerEvent) => { + this.finalAnswerEvent = e; + }; + + constructor(opts: TrajectoryRecorderOptions) { + this.v3 = opts.v3; + this.taskSpec = opts.taskSpec; + this.runId = + opts.runId ?? + new Date().toISOString().replace(/[:.]/g, "-").replace("T", "T"); + const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); + this.outputDir = path.join(root, this.runId, opts.taskSpec.id); + this.persistEnabled = shouldPersist(opts.persist); + } + + /** Subscribe to bus events. Call once before agent.execute(). */ + start(): void { + if (this.listenersAttached) return; + this.startedAt = new Date().toISOString(); + this.v3.bus.on("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.on("agent_step_finished_event", this.onStepFinished); + this.v3.bus.on("agent_step_observed_event", this.onStepObserved); + this.v3.bus.on("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = true; + } + + /** + * Detach listeners, assemble the Trajectory, and (if persistence is on) + * write the on-disk layout. Idempotent. + */ + async finish(opts: TrajectoryFinishOptions): Promise { + this.detach(); + this.endedAt = new Date().toISOString(); + + const steps = this.assembleSteps(); + const trajectory: Trajectory = { + task: this.taskSpec, + steps, + finalAnswer: opts.finalAnswer ?? this.finalAnswerEvent?.message, + status: opts.status, + usage: { ...ZERO_USAGE, ...(opts.usage ?? {}) }, + timing: { startedAt: this.startedAt, endedAt: this.endedAt }, + }; + + if (this.persistEnabled) { + await this.persist(trajectory); + } + + return trajectory; + } + + /** Throw away in-memory state without writing to disk. Used on early abort. */ + cancel(): void { + this.detach(); + this.partialSteps.clear(); + this.observationByStep.clear(); + this.screenshotsByStep.clear(); + this.finalAnswerEvent = undefined; + } + + /** Where the trajectory dir lives (whether or not it was persisted). */ + get directory(): string { + return this.outputDir; + } + + /** Whether this recorder wrote the trajectory directory on finish(). */ + get persisted(): boolean { + return this.persistEnabled; + } + + /** + * Persist evaluator result next to the trajectory. No-op when trajectory + * persistence is disabled. + */ + async persistResult( + result: EvaluationResult, + filename = "result.json", + ): Promise { + if (!this.persistEnabled) return; + + const scoresDir = path.join(this.outputDir, "scores"); + await fs.mkdir(scoresDir, { recursive: true }); + await fs.writeFile( + path.join(scoresDir, filename), + JSON.stringify(result, null, 2), + ); + + const taskDataPath = path.join(this.outputDir, "task_data.json"); + let taskData: Record = {}; + try { + taskData = JSON.parse(await fs.readFile(taskDataPath, "utf8")) as Record< + string, + unknown + >; + } catch { + taskData = { task: this.taskSpec }; + } + await fs.writeFile( + taskDataPath, + JSON.stringify({ ...taskData, result }, null, 2), + ); + } + + private detach(): void { + if (!this.listenersAttached) return; + this.v3.bus.off("agent_screenshot_taken_event", this.onScreenshot); + this.v3.bus.off("agent_step_finished_event", this.onStepFinished); + this.v3.bus.off("agent_step_observed_event", this.onStepObserved); + this.v3.bus.off("agent_final_answer_event", this.onFinalAnswer); + this.listenersAttached = false; + } + + private ensurePartial(stepIndex: number): Partial { + let p = this.partialSteps.get(stepIndex); + if (!p) { + p = { index: stepIndex }; + this.partialSteps.set(stepIndex, p); + } + return p; + } + + /** + * Materialize ordered TrajectoryStep[] from the accumulated partials. + * Steps that never received a step_finished event are skipped (they can + * appear for CUA where only screenshot events fire — those are recorded as + * orphan probe screenshots and elided here). + */ + private assembleSteps(): TrajectoryStep[] { + const out: TrajectoryStep[] = []; + const indices = [...this.partialSteps.keys()].sort((a, b) => a - b); + for (const i of indices) { + const p = this.partialSteps.get(i)!; + if ( + p.actionName === undefined || + p.toolOutput === undefined || + p.finishedAt === undefined + ) { + // Orphan screenshot-only entry (typically CUA). Skip — we record + // these by writing the screenshot to disk separately during persist(). + continue; + } + out.push({ + index: i, + actionName: p.actionName, + actionArgs: p.actionArgs ?? {}, + reasoning: p.reasoning ?? "", + agentEvidence: p.agentEvidence ?? { modalities: [] }, + probeEvidence: p.probeEvidence ?? {}, + toolOutput: p.toolOutput, + startedAt: this.startedAt, + finishedAt: p.finishedAt, + }); + } + return out; + } + + /** + * Write the trajectory directory layout. + * + * / + * ├── task_data.json + * ├── trajectory.json (screenshots referenced by path) + * ├── screenshots/ + * │ ├── probe/.png + * │ └── agent/.png + * └── times.json + */ + private async persist(trajectory: Trajectory): Promise { + await fs.mkdir(this.outputDir, { recursive: true }); + + // Walk steps and write screenshots; replace Buffer with path reference in + // the serialized trajectory. Both tiers externalize image bytes under + // screenshots/probe/.png — tier 2, what the harness observed + // screenshots/agent/.png — tier 1, what the model received + // The `_` suffix only appears when a step carries multiple images + // (rare; typically zero or one per step). Paths in JSON are relative to + // the trajectory dir so the directory is movable/copyable as a unit. + await fs.mkdir(path.join(this.outputDir, "screenshots", "probe"), { + recursive: true, + }); + await fs.mkdir(path.join(this.outputDir, "screenshots", "agent"), { + recursive: true, + }); + + const serializableSteps: unknown[] = []; + for (const step of trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile( + path.join(this.outputDir, relPath), + probe.screenshot, + ); + probe.screenshotPath = relPath; + delete probe.screenshot; + } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; + const modalities: unknown[] = []; + for (const m of step.agentEvidence.modalities) { + if (m.type !== "image") { + modalities.push(m); + continue; + } + const suffix = multipleImages ? `_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + await fs.writeFile(path.join(this.outputDir, relPath), m.bytes); + modalities.push({ + type: "image", + imagePath: relPath, + mediaType: m.mediaType, + }); + imageSeq += 1; + } + const agentEvidence = { modalities }; + serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); + } + + // Image modalities carry imagePath instead of raw bytes on disk, so this + // is no longer a strict Trajectory at the type level. Cast through + // unknown rather than widening the type contract. + const serialized = { + ...trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(this.outputDir, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + // task_data.json stores TaskSpec + (later) result. + await fs.writeFile( + path.join(this.outputDir, "task_data.json"), + JSON.stringify( + { + task: trajectory.task, + status: trajectory.status, + finalAnswer: trajectory.finalAnswer ?? null, + }, + null, + 2, + ), + ); + + await fs.writeFile( + path.join(this.outputDir, "times.json"), + JSON.stringify( + { + timing: trajectory.timing, + usage: trajectory.usage, + stepCount: trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(this.outputDir, "scores"), { recursive: true }); + await fs.writeFile( + path.join(this.outputDir, "core.log"), + coreLog(trajectory), + ); + } +} + +function mergeAgentEvidence( + ...parts: Array +): AgentEvidence { + return { + modalities: parts.flatMap((p) => p?.modalities ?? []), + }; +} + +/** + * Build a tier-1 AgentEvidence from a step_finished event. The handler's + * toolOutput.result is what the LLM consumed next turn (modulo SDK + * serialization). + */ +function buildAgentEvidence(e: AgentStepFinishedEvent): AgentEvidence { + const modalities: AgentEvidence["modalities"] = []; + if (e.reasoning) { + modalities.push({ type: "text", content: e.reasoning }); + } + const result = e.toolOutput.result; + if (result === undefined || result === null) { + return { modalities }; + } + if (typeof result === "string") { + modalities.push({ type: "text", content: result }); + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + // Tool results commonly include a screenshotBase64 field for vision tools. + const r = result as { screenshotBase64?: string } & Record; + if (typeof r.screenshotBase64 === "string") { + try { + modalities.push({ + type: "image", + bytes: Buffer.from(r.screenshotBase64, "base64"), + mediaType: "image/png", + }); + } catch { + // ignore + } + } + modalities.push({ type: "json", content: result }); + } + return { modalities }; +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/evals/tests/framework/trajectoryRecorder.test.ts b/packages/evals/tests/framework/trajectoryRecorder.test.ts new file mode 100644 index 000000000..5c5268e66 --- /dev/null +++ b/packages/evals/tests/framework/trajectoryRecorder.test.ts @@ -0,0 +1,197 @@ +import { EventEmitter } from "node:events"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { afterEach, describe, expect, it } from "vitest"; +import type { TaskSpec, V3 } from "@browserbasehq/stagehand"; + +import { TrajectoryRecorder } from "../../framework/trajectoryRecorder.js"; + +const tempDirs: string[] = []; + +afterEach(async () => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) await fs.rm(dir, { recursive: true, force: true }); + } +}); + +function makeTempDir(): Promise { + return fs + .mkdtemp(path.join(os.tmpdir(), "trajectory-recorder-")) + .then((dir) => { + tempDirs.push(dir); + return dir; + }); +} + +function makeV3(bus = new EventEmitter()): V3 { + return { bus } as unknown as V3; +} + +function makeTaskSpec(): TaskSpec { + return { + id: "recorder-task", + instruction: "Compare economy and business fares.", + initUrl: "https://example.com", + precomputedRubric: { + items: [ + { + criterion: "Report fare delta", + description: "Report the difference between two fares.", + maxPoints: 1, + }, + ], + }, + }; +} + +describe("TrajectoryRecorder", () => { + it("assembles trajectory evidence from bus events", async () => { + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + persist: false, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }); + bus.emit("agent_final_answer_event", { + message: "Business is $150 more than economy.", + }); + + const trajectory = await recorder.finish({ + status: "complete", + usage: { input_tokens: 10, output_tokens: 5 }, + }); + + expect(trajectory.steps).toHaveLength(1); + expect(trajectory.steps[0]).toMatchObject({ + index: 0, + actionName: "extract", + actionArgs: { instruction: "Read fares" }, + reasoning: "Read visible fare cells.", + toolOutput: { + ok: true, + result: { economy: "$100", business: "$250" }, + }, + probeEvidence: { + url: "https://example.com/search", + ariaTree: "RootWebArea\nStaticText: Economy $100", + }, + }); + expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(trajectory.steps[0].agentEvidence.modalities).toEqual( + expect.arrayContaining([ + { type: "image", bytes: screenshot, mediaType: "image/png" }, + { type: "text", content: "Read visible fare cells." }, + { type: "json", content: { economy: "$100", business: "$250" } }, + ]), + ); + expect(trajectory.finalAnswer).toBe("Business is $150 more than economy."); + }); + + it("persists trajectory files and evaluator results", async () => { + const outputRoot = await makeTempDir(); + const bus = new EventEmitter(); + const recorder = new TrajectoryRecorder({ + v3: makeV3(bus), + taskSpec: makeTaskSpec(), + outputRoot, + runId: "run-1", + persist: true, + }); + const screenshot = Buffer.from("screen-1"); + + recorder.start(); + bus.emit("agent_screenshot_taken_event", { + stepIndex: 0, + screenshot, + url: "https://example.com/search", + evidenceRole: "agent_and_probe", + }); + bus.emit("agent_step_finished_event", { + stepIndex: 0, + actionName: "act", + actionArgs: { instruction: "Search fares" }, + reasoning: "Search for fares.", + toolOutput: { ok: true, result: "done" }, + finishedAt: new Date(0).toISOString(), + }); + bus.emit("agent_step_observed_event", { + stepIndex: 0, + url: "https://example.com/search", + }); + + await recorder.finish({ status: "complete" }); + await recorder.persistResult({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + + const taskDir = path.join(outputRoot, "run-1", "recorder-task"); + await expect(fs.readdir(taskDir)).resolves.toEqual( + expect.arrayContaining([ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ]), + ); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "probe", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "screenshots", "agent", "1.png")), + ).resolves.toEqual(screenshot); + await expect( + fs.readFile(path.join(taskDir, "scores", "result.json"), "utf8"), + ).resolves.toContain('"outcomeSuccess": true'); + + const trajectory = JSON.parse( + await fs.readFile(path.join(taskDir, "trajectory.json"), "utf8"), + ); + expect(trajectory.steps[0].probeEvidence.screenshotPath).toBe( + "screenshots/probe/1.png", + ); + expect(trajectory.steps[0].agentEvidence.modalities).toContainEqual({ + type: "image", + imagePath: "screenshots/agent/1.png", + mediaType: "image/png", + }); + + const taskData = JSON.parse( + await fs.readFile(path.join(taskDir, "task_data.json"), "utf8"), + ); + expect(taskData.result).toMatchObject({ + outcomeSuccess: true, + explanation: "The task was completed.", + }); + }); +});