From 9535d8226741c6652d57ec0dda8f1d4c4efe6f50 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:55:51 -0700 Subject: [PATCH 1/5] feat(evals): add verifier harness adapters --- packages/evals/framework/claudeCodeRunner.ts | 127 ++++- packages/evals/framework/codexRunner.ts | 129 +++++- .../framework/harnesses/claudeCodeAdapter.ts | 225 +++++++++ .../evals/framework/harnesses/codexAdapter.ts | 223 +++++++++ .../framework/harnesses/persistTrajectory.ts | 185 ++++++++ .../framework/harnesses/trajectoryAdapter.ts | 208 +++++++++ .../evals/scripts/verify-harness-adapters.ts | 434 ++++++++++++++++++ 7 files changed, 1527 insertions(+), 4 deletions(-) create mode 100644 packages/evals/framework/harnesses/claudeCodeAdapter.ts create mode 100644 packages/evals/framework/harnesses/codexAdapter.ts create mode 100644 packages/evals/framework/harnesses/persistTrajectory.ts create mode 100644 packages/evals/framework/harnesses/trajectoryAdapter.ts create mode 100644 packages/evals/scripts/verify-harness-adapters.ts diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index 6ec620233..c306c4256 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -1,9 +1,12 @@ -import type { AvailableModel } from "@browserbasehq/stagehand"; +import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { TaskResult } from "./types.js"; import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; +import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js"; +import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js"; +import { verdictToSuccess } from "./verifierAdapter.js"; type ClaudeSdkMessage = Record; type ClaudeQuery = AsyncIterable; @@ 
-16,6 +19,25 @@ export type ClaudeAgentSdk = { }) => ClaudeQuery; }; +export interface ClaudeCodeVerifierConfig { + /** + * V3 instance used solely as the LLM-client carrier for V3Evaluator. The + * instance does NOT need to have had `init()` called — V3Evaluator.verify() + * uses only `v3.logger` to construct its LLMProvider. + */ + v3: V3; + /** TaskSpec to verify against. id + instruction + optional rubric/initUrl. */ + taskSpec: TaskSpec; + /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */ + dataset: string; + /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */ + successMode?: "outcome" | "process" | "both"; + /** Override trajectory persistence root. */ + trajectoryRoot?: string; + /** Override the run id (defaults to ISO timestamp). */ + runId?: string; +} + export interface ClaudeCodeRunnerInput { plan: ExternalHarnessTaskPlan; model: AvailableModel; @@ -23,6 +45,15 @@ export interface ClaudeCodeRunnerInput { toolAdapter?: PreparedClaudeCodeToolAdapter; signal?: AbortSignal; sdk?: ClaudeAgentSdk; + /** + * Optional verifier integration. When provided, the runner builds a + * Trajectory from the SDK message stream (via claudeCodeAdapter), runs + * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict + * into the returned TaskResult (`_success` follows `verifier.successMode`, then EVAL_SUCCESS_MODE, then "outcome"). + * When omitted, the runner falls back to parsing the legacy EVAL_RESULT + * line — preserves current behavior for callers that haven't migrated. The legacy result is also returned, after a logged warning, if verification throws. + */ + verifier?: ClaudeCodeVerifierConfig; } export interface ParsedClaudeCodeResult { @@ -124,7 +155,9 @@ export async function runClaudeCodeAgent({ toolAdapter, signal, sdk: injectedSdk, + verifier, }: ClaudeCodeRunnerInput): Promise { + const startedAt = new Date().toISOString(); const sdk = injectedSdk ?? 
(await loadClaudeAgentSdk()); const abortController = new AbortController(); if (signal) { @@ -220,8 +253,10 @@ export async function runClaudeCodeAgent({ parsed.summary ?? stopReason ?? (resultText || transcriptText || "Claude Code did not report success"); + const endedAt = new Date().toISOString(); + const tokenUsage = extractClaudeCodeTokenUsage(resultMessage); - return { + const baseResult: TaskResult = { _success: parsed.success, error: !parsed.success ? errorMessage : undefined, reasoning: parsed.summary, @@ -232,6 +267,94 @@ export async function runClaudeCodeAgent({ logs: logger.getLogs(), metrics: buildClaudeCodeMetrics(resultMessage), }; + + if (!verifier) { + return baseResult; + } + + // Build a Trajectory from the SDK message stream and run the rubric verifier. + try { + const trajectory = claudeCodeAdapter.fromHarnessResult( + { + messages, + finalAnswer: parsed.finalAnswer ?? resultText, + status: status === "completed" ? "complete" : "error", + usage: { + input_tokens: tokenUsage.inputTokens, + output_tokens: tokenUsage.outputTokens, + cached_input_tokens: tokenUsage.cacheReadInputTokens, + }, + timing: { startedAt, endedAt }, + }, + verifier.taskSpec, + ); + + const { V3Evaluator } = await import("@browserbasehq/stagehand"); + const { RubricCache } = await import("./rubricCache.js"); + const evaluator = new V3Evaluator(verifier.v3); + + // Hydrate rubric — use precomputed if present, otherwise cache-or-generate. 
+ let rubric = verifier.taskSpec.precomputedRubric; + if (!rubric) { + if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { + rubric = await evaluator.generateRubric(verifier.taskSpec); + } else { + const cache = new RubricCache({ dataset: verifier.dataset }); + rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator); + } + } + const hydratedSpec: TaskSpec = { + ...verifier.taskSpec, + precomputedRubric: rubric, + }; + + const verdict = await evaluator.verify(trajectory, hydratedSpec); + const successMode = + verifier.successMode ?? + ((process.env.EVAL_SUCCESS_MODE as + | "outcome" + | "process" + | "both" + | undefined) || + "outcome"); + const verifiedSuccess = verdictToSuccess(verdict, successMode); + + const { directory: trajectoryDir } = await persistAdapterTrajectory({ + trajectory, + taskSpec: hydratedSpec, + verdict, + outputRoot: verifier.trajectoryRoot, + runId: verifier.runId, + }); + + logger.log({ + category: "claude_code", + message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`, + level: 1, + }); + + return { + ...baseResult, + _success: verifiedSuccess, + error: verifiedSuccess ? undefined : (baseResult.error ?? 
errorMessage), + outcomeSuccess: verdict.outcomeSuccess, + processScore: verdict.processScore, + evidenceInsufficient: verdict.evidenceInsufficient, + criterionCount: rubric.items.length, + stepCount: trajectory.steps.length, + trajectoryDir, + }; + } catch (verifyError) { + logger.warn({ + category: "claude_code", + message: `verifier integration failed: ${stringifyError(verifyError)}`, + level: 0, + auxiliary: { + error: { value: stringifyError(verifyError), type: "string" }, + }, + }); + return baseResult; + } } function buildClaudeCodeMetrics( diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts index 4d2844efa..2c5695789 100644 --- a/packages/evals/framework/codexRunner.ts +++ b/packages/evals/framework/codexRunner.ts @@ -1,9 +1,12 @@ -import type { AvailableModel } from "@browserbasehq/stagehand"; +import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { TaskResult } from "./types.js"; import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js"; +import { codexAdapter } from "./harnesses/codexAdapter.js"; +import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js"; +import { verdictToSuccess } from "./verifierAdapter.js"; type MetricValue = { count: number; value: number }; type CodexEvent = Record; @@ -25,6 +28,25 @@ export type CodexSdk = { startThread: (options?: Record) => CodexThread; }; +export interface CodexVerifierConfig { + /** + * V3 instance used solely as the LLM-client carrier for V3Evaluator. The + * instance does NOT need to have had `init()` called — V3Evaluator.verify() + * uses only `v3.logger` to construct its LLMProvider. + */ + v3: V3; + /** TaskSpec to verify against. id + instruction + optional rubric/initUrl. 
*/ + taskSpec: TaskSpec; + /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */ + dataset: string; + /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */ + successMode?: "outcome" | "process" | "both"; + /** Override trajectory persistence root. */ + trajectoryRoot?: string; + /** Override the run id (defaults to ISO timestamp). */ + runId?: string; +} + export interface CodexRunnerInput { plan: ExternalHarnessTaskPlan; model: AvailableModel; @@ -32,6 +54,15 @@ export interface CodexRunnerInput { toolAdapter?: PreparedCodexToolAdapter; signal?: AbortSignal; sdk?: CodexSdk; + /** + * Optional verifier integration. When provided, the runner builds a + * Trajectory from the codex event stream (via codexAdapter), runs + * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict + * into the returned TaskResult (`_success` follows `verifier.successMode`, then EVAL_SUCCESS_MODE, then "outcome"). + * When omitted, the runner falls back to parsing the legacy JSON result — + * preserves current behavior for callers that haven't migrated. The legacy result is also returned, after a logged warning, if verification throws. + */ + verifier?: CodexVerifierConfig; } export interface ParsedCodexResult { @@ -114,7 +145,9 @@ export async function runCodexAgent({ toolAdapter, signal, sdk: injectedSdk, + verifier, }: CodexRunnerInput): Promise { + const startedAt = new Date().toISOString(); const sdk = injectedSdk ?? (await loadCodexSdk(toolAdapter?.env)); const prompt = buildCodexPrompt(plan, toolAdapter?.promptInstructions); const events: CodexEvent[] = []; @@ -191,8 +224,9 @@ export async function runCodexAgent({ finalResponse || transcriptText || "Codex did not report success"); + const endedAt = new Date().toISOString(); - return { + const baseResult: TaskResult = { _success: parsed.success, error: !parsed.success ? 
errorMessage : undefined, reasoning: parsed.summary, @@ -203,6 +237,97 @@ export async function runCodexAgent({ logs: logger.getLogs(), metrics: buildCodexMetrics(usage), }; + + if (!verifier) { + return baseResult; + } + + try { + const trajectory = codexAdapter.fromHarnessResult( + { + events, + finalAnswer: parsed.finalAnswer ?? finalResponse, + status: status === "completed" ? "complete" : "error", + usage: { + input_tokens: toFiniteNumber(usage?.input_tokens), + output_tokens: toFiniteNumber(usage?.output_tokens), + ...(usage?.reasoning_output_tokens !== undefined && { + reasoning_tokens: toFiniteNumber(usage.reasoning_output_tokens), + }), + ...(usage?.cached_input_tokens !== undefined && { + cached_input_tokens: toFiniteNumber(usage.cached_input_tokens), + }), + }, + timing: { startedAt, endedAt }, + }, + verifier.taskSpec, + ); + + const { V3Evaluator } = await import("@browserbasehq/stagehand"); + const { RubricCache } = await import("./rubricCache.js"); + const evaluator = new V3Evaluator(verifier.v3); + + let rubric = verifier.taskSpec.precomputedRubric; + if (!rubric) { + if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { + rubric = await evaluator.generateRubric(verifier.taskSpec); + } else { + const cache = new RubricCache({ dataset: verifier.dataset }); + rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator); + } + } + const hydratedSpec: TaskSpec = { + ...verifier.taskSpec, + precomputedRubric: rubric, + }; + + const verdict = await evaluator.verify(trajectory, hydratedSpec); + const successMode = + verifier.successMode ?? 
+ ((process.env.EVAL_SUCCESS_MODE as + | "outcome" + | "process" + | "both" + | undefined) || + "outcome"); + const verifiedSuccess = verdictToSuccess(verdict, successMode); + + const { directory: trajectoryDir } = await persistAdapterTrajectory({ + trajectory, + taskSpec: hydratedSpec, + verdict, + outputRoot: verifier.trajectoryRoot, + runId: verifier.runId, + }); + + logger.log({ + category: "codex", + message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`, + level: 1, + }); + + return { + ...baseResult, + _success: verifiedSuccess, + error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage), + outcomeSuccess: verdict.outcomeSuccess, + processScore: verdict.processScore, + evidenceInsufficient: verdict.evidenceInsufficient, + criterionCount: rubric.items.length, + stepCount: trajectory.steps.length, + trajectoryDir, + }; + } catch (verifyError) { + logger.warn({ + category: "codex", + message: `verifier integration failed: ${stringifyError(verifyError)}`, + level: 0, + auxiliary: { + error: { value: stringifyError(verifyError), type: "string" }, + }, + }); + return baseResult; + } } function tryParseCodexJson( diff --git a/packages/evals/framework/harnesses/claudeCodeAdapter.ts b/packages/evals/framework/harnesses/claudeCodeAdapter.ts new file mode 100644 index 000000000..fd680895b --- /dev/null +++ b/packages/evals/framework/harnesses/claudeCodeAdapter.ts @@ -0,0 +1,225 @@ +/** + * claudeCodeAdapter — converts a Claude Code SDK run into a `Trajectory` the + * verifier can consume. + * + * Input shape: the SDK emits a stream of `ClaudeSdkMessage` objects of + * different `type`s — assistant (model output, may contain tool_use blocks), + * user (tool_result blocks for prior tool_use calls), and result (final + * outcome with cost/usage/turn counts). We accumulate the stream upstream in + * `runClaudeCodeAgent` and hand the full list here. 
+ * + * Mapping: + * - Each `tool_use` block in an assistant message becomes one normalized + * tool call, paired with its matching `tool_result` from a subsequent + * user message (by `tool_use_id`). + * - Assistant `text` blocks that precede a tool_use are folded into that + * tool call's `reasoning`. Trailing text after the last tool call (and + * the final result message's `result` string when present) becomes the + * `finalAnswer`. + * - The result message's usage carries forward as the trajectory usage. + * + * Failure modes: + * - max_turns / sdk_error → status = "error", but we still emit whatever + * steps we have. The verifier flags evidence_insufficient on criteria it + * can't ground. + */ +import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; +import { + buildTrajectory, + type NormalizedToolCall, + type TrajectoryAdapter, +} from "./trajectoryAdapter.js"; + +/** Subset of the harness result we need to build a trajectory. */ +export interface ClaudeCodeRunResult { + /** Raw SDK message stream collected during execution, in arrival order. */ + messages: Array>; + /** Final assistant message captured separately (optional — falls back to messages). */ + finalAnswer?: string; + /** Trajectory-level status. Defaults to "complete". */ + status?: Trajectory["status"]; + /** Optional usage to fold into Trajectory.usage. */ + usage?: Partial; + /** Optional run start/end timing. Adapter fills with now-now otherwise. */ + timing?: Partial; +} + +interface ToolUseBlock { + /** tool_use_id used to match against tool_result blocks. */ + id: string; + name: string; + input: Record; + /** Assistant text accumulated before this tool call (becomes `reasoning`). */ + reasoningPrefix: string; +} + +interface ToolResultBlock { + toolUseId: string; + /** Concatenated text content of the result. */ + text: string; + /** Original structured content when not flattened to text. 
*/ + raw?: unknown; + isError: boolean; +} + +export class ClaudeCodeTrajectoryAdapter + implements TrajectoryAdapter +{ + fromHarnessResult( + result: ClaudeCodeRunResult, + taskSpec: TaskSpec, + ): Trajectory { + const toolUses: ToolUseBlock[] = []; + const toolResults = new Map(); + const trailingTextParts: string[] = []; + let resultMessageText: string | undefined; + + let pendingReasoning = ""; + + for (const message of result.messages) { + const type = String((message as Record).type ?? ""); + const inner = (message as Record).message; + if (type === "result") { + const r = (message as Record).result; + if (typeof r === "string" && r.trim()) { + resultMessageText = r; + } + continue; + } + if (!isRecord(inner)) continue; + const content = inner.content; + if (!Array.isArray(content)) { + if (typeof content === "string" && type === "assistant") { + pendingReasoning = appendText(pendingReasoning, content); + trailingTextParts.push(content); + } + continue; + } + + if (type === "assistant") { + for (const block of content) { + if (!isRecord(block)) continue; + const blockType = String(block.type ?? ""); + if (blockType === "text" && typeof block.text === "string") { + pendingReasoning = appendText(pendingReasoning, block.text); + trailingTextParts.push(block.text); + continue; + } + if (blockType === "tool_use") { + const id = typeof block.id === "string" ? block.id : ""; + const name = typeof block.name === "string" ? block.name : "tool"; + const input = isRecord(block.input) + ? (block.input as Record) + : {}; + toolUses.push({ + id, + name, + input, + reasoningPrefix: pendingReasoning, + }); + // Once a tool_use lands, the buffered text belonged to its reasoning; + // future tool calls start with empty reasoning unless more text arrives. + pendingReasoning = ""; + // The text we just folded into reasoning is not the final answer. + // Drop it from trailingTextParts. 
+ trailingTextParts.length = 0; + } + } + continue; + } + + if (type === "user") { + for (const block of content) { + if (!isRecord(block)) continue; + const blockType = String(block.type ?? ""); + if (blockType !== "tool_result") continue; + const toolUseId = + typeof block.tool_use_id === "string" ? block.tool_use_id : ""; + const isError = block.is_error === true; + const { text, raw } = extractToolResultContent(block.content); + toolResults.set(toolUseId, { + toolUseId, + text, + raw, + isError, + }); + } + continue; + } + } + + const toolCalls: NormalizedToolCall[] = toolUses.map((use) => { + const matched = toolResults.get(use.id); + const ok = matched ? !matched.isError : true; + const resultPayload = + matched?.raw !== undefined ? matched.raw : (matched?.text ?? ""); + return { + name: use.name, + args: use.input, + result: resultPayload, + ok, + ...(matched?.isError && matched.text && { error: matched.text }), + reasoning: use.reasoningPrefix.trim() || undefined, + }; + }); + + const trailing = trailingTextParts.join("\n").trim(); + const finalAnswer = + result.finalAnswer ?? + resultMessageText ?? + (trailing.length > 0 ? trailing : undefined); + + return buildTrajectory({ + taskSpec, + toolCalls, + finalAnswer, + status: result.status ?? "complete", + usage: result.usage, + timing: result.timing, + }); + } +} + +export const claudeCodeAdapter = new ClaudeCodeTrajectoryAdapter(); + +function appendText(buffer: string, addition: string): string { + if (!addition) return buffer; + if (!buffer) return addition; + return `${buffer}\n${addition}`; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null; +} + +/** + * tool_result `content` can be: + * - a string (legacy) + * - an array of { type: "text", text } / { type: "image", source } blocks + * + * We flatten text blocks and preserve the original array (when structured) as + * `raw` so adapters that want the json modality can keep it. 
+ */ +function extractToolResultContent(content: unknown): { + text: string; + raw?: unknown; +} { + if (typeof content === "string") { + return { text: content }; + } + if (!Array.isArray(content)) { + return { text: "" }; + } + const parts: string[] = []; + for (const block of content) { + if (!isRecord(block)) continue; + if (block.type === "text" && typeof block.text === "string") { + parts.push(block.text); + } else if (block.type === "image") { + parts.push("[image]"); + } else if (typeof block.text === "string") { + parts.push(block.text); + } + } + return { text: parts.join("\n"), raw: content }; +} diff --git a/packages/evals/framework/harnesses/codexAdapter.ts b/packages/evals/framework/harnesses/codexAdapter.ts new file mode 100644 index 000000000..cd313dd72 --- /dev/null +++ b/packages/evals/framework/harnesses/codexAdapter.ts @@ -0,0 +1,223 @@ +/** + * codexAdapter — converts a Codex SDK run into a `Trajectory` the verifier + * can consume. + * + * Input shape: codex emits `ThreadEvent`s — `item.completed` carrying a + * `ThreadItem` (command_execution, file_change, mcp_tool_call, agent_message, + * reasoning, web_search, todo_list, error), plus `turn.completed` for usage. + * We accumulate the full event list upstream in `runCodexAgent` and hand it + * here. + * + * Mapping: + * - command_execution items → tool call named after the command's + * leading token (falling back to `command_execution`), args = { command, exit_code? }, result = aggregated_output, + * ok = exit_code === 0. + * - mcp_tool_call items → tool call named `${server}.${tool}`, args = + * arguments, result = structured_content (json modality) when present, + * else flattened content text. ok = status !== "failed". + * - reasoning items between item.completed events → folded into the next + * tool call's reasoning string. + * - agent_message items → the final answer (last wins). + * - error items → captured as a failed tool call so the verifier sees the + * pattern (a no-op `error` action with the message in toolOutput.error). 
+ * - file_change items → captured as a tool call named `file_change` with the + * change set in args (rare in browser eval contexts). + * - web_search items → captured as a tool call named `web_search` with the + * query in args. + * - todo_list items → not surfaced as tool calls (they aren't actions). + */ +import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; +import { + buildTrajectory, + type NormalizedToolCall, + type TrajectoryAdapter, +} from "./trajectoryAdapter.js"; + +export interface CodexRunResult { + /** All ThreadEvents collected from the SDK stream, in arrival order. */ + events: Array>; + /** Last `agent_message` text. Adapter falls back to scanning events otherwise. */ + finalAnswer?: string; + /** Trajectory-level status. Defaults to "complete". */ + status?: Trajectory["status"]; + /** Optional usage to fold into Trajectory.usage. */ + usage?: Partial; + /** Optional run start/end timing. Adapter fills with now-now otherwise. */ + timing?: Partial; +} + +export class CodexTrajectoryAdapter + implements TrajectoryAdapter +{ + fromHarnessResult(result: CodexRunResult, taskSpec: TaskSpec): Trajectory { + const toolCalls: NormalizedToolCall[] = []; + let pendingReasoning = ""; + let latestAgentMessage: string | undefined; + + for (const event of result.events) { + const type = String((event as Record).type ?? ""); + if (type !== "item.completed") continue; + const item = (event as Record).item; + if (!isRecord(item)) continue; + const itemType = String(item.type ?? ""); + + if (itemType === "reasoning" && typeof item.text === "string") { + pendingReasoning = pendingReasoning + ? `${pendingReasoning}\n${item.text}` + : item.text; + continue; + } + + if (itemType === "agent_message" && typeof item.text === "string") { + // Drop buffered reasoning that didn't precede a tool call. 
+ pendingReasoning = ""; + latestAgentMessage = item.text; + continue; + } + + const call = normalizeItem(itemType, item, pendingReasoning); + if (call) { + toolCalls.push(call); + pendingReasoning = ""; + } + } + + const finalAnswer = result.finalAnswer ?? latestAgentMessage; + + return buildTrajectory({ + taskSpec, + toolCalls, + finalAnswer, + status: result.status ?? "complete", + usage: result.usage, + timing: result.timing, + }); + } +} + +export const codexAdapter = new CodexTrajectoryAdapter(); + +function normalizeItem( + itemType: string, + item: Record, + reasoning: string, +): NormalizedToolCall | undefined { + if (itemType === "command_execution") { + const command = typeof item.command === "string" ? item.command : ""; + const exitCode = + typeof item.exit_code === "number" ? item.exit_code : undefined; + const status = String(item.status ?? ""); + const ok = exitCode !== undefined ? exitCode === 0 : status === "completed"; // a reported exit code is authoritative; status only decides when none was reported + const output = + typeof item.aggregated_output === "string" ? item.aggregated_output : ""; + // Use the leading token as the action name (`bash`, `browse`, etc.) when + // possible; falls back to `command_execution`. + const leading = command.split(/\s+/, 1)[0] || "command_execution"; + return { + name: leading, + args: { command, ...(exitCode !== undefined && { exit_code: exitCode }) }, + result: output, + ok, + ...(!ok && { + error: + exitCode !== undefined + ? `exit code ${exitCode}` + : `command status ${status}`, + }), + reasoning: reasoning || undefined, + }; + } + + if (itemType === "mcp_tool_call") { + const server = typeof item.server === "string" ? item.server : "mcp"; + const tool = typeof item.tool === "string" ? item.tool : "tool"; + const args = isRecord(item.arguments) + ? (item.arguments as Record) + : {}; + const status = String(item.status ?? ""); + const ok = status !== "failed"; + const mcpResult = isRecord(item.result) ? 
item.result : undefined; + const structured = mcpResult?.structured_content; + const content = mcpResult?.content; + const errorMessage = isRecord(item.error) + ? typeof item.error.message === "string" + ? item.error.message + : undefined + : undefined; + + // Prefer structured_content (json modality) when present, else flatten + // content blocks to text. Falls back to error message when failed. + let payload: unknown; + if (structured !== undefined && structured !== null) { + payload = structured; + } else if (Array.isArray(content)) { + const parts: string[] = []; + for (const block of content) { + if (!isRecord(block)) continue; + if (block.type === "text" && typeof block.text === "string") { + parts.push(block.text); + } else if (block.type === "image") { + parts.push("[image]"); + } else if (typeof block.text === "string") { + parts.push(block.text); + } + } + payload = parts.join("\n"); + } else if (!ok && errorMessage) { + payload = errorMessage; + } else { + payload = ""; + } + + return { + name: `${server}.${tool}`, + args, + result: payload, + ok, + ...(errorMessage && !ok && { error: errorMessage }), + reasoning: reasoning || undefined, + }; + } + + if (itemType === "web_search") { + const query = typeof item.query === "string" ? item.query : ""; + return { + name: "web_search", + args: { query }, + result: "", + ok: true, + reasoning: reasoning || undefined, + }; + } + + if (itemType === "file_change") { + const changes = Array.isArray(item.changes) ? item.changes : []; + const status = String(item.status ?? ""); + return { + name: "file_change", + args: { changes }, + result: { status, changes }, + ok: status === "completed", + reasoning: reasoning || undefined, + }; + } + + if (itemType === "error") { + const message = + typeof item.message === "string" ? 
item.message : "codex error item"; + return { + name: "error", + args: {}, + result: message, + ok: false, + error: message, + reasoning: reasoning || undefined, + }; + } + + return undefined; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null; +} diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts new file mode 100644 index 000000000..ffa3dd868 --- /dev/null +++ b/packages/evals/framework/harnesses/persistTrajectory.ts @@ -0,0 +1,185 @@ +/** + * persistAdapterTrajectory — writes the on-disk layout used by the Stagehand + * `TrajectoryRecorder.persist()` for trajectories built by external-harness + * adapters (claude_code, codex). + * + * `TrajectoryRecorder` itself is coupled to v3.bus events: it subscribes + * during the agent run, accumulates partial steps, and emits the final + * trajectory on finish(). External harnesses don't go through that bus — + * they produce a complete `Trajectory` synchronously after the harness + * finishes — so this helper writes the same on-disk layout without the + * event-subscription lifecycle. + * + * The on-disk layout matches microsoft/fara's example_trajectory/ so saved + * trajectories can be cross-validated against verify_trajectories.py without + * a format conversion step: + * + * {outputRoot}/{runId}/{taskSpec.id}/ + * ├── task_data.json + * ├── trajectory.json (image bytes as base64, screenshots referenced by path) + * ├── screenshot_{N}.png (N = step index + 1; only if probeEvidence carries Buffer screenshots — external + * │ harnesses don't natively, but the helper supports it) + * ├── scores/ + * │ └── mmrubric_v1.json (if `verdict` passed) + * ├── core.log + * └── times.json + * + * Honors `VERIFIER_PERSIST_TRAJECTORIES` for default on/off (matches + * TrajectoryRecorder's convention): + * - "1" / "true": always persist. + * - "0" / "false": never persist. + * - unset (or any other value): persists unless the `CI` env var is set. 
+ */ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { + ProbeEvidence, + TaskSpec, + Trajectory, + Verdict, +} from "@browserbasehq/stagehand"; + +export interface PersistAdapterTrajectoryOptions { + trajectory: Trajectory; + taskSpec: TaskSpec; + /** Verdict from V3Evaluator.verify(). Written to scores/mmrubric_v1.json and embedded in task_data.json. */ + verdict?: Verdict; + /** + * Output directory root. Final layout lives at `{outputRoot}/{runId}/{taskSpec.id}/`. + * Defaults to `{process.cwd()}/.trajectories`. + */ + outputRoot?: string; + /** Run identifier (e.g., ISO timestamp). Defaults to the current ISO timestamp with `:` and `.` replaced by `-`. */ + runId?: string; + /** + * Override the env-gated persistence default. `true` always persists, + * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES. + */ + persist?: boolean; +} + +export interface PersistAdapterTrajectoryResult { + /** The directory the trajectory was (or would have been) persisted to. */ + directory: string; + /** Whether persistence actually wrote files. */ + persisted: boolean; +} + +function shouldPersist(override: boolean | undefined): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +export async function persistAdapterTrajectory( + opts: PersistAdapterTrajectoryOptions, +): Promise { + const runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); + const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories"); + const directory = path.join(root, runId, opts.taskSpec.id); + const persisted = shouldPersist(opts.persist); + + if (!persisted) { + return { directory, persisted: false }; + } + + await fs.mkdir(directory, { recursive: true }); + + // Walk steps and (when a Buffer screenshot is present, which is rare for + // external harnesses) write it to disk + replace with a path reference. 
+ // Image modalities in agentEvidence get base64-encoded inline to keep + // trajectory.json human-readable JSON. + const serializableSteps: unknown[] = []; + for (const step of opts.trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const filename = `screenshot_${step.index + 1}.png`; + await fs.writeFile(path.join(directory, filename), probe.screenshot); + probe.screenshotPath = filename; + delete probe.screenshot; + } + const agentEvidence = { + modalities: step.agentEvidence.modalities.map((m) => + m.type === "image" + ? { + type: "image", + bytesBase64: m.bytes.toString("base64"), + mediaType: m.mediaType, + } + : m, + ), + }; + serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); + } + + const serialized = { + ...opts.trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(directory, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + const taskData: Record = { + task: opts.trajectory.task, + status: opts.trajectory.status, + finalAnswer: opts.trajectory.finalAnswer ?? 
null, + }; + if (opts.verdict) { + taskData.verdict = opts.verdict; + } + await fs.writeFile( + path.join(directory, "task_data.json"), + JSON.stringify(taskData, null, 2), + ); + + await fs.writeFile( + path.join(directory, "times.json"), + JSON.stringify( + { + timing: opts.trajectory.timing, + usage: opts.trajectory.usage, + stepCount: opts.trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(directory, "scores"), { recursive: true }); + if (opts.verdict) { + await fs.writeFile( + path.join(directory, "scores", "mmrubric_v1.json"), + JSON.stringify(opts.verdict, null, 2), + ); + } + + await fs.writeFile( + path.join(directory, "core.log"), + coreLog(opts.trajectory), + ); + + return { directory, persisted: true }; +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/evals/framework/harnesses/trajectoryAdapter.ts b/packages/evals/framework/harnesses/trajectoryAdapter.ts new file mode 100644 index 000000000..cb3afac65 --- /dev/null +++ b/packages/evals/framework/harnesses/trajectoryAdapter.ts @@ -0,0 +1,208 @@ +/** + * TrajectoryAdapter — converts an external harness's natural output (a + * provider-shaped event/message log) into the Stagehand `Trajectory` shape + * that V3Evaluator.verify() consumes. + * + * The verifier is harness-agnostic (Trajectory + TaskSpec → Verdict; pure + * function, no live browser). That property is what lets non-Stagehand + * harnesses — Claude Code, Codex — be scored with the same rubric pipeline + * we use for Stagehand. Each external harness ships its own + * `TrajectoryAdapter` that maps its tool-call/message log to + * a `Trajectory`. 
The verifier never knows which adapter produced it. + * + * @see ~/.claude/plans/verifier-rewrite-plan.html §07 "External harness adapters" + */ +import type { + AgentEvidence, + AgentEvidenceModality, + TaskSpec, + Trajectory, + TrajectoryStep, +} from "@browserbasehq/stagehand"; + +/** + * Adapter interface. Implementations are pure: no I/O, no live browser, no + * mutation of the input result. The same harness result should always produce + * the same Trajectory. + * + * Empty `probeEvidence` on every step is supported — the verifier degrades + * gracefully via the `evidence_insufficient` path (paper's uncontrollable- + * failure principle). Text-heavy tasks (extract, lookup, search) still get a + * meaningful outcome verdict; visual-grounding criteria get flagged as + * evidence_insufficient rather than silently miscredited. + */ +export interface TrajectoryAdapter { + /** + * Convert the external harness's natural output into a Trajectory. Must be + * deterministic given the input. + */ + fromHarnessResult(result: THarnessResult, taskSpec: TaskSpec): Trajectory; +} + +/** + * Normalized tool invocation. Adapters parse harness-specific event/message + * shapes into this canonical structure before mapping to `TrajectoryStep`. + * + * The fields are deliberately permissive — harnesses vary in what they + * surface, and we want a single mapping helper to handle all of them. + */ +export interface NormalizedToolCall { + /** Tool name (e.g., "Bash", "mcp__stagehand_browser__run", "container.exec"). */ + name: string; + /** Tool arguments. Empty object if the harness doesn't surface them. */ + args: Record; + /** + * Tool result. Strings become a text modality; objects become a json modality. + * `undefined` is allowed (e.g., when the tool failed before producing output). + */ + result: unknown; + /** True if the tool reported success. Adapters infer this from harness flags. */ + ok: boolean; + /** Free-form error string when `ok === false`. 
*/ + error?: string; + /** Optional reasoning text the assistant emitted before/with this tool call. */ + reasoning?: string; + /** Wall-clock when the call started. Falls back to call site's "now" if absent. */ + startedAt?: string; + /** Wall-clock when the call finished. Falls back to startedAt. */ + finishedAt?: string; +} + +/** + * Convert a `NormalizedToolCall` into a Trajectory `AgentEvidence`. Strings + * map to a single text modality; objects map to a json modality (plus a text + * modality with the stringified form so plain text-relevance prompts can + * grok structured output). Reasoning text becomes its own text modality — + * the verifier weights reasoning highly when grounding criteria without + * screenshots. + * + * `probeEvidence` is intentionally not produced here — external harnesses + * don't emit independent observations natively. See `actionToProbeEvidence` + * if a harness eventually grows that capability. + */ +export function actionToAgentEvidence( + call: Pick, +): AgentEvidence { + const modalities: AgentEvidenceModality[] = []; + + if (call.reasoning) { + modalities.push({ type: "text", content: call.reasoning }); + } + + const result = call.result; + if (result === undefined || result === null) { + return { modalities }; + } + + if (typeof result === "string") { + if (result.length > 0) { + modalities.push({ type: "text", content: result }); + } + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + // Provide both a JSON modality (preserved structure for prompts that + // accept JSON) and a stringified text modality (cheap fallback for prompts + // that only consume text). Step 2 relevance scoring tolerates duplicates. 
+ modalities.push({ type: "json", content: result }); + const asText = safeStringify(result); + if (asText) { + modalities.push({ type: "text", content: asText }); + } + } else { + // Numbers, booleans, etc. — stringify so the verifier has a text handle. + modalities.push({ type: "text", content: String(result) }); + } + + return { modalities }; +} + +/** + * Materialize a `TrajectoryStep` from a normalized tool call. Centralizes the + * step-shape contract so every adapter produces verifier-equivalent steps. + */ +export function toolCallToTrajectoryStep( + index: number, + call: NormalizedToolCall, + fallbackTimestamp: string, +): TrajectoryStep { + const startedAt = call.startedAt ?? fallbackTimestamp; + const finishedAt = call.finishedAt ?? startedAt; + return { + index, + actionName: call.name, + actionArgs: call.args, + reasoning: call.reasoning ?? "", + agentEvidence: actionToAgentEvidence(call), + // External harnesses don't natively produce screenshots/aria/scroll, so + // probeEvidence stays empty. The verifier handles this via the + // evidence_insufficient path. + probeEvidence: {}, + toolOutput: { + ok: call.ok, + result: call.result, + ...(call.error && { error: call.error }), + }, + startedAt, + finishedAt, + }; +} + +/** + * Build a `Trajectory` from a sequence of normalized tool calls + the task + * metadata. Adapters call this after parsing their harness's event log. + */ +export interface BuildTrajectoryOptions { + taskSpec: TaskSpec; + toolCalls: NormalizedToolCall[]; + finalAnswer?: string; + status?: Trajectory["status"]; + /** Token usage if the harness surfaced it; partial fields are filled with 0. */ + usage?: Partial; + /** Defaults to `now` for both endpoints if the harness didn't track timing. 
*/ + timing?: Partial; +} + +export function buildTrajectory(opts: BuildTrajectoryOptions): Trajectory { + const now = new Date().toISOString(); + const steps: TrajectoryStep[] = opts.toolCalls.map((call, idx) => + toolCallToTrajectoryStep(idx, call, now), + ); + + const startedAt = opts.timing?.startedAt ?? steps[0]?.startedAt ?? now; + const endedAt = + opts.timing?.endedAt ?? steps[steps.length - 1]?.finishedAt ?? startedAt; + + return { + task: opts.taskSpec, + steps, + finalAnswer: opts.finalAnswer, + status: opts.status ?? "complete", + usage: { + input_tokens: opts.usage?.input_tokens ?? 0, + output_tokens: opts.usage?.output_tokens ?? 0, + ...(opts.usage?.reasoning_tokens !== undefined && { + reasoning_tokens: opts.usage.reasoning_tokens, + }), + ...(opts.usage?.cached_input_tokens !== undefined && { + cached_input_tokens: opts.usage.cached_input_tokens, + }), + ...(opts.usage?.inference_time_ms !== undefined && { + inference_time_ms: opts.usage.inference_time_ms, + }), + }, + timing: { startedAt, endedAt }, + }; +} + +function safeStringify(value: unknown): string | undefined { + try { + return JSON.stringify(value); + } catch { + return undefined; + } +} diff --git a/packages/evals/scripts/verify-harness-adapters.ts b/packages/evals/scripts/verify-harness-adapters.ts new file mode 100644 index 000000000..6c949857d --- /dev/null +++ b/packages/evals/scripts/verify-harness-adapters.ts @@ -0,0 +1,434 @@ +/** + * External-harness adapter smoke test — verifies the claudeCodeAdapter and + * codexAdapter end-to-end without launching a browser. + * + * Hand-rolls synthetic harness results (tool-use messages for Claude Code, + * ThreadEvents for Codex) and asserts: + * 1. The produced Trajectory has the expected step count. + * 2. Text and JSON modalities are populated where they should be. + * 3. finalAnswer is captured. + * 4. status === "complete". 
+ * + * Bonus (gated on GEMINI_API_KEY): feeds the synthetic trajectory into a real + * V3Evaluator.verify() with a tiny synthetic rubric, then prints the verdict. + * + * Run via: pnpm exec tsx packages/evals/scripts/verify-harness-adapters.ts + */ +import assert from "node:assert/strict"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { claudeCodeAdapter } from "../framework/harnesses/claudeCodeAdapter.js"; +import { codexAdapter } from "../framework/harnesses/codexAdapter.js"; +import { persistAdapterTrajectory } from "../framework/harnesses/persistTrajectory.js"; +import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; + +async function testClaudeCodeAdapter(taskSpec: TaskSpec): Promise { + // Hand-rolled SDK message stream that mirrors what the Claude Agent SDK + // emits for a two-tool-call session with reasoning between them. + const messages: Array> = [ + { + type: "assistant", + message: { + content: [ + { + type: "text", + text: "I'll start by navigating to the United Airlines website.", + }, + { + type: "tool_use", + id: "tu_1", + name: "browse", + input: { command: "browse navigate https://www.united.com" }, + }, + ], + }, + }, + { + type: "user", + message: { + content: [ + { + type: "tool_result", + tool_use_id: "tu_1", + content: [ + { type: "text", text: "Navigated to https://www.united.com" }, + ], + is_error: false, + }, + ], + }, + }, + { + type: "assistant", + message: { + content: [ + { + type: "text", + text: "Now I'll look up the flight prices.", + }, + { + type: "tool_use", + id: "tu_2", + name: "browse", + input: { command: "browse extract { economy, business } prices" }, + }, + ], + }, + }, + { + type: "user", + message: { + content: [ + { + type: "tool_result", + tool_use_id: "tu_2", + content: [ + { + type: "text", + text: '{"economy":"$1,200","business":"$5,200"}', + }, + ], + is_error: false, + }, + ], + }, + }, + { + type: "assistant", + message: { + content: [ + { 
+ type: "text", + text: "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).", + }, + ], + }, + }, + { + type: "result", + subtype: "success", + result: + "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).", + duration_ms: 1234, + num_turns: 3, + }, + ]; + + const trajectory = claudeCodeAdapter.fromHarnessResult( + { + messages, + status: "complete", + usage: { input_tokens: 100, output_tokens: 80 }, + }, + taskSpec, + ); + + assert.equal( + trajectory.steps.length, + 2, + `expected 2 steps from 2 tool_use blocks, got ${trajectory.steps.length}`, + ); + assert.equal(trajectory.steps[0].actionName, "browse"); + assert.equal(trajectory.steps[1].actionName, "browse"); + assert.equal(trajectory.status, "complete"); + assert.ok( + trajectory.finalAnswer?.includes("$4,000"), + `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`, + ); + + // Step 0: reasoning text modality + result text modality. + const step0Modalities = trajectory.steps[0].agentEvidence.modalities; + assert.ok( + step0Modalities.some( + (m) => m.type === "text" && m.content.includes("navigating"), + ), + "expected reasoning text in step 0 modalities", + ); + assert.ok( + step0Modalities.some( + (m) => + m.type === "text" && + m.content.includes("Navigated to https://www.united.com"), + ), + "expected tool-result text in step 0 modalities", + ); + + // Step 1 carries the second reasoning + result content. tool_result content + // is a structured array of {type, text} blocks, which the adapter forwards + // as the json modality (with a stringified text mirror). Accept either path. 
+ const step1Modalities = trajectory.steps[1].agentEvidence.modalities; + const step1Joined = JSON.stringify(step1Modalities); + assert.ok( + step1Joined.includes("economy"), + `expected step 1 modalities to mention 'economy'; got ${step1Joined}`, + ); + + // Both steps must have empty probeEvidence — external harnesses don't + // produce screenshots natively. That's what triggers evidence_insufficient + // in the verifier downstream. + for (const step of trajectory.steps) { + assert.deepEqual( + step.probeEvidence, + {}, + `expected empty probeEvidence for external-harness step ${step.index}`, + ); + } + + console.log( + ` ✓ claudeCodeAdapter — ${trajectory.steps.length} steps, finalAnswer captured, probeEvidence empty`, + ); + + return trajectory; +} + +async function testCodexAdapter(taskSpec: TaskSpec): Promise { + // Hand-rolled codex ThreadEvent stream. Mirrors what runCodexAgent + // accumulates into its `events` array. + const events: Array> = [ + { type: "thread.started", thread_id: "thread-smoke" }, + { type: "turn.started" }, + { + type: "item.completed", + item: { + id: "rs-1", + type: "reasoning", + text: "I should start by navigating to the United website.", + }, + }, + { + type: "item.completed", + item: { + id: "ce-1", + type: "command_execution", + command: "browse navigate https://www.united.com", + aggregated_output: "Navigated to https://www.united.com", + exit_code: 0, + status: "completed", + }, + }, + { + type: "item.completed", + item: { + id: "rs-2", + type: "reasoning", + text: "Now extract the prices via the MCP browser tool.", + }, + }, + { + type: "item.completed", + item: { + id: "mc-1", + type: "mcp_tool_call", + server: "stagehand_browser", + tool: "extract", + arguments: { instruction: "Get prices" }, + result: { + content: [ + { + type: "text", + text: '{"economy":"$1,200","business":"$5,200"}', + }, + ], + structured_content: { economy: "$1,200", business: "$5,200" }, + }, + status: "completed", + }, + }, + { + type: 
"item.completed", + item: { + id: "am-1", + type: "agent_message", + text: "The price difference is approximately $4,000.", + }, + }, + { + type: "turn.completed", + usage: { + input_tokens: 120, + cached_input_tokens: 10, + output_tokens: 50, + reasoning_output_tokens: 5, + }, + }, + ]; + + const trajectory = codexAdapter.fromHarnessResult( + { + events, + status: "complete", + usage: { + input_tokens: 120, + output_tokens: 50, + reasoning_tokens: 5, + cached_input_tokens: 10, + }, + }, + taskSpec, + ); + + assert.equal( + trajectory.steps.length, + 2, + `expected 2 steps (command_execution + mcp_tool_call), got ${trajectory.steps.length}`, + ); + assert.equal(trajectory.steps[0].actionName, "browse"); + assert.equal(trajectory.steps[1].actionName, "stagehand_browser.extract"); + assert.equal(trajectory.status, "complete"); + assert.ok( + trajectory.finalAnswer?.includes("$4,000"), + `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`, + ); + + // Reasoning items must be folded into the following tool call. + assert.ok( + trajectory.steps[0].reasoning.includes("navigating"), + "expected first reasoning to be folded into step 0", + ); + assert.ok( + trajectory.steps[1].reasoning.includes("MCP browser tool"), + "expected second reasoning to be folded into step 1", + ); + + // The MCP tool result should produce a json modality from structured_content. + const step1Modalities = trajectory.steps[1].agentEvidence.modalities; + assert.ok( + step1Modalities.some( + (m) => + m.type === "json" && + typeof m.content === "object" && + m.content !== null && + (m.content as Record).economy === "$1,200", + ), + "expected json modality with structured_content on step 1", + ); + + // Probe evidence empty across the board. 
+ for (const step of trajectory.steps) { + assert.deepEqual( + step.probeEvidence, + {}, + `expected empty probeEvidence for external-harness step ${step.index}`, + ); + } + + console.log( + ` ✓ codexAdapter — ${trajectory.steps.length} steps, reasoning folded, structured_content → json modality`, + ); + + return trajectory; +} + +async function testPersistence( + trajectory: Trajectory, + taskSpec: TaskSpec, + tmpRoot: string, + label: string, +): Promise { + const { directory, persisted } = await persistAdapterTrajectory({ + trajectory, + taskSpec, + outputRoot: tmpRoot, + runId: `smoke-${label}`, + persist: true, + }); + assert.equal(persisted, true); + + const entries = await fs.readdir(directory); + assert.ok( + entries.includes("task_data.json"), + "expected task_data.json on disk", + ); + assert.ok( + entries.includes("trajectory.json"), + "expected trajectory.json on disk", + ); + assert.ok(entries.includes("times.json"), "expected times.json on disk"); + assert.ok(entries.includes("core.log"), "expected core.log on disk"); + assert.ok(entries.includes("scores"), "expected scores/ directory on disk"); + console.log(` ✓ persistAdapterTrajectory(${label}) — wrote ${directory}`); +} + +async function maybeRunVerifier( + label: string, + trajectory: Trajectory, + taskSpec: TaskSpec, +): Promise { + const apiKey = + process.env.GEMINI_API_KEY || process.env.GOOGLE_GENERATIVE_AI_API_KEY; + if (!apiKey) { + console.log( + ` – V3Evaluator.verify(${label}) skipped (no GEMINI_API_KEY in env)`, + ); + return; + } + + const { V3Evaluator } = await import("@browserbasehq/stagehand"); + // Construct a V3 stub just for its logger (V3Evaluator only needs that). + // We can't `init()` it (no browser) but the verify path never touches the + // browser, only LLMProvider. + const { V3 } = await import("@browserbasehq/stagehand"); + // V3 requires V3Options; pass a minimal one with disablePino so we don't + // spin up the pino worker. 
+ const v3 = new V3({ env: "LOCAL", verbose: 0, disablePino: true }); + + const evaluator = new V3Evaluator(v3, { backend: "verifier" }); + try { + const verdict = await evaluator.verify(trajectory, taskSpec); + console.log( + ` ✓ V3Evaluator.verify(${label}) — outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${verdict.perCriterion.length} evidence_insufficient=${verdict.evidenceInsufficient.length}`, + ); + } finally { + // V3 instance was never init'd, no teardown needed. + } +} + +async function main(): Promise { + const tmpRoot = await fs.mkdtemp( + path.join(os.tmpdir(), "verify-harness-adapters-"), + ); + console.log(`▸ tmpdir: ${tmpRoot}\n`); + + const taskSpec: TaskSpec = { + id: "smoke-united_13", + instruction: + "What is the price difference between economy and business class on United CHI→GRU?", + initUrl: "https://www.united.com", + precomputedRubric: { + items: [ + { + criterion: "Identify correct route", + description: + "Agent identifies the United CHI→GRU economy and business class fares.", + max_points: 2, + }, + { + criterion: "Report price delta", + description: + "Agent reports a numeric difference between economy and business.", + max_points: 3, + }, + ], + }, + expectedAnswer: "Approximately $4,000 difference.", + }; + + console.log("▸ claudeCodeAdapter"); + const claudeTrajectory = await testClaudeCodeAdapter(taskSpec); + await testPersistence(claudeTrajectory, taskSpec, tmpRoot, "claude_code"); + await maybeRunVerifier("claude_code", claudeTrajectory, taskSpec); + + console.log("\n▸ codexAdapter"); + const codexTrajectory = await testCodexAdapter(taskSpec); + await testPersistence(codexTrajectory, taskSpec, tmpRoot, "codex"); + await maybeRunVerifier("codex", codexTrajectory, taskSpec); + + console.log("\n✓ all smoke assertions passed"); +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); From 6d41a4f829887c2b58089f77a94c9e1defcad04e Mon Sep 17 00:00:00 2001 From: miguel Date: 
Fri, 15 May 2026 14:19:41 -0700 Subject: [PATCH 2/5] fix(evals): route harnesses through verifier --- packages/evals/framework/claudeCodeRunner.ts | 2 +- packages/evals/framework/codexRunner.ts | 2 +- .../framework/harnesses/persistTrajectory.ts | 56 ++++++++++++------- .../evals/scripts/verify-harness-adapters.ts | 4 +- 4 files changed, 39 insertions(+), 25 deletions(-) diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index c306c4256..cb356ceea 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -291,7 +291,7 @@ export async function runClaudeCodeAgent({ const { V3Evaluator } = await import("@browserbasehq/stagehand"); const { RubricCache } = await import("./rubricCache.js"); - const evaluator = new V3Evaluator(verifier.v3); + const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" }); // Hydrate rubric — use precomputed if present, otherwise cache-or-generate. 
let rubric = verifier.taskSpec.precomputedRubric; diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts index 2c5695789..2703160d9 100644 --- a/packages/evals/framework/codexRunner.ts +++ b/packages/evals/framework/codexRunner.ts @@ -265,7 +265,7 @@ export async function runCodexAgent({ const { V3Evaluator } = await import("@browserbasehq/stagehand"); const { RubricCache } = await import("./rubricCache.js"); - const evaluator = new V3Evaluator(verifier.v3); + const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" }); let rubric = verifier.taskSpec.precomputedRubric; if (!rubric) { diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts index ffa3dd868..6efdaace8 100644 --- a/packages/evals/framework/harnesses/persistTrajectory.ts +++ b/packages/evals/framework/harnesses/persistTrajectory.ts @@ -10,15 +10,14 @@ * finishes — so this helper writes the same on-disk layout without the * event-subscription lifecycle. 
* - * The on-disk layout matches microsoft/fara's example_trajectory/ so saved - * trajectories can be cross-validated against verify_trajectories.py without - * a format conversion step: + * The on-disk layout matches TrajectoryRecorder.persist(): * * <directory>/ * ├── task_data.json - * ├── trajectory.json (image bytes as base64, screenshots referenced by path) - * ├── screenshot_<step>.png (only if probeEvidence carries Buffer screenshots — external - * │ harnesses don't natively, but the helper supports it) + * ├── trajectory.json (images referenced by path) + * ├── screenshots/ + * │ ├── probe/<step>.png + * │ └── agent/<step>.png * ├── scores/ * │ └── mmrubric_v1.json (if `verdict` passed) * ├── core.log @@ -86,29 +85,44 @@ export async function persistAdapterTrajectory( } await fs.mkdir(directory, { recursive: true }); + await fs.mkdir(path.join(directory, "screenshots", "probe"), { + recursive: true, + }); + await fs.mkdir(path.join(directory, "screenshots", "agent"), { + recursive: true, + }); - // Walk steps and (when a Buffer screenshot is present, which is rare for - // external harnesses) write it to disk + replace with a path reference. - // Image modalities in agentEvidence get base64-encoded inline to keep - // trajectory.json human-readable JSON. + // Walk steps and write image bytes to disk, replacing in-memory Buffers with + // path references in trajectory.json. 
const serializableSteps: unknown[] = []; for (const step of opts.trajectory.steps) { const probe: ProbeEvidence = { ...step.probeEvidence }; if (probe.screenshot) { - const filename = `screenshot_${step.index + 1}.png`; - await fs.writeFile(path.join(directory, filename), probe.screenshot); - probe.screenshotPath = filename; + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile(path.join(directory, relPath), probe.screenshot); + probe.screenshotPath = relPath; delete probe.screenshot; } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; const agentEvidence = { - modalities: step.agentEvidence.modalities.map((m) => - m.type === "image" - ? { - type: "image", - bytesBase64: m.bytes.toString("base64"), - mediaType: m.mediaType, - } - : m, + modalities: await Promise.all( + step.agentEvidence.modalities.map(async (m) => { + if (m.type !== "image") return m; + const suffix = multipleImages ? 
`_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + imageSeq += 1; + await fs.writeFile(path.join(directory, relPath), m.bytes); + return { + type: "image" as const, + imagePath: relPath, + mediaType: m.mediaType, + }; + }), ), }; serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); diff --git a/packages/evals/scripts/verify-harness-adapters.ts b/packages/evals/scripts/verify-harness-adapters.ts index 6c949857d..5b9b11d9c 100644 --- a/packages/evals/scripts/verify-harness-adapters.ts +++ b/packages/evals/scripts/verify-harness-adapters.ts @@ -402,13 +402,13 @@ async function main(): Promise { criterion: "Identify correct route", description: "Agent identifies the United CHI→GRU economy and business class fares.", - max_points: 2, + maxPoints: 2, }, { criterion: "Report price delta", description: "Agent reports a numeric difference between economy and business.", - max_points: 3, + maxPoints: 3, }, ], }, From c166203d0ea889b11a0df9ee46c08de42daef51d Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:42:00 -0700 Subject: [PATCH 3/5] fix(evals): validate external harness success mode --- packages/evals/framework/claudeCodeRunner.ts | 9 +-------- packages/evals/framework/codexRunner.ts | 9 +-------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index cb356ceea..a6e1ead2f 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -309,14 +309,7 @@ export async function runClaudeCodeAgent({ }; const verdict = await evaluator.verify(trajectory, hydratedSpec); - const successMode = - verifier.successMode ?? - ((process.env.EVAL_SUCCESS_MODE as - | "outcome" - | "process" - | "both" - | undefined) || - "outcome"); + const successMode = verifier.successMode ?? 
process.env.EVAL_SUCCESS_MODE; const verifiedSuccess = verdictToSuccess(verdict, successMode); const { directory: trajectoryDir } = await persistAdapterTrajectory({ diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts index 2703160d9..7f6f71e77 100644 --- a/packages/evals/framework/codexRunner.ts +++ b/packages/evals/framework/codexRunner.ts @@ -282,14 +282,7 @@ export async function runCodexAgent({ }; const verdict = await evaluator.verify(trajectory, hydratedSpec); - const successMode = - verifier.successMode ?? - ((process.env.EVAL_SUCCESS_MODE as - | "outcome" - | "process" - | "both" - | undefined) || - "outcome"); + const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE; const verifiedSuccess = verdictToSuccess(verdict, successMode); const { directory: trajectoryDir } = await persistAdapterTrajectory({ From 6532300ba6c79dac605490cec68ee7030a71e48a Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 21:33:06 -0700 Subject: [PATCH 4/5] test(evals): cover persisted trajectory images --- .../tests/framework/persistTrajectory.test.ts | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 packages/evals/tests/framework/persistTrajectory.test.ts diff --git a/packages/evals/tests/framework/persistTrajectory.test.ts b/packages/evals/tests/framework/persistTrajectory.test.ts new file mode 100644 index 000000000..65de8c72f --- /dev/null +++ b/packages/evals/tests/framework/persistTrajectory.test.ts @@ -0,0 +1,113 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { loadTrajectoryFromDisk } from "@browserbasehq/stagehand"; +import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; +import { describe, expect, it } from "vitest"; + +import { persistAdapterTrajectory } from "../../framework/harnesses/persistTrajectory.js"; + +const PROBE_PNG = Buffer.from("fake-probe-bytes-1234", "utf8"); +const AGENT_PNG = 
Buffer.from("fake-agent-bytes-5678", "utf8"); + +describe("persistAdapterTrajectory", () => { + it("round-trips probe and agent image evidence through loadTrajectoryFromDisk", async () => { + const tmpRoot = await fs.mkdtemp( + path.join(os.tmpdir(), "persist-adapter-roundtrip-"), + ); + + try { + const taskSpec: TaskSpec = { + id: "roundtrip-task", + instruction: "Test task", + initUrl: "https://example.com", + }; + const { directory, persisted } = await persistAdapterTrajectory({ + trajectory: makeTrajectory(taskSpec), + taskSpec, + outputRoot: tmpRoot, + runId: "roundtrip-run", + persist: true, + }); + + expect(persisted).toBe(true); + await expect(fs.readdir(directory)).resolves.toEqual( + expect.arrayContaining([ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ]), + ); + await expect( + fs.readFile(path.join(directory, "screenshots", "probe", "1.png")), + ).resolves.toEqual(PROBE_PNG); + await expect( + fs.readFile(path.join(directory, "screenshots", "agent", "1.png")), + ).resolves.toEqual(AGENT_PNG); + + const loaded = await loadTrajectoryFromDisk(directory); + const step = loaded.steps[0]; + const imageModality = step.agentEvidence.modalities.find( + ( + modality, + ): modality is Extract< + (typeof step.agentEvidence.modalities)[number], + { type: "image" } + > => modality.type === "image", + ); + const textModality = step.agentEvidence.modalities.find( + ( + modality, + ): modality is Extract< + (typeof step.agentEvidence.modalities)[number], + { type: "text" } + > => modality.type === "text", + ); + + expect(step.probeEvidence.screenshot).toEqual(PROBE_PNG); + expect(imageModality?.bytes).toEqual(AGENT_PNG); + expect(imageModality?.mediaType).toBe("image/png"); + expect(textModality?.content).toBe("navigated"); + } finally { + await fs.rm(tmpRoot, { recursive: true, force: true }); + } + }); +}); + +function makeTrajectory(task: TaskSpec): Trajectory { + return { + task, + status: "complete", + 
finalAnswer: "Final answer text.", + usage: { input_tokens: 100, output_tokens: 50 }, + timing: { + startedAt: "2026-05-15T10:00:00.000Z", + endedAt: "2026-05-15T10:01:00.000Z", + }, + steps: [ + { + index: 0, + actionName: "goto", + actionArgs: { url: "https://example.com" }, + reasoning: "Open the page.", + agentEvidence: { + modalities: [ + { type: "text", content: "navigated" }, + { type: "image", bytes: AGENT_PNG, mediaType: "image/png" }, + ], + }, + probeEvidence: { + url: "https://example.com", + screenshot: PROBE_PNG, + }, + toolOutput: { ok: true, result: { url: "https://example.com" } }, + startedAt: "2026-05-15T10:00:00.000Z", + finishedAt: "2026-05-15T10:00:05.000Z", + }, + ], + }; +} From 3a673e7112cbc194a250e0579651948130bc6d73 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:45:32 -0700 Subject: [PATCH 5/5] fix(evals): align harness verifier result API --- packages/evals/framework/claudeCodeRunner.ts | 28 +- packages/evals/framework/codexRunner.ts | 28 +- .../framework/harnesses/persistTrajectory.ts | 18 +- .../framework/harnesses/trajectoryAdapter.ts | 12 +- .../evals/scripts/verify-harness-adapters.ts | 434 ------------------ .../tests/framework/persistTrajectory.test.ts | 19 +- 6 files changed, 68 insertions(+), 471 deletions(-) delete mode 100644 packages/evals/scripts/verify-harness-adapters.ts diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index a6e1ead2f..0d1b68569 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -6,7 +6,7 @@ import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js"; import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js"; -import { verdictToSuccess } from "./verifierAdapter.js"; +import { 
evaluationResultToSuccess } from "./verifierAdapter.js"; type ClaudeSdkMessage = Record; type ClaudeQuery = AsyncIterable; @@ -48,8 +48,9 @@ export interface ClaudeCodeRunnerInput { /** * Optional verifier integration. When provided, the runner builds a * Trajectory from the SDK message stream (via claudeCodeAdapter), runs - * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict - * into the returned TaskResult ({_success} mode follows EVAL_SUCCESS_MODE). + * V3Evaluator.verify() against the supplied TaskSpec, and folds the + * EvaluationResult into the returned TaskResult ({_success} mode follows + * EVAL_SUCCESS_MODE). * When omitted, the runner falls back to parsing the legacy EVAL_RESULT * line — preserves current behavior for callers that haven't migrated. */ @@ -308,21 +309,24 @@ export async function runClaudeCodeAgent({ precomputedRubric: rubric, }; - const verdict = await evaluator.verify(trajectory, hydratedSpec); + const evaluationResult = await evaluator.verify(trajectory, hydratedSpec); const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE; - const verifiedSuccess = verdictToSuccess(verdict, successMode); + const verifiedSuccess = evaluationResultToSuccess( + evaluationResult, + successMode, + ); const { directory: trajectoryDir } = await persistAdapterTrajectory({ trajectory, taskSpec: hydratedSpec, - verdict, + evaluationResult, outputRoot: verifier.trajectoryRoot, runId: verifier.runId, }); logger.log({ category: "claude_code", - message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`, + message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`, level: 1, }); @@ -330,9 +334,9 @@ export async function runClaudeCodeAgent({ ...baseResult, _success: verifiedSuccess, error: verifiedSuccess ? undefined : (baseResult.error ?? 
errorMessage), - outcomeSuccess: verdict.outcomeSuccess, - processScore: verdict.processScore, - evidenceInsufficient: verdict.evidenceInsufficient, + outcomeSuccess: evaluationResult.outcomeSuccess, + processScore: evaluationResult.processScore, + evidenceInsufficient: evaluationResult.evidenceInsufficient, criterionCount: rubric.items.length, stepCount: trajectory.steps.length, trajectoryDir, @@ -350,6 +354,10 @@ export async function runClaudeCodeAgent({ } } +function formatProcessScore(score: number | undefined): string { + return typeof score === "number" ? score.toFixed(2) : "n/a"; +} + function buildClaudeCodeMetrics( resultMessage: ClaudeSdkMessage | undefined, ): Record { diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts index 7f6f71e77..bd38965bb 100644 --- a/packages/evals/framework/codexRunner.ts +++ b/packages/evals/framework/codexRunner.ts @@ -6,7 +6,7 @@ import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js"; import { codexAdapter } from "./harnesses/codexAdapter.js"; import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js"; -import { verdictToSuccess } from "./verifierAdapter.js"; +import { evaluationResultToSuccess } from "./verifierAdapter.js"; type MetricValue = { count: number; value: number }; type CodexEvent = Record; @@ -57,8 +57,9 @@ export interface CodexRunnerInput { /** * Optional verifier integration. When provided, the runner builds a * Trajectory from the codex event stream (via codexAdapter), runs - * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict - * into the returned TaskResult ({_success} mode follows EVAL_SUCCESS_MODE). + * V3Evaluator.verify() against the supplied TaskSpec, and folds the + * EvaluationResult into the returned TaskResult ({_success} mode follows + * EVAL_SUCCESS_MODE). 
* When omitted, the runner falls back to parsing the legacy JSON result — * preserves current behavior for callers that haven't migrated. */ @@ -281,21 +282,24 @@ export async function runCodexAgent({ precomputedRubric: rubric, }; - const verdict = await evaluator.verify(trajectory, hydratedSpec); + const evaluationResult = await evaluator.verify(trajectory, hydratedSpec); const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE; - const verifiedSuccess = verdictToSuccess(verdict, successMode); + const verifiedSuccess = evaluationResultToSuccess( + evaluationResult, + successMode, + ); const { directory: trajectoryDir } = await persistAdapterTrajectory({ trajectory, taskSpec: hydratedSpec, - verdict, + evaluationResult, outputRoot: verifier.trajectoryRoot, runId: verifier.runId, }); logger.log({ category: "codex", - message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`, + message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`, level: 1, }); @@ -303,9 +307,9 @@ export async function runCodexAgent({ ...baseResult, _success: verifiedSuccess, error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage), - outcomeSuccess: verdict.outcomeSuccess, - processScore: verdict.processScore, - evidenceInsufficient: verdict.evidenceInsufficient, + outcomeSuccess: evaluationResult.outcomeSuccess, + processScore: evaluationResult.processScore, + evidenceInsufficient: evaluationResult.evidenceInsufficient, criterionCount: rubric.items.length, stepCount: trajectory.steps.length, trajectoryDir, @@ -323,6 +327,10 @@ export async function runCodexAgent({ } } +function formatProcessScore(score: number | undefined): string { + return typeof score === "number" ? 
score.toFixed(2) : "n/a"; +} + function tryParseCodexJson( candidate: string, ): Omit | undefined { diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts index 6efdaace8..ab6cf4daa 100644 --- a/packages/evals/framework/harnesses/persistTrajectory.ts +++ b/packages/evals/framework/harnesses/persistTrajectory.ts @@ -19,7 +19,7 @@ * │ ├── probe/.png * │ └── agent/.png * ├── scores/ - * │ └── mmrubric_v1.json (if `verdict` passed) + * │ └── result.json (if `evaluationResult` passed) * ├── core.log * └── times.json * @@ -32,17 +32,17 @@ import fs from "node:fs/promises"; import path from "node:path"; import type { + EvaluationResult, ProbeEvidence, TaskSpec, Trajectory, - Verdict, } from "@browserbasehq/stagehand"; export interface PersistAdapterTrajectoryOptions { trajectory: Trajectory; taskSpec: TaskSpec; - /** Verdict from V3Evaluator.verify(). Written to scores/mmrubric_v1.json. */ - verdict?: Verdict; + /** EvaluationResult from V3Evaluator.verify(). Written to scores/result.json. */ + evaluationResult?: EvaluationResult; /** * Output directory root. Final layout lives at `///`. * Defaults to `/.trajectories`. @@ -143,8 +143,8 @@ export async function persistAdapterTrajectory( status: opts.trajectory.status, finalAnswer: opts.trajectory.finalAnswer ?? 
null, }; - if (opts.verdict) { - taskData.verdict = opts.verdict; + if (opts.evaluationResult) { + taskData.result = opts.evaluationResult; } await fs.writeFile( path.join(directory, "task_data.json"), @@ -165,10 +165,10 @@ export async function persistAdapterTrajectory( ); await fs.mkdir(path.join(directory, "scores"), { recursive: true }); - if (opts.verdict) { + if (opts.evaluationResult) { await fs.writeFile( - path.join(directory, "scores", "mmrubric_v1.json"), - JSON.stringify(opts.verdict, null, 2), + path.join(directory, "scores", "result.json"), + JSON.stringify(opts.evaluationResult, null, 2), ); } diff --git a/packages/evals/framework/harnesses/trajectoryAdapter.ts b/packages/evals/framework/harnesses/trajectoryAdapter.ts index cb3afac65..ec1b02319 100644 --- a/packages/evals/framework/harnesses/trajectoryAdapter.ts +++ b/packages/evals/framework/harnesses/trajectoryAdapter.ts @@ -3,14 +3,12 @@ * provider-shaped event/message log) into the Stagehand `Trajectory` shape * that V3Evaluator.verify() consumes. * - * The verifier is harness-agnostic (Trajectory + TaskSpec → Verdict; pure - * function, no live browser). That property is what lets non-Stagehand + * The verifier is harness-agnostic (Trajectory + TaskSpec → EvaluationResult, + * no live browser). That property is what lets non-Stagehand * harnesses — Claude Code, Codex — be scored with the same rubric pipeline * we use for Stagehand. Each external harness ships its own * `TrajectoryAdapter` that maps its tool-call/message log to * a `Trajectory`. The verifier never knows which adapter produced it. - * - * @see ~/.claude/plans/verifier-rewrite-plan.html §07 "External harness adapters" */ import type { AgentEvidence, @@ -26,9 +24,9 @@ import type { * the same Trajectory. * * Empty `probeEvidence` on every step is supported — the verifier degrades - * gracefully via the `evidence_insufficient` path (paper's uncontrollable- - * failure principle). 
Text-heavy tasks (extract, lookup, search) still get a - * meaningful outcome verdict; visual-grounding criteria get flagged as + * gracefully via the `evidence_insufficient` path. Text-heavy tasks + * (extract, lookup, search) still get a + * meaningful outcome assessment; visual-grounding criteria get flagged as * evidence_insufficient rather than silently miscredited. */ export interface TrajectoryAdapter { diff --git a/packages/evals/scripts/verify-harness-adapters.ts b/packages/evals/scripts/verify-harness-adapters.ts deleted file mode 100644 index 5b9b11d9c..000000000 --- a/packages/evals/scripts/verify-harness-adapters.ts +++ /dev/null @@ -1,434 +0,0 @@ -/** - * External-harness adapter smoke test — verifies the claudeCodeAdapter and - * codexAdapter end-to-end without launching a browser. - * - * Hand-rolls synthetic harness results (tool-use messages for Claude Code, - * ThreadEvents for Codex) and asserts: - * 1. The produced Trajectory has the expected step count. - * 2. Text and JSON modalities are populated where they should be. - * 3. finalAnswer is captured. - * 4. status === "complete". - * - * Bonus (gated on GEMINI_API_KEY): feeds the synthetic trajectory into a real - * V3Evaluator.verify() with a tiny synthetic rubric, then prints the verdict. 
- * - * Run via: pnpm exec tsx packages/evals/scripts/verify-harness-adapters.ts - */ -import assert from "node:assert/strict"; -import fs from "node:fs/promises"; -import os from "node:os"; -import path from "node:path"; - -import { claudeCodeAdapter } from "../framework/harnesses/claudeCodeAdapter.js"; -import { codexAdapter } from "../framework/harnesses/codexAdapter.js"; -import { persistAdapterTrajectory } from "../framework/harnesses/persistTrajectory.js"; -import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; - -async function testClaudeCodeAdapter(taskSpec: TaskSpec): Promise { - // Hand-rolled SDK message stream that mirrors what the Claude Agent SDK - // emits for a two-tool-call session with reasoning between them. - const messages: Array> = [ - { - type: "assistant", - message: { - content: [ - { - type: "text", - text: "I'll start by navigating to the United Airlines website.", - }, - { - type: "tool_use", - id: "tu_1", - name: "browse", - input: { command: "browse navigate https://www.united.com" }, - }, - ], - }, - }, - { - type: "user", - message: { - content: [ - { - type: "tool_result", - tool_use_id: "tu_1", - content: [ - { type: "text", text: "Navigated to https://www.united.com" }, - ], - is_error: false, - }, - ], - }, - }, - { - type: "assistant", - message: { - content: [ - { - type: "text", - text: "Now I'll look up the flight prices.", - }, - { - type: "tool_use", - id: "tu_2", - name: "browse", - input: { command: "browse extract { economy, business } prices" }, - }, - ], - }, - }, - { - type: "user", - message: { - content: [ - { - type: "tool_result", - tool_use_id: "tu_2", - content: [ - { - type: "text", - text: '{"economy":"$1,200","business":"$5,200"}', - }, - ], - is_error: false, - }, - ], - }, - }, - { - type: "assistant", - message: { - content: [ - { - type: "text", - text: "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).", - }, - ], - }, - }, - { - type: "result", - subtype: 
"success", - result: - "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).", - duration_ms: 1234, - num_turns: 3, - }, - ]; - - const trajectory = claudeCodeAdapter.fromHarnessResult( - { - messages, - status: "complete", - usage: { input_tokens: 100, output_tokens: 80 }, - }, - taskSpec, - ); - - assert.equal( - trajectory.steps.length, - 2, - `expected 2 steps from 2 tool_use blocks, got ${trajectory.steps.length}`, - ); - assert.equal(trajectory.steps[0].actionName, "browse"); - assert.equal(trajectory.steps[1].actionName, "browse"); - assert.equal(trajectory.status, "complete"); - assert.ok( - trajectory.finalAnswer?.includes("$4,000"), - `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`, - ); - - // Step 0: reasoning text modality + result text modality. - const step0Modalities = trajectory.steps[0].agentEvidence.modalities; - assert.ok( - step0Modalities.some( - (m) => m.type === "text" && m.content.includes("navigating"), - ), - "expected reasoning text in step 0 modalities", - ); - assert.ok( - step0Modalities.some( - (m) => - m.type === "text" && - m.content.includes("Navigated to https://www.united.com"), - ), - "expected tool-result text in step 0 modalities", - ); - - // Step 1 carries the second reasoning + result content. tool_result content - // is a structured array of {type, text} blocks, which the adapter forwards - // as the json modality (with a stringified text mirror). Accept either path. - const step1Modalities = trajectory.steps[1].agentEvidence.modalities; - const step1Joined = JSON.stringify(step1Modalities); - assert.ok( - step1Joined.includes("economy"), - `expected step 1 modalities to mention 'economy'; got ${step1Joined}`, - ); - - // Both steps must have empty probeEvidence — external harnesses don't - // produce screenshots natively. That's what triggers evidence_insufficient - // in the verifier downstream. 
- for (const step of trajectory.steps) { - assert.deepEqual( - step.probeEvidence, - {}, - `expected empty probeEvidence for external-harness step ${step.index}`, - ); - } - - console.log( - ` ✓ claudeCodeAdapter — ${trajectory.steps.length} steps, finalAnswer captured, probeEvidence empty`, - ); - - return trajectory; -} - -async function testCodexAdapter(taskSpec: TaskSpec): Promise { - // Hand-rolled codex ThreadEvent stream. Mirrors what runCodexAgent - // accumulates into its `events` array. - const events: Array> = [ - { type: "thread.started", thread_id: "thread-smoke" }, - { type: "turn.started" }, - { - type: "item.completed", - item: { - id: "rs-1", - type: "reasoning", - text: "I should start by navigating to the United website.", - }, - }, - { - type: "item.completed", - item: { - id: "ce-1", - type: "command_execution", - command: "browse navigate https://www.united.com", - aggregated_output: "Navigated to https://www.united.com", - exit_code: 0, - status: "completed", - }, - }, - { - type: "item.completed", - item: { - id: "rs-2", - type: "reasoning", - text: "Now extract the prices via the MCP browser tool.", - }, - }, - { - type: "item.completed", - item: { - id: "mc-1", - type: "mcp_tool_call", - server: "stagehand_browser", - tool: "extract", - arguments: { instruction: "Get prices" }, - result: { - content: [ - { - type: "text", - text: '{"economy":"$1,200","business":"$5,200"}', - }, - ], - structured_content: { economy: "$1,200", business: "$5,200" }, - }, - status: "completed", - }, - }, - { - type: "item.completed", - item: { - id: "am-1", - type: "agent_message", - text: "The price difference is approximately $4,000.", - }, - }, - { - type: "turn.completed", - usage: { - input_tokens: 120, - cached_input_tokens: 10, - output_tokens: 50, - reasoning_output_tokens: 5, - }, - }, - ]; - - const trajectory = codexAdapter.fromHarnessResult( - { - events, - status: "complete", - usage: { - input_tokens: 120, - output_tokens: 50, - reasoning_tokens: 
5, - cached_input_tokens: 10, - }, - }, - taskSpec, - ); - - assert.equal( - trajectory.steps.length, - 2, - `expected 2 steps (command_execution + mcp_tool_call), got ${trajectory.steps.length}`, - ); - assert.equal(trajectory.steps[0].actionName, "browse"); - assert.equal(trajectory.steps[1].actionName, "stagehand_browser.extract"); - assert.equal(trajectory.status, "complete"); - assert.ok( - trajectory.finalAnswer?.includes("$4,000"), - `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`, - ); - - // Reasoning items must be folded into the following tool call. - assert.ok( - trajectory.steps[0].reasoning.includes("navigating"), - "expected first reasoning to be folded into step 0", - ); - assert.ok( - trajectory.steps[1].reasoning.includes("MCP browser tool"), - "expected second reasoning to be folded into step 1", - ); - - // The MCP tool result should produce a json modality from structured_content. - const step1Modalities = trajectory.steps[1].agentEvidence.modalities; - assert.ok( - step1Modalities.some( - (m) => - m.type === "json" && - typeof m.content === "object" && - m.content !== null && - (m.content as Record).economy === "$1,200", - ), - "expected json modality with structured_content on step 1", - ); - - // Probe evidence empty across the board. 
- for (const step of trajectory.steps) { - assert.deepEqual( - step.probeEvidence, - {}, - `expected empty probeEvidence for external-harness step ${step.index}`, - ); - } - - console.log( - ` ✓ codexAdapter — ${trajectory.steps.length} steps, reasoning folded, structured_content → json modality`, - ); - - return trajectory; -} - -async function testPersistence( - trajectory: Trajectory, - taskSpec: TaskSpec, - tmpRoot: string, - label: string, -): Promise { - const { directory, persisted } = await persistAdapterTrajectory({ - trajectory, - taskSpec, - outputRoot: tmpRoot, - runId: `smoke-${label}`, - persist: true, - }); - assert.equal(persisted, true); - - const entries = await fs.readdir(directory); - assert.ok( - entries.includes("task_data.json"), - "expected task_data.json on disk", - ); - assert.ok( - entries.includes("trajectory.json"), - "expected trajectory.json on disk", - ); - assert.ok(entries.includes("times.json"), "expected times.json on disk"); - assert.ok(entries.includes("core.log"), "expected core.log on disk"); - assert.ok(entries.includes("scores"), "expected scores/ directory on disk"); - console.log(` ✓ persistAdapterTrajectory(${label}) — wrote ${directory}`); -} - -async function maybeRunVerifier( - label: string, - trajectory: Trajectory, - taskSpec: TaskSpec, -): Promise { - const apiKey = - process.env.GEMINI_API_KEY || process.env.GOOGLE_GENERATIVE_AI_API_KEY; - if (!apiKey) { - console.log( - ` – V3Evaluator.verify(${label}) skipped (no GEMINI_API_KEY in env)`, - ); - return; - } - - const { V3Evaluator } = await import("@browserbasehq/stagehand"); - // Construct a V3 stub just for its logger (V3Evaluator only needs that). - // We can't `init()` it (no browser) but the verify path never touches the - // browser, only LLMProvider. - const { V3 } = await import("@browserbasehq/stagehand"); - // V3 requires V3Options; pass a minimal one with disablePino so we don't - // spin up the pino worker. 
- const v3 = new V3({ env: "LOCAL", verbose: 0, disablePino: true }); - - const evaluator = new V3Evaluator(v3, { backend: "verifier" }); - try { - const verdict = await evaluator.verify(trajectory, taskSpec); - console.log( - ` ✓ V3Evaluator.verify(${label}) — outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${verdict.perCriterion.length} evidence_insufficient=${verdict.evidenceInsufficient.length}`, - ); - } finally { - // V3 instance was never init'd, no teardown needed. - } -} - -async function main(): Promise { - const tmpRoot = await fs.mkdtemp( - path.join(os.tmpdir(), "verify-harness-adapters-"), - ); - console.log(`▸ tmpdir: ${tmpRoot}\n`); - - const taskSpec: TaskSpec = { - id: "smoke-united_13", - instruction: - "What is the price difference between economy and business class on United CHI→GRU?", - initUrl: "https://www.united.com", - precomputedRubric: { - items: [ - { - criterion: "Identify correct route", - description: - "Agent identifies the United CHI→GRU economy and business class fares.", - maxPoints: 2, - }, - { - criterion: "Report price delta", - description: - "Agent reports a numeric difference between economy and business.", - maxPoints: 3, - }, - ], - }, - expectedAnswer: "Approximately $4,000 difference.", - }; - - console.log("▸ claudeCodeAdapter"); - const claudeTrajectory = await testClaudeCodeAdapter(taskSpec); - await testPersistence(claudeTrajectory, taskSpec, tmpRoot, "claude_code"); - await maybeRunVerifier("claude_code", claudeTrajectory, taskSpec); - - console.log("\n▸ codexAdapter"); - const codexTrajectory = await testCodexAdapter(taskSpec); - await testPersistence(codexTrajectory, taskSpec, tmpRoot, "codex"); - await maybeRunVerifier("codex", codexTrajectory, taskSpec); - - console.log("\n✓ all smoke assertions passed"); -} - -main().catch((error) => { - console.error(error); - process.exit(1); -}); diff --git a/packages/evals/tests/framework/persistTrajectory.test.ts 
b/packages/evals/tests/framework/persistTrajectory.test.ts index 65de8c72f..c281ac996 100644 --- a/packages/evals/tests/framework/persistTrajectory.test.ts +++ b/packages/evals/tests/framework/persistTrajectory.test.ts @@ -3,7 +3,11 @@ import os from "node:os"; import path from "node:path"; import { loadTrajectoryFromDisk } from "@browserbasehq/stagehand"; -import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; +import type { + EvaluationResult, + TaskSpec, + Trajectory, +} from "@browserbasehq/stagehand"; import { describe, expect, it } from "vitest"; import { persistAdapterTrajectory } from "../../framework/harnesses/persistTrajectory.js"; @@ -23,9 +27,16 @@ describe("persistAdapterTrajectory", () => { instruction: "Test task", initUrl: "https://example.com", }; + const evaluationResult: EvaluationResult = { + outcomeSuccess: true, + processScore: 1, + perCriterion: [], + evidenceInsufficient: [], + }; const { directory, persisted } = await persistAdapterTrajectory({ trajectory: makeTrajectory(taskSpec), taskSpec, + evaluationResult, outputRoot: tmpRoot, runId: "roundtrip-run", persist: true, @@ -48,6 +59,12 @@ describe("persistAdapterTrajectory", () => { await expect( fs.readFile(path.join(directory, "screenshots", "agent", "1.png")), ).resolves.toEqual(AGENT_PNG); + await expect( + fs.readFile(path.join(directory, "scores", "result.json"), "utf8"), + ).resolves.toContain('"outcomeSuccess": true'); + await expect( + fs.readFile(path.join(directory, "task_data.json"), "utf8"), + ).resolves.toContain('"result"'); const loaded = await loadTrajectoryFromDisk(directory); const step = loaded.steps[0];