diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index 6ec620233..0d1b68569 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -1,9 +1,12 @@ -import type { AvailableModel } from "@browserbasehq/stagehand"; +import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { TaskResult } from "./types.js"; import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; +import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js"; +import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js"; +import { evaluationResultToSuccess } from "./verifierAdapter.js"; type ClaudeSdkMessage = Record; type ClaudeQuery = AsyncIterable; @@ -16,6 +19,25 @@ export type ClaudeAgentSdk = { }) => ClaudeQuery; }; +export interface ClaudeCodeVerifierConfig { + /** + * V3 instance used solely as the LLM-client carrier for V3Evaluator. The + * instance does NOT need to have `init()` been called — V3Evaluator.verify() + * uses only `v3.logger` to construct its LLMProvider. + */ + v3: V3; + /** TaskSpec to verify against. id + instruction + optional rubric/initUrl. */ + taskSpec: TaskSpec; + /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */ + dataset: string; + /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */ + successMode?: "outcome" | "process" | "both"; + /** Override trajectory persistence root. */ + trajectoryRoot?: string; + /** Override the run id (defaults to ISO timestamp). 
*/ + runId?: string; +} + export interface ClaudeCodeRunnerInput { plan: ExternalHarnessTaskPlan; model: AvailableModel; @@ -23,6 +45,16 @@ export interface ClaudeCodeRunnerInput { toolAdapter?: PreparedClaudeCodeToolAdapter; signal?: AbortSignal; sdk?: ClaudeAgentSdk; + /** + * Optional verifier integration. When provided, the runner builds a + * Trajectory from the SDK message stream (via claudeCodeAdapter), runs + * V3Evaluator.verify() against the supplied TaskSpec, and folds the + * EvaluationResult into the returned TaskResult ({_success} mode follows + * EVAL_SUCCESS_MODE). + * When omitted, the runner falls back to parsing the legacy EVAL_RESULT + * line — preserves current behavior for callers that haven't migrated. + */ + verifier?: ClaudeCodeVerifierConfig; } export interface ParsedClaudeCodeResult { @@ -124,7 +156,9 @@ export async function runClaudeCodeAgent({ toolAdapter, signal, sdk: injectedSdk, + verifier, }: ClaudeCodeRunnerInput): Promise { + const startedAt = new Date().toISOString(); const sdk = injectedSdk ?? (await loadClaudeAgentSdk()); const abortController = new AbortController(); if (signal) { @@ -220,8 +254,10 @@ export async function runClaudeCodeAgent({ parsed.summary ?? stopReason ?? (resultText || transcriptText || "Claude Code did not report success"); + const endedAt = new Date().toISOString(); + const tokenUsage = extractClaudeCodeTokenUsage(resultMessage); - return { + const baseResult: TaskResult = { _success: parsed.success, error: !parsed.success ? errorMessage : undefined, reasoning: parsed.summary, @@ -232,6 +268,94 @@ export async function runClaudeCodeAgent({ logs: logger.getLogs(), metrics: buildClaudeCodeMetrics(resultMessage), }; + + if (!verifier) { + return baseResult; + } + + // Build a Trajectory from the SDK message stream and run the rubric verifier. + try { + const trajectory = claudeCodeAdapter.fromHarnessResult( + { + messages, + finalAnswer: parsed.finalAnswer ?? resultText, + status: status === "completed" ? 
"complete" : "error", + usage: { + input_tokens: tokenUsage.inputTokens, + output_tokens: tokenUsage.outputTokens, + cached_input_tokens: tokenUsage.cacheReadInputTokens, + }, + timing: { startedAt, endedAt }, + }, + verifier.taskSpec, + ); + + const { V3Evaluator } = await import("@browserbasehq/stagehand"); + const { RubricCache } = await import("./rubricCache.js"); + const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" }); + + // Hydrate rubric — use precomputed if present, otherwise cache-or-generate. + let rubric = verifier.taskSpec.precomputedRubric; + if (!rubric) { + if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { + rubric = await evaluator.generateRubric(verifier.taskSpec); + } else { + const cache = new RubricCache({ dataset: verifier.dataset }); + rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator); + } + } + const hydratedSpec: TaskSpec = { + ...verifier.taskSpec, + precomputedRubric: rubric, + }; + + const evaluationResult = await evaluator.verify(trajectory, hydratedSpec); + const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE; + const verifiedSuccess = evaluationResultToSuccess( + evaluationResult, + successMode, + ); + + const { directory: trajectoryDir } = await persistAdapterTrajectory({ + trajectory, + taskSpec: hydratedSpec, + evaluationResult, + outputRoot: verifier.trajectoryRoot, + runId: verifier.runId, + }); + + logger.log({ + category: "claude_code", + message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`, + level: 1, + }); + + return { + ...baseResult, + _success: verifiedSuccess, + error: verifiedSuccess ? undefined : (baseResult.error ?? 
errorMessage), + outcomeSuccess: evaluationResult.outcomeSuccess, + processScore: evaluationResult.processScore, + evidenceInsufficient: evaluationResult.evidenceInsufficient, + criterionCount: rubric.items.length, + stepCount: trajectory.steps.length, + trajectoryDir, + }; + } catch (verifyError) { + logger.warn({ + category: "claude_code", + message: `verifier integration failed: ${stringifyError(verifyError)}`, + level: 0, + auxiliary: { + error: { value: stringifyError(verifyError), type: "string" }, + }, + }); + return baseResult; + } +} + +function formatProcessScore(score: number | undefined): string { + return typeof score === "number" ? score.toFixed(2) : "n/a"; } function buildClaudeCodeMetrics( diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts index 4d2844efa..bd38965bb 100644 --- a/packages/evals/framework/codexRunner.ts +++ b/packages/evals/framework/codexRunner.ts @@ -1,9 +1,12 @@ -import type { AvailableModel } from "@browserbasehq/stagehand"; +import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; import type { TaskResult } from "./types.js"; import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js"; +import { codexAdapter } from "./harnesses/codexAdapter.js"; +import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js"; +import { evaluationResultToSuccess } from "./verifierAdapter.js"; type MetricValue = { count: number; value: number }; type CodexEvent = Record; @@ -25,6 +28,25 @@ export type CodexSdk = { startThread: (options?: Record) => CodexThread; }; +export interface CodexVerifierConfig { + /** + * V3 instance used solely as the LLM-client carrier for V3Evaluator. 
The + * instance does NOT need to have `init()` been called — V3Evaluator.verify() + * uses only `v3.logger` to construct its LLMProvider. + */ + v3: V3; + /** TaskSpec to verify against. id + instruction + optional rubric/initUrl. */ + taskSpec: TaskSpec; + /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */ + dataset: string; + /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */ + successMode?: "outcome" | "process" | "both"; + /** Override trajectory persistence root. */ + trajectoryRoot?: string; + /** Override the run id (defaults to ISO timestamp). */ + runId?: string; +} + export interface CodexRunnerInput { plan: ExternalHarnessTaskPlan; model: AvailableModel; @@ -32,6 +54,16 @@ export interface CodexRunnerInput { toolAdapter?: PreparedCodexToolAdapter; signal?: AbortSignal; sdk?: CodexSdk; + /** + * Optional verifier integration. When provided, the runner builds a + * Trajectory from the codex event stream (via codexAdapter), runs + * V3Evaluator.verify() against the supplied TaskSpec, and folds the + * EvaluationResult into the returned TaskResult ({_success} mode follows + * EVAL_SUCCESS_MODE). + * When omitted, the runner falls back to parsing the legacy JSON result — + * preserves current behavior for callers that haven't migrated. + */ + verifier?: CodexVerifierConfig; } export interface ParsedCodexResult { @@ -114,7 +146,9 @@ export async function runCodexAgent({ toolAdapter, signal, sdk: injectedSdk, + verifier, }: CodexRunnerInput): Promise { + const startedAt = new Date().toISOString(); const sdk = injectedSdk ?? 
(await loadCodexSdk(toolAdapter?.env)); const prompt = buildCodexPrompt(plan, toolAdapter?.promptInstructions); const events: CodexEvent[] = []; @@ -191,8 +225,9 @@ export async function runCodexAgent({ finalResponse || transcriptText || "Codex did not report success"); + const endedAt = new Date().toISOString(); - return { + const baseResult: TaskResult = { _success: parsed.success, error: !parsed.success ? errorMessage : undefined, reasoning: parsed.summary, @@ -203,6 +238,97 @@ export async function runCodexAgent({ logs: logger.getLogs(), metrics: buildCodexMetrics(usage), }; + + if (!verifier) { + return baseResult; + } + + try { + const trajectory = codexAdapter.fromHarnessResult( + { + events, + finalAnswer: parsed.finalAnswer ?? finalResponse, + status: status === "completed" ? "complete" : "error", + usage: { + input_tokens: toFiniteNumber(usage?.input_tokens), + output_tokens: toFiniteNumber(usage?.output_tokens), + ...(usage?.reasoning_output_tokens !== undefined && { + reasoning_tokens: toFiniteNumber(usage.reasoning_output_tokens), + }), + ...(usage?.cached_input_tokens !== undefined && { + cached_input_tokens: toFiniteNumber(usage.cached_input_tokens), + }), + }, + timing: { startedAt, endedAt }, + }, + verifier.taskSpec, + ); + + const { V3Evaluator } = await import("@browserbasehq/stagehand"); + const { RubricCache } = await import("./rubricCache.js"); + const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" }); + + let rubric = verifier.taskSpec.precomputedRubric; + if (!rubric) { + if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { + rubric = await evaluator.generateRubric(verifier.taskSpec); + } else { + const cache = new RubricCache({ dataset: verifier.dataset }); + rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator); + } + } + const hydratedSpec: TaskSpec = { + ...verifier.taskSpec, + precomputedRubric: rubric, + }; + + const evaluationResult = await evaluator.verify(trajectory, hydratedSpec); + const successMode 
= verifier.successMode ?? process.env.EVAL_SUCCESS_MODE; + const verifiedSuccess = evaluationResultToSuccess( + evaluationResult, + successMode, + ); + + const { directory: trajectoryDir } = await persistAdapterTrajectory({ + trajectory, + taskSpec: hydratedSpec, + evaluationResult, + outputRoot: verifier.trajectoryRoot, + runId: verifier.runId, + }); + + logger.log({ + category: "codex", + message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`, + level: 1, + }); + + return { + ...baseResult, + _success: verifiedSuccess, + error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage), + outcomeSuccess: evaluationResult.outcomeSuccess, + processScore: evaluationResult.processScore, + evidenceInsufficient: evaluationResult.evidenceInsufficient, + criterionCount: rubric.items.length, + stepCount: trajectory.steps.length, + trajectoryDir, + }; + } catch (verifyError) { + logger.warn({ + category: "codex", + message: `verifier integration failed: ${stringifyError(verifyError)}`, + level: 0, + auxiliary: { + error: { value: stringifyError(verifyError), type: "string" }, + }, + }); + return baseResult; + } +} + +function formatProcessScore(score: number | undefined): string { + return typeof score === "number" ? score.toFixed(2) : "n/a"; } function tryParseCodexJson( diff --git a/packages/evals/framework/harnesses/claudeCodeAdapter.ts b/packages/evals/framework/harnesses/claudeCodeAdapter.ts new file mode 100644 index 000000000..fd680895b --- /dev/null +++ b/packages/evals/framework/harnesses/claudeCodeAdapter.ts @@ -0,0 +1,225 @@ +/** + * claudeCodeAdapter — converts a Claude Code SDK run into a `Trajectory` the + * verifier can consume. 
+ * + * Input shape: the SDK emits a stream of `ClaudeSdkMessage` objects of + * different `type`s — assistant (model output, may contain tool_use blocks), + * user (tool_result blocks for prior tool_use calls), and result (final + * outcome with cost/usage/turn counts). We accumulate the stream upstream in + * `runClaudeCodeAgent` and hand the full list here. + * + * Mapping: + * - Each `tool_use` block in an assistant message becomes one normalized + * tool call, paired with its matching `tool_result` from a subsequent + * user message (by `tool_use_id`). + * - Assistant `text` blocks that precede a tool_use are folded into that + * tool call's `reasoning`. Trailing text after the last tool call (and + * the final result message's `result` string when present) becomes the + * `finalAnswer`. + * - The result message's usage carries forward as the trajectory usage. + * + * Failure modes: + * - max_turns / sdk_error → status = "error", but we still emit whatever + * steps we have. The verifier flags evidence_insufficient on criteria it + * can't ground. + */ +import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; +import { + buildTrajectory, + type NormalizedToolCall, + type TrajectoryAdapter, +} from "./trajectoryAdapter.js"; + +/** Subset of the harness result we need to build a trajectory. */ +export interface ClaudeCodeRunResult { + /** Raw SDK message stream collected during execution, in arrival order. */ + messages: Array>; + /** Final assistant message captured separately (optional — falls back to messages). */ + finalAnswer?: string; + /** Trajectory-level status. Defaults to "complete". */ + status?: Trajectory["status"]; + /** Optional usage to fold into Trajectory.usage. */ + usage?: Partial; + /** Optional run start/end timing. Adapter fills with now-now otherwise. */ + timing?: Partial; +} + +interface ToolUseBlock { + /** tool_use_id used to match against tool_result blocks. 
*/ + id: string; + name: string; + input: Record; + /** Assistant text accumulated before this tool call (becomes `reasoning`). */ + reasoningPrefix: string; +} + +interface ToolResultBlock { + toolUseId: string; + /** Concatenated text content of the result. */ + text: string; + /** Original structured content when not flattened to text. */ + raw?: unknown; + isError: boolean; +} + +export class ClaudeCodeTrajectoryAdapter + implements TrajectoryAdapter +{ + fromHarnessResult( + result: ClaudeCodeRunResult, + taskSpec: TaskSpec, + ): Trajectory { + const toolUses: ToolUseBlock[] = []; + const toolResults = new Map(); + const trailingTextParts: string[] = []; + let resultMessageText: string | undefined; + + let pendingReasoning = ""; + + for (const message of result.messages) { + const type = String((message as Record).type ?? ""); + const inner = (message as Record).message; + if (type === "result") { + const r = (message as Record).result; + if (typeof r === "string" && r.trim()) { + resultMessageText = r; + } + continue; + } + if (!isRecord(inner)) continue; + const content = inner.content; + if (!Array.isArray(content)) { + if (typeof content === "string" && type === "assistant") { + pendingReasoning = appendText(pendingReasoning, content); + trailingTextParts.push(content); + } + continue; + } + + if (type === "assistant") { + for (const block of content) { + if (!isRecord(block)) continue; + const blockType = String(block.type ?? ""); + if (blockType === "text" && typeof block.text === "string") { + pendingReasoning = appendText(pendingReasoning, block.text); + trailingTextParts.push(block.text); + continue; + } + if (blockType === "tool_use") { + const id = typeof block.id === "string" ? block.id : ""; + const name = typeof block.name === "string" ? block.name : "tool"; + const input = isRecord(block.input) + ? 
(block.input as Record) + : {}; + toolUses.push({ + id, + name, + input, + reasoningPrefix: pendingReasoning, + }); + // Once a tool_use lands, the buffered text belonged to its reasoning; + // future tool calls start with empty reasoning unless more text arrives. + pendingReasoning = ""; + // The text we just folded into reasoning is not the final answer. + // Drop it from trailingTextParts. + trailingTextParts.length = 0; + } + } + continue; + } + + if (type === "user") { + for (const block of content) { + if (!isRecord(block)) continue; + const blockType = String(block.type ?? ""); + if (blockType !== "tool_result") continue; + const toolUseId = + typeof block.tool_use_id === "string" ? block.tool_use_id : ""; + const isError = block.is_error === true; + const { text, raw } = extractToolResultContent(block.content); + toolResults.set(toolUseId, { + toolUseId, + text, + raw, + isError, + }); + } + continue; + } + } + + const toolCalls: NormalizedToolCall[] = toolUses.map((use) => { + const matched = toolResults.get(use.id); + const ok = matched ? !matched.isError : true; + const resultPayload = + matched?.raw !== undefined ? matched.raw : (matched?.text ?? ""); + return { + name: use.name, + args: use.input, + result: resultPayload, + ok, + ...(matched?.isError && matched.text && { error: matched.text }), + reasoning: use.reasoningPrefix.trim() || undefined, + }; + }); + + const trailing = trailingTextParts.join("\n").trim(); + const finalAnswer = + result.finalAnswer ?? + resultMessageText ?? + (trailing.length > 0 ? trailing : undefined); + + return buildTrajectory({ + taskSpec, + toolCalls, + finalAnswer, + status: result.status ?? 
"complete", + usage: result.usage, + timing: result.timing, + }); + } +} + +export const claudeCodeAdapter = new ClaudeCodeTrajectoryAdapter(); + +function appendText(buffer: string, addition: string): string { + if (!addition) return buffer; + if (!buffer) return addition; + return `${buffer}\n${addition}`; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null; +} + +/** + * tool_result `content` can be: + * - a string (legacy) + * - an array of { type: "text", text } / { type: "image", source } blocks + * + * We flatten text blocks and preserve the original array (when structured) as + * `raw` so adapters that want the json modality can keep it. + */ +function extractToolResultContent(content: unknown): { + text: string; + raw?: unknown; +} { + if (typeof content === "string") { + return { text: content }; + } + if (!Array.isArray(content)) { + return { text: "" }; + } + const parts: string[] = []; + for (const block of content) { + if (!isRecord(block)) continue; + if (block.type === "text" && typeof block.text === "string") { + parts.push(block.text); + } else if (block.type === "image") { + parts.push("[image]"); + } else if (typeof block.text === "string") { + parts.push(block.text); + } + } + return { text: parts.join("\n"), raw: content }; +} diff --git a/packages/evals/framework/harnesses/codexAdapter.ts b/packages/evals/framework/harnesses/codexAdapter.ts new file mode 100644 index 000000000..cd313dd72 --- /dev/null +++ b/packages/evals/framework/harnesses/codexAdapter.ts @@ -0,0 +1,223 @@ +/** + * codexAdapter — converts a Codex SDK run into a `Trajectory` the verifier + * can consume. + * + * Input shape: codex emits `ThreadEvent`s — `item.completed` carrying a + * `ThreadItem` (command_execution, file_change, mcp_tool_call, agent_message, + * reasoning, web_search, todo_list, error), plus `turn.completed` for usage. 
+ * We accumulate the full event list upstream in `runCodexAgent` and hand it + * here. + * + * Mapping: + * - command_execution items → tool call named `bash` (or the command's + * leading token), args = { command }, result = aggregated_output, + * ok = exit_code === 0. + * - mcp_tool_call items → tool call named `${server}.${tool}`, args = + * arguments, result = structured_content (json modality) when present, + * else flattened content text. ok = status !== "failed". + * - reasoning items between item.completed events → folded into the next + * tool call's reasoning string. + * - agent_message items → the final answer (last wins). + * - error items → captured as a failed tool call so the verifier sees the + * pattern (a no-op `error` action with the message in toolOutput.error). + * - file_change items → captured as a tool call named `file_change` with the + * change set in args (rare in browser eval contexts). + * - web_search items → captured as a tool call named `web_search` with the + * query in args. + * - todo_list items → not surfaced as tool calls (they aren't actions). + */ +import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand"; +import { + buildTrajectory, + type NormalizedToolCall, + type TrajectoryAdapter, +} from "./trajectoryAdapter.js"; + +export interface CodexRunResult { + /** All ThreadEvents collected from the SDK stream, in arrival order. */ + events: Array>; + /** Last `agent_message` text. Adapter falls back to scanning events otherwise. */ + finalAnswer?: string; + /** Trajectory-level status. Defaults to "complete". */ + status?: Trajectory["status"]; + /** Optional usage to fold into Trajectory.usage. */ + usage?: Partial; + /** Optional run start/end timing. Adapter fills with now-now otherwise. 
*/ + timing?: Partial; +} + +export class CodexTrajectoryAdapter + implements TrajectoryAdapter +{ + fromHarnessResult(result: CodexRunResult, taskSpec: TaskSpec): Trajectory { + const toolCalls: NormalizedToolCall[] = []; + let pendingReasoning = ""; + let latestAgentMessage: string | undefined; + + for (const event of result.events) { + const type = String((event as Record).type ?? ""); + if (type !== "item.completed") continue; + const item = (event as Record).item; + if (!isRecord(item)) continue; + const itemType = String(item.type ?? ""); + + if (itemType === "reasoning" && typeof item.text === "string") { + pendingReasoning = pendingReasoning + ? `${pendingReasoning}\n${item.text}` + : item.text; + continue; + } + + if (itemType === "agent_message" && typeof item.text === "string") { + // Drop buffered reasoning that didn't precede a tool call. + pendingReasoning = ""; + latestAgentMessage = item.text; + continue; + } + + const call = normalizeItem(itemType, item, pendingReasoning); + if (call) { + toolCalls.push(call); + pendingReasoning = ""; + } + } + + const finalAnswer = result.finalAnswer ?? latestAgentMessage; + + return buildTrajectory({ + taskSpec, + toolCalls, + finalAnswer, + status: result.status ?? "complete", + usage: result.usage, + timing: result.timing, + }); + } +} + +export const codexAdapter = new CodexTrajectoryAdapter(); + +function normalizeItem( + itemType: string, + item: Record, + reasoning: string, +): NormalizedToolCall | undefined { + if (itemType === "command_execution") { + const command = typeof item.command === "string" ? item.command : ""; + const exitCode = + typeof item.exit_code === "number" ? item.exit_code : undefined; + const status = String(item.status ?? ""); + const ok = exitCode === 0 || status === "completed"; + const output = + typeof item.aggregated_output === "string" ? item.aggregated_output : ""; + // Use the leading token as the action name (`bash`, `browse`, etc.) 
when + // possible; falls back to `command_execution`. + const leading = command.split(/\s+/, 1)[0] || "command_execution"; + return { + name: leading, + args: { command, ...(exitCode !== undefined && { exit_code: exitCode }) }, + result: output, + ok, + ...(!ok && { + error: + exitCode !== undefined + ? `exit code ${exitCode}` + : `command status ${status}`, + }), + reasoning: reasoning || undefined, + }; + } + + if (itemType === "mcp_tool_call") { + const server = typeof item.server === "string" ? item.server : "mcp"; + const tool = typeof item.tool === "string" ? item.tool : "tool"; + const args = isRecord(item.arguments) + ? (item.arguments as Record) + : {}; + const status = String(item.status ?? ""); + const ok = status !== "failed"; + const mcpResult = isRecord(item.result) ? item.result : undefined; + const structured = mcpResult?.structured_content; + const content = mcpResult?.content; + const errorMessage = isRecord(item.error) + ? typeof item.error.message === "string" + ? item.error.message + : undefined + : undefined; + + // Prefer structured_content (json modality) when present, else flatten + // content blocks to text. Falls back to error message when failed. 
+ let payload: unknown; + if (structured !== undefined && structured !== null) { + payload = structured; + } else if (Array.isArray(content)) { + const parts: string[] = []; + for (const block of content) { + if (!isRecord(block)) continue; + if (block.type === "text" && typeof block.text === "string") { + parts.push(block.text); + } else if (block.type === "image") { + parts.push("[image]"); + } else if (typeof block.text === "string") { + parts.push(block.text); + } + } + payload = parts.join("\n"); + } else if (!ok && errorMessage) { + payload = errorMessage; + } else { + payload = ""; + } + + return { + name: `${server}.${tool}`, + args, + result: payload, + ok, + ...(errorMessage && !ok && { error: errorMessage }), + reasoning: reasoning || undefined, + }; + } + + if (itemType === "web_search") { + const query = typeof item.query === "string" ? item.query : ""; + return { + name: "web_search", + args: { query }, + result: "", + ok: true, + reasoning: reasoning || undefined, + }; + } + + if (itemType === "file_change") { + const changes = Array.isArray(item.changes) ? item.changes : []; + const status = String(item.status ?? ""); + return { + name: "file_change", + args: { changes }, + result: { status, changes }, + ok: status === "completed", + reasoning: reasoning || undefined, + }; + } + + if (itemType === "error") { + const message = + typeof item.message === "string" ? 
item.message : "codex error item"; + return { + name: "error", + args: {}, + result: message, + ok: false, + error: message, + reasoning: reasoning || undefined, + }; + } + + return undefined; +} + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null; +} diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts new file mode 100644 index 000000000..ab6cf4daa --- /dev/null +++ b/packages/evals/framework/harnesses/persistTrajectory.ts @@ -0,0 +1,199 @@ +/** + * persistAdapterTrajectory — writes the on-disk layout used by the Stagehand + * `TrajectoryRecorder.persist()` for trajectories built by external-harness + * adapters (claude_code, codex). + * + * `TrajectoryRecorder` itself is coupled to v3.bus events: it subscribes + * during the agent run, accumulates partial steps, and emits the final + * trajectory on finish(). External harnesses don't go through that bus — + * they produce a complete `Trajectory` synchronously after the harness + * finishes — so this helper writes the same on-disk layout without the + * event-subscription lifecycle. + * + * The on-disk layout matches TrajectoryRecorder.persist(): + * + * / + * ├── task_data.json + * ├── trajectory.json (images referenced by path) + * ├── screenshots/ + * │ ├── probe/.png + * │ └── agent/.png + * ├── scores/ + * │ └── result.json (if `evaluationResult` passed) + * ├── core.log + * └── times.json + * + * Honors `VERIFIER_PERSIST_TRAJECTORIES` for default on/off (matches + * TrajectoryRecorder's convention): + * - "1" / "true": always persist. + * - "0" / "false": never persist. + * - unset: persists when not in CI. 
+ */ +import fs from "node:fs/promises"; +import path from "node:path"; +import type { + EvaluationResult, + ProbeEvidence, + TaskSpec, + Trajectory, +} from "@browserbasehq/stagehand"; + +export interface PersistAdapterTrajectoryOptions { + trajectory: Trajectory; + taskSpec: TaskSpec; + /** EvaluationResult from V3Evaluator.verify(). Written to scores/result.json. */ + evaluationResult?: EvaluationResult; + /** + * Output directory root. Final layout lives at `///`. + * Defaults to `/.trajectories`. + */ + outputRoot?: string; + /** Run identifier (e.g., ISO timestamp). Defaults to a fresh timestamp. */ + runId?: string; + /** + * Override the env-gated persistence default. `true` always persists, + * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES. + */ + persist?: boolean; +} + +export interface PersistAdapterTrajectoryResult { + /** The directory the trajectory was (or would have been) persisted to. */ + directory: string; + /** Whether persistence actually wrote files. */ + persisted: boolean; +} + +function shouldPersist(override: boolean | undefined): boolean { + if (override !== undefined) return override; + const env = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase(); + if (env === "1" || env === "true") return true; + if (env === "0" || env === "false") return false; + return !process.env.CI; +} + +export async function persistAdapterTrajectory( + opts: PersistAdapterTrajectoryOptions, +): Promise { + const runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); + const root = opts.outputRoot ?? 
path.join(process.cwd(), ".trajectories"); + const directory = path.join(root, runId, opts.taskSpec.id); + const persisted = shouldPersist(opts.persist); + + if (!persisted) { + return { directory, persisted: false }; + } + + await fs.mkdir(directory, { recursive: true }); + await fs.mkdir(path.join(directory, "screenshots", "probe"), { + recursive: true, + }); + await fs.mkdir(path.join(directory, "screenshots", "agent"), { + recursive: true, + }); + + // Walk steps and write image bytes to disk, replacing in-memory Buffers with + // path references in trajectory.json. + const serializableSteps: unknown[] = []; + for (const step of opts.trajectory.steps) { + const probe: ProbeEvidence = { ...step.probeEvidence }; + if (probe.screenshot) { + const relPath = `screenshots/probe/${step.index + 1}.png`; + await fs.writeFile(path.join(directory, relPath), probe.screenshot); + probe.screenshotPath = relPath; + delete probe.screenshot; + } + + const imageModalities = step.agentEvidence.modalities.filter( + (m) => m.type === "image", + ); + const multipleImages = imageModalities.length > 1; + let imageSeq = 0; + const agentEvidence = { + modalities: await Promise.all( + step.agentEvidence.modalities.map(async (m) => { + if (m.type !== "image") return m; + const suffix = multipleImages ? `_${imageSeq}` : ""; + const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`; + imageSeq += 1; + await fs.writeFile(path.join(directory, relPath), m.bytes); + return { + type: "image" as const, + imagePath: relPath, + mediaType: m.mediaType, + }; + }), + ), + }; + serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence }); + } + + const serialized = { + ...opts.trajectory, + steps: serializableSteps, + } as unknown; + + await fs.writeFile( + path.join(directory, "trajectory.json"), + JSON.stringify(serialized, null, 2), + ); + + const taskData: Record = { + task: opts.trajectory.task, + status: opts.trajectory.status, + finalAnswer: opts.trajectory.finalAnswer ?? 
null, + }; + if (opts.evaluationResult) { + taskData.result = opts.evaluationResult; + } + await fs.writeFile( + path.join(directory, "task_data.json"), + JSON.stringify(taskData, null, 2), + ); + + await fs.writeFile( + path.join(directory, "times.json"), + JSON.stringify( + { + timing: opts.trajectory.timing, + usage: opts.trajectory.usage, + stepCount: opts.trajectory.steps.length, + }, + null, + 2, + ), + ); + + await fs.mkdir(path.join(directory, "scores"), { recursive: true }); + if (opts.evaluationResult) { + await fs.writeFile( + path.join(directory, "scores", "result.json"), + JSON.stringify(opts.evaluationResult, null, 2), + ); + } + + await fs.writeFile( + path.join(directory, "core.log"), + coreLog(opts.trajectory), + ); + + return { directory, persisted: true }; +} + +function coreLog(trajectory: Trajectory): string { + return ( + trajectory.steps + .map((step) => + JSON.stringify({ + step: step.index, + action: step.actionName, + url: step.probeEvidence.url ?? null, + ok: step.toolOutput.ok, + reasoning: step.reasoning || undefined, + startedAt: step.startedAt, + finishedAt: step.finishedAt, + }), + ) + .join("\n") + "\n" + ); +} diff --git a/packages/evals/framework/harnesses/trajectoryAdapter.ts b/packages/evals/framework/harnesses/trajectoryAdapter.ts new file mode 100644 index 000000000..ec1b02319 --- /dev/null +++ b/packages/evals/framework/harnesses/trajectoryAdapter.ts @@ -0,0 +1,206 @@ +/** + * TrajectoryAdapter — converts an external harness's natural output (a + * provider-shaped event/message log) into the Stagehand `Trajectory` shape + * that V3Evaluator.verify() consumes. + * + * The verifier is harness-agnostic (Trajectory + TaskSpec → EvaluationResult, + * no live browser). That property is what lets non-Stagehand + * harnesses — Claude Code, Codex — be scored with the same rubric pipeline + * we use for Stagehand. Each external harness ships its own + * `TrajectoryAdapter` that maps its tool-call/message log to + * a `Trajectory`. 
The verifier never knows which adapter produced it. + */ +import type { + AgentEvidence, + AgentEvidenceModality, + TaskSpec, + Trajectory, + TrajectoryStep, +} from "@browserbasehq/stagehand"; + +/** + * Adapter interface. Implementations are pure: no I/O, no live browser, no + * mutation of the input result. The same harness result should always produce + * the same Trajectory. + * + * Empty `probeEvidence` on every step is supported — the verifier degrades + * gracefully via the `evidence_insufficient` path. Text-heavy tasks + * (extract, lookup, search) still get a + * meaningful outcome assessment; visual-grounding criteria get flagged as + * evidence_insufficient rather than silently miscredited. + */ +export interface TrajectoryAdapter { + /** + * Convert the external harness's natural output into a Trajectory. Must be + * deterministic given the input. + */ + fromHarnessResult(result: THarnessResult, taskSpec: TaskSpec): Trajectory; +} + +/** + * Normalized tool invocation. Adapters parse harness-specific event/message + * shapes into this canonical structure before mapping to `TrajectoryStep`. + * + * The fields are deliberately permissive — harnesses vary in what they + * surface, and we want a single mapping helper to handle all of them. + */ +export interface NormalizedToolCall { + /** Tool name (e.g., "Bash", "mcp__stagehand_browser__run", "container.exec"). */ + name: string; + /** Tool arguments. Empty object if the harness doesn't surface them. */ + args: Record; + /** + * Tool result. Strings become a text modality; objects become a json modality. + * `undefined` is allowed (e.g., when the tool failed before producing output). + */ + result: unknown; + /** True if the tool reported success. Adapters infer this from harness flags. */ + ok: boolean; + /** Free-form error string when `ok === false`. */ + error?: string; + /** Optional reasoning text the assistant emitted before/with this tool call.
*/ + reasoning?: string; + /** Wall-clock when the call started. Falls back to call site's "now" if absent. */ + startedAt?: string; + /** Wall-clock when the call finished. Falls back to startedAt. */ + finishedAt?: string; +} + +/** + * Convert a `NormalizedToolCall` into a Trajectory `AgentEvidence`. Non-empty strings + * map to a single text modality; Buffers map to an image modality tagged image/png; other objects map to a json modality (plus a text + * modality with the stringified form so plain text-relevance prompts can + * grok structured output). Reasoning text becomes its own text modality — + * the verifier weights reasoning highly when grounding criteria without + * screenshots. A null/undefined result contributes no modality of its own. + * + * `probeEvidence` is intentionally not produced here — external harnesses + * don't emit independent observations natively. See `actionToProbeEvidence` + * if a harness eventually grows that capability. + */ +export function actionToAgentEvidence( + call: Pick, +): AgentEvidence { + const modalities: AgentEvidenceModality[] = []; + + if (call.reasoning) { + modalities.push({ type: "text", content: call.reasoning }); + } + + const result = call.result; + if (result === undefined || result === null) { + return { modalities }; + } + + if (typeof result === "string") { + if (result.length > 0) { + modalities.push({ type: "text", content: result }); + } + } else if (Buffer.isBuffer(result)) { + modalities.push({ + type: "image", + bytes: result, + mediaType: "image/png", + }); + } else if (typeof result === "object") { + // Provide both a JSON modality (preserved structure for prompts that + // accept JSON) and a stringified text modality (cheap fallback for prompts + // that only consume text). Step 2 relevance scoring tolerates duplicates. + modalities.push({ type: "json", content: result }); + const asText = safeStringify(result); + if (asText) { + modalities.push({ type: "text", content: asText }); + } + } else { + // Numbers, booleans, etc. — stringify so the verifier has a text handle.
+ modalities.push({ type: "text", content: String(result) }); + } + + return { modalities }; +} + +/** + * Materialize a `TrajectoryStep` from a normalized tool call. Centralizes the + * step-shape contract so every adapter produces verifier-equivalent steps. A missing `startedAt` falls back to `fallbackTimestamp`; a missing `finishedAt` falls back to `startedAt`. + */ +export function toolCallToTrajectoryStep( + index: number, + call: NormalizedToolCall, + fallbackTimestamp: string, +): TrajectoryStep { + const startedAt = call.startedAt ?? fallbackTimestamp; + const finishedAt = call.finishedAt ?? startedAt; + return { + index, + actionName: call.name, + actionArgs: call.args, + reasoning: call.reasoning ?? "", + agentEvidence: actionToAgentEvidence(call), + // External harnesses don't natively produce screenshots/aria/scroll, so + // probeEvidence stays empty. The verifier handles this via the + // evidence_insufficient path. + probeEvidence: {}, + toolOutput: { + ok: call.ok, + result: call.result, + ...(call.error && { error: call.error }), + }, + startedAt, + finishedAt, + }; +} + +/** + * Options for `buildTrajectory`: a sequence of normalized tool calls + the task + * metadata. Adapters call `buildTrajectory` after parsing their harness's event log. + */ +export interface BuildTrajectoryOptions { + taskSpec: TaskSpec; + toolCalls: NormalizedToolCall[]; + finalAnswer?: string; + status?: Trajectory["status"]; + /** Token usage if the harness surfaced it; missing input/output token counts default to 0, other missing fields are omitted. */ + usage?: Partial; + /** A missing startedAt falls back to the first step's startedAt, then `now`; a missing endedAt falls back to the last step's finishedAt, then startedAt. */ + timing?: Partial; +} + +export function buildTrajectory(opts: BuildTrajectoryOptions): Trajectory { + const now = new Date().toISOString(); + const steps: TrajectoryStep[] = opts.toolCalls.map((call, idx) => + toolCallToTrajectoryStep(idx, call, now), + ); + + const startedAt = opts.timing?.startedAt ?? steps[0]?.startedAt ?? now; + const endedAt = + opts.timing?.endedAt ?? steps[steps.length - 1]?.finishedAt ??
+ startedAt; + + return { + task: opts.taskSpec, + steps, + finalAnswer: opts.finalAnswer, + status: opts.status ?? "complete", + usage: { + input_tokens: opts.usage?.input_tokens ?? 0, + output_tokens: opts.usage?.output_tokens ?? 0, + ...(opts.usage?.reasoning_tokens !== undefined && { + reasoning_tokens: opts.usage.reasoning_tokens, + }), + ...(opts.usage?.cached_input_tokens !== undefined && { + cached_input_tokens: opts.usage.cached_input_tokens, + }), + ...(opts.usage?.inference_time_ms !== undefined && { + inference_time_ms: opts.usage.inference_time_ms, + }), + }, + timing: { startedAt, endedAt }, + }; +} + +function safeStringify(value: unknown): string | undefined { + try { + return JSON.stringify(value); + } catch { + return undefined; + } +} diff --git a/packages/evals/tests/framework/persistTrajectory.test.ts b/packages/evals/tests/framework/persistTrajectory.test.ts new file mode 100644 index 000000000..c281ac996 --- /dev/null +++ b/packages/evals/tests/framework/persistTrajectory.test.ts @@ -0,0 +1,130 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { loadTrajectoryFromDisk } from "@browserbasehq/stagehand"; +import type { + EvaluationResult, + TaskSpec, + Trajectory, +} from "@browserbasehq/stagehand"; +import { describe, expect, it } from "vitest"; + +import { persistAdapterTrajectory } from "../../framework/harnesses/persistTrajectory.js"; + +const PROBE_PNG = Buffer.from("fake-probe-bytes-1234", "utf8"); +const AGENT_PNG = Buffer.from("fake-agent-bytes-5678", "utf8"); + +describe("persistAdapterTrajectory", () => { + it("round-trips probe and agent image evidence through loadTrajectoryFromDisk", async () => { + const tmpRoot = await fs.mkdtemp( + path.join(os.tmpdir(), "persist-adapter-roundtrip-"), + ); + + try { + const taskSpec: TaskSpec = { + id: "roundtrip-task", + instruction: "Test task", + initUrl: "https://example.com", + }; + const evaluationResult: EvaluationResult = { +
outcomeSuccess: true, + processScore: 1, + perCriterion: [], + evidenceInsufficient: [], + }; + const { directory, persisted } = await persistAdapterTrajectory({ + trajectory: makeTrajectory(taskSpec), + taskSpec, + evaluationResult, + outputRoot: tmpRoot, + runId: "roundtrip-run", + persist: true, + }); + + expect(persisted).toBe(true); + await expect(fs.readdir(directory)).resolves.toEqual( + expect.arrayContaining([ + "core.log", + "scores", + "screenshots", + "task_data.json", + "times.json", + "trajectory.json", + ]), + ); + await expect( + fs.readFile(path.join(directory, "screenshots", "probe", "1.png")), + ).resolves.toEqual(PROBE_PNG); + await expect( + fs.readFile(path.join(directory, "screenshots", "agent", "1.png")), + ).resolves.toEqual(AGENT_PNG); + await expect( + fs.readFile(path.join(directory, "scores", "result.json"), "utf8"), + ).resolves.toContain('"outcomeSuccess": true'); + await expect( + fs.readFile(path.join(directory, "task_data.json"), "utf8"), + ).resolves.toContain('"result"'); + + const loaded = await loadTrajectoryFromDisk(directory); + const step = loaded.steps[0]; + const imageModality = step.agentEvidence.modalities.find( + ( + modality, + ): modality is Extract< + (typeof step.agentEvidence.modalities)[number], + { type: "image" } + > => modality.type === "image", + ); + const textModality = step.agentEvidence.modalities.find( + ( + modality, + ): modality is Extract< + (typeof step.agentEvidence.modalities)[number], + { type: "text" } + > => modality.type === "text", + ); + + expect(step.probeEvidence.screenshot).toEqual(PROBE_PNG); + expect(imageModality?.bytes).toEqual(AGENT_PNG); + expect(imageModality?.mediaType).toBe("image/png"); + expect(textModality?.content).toBe("navigated"); + } finally { + await fs.rm(tmpRoot, { recursive: true, force: true }); + } + }); +}); + +function makeTrajectory(task: TaskSpec): Trajectory { + return { + task, + status: "complete", + finalAnswer: "Final answer text.", + usage: { input_tokens:
100, output_tokens: 50 }, + timing: { + startedAt: "2026-05-15T10:00:00.000Z", + endedAt: "2026-05-15T10:01:00.000Z", + }, + steps: [ + { + index: 0, + actionName: "goto", + actionArgs: { url: "https://example.com" }, + reasoning: "Open the page.", + agentEvidence: { + modalities: [ + { type: "text", content: "navigated" }, + { type: "image", bytes: AGENT_PNG, mediaType: "image/png" }, + ], + }, + probeEvidence: { + url: "https://example.com", + screenshot: PROBE_PNG, + }, + toolOutput: { ok: true, result: { url: "https://example.com" } }, + startedAt: "2026-05-15T10:00:00.000Z", + finishedAt: "2026-05-15T10:00:05.000Z", + }, + ], + }; +}