From 9535d8226741c6652d57ec0dda8f1d4c4efe6f50 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 13:55:51 -0700
Subject: [PATCH 1/5] feat(evals): add verifier harness adapters

---
 packages/evals/framework/claudeCodeRunner.ts  | 127 ++++-
 packages/evals/framework/codexRunner.ts       | 129 +++++-
 .../framework/harnesses/claudeCodeAdapter.ts  | 225 +++++++++
 .../evals/framework/harnesses/codexAdapter.ts | 223 +++++++++
 .../framework/harnesses/persistTrajectory.ts  | 185 ++++++++
 .../framework/harnesses/trajectoryAdapter.ts  | 208 +++++++++
 .../evals/scripts/verify-harness-adapters.ts  | 434 ++++++++++++++++++
 7 files changed, 1527 insertions(+), 4 deletions(-)
 create mode 100644 packages/evals/framework/harnesses/claudeCodeAdapter.ts
 create mode 100644 packages/evals/framework/harnesses/codexAdapter.ts
 create mode 100644 packages/evals/framework/harnesses/persistTrajectory.ts
 create mode 100644 packages/evals/framework/harnesses/trajectoryAdapter.ts
 create mode 100644 packages/evals/scripts/verify-harness-adapters.ts
diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index 6ec620233..c306c4256 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -1,9 +1,12 @@
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
 import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
 import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
+import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js";
+import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
+import { verdictToSuccess } from "./verifierAdapter.js";
 
 type ClaudeSdkMessage = Record<string, unknown>;
 type ClaudeQuery = AsyncIterable<ClaudeSdkMessage>;
@@ -16,6 +19,25 @@ export type ClaudeAgentSdk = {
   }) => ClaudeQuery;
 };
 
+export interface ClaudeCodeVerifierConfig {
+  /**
+   * V3 instance used solely as the LLM-client carrier for V3Evaluator. The
+   * instance does NOT need `init()` to have been called — V3Evaluator.verify()
+   * uses only `v3.logger` to construct its LLMProvider.
+   */
+  v3: V3;
+  /** TaskSpec to verify against: id + instruction + optional rubric/initUrl. */
+  taskSpec: TaskSpec;
+  /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
+  dataset: string;
+  /** Overrides the success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
+  successMode?: "outcome" | "process" | "both";
+  /** Overrides the trajectory persistence root. */
+  trajectoryRoot?: string;
+  /** Overrides the run id (defaults to ISO timestamp). */
+  runId?: string;
+}
+
 export interface ClaudeCodeRunnerInput {
   plan: ExternalHarnessTaskPlan;
   model: AvailableModel;
@@ -23,6 +45,15 @@ export interface ClaudeCodeRunnerInput {
   toolAdapter?: PreparedClaudeCodeToolAdapter;
   signal?: AbortSignal;
   sdk?: ClaudeAgentSdk;
+  /**
+   * Optional verifier integration. When provided, the runner builds a
+   * Trajectory from the SDK message stream (via claudeCodeAdapter), runs
+   * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict
+   * into the returned TaskResult ({_success} mode follows EVAL_SUCCESS_MODE).
+   * When omitted, the runner falls back to parsing the legacy EVAL_RESULT
+   * line — preserves current behavior for callers that haven't migrated.
+   */
+  verifier?: ClaudeCodeVerifierConfig;
 }
 
 export interface ParsedClaudeCodeResult {
@@ -124,7 +155,9 @@ export async function runClaudeCodeAgent({
   toolAdapter,
   signal,
   sdk: injectedSdk,
+  verifier,
 }: ClaudeCodeRunnerInput): Promise<TaskResult> {
+  const startedAt = new Date().toISOString();
   const sdk = injectedSdk ?? (await loadClaudeAgentSdk());
   const abortController = new AbortController();
   if (signal) {
@@ -220,8 +253,10 @@ export async function runClaudeCodeAgent({
     parsed.summary ??
     stopReason ??
     (resultText || transcriptText || "Claude Code did not report success");
+  const endedAt = new Date().toISOString();
+  const tokenUsage = extractClaudeCodeTokenUsage(resultMessage);
 
-  return {
+  const baseResult: TaskResult = {
     _success: parsed.success,
     error: !parsed.success ? errorMessage : undefined,
     reasoning: parsed.summary,
@@ -232,6 +267,94 @@ export async function runClaudeCodeAgent({
     logs: logger.getLogs(),
     metrics: buildClaudeCodeMetrics(resultMessage),
   };
+
+  if (!verifier) {
+    return baseResult;
+  }
+
+  // Build a Trajectory from the SDK message stream and run the rubric verifier.
+  try {
+    const trajectory = claudeCodeAdapter.fromHarnessResult(
+      {
+        messages,
+        finalAnswer: parsed.finalAnswer ?? resultText,
+        status: status === "completed" ? "complete" : "error",
+        usage: {
+          input_tokens: tokenUsage.inputTokens,
+          output_tokens: tokenUsage.outputTokens,
+          cached_input_tokens: tokenUsage.cacheReadInputTokens,
+        },
+        timing: { startedAt, endedAt },
+      },
+      verifier.taskSpec,
+    );
+
+    const { V3Evaluator } = await import("@browserbasehq/stagehand");
+    const { RubricCache } = await import("./rubricCache.js");
+    const evaluator = new V3Evaluator(verifier.v3);
+
+    // Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
+    let rubric = verifier.taskSpec.precomputedRubric;
+    if (!rubric) {
+      if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+        rubric = await evaluator.generateRubric(verifier.taskSpec);
+      } else {
+        const cache = new RubricCache({ dataset: verifier.dataset });
+        rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator);
+      }
+    }
+    const hydratedSpec: TaskSpec = {
+      ...verifier.taskSpec,
+      precomputedRubric: rubric,
+    };
+
+    const verdict = await evaluator.verify(trajectory, hydratedSpec);
+    const successMode =
+      verifier.successMode ??
+      ((process.env.EVAL_SUCCESS_MODE as
+        | "outcome"
+        | "process"
+        | "both"
+        | undefined) ||
+        "outcome");
+    const verifiedSuccess = verdictToSuccess(verdict, successMode);
+
+    const { directory: trajectoryDir } = await persistAdapterTrajectory({
+      trajectory,
+      taskSpec: hydratedSpec,
+      verdict,
+      outputRoot: verifier.trajectoryRoot,
+      runId: verifier.runId,
+    });
+
+    logger.log({
+      category: "claude_code",
+      message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`,
+      level: 1,
+    });
+
+    return {
+      ...baseResult,
+      _success: verifiedSuccess,
+      error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
+      outcomeSuccess: verdict.outcomeSuccess,
+      processScore: verdict.processScore,
+      evidenceInsufficient: verdict.evidenceInsufficient,
+      criterionCount: rubric.items.length,
+      stepCount: trajectory.steps.length,
+      trajectoryDir,
+    };
+  } catch (verifyError) {
+    logger.warn({
+      category: "claude_code",
+      message: `verifier integration failed: ${stringifyError(verifyError)}`,
+      level: 0,
+      auxiliary: {
+        error: { value: stringifyError(verifyError), type: "string" },
+      },
+    });
+    return baseResult;
+  }
 }
 
 function buildClaudeCodeMetrics(
diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts
index 4d2844efa..2c5695789 100644
--- a/packages/evals/framework/codexRunner.ts
+++ b/packages/evals/framework/codexRunner.ts
@@ -1,9 +1,12 @@
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
 import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
 import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js";
+import { codexAdapter } from "./harnesses/codexAdapter.js";
+import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
+import { verdictToSuccess } from "./verifierAdapter.js";
 
 type MetricValue = { count: number; value: number };
 type CodexEvent = Record<string, unknown>;
@@ -25,6 +28,25 @@ export type CodexSdk = {
   startThread: (options?: Record<string, unknown>) => CodexThread;
 };
 
+export interface CodexVerifierConfig {
+  /**
+   * V3 instance used solely as the LLM-client carrier for V3Evaluator. The
+   * instance does NOT need `init()` to have been called — V3Evaluator.verify()
+   * uses only `v3.logger` to construct its LLMProvider.
+   */
+  v3: V3;
+  /** TaskSpec to verify against: id + instruction + optional rubric/initUrl. */
+  taskSpec: TaskSpec;
+  /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
+  dataset: string;
+  /** Overrides the success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
+  successMode?: "outcome" | "process" | "both";
+  /** Overrides the trajectory persistence root. */
+  trajectoryRoot?: string;
+  /** Overrides the run id (defaults to ISO timestamp). */
+  runId?: string;
+}
+
 export interface CodexRunnerInput {
   plan: ExternalHarnessTaskPlan;
   model: AvailableModel;
@@ -32,6 +54,15 @@ export interface CodexRunnerInput {
   toolAdapter?: PreparedCodexToolAdapter;
   signal?: AbortSignal;
   sdk?: CodexSdk;
+  /**
+   * Optional verifier integration. When provided, the runner builds a
+   * Trajectory from the codex event stream (via codexAdapter), runs
+   * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict
+   * into the returned TaskResult ({_success} mode follows EVAL_SUCCESS_MODE).
+   * When omitted, the runner falls back to parsing the legacy JSON result —
+   * preserves current behavior for callers that haven't migrated.
+   */
+  verifier?: CodexVerifierConfig;
 }
 
 export interface ParsedCodexResult {
@@ -114,7 +145,9 @@ export async function runCodexAgent({
   toolAdapter,
   signal,
   sdk: injectedSdk,
+  verifier,
 }: CodexRunnerInput): Promise<TaskResult> {
+  const startedAt = new Date().toISOString();
   const sdk = injectedSdk ?? (await loadCodexSdk(toolAdapter?.env));
   const prompt = buildCodexPrompt(plan, toolAdapter?.promptInstructions);
   const events: CodexEvent[] = [];
@@ -191,8 +224,9 @@ export async function runCodexAgent({
       finalResponse ||
       transcriptText ||
       "Codex did not report success");
+  const endedAt = new Date().toISOString();
 
-  return {
+  const baseResult: TaskResult = {
     _success: parsed.success,
     error: !parsed.success ? errorMessage : undefined,
     reasoning: parsed.summary,
@@ -203,6 +237,97 @@ export async function runCodexAgent({
     logs: logger.getLogs(),
     metrics: buildCodexMetrics(usage),
   };
+
+  if (!verifier) {
+    return baseResult;
+  }
+
+  try {
+    const trajectory = codexAdapter.fromHarnessResult(
+      {
+        events,
+        finalAnswer: parsed.finalAnswer ?? finalResponse,
+        status: status === "completed" ? "complete" : "error",
+        usage: {
+          input_tokens: toFiniteNumber(usage?.input_tokens),
+          output_tokens: toFiniteNumber(usage?.output_tokens),
+          ...(usage?.reasoning_output_tokens !== undefined && {
+            reasoning_tokens: toFiniteNumber(usage.reasoning_output_tokens),
+          }),
+          ...(usage?.cached_input_tokens !== undefined && {
+            cached_input_tokens: toFiniteNumber(usage.cached_input_tokens),
+          }),
+        },
+        timing: { startedAt, endedAt },
+      },
+      verifier.taskSpec,
+    );
+
+    const { V3Evaluator } = await import("@browserbasehq/stagehand");
+    const { RubricCache } = await import("./rubricCache.js");
+    const evaluator = new V3Evaluator(verifier.v3);
+
+    let rubric = verifier.taskSpec.precomputedRubric;
+    if (!rubric) {
+      if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+        rubric = await evaluator.generateRubric(verifier.taskSpec);
+      } else {
+        const cache = new RubricCache({ dataset: verifier.dataset });
+        rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator);
+      }
+    }
+    const hydratedSpec: TaskSpec = {
+      ...verifier.taskSpec,
+      precomputedRubric: rubric,
+    };
+
+    const verdict = await evaluator.verify(trajectory, hydratedSpec);
+    const successMode =
+      verifier.successMode ??
+      ((process.env.EVAL_SUCCESS_MODE as
+        | "outcome"
+        | "process"
+        | "both"
+        | undefined) ||
+        "outcome");
+    const verifiedSuccess = verdictToSuccess(verdict, successMode);
+
+    const { directory: trajectoryDir } = await persistAdapterTrajectory({
+      trajectory,
+      taskSpec: hydratedSpec,
+      verdict,
+      outputRoot: verifier.trajectoryRoot,
+      runId: verifier.runId,
+    });
+
+    logger.log({
+      category: "codex",
+      message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`,
+      level: 1,
+    });
+
+    return {
+      ...baseResult,
+      _success: verifiedSuccess,
+      error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
+      outcomeSuccess: verdict.outcomeSuccess,
+      processScore: verdict.processScore,
+      evidenceInsufficient: verdict.evidenceInsufficient,
+      criterionCount: rubric.items.length,
+      stepCount: trajectory.steps.length,
+      trajectoryDir,
+    };
+  } catch (verifyError) {
+    logger.warn({
+      category: "codex",
+      message: `verifier integration failed: ${stringifyError(verifyError)}`,
+      level: 0,
+      auxiliary: {
+        error: { value: stringifyError(verifyError), type: "string" },
+      },
+    });
+    return baseResult;
+  }
 }
 
 function tryParseCodexJson(
diff --git a/packages/evals/framework/harnesses/claudeCodeAdapter.ts b/packages/evals/framework/harnesses/claudeCodeAdapter.ts
new file mode 100644
index 000000000..fd680895b
--- /dev/null
+++ b/packages/evals/framework/harnesses/claudeCodeAdapter.ts
@@ -0,0 +1,225 @@
+/**
+ * claudeCodeAdapter — converts a Claude Code SDK run into a `Trajectory` the
+ * verifier can consume.
+ *
+ * Input shape: the SDK emits a stream of `ClaudeSdkMessage` objects of
+ * different `type`s — assistant (model output, may contain tool_use blocks),
+ * user (tool_result blocks for prior tool_use calls), and result (final
+ * outcome with cost/usage/turn counts). We accumulate the stream upstream in
+ * `runClaudeCodeAgent` and hand the full list here.
+ *
+ * Mapping:
+ *   - Each `tool_use` block in an assistant message becomes one normalized
+ *     tool call, paired with its matching `tool_result` from a subsequent
+ *     user message (by `tool_use_id`).
+ *   - Assistant `text` blocks that precede a tool_use are folded into that
+ *     tool call's `reasoning`. Trailing text after the last tool call (and
+ *     the final result message's `result` string when present) becomes the
+ *     `finalAnswer`.
+ *   - The result message's usage carries forward as the trajectory usage.
+ *
+ * Failure modes:
+ *   - max_turns / sdk_error → status = "error", but we still emit whatever
+ *     steps we have. The verifier flags evidence_insufficient on criteria it
+ *     can't ground.
+ */
+import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+import {
+  buildTrajectory,
+  type NormalizedToolCall,
+  type TrajectoryAdapter,
+} from "./trajectoryAdapter.js";
+
+/** Subset of the harness result we need to build a trajectory. */
+export interface ClaudeCodeRunResult {
+  /** Raw SDK message stream collected during execution, in arrival order. */
+  messages: Array<Record<string, unknown>>;
+  /** Final answer captured upstream; when omitted, the result message text and then trailing assistant text are used. */
+  finalAnswer?: string;
+  /** Trajectory-level status. Defaults to "complete". */
+  status?: Trajectory["status"];
+  /** Optional usage to fold into Trajectory.usage. */
+  usage?: Partial<Trajectory["usage"]>;
+  /** Optional run start/end timing. Adapter fills with now-now otherwise. */
+  timing?: Partial<Trajectory["timing"]>;
+}
+
+interface ToolUseBlock {
+  /** `id` of the tool_use block (empty string if absent); tool_result blocks reference it as `tool_use_id`. */
+  id: string;
+  name: string;
+  input: Record<string, unknown>;
+  /** Assistant text accumulated before this tool call (becomes `reasoning`). */
+  reasoningPrefix: string;
+}
+
+interface ToolResultBlock {
+  toolUseId: string;
+  /** Text of the result; block arrays are joined with newlines, images shown as "[image]". */
+  text: string;
+  /** Original content array, kept alongside `text` when the result was structured. */
+  raw?: unknown;
+  isError: boolean;
+}
+
+/**
+ * Pairs `tool_use` blocks with their `tool_result` (by id) into tool calls.
+ */
+export class ClaudeCodeTrajectoryAdapter
+  implements TrajectoryAdapter<ClaudeCodeRunResult>
+{
+  fromHarnessResult(
+    result: ClaudeCodeRunResult,
+    taskSpec: TaskSpec,
+  ): Trajectory {
+    const toolUses: ToolUseBlock[] = [];
+    const toolResults = new Map<string, ToolResultBlock>();
+    const trailingTextParts: string[] = [];
+    let resultMessageText: string | undefined;
+
+    let pendingReasoning = "";
+
+    for (const message of result.messages) {
+      const type = String(message.type ?? "");
+      const inner = message.message;
+      if (type === "result") {
+        const r = message.result;
+        if (typeof r === "string" && r.trim()) {
+          resultMessageText = r;
+        }
+        continue;
+      }
+      if (!isRecord(inner)) continue;
+      const content = inner.content;
+      if (!Array.isArray(content)) {
+        if (typeof content === "string" && type === "assistant") {
+          pendingReasoning = appendText(pendingReasoning, content);
+          trailingTextParts.push(content);
+        }
+        continue;
+      }
+
+      if (type === "assistant") {
+        for (const block of content) {
+          if (!isRecord(block)) continue;
+          const blockType = String(block.type ?? "");
+          if (blockType === "text" && typeof block.text === "string") {
+            pendingReasoning = appendText(pendingReasoning, block.text);
+            trailingTextParts.push(block.text);
+            continue;
+          }
+          if (blockType === "tool_use") {
+            const id = typeof block.id === "string" ? block.id : "";
+            const name = typeof block.name === "string" ? block.name : "tool";
+            const input = isRecord(block.input)
+              ? (block.input as Record<string, unknown>)
+              : {};
+            toolUses.push({
+              id,
+              name,
+              input,
+              reasoningPrefix: pendingReasoning,
+            });
+            // Once a tool_use lands, the buffered text belonged to its reasoning;
+            // future tool calls start with empty reasoning unless more text arrives.
+            pendingReasoning = "";
+            // The text we just folded into reasoning is not the final answer.
+            // Drop it from trailingTextParts.
+            trailingTextParts.length = 0;
+          }
+        }
+        continue;
+      }
+
+      if (type === "user") {
+        for (const block of content) {
+          if (!isRecord(block)) continue;
+          const blockType = String(block.type ?? "");
+          if (blockType !== "tool_result") continue;
+          const toolUseId =
+            typeof block.tool_use_id === "string" ? block.tool_use_id : "";
+          const isError = block.is_error === true;
+          const { text, raw } = extractToolResultContent(block.content);
+          toolResults.set(toolUseId, { toolUseId, text, raw, isError });
+        }
+        continue;
+      }
+    }
+
+    const toolCalls: NormalizedToolCall[] = toolUses.map((use) => {
+      const matched = toolResults.get(use.id);
+      const ok = matched ? !matched.isError : true;
+      const resultPayload =
+        matched?.raw !== undefined ? matched.raw : (matched?.text ?? "");
+      return {
+        name: use.name,
+        args: use.input,
+        result: resultPayload,
+        ok,
+        ...(matched?.isError && matched.text && { error: matched.text }),
+        reasoning: use.reasoningPrefix.trim() || undefined,
+      };
+    });
+
+    const trailing = trailingTextParts.join("\n").trim();
+    // `||` (not `??`) so a blank finalAnswer from the caller still falls back
+    // to the result message text and then to trailing assistant text.
+    const finalAnswer =
+      result.finalAnswer ||
+      resultMessageText ||
+      (trailing.length > 0 ? trailing : undefined);
+
+    return buildTrajectory({
+      taskSpec,
+      toolCalls,
+      finalAnswer,
+      status: result.status ?? "complete",
+      usage: result.usage,
+      timing: result.timing,
+    });
+  }
+}
+
+export const claudeCodeAdapter = new ClaudeCodeTrajectoryAdapter();
+
+function appendText(buffer: string, addition: string): string {
+  // Either side being empty means no separator is needed.
+  if (addition === "") return buffer;
+  return buffer === "" ? addition : `${buffer}\n${addition}`;
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === "object";
+}
+
+/**
+ * Flattens a tool_result `content` value for the trajectory.
+ *
+ *   - string          → returned as `text`, no `raw`
+ *   - array of blocks → text blocks joined with "\n", images shown as
+ *                       "[image]"; the untouched array is returned as `raw`
+ *   - anything else   → empty `text`, no `raw`
+ */
+function extractToolResultContent(content: unknown): {
+  text: string;
+  raw?: unknown;
+} {
+  if (Array.isArray(content)) {
+    const pieces: string[] = [];
+    for (const entry of content) {
+      if (!isRecord(entry)) continue;
+      if (entry.type === "image") {
+        // Image payloads are not inlined; a placeholder marks their position.
+        pieces.push("[image]");
+        continue;
+      }
+      // Covers `type: "text"` blocks and any other block carrying a `text`.
+      if (typeof entry.text === "string") pieces.push(entry.text);
+    }
+    return { text: pieces.join("\n"), raw: content };
+  }
+  if (typeof content === "string") {
+    return { text: content };
+  }
+  return { text: "" };
+}
diff --git a/packages/evals/framework/harnesses/codexAdapter.ts b/packages/evals/framework/harnesses/codexAdapter.ts
new file mode 100644
index 000000000..cd313dd72
--- /dev/null
+++ b/packages/evals/framework/harnesses/codexAdapter.ts
@@ -0,0 +1,223 @@
+/**
+ * codexAdapter — converts a Codex SDK run into a `Trajectory` the verifier
+ * can consume.
+ *
+ * Input shape: codex emits `ThreadEvent`s — `item.completed` carrying a
+ * `ThreadItem` (command_execution, file_change, mcp_tool_call, agent_message,
+ * reasoning, web_search, todo_list, error), plus `turn.completed` for usage.
+ * We accumulate the full event list upstream in `runCodexAgent` and hand it
+ * here.
+ *
+ * Mapping:
+ *   - command_execution items → tool call named `bash` (or the command's
+ *     leading token), args = { command }, result = aggregated_output,
+ *     ok = exit_code === 0.
+ *   - mcp_tool_call items → tool call named `${server}.${tool}`, args =
+ *     arguments, result = structured_content (json modality) when present,
+ *     else flattened content text. ok = status !== "failed".
+ *   - reasoning items → buffered and folded into the next tool call's
+ *     reasoning string (discarded if an agent_message arrives first).
+ *   - agent_message items → the final answer (last wins).
+ *   - error items → captured as a failed tool call so the verifier sees the
+ *     pattern (a no-op `error` action with the message in toolOutput.error).
+ *   - file_change items → captured as a tool call named `file_change` with the
+ *     change set in args (rare in browser eval contexts).
+ *   - web_search items → captured as a tool call named `web_search` with the
+ *     query in args.
+ *   - todo_list items → not surfaced as tool calls (they aren't actions).
+ */
+import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+import {
+  buildTrajectory,
+  type NormalizedToolCall,
+  type TrajectoryAdapter,
+} from "./trajectoryAdapter.js";
+
+export interface CodexRunResult {
+  /** All ThreadEvents collected from the SDK stream, in arrival order. */
+  events: Array<Record<string, unknown>>;
+  /** Final answer captured upstream; when omitted, the last `agent_message` text in `events` is used. */
+  finalAnswer?: string;
+  /** Trajectory-level status. Defaults to "complete". */
+  status?: Trajectory["status"];
+  /** Optional usage to fold into Trajectory.usage. */
+  usage?: Partial<Trajectory["usage"]>;
+  /** Optional run start/end timing. Adapter fills with now-now otherwise. */
+  timing?: Partial<Trajectory["timing"]>;
+}
+
+/** Folds completed codex thread items into a verifier `Trajectory`. */
+export class CodexTrajectoryAdapter
+  implements TrajectoryAdapter<CodexRunResult>
+{
+  fromHarnessResult(result: CodexRunResult, taskSpec: TaskSpec): Trajectory {
+    const toolCalls: NormalizedToolCall[] = [];
+    let pendingReasoning = "";
+    let latestAgentMessage: string | undefined;
+
+    for (const event of result.events) {
+      const type = String(event.type ?? "");
+      if (type !== "item.completed") continue;
+      const item = event.item;
+      if (!isRecord(item)) continue;
+      const itemType = String(item.type ?? "");
+
+      if (itemType === "reasoning" && typeof item.text === "string") {
+        pendingReasoning = pendingReasoning
+          ? `${pendingReasoning}\n${item.text}`
+          : item.text;
+        continue;
+      }
+
+      if (itemType === "agent_message" && typeof item.text === "string") {
+        // Drop buffered reasoning that didn't precede a tool call.
+        pendingReasoning = "";
+        latestAgentMessage = item.text;
+        continue;
+      }
+
+      const call = normalizeItem(itemType, item, pendingReasoning);
+      if (call) {
+        toolCalls.push(call);
+        pendingReasoning = "";
+      }
+    }
+
+    return buildTrajectory({
+      taskSpec,
+      toolCalls,
+      // `||` rather than `??` so a blank caller-supplied answer falls back too.
+      finalAnswer: result.finalAnswer || latestAgentMessage,
+      status: result.status ?? "complete",
+      usage: result.usage,
+      timing: result.timing,
+    });
+  }
+}
+
+export const codexAdapter = new CodexTrajectoryAdapter();
+
+/** Maps one completed codex item to a tool call; undefined if unhandled. */
+function normalizeItem(
+  itemType: string,
+  item: Record<string, unknown>,
+  reasoning: string,
+): NormalizedToolCall | undefined {
+  if (itemType === "command_execution") {
+    const command = typeof item.command === "string" ? item.command : "";
+    const exitCode =
+      typeof item.exit_code === "number" ? item.exit_code : undefined;
+    const status = String(item.status ?? "");
+    // A reported exit code is authoritative; status is only the fallback.
+    const ok = exitCode !== undefined ? exitCode === 0 : status === "completed";
+    const output =
+      typeof item.aggregated_output === "string" ? item.aggregated_output : "";
+    // Name the action after the command's leading token when there is one.
+    const leading = command.split(/\s+/, 1)[0] || "command_execution";
+    return {
+      name: leading,
+      args: { command, ...(exitCode !== undefined && { exit_code: exitCode }) },
+      result: output,
+      ok,
+      ...(!ok && {
+        error:
+          exitCode !== undefined
+            ? `exit code ${exitCode}`
+            : `command status ${status}`,
+      }),
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "mcp_tool_call") {
+    const server = typeof item.server === "string" ? item.server : "mcp";
+    const tool = typeof item.tool === "string" ? item.tool : "tool";
+    const args = isRecord(item.arguments)
+      ? (item.arguments as Record<string, unknown>)
+      : {};
+    const status = String(item.status ?? "");
+    const ok = status !== "failed";
+    const mcpResult = isRecord(item.result) ? item.result : undefined;
+    const structured = mcpResult?.structured_content;
+    const content = mcpResult?.content;
+    const errorMessage =
+      isRecord(item.error) && typeof item.error.message === "string"
+        ? item.error.message
+        : undefined;
+
+    // Prefer structured_content (json modality) when present, else flatten
+    // content blocks to text. Falls back to error message when failed.
+    let payload: unknown;
+    if (structured !== undefined && structured !== null) {
+      payload = structured;
+    } else if (Array.isArray(content)) {
+      const parts: string[] = [];
+      for (const block of content) {
+        if (!isRecord(block)) continue;
+        if (block.type === "text" && typeof block.text === "string") {
+          parts.push(block.text);
+        } else if (block.type === "image") {
+          parts.push("[image]");
+        } else if (typeof block.text === "string") {
+          parts.push(block.text);
+        }
+      }
+      payload = parts.join("\n");
+    } else if (!ok && errorMessage) {
+      payload = errorMessage;
+    } else {
+      payload = "";
+    }
+
+    return {
+      name: `${server}.${tool}`,
+      args,
+      result: payload,
+      ok,
+      ...(errorMessage && !ok && { error: errorMessage }),
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "web_search") {
+    const query = typeof item.query === "string" ? item.query : "";
+    return {
+      name: "web_search",
+      args: { query },
+      result: "",
+      ok: true,
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "file_change") {
+    const changes = Array.isArray(item.changes) ? item.changes : [];
+    const status = String(item.status ?? "");
+    return {
+      name: "file_change",
+      args: { changes },
+      result: { status, changes },
+      ok: status === "completed",
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "error") {
+    const message =
+      typeof item.message === "string" ? item.message : "codex error item";
+    return {
+      name: "error",
+      args: {},
+      result: message,
+      ok: false,
+      error: message,
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  return undefined;
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === "object";
+}
diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts
new file mode 100644
index 000000000..ffa3dd868
--- /dev/null
+++ b/packages/evals/framework/harnesses/persistTrajectory.ts
@@ -0,0 +1,185 @@
+/**
+ * persistAdapterTrajectory — writes the on-disk layout used by the Stagehand
+ * `TrajectoryRecorder.persist()` for trajectories built by external-harness
+ * adapters (claude_code, codex).
+ *
+ * `TrajectoryRecorder` itself is coupled to v3.bus events: it subscribes
+ * during the agent run, accumulates partial steps, and emits the final
+ * trajectory on finish(). External harnesses don't go through that bus —
+ * they produce a complete `Trajectory` synchronously after the harness
+ * finishes — so this helper writes the same on-disk layout without the
+ * event-subscription lifecycle.
+ *
+ * The on-disk layout matches microsoft/fara's example_trajectory/ so saved
+ * trajectories can be cross-validated against verify_trajectories.py without
+ * a format conversion step:
+ *
+ *   <dir>/
+ *     ├── task_data.json
+ *     ├── trajectory.json   (image bytes as base64, screenshots referenced by path)
+ *     ├── screenshot_<N>.png (only if probeEvidence carries Buffer screenshots — external
+ *     │                       harnesses don't natively, but the helper supports it)
+ *     ├── scores/
+ *     │   └── mmrubric_v1.json  (if `verdict` passed)
+ *     ├── core.log
+ *     └── times.json
+ *
+ * Honors `VERIFIER_PERSIST_TRAJECTORIES` for default on/off (matches
+ * TrajectoryRecorder's convention):
+ *   - "1" / "true": always persist.
+ *   - "0" / "false": never persist.
+ *   - unset: persists when not in CI.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import type {
+  ProbeEvidence,
+  TaskSpec,
+  Trajectory,
+  Verdict,
+} from "@browserbasehq/stagehand";
+
+export interface PersistAdapterTrajectoryOptions {
+  trajectory: Trajectory;
+  taskSpec: TaskSpec;
+  /** Verdict to store in scores/mmrubric_v1.json and inside task_data.json. */
+  verdict?: Verdict;
+  /**
+   * Output directory root. Files land in `<outputRoot>/<runId>/<taskSpec.id>/`.
+   * Defaults to `<cwd>/.trajectories`.
+   */
+  outputRoot?: string;
+  /** Run identifier. Defaults to the current ISO time with `:`/`.` → `-`. */
+  runId?: string;
+  /**
+   * Override the env-gated default. `true` always persists, `false` never does,
+   * `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES (else: persist unless CI).
+   */
+  persist?: boolean;
+}
+
+export interface PersistAdapterTrajectoryResult {
+  /** Target directory; computed and returned even when nothing is written. */
+  directory: string;
+  /** True only when the files were actually written to `directory`. */
+  persisted: boolean;
+}
+
+function shouldPersist(override: boolean | undefined): boolean {
+  // Precedence: explicit option, then the env flag, then "persist unless CI".
+  if (override !== undefined) return override;
+  const flag = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase() ?? "";
+  if (["1", "true"].includes(flag)) return true;
+  return ["0", "false"].includes(flag) ? false : !process.env.CI;
+}
+
+export async function persistAdapterTrajectory(
+  opts: PersistAdapterTrajectoryOptions,
+): Promise<PersistAdapterTrajectoryResult> {
+  const runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-");
+  const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
+  const directory = path.join(root, runId, opts.taskSpec.id);
+  const persisted = shouldPersist(opts.persist);
+  // Persistence disabled: return the would-be path without touching disk.
+  if (!persisted) {
+    return { directory, persisted: false };
+  }
+
+  await fs.mkdir(directory, { recursive: true });
+
+  // Walk steps and (when a Buffer screenshot is present, which is rare for
+  // external harnesses) write it to disk + replace with a path reference.
+  // Image modalities in agentEvidence get base64-encoded inline to keep
+  // trajectory.json human-readable JSON.
+  const serializableSteps: unknown[] = [];
+  for (const step of opts.trajectory.steps) {
+    const probe: ProbeEvidence = { ...step.probeEvidence };
+    if (probe.screenshot) {
+      const filename = `screenshot_${step.index + 1}.png`;
+      await fs.writeFile(path.join(directory, filename), probe.screenshot);
+      probe.screenshotPath = filename;
+      delete probe.screenshot;
+    }
+    const agentEvidence = {
+      modalities: step.agentEvidence.modalities.map((m) =>
+        m.type === "image"
+          ? {
+              type: "image",
+              bytesBase64: m.bytes.toString("base64"),
+              mediaType: m.mediaType,
+            }
+          : m,
+      ),
+    };
+    serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence });
+  }
+  // Re-wrap the trajectory around the Buffer-free step copies built above.
+  const serialized = {
+    ...opts.trajectory,
+    steps: serializableSteps,
+  } as unknown;
+
+  await fs.writeFile(
+    path.join(directory, "trajectory.json"),
+    JSON.stringify(serialized, null, 2),
+  );
+  // task_data.json: task, status, final answer, and the verdict if provided.
+  const taskData: Record<string, unknown> = {
+    task: opts.trajectory.task,
+    status: opts.trajectory.status,
+    finalAnswer: opts.trajectory.finalAnswer ?? null,
+  };
+  if (opts.verdict) {
+    taskData.verdict = opts.verdict;
+  }
+  await fs.writeFile(
+    path.join(directory, "task_data.json"),
+    JSON.stringify(taskData, null, 2),
+  );
+  // times.json: run timing, token usage, and the number of steps.
+  await fs.writeFile(
+    path.join(directory, "times.json"),
+    JSON.stringify(
+      {
+        timing: opts.trajectory.timing,
+        usage: opts.trajectory.usage,
+        stepCount: opts.trajectory.steps.length,
+      },
+      null,
+      2,
+    ),
+  );
+  // scores/ always exists; mmrubric_v1.json is written only with a verdict.
+  await fs.mkdir(path.join(directory, "scores"), { recursive: true });
+  if (opts.verdict) {
+    await fs.writeFile(
+      path.join(directory, "scores", "mmrubric_v1.json"),
+      JSON.stringify(opts.verdict, null, 2),
+    );
+  }
+  // core.log: one JSON line per step, produced by coreLog().
+  await fs.writeFile(
+    path.join(directory, "core.log"),
+    coreLog(opts.trajectory),
+  );
+
+  return { directory, persisted: true };
+}
+
+function coreLog(trajectory: Trajectory): string {
+  // JSONL: one object per step; an empty `reasoning` is left out of the line.
+  const lines: string[] = [];
+  for (const step of trajectory.steps) {
+    const entry = {
+      step: step.index,
+      action: step.actionName,
+      url: step.probeEvidence.url ?? null,
+      ok: step.toolOutput.ok,
+      reasoning: step.reasoning || undefined,
+      startedAt: step.startedAt,
+      finishedAt: step.finishedAt,
+    };
+    lines.push(JSON.stringify(entry));
+  }
+  return `${lines.join("\n")}\n`;
+}
diff --git a/packages/evals/framework/harnesses/trajectoryAdapter.ts b/packages/evals/framework/harnesses/trajectoryAdapter.ts
new file mode 100644
index 000000000..cb3afac65
--- /dev/null
+++ b/packages/evals/framework/harnesses/trajectoryAdapter.ts
@@ -0,0 +1,208 @@
+/**
+ * TrajectoryAdapter — converts an external harness's natural output (a
+ * provider-shaped event/message log) into the Stagehand `Trajectory` shape
+ * that V3Evaluator.verify() consumes.
+ *
+ * The verifier is harness-agnostic (Trajectory + TaskSpec → Verdict; pure
+ * function, no live browser). That property is what lets non-Stagehand
+ * harnesses — Claude Code, Codex — be scored with the same rubric pipeline
+ * we use for Stagehand. Each external harness ships its own
+ * `TrajectoryAdapter<THarnessResult>` that maps its tool-call/message log to
+ * a `Trajectory`. The verifier never knows which adapter produced it.
+ *
+ * @see verifier-rewrite-plan.html §07 "External harness adapters" (author-local: ~/.claude/plans/)
+ */
+import type {
+  AgentEvidence,
+  AgentEvidenceModality,
+  TaskSpec,
+  Trajectory,
+  TrajectoryStep,
+} from "@browserbasehq/stagehand";
+
+/**
+ * Adapter interface. Implementations are pure: no I/O, no live browser, no
+ * mutation of the input result. The same harness result should always produce
+ * the same Trajectory.
+ *
+ * Empty `probeEvidence` on every step is supported — the verifier degrades
+ * gracefully via the `evidence_insufficient` path (paper's uncontrollable-
+ * failure principle). Text-heavy tasks (extract, lookup, search) still get a
+ * meaningful outcome verdict; visual-grounding criteria get flagged as
+ * evidence_insufficient rather than silently miscredited.
+ */
+export interface TrajectoryAdapter<THarnessResult> {
+  /**
+   * Convert the harness's natural output into a Trajectory. Deterministic given
+   * the input, except for the "now" timestamps `buildTrajectory` may fill in.
+   */
+  fromHarnessResult(result: THarnessResult, taskSpec: TaskSpec): Trajectory;
+}
+
+/**
+ * Normalized tool invocation. Adapters parse harness-specific event/message
+ * shapes into this canonical structure before mapping to `TrajectoryStep`.
+ *
+ * The fields are deliberately permissive — harnesses vary in what they
+ * surface, and we want a single mapping helper to handle all of them.
+ */
+export interface NormalizedToolCall {
+  /** Tool name (e.g., "Bash", "mcp__stagehand_browser__run", "container.exec"). */
+  name: string;
+  /** Tool arguments. Empty object if the harness doesn't surface them. */
+  args: Record<string, unknown>;
+  /**
+   * Tool result. Strings → text modality; Buffers → image; objects → json + text.
+   * `undefined`/`null` add no modality (e.g., the tool failed before any output).
+   */
+  result: unknown;
+  /** True if the tool reported success. Adapters infer this from harness flags. */
+  ok: boolean;
+  /** Free-form error string when `ok === false`. */
+  error?: string;
+  /** Optional reasoning text the assistant emitted before/with this tool call. */
+  reasoning?: string;
+  /** Wall-clock when the call started. Falls back to a caller-supplied timestamp. */
+  startedAt?: string;
+  /** Wall-clock when the call finished. Falls back to startedAt. */
+  finishedAt?: string;
+}
+
+/**
+ * Convert a `NormalizedToolCall` into a Trajectory `AgentEvidence`.
+ *
+ * Modalities are appended in this order:
+ *   1. `reasoning`, when non-empty, as a text modality;
+ *   2. the result — non-empty string → text; Buffer → image (labelled
+ *      "image/png"); any other object → json, followed by a JSON-text mirror
+ *      when it serializes; remaining primitives → text via `String()`.
+ * `undefined`, `null`, and empty-string results add nothing.
+ *
+ * `probeEvidence` is intentionally not produced here; callers set it.
+ */
+export function actionToAgentEvidence(
+  call: Pick<NormalizedToolCall, "result" | "reasoning">,
+): AgentEvidence {
+  const { result, reasoning } = call;
+  const modalities: AgentEvidenceModality[] = [];
+  const addText = (content: string): void => {
+    modalities.push({ type: "text", content });
+  };
+
+  if (reasoning) addText(reasoning);
+
+  switch (typeof result) {
+    case "undefined":
+      break;
+    case "string":
+      if (result.length > 0) addText(result);
+      break;
+    case "object": {
+      if (result === null) break;
+      if (Buffer.isBuffer(result)) {
+        modalities.push({
+          type: "image",
+          bytes: result,
+          mediaType: "image/png",
+        });
+        break;
+      }
+      // JSON for structure-aware prompts plus a text mirror for text-only ones.
+      modalities.push({ type: "json", content: result });
+      const mirror = safeStringify(result);
+      if (mirror) addText(mirror);
+      break;
+    }
+    default:
+      // Numbers, booleans, etc. — stringify so the verifier has a text handle.
+      addText(String(result));
+  }
+
+  return { modalities };
+}
+
+/**
+ * Materialize a `TrajectoryStep` from a normalized tool call, so every adapter
+ * emits the same step shape.
+ *
+ * Timestamps fall back in a chain: `startedAt` defaults to
+ * `fallbackTimestamp`, and `finishedAt` defaults to `startedAt`.
+ */
+export function toolCallToTrajectoryStep(
+  index: number,
+  call: NormalizedToolCall,
+  fallbackTimestamp: string,
+): TrajectoryStep {
+  const { name, args, reasoning, ok, result, error } = call;
+  const startedAt = call.startedAt ?? fallbackTimestamp;
+  // `error` is attached only when the call carries a non-empty message.
+  const toolOutput = error ? { ok, result, error } : { ok, result };
+
+  return {
+    index,
+    actionName: name,
+    actionArgs: args,
+    reasoning: reasoning ?? "",
+    agentEvidence: actionToAgentEvidence(call),
+    // Always empty: this helper has no screenshot/aria/scroll data to attach.
+    probeEvidence: {},
+    toolOutput,
+    startedAt,
+    finishedAt: call.finishedAt ?? startedAt,
+  };
+}
+
+/** Inputs to `buildTrajectory`: the task plus whatever the harness surfaced. */
+export interface BuildTrajectoryOptions {
+  taskSpec: TaskSpec;
+  toolCalls: NormalizedToolCall[];
+  finalAnswer?: string;
+  /** Defaults to "complete". */
+  status?: Trajectory["status"];
+  /** Token usage, if surfaced; missing input/output counts are stored as 0. */
+  usage?: Partial<Trajectory["usage"]>;
+  /** Run bounds; default to the first step's start and the last step's finish. */
+  timing?: Partial<Trajectory["timing"]>;
+}
+
+/**
+ * Build a `Trajectory` from normalized tool calls plus the task metadata;
+ * adapters call this after parsing their harness's event log. Calls lacking
+ * timestamps are stamped with the time of this invocation.
+ */
+export function buildTrajectory(opts: BuildTrajectoryOptions): Trajectory {
+  const now = new Date().toISOString();
+  const steps: TrajectoryStep[] = opts.toolCalls.map((call, position) =>
+    toolCallToTrajectoryStep(position, call, now),
+  );
+  const lastStep = steps[steps.length - 1];
+  const startedAt = opts.timing?.startedAt ?? steps[0]?.startedAt ?? now;
+  const endedAt = opts.timing?.endedAt ?? lastStep?.finishedAt ?? startedAt;
+
+  // Optional counters are copied only when reported, so absent stays absent.
+  const reported: Partial<Trajectory["usage"]> = opts.usage ?? {};
+  const { reasoning_tokens, cached_input_tokens, inference_time_ms } = reported;
+
+  return {
+    task: opts.taskSpec,
+    steps,
+    finalAnswer: opts.finalAnswer,
+    status: opts.status ?? "complete",
+    usage: {
+      input_tokens: reported.input_tokens ?? 0,
+      output_tokens: reported.output_tokens ?? 0,
+      ...(reasoning_tokens !== undefined && { reasoning_tokens }),
+      ...(cached_input_tokens !== undefined && { cached_input_tokens }),
+      ...(inference_time_ms !== undefined && { inference_time_ms }),
+    },
+    timing: { startedAt, endedAt },
+  };
+}
+
+function safeStringify(value: unknown): string | undefined {
+  try {
+    return JSON.stringify(value); // can itself be undefined, e.g. for a function
+  } catch {
+    return undefined; // stringify threw (e.g. circular structure or BigInt)
+  }
+}
diff --git a/packages/evals/scripts/verify-harness-adapters.ts b/packages/evals/scripts/verify-harness-adapters.ts
new file mode 100644
index 000000000..6c949857d
--- /dev/null
+++ b/packages/evals/scripts/verify-harness-adapters.ts
@@ -0,0 +1,434 @@
+/**
+ * External-harness adapter smoke test — verifies the claudeCodeAdapter and
+ * codexAdapter end-to-end without launching a browser.
+ *
+ * Hand-rolls synthetic harness results (tool-use messages for Claude Code,
+ * ThreadEvents for Codex) and asserts:
+ *   1. The produced Trajectory has the expected step count.
+ *   2. Text and JSON modalities are populated where they should be.
+ *   3. finalAnswer is captured.
+ *   4. status === "complete".
+ *
+ * Bonus (gated on GEMINI_API_KEY): feeds the synthetic trajectory into a real
+ * V3Evaluator.verify() with a tiny synthetic rubric, then prints the verdict.
+ *
+ * Run via:  pnpm exec tsx packages/evals/scripts/verify-harness-adapters.ts
+ */
+import assert from "node:assert/strict";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { claudeCodeAdapter } from "../framework/harnesses/claudeCodeAdapter.js";
+import { codexAdapter } from "../framework/harnesses/codexAdapter.js";
+import { persistAdapterTrajectory } from "../framework/harnesses/persistTrajectory.js";
+import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+
+async function testClaudeCodeAdapter(taskSpec: TaskSpec): Promise<Trajectory> {
+  // Hand-rolled SDK message stream that mirrors what the Claude Agent SDK
+  // emits for a two-tool-call session with reasoning between them.
+  const messages: Array<Record<string, unknown>> = [
+    {
+      type: "assistant",
+      message: {
+        content: [
+          {
+            type: "text",
+            text: "I'll start by navigating to the United Airlines website.",
+          },
+          {
+            type: "tool_use",
+            id: "tu_1",
+            name: "browse",
+            input: { command: "browse navigate https://www.united.com" },
+          },
+        ],
+      },
+    },
+    {
+      type: "user",
+      message: {
+        content: [
+          {
+            type: "tool_result",
+            tool_use_id: "tu_1",
+            content: [
+              { type: "text", text: "Navigated to https://www.united.com" },
+            ],
+            is_error: false,
+          },
+        ],
+      },
+    },
+    {
+      type: "assistant",
+      message: {
+        content: [
+          {
+            type: "text",
+            text: "Now I'll look up the flight prices.",
+          },
+          {
+            type: "tool_use",
+            id: "tu_2",
+            name: "browse",
+            input: { command: "browse extract { economy, business } prices" },
+          },
+        ],
+      },
+    },
+    {
+      type: "user",
+      message: {
+        content: [
+          {
+            type: "tool_result",
+            tool_use_id: "tu_2",
+            content: [
+              {
+                type: "text",
+                text: '{"economy":"$1,200","business":"$5,200"}',
+              },
+            ],
+            is_error: false,
+          },
+        ],
+      },
+    },
+    {
+      type: "assistant",
+      message: {
+        content: [
+          {
+            type: "text",
+            text: "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).",
+          },
+        ],
+      },
+    },
+    {
+      type: "result",
+      subtype: "success",
+      result:
+        "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).",
+      duration_ms: 1234,
+      num_turns: 3,
+    },
+  ];
+  // Run the adapter under test on the synthetic message stream.
+  const trajectory = claudeCodeAdapter.fromHarnessResult(
+    {
+      messages,
+      status: "complete",
+      usage: { input_tokens: 100, output_tokens: 80 },
+    },
+    taskSpec,
+  );
+  // Shape checks: step count, action names, status, and final answer.
+  assert.equal(
+    trajectory.steps.length,
+    2,
+    `expected 2 steps from 2 tool_use blocks, got ${trajectory.steps.length}`,
+  );
+  assert.equal(trajectory.steps[0].actionName, "browse");
+  assert.equal(trajectory.steps[1].actionName, "browse");
+  assert.equal(trajectory.status, "complete");
+  assert.ok(
+    trajectory.finalAnswer?.includes("$4,000"),
+    `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`,
+  );
+
+  // Step 0: reasoning text modality + result text modality.
+  const step0Modalities = trajectory.steps[0].agentEvidence.modalities;
+  assert.ok(
+    step0Modalities.some(
+      (m) => m.type === "text" && m.content.includes("navigating"),
+    ),
+    "expected reasoning text in step 0 modalities",
+  );
+  assert.ok(
+    step0Modalities.some(
+      (m) =>
+        m.type === "text" &&
+        m.content.includes("Navigated to https://www.united.com"),
+    ),
+    "expected tool-result text in step 0 modalities",
+  );
+
+  // Step 1 carries the second reasoning + result content. tool_result content
+  // is a structured array of {type, text} blocks, which the adapter forwards
+  // as the json modality (with a stringified text mirror). Accept either path.
+  const step1Modalities = trajectory.steps[1].agentEvidence.modalities;
+  const step1Joined = JSON.stringify(step1Modalities);
+  assert.ok(
+    step1Joined.includes("economy"),
+    `expected step 1 modalities to mention 'economy'; got ${step1Joined}`,
+  );
+
+  // Both steps must have empty probeEvidence — external harnesses don't
+  // produce screenshots natively. That's what triggers evidence_insufficient
+  // in the verifier downstream.
+  for (const step of trajectory.steps) {
+    assert.deepEqual(
+      step.probeEvidence,
+      {},
+      `expected empty probeEvidence for external-harness step ${step.index}`,
+    );
+  }
+
+  console.log(
+    `  ✓ claudeCodeAdapter — ${trajectory.steps.length} steps, finalAnswer captured, probeEvidence empty`,
+  );
+
+  return trajectory;
+}
+
+async function testCodexAdapter(taskSpec: TaskSpec): Promise<Trajectory> {
+  // Hand-rolled codex ThreadEvent stream. Mirrors what runCodexAgent
+  // accumulates into its `events` array.
+  const events: Array<Record<string, unknown>> = [
+    { type: "thread.started", thread_id: "thread-smoke" },
+    { type: "turn.started" },
+    {
+      type: "item.completed",
+      item: {
+        id: "rs-1",
+        type: "reasoning",
+        text: "I should start by navigating to the United website.",
+      },
+    },
+    {
+      type: "item.completed",
+      item: {
+        id: "ce-1",
+        type: "command_execution",
+        command: "browse navigate https://www.united.com",
+        aggregated_output: "Navigated to https://www.united.com",
+        exit_code: 0,
+        status: "completed",
+      },
+    },
+    {
+      type: "item.completed",
+      item: {
+        id: "rs-2",
+        type: "reasoning",
+        text: "Now extract the prices via the MCP browser tool.",
+      },
+    },
+    {
+      type: "item.completed",
+      item: {
+        id: "mc-1",
+        type: "mcp_tool_call",
+        server: "stagehand_browser",
+        tool: "extract",
+        arguments: { instruction: "Get prices" },
+        result: {
+          content: [
+            {
+              type: "text",
+              text: '{"economy":"$1,200","business":"$5,200"}',
+            },
+          ],
+          structured_content: { economy: "$1,200", business: "$5,200" },
+        },
+        status: "completed",
+      },
+    },
+    {
+      type: "item.completed",
+      item: {
+        id: "am-1",
+        type: "agent_message",
+        text: "The price difference is approximately $4,000.",
+      },
+    },
+    {
+      type: "turn.completed",
+      usage: {
+        input_tokens: 120,
+        cached_input_tokens: 10,
+        output_tokens: 50,
+        reasoning_output_tokens: 5,
+      },
+    },
+  ];
+  // `usage` is passed explicitly and repeats the turn.completed totals above.
+  const trajectory = codexAdapter.fromHarnessResult(
+    {
+      events,
+      status: "complete",
+      usage: {
+        input_tokens: 120,
+        output_tokens: 50,
+        reasoning_tokens: 5,
+        cached_input_tokens: 10,
+      },
+    },
+    taskSpec,
+  );
+  // Only the command_execution and mcp_tool_call items should become steps.
+  assert.equal(
+    trajectory.steps.length,
+    2,
+    `expected 2 steps (command_execution + mcp_tool_call), got ${trajectory.steps.length}`,
+  );
+  assert.equal(trajectory.steps[0].actionName, "browse");
+  assert.equal(trajectory.steps[1].actionName, "stagehand_browser.extract");
+  assert.equal(trajectory.status, "complete");
+  assert.ok(
+    trajectory.finalAnswer?.includes("$4,000"),
+    `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`,
+  );
+
+  // Reasoning items must be folded into the following tool call.
+  assert.ok(
+    trajectory.steps[0].reasoning.includes("navigating"),
+    "expected first reasoning to be folded into step 0",
+  );
+  assert.ok(
+    trajectory.steps[1].reasoning.includes("MCP browser tool"),
+    "expected second reasoning to be folded into step 1",
+  );
+
+  // The MCP tool result should produce a json modality from structured_content.
+  const step1Modalities = trajectory.steps[1].agentEvidence.modalities;
+  assert.ok(
+    step1Modalities.some(
+      (m) =>
+        m.type === "json" &&
+        typeof m.content === "object" &&
+        m.content !== null &&
+        (m.content as Record<string, unknown>).economy === "$1,200",
+    ),
+    "expected json modality with structured_content on step 1",
+  );
+
+  // Probe evidence empty across the board.
+  for (const step of trajectory.steps) {
+    assert.deepEqual(
+      step.probeEvidence,
+      {},
+      `expected empty probeEvidence for external-harness step ${step.index}`,
+    );
+  }
+
+  console.log(
+    `  ✓ codexAdapter — ${trajectory.steps.length} steps, reasoning folded, structured_content → json modality`,
+  );
+
+  return trajectory;
+}
+
+async function testPersistence(
+  trajectory: Trajectory,
+  taskSpec: TaskSpec,
+  tmpRoot: string,
+  label: string,
+): Promise<void> {
+  // `persist: true` forces a write, so the check ignores CI / env settings.
+  const { directory, persisted } = await persistAdapterTrajectory({
+    trajectory,
+    taskSpec,
+    outputRoot: tmpRoot,
+    runId: `smoke-${label}`,
+    persist: true,
+  });
+  assert.equal(persisted, true);
+
+  const entries = await fs.readdir(directory);
+  const required: Array<[string, string]> = [
+    ["task_data.json", "task_data.json"],
+    ["trajectory.json", "trajectory.json"],
+    ["times.json", "times.json"],
+    ["core.log", "core.log"],
+    ["scores", "scores/ directory"],
+  ];
+  for (const [entry, described] of required) {
+    assert.ok(entries.includes(entry), `expected ${described} on disk`);
+  }
+  console.log(`  ✓ persistAdapterTrajectory(${label}) — wrote ${directory}`);
+}
+
+async function maybeRunVerifier(
+  label: string,
+  trajectory: Trajectory,
+  taskSpec: TaskSpec,
+): Promise<void> {
+  // Opt-in step: the live verifier only runs when a Gemini key is present
+  // under either of the two environment variable names checked below.
+  const hasApiKey = Boolean(
+    process.env.GEMINI_API_KEY || process.env.GOOGLE_GENERATIVE_AI_API_KEY,
+  );
+  if (!hasApiKey) {
+    console.log(
+      `  – V3Evaluator.verify(${label}) skipped (no GEMINI_API_KEY in env)`,
+    );
+    return;
+  }
+
+  // Imported lazily, only on the path that actually runs the verifier.
+  const { V3, V3Evaluator } = await import("@browserbasehq/stagehand");
+
+  // The V3 instance is constructed but never init()'d: no browser is launched,
+  // so there is nothing to tear down afterwards. `disablePino` keeps the pino
+  // worker from being spun up.
+  const v3 = new V3({ env: "LOCAL", verbose: 0, disablePino: true });
+  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
+
+  // A rejected verify() is not caught here; it propagates to the caller.
+  const verdict = await evaluator.verify(trajectory, taskSpec);
+  // One-line summary of the verdict for the console.
+  console.log(
+    `  ✓ V3Evaluator.verify(${label}) — outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${verdict.perCriterion.length} evidence_insufficient=${verdict.evidenceInsufficient.length}`,
+  );
+}
+
+async function main(): Promise<void> {
+  const tmpPrefix = path.join(os.tmpdir(), "verify-harness-adapters-");
+  const tmpRoot = await fs.mkdtemp(tmpPrefix);
+  // Printed so the persisted files can be inspected; it is not removed here.
+  console.log(`▸ tmpdir: ${tmpRoot}\n`);
+  // One task spec (precomputed two-item rubric) is shared by both adapters.
+  const taskSpec: TaskSpec = {
+    id: "smoke-united_13",
+    instruction:
+      "What is the price difference between economy and business class on United CHI→GRU?",
+    initUrl: "https://www.united.com",
+    precomputedRubric: {
+      items: [
+        {
+          criterion: "Identify correct route",
+          description:
+            "Agent identifies the United CHI→GRU economy and business class fares.",
+          max_points: 2,
+        },
+        {
+          criterion: "Report price delta",
+          description:
+            "Agent reports a numeric difference between economy and business.",
+          max_points: 3,
+        },
+      ],
+    },
+    expectedAnswer: "Approximately $4,000 difference.",
+  };
+
+  const suites = [
+    ["▸ claudeCodeAdapter", "claude_code", testClaudeCodeAdapter],
+    ["\n▸ codexAdapter", "codex", testCodexAdapter],
+  ] as const;
+  for (const [heading, label, runAdapterTest] of suites) {
+    console.log(heading);
+    const trajectory = await runAdapterTest(taskSpec);
+    await testPersistence(trajectory, taskSpec, tmpRoot, label);
+    await maybeRunVerifier(label, trajectory, taskSpec);
+  }
+  console.log("\n✓ all smoke assertions passed");
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});

From 6d41a4f829887c2b58089f77a94c9e1defcad04e Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:19:41 -0700
Subject: [PATCH 2/5] fix(evals): route harnesses through verifier

---
 packages/evals/framework/claudeCodeRunner.ts  |  2 +-
 packages/evals/framework/codexRunner.ts       |  2 +-
 .../framework/harnesses/persistTrajectory.ts  | 56 ++++++++++++-------
 .../evals/scripts/verify-harness-adapters.ts  |  4 +-
 4 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index c306c4256..cb356ceea 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -291,7 +291,7 @@ export async function runClaudeCodeAgent({
 
     const { V3Evaluator } = await import("@browserbasehq/stagehand");
     const { RubricCache } = await import("./rubricCache.js");
-    const evaluator = new V3Evaluator(verifier.v3);
+    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
 
     // Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
     let rubric = verifier.taskSpec.precomputedRubric;
diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts
index 2c5695789..2703160d9 100644
--- a/packages/evals/framework/codexRunner.ts
+++ b/packages/evals/framework/codexRunner.ts
@@ -265,7 +265,7 @@ export async function runCodexAgent({
 
     const { V3Evaluator } = await import("@browserbasehq/stagehand");
     const { RubricCache } = await import("./rubricCache.js");
-    const evaluator = new V3Evaluator(verifier.v3);
+    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
 
     let rubric = verifier.taskSpec.precomputedRubric;
     if (!rubric) {
diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts
index ffa3dd868..6efdaace8 100644
--- a/packages/evals/framework/harnesses/persistTrajectory.ts
+++ b/packages/evals/framework/harnesses/persistTrajectory.ts
@@ -10,15 +10,14 @@
  * finishes — so this helper writes the same on-disk layout without the
  * event-subscription lifecycle.
  *
- * The on-disk layout matches microsoft/fara's example_trajectory/ so saved
- * trajectories can be cross-validated against verify_trajectories.py without
- * a format conversion step:
+ * The on-disk layout matches TrajectoryRecorder.persist():
  *
  *   <dir>/
  *     ├── task_data.json
- *     ├── trajectory.json   (image bytes as base64, screenshots referenced by path)
- *     ├── screenshot_<N>.png (only if probeEvidence carries Buffer screenshots — external
- *     │                       harnesses don't natively, but the helper supports it)
+ *     ├── trajectory.json   (images referenced by path)
+ *     ├── screenshots/
+ *     │   ├── probe/<N>.png
+ *     │   └── agent/<N>.png
  *     ├── scores/
  *     │   └── mmrubric_v1.json  (if `verdict` passed)
  *     ├── core.log
@@ -86,29 +85,44 @@ export async function persistAdapterTrajectory(
   }
 
   await fs.mkdir(directory, { recursive: true });
+  await fs.mkdir(path.join(directory, "screenshots", "probe"), {
+    recursive: true,
+  });
+  await fs.mkdir(path.join(directory, "screenshots", "agent"), {
+    recursive: true,
+  });
 
-  // Walk steps and (when a Buffer screenshot is present, which is rare for
-  // external harnesses) write it to disk + replace with a path reference.
-  // Image modalities in agentEvidence get base64-encoded inline to keep
-  // trajectory.json human-readable JSON.
+  // Walk steps and write image bytes to disk, replacing in-memory Buffers with
+  // path references in trajectory.json.
   const serializableSteps: unknown[] = [];
   for (const step of opts.trajectory.steps) {
     const probe: ProbeEvidence = { ...step.probeEvidence };
     if (probe.screenshot) {
-      const filename = `screenshot_${step.index + 1}.png`;
-      await fs.writeFile(path.join(directory, filename), probe.screenshot);
-      probe.screenshotPath = filename;
+      const relPath = `screenshots/probe/${step.index + 1}.png`;
+      await fs.writeFile(path.join(directory, relPath), probe.screenshot);
+      probe.screenshotPath = relPath;
       delete probe.screenshot;
     }
+
+    const imageModalities = step.agentEvidence.modalities.filter(
+      (m) => m.type === "image",
+    );
+    const multipleImages = imageModalities.length > 1;
+    let imageSeq = 0;
     const agentEvidence = {
-      modalities: step.agentEvidence.modalities.map((m) =>
-        m.type === "image"
-          ? {
-              type: "image",
-              bytesBase64: m.bytes.toString("base64"),
-              mediaType: m.mediaType,
-            }
-          : m,
+      modalities: await Promise.all(
+        step.agentEvidence.modalities.map(async (m) => {
+          if (m.type !== "image") return m;
+          const suffix = multipleImages ? `_${imageSeq}` : "";
+          const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`;
+          imageSeq += 1;
+          await fs.writeFile(path.join(directory, relPath), m.bytes);
+          return {
+            type: "image" as const,
+            imagePath: relPath,
+            mediaType: m.mediaType,
+          };
+        }),
       ),
     };
     serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence });
diff --git a/packages/evals/scripts/verify-harness-adapters.ts b/packages/evals/scripts/verify-harness-adapters.ts
index 6c949857d..5b9b11d9c 100644
--- a/packages/evals/scripts/verify-harness-adapters.ts
+++ b/packages/evals/scripts/verify-harness-adapters.ts
@@ -402,13 +402,13 @@ async function main(): Promise<void> {
           criterion: "Identify correct route",
           description:
             "Agent identifies the United CHI→GRU economy and business class fares.",
-          max_points: 2,
+          maxPoints: 2,
         },
         {
           criterion: "Report price delta",
           description:
             "Agent reports a numeric difference between economy and business.",
-          max_points: 3,
+          maxPoints: 3,
         },
       ],
     },

From c166203d0ea889b11a0df9ee46c08de42daef51d Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:42:00 -0700
Subject: [PATCH 3/5] fix(evals): validate external harness success mode

---
 packages/evals/framework/claudeCodeRunner.ts | 9 +--------
 packages/evals/framework/codexRunner.ts      | 9 +--------
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index cb356ceea..a6e1ead2f 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -309,14 +309,7 @@ export async function runClaudeCodeAgent({
     };
 
     const verdict = await evaluator.verify(trajectory, hydratedSpec);
-    const successMode =
-      verifier.successMode ??
-      ((process.env.EVAL_SUCCESS_MODE as
-        | "outcome"
-        | "process"
-        | "both"
-        | undefined) ||
-        "outcome");
+    const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
     const verifiedSuccess = verdictToSuccess(verdict, successMode);
 
     const { directory: trajectoryDir } = await persistAdapterTrajectory({
diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts
index 2703160d9..7f6f71e77 100644
--- a/packages/evals/framework/codexRunner.ts
+++ b/packages/evals/framework/codexRunner.ts
@@ -282,14 +282,7 @@ export async function runCodexAgent({
     };
 
     const verdict = await evaluator.verify(trajectory, hydratedSpec);
-    const successMode =
-      verifier.successMode ??
-      ((process.env.EVAL_SUCCESS_MODE as
-        | "outcome"
-        | "process"
-        | "both"
-        | undefined) ||
-        "outcome");
+    const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
     const verifiedSuccess = verdictToSuccess(verdict, successMode);
 
     const { directory: trajectoryDir } = await persistAdapterTrajectory({

From 6532300ba6c79dac605490cec68ee7030a71e48a Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 21:33:06 -0700
Subject: [PATCH 4/5] test(evals): cover persisted trajectory images

---
 .../tests/framework/persistTrajectory.test.ts | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 packages/evals/tests/framework/persistTrajectory.test.ts

diff --git a/packages/evals/tests/framework/persistTrajectory.test.ts b/packages/evals/tests/framework/persistTrajectory.test.ts
new file mode 100644
index 000000000..65de8c72f
--- /dev/null
+++ b/packages/evals/tests/framework/persistTrajectory.test.ts
@@ -0,0 +1,113 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { loadTrajectoryFromDisk } from "@browserbasehq/stagehand";
+import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+import { describe, expect, it } from "vitest";
+
+import { persistAdapterTrajectory } from "../../framework/harnesses/persistTrajectory.js";
+
+const PROBE_PNG = Buffer.from("fake-probe-bytes-1234", "utf8");
+const AGENT_PNG = Buffer.from("fake-agent-bytes-5678", "utf8");
+
+describe("persistAdapterTrajectory", () => {
+  it("round-trips probe and agent image evidence through loadTrajectoryFromDisk", async () => {
+    const tmpRoot = await fs.mkdtemp(
+      path.join(os.tmpdir(), "persist-adapter-roundtrip-"),
+    );
+
+    try {
+      const taskSpec: TaskSpec = {
+        id: "roundtrip-task",
+        instruction: "Test task",
+        initUrl: "https://example.com",
+      };
+      const { directory, persisted } = await persistAdapterTrajectory({
+        trajectory: makeTrajectory(taskSpec),
+        taskSpec,
+        outputRoot: tmpRoot,
+        runId: "roundtrip-run",
+        persist: true,
+      });
+
+      expect(persisted).toBe(true);
+      await expect(fs.readdir(directory)).resolves.toEqual(
+        expect.arrayContaining([
+          "core.log",
+          "scores",
+          "screenshots",
+          "task_data.json",
+          "times.json",
+          "trajectory.json",
+        ]),
+      );
+      await expect(
+        fs.readFile(path.join(directory, "screenshots", "probe", "1.png")),
+      ).resolves.toEqual(PROBE_PNG);
+      await expect(
+        fs.readFile(path.join(directory, "screenshots", "agent", "1.png")),
+      ).resolves.toEqual(AGENT_PNG);
+
+      const loaded = await loadTrajectoryFromDisk(directory);
+      const step = loaded.steps[0];
+      const imageModality = step.agentEvidence.modalities.find(
+        (
+          modality,
+        ): modality is Extract<
+          (typeof step.agentEvidence.modalities)[number],
+          { type: "image" }
+        > => modality.type === "image",
+      );
+      const textModality = step.agentEvidence.modalities.find(
+        (
+          modality,
+        ): modality is Extract<
+          (typeof step.agentEvidence.modalities)[number],
+          { type: "text" }
+        > => modality.type === "text",
+      );
+
+      expect(step.probeEvidence.screenshot).toEqual(PROBE_PNG);
+      expect(imageModality?.bytes).toEqual(AGENT_PNG);
+      expect(imageModality?.mediaType).toBe("image/png");
+      expect(textModality?.content).toBe("navigated");
+    } finally {
+      await fs.rm(tmpRoot, { recursive: true, force: true });
+    }
+  });
+});
+
+function makeTrajectory(task: TaskSpec): Trajectory {
+  return {
+    task,
+    status: "complete",
+    finalAnswer: "Final answer text.",
+    usage: { input_tokens: 100, output_tokens: 50 },
+    timing: {
+      startedAt: "2026-05-15T10:00:00.000Z",
+      endedAt: "2026-05-15T10:01:00.000Z",
+    },
+    steps: [
+      {
+        index: 0,
+        actionName: "goto",
+        actionArgs: { url: "https://example.com" },
+        reasoning: "Open the page.",
+        agentEvidence: {
+          modalities: [
+            { type: "text", content: "navigated" },
+            { type: "image", bytes: AGENT_PNG, mediaType: "image/png" },
+          ],
+        },
+        probeEvidence: {
+          url: "https://example.com",
+          screenshot: PROBE_PNG,
+        },
+        toolOutput: { ok: true, result: { url: "https://example.com" } },
+        startedAt: "2026-05-15T10:00:00.000Z",
+        finishedAt: "2026-05-15T10:00:05.000Z",
+      },
+    ],
+  };
+}

From 3a673e7112cbc194a250e0579651948130bc6d73 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:45:32 -0700
Subject: [PATCH 5/5] fix(evals): align harness verifier result API

---
 packages/evals/framework/claudeCodeRunner.ts  |  28 +-
 packages/evals/framework/codexRunner.ts       |  28 +-
 .../framework/harnesses/persistTrajectory.ts  |  18 +-
 .../framework/harnesses/trajectoryAdapter.ts  |  12 +-
 .../evals/scripts/verify-harness-adapters.ts  | 434 ------------------
 .../tests/framework/persistTrajectory.test.ts |  19 +-
 6 files changed, 68 insertions(+), 471 deletions(-)
 delete mode 100644 packages/evals/scripts/verify-harness-adapters.ts

diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index a6e1ead2f..0d1b68569 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -6,7 +6,7 @@ import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
 import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
 import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js";
 import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
-import { verdictToSuccess } from "./verifierAdapter.js";
+import { evaluationResultToSuccess } from "./verifierAdapter.js";
 
 type ClaudeSdkMessage = Record<string, unknown>;
 type ClaudeQuery = AsyncIterable<ClaudeSdkMessage>;
@@ -48,8 +48,9 @@ export interface ClaudeCodeRunnerInput {
   /**
    * Optional verifier integration. When provided, the runner builds a
    * Trajectory from the SDK message stream (via claudeCodeAdapter), runs
-   * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict
-   * into the returned TaskResult ({_success} mode follows EVAL_SUCCESS_MODE).
+   * V3Evaluator.verify() against the supplied TaskSpec, and folds the
+   * EvaluationResult into the returned TaskResult ({_success} mode follows
+   * EVAL_SUCCESS_MODE).
    * When omitted, the runner falls back to parsing the legacy EVAL_RESULT
    * line — preserves current behavior for callers that haven't migrated.
    */
@@ -308,21 +309,24 @@ export async function runClaudeCodeAgent({
       precomputedRubric: rubric,
     };
 
-    const verdict = await evaluator.verify(trajectory, hydratedSpec);
+    const evaluationResult = await evaluator.verify(trajectory, hydratedSpec);
     const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
-    const verifiedSuccess = verdictToSuccess(verdict, successMode);
+    const verifiedSuccess = evaluationResultToSuccess(
+      evaluationResult,
+      successMode,
+    );
 
     const { directory: trajectoryDir } = await persistAdapterTrajectory({
       trajectory,
       taskSpec: hydratedSpec,
-      verdict,
+      evaluationResult,
       outputRoot: verifier.trajectoryRoot,
       runId: verifier.runId,
     });
 
     logger.log({
       category: "claude_code",
-      message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`,
+      message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`,
       level: 1,
     });
 
@@ -330,9 +334,9 @@ export async function runClaudeCodeAgent({
       ...baseResult,
       _success: verifiedSuccess,
       error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
-      outcomeSuccess: verdict.outcomeSuccess,
-      processScore: verdict.processScore,
-      evidenceInsufficient: verdict.evidenceInsufficient,
+      outcomeSuccess: evaluationResult.outcomeSuccess,
+      processScore: evaluationResult.processScore,
+      evidenceInsufficient: evaluationResult.evidenceInsufficient,
       criterionCount: rubric.items.length,
       stepCount: trajectory.steps.length,
       trajectoryDir,
@@ -350,6 +354,10 @@ export async function runClaudeCodeAgent({
   }
 }
 
+function formatProcessScore(score: number | undefined): string {
+  return typeof score === "number" ? score.toFixed(2) : "n/a";
+}
+
 function buildClaudeCodeMetrics(
   resultMessage: ClaudeSdkMessage | undefined,
 ): Record<string, MetricValue> {
diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts
index 7f6f71e77..bd38965bb 100644
--- a/packages/evals/framework/codexRunner.ts
+++ b/packages/evals/framework/codexRunner.ts
@@ -6,7 +6,7 @@ import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
 import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js";
 import { codexAdapter } from "./harnesses/codexAdapter.js";
 import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
-import { verdictToSuccess } from "./verifierAdapter.js";
+import { evaluationResultToSuccess } from "./verifierAdapter.js";
 
 type MetricValue = { count: number; value: number };
 type CodexEvent = Record<string, unknown>;
@@ -57,8 +57,9 @@ export interface CodexRunnerInput {
   /**
    * Optional verifier integration. When provided, the runner builds a
    * Trajectory from the codex event stream (via codexAdapter), runs
-   * V3Evaluator.verify() against the supplied TaskSpec, and folds the verdict
-   * into the returned TaskResult ({_success} mode follows EVAL_SUCCESS_MODE).
+   * V3Evaluator.verify() against the supplied TaskSpec, and folds the
+   * EvaluationResult into the returned TaskResult ({_success} mode follows
+   * EVAL_SUCCESS_MODE).
    * When omitted, the runner falls back to parsing the legacy JSON result —
    * preserves current behavior for callers that haven't migrated.
    */
@@ -281,21 +282,24 @@ export async function runCodexAgent({
       precomputedRubric: rubric,
     };
 
-    const verdict = await evaluator.verify(trajectory, hydratedSpec);
+    const evaluationResult = await evaluator.verify(trajectory, hydratedSpec);
     const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
-    const verifiedSuccess = verdictToSuccess(verdict, successMode);
+    const verifiedSuccess = evaluationResultToSuccess(
+      evaluationResult,
+      successMode,
+    );
 
     const { directory: trajectoryDir } = await persistAdapterTrajectory({
       trajectory,
       taskSpec: hydratedSpec,
-      verdict,
+      evaluationResult,
       outputRoot: verifier.trajectoryRoot,
       runId: verifier.runId,
     });
 
     logger.log({
       category: "codex",
-      message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} steps=${trajectory.steps.length}`,
+      message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`,
       level: 1,
     });
 
@@ -303,9 +307,9 @@ export async function runCodexAgent({
       ...baseResult,
       _success: verifiedSuccess,
       error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
-      outcomeSuccess: verdict.outcomeSuccess,
-      processScore: verdict.processScore,
-      evidenceInsufficient: verdict.evidenceInsufficient,
+      outcomeSuccess: evaluationResult.outcomeSuccess,
+      processScore: evaluationResult.processScore,
+      evidenceInsufficient: evaluationResult.evidenceInsufficient,
       criterionCount: rubric.items.length,
       stepCount: trajectory.steps.length,
       trajectoryDir,
@@ -323,6 +327,10 @@ export async function runCodexAgent({
   }
 }
 
+function formatProcessScore(score: number | undefined): string {
+  return typeof score === "number" ? score.toFixed(2) : "n/a";
+}
+
 function tryParseCodexJson(
   candidate: string,
 ): Omit<ParsedCodexResult, "raw"> | undefined {
diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts
index 6efdaace8..ab6cf4daa 100644
--- a/packages/evals/framework/harnesses/persistTrajectory.ts
+++ b/packages/evals/framework/harnesses/persistTrajectory.ts
@@ -19,7 +19,7 @@
  *     │   ├── probe/<N>.png
  *     │   └── agent/<N>.png
  *     ├── scores/
- *     │   └── mmrubric_v1.json  (if `verdict` passed)
+ *     │   └── result.json       (if `evaluationResult` passed)
  *     ├── core.log
  *     └── times.json
  *
@@ -32,17 +32,17 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import type {
+  EvaluationResult,
   ProbeEvidence,
   TaskSpec,
   Trajectory,
-  Verdict,
 } from "@browserbasehq/stagehand";
 
 export interface PersistAdapterTrajectoryOptions {
   trajectory: Trajectory;
   taskSpec: TaskSpec;
-  /** Verdict from V3Evaluator.verify(). Written to scores/mmrubric_v1.json. */
-  verdict?: Verdict;
+  /** EvaluationResult from V3Evaluator.verify(). Written to scores/result.json. */
+  evaluationResult?: EvaluationResult;
   /**
    * Output directory root. Final layout lives at `<outputRoot>/<runId>/<task.id>/`.
    * Defaults to `<cwd>/.trajectories`.
@@ -143,8 +143,8 @@ export async function persistAdapterTrajectory(
     status: opts.trajectory.status,
     finalAnswer: opts.trajectory.finalAnswer ?? null,
   };
-  if (opts.verdict) {
-    taskData.verdict = opts.verdict;
+  if (opts.evaluationResult) {
+    taskData.result = opts.evaluationResult;
   }
   await fs.writeFile(
     path.join(directory, "task_data.json"),
@@ -165,10 +165,10 @@ export async function persistAdapterTrajectory(
   );
 
   await fs.mkdir(path.join(directory, "scores"), { recursive: true });
-  if (opts.verdict) {
+  if (opts.evaluationResult) {
     await fs.writeFile(
-      path.join(directory, "scores", "mmrubric_v1.json"),
-      JSON.stringify(opts.verdict, null, 2),
+      path.join(directory, "scores", "result.json"),
+      JSON.stringify(opts.evaluationResult, null, 2),
     );
   }
 
diff --git a/packages/evals/framework/harnesses/trajectoryAdapter.ts b/packages/evals/framework/harnesses/trajectoryAdapter.ts
index cb3afac65..ec1b02319 100644
--- a/packages/evals/framework/harnesses/trajectoryAdapter.ts
+++ b/packages/evals/framework/harnesses/trajectoryAdapter.ts
@@ -3,14 +3,12 @@
  * provider-shaped event/message log) into the Stagehand `Trajectory` shape
  * that V3Evaluator.verify() consumes.
  *
- * The verifier is harness-agnostic (Trajectory + TaskSpec → Verdict; pure
- * function, no live browser). That property is what lets non-Stagehand
+ * The verifier is harness-agnostic (Trajectory + TaskSpec → EvaluationResult,
+ * no live browser). That property is what lets non-Stagehand
  * harnesses — Claude Code, Codex — be scored with the same rubric pipeline
  * we use for Stagehand. Each external harness ships its own
  * `TrajectoryAdapter<THarnessResult>` that maps its tool-call/message log to
  * a `Trajectory`. The verifier never knows which adapter produced it.
- *
- * @see ~/.claude/plans/verifier-rewrite-plan.html §07 "External harness adapters"
  */
 import type {
   AgentEvidence,
@@ -26,9 +24,9 @@ import type {
  * the same Trajectory.
  *
  * Empty `probeEvidence` on every step is supported — the verifier degrades
- * gracefully via the `evidence_insufficient` path (paper's uncontrollable-
- * failure principle). Text-heavy tasks (extract, lookup, search) still get a
- * meaningful outcome verdict; visual-grounding criteria get flagged as
+ * gracefully via the `evidence_insufficient` path. Text-heavy tasks
+ * (extract, lookup, search) still get a
+ * meaningful outcome assessment; visual-grounding criteria get flagged as
  * evidence_insufficient rather than silently miscredited.
  */
 export interface TrajectoryAdapter<THarnessResult> {
diff --git a/packages/evals/scripts/verify-harness-adapters.ts b/packages/evals/scripts/verify-harness-adapters.ts
deleted file mode 100644
index 5b9b11d9c..000000000
--- a/packages/evals/scripts/verify-harness-adapters.ts
+++ /dev/null
@@ -1,434 +0,0 @@
-/**
- * External-harness adapter smoke test — verifies the claudeCodeAdapter and
- * codexAdapter end-to-end without launching a browser.
- *
- * Hand-rolls synthetic harness results (tool-use messages for Claude Code,
- * ThreadEvents for Codex) and asserts:
- *   1. The produced Trajectory has the expected step count.
- *   2. Text and JSON modalities are populated where they should be.
- *   3. finalAnswer is captured.
- *   4. status === "complete".
- *
- * Bonus (gated on GEMINI_API_KEY): feeds the synthetic trajectory into a real
- * V3Evaluator.verify() with a tiny synthetic rubric, then prints the verdict.
- *
- * Run via:  pnpm exec tsx packages/evals/scripts/verify-harness-adapters.ts
- */
-import assert from "node:assert/strict";
-import fs from "node:fs/promises";
-import os from "node:os";
-import path from "node:path";
-
-import { claudeCodeAdapter } from "../framework/harnesses/claudeCodeAdapter.js";
-import { codexAdapter } from "../framework/harnesses/codexAdapter.js";
-import { persistAdapterTrajectory } from "../framework/harnesses/persistTrajectory.js";
-import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
-
-async function testClaudeCodeAdapter(taskSpec: TaskSpec): Promise<Trajectory> {
-  // Hand-rolled SDK message stream that mirrors what the Claude Agent SDK
-  // emits for a two-tool-call session with reasoning between them.
-  const messages: Array<Record<string, unknown>> = [
-    {
-      type: "assistant",
-      message: {
-        content: [
-          {
-            type: "text",
-            text: "I'll start by navigating to the United Airlines website.",
-          },
-          {
-            type: "tool_use",
-            id: "tu_1",
-            name: "browse",
-            input: { command: "browse navigate https://www.united.com" },
-          },
-        ],
-      },
-    },
-    {
-      type: "user",
-      message: {
-        content: [
-          {
-            type: "tool_result",
-            tool_use_id: "tu_1",
-            content: [
-              { type: "text", text: "Navigated to https://www.united.com" },
-            ],
-            is_error: false,
-          },
-        ],
-      },
-    },
-    {
-      type: "assistant",
-      message: {
-        content: [
-          {
-            type: "text",
-            text: "Now I'll look up the flight prices.",
-          },
-          {
-            type: "tool_use",
-            id: "tu_2",
-            name: "browse",
-            input: { command: "browse extract { economy, business } prices" },
-          },
-        ],
-      },
-    },
-    {
-      type: "user",
-      message: {
-        content: [
-          {
-            type: "tool_result",
-            tool_use_id: "tu_2",
-            content: [
-              {
-                type: "text",
-                text: '{"economy":"$1,200","business":"$5,200"}',
-              },
-            ],
-            is_error: false,
-          },
-        ],
-      },
-    },
-    {
-      type: "assistant",
-      message: {
-        content: [
-          {
-            type: "text",
-            text: "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).",
-          },
-        ],
-      },
-    },
-    {
-      type: "result",
-      subtype: "success",
-      result:
-        "The price difference is approximately $4,000 (business $5,200 vs economy $1,200).",
-      duration_ms: 1234,
-      num_turns: 3,
-    },
-  ];
-
-  const trajectory = claudeCodeAdapter.fromHarnessResult(
-    {
-      messages,
-      status: "complete",
-      usage: { input_tokens: 100, output_tokens: 80 },
-    },
-    taskSpec,
-  );
-
-  assert.equal(
-    trajectory.steps.length,
-    2,
-    `expected 2 steps from 2 tool_use blocks, got ${trajectory.steps.length}`,
-  );
-  assert.equal(trajectory.steps[0].actionName, "browse");
-  assert.equal(trajectory.steps[1].actionName, "browse");
-  assert.equal(trajectory.status, "complete");
-  assert.ok(
-    trajectory.finalAnswer?.includes("$4,000"),
-    `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`,
-  );
-
-  // Step 0: reasoning text modality + result text modality.
-  const step0Modalities = trajectory.steps[0].agentEvidence.modalities;
-  assert.ok(
-    step0Modalities.some(
-      (m) => m.type === "text" && m.content.includes("navigating"),
-    ),
-    "expected reasoning text in step 0 modalities",
-  );
-  assert.ok(
-    step0Modalities.some(
-      (m) =>
-        m.type === "text" &&
-        m.content.includes("Navigated to https://www.united.com"),
-    ),
-    "expected tool-result text in step 0 modalities",
-  );
-
-  // Step 1 carries the second reasoning + result content. tool_result content
-  // is a structured array of {type, text} blocks, which the adapter forwards
-  // as the json modality (with a stringified text mirror). Accept either path.
-  const step1Modalities = trajectory.steps[1].agentEvidence.modalities;
-  const step1Joined = JSON.stringify(step1Modalities);
-  assert.ok(
-    step1Joined.includes("economy"),
-    `expected step 1 modalities to mention 'economy'; got ${step1Joined}`,
-  );
-
-  // Both steps must have empty probeEvidence — external harnesses don't
-  // produce screenshots natively. That's what triggers evidence_insufficient
-  // in the verifier downstream.
-  for (const step of trajectory.steps) {
-    assert.deepEqual(
-      step.probeEvidence,
-      {},
-      `expected empty probeEvidence for external-harness step ${step.index}`,
-    );
-  }
-
-  console.log(
-    `  ✓ claudeCodeAdapter — ${trajectory.steps.length} steps, finalAnswer captured, probeEvidence empty`,
-  );
-
-  return trajectory;
-}
-
-async function testCodexAdapter(taskSpec: TaskSpec): Promise<Trajectory> {
-  // Hand-rolled codex ThreadEvent stream. Mirrors what runCodexAgent
-  // accumulates into its `events` array.
-  const events: Array<Record<string, unknown>> = [
-    { type: "thread.started", thread_id: "thread-smoke" },
-    { type: "turn.started" },
-    {
-      type: "item.completed",
-      item: {
-        id: "rs-1",
-        type: "reasoning",
-        text: "I should start by navigating to the United website.",
-      },
-    },
-    {
-      type: "item.completed",
-      item: {
-        id: "ce-1",
-        type: "command_execution",
-        command: "browse navigate https://www.united.com",
-        aggregated_output: "Navigated to https://www.united.com",
-        exit_code: 0,
-        status: "completed",
-      },
-    },
-    {
-      type: "item.completed",
-      item: {
-        id: "rs-2",
-        type: "reasoning",
-        text: "Now extract the prices via the MCP browser tool.",
-      },
-    },
-    {
-      type: "item.completed",
-      item: {
-        id: "mc-1",
-        type: "mcp_tool_call",
-        server: "stagehand_browser",
-        tool: "extract",
-        arguments: { instruction: "Get prices" },
-        result: {
-          content: [
-            {
-              type: "text",
-              text: '{"economy":"$1,200","business":"$5,200"}',
-            },
-          ],
-          structured_content: { economy: "$1,200", business: "$5,200" },
-        },
-        status: "completed",
-      },
-    },
-    {
-      type: "item.completed",
-      item: {
-        id: "am-1",
-        type: "agent_message",
-        text: "The price difference is approximately $4,000.",
-      },
-    },
-    {
-      type: "turn.completed",
-      usage: {
-        input_tokens: 120,
-        cached_input_tokens: 10,
-        output_tokens: 50,
-        reasoning_output_tokens: 5,
-      },
-    },
-  ];
-
-  const trajectory = codexAdapter.fromHarnessResult(
-    {
-      events,
-      status: "complete",
-      usage: {
-        input_tokens: 120,
-        output_tokens: 50,
-        reasoning_tokens: 5,
-        cached_input_tokens: 10,
-      },
-    },
-    taskSpec,
-  );
-
-  assert.equal(
-    trajectory.steps.length,
-    2,
-    `expected 2 steps (command_execution + mcp_tool_call), got ${trajectory.steps.length}`,
-  );
-  assert.equal(trajectory.steps[0].actionName, "browse");
-  assert.equal(trajectory.steps[1].actionName, "stagehand_browser.extract");
-  assert.equal(trajectory.status, "complete");
-  assert.ok(
-    trajectory.finalAnswer?.includes("$4,000"),
-    `expected finalAnswer to include $4,000, got: ${trajectory.finalAnswer}`,
-  );
-
-  // Reasoning items must be folded into the following tool call.
-  assert.ok(
-    trajectory.steps[0].reasoning.includes("navigating"),
-    "expected first reasoning to be folded into step 0",
-  );
-  assert.ok(
-    trajectory.steps[1].reasoning.includes("MCP browser tool"),
-    "expected second reasoning to be folded into step 1",
-  );
-
-  // The MCP tool result should produce a json modality from structured_content.
-  const step1Modalities = trajectory.steps[1].agentEvidence.modalities;
-  assert.ok(
-    step1Modalities.some(
-      (m) =>
-        m.type === "json" &&
-        typeof m.content === "object" &&
-        m.content !== null &&
-        (m.content as Record<string, unknown>).economy === "$1,200",
-    ),
-    "expected json modality with structured_content on step 1",
-  );
-
-  // Probe evidence empty across the board.
-  for (const step of trajectory.steps) {
-    assert.deepEqual(
-      step.probeEvidence,
-      {},
-      `expected empty probeEvidence for external-harness step ${step.index}`,
-    );
-  }
-
-  console.log(
-    `  ✓ codexAdapter — ${trajectory.steps.length} steps, reasoning folded, structured_content → json modality`,
-  );
-
-  return trajectory;
-}
-
-async function testPersistence(
-  trajectory: Trajectory,
-  taskSpec: TaskSpec,
-  tmpRoot: string,
-  label: string,
-): Promise<void> {
-  const { directory, persisted } = await persistAdapterTrajectory({
-    trajectory,
-    taskSpec,
-    outputRoot: tmpRoot,
-    runId: `smoke-${label}`,
-    persist: true,
-  });
-  assert.equal(persisted, true);
-
-  const entries = await fs.readdir(directory);
-  assert.ok(
-    entries.includes("task_data.json"),
-    "expected task_data.json on disk",
-  );
-  assert.ok(
-    entries.includes("trajectory.json"),
-    "expected trajectory.json on disk",
-  );
-  assert.ok(entries.includes("times.json"), "expected times.json on disk");
-  assert.ok(entries.includes("core.log"), "expected core.log on disk");
-  assert.ok(entries.includes("scores"), "expected scores/ directory on disk");
-  console.log(`  ✓ persistAdapterTrajectory(${label}) — wrote ${directory}`);
-}
-
-async function maybeRunVerifier(
-  label: string,
-  trajectory: Trajectory,
-  taskSpec: TaskSpec,
-): Promise<void> {
-  const apiKey =
-    process.env.GEMINI_API_KEY || process.env.GOOGLE_GENERATIVE_AI_API_KEY;
-  if (!apiKey) {
-    console.log(
-      `  – V3Evaluator.verify(${label}) skipped (no GEMINI_API_KEY in env)`,
-    );
-    return;
-  }
-
-  const { V3Evaluator } = await import("@browserbasehq/stagehand");
-  // Construct a V3 stub just for its logger (V3Evaluator only needs that).
-  // We can't `init()` it (no browser) but the verify path never touches the
-  // browser, only LLMProvider.
-  const { V3 } = await import("@browserbasehq/stagehand");
-  // V3 requires V3Options; pass a minimal one with disablePino so we don't
-  // spin up the pino worker.
-  const v3 = new V3({ env: "LOCAL", verbose: 0, disablePino: true });
-
-  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
-  try {
-    const verdict = await evaluator.verify(trajectory, taskSpec);
-    console.log(
-      `  ✓ V3Evaluator.verify(${label}) — outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${verdict.perCriterion.length} evidence_insufficient=${verdict.evidenceInsufficient.length}`,
-    );
-  } finally {
-    // V3 instance was never init'd, no teardown needed.
-  }
-}
-
-async function main(): Promise<void> {
-  const tmpRoot = await fs.mkdtemp(
-    path.join(os.tmpdir(), "verify-harness-adapters-"),
-  );
-  console.log(`▸ tmpdir: ${tmpRoot}\n`);
-
-  const taskSpec: TaskSpec = {
-    id: "smoke-united_13",
-    instruction:
-      "What is the price difference between economy and business class on United CHI→GRU?",
-    initUrl: "https://www.united.com",
-    precomputedRubric: {
-      items: [
-        {
-          criterion: "Identify correct route",
-          description:
-            "Agent identifies the United CHI→GRU economy and business class fares.",
-          maxPoints: 2,
-        },
-        {
-          criterion: "Report price delta",
-          description:
-            "Agent reports a numeric difference between economy and business.",
-          maxPoints: 3,
-        },
-      ],
-    },
-    expectedAnswer: "Approximately $4,000 difference.",
-  };
-
-  console.log("▸ claudeCodeAdapter");
-  const claudeTrajectory = await testClaudeCodeAdapter(taskSpec);
-  await testPersistence(claudeTrajectory, taskSpec, tmpRoot, "claude_code");
-  await maybeRunVerifier("claude_code", claudeTrajectory, taskSpec);
-
-  console.log("\n▸ codexAdapter");
-  const codexTrajectory = await testCodexAdapter(taskSpec);
-  await testPersistence(codexTrajectory, taskSpec, tmpRoot, "codex");
-  await maybeRunVerifier("codex", codexTrajectory, taskSpec);
-
-  console.log("\n✓ all smoke assertions passed");
-}
-
-main().catch((error) => {
-  console.error(error);
-  process.exit(1);
-});
diff --git a/packages/evals/tests/framework/persistTrajectory.test.ts b/packages/evals/tests/framework/persistTrajectory.test.ts
index 65de8c72f..c281ac996 100644
--- a/packages/evals/tests/framework/persistTrajectory.test.ts
+++ b/packages/evals/tests/framework/persistTrajectory.test.ts
@@ -3,7 +3,11 @@ import os from "node:os";
 import path from "node:path";
 
 import { loadTrajectoryFromDisk } from "@browserbasehq/stagehand";
-import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+import type {
+  EvaluationResult,
+  TaskSpec,
+  Trajectory,
+} from "@browserbasehq/stagehand";
 import { describe, expect, it } from "vitest";
 
 import { persistAdapterTrajectory } from "../../framework/harnesses/persistTrajectory.js";
@@ -23,9 +27,16 @@ describe("persistAdapterTrajectory", () => {
         instruction: "Test task",
         initUrl: "https://example.com",
       };
+      const evaluationResult: EvaluationResult = {
+        outcomeSuccess: true,
+        processScore: 1,
+        perCriterion: [],
+        evidenceInsufficient: [],
+      };
       const { directory, persisted } = await persistAdapterTrajectory({
         trajectory: makeTrajectory(taskSpec),
         taskSpec,
+        evaluationResult,
         outputRoot: tmpRoot,
         runId: "roundtrip-run",
         persist: true,
@@ -48,6 +59,12 @@ describe("persistAdapterTrajectory", () => {
       await expect(
         fs.readFile(path.join(directory, "screenshots", "agent", "1.png")),
       ).resolves.toEqual(AGENT_PNG);
+      await expect(
+        fs.readFile(path.join(directory, "scores", "result.json"), "utf8"),
+      ).resolves.toContain('"outcomeSuccess": true');
+      await expect(
+        fs.readFile(path.join(directory, "task_data.json"), "utf8"),
+      ).resolves.toContain('"result"');
 
       const loaded = await loadTrajectoryFromDisk(directory);
       const step = loaded.steps[0];