diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index 6ec620233..0d1b68569 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -1,9 +1,12 @@
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
 import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
 import type { PreparedClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
+import { claudeCodeAdapter } from "./harnesses/claudeCodeAdapter.js";
+import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
+import { evaluationResultToSuccess } from "./verifierAdapter.js";
 
 type ClaudeSdkMessage = Record<string, unknown>;
 type ClaudeQuery = AsyncIterable<ClaudeSdkMessage>;
@@ -16,6 +19,25 @@ export type ClaudeAgentSdk = {
   }) => ClaudeQuery;
 };
 
+export interface ClaudeCodeVerifierConfig {
+  /**
+   * V3 instance used solely as the LLM-client carrier for V3Evaluator. The
+   * instance does NOT need to have had `init()` called — V3Evaluator.verify()
+   * uses only `v3.logger` to construct its LLMProvider.
+   */
+  v3: V3;
+  /** TaskSpec to verify against: id + instruction + optional rubric/initUrl. */
+  taskSpec: TaskSpec;
+  /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
+  dataset: string;
+  /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
+  successMode?: "outcome" | "process" | "both";
+  /** Override trajectory persistence root (forwarded as `outputRoot`). */
+  trajectoryRoot?: string;
+  /** Override the run id (defaults to ISO timestamp). */
+  runId?: string;
+}
+
 export interface ClaudeCodeRunnerInput {
   plan: ExternalHarnessTaskPlan;
   model: AvailableModel;
@@ -23,6 +45,16 @@ export interface ClaudeCodeRunnerInput {
   toolAdapter?: PreparedClaudeCodeToolAdapter;
   signal?: AbortSignal;
   sdk?: ClaudeAgentSdk;
+  /**
+   * Optional verifier integration. When provided, the runner builds a
+   * Trajectory from the SDK message stream (via claudeCodeAdapter), runs
+   * V3Evaluator.verify() against the supplied TaskSpec, and folds the
+   * EvaluationResult into the returned TaskResult ({_success} mode follows
+   * EVAL_SUCCESS_MODE).
+   * When omitted, the runner falls back to parsing the legacy EVAL_RESULT
+   * line — preserves current behavior for callers that haven't migrated.
+   */
+  verifier?: ClaudeCodeVerifierConfig;
 }
 
 export interface ParsedClaudeCodeResult {
@@ -124,7 +156,9 @@ export async function runClaudeCodeAgent({
   toolAdapter,
   signal,
   sdk: injectedSdk,
+  verifier,
 }: ClaudeCodeRunnerInput): Promise<TaskResult> {
+  const startedAt = new Date().toISOString();
   const sdk = injectedSdk ?? (await loadClaudeAgentSdk());
   const abortController = new AbortController();
   if (signal) {
@@ -220,8 +254,10 @@ export async function runClaudeCodeAgent({
     parsed.summary ??
     stopReason ??
     (resultText || transcriptText || "Claude Code did not report success");
+  const endedAt = new Date().toISOString();
+  const tokenUsage = extractClaudeCodeTokenUsage(resultMessage);
 
-  return {
+  const baseResult: TaskResult = {
     _success: parsed.success,
     error: !parsed.success ? errorMessage : undefined,
     reasoning: parsed.summary,
@@ -232,6 +268,94 @@ export async function runClaudeCodeAgent({
     logs: logger.getLogs(),
     metrics: buildClaudeCodeMetrics(resultMessage),
   };
+
+  if (!verifier) {
+    return baseResult;
+  }
+
+  // Build a Trajectory from the SDK message stream and run the rubric verifier.
+  try {
+    const trajectory = claudeCodeAdapter.fromHarnessResult(
+      {
+        messages,
+        finalAnswer: parsed.finalAnswer ?? resultText,
+        status: status === "completed" ? "complete" : "error",
+        usage: {
+          input_tokens: tokenUsage.inputTokens,
+          output_tokens: tokenUsage.outputTokens,
+          cached_input_tokens: tokenUsage.cacheReadInputTokens,
+        },
+        timing: { startedAt, endedAt },
+      },
+      verifier.taskSpec,
+    );
+
+    const { V3Evaluator } = await import("@browserbasehq/stagehand");
+    const { RubricCache } = await import("./rubricCache.js");
+    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
+
+    // Hydrate rubric — use precomputed if present, otherwise cache-or-generate.
+    let rubric = verifier.taskSpec.precomputedRubric;
+    if (!rubric) {
+      if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+        rubric = await evaluator.generateRubric(verifier.taskSpec);
+      } else {
+        const cache = new RubricCache({ dataset: verifier.dataset });
+        rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator);
+      }
+    }
+    const hydratedSpec: TaskSpec = {
+      ...verifier.taskSpec,
+      precomputedRubric: rubric,
+    };
+
+    const evaluationResult = await evaluator.verify(trajectory, hydratedSpec);
+    const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
+    const verifiedSuccess = evaluationResultToSuccess(
+      evaluationResult,
+      successMode,
+    );
+
+    const { directory: trajectoryDir } = await persistAdapterTrajectory({
+      trajectory,
+      taskSpec: hydratedSpec,
+      evaluationResult,
+      outputRoot: verifier.trajectoryRoot,
+      runId: verifier.runId,
+    });
+
+    logger.log({
+      category: "claude_code",
+      message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`,
+      level: 1,
+    });
+
+    return {
+      ...baseResult,
+      _success: verifiedSuccess,
+      error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
+      outcomeSuccess: evaluationResult.outcomeSuccess,
+      processScore: evaluationResult.processScore,
+      evidenceInsufficient: evaluationResult.evidenceInsufficient,
+      criterionCount: rubric.items.length,
+      stepCount: trajectory.steps.length,
+      trajectoryDir,
+    };
+  } catch (verifyError) {
+    logger.warn({
+      category: "claude_code",
+      message: `verifier integration failed: ${stringifyError(verifyError)}`,
+      level: 0,
+      auxiliary: {
+        error: { value: stringifyError(verifyError), type: "string" },
+      },
+    });
+    return baseResult;
+  }
+}
+
+function formatProcessScore(score: number | undefined): string {
+  return typeof score !== "number" ? "n/a" : score.toFixed(2); // 2 decimals
 }
 
 function buildClaudeCodeMetrics(
diff --git a/packages/evals/framework/codexRunner.ts b/packages/evals/framework/codexRunner.ts
index 4d2844efa..bd38965bb 100644
--- a/packages/evals/framework/codexRunner.ts
+++ b/packages/evals/framework/codexRunner.ts
@@ -1,9 +1,12 @@
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import type { AvailableModel, TaskSpec, V3 } from "@browserbasehq/stagehand";
 import { EvalsError } from "../errors.js";
 import type { EvalLogger } from "../logger.js";
 import type { TaskResult } from "./types.js";
 import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
 import type { PreparedCodexToolAdapter } from "./codexToolAdapter.js";
+import { codexAdapter } from "./harnesses/codexAdapter.js";
+import { persistAdapterTrajectory } from "./harnesses/persistTrajectory.js";
+import { evaluationResultToSuccess } from "./verifierAdapter.js";
 
 type MetricValue = { count: number; value: number };
 type CodexEvent = Record<string, unknown>;
@@ -25,6 +28,25 @@ export type CodexSdk = {
   startThread: (options?: Record<string, unknown>) => CodexThread;
 };
 
+export interface CodexVerifierConfig {
+  /**
+   * V3 instance used solely as the LLM-client carrier for V3Evaluator. The
+   * instance does NOT need to have had `init()` called — V3Evaluator.verify()
+   * uses only `v3.logger` to construct its LLMProvider.
+   */
+  v3: V3;
+  /** TaskSpec to verify against: id + instruction + optional rubric/initUrl. */
+  taskSpec: TaskSpec;
+  /** Dataset name for rubric cache partitioning (used when no precomputedRubric). */
+  dataset: string;
+  /** Override --success mode. Defaults to EVAL_SUCCESS_MODE env or "outcome". */
+  successMode?: "outcome" | "process" | "both";
+  /** Override trajectory persistence root (forwarded as `outputRoot`). */
+  trajectoryRoot?: string;
+  /** Override the run id (defaults to ISO timestamp). */
+  runId?: string;
+}
+
 export interface CodexRunnerInput {
   plan: ExternalHarnessTaskPlan;
   model: AvailableModel;
@@ -32,6 +54,16 @@ export interface CodexRunnerInput {
   toolAdapter?: PreparedCodexToolAdapter;
   signal?: AbortSignal;
   sdk?: CodexSdk;
+  /**
+   * Optional verifier integration. When provided, the runner builds a
+   * Trajectory from the codex event stream (via codexAdapter), runs
+   * V3Evaluator.verify() against the supplied TaskSpec, and folds the
+   * EvaluationResult into the returned TaskResult ({_success} mode follows
+   * EVAL_SUCCESS_MODE).
+   * When omitted, the runner falls back to parsing the legacy JSON result —
+   * preserves current behavior for callers that haven't migrated.
+   */
+  verifier?: CodexVerifierConfig;
 }
 
 export interface ParsedCodexResult {
@@ -114,7 +146,9 @@ export async function runCodexAgent({
   toolAdapter,
   signal,
   sdk: injectedSdk,
+  verifier,
 }: CodexRunnerInput): Promise<TaskResult> {
+  const startedAt = new Date().toISOString();
   const sdk = injectedSdk ?? (await loadCodexSdk(toolAdapter?.env));
   const prompt = buildCodexPrompt(plan, toolAdapter?.promptInstructions);
   const events: CodexEvent[] = [];
@@ -191,8 +225,9 @@ export async function runCodexAgent({
       finalResponse ||
       transcriptText ||
       "Codex did not report success");
+  const endedAt = new Date().toISOString();
 
-  return {
+  const baseResult: TaskResult = {
     _success: parsed.success,
     error: !parsed.success ? errorMessage : undefined,
     reasoning: parsed.summary,
@@ -203,6 +238,97 @@ export async function runCodexAgent({
     logs: logger.getLogs(),
     metrics: buildCodexMetrics(usage),
   };
+
+  if (!verifier) {
+    return baseResult;
+  }
+
+  try {
+    const trajectory = codexAdapter.fromHarnessResult(
+      {
+        events,
+        finalAnswer: parsed.finalAnswer ?? finalResponse,
+        status: status === "completed" ? "complete" : "error",
+        usage: {
+          input_tokens: toFiniteNumber(usage?.input_tokens),
+          output_tokens: toFiniteNumber(usage?.output_tokens),
+          ...(usage?.reasoning_output_tokens !== undefined && {
+            reasoning_tokens: toFiniteNumber(usage.reasoning_output_tokens),
+          }),
+          ...(usage?.cached_input_tokens !== undefined && {
+            cached_input_tokens: toFiniteNumber(usage.cached_input_tokens),
+          }),
+        },
+        timing: { startedAt, endedAt },
+      },
+      verifier.taskSpec,
+    );
+
+    const { V3Evaluator } = await import("@browserbasehq/stagehand");
+    const { RubricCache } = await import("./rubricCache.js");
+    const evaluator = new V3Evaluator(verifier.v3, { backend: "verifier" });
+
+    let rubric = verifier.taskSpec.precomputedRubric;
+    if (!rubric) {
+      if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+        rubric = await evaluator.generateRubric(verifier.taskSpec);
+      } else {
+        const cache = new RubricCache({ dataset: verifier.dataset });
+        rubric = await cache.getOrGenerate(verifier.taskSpec, evaluator);
+      }
+    }
+    const hydratedSpec: TaskSpec = {
+      ...verifier.taskSpec,
+      precomputedRubric: rubric,
+    };
+
+    const evaluationResult = await evaluator.verify(trajectory, hydratedSpec);
+    const successMode = verifier.successMode ?? process.env.EVAL_SUCCESS_MODE;
+    const verifiedSuccess = evaluationResultToSuccess(
+      evaluationResult,
+      successMode,
+    );
+
+    const { directory: trajectoryDir } = await persistAdapterTrajectory({
+      trajectory,
+      taskSpec: hydratedSpec,
+      evaluationResult,
+      outputRoot: verifier.trajectoryRoot,
+      runId: verifier.runId,
+    });
+
+    logger.log({
+      category: "codex",
+      message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} steps=${trajectory.steps.length}`,
+      level: 1,
+    });
+
+    return {
+      ...baseResult,
+      _success: verifiedSuccess,
+      error: verifiedSuccess ? undefined : (baseResult.error ?? errorMessage),
+      outcomeSuccess: evaluationResult.outcomeSuccess,
+      processScore: evaluationResult.processScore,
+      evidenceInsufficient: evaluationResult.evidenceInsufficient,
+      criterionCount: rubric.items.length,
+      stepCount: trajectory.steps.length,
+      trajectoryDir,
+    };
+  } catch (verifyError) {
+    logger.warn({
+      category: "codex",
+      message: `verifier integration failed: ${stringifyError(verifyError)}`,
+      level: 0,
+      auxiliary: {
+        error: { value: stringifyError(verifyError), type: "string" },
+      },
+    });
+    return baseResult;
+  }
+}
+
+function formatProcessScore(score: number | undefined): string {
+  return typeof score !== "number" ? "n/a" : score.toFixed(2); // 2 decimals
 }
 
 function tryParseCodexJson(
diff --git a/packages/evals/framework/harnesses/claudeCodeAdapter.ts b/packages/evals/framework/harnesses/claudeCodeAdapter.ts
new file mode 100644
index 000000000..fd680895b
--- /dev/null
+++ b/packages/evals/framework/harnesses/claudeCodeAdapter.ts
@@ -0,0 +1,225 @@
+/**
+ * claudeCodeAdapter — converts a Claude Code SDK run into a `Trajectory` the
+ * verifier can consume.
+ *
+ * Input shape: the SDK emits a stream of `ClaudeSdkMessage` objects of
+ * different `type`s — assistant (model output, may contain tool_use blocks),
+ * user (tool_result blocks for prior tool_use calls), and result (final
+ * outcome with cost/usage/turn counts). We accumulate the stream upstream in
+ * `runClaudeCodeAgent` and hand the full list here.
+ *
+ * Mapping:
+ *   - Each `tool_use` block in an assistant message becomes one normalized
+ *     tool call, paired with its matching `tool_result` from a subsequent
+ *     user message (by `tool_use_id`).
+ *   - Assistant `text` blocks that precede a tool_use are folded into that
+ *     tool call's `reasoning`. `finalAnswer` is the caller-supplied value,
+ *     else the result message's `result` string, else the text trailing the
+ *     last tool call.
+ *   - The result message's usage carries forward as the trajectory usage.
+ *
+ * Failure modes:
+ *   - max_turns / sdk_error → status = "error", but we still emit whatever
+ *     steps we have. The verifier flags evidence_insufficient on criteria it
+ *     can't ground.
+ */
+import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+import {
+  buildTrajectory,
+  type NormalizedToolCall,
+  type TrajectoryAdapter,
+} from "./trajectoryAdapter.js";
+
+/** Subset of the harness result we need to build a trajectory. */
+export interface ClaudeCodeRunResult {
+  /** Raw SDK message stream collected during execution, in arrival order. */
+  messages: Array<Record<string, unknown>>;
+  /** Final assistant message captured separately (optional — falls back to messages). */
+  finalAnswer?: string;
+  /** Trajectory-level status. Defaults to "complete". */
+  status?: Trajectory["status"];
+  /** Optional usage to fold into Trajectory.usage. */
+  usage?: Partial<Trajectory["usage"]>;
+  /** Optional run start/end timing. Adapter fills with now-now otherwise. */
+  timing?: Partial<Trajectory["timing"]>;
+}
+
+interface ToolUseBlock {
+  /** tool_use id used to match against tool_result blocks ("" when missing). */
+  id: string;
+  name: string; // tool name; "tool" when the block carries none
+  input: Record<string, unknown>; // tool arguments; {} when not an object
+  /** Assistant text accumulated before this tool call (becomes `reasoning`). */
+  reasoningPrefix: string;
+}
+
+interface ToolResultBlock {
+  toolUseId: string; // tool_use_id of the call this answers ("" when missing)
+  /** Concatenated text content of the result. */
+  text: string;
+  /** Original content array; only set when the result content was an array. */
+  raw?: unknown;
+  isError: boolean; // true only when the block has is_error === true
+}
+
+/**
+ * Converts a collected Claude Code SDK message stream into a `Trajectory`:
+ * each tool_use block becomes a tool call, paired by id with its tool_result.
+ */
+export class ClaudeCodeTrajectoryAdapter
+  implements TrajectoryAdapter<ClaudeCodeRunResult>
+{
+  /**
+   * @param result - Collected messages plus optional answer/status/usage/timing.
+   * @param taskSpec - Task being verified; handed to `buildTrajectory` as-is.
+   * @returns The value `buildTrajectory` produces for the normalized tool calls.
+   */
+  fromHarnessResult(
+    result: ClaudeCodeRunResult,
+    taskSpec: TaskSpec,
+  ): Trajectory {
+    const toolUses: ToolUseBlock[] = [];
+    const toolResults = new Map<string, ToolResultBlock>();
+    const trailingTextParts: string[] = [];
+    let resultMessageText: string | undefined;
+    let pendingReasoning = "";
+
+    for (const message of result.messages) {
+      const type = String(message.type ?? "");
+      const inner = message.message;
+      if (type === "result") {
+        const r = message.result;
+        if (typeof r === "string" && r.trim()) resultMessageText = r;
+        continue;
+      }
+      if (!isRecord(inner)) continue;
+      const content = inner.content;
+      if (!Array.isArray(content)) {
+        // Plain-string assistant content is handled like a single text block.
+        if (typeof content === "string" && type === "assistant") {
+          pendingReasoning = appendText(pendingReasoning, content);
+          trailingTextParts.push(content);
+        }
+        continue;
+      }
+
+      if (type === "assistant") {
+        for (const block of content) {
+          if (!isRecord(block)) continue;
+          const blockType = String(block.type ?? "");
+          if (blockType === "text" && typeof block.text === "string") {
+            pendingReasoning = appendText(pendingReasoning, block.text);
+            trailingTextParts.push(block.text);
+            continue;
+          }
+          if (blockType === "tool_use") {
+            const id = typeof block.id === "string" ? block.id : "";
+            const name = typeof block.name === "string" ? block.name : "tool";
+            const input = isRecord(block.input) ? block.input : {};
+            toolUses.push({
+              id,
+              name,
+              input,
+              reasoningPrefix: pendingReasoning,
+            });
+            // The buffered text was this call's reasoning, not the final answer:
+            // clear both buffers so later calls and the answer start empty.
+            pendingReasoning = "";
+            trailingTextParts.length = 0;
+          }
+        }
+        continue;
+      }
+
+      if (type === "user") {
+        for (const block of content) {
+          if (!isRecord(block)) continue;
+          const blockType = String(block.type ?? "");
+          if (blockType !== "tool_result") continue;
+          const toolUseId =
+            typeof block.tool_use_id === "string" ? block.tool_use_id : "";
+          const isError = block.is_error === true;
+          const { text, raw } = extractToolResultContent(block.content);
+          // A later result carrying the same id replaces the earlier one.
+          toolResults.set(toolUseId, { toolUseId, text, raw, isError });
+        }
+        continue;
+      }
+    }
+
+    const toolCalls: NormalizedToolCall[] = toolUses.map((use) => {
+      const matched = toolResults.get(use.id);
+      // No recorded result: the call is reported ok with an empty result.
+      const ok = matched ? !matched.isError : true;
+      const resultPayload =
+        matched?.raw !== undefined ? matched.raw : (matched?.text ?? "");
+      return {
+        name: use.name,
+        args: use.input,
+        result: resultPayload,
+        ok,
+        ...(matched?.isError && matched.text && { error: matched.text }),
+        reasoning: use.reasoningPrefix.trim() || undefined,
+      };
+    });
+
+    // A blank caller-supplied answer counts as absent, so the fallbacks (result
+    // message text, then text trailing the last tool call) still apply.
+    const given = result.finalAnswer?.trim() ? result.finalAnswer : undefined;
+    const trailing = trailingTextParts.join("\n").trim();
+    const finalAnswer = given ?? resultMessageText ?? (trailing || undefined);
+
+    return buildTrajectory({
+      taskSpec,
+      toolCalls,
+      finalAnswer,
+      status: result.status ?? "complete",
+      usage: result.usage,
+      timing: result.timing,
+    });
+  }
+}
+
+export const claudeCodeAdapter = new ClaudeCodeTrajectoryAdapter();
+
+/** Joins two text fragments with a newline; an empty side yields the other. */
+function appendText(buffer: string, addition: string): string {
+  const parts = [buffer, addition].filter((part) => part !== "");
+  return parts.join("\n");
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === "object"; // arrays qualify too
+}
+
+/**
+ * Normalizes a tool_result `content` value, which arrives either as a plain
+ * string (legacy) or as an array of blocks such as { type: "text", text } and
+ * { type: "image", source }.
+ *
+ * Strings come back as `text` only. Arrays come back with their text blocks
+ * joined by newlines (images become "[image]") plus the untouched array as
+ * `raw`. Any other value yields an empty `text`.
+ */
+function extractToolResultContent(content: unknown): {
+  text: string;
+  raw?: unknown;
+} {
+  if (!Array.isArray(content)) {
+    // Strings pass through untouched; anything else has no usable text.
+    return { text: typeof content === "string" ? content : "" };
+  }
+
+  const pieces: string[] = [];
+  content.forEach((entry: unknown) => {
+    if (!isRecord(entry)) return;
+    if (entry.type === "image") {
+      // An image block can never be a text block, so it is safe to test first.
+      pieces.push("[image]");
+    } else if (typeof entry.text === "string") {
+      // Covers { type: "text" } as well as any other block carrying text.
+      pieces.push(entry.text);
+    }
+  });
+  return { text: pieces.join("\n"), raw: content };
+}
diff --git a/packages/evals/framework/harnesses/codexAdapter.ts b/packages/evals/framework/harnesses/codexAdapter.ts
new file mode 100644
index 000000000..cd313dd72
--- /dev/null
+++ b/packages/evals/framework/harnesses/codexAdapter.ts
@@ -0,0 +1,223 @@
+/**
+ * codexAdapter — converts a Codex SDK run into a `Trajectory` the verifier
+ * can consume.
+ *
+ * Input shape: codex emits `ThreadEvent`s — `item.completed` carrying a
+ * `ThreadItem` (command_execution, file_change, mcp_tool_call, agent_message,
+ * reasoning, web_search, todo_list, error), plus `turn.completed` for usage.
+ * We accumulate the full event list upstream in `runCodexAgent` and hand it
+ * here.
+ *
+ * Mapping:
+ *   - command_execution items → tool call named after the command's leading
+ *     token (else `command_execution`), args = { command, exit_code? },
+ *     result = aggregated_output, ok derived from exit_code and status.
+ *   - mcp_tool_call items → tool call named `${server}.${tool}`, args =
+ *     arguments, result = structured_content (json modality) when present,
+ *     else flattened content text. ok = status !== "failed".
+ *   - reasoning items between item.completed events → folded into the next
+ *     tool call's reasoning string.
+ *   - agent_message items → the final answer (last wins).
+ *   - error items → captured as a failed tool call so the verifier sees the
+ *     pattern (a no-op `error` action with the message in toolOutput.error).
+ *   - file_change items → captured as a tool call named `file_change` with the
+ *     change set in args (rare in browser eval contexts).
+ *   - web_search items → captured as a tool call named `web_search` with the
+ *     query in args.
+ *   - todo_list items → not surfaced as tool calls (they aren't actions).
+ */
+import type { TaskSpec, Trajectory } from "@browserbasehq/stagehand";
+import {
+  buildTrajectory,
+  type NormalizedToolCall,
+  type TrajectoryAdapter,
+} from "./trajectoryAdapter.js";
+
+export interface CodexRunResult {
+  /** All ThreadEvents collected from the SDK stream, in arrival order. */
+  events: Array<Record<string, unknown>>;
+  /** Explicit final answer; when absent the adapter uses the last `agent_message` text found in `events`. */
+  finalAnswer?: string;
+  /** Trajectory-level status. Defaults to "complete". */
+  status?: Trajectory["status"];
+  /** Optional usage, passed to buildTrajectory unchanged. */
+  usage?: Partial<Trajectory["usage"]>;
+  /** Optional run start/end timing. NOTE(review): the now-now default is presumably in buildTrajectory — confirm. */
+  timing?: Partial<Trajectory["timing"]>;
+}
+
+/**
+ * Converts collected Codex thread events into a `Trajectory`. Only
+ * `item.completed` events are read; all other event types are skipped.
+ */
+export class CodexTrajectoryAdapter
+  implements TrajectoryAdapter<CodexRunResult>
+{
+  fromHarnessResult(result: CodexRunResult, taskSpec: TaskSpec): Trajectory {
+    const toolCalls: NormalizedToolCall[] = [];
+    let pendingReasoning = "";
+    let latestAgentMessage: string | undefined;
+
+    for (const event of result.events) {
+      if (String(event.type ?? "") !== "item.completed") continue;
+      const item = event.item;
+      if (!isRecord(item)) continue;
+      const itemType = String(item.type ?? "");
+      if (itemType === "reasoning" && typeof item.text === "string") {
+        pendingReasoning = pendingReasoning
+          ? `${pendingReasoning}\n${item.text}`
+          : item.text;
+        continue;
+      }
+      if (itemType === "agent_message" && typeof item.text === "string") {
+        // Drop buffered reasoning that didn't precede a tool call.
+        pendingReasoning = "";
+        latestAgentMessage = item.text;
+        continue;
+      }
+      const call = normalizeItem(itemType, item, pendingReasoning);
+      if (call) {
+        toolCalls.push(call);
+        pendingReasoning = "";
+      }
+    }
+
+    // A blank caller-supplied answer counts as absent so the fallback applies.
+    const given = result.finalAnswer?.trim() ? result.finalAnswer : undefined;
+    return buildTrajectory({
+      taskSpec,
+      toolCalls,
+      finalAnswer: given ?? latestAgentMessage,
+      status: result.status ?? "complete",
+      usage: result.usage,
+      timing: result.timing,
+    });
+  }
+}
+
+export const codexAdapter = new CodexTrajectoryAdapter();
+
+/**
+ * Maps one completed Codex thread item to a normalized tool call; returns
+ * `undefined` for item types that are not mapped. An empty `reasoning` string
+ * is emitted as `undefined`.
+ */
+function normalizeItem(
+  itemType: string,
+  item: Record<string, unknown>,
+  reasoning: string,
+): NormalizedToolCall | undefined {
+  if (itemType === "command_execution") {
+    const command = typeof item.command === "string" ? item.command : "";
+    const exitCode =
+      typeof item.exit_code === "number" ? item.exit_code : undefined;
+    const status = String(item.status ?? "");
+    // The exit code is authoritative; status only decides when it is missing.
+    const ok = exitCode !== undefined ? exitCode === 0 : status === "completed";
+    const output =
+      typeof item.aggregated_output === "string" ? item.aggregated_output : "";
+    // Name the call after the command's leading token, else `command_execution`.
+    const leading = command.split(/\s+/, 1)[0] || "command_execution";
+    return {
+      name: leading,
+      args: { command, ...(exitCode !== undefined && { exit_code: exitCode }) },
+      result: output,
+      ok,
+      ...(!ok && {
+        error:
+          exitCode !== undefined
+            ? `exit code ${exitCode}`
+            : `command status ${status}`,
+      }),
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "mcp_tool_call") {
+    const server = typeof item.server === "string" ? item.server : "mcp";
+    const tool = typeof item.tool === "string" ? item.tool : "tool";
+    const args = isRecord(item.arguments) ? item.arguments : {};
+    const status = String(item.status ?? "");
+    const ok = status !== "failed";
+    const mcpResult = isRecord(item.result) ? item.result : undefined;
+    const structured = mcpResult?.structured_content;
+    const content = mcpResult?.content;
+    const errorMessage =
+      isRecord(item.error) && typeof item.error.message === "string"
+        ? item.error.message
+        : undefined;
+
+    // Prefer structured_content (json modality) when present, else flatten
+    // content blocks to text. Falls back to error message when failed.
+    let payload: unknown;
+    if (structured !== undefined && structured !== null) {
+      payload = structured;
+    } else if (Array.isArray(content)) {
+      const parts: string[] = [];
+      for (const block of content) {
+        if (!isRecord(block)) continue;
+        if (block.type === "image") {
+          parts.push("[image]");
+        } else if (typeof block.text === "string") {
+          parts.push(block.text);
+        }
+      }
+      payload = parts.join("\n");
+    } else if (!ok && errorMessage) {
+      payload = errorMessage;
+    } else {
+      payload = "";
+    }
+
+    return {
+      name: `${server}.${tool}`,
+      args,
+      result: payload,
+      ok,
+      ...(errorMessage && !ok && { error: errorMessage }),
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "web_search") {
+    const query = typeof item.query === "string" ? item.query : "";
+    return {
+      name: "web_search",
+      args: { query },
+      result: "",
+      ok: true,
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "file_change") {
+    const changes = Array.isArray(item.changes) ? item.changes : [];
+    const status = String(item.status ?? "");
+    return {
+      name: "file_change",
+      args: { changes },
+      result: { status, changes },
+      ok: status === "completed",
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  if (itemType === "error") {
+    const message =
+      typeof item.message === "string" ? item.message : "codex error item";
+    return {
+      name: "error",
+      args: {},
+      result: message,
+      ok: false,
+      error: message,
+      reasoning: reasoning || undefined,
+    };
+  }
+
+  return undefined;
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return value !== null && typeof value === "object"; // arrays qualify too
+}
diff --git a/packages/evals/framework/harnesses/persistTrajectory.ts b/packages/evals/framework/harnesses/persistTrajectory.ts
new file mode 100644
index 000000000..ab6cf4daa
--- /dev/null
+++ b/packages/evals/framework/harnesses/persistTrajectory.ts
@@ -0,0 +1,199 @@
+/**
+ * persistAdapterTrajectory — writes the on-disk layout used by the Stagehand
+ * `TrajectoryRecorder.persist()` for trajectories built by external-harness
+ * adapters (claude_code, codex).
+ *
+ * `TrajectoryRecorder` itself is coupled to v3.bus events: it subscribes
+ * during the agent run, accumulates partial steps, and emits the final
+ * trajectory on finish(). External harnesses don't go through that bus —
+ * they produce a complete `Trajectory` synchronously after the harness
+ * finishes — so this helper writes the same on-disk layout without the
+ * event-subscription lifecycle.
+ *
+ * The on-disk layout matches TrajectoryRecorder.persist():
+ *
+ *   <dir>/
+ *     ├── task_data.json
+ *     ├── trajectory.json   (images referenced by path)
+ *     ├── screenshots/
+ *     │   ├── probe/<N>.png
+ *     │   └── agent/<N>.png
+ *     ├── scores/
+ *     │   └── result.json       (if `evaluationResult` passed)
+ *     ├── core.log
+ *     └── times.json
+ *
+ * Honors `VERIFIER_PERSIST_TRAJECTORIES` for default on/off (matches
+ * TrajectoryRecorder's convention):
+ *   - "1" / "true": always persist.
+ *   - "0" / "false": never persist.
+ *   - unset: persists when not in CI.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import type {
+  EvaluationResult,
+  ProbeEvidence,
+  TaskSpec,
+  Trajectory,
+} from "@browserbasehq/stagehand";
+
+export interface PersistAdapterTrajectoryOptions {
+  trajectory: Trajectory; // task, steps, status, finalAnswer, usage and timing are written
+  taskSpec: TaskSpec; // only `id` is read here: it names the leaf directory
+  /** EvaluationResult from V3Evaluator.verify(). Written to scores/result.json and task_data.json. */
+  evaluationResult?: EvaluationResult;
+  /**
+   * Output directory root. Final layout lives at `<outputRoot>/<runId>/<taskSpec.id>/`.
+   * Defaults to `<cwd>/.trajectories`.
+   */
+  outputRoot?: string;
+  /** Run identifier. Defaults to the current ISO timestamp with `:` and `.` replaced by `-`. */
+  runId?: string;
+  /**
+   * Override the env-gated persistence default. `true` always persists,
+   * `false` never does, `undefined` defers to VERIFIER_PERSIST_TRAJECTORIES.
+   */
+  persist?: boolean;
+}
+
+export interface PersistAdapterTrajectoryResult {
+  /** Target directory `<outputRoot>/<runId>/<taskSpec.id>`; computed even when nothing is written. */
+  directory: string;
+  /** True once the files were written; false when persistence was switched off. */
+  persisted: boolean;
+}
+
+function shouldPersist(override: boolean | undefined): boolean {
+  if (override !== undefined) return override; // explicit option beats the env
+  const flag = process.env.VERIFIER_PERSIST_TRAJECTORIES?.toLowerCase() ?? "";
+  if (["1", "true"].includes(flag)) return true;
+  if (["0", "false"].includes(flag)) return false;
+  return !process.env.CI; // unset or unrecognised flag: persist unless CI is non-empty
+}
+
+export async function persistAdapterTrajectory( // writes one trajectory (plus optional scores) to disk
+  opts: PersistAdapterTrajectoryOptions,
+): Promise<PersistAdapterTrajectoryResult> {
+  const runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); // ":" and "." become "-"
+  const root = opts.outputRoot ?? path.join(process.cwd(), ".trajectories");
+  const directory = path.join(root, runId, opts.taskSpec.id);
+  const persisted = shouldPersist(opts.persist);
+
+  if (!persisted) {
+    return { directory, persisted: false }; // directory is reported, nothing is created
+  }
+
+  await fs.mkdir(directory, { recursive: true });
+  await fs.mkdir(path.join(directory, "screenshots", "probe"), {
+    recursive: true,
+  });
+  await fs.mkdir(path.join(directory, "screenshots", "agent"), {
+    recursive: true,
+  });
+
+  // Walk steps, write image bytes to disk under the 1-based step number, and
+  // swap the in-memory bytes for relative path references in trajectory.json.
+  const serializableSteps: unknown[] = [];
+  for (const step of opts.trajectory.steps) {
+    const probe: ProbeEvidence = { ...step.probeEvidence }; // shallow copy: the caller's step is not mutated
+    if (probe.screenshot) {
+      const relPath = `screenshots/probe/${step.index + 1}.png`;
+      await fs.writeFile(path.join(directory, relPath), probe.screenshot);
+      probe.screenshotPath = relPath;
+      delete probe.screenshot;
+    }
+
+    const imageModalities = step.agentEvidence.modalities.filter(
+      (m) => m.type === "image",
+    );
+    const multipleImages = imageModalities.length > 1;
+    let imageSeq = 0;
+    const agentEvidence = {
+      modalities: await Promise.all(
+        step.agentEvidence.modalities.map(async (m) => {
+          if (m.type !== "image") return m; // non-image modalities pass through unchanged
+          const suffix = multipleImages ? `_${imageSeq}` : ""; // "_<n>" only when the step has several images
+          const relPath = `screenshots/agent/${step.index + 1}${suffix}.png`;
+          imageSeq += 1; // runs before this callback's first await, so numbering follows modality order
+          await fs.writeFile(path.join(directory, relPath), m.bytes);
+          return {
+            type: "image" as const,
+            imagePath: relPath,
+            mediaType: m.mediaType,
+          };
+        }),
+      ),
+    };
+    serializableSteps.push({ ...step, probeEvidence: probe, agentEvidence });
+  }
+
+  const serialized = {
+    ...opts.trajectory,
+    steps: serializableSteps,
+  } as unknown;
+
+  await fs.writeFile(
+    path.join(directory, "trajectory.json"),
+    JSON.stringify(serialized, null, 2),
+  );
+
+  const taskData: Record<string, unknown> = {
+    task: opts.trajectory.task,
+    status: opts.trajectory.status,
+    finalAnswer: opts.trajectory.finalAnswer ?? null, // null keeps the key present in the JSON
+  };
+  if (opts.evaluationResult) {
+    taskData.result = opts.evaluationResult;
+  }
+  await fs.writeFile(
+    path.join(directory, "task_data.json"),
+    JSON.stringify(taskData, null, 2),
+  );
+
+  await fs.writeFile(
+    path.join(directory, "times.json"),
+    JSON.stringify(
+      {
+        timing: opts.trajectory.timing,
+        usage: opts.trajectory.usage,
+        stepCount: opts.trajectory.steps.length,
+      },
+      null,
+      2,
+    ),
+  );
+
+  await fs.mkdir(path.join(directory, "scores"), { recursive: true }); // created even without a result
+  if (opts.evaluationResult) {
+    await fs.writeFile(
+      path.join(directory, "scores", "result.json"),
+      JSON.stringify(opts.evaluationResult, null, 2),
+    );
+  }
+
+  await fs.writeFile(
+    path.join(directory, "core.log"),
+    coreLog(opts.trajectory), // one JSON line per step
+  );
+
+  return { directory, persisted: true };
+}
+
+function coreLog(trajectory: Trajectory): string {
+  // One JSON object per step, one step per line, with a trailing newline.
+  const lines: string[] = [];
+  for (const step of trajectory.steps) {
+    const entry = {
+      step: step.index,
+      action: step.actionName,
+      url: step.probeEvidence.url ?? null,
+      ok: step.toolOutput.ok,
+      reasoning: step.reasoning || undefined, // empty reasoning is left out
+      startedAt: step.startedAt,
+      finishedAt: step.finishedAt,
+    };
+    lines.push(JSON.stringify(entry));
+  }
+  return lines.join("\n") + "\n";
+}
diff --git a/packages/evals/framework/harnesses/trajectoryAdapter.ts b/packages/evals/framework/harnesses/trajectoryAdapter.ts
new file mode 100644
index 000000000..ec1b02319
--- /dev/null
+++ b/packages/evals/framework/harnesses/trajectoryAdapter.ts
@@ -0,0 +1,206 @@
+/**
+ * TrajectoryAdapter — converts an external harness's natural output (a
+ * provider-shaped event/message log) into the Stagehand `Trajectory` shape
+ * that V3Evaluator.verify() consumes.
+ *
+ * The verifier is harness-agnostic (Trajectory + TaskSpec → EvaluationResult,
+ * no live browser). That property is what lets non-Stagehand
+ * harnesses — Claude Code, Codex — be scored with the same rubric pipeline
+ * we use for Stagehand. Each external harness ships its own
+ * `TrajectoryAdapter<THarnessResult>` that maps its tool-call/message log to
+ * a `Trajectory`. The verifier never knows which adapter produced it.
+ */
+import type {
+  AgentEvidence,
+  AgentEvidenceModality,
+  TaskSpec,
+  Trajectory,
+  TrajectoryStep,
+} from "@browserbasehq/stagehand";
+
+/**
+ * Adapter interface. Implementations are pure: no I/O, no live browser, no
+ * mutation of the input result. The same harness result should always produce
+ * the same Trajectory.
+ *
+ * Empty `probeEvidence` on every step is supported — the verifier degrades
+ * gracefully via the `evidence_insufficient` path. Text-heavy tasks (extract,
+ * lookup, search) still get a meaningful outcome assessment, while
+ * visual-grounding criteria get flagged as `evidence_insufficient` rather
+ * than silently miscredited.
+ */
+export interface TrajectoryAdapter<THarnessResult> {
+  /**
+   * Convert the external harness's natural output into a Trajectory. Must be
+   * deterministic given the input.
+   */
+  fromHarnessResult(result: THarnessResult, taskSpec: TaskSpec): Trajectory;
+}
+
+/**
+ * Normalized tool invocation. Adapters parse harness-specific event/message
+ * shapes into this canonical structure before mapping to `TrajectoryStep`.
+ *
+ * The fields are deliberately permissive — harnesses vary in what they
+ * surface, and we want a single mapping helper to handle all of them.
+ */
+export interface NormalizedToolCall {
+  /** Tool name (e.g., "Bash", "mcp__stagehand_browser__run", "container.exec"). */
+  name: string;
+  /** Tool arguments. Empty object if the harness doesn't surface them. */
+  args: Record<string, unknown>;
+  /**
+   * Tool result. Strings → text modality; Buffers → image modality (as PNG);
+   * other objects → json + text; `undefined`/`null` (no output) → no modality.
+   */
+  result: unknown;
+  /** True if the tool reported success. Adapters infer this from harness flags. */
+  ok: boolean;
+  /** Free-form error string when `ok === false`. */
+  error?: string;
+  /** Optional reasoning text the assistant emitted before/with this tool call. */
+  reasoning?: string;
+  /** Wall-clock when the call started. Falls back to the `fallbackTimestamp` given to `toolCallToTrajectoryStep`. */
+  startedAt?: string;
+  /** Wall-clock when the call finished. Falls back to startedAt. */
+  finishedAt?: string;
+}
+
+/**
+ * Build the `AgentEvidence` for one normalized tool call. Reasoning text, when
+ * present, comes first as its own text modality — the verifier weights
+ * reasoning highly when grounding criteria without screenshots. The result
+ * then maps by type: non-empty strings → text; Buffers → image (PNG); other
+ * objects → json plus a stringified text copy so text-only prompts can read
+ * structured output; remaining primitives → text via `String()`.
+ *
+ * `probeEvidence` is intentionally not produced here — external harnesses
+ * don't emit independent observations natively. This module defines no
+ * `actionToProbeEvidence` helper yet; one would be the place to add that.
+ */
+export function actionToAgentEvidence(
+  call: Pick<NormalizedToolCall, "result" | "reasoning">,
+): AgentEvidence {
+  const { reasoning, result: output } = call;
+  // Reasoning, when present, always leads the modality list.
+  const modalities: AgentEvidenceModality[] = reasoning
+    ? [{ type: "text", content: reasoning }]
+    : [];
+  // Every branch below exits through the same wrapper shape.
+  const done = (): AgentEvidence => ({ modalities });
+
+  // No output at all: reasoning (if any) is the only evidence.
+  if (output === undefined || output === null) return done();
+
+  if (typeof output === "string") {
+    // Empty strings carry no evidence and are dropped.
+    if (output !== "") modalities.push({ type: "text", content: output });
+    return done();
+  }
+
+  if (Buffer.isBuffer(output)) {
+    // Raw bytes are always labelled as PNG.
+    modalities.push({ type: "image", bytes: output, mediaType: "image/png" });
+    return done();
+  }
+
+  if (typeof output !== "object") {
+    // Numbers, booleans, etc. — stringify so the verifier has a text handle.
+    modalities.push({ type: "text", content: String(output) });
+    return done();
+  }
+
+  // Structured output: keep the JSON modality (preserved structure for prompts
+  // that accept JSON) and add a stringified text copy (cheap fallback for
+  // text-only prompts). Step 2 relevance scoring tolerates duplicates.
+  modalities.push({ type: "json", content: output });
+  const asText = safeStringify(output);
+  if (asText) modalities.push({ type: "text", content: asText });
+  return done();
+}
+
+/**
+ * Turn one normalized tool call into a `TrajectoryStep`. Keeping the step
+ * shape in a single place means every adapter emits verifier-equivalent steps.
+ */
+export function toolCallToTrajectoryStep(
+  index: number,
+  call: NormalizedToolCall,
+  fallbackTimestamp: string,
+): TrajectoryStep {
+  const { name, args, reasoning, ok, result, error } = call;
+  // Missing timestamps cascade: start → fallback, finish → start.
+  const startedAt = call.startedAt ?? fallbackTimestamp;
+  const finishedAt = call.finishedAt ?? startedAt;
+  const toolOutput: TrajectoryStep["toolOutput"] = error
+    ? { ok, result, error }
+    : { ok, result };
+  return {
+    index,
+    actionName: name,
+    actionArgs: args,
+    reasoning: reasoning ?? "",
+    agentEvidence: actionToAgentEvidence(call),
+    // External harnesses don't natively produce screenshots/aria/scroll, so
+    // probeEvidence stays empty (the verifier's evidence_insufficient path).
+    probeEvidence: {},
+    toolOutput,
+    startedAt,
+    finishedAt,
+  };
+}
+
+/**
+ * Inputs to `buildTrajectory`: the task metadata plus the normalized tool
+ * calls an adapter parsed out of its harness's event log.
+ */
+export interface BuildTrajectoryOptions {
+  taskSpec: TaskSpec;
+  toolCalls: NormalizedToolCall[];
+  finalAnswer?: string;
+  status?: Trajectory["status"]; // `buildTrajectory` defaults this to "complete"
+  /** Token usage if surfaced; absent input/output counts become 0, other fields are omitted. */
+  usage?: Partial<Trajectory["usage"]>;
+  /** startedAt falls back to the first step's start, then "now"; endedAt to the last step's finish, then startedAt. */
+  timing?: Partial<Trajectory["timing"]>;
+}
+
+export function buildTrajectory(opts: BuildTrajectoryOptions): Trajectory {
+  const { taskSpec, toolCalls, finalAnswer, status, usage, timing } = opts;
+  const { reasoning_tokens, cached_input_tokens, inference_time_ms } =
+    usage ?? {};
+  // One shared "now" is the fallback timestamp for every step.
+  const now = new Date().toISOString();
+  const steps = toolCalls.map((call, position) =>
+    toolCallToTrajectoryStep(position, call, now),
+  );
+  const firstStep = steps[0];
+  const lastStep = steps[steps.length - 1];
+  // Explicit timing wins; otherwise the step timestamps bound the run.
+  const startedAt = timing?.startedAt ?? firstStep?.startedAt ?? now;
+  const endedAt = timing?.endedAt ?? lastStep?.finishedAt ?? startedAt;
+
+  return {
+    task: taskSpec,
+    steps,
+    finalAnswer,
+    status: status ?? "complete",
+    usage: {
+      input_tokens: usage?.input_tokens ?? 0,
+      output_tokens: usage?.output_tokens ?? 0,
+      // Optional counters are emitted only when the harness supplied them.
+      ...(reasoning_tokens !== undefined ? { reasoning_tokens } : {}),
+      ...(cached_input_tokens !== undefined ? { cached_input_tokens } : {}),
+      ...(inference_time_ms !== undefined ? { inference_time_ms } : {}),
+    },
+    timing: { startedAt, endedAt },
+  };
+}
+
+function safeStringify(value: unknown): string | undefined {
+  let text: string | undefined;
+  try {
+    text = JSON.stringify(value);
+  } catch { /* e.g. circular references or BigInt: text stays undefined */ }
+  return text;
+}
diff --git a/packages/evals/tests/framework/persistTrajectory.test.ts b/packages/evals/tests/framework/persistTrajectory.test.ts
new file mode 100644
index 000000000..c281ac996
--- /dev/null
+++ b/packages/evals/tests/framework/persistTrajectory.test.ts
@@ -0,0 +1,130 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { loadTrajectoryFromDisk } from "@browserbasehq/stagehand";
+import type {
+  EvaluationResult,
+  TaskSpec,
+  Trajectory,
+} from "@browserbasehq/stagehand";
+import { describe, expect, it } from "vitest";
+
+import { persistAdapterTrajectory } from "../../framework/harnesses/persistTrajectory.js";
+
+const PROBE_PNG = Buffer.from("fake-probe-bytes-1234", "utf8"); // placeholder bytes, not a real PNG
+const AGENT_PNG = Buffer.from("fake-agent-bytes-5678", "utf8"); // distinct from the probe bytes
+
+describe("persistAdapterTrajectory", () => {
+  it("round-trips probe and agent image evidence through loadTrajectoryFromDisk", async () => {
+    const tmpRoot = await fs.mkdtemp(
+      path.join(os.tmpdir(), "persist-adapter-roundtrip-"),
+    );
+
+    try {
+      const taskSpec: TaskSpec = {
+        id: "roundtrip-task",
+        instruction: "Test task",
+        initUrl: "https://example.com",
+      };
+      const evaluationResult: EvaluationResult = {
+        outcomeSuccess: true,
+        processScore: 1,
+        perCriterion: [],
+        evidenceInsufficient: [],
+      };
+      const { directory, persisted } = await persistAdapterTrajectory({
+        trajectory: makeTrajectory(taskSpec),
+        taskSpec,
+        evaluationResult,
+        outputRoot: tmpRoot,
+        runId: "roundtrip-run",
+        persist: true, // explicit override, so the env/CI gating is bypassed
+      });
+
+      expect(persisted).toBe(true);
+      await expect(fs.readdir(directory)).resolves.toEqual(
+        expect.arrayContaining([
+          "core.log",
+          "scores",
+          "screenshots",
+          "task_data.json",
+          "times.json",
+          "trajectory.json",
+        ]),
+      );
+      await expect(
+        fs.readFile(path.join(directory, "screenshots", "probe", "1.png")), // step index 0 → "1.png"
+      ).resolves.toEqual(PROBE_PNG);
+      await expect(
+        fs.readFile(path.join(directory, "screenshots", "agent", "1.png")), // single image: no "_<n>" suffix
+      ).resolves.toEqual(AGENT_PNG);
+      await expect(
+        fs.readFile(path.join(directory, "scores", "result.json"), "utf8"),
+      ).resolves.toContain('"outcomeSuccess": true');
+      await expect(
+        fs.readFile(path.join(directory, "task_data.json"), "utf8"),
+      ).resolves.toContain('"result"');
+
+      const loaded = await loadTrajectoryFromDisk(directory); // read back with the library's own loader
+      const step = loaded.steps[0];
+      const imageModality = step.agentEvidence.modalities.find(
+        (
+          modality,
+        ): modality is Extract<
+          (typeof step.agentEvidence.modalities)[number],
+          { type: "image" }
+        > => modality.type === "image",
+      );
+      const textModality = step.agentEvidence.modalities.find(
+        (
+          modality,
+        ): modality is Extract<
+          (typeof step.agentEvidence.modalities)[number],
+          { type: "text" }
+        > => modality.type === "text",
+      );
+
+      expect(step.probeEvidence.screenshot).toEqual(PROBE_PNG);
+      expect(imageModality?.bytes).toEqual(AGENT_PNG);
+      expect(imageModality?.mediaType).toBe("image/png");
+      expect(textModality?.content).toBe("navigated");
+    } finally {
+      await fs.rm(tmpRoot, { recursive: true, force: true }); // always remove the temp tree
+    }
+  });
+});
+
+function makeTrajectory(task: TaskSpec): Trajectory {
+  // Fixture: one "goto" step with text + image agent evidence and a probe shot.
+  const pageUrl = "https://example.com";
+  const runStart = "2026-05-15T10:00:00.000Z";
+  const onlyStep: Trajectory["steps"][number] = {
+    index: 0,
+    actionName: "goto",
+    actionArgs: { url: pageUrl },
+    reasoning: "Open the page.",
+    agentEvidence: {
+      modalities: [
+        { type: "text", content: "navigated" },
+        { type: "image", bytes: AGENT_PNG, mediaType: "image/png" },
+      ],
+    },
+    probeEvidence: {
+      url: pageUrl,
+      screenshot: PROBE_PNG,
+    },
+    toolOutput: { ok: true, result: { url: pageUrl } },
+    startedAt: runStart,
+    finishedAt: "2026-05-15T10:00:05.000Z",
+  };
+
+  return {
+    task,
+    status: "complete",
+    finalAnswer: "Final answer text.",
+    usage: { input_tokens: 100, output_tokens: 50 },
+    timing: { startedAt: runStart, endedAt: "2026-05-15T10:01:00.000Z" },
+    steps: [onlyStep],
+  };
+}