diff --git a/.gitignore b/.gitignore
index ec7d09add..a09d13c0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ ctrf/
 **/.playwright*/
 packages/evals/playwright-mcp-screenshot-*.png
 packages/evals/chrome-devtools-mcp-screenshot-*.png
+.trajectories/
diff --git a/packages/evals/framework/rubricCache.ts b/packages/evals/framework/rubricCache.ts
new file mode 100644
index 000000000..d7817fc93
--- /dev/null
+++ b/packages/evals/framework/rubricCache.ts
@@ -0,0 +1,132 @@
+/**
+ * Rubric cache — persists AI-generated rubrics so each task id generates its
+ * rubric once and hydrates it from disk on later runs.
+ *
+ * Used for any task whose dataset doesn't ship a precomputed_rubric
+ * (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its
+ * upstream dataset already carries rubrics.
+ *
+ * Cache layout:
+ *   packages/evals/.rubric-cache/<dataset>/<taskId>.json
+ *
+ * The cache key includes the task id and instruction hash to detect drift —
+ * if either changes, the rubric is regenerated rather than served from a
+ * stale cache.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import crypto from "node:crypto";
+
+import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand";
+
+export interface RubricCacheOptions {
+  /**
+   * Root directory for cached rubrics. Defaults to
+   * `packages/evals/.rubric-cache` under the current working directory.
+   */
+  cacheRoot?: string;
+  /**
+   * Dataset name, used as a subdirectory under cacheRoot to keep different
+   * datasets' rubrics separate (e.g., "onlineMind2Web").
+   */
+  dataset: string;
+}
+
+interface CacheEntry {
+  taskId: string;
+  instructionHash: string;
+  generatedAt: string;
+  rubric: Rubric;
+}
+
+function hashInstruction(instruction: string): string {
+  return crypto
+    .createHash("sha256")
+    .update(instruction)
+    .digest("hex")
+    .slice(0, 16);
+}
+
+export class RubricCache {
+  private readonly cacheDir: string;
+
+  constructor(opts: RubricCacheOptions) {
+    const root =
+      opts.cacheRoot ??
+      path.join(process.cwd(), "packages/evals/.rubric-cache");
+    this.cacheDir = path.join(root, opts.dataset);
+  }
+
+  /**
+   * Get or generate a rubric for the task. If a fresh cache entry exists
+   * (same instruction hash), returns it. Otherwise runs Step 0a and persists.
+   */
+  async getOrGenerate(
+    taskSpec: TaskSpec,
+    evaluator: V3Evaluator,
+  ): Promise<Rubric> {
+    const cached = await this.read(taskSpec);
+    if (cached) return cached;
+
+    const rubric = await evaluator.generateRubric(taskSpec);
+    await this.write(taskSpec, rubric);
+    return rubric;
+  }
+
+  /** Read a cached rubric. Returns undefined on miss or cache-key drift. */
+  async read(taskSpec: TaskSpec): Promise<Rubric | undefined> {
+    const file = this.entryPath(taskSpec.id);
+    let raw: string;
+    try {
+      raw = await fs.readFile(file, "utf8");
+    } catch {
+      return undefined;
+    }
+    let parsed: CacheEntry;
+    try {
+      parsed = JSON.parse(raw) as CacheEntry;
+    } catch {
+      return undefined;
+    }
+    if (parsed.taskId !== taskSpec.id) {
+      console.warn(
+        `[rubric-cache] task-id mismatch for ${taskSpec.id}; regenerating`,
+      );
+      return undefined;
+    }
+    const expectedHash = hashInstruction(taskSpec.instruction);
+    if (parsed.instructionHash !== expectedHash) {
+      // Drift detected — surface a clear log and miss.
+      console.warn(
+        `[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`,
+      );
+      return undefined;
+    }
+    return parsed.rubric;
+  }
+
+  async write(taskSpec: TaskSpec, rubric: Rubric): Promise<void> {
+    await fs.mkdir(this.cacheDir, { recursive: true });
+    const entry: CacheEntry = {
+      taskId: taskSpec.id,
+      instructionHash: hashInstruction(taskSpec.instruction),
+      generatedAt: new Date().toISOString(),
+      rubric,
+    };
+    await fs.writeFile(
+      this.entryPath(taskSpec.id),
+      JSON.stringify(entry, null, 2),
+    );
+  }
+
+  /** Wipe the cache directory (used by tests / `bench cache clear`). */
+  async clear(): Promise<void> {
+    await fs.rm(this.cacheDir, { recursive: true, force: true });
+  }
+
+  private entryPath(taskId: string): string {
+    // Sanitize task id for filesystem safety.
+    const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_");
+    return path.join(this.cacheDir, `${safe}.json`);
+  }
+}
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
new file mode 100644
index 000000000..5a6763390
--- /dev/null
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -0,0 +1,162 @@
+/**
+ * Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with
+ * the published WebTailBench `precomputed_rubric` field.
+ *
+ * This script fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins
+ * by `id`, writing back a JSONL where each row carries a
+ * `precomputed_rubric` field (parsed JSON object) alongside the existing
+ * `ques` / `web` / `category` / `id` fields.
+ *
+ * Run once after pulling the branch:
+ *   pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts
+ *
+ * Idempotent — safe to re-run; an existing precomputed_rubric on a row is
+ * overwritten with the latest upstream version.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+
+const HF_URL =
+  "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv";
+
+const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", "..");
+const JSONL_PATH = path.join(
+  REPO_ROOT,
+  "packages",
+  "evals",
+  "datasets",
+  "webtailbench",
+  "WebTailBench_data.jsonl",
+);
+
+interface Rubric {
+  items: Array<Record<string, unknown>>;
+}
+
+interface LocalRow {
+  id: string;
+  category?: string;
+  ques: string;
+  web?: string;
+  precomputed_rubric?: Rubric;
+}
+
+/**
+ * Parse a TSV file with simple double-quote escaping (the WebTailBench files
+ * use `""` for literal quotes inside quoted fields). Returns rows as arrays
+ * of column values; the caller maps to a schema.
+ */
+function parseTsv(text: string): string[][] {
+  const rows: string[][] = [];
+  const lines = text.split(/\r?\n/);
+  for (const raw of lines) {
+    if (!raw) continue;
+    // Each column is either quoted (with "" escapes) or unquoted plain text.
+    const cols: string[] = [];
+    let i = 0;
+    while (i < raw.length) {
+      if (raw[i] === "\t") {
+        cols.push("");
+        i++;
+        continue;
+      }
+      let col = "";
+      if (raw[i] === '"') {
+        i++;
+        while (i < raw.length) {
+          if (raw[i] === '"') {
+            if (raw[i + 1] === '"') {
+              col += '"';
+              i += 2;
+            } else {
+              i++;
+              break;
+            }
+          } else {
+            col += raw[i];
+            i++;
+          }
+        }
+      } else {
+        const tabIdx = raw.indexOf("\t", i);
+        if (tabIdx === -1) {
+          col = raw.slice(i);
+          i = raw.length;
+        } else {
+          col = raw.slice(i, tabIdx);
+          i = tabIdx;
+        }
+      }
+      cols.push(col);
+      if (raw[i] === "\t") i++;
+    }
+    rows.push(cols);
+  }
+  return rows;
+}
+
+async function main(): Promise<void> {
+  console.log(`▸ fetching ${HF_URL}`);
+  const res = await fetch(HF_URL);
+  if (!res.ok) {
+    throw new Error(`HF fetch failed: ${res.status} ${res.statusText}`);
+  }
+  const tsv = await res.text();
+  console.log(`  ✓ downloaded ${tsv.length} bytes`);
+
+  const rows = parseTsv(tsv);
+  const header = rows[0];
+  const idIdx = header.indexOf("id");
+  const rubricIdx = header.indexOf("precomputed_rubric");
+  if (idIdx === -1 || rubricIdx === -1) {
+    throw new Error(
+      `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`,
+    );
+  }
+
+  const rubricsById = new Map<string, Rubric>();
+  for (let i = 1; i < rows.length; i++) {
+    const cols = rows[i];
+    if (!cols[idIdx]) continue;
+    try {
+      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      rubricsById.set(cols[idIdx], parsed);
+    } catch (e) {
+      console.warn(
+        `  ! row ${i} (id=${cols[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`,
+      );
+    }
+  }
+  console.log(`  ✓ parsed ${rubricsById.size} rubrics`);
+
+  const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8");
+  const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0);
+  console.log(`▸ joining into ${inLines.length} local rows`);
+
+  let matched = 0;
+  let missing = 0;
+  const out: string[] = [];
+  for (const line of inLines) {
+    const row = JSON.parse(line) as LocalRow;
+    const rubric = rubricsById.get(row.id);
+    if (rubric) {
+      row.precomputed_rubric = rubric;
+      matched++;
+    } else {
+      missing++;
+    }
+    out.push(JSON.stringify(row));
+  }
+
+  console.log(
+    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+  );
+
+  await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
+  console.log(`✅ wrote ${JSONL_PATH}`);
+}
+
+main().catch((err) => {
+  console.error("❌ backfill failed:", err);
+  process.exit(1);
+});
diff --git a/packages/evals/tests/framework/rubricCache.test.ts b/packages/evals/tests/framework/rubricCache.test.ts
new file mode 100644
index 000000000..62afeee3d
--- /dev/null
+++ b/packages/evals/tests/framework/rubricCache.test.ts
@@ -0,0 +1,47 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+
+import { RubricCache } from "../../framework/rubricCache.js";
+
+describe("RubricCache", () => {
+  let tmpRoot = "";
+  let warn: ReturnType<typeof vi.spyOn>;
+
+  const rubric: Rubric = {
+    items: [
+      {
+        criterion: "criterion",
+        description: "description",
+        maxPoints: 1,
+      },
+    ],
+  };
+
+  beforeEach(async () => {
+    tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rubric-cache-test-"));
+    warn = vi.spyOn(console, "warn").mockImplementation(() => {});
+  });
+
+  afterEach(async () => {
+    warn.mockRestore();
+    await fs.rm(tmpRoot, { recursive: true, force: true });
+  });
+
+  it("misses when sanitized task ids collide but the stored task id differs", async () => {
+    const cache = new RubricCache({ cacheRoot: tmpRoot, dataset: "test" });
+    const taskA: TaskSpec = { id: "task/a", instruction: "same instruction" };
+    const taskB: TaskSpec = { id: "task:a", instruction: "same instruction" };
+
+    await cache.write(taskA, rubric);
+
+    await expect(cache.read(taskB)).resolves.toBeUndefined();
+    await expect(cache.read(taskA)).resolves.toEqual(rubric);
+    expect(warn).toHaveBeenCalledWith(
+      "[rubric-cache] task-id mismatch for task:a; regenerating",
+    );
+  });
+});
diff --git a/packages/evals/tests/tui/commandTree.test.ts b/packages/evals/tests/tui/commandTree.test.ts
index d70006c1b..378499711 100644
--- a/packages/evals/tests/tui/commandTree.test.ts
+++ b/packages/evals/tests/tui/commandTree.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it, vi } from "vitest";
 import {
+  buildCommandTree,
   dispatch,
   findChild,
   resolveCommand,
@@ -207,6 +208,15 @@ describe("findChild + walkPath", () => {
   });
 });
 
+describe("buildCommandTree", () => {
+  it("exposes verify as a root command", () => {
+    const tree = buildCommandTree();
+    expect(findChild(tree, "verify")?.summary).toBe(
+      "Re-score a saved trajectory",
+    );
+  });
+});
+
 // ---------------------------------------------------------------------------
 // resolveCommand
 // ---------------------------------------------------------------------------
diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts
index 36be3e1aa..9b2d3aa83 100644
--- a/packages/evals/tests/tui/run.test.ts
+++ b/packages/evals/tests/tui/run.test.ts
@@ -119,6 +119,7 @@ describe("deriveCategoryFilter", () => {
         envOverrides: {},
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -157,6 +158,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -204,6 +206,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -254,6 +257,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -315,6 +319,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -371,6 +376,7 @@
         envOverrides: {},
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -405,6 +411,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -447,6 +454,7 @@
         envOverrides: {},
         dryRun: false,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
diff --git a/packages/evals/tui/commandTree.ts b/packages/evals/tui/commandTree.ts
index 7cdc6deca..c9350eff0 100644
--- a/packages/evals/tui/commandTree.ts
+++ b/packages/evals/tui/commandTree.ts
@@ -3,7 +3,7 @@
  *
  * Models the user-visible command surface as a tree:
  *   root → run, list, new, config{path,set,reset,core{path,set,reset,setup}},
- *          experiments{list,show,open,compare}
+ *          experiments{list,show,open,compare}, verify, doctor
  *
  * Both the REPL (tui/repl.ts) and argv mode (cli.ts) build the same tree
 * via `buildCommandTree()` and dispatch user input through it. This is the
@@ -643,6 +643,17 @@ export function buildCommandTree(): CommandNode {
     },
   };
 
+  const verifyNode: CommandNode = {
+    name: "verify",
+    summary: "Re-score a saved trajectory",
+    printHelp: async () =>
+      (await import("./commands/verify.js")).printVerifyHelp(),
+    handler: async (args) => {
+      const { handleVerify } = await import("./commands/verify.js");
+      await handleVerify(args);
+    },
+  };
+
   const root: CommandNode = {
     name: "evals",
     summary: "Stagehand evals CLI",
@@ -653,6 +664,7 @@ export function buildCommandTree(): CommandNode {
       configNode,
       experimentsNode,
       newNode,
+      verifyNode,
       doctorNode,
     ],
   };
diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index d9cc20738..087e34d08 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -28,11 +28,12 @@ export function printHelp(): void {
       `${cyan("experiments")} ${dim("[subcommand]")}`,
       "Inspect and compare Braintrust experiment runs",
     ),
-    row(`${cyan("new")} ${dim("<category> <name>")}`, "Scaffold a new task"),
     row(
-      `${cyan("doctor")} ${dim("[--json]")}`,
-      "Health report (env keys, config, discovery)",
+      `${cyan("verify")} ${dim("<trajectory> [options]")}`,
+      "Re-score a saved trajectory",
     ),
+    row(`${cyan("doctor")} ${dim("| health")}`, "Health report"),
+    row(`${cyan("new")} ${dim("<category> <name>")}`, "Scaffold a new task"),
     row(cyan("help"), "Show this help"),
     row(cyan("clear"), "Clear the screen"),
     row(cyan("exit"), "Exit the REPL"),
@@ -101,6 +102,10 @@ export function printRunHelp(): void {
       `${cyan("--agent-modes")} ${dim("<list>")}`,
       `Stagehand mode matrix ${gray("(dom,hybrid,cua)")}`,
     ),
+    row(
+      `${cyan("--success")} ${dim("<mode>")}`,
+      `Rubric success mode ${gray("(outcome | process | both)")}`,
+    ),
     row(`${cyan("-l, --limit")} ${dim("<n>")}`, "Max cases to run"),
     row(`${cyan("-s, --sample")} ${dim("<n>")}`, "Random sample before limit"),
     row(
diff --git a/packages/evals/tui/commands/parse.ts b/packages/evals/tui/commands/parse.ts
index 28c842e43..d5d723403 100644
--- a/packages/evals/tui/commands/parse.ts
+++ b/packages/evals/tui/commands/parse.ts
@@ -38,10 +38,25 @@ export interface RunFlags {
   filter?: Array<[string, string]>;
   dryRun?: boolean;
   preview?: boolean;
+  /**
+   * Rubric success mode for the verifier — outcome | process | both.
+   * outcome (default): binary EvaluationResult.outcomeSuccess.
+   * process: EvaluationResult.processScore ≥ threshold.
+   * both: outcome AND process.
+   * Plumbed to bench tasks via the EVAL_SUCCESS_MODE env override.
+   */
+  success?: SuccessMode;
   /** Spawn the pre-refactor index.eval.ts runner instead of the unified path. */
   legacy?: boolean;
 }
 
+export type SuccessMode = "outcome" | "process" | "both";
+const SUCCESS_MODES: ReadonlySet<SuccessMode> = new Set([
+  "outcome",
+  "process",
+  "both",
+]);
+
 export interface ConfigDefaults {
   env?: string;
   trials?: number;
@@ -68,6 +83,8 @@ export interface ResolvedRunOptions {
   agentMode?: AgentToolMode;
   agentModes?: AgentToolMode[];
   datasetFilter?: string;
+  /** Rubric success mode forwarded to bench tasks via EVAL_SUCCESS_MODE. */
+  successMode: SuccessMode;
   envOverrides: Record<string, string>;
   dryRun: boolean;
   preview: boolean;
@@ -101,6 +118,7 @@ const VALUE_FLAGS = new Set([
   "agent-mode",
   "agent-modes",
   "filter",
+  "success",
 ]);
 
 const FLAG_ALIASES: Record<string, string> = {
@@ -261,6 +279,16 @@ export function parseRunArgs(tokens: string[]): RunFlags {
         filters.push(parseFilter(value));
         break;
       }
+      case "success": {
+        const v = value.toLowerCase() as SuccessMode;
+        if (!SUCCESS_MODES.has(v)) {
+          throw new Error(
+            `--success must be one of: outcome, process, both (got "${value}")`,
+          );
+        }
+        flags.success = v;
+        break;
+      }
       default:
         break;
     }
@@ -427,6 +455,16 @@ export function resolveRunOptions(
     envOverrides.EVAL_MODEL_OVERRIDE = model;
   }
 
+  // Success mode resolves from --success first, then EVAL_SUCCESS_MODE env,
+  // then "outcome".
+  const envSuccess = (env.EVAL_SUCCESS_MODE ?? "").toLowerCase();
+  const successMode: SuccessMode =
+    flags.success ??
+    (SUCCESS_MODES.has(envSuccess as SuccessMode)
+      ? (envSuccess as SuccessMode)
+      : "outcome");
+  envOverrides.EVAL_SUCCESS_MODE = successMode;
+
   return {
     target: flags.target,
     normalizedTarget: target,
@@ -442,6 +480,7 @@ export function resolveRunOptions(
     agentMode,
     agentModes,
     datasetFilter,
+    successMode,
     envOverrides,
     dryRun: flags.dryRun ?? false,
     preview: flags.preview ?? false,
diff --git a/packages/evals/tui/commands/verify.ts b/packages/evals/tui/commands/verify.ts
new file mode 100644
index 000000000..130d0694e
--- /dev/null
+++ b/packages/evals/tui/commands/verify.ts
@@ -0,0 +1,236 @@
+/**
+ * `evals verify <trajectory>` — re-score a saved trajectory offline.
+ *
+ * The verifier is browser-free: it consumes a hydrated Trajectory + TaskSpec
+ * and returns an EvaluationResult. This command reads the on-disk layout written by
+ * `TrajectoryRecorder.persist()` and feeds it through V3Evaluator.verify().
+ *
+ * Output: writes a new result file under `scores/result_