From 1265dca00c11bf353e6c0fd3e1eae4738d478c62 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:37:54 -0700
Subject: [PATCH 1/7] fix(verifier): bound failure step parsing

---
 packages/core/lib/v3/verifier/prompts/index.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/packages/core/lib/v3/verifier/prompts/index.ts b/packages/core/lib/v3/verifier/prompts/index.ts
index dd0b3ba75..8e15d9556 100644
--- a/packages/core/lib/v3/verifier/prompts/index.ts
+++ b/packages/core/lib/v3/verifier/prompts/index.ts
@@ -1,6 +1,4 @@
-/**
- * Verifier prompts used by the rubric-based verification pipeline.
- */
+/** Verifier prompts used by the rubric-based verification pipeline. */
 export { RUBRIC_GENERATION_PROMPT } from "./rubricGeneration.js";
 export { OUTCOME_VERIFICATION_PROMPT } from "./outcomeVerification.js";
 export { RUBRIC_RESCORING_PROMPT } from "./rubricRescoring.js";

From 3d1a1b1f0d923c7b722baaf8c3adf9daac00c3a2 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 13:49:04 -0700
Subject: [PATCH 2/7] feat(evals): add offline verifier CLI

---
 .gitignore                                    |   1 +
 packages/evals/cli.ts                         | 181 ++++++++-----
 packages/evals/framework/rubricCache.ts       | 129 ++++++++++
 .../scripts/backfill-webtailbench-rubrics.ts  | 163 ++++++++++++
 .../evals/scripts/verify-live-trajectory.ts   | 170 +++++++++++++
 packages/evals/tests/tui/run.test.ts          |   8 +
 packages/evals/tui/commands/help.ts           |   8 +-
 packages/evals/tui/commands/parse.ts          |  39 +++
 packages/evals/tui/commands/verify.ts         | 238 ++++++++++++++++++
 9 files changed, 872 insertions(+), 65 deletions(-)
 create mode 100644 packages/evals/framework/rubricCache.ts
 create mode 100644 packages/evals/scripts/backfill-webtailbench-rubrics.ts
 create mode 100644 packages/evals/scripts/verify-live-trajectory.ts
 create mode 100644 packages/evals/tui/commands/verify.ts

diff --git a/.gitignore b/.gitignore
index ec7d09add..a09d13c0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ ctrf/
 **/.playwright*/
 packages/evals/playwright-mcp-screenshot-*.png
 packages/evals/chrome-devtools-mcp-screenshot-*.png
+.trajectories/
diff --git a/packages/evals/cli.ts b/packages/evals/cli.ts
index 1cd9f8552..45226cabc 100644
--- a/packages/evals/cli.ts
+++ b/packages/evals/cli.ts
@@ -2,18 +2,13 @@
  * Evals CLI entry point.
  *
  * Modes:
- *   - `evals` (no args)              → interactive REPL
- *   - `evals --quiet` / `evals -q`   → REPL with no banner / welcome / inline warnings
- *   - `evals run <target> …`         → single-shot run with rich progress
- *   - `evals list [tier]`            → list discovered tasks
- *   - `evals config [sub]`           → print / get / set defaults
- *   - `evals experiments [sub]`      → inspect / compare Braintrust runs
- *   - `evals doctor` / `health`      → env-key + config + discovery health report
- *   - `evals new <tier> <cat> <name>`→ scaffold a task file
- *   - `evals help` / `-h`            → help
- *
- * Env vars:
- *   - EVALS_NO_WELCOME=1             → suppress first-run welcome panel (REPL only)
+ *   - `evals` (no args)          → interactive REPL
+ *   - `evals run <target> …`     → single-shot run with rich progress
+ *   - `evals list [tier]`        → list discovered tasks
+ *   - `evals config [sub]`       → print / get / set defaults
+ *   - `evals experiments [sub]`  → inspect / compare Braintrust runs
+ *   - `evals new <tier> <cat> <name>` → scaffold a task file
+ *   - `evals help` / `-h`        → help
  *
  * No child processes. All runs flow through framework/runEvals in-process.
  *
@@ -55,7 +50,6 @@ await (async () => {
 
 import { red } from "./tui/format.js";
 import { getCurrentDirPath, getRuntimeTasksRoot } from "./runtimePaths.js";
-import type { TaskRegistry } from "./framework/types.js";
 
 /**
  * Directory of the running entry module. Differs between source and
@@ -66,6 +60,13 @@ const ENTRY_DIR = getCurrentDirPath();
 const args = process.argv.slice(2);
 
 (async () => {
+  // Keep heavy command modules behind their command branches. The run stack
+  // imports Braintrust transitively, and importing it for `help`/`config path`
+  // makes quiet commands print optional OpenTelemetry warnings.
+  const { printHelp, printRunHelp, printListHelp, printNewHelp } = await import(
+    "./tui/commands/help.js"
+  );
+
   // Best-effort shutdown: flush Braintrust telemetry and exit with the
   // conventional signal code. Does not guarantee in-flight task
   // cancellation upstream; the goal is clean process shutdown with no
@@ -94,11 +95,6 @@ const args = process.argv.slice(2);
   process.on("SIGINT", () => void handleSignal("SIGINT"));
   process.on("SIGTERM", () => void handleSignal("SIGTERM"));
 
-  // REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are
-  // REPL-only (they suppress chrome); other args route to the argv switch.
-  const isQuietFlag = (a: string): boolean => a === "--quiet" || a === "-q";
-  const replLaunch = args.length === 0 || args.every(isQuietFlag);
-
   // Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress
   // handler that does cooperative-then-aggressive abort instead — this
   // path is only active when no arg-less REPL is running.
@@ -106,7 +102,7 @@ const args = process.argv.slice(2);
   // Note: raw mode disables the OS-level Ctrl+C → SIGINT translation,
   // so we forward it ourselves.
   let cleanupArgvInput = (): void => {};
-  if (!replLaunch && args.length > 0 && process.stdin.isTTY) {
+  if (args.length > 0 && process.stdin.isTTY) {
     const readline = await import("node:readline");
     const wasRaw = process.stdin.isRaw;
     readline.emitKeypressEvents(process.stdin);
@@ -127,63 +123,126 @@ const args = process.argv.slice(2);
     };
   }
 
-  // Whether to write the first-run marker in `finally`. Help-only paths and
-  // the doctor command don't count as "first uses" — they're discovery
-  // actions. The REPL marks itself. Set by the dispatch outcome below.
-  let shouldMarkFirstRun = false;
+  async function executeRun(tokens: string[]): Promise<void> {
+    const { readConfig } = await import("./tui/commands/config.js");
+    const { runCommand } = await import("./tui/commands/run.js");
+    const { parseRunArgs, resolveRunOptions } = await import(
+      "./tui/commands/parse.js"
+    );
+    const flags = parseRunArgs(tokens);
+    const configFile = readConfig(ENTRY_DIR);
+    const resolved = resolveRunOptions(
+      flags,
+      configFile.defaults,
+      process.env,
+      configFile.core,
+    );
+
+    if (flags.legacy) {
+      const { runLegacy } = await import("./tui/commands/legacy.js");
+      const { discoverTasks } = await import("./framework/discovery.js");
+      const registry = await discoverTasks(getRuntimeTasksRoot(), false);
+      await runLegacy(resolved, flags, registry);
+      return; // unreachable — runLegacy calls process.exit
+    }
+
+    await runCommand(resolved);
+  }
 
   try {
-    if (replLaunch) {
+    if (args.length === 0) {
       const { startRepl } = await import("./tui/repl.js");
-      const quiet = args.some(isQuietFlag);
-      await startRepl(ENTRY_DIR, { quiet });
+      await startRepl(ENTRY_DIR);
       return;
     }
 
-    const { buildCommandTree, dispatch, tokenizeArgv } = await import(
-      "./tui/commandTree.js"
-    );
+    const command = args[0].toLowerCase();
+    const subArgs = args.slice(1);
+    // Help is only triggered when `--help`/`-h`/`help` sits immediately
+    // after the command. Later positions are arguments or flag values and
+    // must not be swallowed (e.g. `evals run act --help` would otherwise
+    // print run help instead of erroring on the unknown `--help` flag).
+    const wantsHelp =
+      subArgs[0] === "--help" || subArgs[0] === "-h" || subArgs[0] === "help";
+
+    switch (command) {
+      case "run": {
+        if (wantsHelp) {
+          printRunHelp();
+          return;
+        }
+        await executeRun(subArgs);
+        return;
+      }
 
-    let registry: TaskRegistry | null = null;
-    const getRegistry = async (): Promise<TaskRegistry> => {
-      if (!registry) {
+      case "list": {
+        if (wantsHelp) {
+          printListHelp();
+          return;
+        }
+        const detailed =
+          subArgs.includes("--detailed") || subArgs.includes("-d");
+        const tierFilter = subArgs.find((a) => !a.startsWith("-"));
+        const tasksRoot = getRuntimeTasksRoot();
         const { discoverTasks } = await import("./framework/discovery.js");
-        registry = await discoverTasks(getRuntimeTasksRoot(), false);
+        const { printList } = await import("./tui/commands/list.js");
+        const registry = await discoverTasks(tasksRoot, false);
+        printList(registry, tierFilter, detailed);
+        return;
       }
-      return registry;
-    };
 
-    const tree = buildCommandTree();
-
-    const tokens = tokenizeArgv(args);
-    const outcome = await dispatch(tree, tokens, {
-      entryDir: ENTRY_DIR,
-      getRegistry,
-      setRegistry: (r) => {
-        registry = r;
-      },
-      abortRef: null,
-      contextPath: null,
-    });
-
-    // Only count real handler invocations as "first use". Doctor is a
-    // diagnostic, not a first use; help/meta paths are discovery.
-    if (outcome.kind === "ran") {
-      const top = outcome.absolutePath[0];
-      shouldMarkFirstRun = top !== "doctor";
+      case "config": {
+        const { handleConfig } = await import("./tui/commands/config.js");
+        await handleConfig(subArgs, ENTRY_DIR);
+        return;
+      }
+
+      case "experiments": {
+        const { handleExperiments } = await import(
+          "./tui/commands/experiments.js"
+        );
+        await handleExperiments(subArgs);
+        return;
+      }
+
+      case "new": {
+        if (wantsHelp) {
+          printNewHelp();
+          return;
+        }
+        const { scaffoldTask } = await import("./tui/commands/new.js");
+        scaffoldTask(subArgs);
+        return;
+      }
+
+      case "verify": {
+        const { handleVerify, printVerifyHelp } = await import(
+          "./tui/commands/verify.js"
+        );
+        if (wantsHelp) {
+          printVerifyHelp();
+          return;
+        }
+        await handleVerify(subArgs);
+        return;
+      }
+
+      case "help":
+      case "--help":
+      case "-h":
+        printHelp();
+        return;
+
+      default: {
+        // Unknown first arg → treat as run target: `evals act` == `evals run act`
+        await executeRun(args);
+        return;
+      }
     }
   } catch (err) {
     console.error(red(`Error: ${(err as Error).message}`));
     process.exitCode = 1;
   } finally {
-    if (shouldMarkFirstRun) {
-      try {
-        const { markFirstRunComplete } = await import("./tui/welcomeState.js");
-        markFirstRunComplete(ENTRY_DIR);
-      } catch {
-        // best-effort
-      }
-    }
     cleanupArgvInput();
   }
 })();
diff --git a/packages/evals/framework/rubricCache.ts b/packages/evals/framework/rubricCache.ts
new file mode 100644
index 000000000..88bdf8430
--- /dev/null
+++ b/packages/evals/framework/rubricCache.ts
@@ -0,0 +1,129 @@
+/**
+ * Rubric cache — persists AI-generated rubrics so we run Step 0a once per
+ * task id and hydrate from disk thereafter. Honors plan §10 Q3 (resolved:
+ * generate per-task on first run + cache).
+ *
+ * Used for any task whose dataset doesn't ship a precomputed_rubric
+ * (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its
+ * upstream dataset already carries rubrics.
+ *
+ * Cache layout:
+ *   packages/evals/.rubric-cache/<dataset>/<task-id>.json
+ *
+ * Each cache entry stores a hash of the task instruction to detect drift — if
+ * the instruction changes for the same task id, the rubric is regenerated
+ * rather than served from a stale cache.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import crypto from "node:crypto";
+
+import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand";
+
+export interface RubricCacheOptions {
+  /**
+   * Root directory for cached rubrics. Defaults to
+   * `<process.cwd()>/packages/evals/.rubric-cache`, so it depends on the cwd.
+   */
+  cacheRoot?: string;
+  /**
+   * Dataset name, used as a subdirectory under cacheRoot to keep different
+   * datasets' rubrics separate (e.g., "onlineMind2Web").
+   */
+  dataset: string;
+}
+
+interface CacheEntry {
+  taskId: string;
+  instructionHash: string;
+  generatedAt: string;
+  rubric: Rubric;
+}
+
+function hashInstruction(instruction: string): string {
+  return crypto
+    .createHash("sha256")
+    .update(instruction)
+    .digest("hex")
+    .slice(0, 16);
+}
+
+export class RubricCache {
+  private readonly cacheDir: string;
+
+  constructor(opts: RubricCacheOptions) {
+    const root =
+      opts.cacheRoot ??
+      path.join(process.cwd(), "packages/evals/.rubric-cache");
+    this.cacheDir = path.join(root, opts.dataset);
+  }
+
+  /**
+   * Get or generate a rubric for the task. If a fresh cache entry exists
+   * (same instruction hash), returns it. Otherwise runs Step 0a and persists.
+   */
+  async getOrGenerate(
+    taskSpec: TaskSpec,
+    evaluator: V3Evaluator,
+  ): Promise<Rubric> {
+    const cached = await this.read(taskSpec);
+    if (cached) return cached;
+
+    const rubric = await evaluator.generateRubric(taskSpec);
+    await this.write(taskSpec, rubric);
+    return rubric;
+  }
+
+  /**
+   * Read a cached rubric; undefined on miss, malformed entry, or hash drift.
+   */
+  async read(taskSpec: TaskSpec): Promise<Rubric | undefined> {
+    const file = this.entryPath(taskSpec.id);
+    let raw: string;
+    try {
+      raw = await fs.readFile(file, "utf8");
+    } catch {
+      return undefined;
+    }
+    let parsed: CacheEntry | null;
+    try {
+      parsed = JSON.parse(raw) as CacheEntry | null;
+    } catch {
+      return undefined;
+    }
+    const expectedHash = hashInstruction(taskSpec.instruction);
+    if (parsed?.instructionHash !== expectedHash) {
+      // Drift or a non-object entry (e.g. JSON `null`) — log and miss.
+      console.warn(
+        `[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`,
+      );
+      return undefined;
+    }
+    return parsed.rubric;
+  }
+
+  async write(taskSpec: TaskSpec, rubric: Rubric): Promise<void> {
+    await fs.mkdir(this.cacheDir, { recursive: true });
+    const entry: CacheEntry = {
+      taskId: taskSpec.id,
+      instructionHash: hashInstruction(taskSpec.instruction),
+      generatedAt: new Date().toISOString(),
+      rubric,
+    };
+    await fs.writeFile(
+      this.entryPath(taskSpec.id),
+      JSON.stringify(entry, null, 2),
+    );
+  }
+
+  /** Wipe the cache directory (used by tests / `bench cache clear`). */
+  async clear(): Promise<void> {
+    await fs.rm(this.cacheDir, { recursive: true, force: true });
+  }
+
+  private entryPath(taskId: string): string {
+    // Sanitize the id for the filesystem; distinct ids can collide on one file.
+    const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_");
+    return path.join(this.cacheDir, `${safe}.json`);
+  }
+}
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
new file mode 100644
index 000000000..c50dcce29
--- /dev/null
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -0,0 +1,163 @@
+/**
+ * Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with
+ * the upstream microsoft/WebTailBench `precomputed_rubric` field.
+ *
+ * The local JSONL was authored before fara released v1 rubrics. This script
+ * fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins by `id`,
+ * writing back a JSONL where each row carries a `precomputed_rubric` field
+ * (parsed JSON object) alongside the existing `ques` / `web` / `category` /
+ * `id` fields.
+ *
+ * Run once after pulling the branch:
+ *   pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts
+ *
+ * Idempotent — safe to re-run; an existing precomputed_rubric on a row is
+ * overwritten with the latest upstream version.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+
+const HF_URL =
+  "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv";
+
+const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", "..");
+const JSONL_PATH = path.join(
+  REPO_ROOT,
+  "packages",
+  "evals",
+  "datasets",
+  "webtailbench",
+  "WebTailBench_data.jsonl",
+);
+
+interface Rubric {
+  items: Array<Record<string, unknown>>;
+}
+
+interface LocalRow {
+  id: string;
+  category?: string;
+  ques: string;
+  web?: string;
+  precomputed_rubric?: Rubric;
+}
+
+/**
+ * Parse TSV text with simple double-quote escaping (WebTailBench files use `""`
+ * for literal quotes in quoted fields). Lines are split first, so quoted fields
+ * cannot span lines. Returns rows of column values; the caller maps to a schema.
+ */
+function parseTsv(text: string): string[][] {
+  const rows: string[][] = [];
+  const lines = text.split(/\r?\n/);
+  for (const raw of lines) {
+    if (!raw) continue;
+    // Each column is either quoted (with "" escapes) or unquoted plain text.
+    const cols: string[] = [];
+    let i = 0;
+    while (i < raw.length) {
+      if (raw[i] === "\t") {
+        cols.push("");
+        i++;
+        continue;
+      }
+      let col = "";
+      if (raw[i] === '"') {
+        i++;
+        while (i < raw.length) {
+          if (raw[i] === '"') {
+            if (raw[i + 1] === '"') {
+              col += '"';
+              i += 2;
+            } else {
+              i++;
+              break;
+            }
+          } else {
+            col += raw[i];
+            i++;
+          }
+        }
+      } else {
+        const tabIdx = raw.indexOf("\t", i);
+        if (tabIdx === -1) {
+          col = raw.slice(i);
+          i = raw.length;
+        } else {
+          col = raw.slice(i, tabIdx);
+          i = tabIdx;
+        }
+      }
+      cols.push(col);
+      if (raw[i] === "\t") i++;
+    }
+    rows.push(cols);
+  }
+  return rows;
+}
+
+async function main(): Promise<void> {
+  console.log(`▸ fetching ${HF_URL}`);
+  const res = await fetch(HF_URL);
+  if (!res.ok) {
+    throw new Error(`HF fetch failed: ${res.status} ${res.statusText}`);
+  }
+  const tsv = await res.text();
+  console.log(`  ✓ downloaded ${tsv.length} bytes`);
+
+  const rows = parseTsv(tsv);
+  const header = rows[0];
+  const idIdx = header.indexOf("id");
+  const rubricIdx = header.indexOf("precomputed_rubric");
+  if (idIdx === -1 || rubricIdx === -1) {
+    throw new Error(
+      `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`,
+    );
+  }
+
+  const rubricsById = new Map<string, Rubric>();
+  for (let i = 1; i < rows.length; i++) {
+    const cols = rows[i];
+    if (!cols[idIdx]) continue;
+    try {
+      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      rubricsById.set(cols[idIdx], parsed);
+    } catch (e) {
+      console.warn(
+        `  ! row ${i} (id=${cols[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`,
+      );
+    }
+  }
+  console.log(`  ✓ parsed ${rubricsById.size} rubrics`);
+
+  const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8");
+  const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0);
+  console.log(`▸ joining into ${inLines.length} local rows`);
+
+  let matched = 0;
+  let missing = 0;
+  const out: string[] = [];
+  for (const line of inLines) {
+    const row = JSON.parse(line) as LocalRow;
+    const rubric = rubricsById.get(row.id);
+    if (rubric) {
+      row.precomputed_rubric = rubric;
+      matched++;
+    } else {
+      missing++;
+    }
+    out.push(JSON.stringify(row));
+  }
+
+  console.log(
+    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+  );
+
+  await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
+  console.log(`✅ wrote ${JSONL_PATH}`);
+}
+
+main().catch((err) => {
+  console.error("❌ backfill failed:", err);
+  process.exit(1);
+});
diff --git a/packages/evals/scripts/verify-live-trajectory.ts b/packages/evals/scripts/verify-live-trajectory.ts
new file mode 100644
index 000000000..4f3acdc95
--- /dev/null
+++ b/packages/evals/scripts/verify-live-trajectory.ts
@@ -0,0 +1,170 @@
+/**
+ * Wave 0 end-to-end verification — runs a tiny live agent task and asserts the
+ * TrajectoryRecorder captures bus events from the real v3AgentHandler.
+ *
+ * Deliberately minimal: env=LOCAL (no Browserbase costs), 3 max steps, a stable
+ * destination, and a DOM-mode agent. The goal is to confirm bus event wiring,
+ * not to test agent capability.
+ *
+ *   pnpm tsx packages/evals/scripts/verify-live-trajectory.ts
+ *
+ * Requires one of GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env.
+ */
+import "dotenv/config";
+import assert from "node:assert/strict";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { V3, V3Evaluator } from "@browserbasehq/stagehand";
+import type { TaskSpec } from "@browserbasehq/stagehand";
+import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js";
+
+async function main(): Promise<void> {
+  const tmpRoot = await fs.mkdtemp(
+    path.join(os.tmpdir(), "verifier-rewrite-live-"),
+  );
+  console.log(`▸ tmpdir: ${tmpRoot}`);
+
+  const v3 = new V3({
+    env: "LOCAL",
+    verbose: 0,
+    model: "google/gemini-2.5-flash",
+  });
+  await v3.init();
+  console.log("  ✓ V3 initialized");
+
+  const page = v3.context.pages()[0];
+  await page.goto("https://example.com", { timeoutMs: 60_000 });
+  console.log("  ✓ navigated to example.com");
+
+  const taskSpec: TaskSpec = {
+    id: "live-example-com",
+    instruction: "Extract the heading text from example.com",
+    initUrl: "https://example.com",
+  };
+
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec,
+    outputRoot: tmpRoot,
+    runId: "live-run",
+    persist: true,
+  });
+  recorder.start();
+  console.log("  ✓ TrajectoryRecorder subscribed to bus");
+
+  const agent = v3.agent({
+    model: "google/gemini-2.5-flash",
+    mode: "dom",
+  });
+
+  const start = Date.now();
+  const result = await agent.execute({
+    instruction:
+      "Extract the main heading text on the current page using the extract tool, then call done with that text as the reasoning.",
+    maxSteps: 3,
+  });
+  console.log(`  ✓ agent.execute completed in ${Date.now() - start}ms`);
+  console.log(`    final message: "${result.message}"`);
+  console.log(`    actions: ${result.actions.length}`);
+
+  const trajectory = await recorder.finish({
+    status: "complete",
+    finalAnswer: result.message,
+    usage: result.usage,
+  });
+
+  await v3.close();
+  console.log("  ✓ V3 closed");
+
+  // ── Assertions ──────────────────────────────────────────────────────────
+  assert.ok(
+    trajectory.steps.length > 0,
+    `expected at least 1 trajectory step, got ${trajectory.steps.length}`,
+  );
+  console.log(`  ✓ trajectory has ${trajectory.steps.length} steps`);
+
+  const stepsWithScreenshot = trajectory.steps.filter(
+    (s) => s.probeEvidence.screenshotPath || s.probeEvidence.screenshot,
+  );
+  assert.ok(
+    stepsWithScreenshot.length > 0,
+    "expected at least one step with a probe screenshot",
+  );
+  console.log(
+    `  ✓ ${stepsWithScreenshot.length}/${trajectory.steps.length} steps carry a probe screenshot`,
+  );
+
+  const stepsWithUrl = trajectory.steps.filter(
+    (s) => typeof s.probeEvidence.url === "string" && s.probeEvidence.url,
+  );
+  assert.ok(
+    stepsWithUrl.length > 0,
+    "expected at least one step with a probe url",
+  );
+  console.log(
+    `  ✓ ${stepsWithUrl.length}/${trajectory.steps.length} steps carry a probe url`,
+  );
+
+  const stepsWithEvidence = trajectory.steps.filter(
+    (s) => s.agentEvidence.modalities.length > 0,
+  );
+  assert.ok(
+    stepsWithEvidence.length > 0,
+    "expected at least one step with tier-1 agent evidence modalities",
+  );
+  console.log(
+    `  ✓ ${stepsWithEvidence.length}/${trajectory.steps.length} steps carry tier-1 evidence`,
+  );
+
+  // ── On-disk layout ─────────────────────────────────────────────────────
+  const taskDir = path.join(tmpRoot, "live-run", "live-example-com");
+  const files = await fs.readdir(taskDir);
+  assert.ok(files.includes("trajectory.json"), "trajectory.json missing");
+  assert.ok(files.includes("task_data.json"), "task_data.json missing");
+  assert.ok(files.includes("times.json"), "times.json missing");
+  const screenshotFiles = files.filter((f) => f.startsWith("screenshot_"));
+  assert.ok(
+    screenshotFiles.length > 0,
+    "expected at least one persisted screenshot",
+  );
+  console.log(
+    `  ✓ on-disk: trajectory.json + task_data.json + times.json + ${screenshotFiles.length} screenshots`,
+  );
+
+  // ── verify() runs Wave 1 pipeline on the live trajectory ──────────────
+  console.log("\n▸ running V3Evaluator.verify() (Step 0a + Step 8)…");
+  const verdict = await new V3Evaluator(v3, { backend: "verifier" }).verify(
+    trajectory,
+    taskSpec,
+  );
+  console.log(
+    `  ✓ generated rubric with ${verdict.perCriterion.length} criteria`,
+  );
+  console.log(
+    `  ✓ outcomeSuccess=${verdict.outcomeSuccess}, processScore=${verdict.processScore}`,
+  );
+  assert.equal(typeof verdict.outcomeSuccess, "boolean");
+  assert.ok(
+    verdict.perCriterion.length > 0,
+    "expected generated rubric to have at least one criterion",
+  );
+  const raw = verdict.rawSteps as
+    | { primaryIntent?: string; rubricSource?: string }
+    | undefined;
+  assert.equal(raw?.rubricSource, "generated");
+  assert.ok(
+    typeof raw?.primaryIntent === "string" && raw.primaryIntent.length > 0,
+    "expected outcome verifier to populate primary_intent",
+  );
+  console.log(`    primary_intent: "${raw.primaryIntent.slice(0, 120)}"`);
+
+  console.log(`\n✅ Wave 0 live verification OK — trajectory at ${taskDir}`);
+  // Keep tmpdir for inspection; user can rm -rf if needed.
+}
+
+main().catch((err) => {
+  console.error("\n❌ Wave 0 live verification FAILED:", err);
+  process.exit(1);
+});
diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts
index 36be3e1aa..9b2d3aa83 100644
--- a/packages/evals/tests/tui/run.test.ts
+++ b/packages/evals/tests/tui/run.test.ts
@@ -119,6 +119,7 @@ describe("deriveCategoryFilter", () => {
         envOverrides: {},
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -157,6 +158,7 @@ describe("deriveCategoryFilter", () => {
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -204,6 +206,7 @@ describe("deriveCategoryFilter", () => {
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -254,6 +257,7 @@ describe("deriveCategoryFilter", () => {
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -315,6 +319,7 @@ describe("deriveCategoryFilter", () => {
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -371,6 +376,7 @@ describe("deriveCategoryFilter", () => {
           envOverrides: {},
           dryRun: true,
           preview: false,
+          successMode: "outcome",
           verbose: false,
         },
         registry,
@@ -405,6 +411,7 @@ describe("deriveCategoryFilter", () => {
           },
           dryRun: true,
           preview: false,
+          successMode: "outcome",
           verbose: false,
         },
         registry,
@@ -447,6 +454,7 @@ describe("deriveCategoryFilter", () => {
         envOverrides: {},
         dryRun: false,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index d9cc20738..439c2f61b 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -29,10 +29,6 @@ export function printHelp(): void {
       "Inspect and compare Braintrust experiment runs",
     ),
     row(`${cyan("new")} ${dim("<tier> <cat> <name>")}`, "Scaffold a new task"),
-    row(
-      `${cyan("doctor")} ${dim("[--json]")}`,
-      "Health report (env keys, config, discovery)",
-    ),
     row(cyan("help"), "Show this help"),
     row(cyan("clear"), "Clear the screen"),
     row(cyan("exit"), "Exit the REPL"),
@@ -101,6 +97,10 @@ export function printRunHelp(): void {
       `${cyan("--agent-modes")} ${dim("<csv>")}`,
       `Stagehand mode matrix ${gray("(dom,hybrid,cua)")}`,
     ),
+    row(
+      `${cyan("--success")} ${dim("<mode>")}`,
+      `Rubric success mode ${gray("(outcome | process | both)")}`,
+    ),
     row(`${cyan("-l, --limit")} ${dim("<n>")}`, "Max cases to run"),
     row(`${cyan("-s, --sample")} ${dim("<n>")}`, "Random sample before limit"),
     row(
diff --git a/packages/evals/tui/commands/parse.ts b/packages/evals/tui/commands/parse.ts
index 28c842e43..f793d3414 100644
--- a/packages/evals/tui/commands/parse.ts
+++ b/packages/evals/tui/commands/parse.ts
@@ -38,10 +38,25 @@ export interface RunFlags {
   filter?: Array<[string, string]>;
   dryRun?: boolean;
   preview?: boolean;
+  /**
+   * Rubric success mode for the verifier — outcome | process | both.
+   *   outcome (default): binary Verdict.outcomeSuccess (matches fara-7b's reported metric).
+   *   process: Verdict.processScore ≥ threshold.
+   *   both: outcome AND process.
+   * Plumbed to bench tasks via the EVAL_SUCCESS_MODE env override.
+   */
+  success?: SuccessMode;
   /** Spawn the pre-refactor index.eval.ts runner instead of the unified path. */
   legacy?: boolean;
 }
 
+export type SuccessMode = "outcome" | "process" | "both";
+const SUCCESS_MODES: ReadonlySet<SuccessMode> = new Set<SuccessMode>([
+  "outcome",
+  "process",
+  "both",
+]);
+
 export interface ConfigDefaults {
   env?: string;
   trials?: number;
@@ -68,6 +83,8 @@ export interface ResolvedRunOptions {
   agentMode?: AgentToolMode;
   agentModes?: AgentToolMode[];
   datasetFilter?: string;
+  /** Rubric success mode forwarded to bench tasks via EVAL_SUCCESS_MODE. */
+  successMode: SuccessMode;
   envOverrides: Record<string, string>;
   dryRun: boolean;
   preview: boolean;
@@ -101,6 +118,7 @@ const VALUE_FLAGS = new Set([
   "agent-mode",
   "agent-modes",
   "filter",
+  "success",
 ]);
 
 const FLAG_ALIASES: Record<string, string> = {
@@ -261,6 +279,16 @@ export function parseRunArgs(tokens: string[]): RunFlags {
           filters.push(parseFilter(value));
           break;
         }
+        case "success": {
+          const v = value.toLowerCase() as SuccessMode;
+          if (!SUCCESS_MODES.has(v)) {
+            throw new Error(
+              `--success must be one of: outcome, process, both (got "${value}")`,
+            );
+          }
+          flags.success = v;
+          break;
+        }
         default:
           break;
       }
@@ -427,6 +455,16 @@ export function resolveRunOptions(
     envOverrides.EVAL_MODEL_OVERRIDE = model;
   }
 
+  // Success mode resolves from --success first, then EVAL_SUCCESS_MODE env,
+  // then "outcome" (matches fara-7b's reported metric).
+  const envSuccess = (env.EVAL_SUCCESS_MODE ?? "").toLowerCase();
+  const successMode: SuccessMode =
+    flags.success ??
+    (SUCCESS_MODES.has(envSuccess as SuccessMode)
+      ? (envSuccess as SuccessMode)
+      : "outcome");
+  envOverrides.EVAL_SUCCESS_MODE = successMode;
+
   return {
     target: flags.target,
     normalizedTarget: target,
@@ -442,6 +480,7 @@ export function resolveRunOptions(
     agentMode,
     agentModes,
     datasetFilter,
+    successMode,
     envOverrides,
     dryRun: flags.dryRun ?? false,
     preview: flags.preview ?? false,
diff --git a/packages/evals/tui/commands/verify.ts b/packages/evals/tui/commands/verify.ts
new file mode 100644
index 000000000..2468d1786
--- /dev/null
+++ b/packages/evals/tui/commands/verify.ts
@@ -0,0 +1,238 @@
+/**
+ * `evals verify <trajectory-dir>` — re-score a saved trajectory offline.
+ *
+ * The verifier is browser-free: it consumes a hydrated Trajectory + TaskSpec
+ * and returns a Verdict. This command reads the on-disk layout written by
+ * `TrajectoryRecorder.persist()` (matching microsoft/fara's
+ * example_trajectory shape) and feeds it through V3Evaluator.verify().
+ *
+ * Output: writes a new verdict file under `scores/mmrubric_<label>.json` so
+ * re-runs don't clobber the original live-run verdict at `mmrubric_v1.json`.
+ *
+ * Velocity unlock: iterating on Step 6 prompts goes from "re-run a full
+ * agent loop on Browserbase" to "re-run one LLM call against a saved
+ * trajectory." Roughly 100× faster for prompt-tuning work.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+
+import {
+  V3,
+  V3Evaluator,
+  loadTrajectoryFromDisk,
+  nextVerdictFilename,
+  type AvailableModel,
+} from "@browserbasehq/stagehand";
+
+import { bold, cyan, dim, gray, green, red, yellow } from "../format.js";
+
+export interface VerifyOptions {
+  /** Absolute or cwd-relative path to a `<run-id>/<task-id>/` directory. */
+  trajectoryDir: string;
+  /** Override the verifier model. Defaults to whatever V3Evaluator picks. */
+  model?: string;
+  /** Label appended to the output verdict filename (default: timestamp). */
+  label?: string;
+  /** Emit machine-readable JSON to stdout instead of human summary. */
+  jsonOutput?: boolean;
+  /** Don't write to disk — print the verdict and exit. */
+  dryRun?: boolean;
+}
+
+export function printVerifyHelp(): void {
+  console.log(`
+${bold("evals verify")} ${dim("— re-score a saved trajectory offline")}
+
+  ${cyan("Usage")}
+    evals verify <trajectory-dir> [options]
+
+  ${cyan("Arguments")}
+    <trajectory-dir>       Path to a saved trajectory directory containing
+                           trajectory.json (typically under .trajectories/<run-id>/<task-id>/).
+
+  ${cyan("Options")}
+    --model <name>         Override the verifier LLM (default: V3Evaluator's default,
+                           currently google/gemini-2.5-flash).
+    --label <text>         Label appended to the output filename
+                           (default: rescore-<ISO timestamp>).
+                           File written to scores/mmrubric_<label>.json.
+    --json                 Emit the Verdict as JSON to stdout instead of a human summary.
+    --dry-run              Don't write to disk; print verdict and exit.
+    --help, -h             This message.
+
+  ${cyan("Examples")}
+    evals verify .trajectories/2026-05-11T06-47-09-697Z/united_13
+    evals verify .trajectories/<run>/<task> --model anthropic/claude-haiku-4-5 --label tuning-pass-1
+    evals verify .trajectories/<run>/<task> --json > verdict.json
+`);
+}
+
+interface ParsedArgs {
+  trajectoryDir?: string;
+  model?: string;
+  label?: string;
+  json?: boolean;
+  dryRun?: boolean;
+  help?: boolean;
+}
+
+function parseArgs(args: string[]): ParsedArgs {
+  const parsed: ParsedArgs = {};
+  for (let i = 0; i < args.length; i++) {
+    const a = args[i];
+    if (a === "--help" || a === "-h" || a === "help") {
+      parsed.help = true;
+    } else if (a === "--json") {
+      parsed.json = true;
+    } else if (a === "--dry-run") {
+      parsed.dryRun = true;
+    } else if (a === "--model") {
+      parsed.model = args[++i];
+    } else if (a === "--label") {
+      parsed.label = args[++i];
+    } else if (!a.startsWith("-") && !parsed.trajectoryDir) {
+      parsed.trajectoryDir = a;
+    } else {
+      throw new Error(
+        `Unknown argument: ${a}. Run 'evals verify --help' for usage.`,
+      );
+    }
+  }
+  return parsed;
+}
+
+export async function handleVerify(args: string[]): Promise<void> {
+  const parsed = parseArgs(args);
+  if (parsed.help || !parsed.trajectoryDir) {
+    printVerifyHelp();
+    if (!parsed.trajectoryDir) {
+      process.exitCode = parsed.help ? 0 : 1;
+    }
+    return;
+  }
+
+  const dir = path.resolve(parsed.trajectoryDir);
+  await assertTrajectoryDir(dir);
+
+  if (!parsed.json) {
+    console.log(`${cyan("▸")} loading trajectory from ${gray(dir)}`);
+  }
+  const trajectory = await loadTrajectoryFromDisk(dir);
+  if (!parsed.json) {
+    console.log(
+      `  ${green("✓")} ${trajectory.steps.length} steps · status=${trajectory.status} · task=${trajectory.task.id}`,
+    );
+  }
+
+  // ── Build a verifier without launching a browser ────────────────────────
+  // V3Evaluator.verify() only touches v3.logger (to construct an LLMProvider)
+  // and the verify(trajectory, taskSpec) call is pure. Constructing V3 without
+  // calling init() is safe and avoids any browser/Browserbase setup cost.
+  const v3 = new V3({
+    env: "LOCAL",
+    verbose: 0,
+    disableAPI: true,
+    ...(parsed.model ? { model: parsed.model as AvailableModel } : {}),
+  });
+
+  const evaluator = new V3Evaluator(v3, {
+    backend: "verifier",
+    ...(parsed.model ? { modelName: parsed.model as AvailableModel } : {}),
+  });
+
+  if (!parsed.json) {
+    console.log(
+      `${cyan("▸")} running V3Evaluator.verify()${parsed.model ? ` with model=${parsed.model}` : ""}`,
+    );
+  }
+  const startMs = Date.now();
+  const verdict = await evaluator.verify(trajectory, trajectory.task);
+  const elapsedMs = Date.now() - startMs;
+
+  if (parsed.json) {
+    process.stdout.write(JSON.stringify(verdict, null, 2) + "\n");
+    return;
+  }
+
+  // ── Human summary ──────────────────────────────────────────────────────
+  console.log(`  ${green("✓")} verified in ${(elapsedMs / 1000).toFixed(1)}s`);
+  console.log();
+  console.log(
+    `${bold("Verdict")}  outcomeSuccess=${verdict.outcomeSuccess}  processScore=${verdict.processScore.toFixed(3)}`,
+  );
+  console.log(
+    `${dim("        ")} criteria=${verdict.perCriterion.length}  evidenceInsufficient=${verdict.evidenceInsufficient.length}`,
+  );
+
+  if (verdict.perCriterion.length > 0) {
+    console.log();
+    console.log(bold("Per-criterion"));
+    for (const c of verdict.perCriterion) {
+      const earned = c.earnedPoints === null ? "—" : c.earnedPoints.toFixed(1);
+      const flag = c.evidenceInsufficient
+        ? ` ${yellow("[evidence_insufficient]")}`
+        : "";
+      console.log(`  ${cyan(earned)}/${c.maxPoints}  ${c.criterion}${flag}`);
+      if (c.justification) {
+        console.log(`    ${dim(c.justification.slice(0, 220))}`);
+      }
+    }
+  }
+
+  if (verdict.findings && verdict.findings.length > 0) {
+    console.log();
+    console.log(bold(`Findings (${verdict.findings.length})`));
+    for (const f of verdict.findings) {
+      const sev =
+        f.severity === "blocking"
+          ? red(`[${f.severity}]`)
+          : f.severity === "warning"
+            ? yellow(`[${f.severity}]`)
+            : dim(`[${f.severity}]`);
+      const steps = f.relatedSteps?.length
+        ? gray(` steps=[${f.relatedSteps.join(",")}]`)
+        : "";
+      console.log(`  ${sev} ${f.category}${steps}`);
+      console.log(`    ${f.description}`);
+      if (f.suggestedAction) {
+        console.log(`    ${green("→")} ${f.suggestedAction}`);
+      }
+    }
+  }
+
+  // ── Persist ────────────────────────────────────────────────────────────
+  if (parsed.dryRun) {
+    console.log();
+    console.log(dim("dry-run: verdict not written to disk"));
+    return;
+  }
+  const filename = nextVerdictFilename(parsed.label);
+  const outPath = path.join(dir, "scores", filename);
+  await fs.mkdir(path.dirname(outPath), { recursive: true });
+  await fs.writeFile(outPath, JSON.stringify(verdict, null, 2));
+  console.log();
+  console.log(
+    `${green("✓")} wrote ${cyan(path.relative(process.cwd(), outPath))}`,
+  );
+}
+
+async function assertTrajectoryDir(dir: string): Promise<void> {
+  try {
+    const stat = await fs.stat(dir);
+    if (!stat.isDirectory()) {
+      throw new Error(`${dir} is not a directory`);
+    }
+  } catch (e) {
+    if ((e as NodeJS.ErrnoException).code === "ENOENT") {
+      throw new Error(`Trajectory directory not found: ${dir}`);
+    }
+    throw e;
+  }
+  try {
+    await fs.access(path.join(dir, "trajectory.json"));
+  } catch {
+    throw new Error(
+      `Missing trajectory.json in ${dir}. Is this a valid trajectory directory?`,
+    );
+  }
+}

From 3069b2f17ce166b0ca57edb3db1a5cbfe51f2dcf Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:17:48 -0700
Subject: [PATCH 3/7] fix(evals): use camelCase raw verifier metadata

---
 packages/evals/scripts/verify-live-trajectory.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/evals/scripts/verify-live-trajectory.ts b/packages/evals/scripts/verify-live-trajectory.ts
index 4f3acdc95..c14c96b7f 100644
--- a/packages/evals/scripts/verify-live-trajectory.ts
+++ b/packages/evals/scripts/verify-live-trajectory.ts
@@ -156,9 +156,9 @@ async function main(): Promise<void> {
   assert.equal(raw?.rubricSource, "generated");
   assert.ok(
     typeof raw?.primaryIntent === "string" && raw.primaryIntent.length > 0,
-    "expected outcome verifier to populate primary_intent",
+    "expected outcome verifier to populate primaryIntent",
   );
-  console.log(`    primary_intent: "${raw.primaryIntent.slice(0, 120)}"`);
+  console.log(`    primaryIntent: "${raw.primaryIntent.slice(0, 120)}"`);
 
   console.log(`\n✅ Wave 0 live verification OK — trajectory at ${taskDir}`);
   // Keep tmpdir for inspection; user can rm -rf if needed.

From fd4b79737c494b468df75647868b73d307284169 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:40:02 -0700
Subject: [PATCH 4/7] fix(evals): restore command tree verifier cli

---
 packages/evals/cli.ts                         | 181 ++++++------------
 packages/evals/framework/rubricCache.ts       |  16 +-
 .../scripts/backfill-webtailbench-rubrics.ts  |  11 +-
 .../evals/tests/framework/rubricCache.test.ts |  47 +++++
 packages/evals/tests/tui/commandTree.test.ts  |  10 +
 packages/evals/tui/commandTree.ts             |  14 +-
 packages/evals/tui/commands/help.ts           |   4 +
 packages/evals/tui/commands/parse.ts          |   4 +-
 packages/evals/tui/commands/verify.ts         |   3 +-
 9 files changed, 153 insertions(+), 137 deletions(-)
 create mode 100644 packages/evals/tests/framework/rubricCache.test.ts

diff --git a/packages/evals/cli.ts b/packages/evals/cli.ts
index 45226cabc..1cd9f8552 100644
--- a/packages/evals/cli.ts
+++ b/packages/evals/cli.ts
@@ -2,13 +2,18 @@
  * Evals CLI entry point.
  *
  * Modes:
- *   - `evals` (no args)          → interactive REPL
- *   - `evals run <target> …`     → single-shot run with rich progress
- *   - `evals list [tier]`        → list discovered tasks
- *   - `evals config [sub]`       → print / get / set defaults
- *   - `evals experiments [sub]`  → inspect / compare Braintrust runs
- *   - `evals new <tier> <cat> <name>` → scaffold a task file
- *   - `evals help` / `-h`        → help
+ *   - `evals` (no args)              → interactive REPL
+ *   - `evals --quiet` / `evals -q`   → REPL with no banner / welcome / inline warnings
+ *   - `evals run <target> …`         → single-shot run with rich progress
+ *   - `evals list [tier]`            → list discovered tasks
+ *   - `evals config [sub]`           → print / get / set defaults
+ *   - `evals experiments [sub]`      → inspect / compare Braintrust runs
+ *   - `evals doctor` / `health`      → env-key + config + discovery health report
+ *   - `evals new <tier> <cat> <name>`→ scaffold a task file
+ *   - `evals help` / `-h`            → help
+ *
+ * Env vars:
+ *   - EVALS_NO_WELCOME=1             → suppress first-run welcome panel (REPL only)
  *
  * No child processes. All runs flow through framework/runEvals in-process.
  *
@@ -50,6 +55,7 @@ await (async () => {
 
 import { red } from "./tui/format.js";
 import { getCurrentDirPath, getRuntimeTasksRoot } from "./runtimePaths.js";
+import type { TaskRegistry } from "./framework/types.js";
 
 /**
  * Directory of the running entry module. Differs between source and
@@ -60,13 +66,6 @@ const ENTRY_DIR = getCurrentDirPath();
 const args = process.argv.slice(2);
 
 (async () => {
-  // Keep heavy command modules behind their command branches. The run stack
-  // imports Braintrust transitively, and importing it for `help`/`config path`
-  // makes quiet commands print optional OpenTelemetry warnings.
-  const { printHelp, printRunHelp, printListHelp, printNewHelp } = await import(
-    "./tui/commands/help.js"
-  );
-
   // Best-effort shutdown: flush Braintrust telemetry and exit with the
   // conventional signal code. Does not guarantee in-flight task
   // cancellation upstream; the goal is clean process shutdown with no
@@ -95,6 +94,11 @@ const args = process.argv.slice(2);
   process.on("SIGINT", () => void handleSignal("SIGINT"));
   process.on("SIGTERM", () => void handleSignal("SIGTERM"));
 
+  // REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are
+  // REPL-only (they suppress chrome); other args go to command-tree dispatch.
+  const isQuietFlag = (a: string): boolean => a === "--quiet" || a === "-q";
+  const replLaunch = args.length === 0 || args.every(isQuietFlag);
+
   // Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress
   // handler that does cooperative-then-aggressive abort instead — this
   // path is only active when no arg-less REPL is running.
@@ -102,7 +106,7 @@ const args = process.argv.slice(2);
   // Note: raw mode disables the OS-level Ctrl+C → SIGINT translation,
   // so we forward it ourselves.
   let cleanupArgvInput = (): void => {};
-  if (args.length > 0 && process.stdin.isTTY) {
+  if (!replLaunch && args.length > 0 && process.stdin.isTTY) {
     const readline = await import("node:readline");
     const wasRaw = process.stdin.isRaw;
     readline.emitKeypressEvents(process.stdin);
@@ -123,126 +127,63 @@ const args = process.argv.slice(2);
     };
   }
 
-  async function executeRun(tokens: string[]): Promise<void> {
-    const { readConfig } = await import("./tui/commands/config.js");
-    const { runCommand } = await import("./tui/commands/run.js");
-    const { parseRunArgs, resolveRunOptions } = await import(
-      "./tui/commands/parse.js"
-    );
-    const flags = parseRunArgs(tokens);
-    const configFile = readConfig(ENTRY_DIR);
-    const resolved = resolveRunOptions(
-      flags,
-      configFile.defaults,
-      process.env,
-      configFile.core,
-    );
-
-    if (flags.legacy) {
-      const { runLegacy } = await import("./tui/commands/legacy.js");
-      const { discoverTasks } = await import("./framework/discovery.js");
-      const registry = await discoverTasks(getRuntimeTasksRoot(), false);
-      await runLegacy(resolved, flags, registry);
-      return; // unreachable — runLegacy calls process.exit
-    }
-
-    await runCommand(resolved);
-  }
+  // Whether to write the first-run marker in `finally`. Help-only paths and
+  // the doctor command don't count as "first uses" — they're discovery
+  // actions. The REPL marks itself. Set by the dispatch outcome below.
+  let shouldMarkFirstRun = false;
 
   try {
-    if (args.length === 0) {
+    if (replLaunch) {
       const { startRepl } = await import("./tui/repl.js");
-      await startRepl(ENTRY_DIR);
+      const quiet = args.some(isQuietFlag);
+      await startRepl(ENTRY_DIR, { quiet });
       return;
     }
 
-    const command = args[0].toLowerCase();
-    const subArgs = args.slice(1);
-    // Help is only triggered when `--help`/`-h`/`help` sits immediately
-    // after the command. Later positions are arguments or flag values and
-    // must not be swallowed (e.g. `evals run act --help` would otherwise
-    // print run help instead of erroring on the unknown `--help` flag).
-    const wantsHelp =
-      subArgs[0] === "--help" || subArgs[0] === "-h" || subArgs[0] === "help";
-
-    switch (command) {
-      case "run": {
-        if (wantsHelp) {
-          printRunHelp();
-          return;
-        }
-        await executeRun(subArgs);
-        return;
-      }
+    const { buildCommandTree, dispatch, tokenizeArgv } = await import(
+      "./tui/commandTree.js"
+    );
 
-      case "list": {
-        if (wantsHelp) {
-          printListHelp();
-          return;
-        }
-        const detailed =
-          subArgs.includes("--detailed") || subArgs.includes("-d");
-        const tierFilter = subArgs.find((a) => !a.startsWith("-"));
-        const tasksRoot = getRuntimeTasksRoot();
+    let registry: TaskRegistry | null = null;
+    const getRegistry = async (): Promise<TaskRegistry> => {
+      if (!registry) {
         const { discoverTasks } = await import("./framework/discovery.js");
-        const { printList } = await import("./tui/commands/list.js");
-        const registry = await discoverTasks(tasksRoot, false);
-        printList(registry, tierFilter, detailed);
-        return;
-      }
-
-      case "config": {
-        const { handleConfig } = await import("./tui/commands/config.js");
-        await handleConfig(subArgs, ENTRY_DIR);
-        return;
-      }
-
-      case "experiments": {
-        const { handleExperiments } = await import(
-          "./tui/commands/experiments.js"
-        );
-        await handleExperiments(subArgs);
-        return;
-      }
-
-      case "new": {
-        if (wantsHelp) {
-          printNewHelp();
-          return;
-        }
-        const { scaffoldTask } = await import("./tui/commands/new.js");
-        scaffoldTask(subArgs);
-        return;
-      }
-
-      case "verify": {
-        const { handleVerify, printVerifyHelp } = await import(
-          "./tui/commands/verify.js"
-        );
-        if (wantsHelp) {
-          printVerifyHelp();
-          return;
-        }
-        await handleVerify(subArgs);
-        return;
+        registry = await discoverTasks(getRuntimeTasksRoot(), false);
       }
+      return registry;
+    };
 
-      case "help":
-      case "--help":
-      case "-h":
-        printHelp();
-        return;
-
-      default: {
-        // Unknown first arg → treat as run target: `evals act` == `evals run act`
-        await executeRun(args);
-        return;
-      }
+    const tree = buildCommandTree();
+
+    const tokens = tokenizeArgv(args);
+    const outcome = await dispatch(tree, tokens, {
+      entryDir: ENTRY_DIR,
+      getRegistry,
+      setRegistry: (r) => {
+        registry = r;
+      },
+      abortRef: null,
+      contextPath: null,
+    });
+
+    // Only count real handler invocations as "first use". Doctor is a
+    // diagnostic, not a first use; help/meta paths are discovery.
+    if (outcome.kind === "ran") {
+      const top = outcome.absolutePath[0];
+      shouldMarkFirstRun = top !== "doctor";
     }
   } catch (err) {
     console.error(red(`Error: ${(err as Error).message}`));
     process.exitCode = 1;
   } finally {
+    if (shouldMarkFirstRun) {
+      try {
+        const { markFirstRunComplete } = await import("./tui/welcomeState.js");
+        markFirstRunComplete(ENTRY_DIR);
+      } catch {
+        // best-effort
+      }
+    }
     cleanupArgvInput();
   }
 })();
diff --git a/packages/evals/framework/rubricCache.ts b/packages/evals/framework/rubricCache.ts
index 88bdf8430..622f0dfc9 100644
--- a/packages/evals/framework/rubricCache.ts
+++ b/packages/evals/framework/rubricCache.ts
@@ -10,9 +10,9 @@
  * Cache layout:
  *   packages/evals/.rubric-cache/<dataset>/<task-id>.json
  *
- * The cache key includes the task instruction hash to detect drift — if the
- * instruction changes for the same task id, the rubric is regenerated rather
- * than served from a stale cache.
+ * The cache key includes the task id and instruction hash to detect drift —
+ * if either changes, the rubric is regenerated rather than served from a
+ * stale cache.
  */
 import fs from "node:fs/promises";
 import path from "node:path";
@@ -74,9 +74,7 @@ export class RubricCache {
     return rubric;
   }
 
-  /**
-   * Read a cached rubric. Returns undefined on miss or instruction-hash drift.
-   */
+  /** Read a cached rubric. Returns undefined on miss or cache-key drift. */
   async read(taskSpec: TaskSpec): Promise<Rubric | undefined> {
     const file = this.entryPath(taskSpec.id);
     let raw: string;
@@ -91,6 +89,12 @@ export class RubricCache {
     } catch {
       return undefined;
     }
+    if (parsed.taskId !== taskSpec.id) {
+      console.warn(
+        `[rubric-cache] task-id mismatch for ${taskSpec.id}; regenerating`,
+      );
+      return undefined;
+    }
     const expectedHash = hashInstruction(taskSpec.instruction);
     if (parsed.instructionHash !== expectedHash) {
       // Drift detected — surface a clear log and miss.
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index c50dcce29..5a6763390 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -1,12 +1,11 @@
 /**
  * Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with
- * the upstream microsoft/WebTailBench `precomputed_rubric` field.
+ * the published WebTailBench `precomputed_rubric` field.
  *
- * The local JSONL was authored before fara released v1 rubrics. This script
- * fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins by `id`,
- * writing back a JSONL where each row carries a `precomputed_rubric` field
- * (parsed JSON object) alongside the existing `ques` / `web` / `category` /
- * `id` fields.
+ * This script fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins
+ * by `id`, writing back a JSONL where each row carries a
+ * `precomputed_rubric` field (parsed JSON object) alongside the existing
+ * `ques` / `web` / `category` / `id` fields.
  *
  * Run once after pulling the branch:
  *   pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts
diff --git a/packages/evals/tests/framework/rubricCache.test.ts b/packages/evals/tests/framework/rubricCache.test.ts
new file mode 100644
index 000000000..62afeee3d
--- /dev/null
+++ b/packages/evals/tests/framework/rubricCache.test.ts
@@ -0,0 +1,47 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+
+import { RubricCache } from "../../framework/rubricCache.js";
+
+describe("RubricCache", () => {
+  let tmpRoot = "";
+  let warn: ReturnType<typeof vi.spyOn>;
+
+  const rubric: Rubric = {
+    items: [
+      {
+        criterion: "criterion",
+        description: "description",
+        maxPoints: 1,
+      },
+    ],
+  };
+
+  beforeEach(async () => {
+    tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rubric-cache-test-"));
+    warn = vi.spyOn(console, "warn").mockImplementation(() => {});
+  });
+
+  afterEach(async () => {
+    warn.mockRestore();
+    await fs.rm(tmpRoot, { recursive: true, force: true });
+  });
+
+  it("misses when sanitized task ids collide but the stored task id differs", async () => {
+    const cache = new RubricCache({ cacheRoot: tmpRoot, dataset: "test" });
+    const taskA: TaskSpec = { id: "task/a", instruction: "same instruction" };
+    const taskB: TaskSpec = { id: "task:a", instruction: "same instruction" };
+
+    await cache.write(taskA, rubric);
+
+    await expect(cache.read(taskB)).resolves.toBeUndefined();
+    await expect(cache.read(taskA)).resolves.toEqual(rubric);
+    expect(warn).toHaveBeenCalledWith(
+      "[rubric-cache] task-id mismatch for task:a; regenerating",
+    );
+  });
+});
diff --git a/packages/evals/tests/tui/commandTree.test.ts b/packages/evals/tests/tui/commandTree.test.ts
index d70006c1b..378499711 100644
--- a/packages/evals/tests/tui/commandTree.test.ts
+++ b/packages/evals/tests/tui/commandTree.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it, vi } from "vitest";
 import {
+  buildCommandTree,
   dispatch,
   findChild,
   resolveCommand,
@@ -207,6 +208,15 @@ describe("findChild + walkPath", () => {
   });
 });
 
+describe("buildCommandTree", () => {
+  it("exposes verify as a root command", () => {
+    const tree = buildCommandTree();
+    expect(findChild(tree, "verify")?.summary).toBe(
+      "Re-score a saved trajectory",
+    );
+  });
+});
+
 // ---------------------------------------------------------------------------
 // resolveCommand
 // ---------------------------------------------------------------------------
diff --git a/packages/evals/tui/commandTree.ts b/packages/evals/tui/commandTree.ts
index 7cdc6deca..c9350eff0 100644
--- a/packages/evals/tui/commandTree.ts
+++ b/packages/evals/tui/commandTree.ts
@@ -3,7 +3,7 @@
  *
  * Models the user-visible command surface as a tree:
  *   root → run, list, new, config{path,set,reset,core{path,set,reset,setup}},
- *          experiments{list,show,open,compare}
+ *          experiments{list,show,open,compare}, verify, doctor
  *
  * Both the REPL (tui/repl.ts) and argv mode (cli.ts) build the same tree
  * via `buildCommandTree()` and dispatch user input through it. This is the
@@ -643,6 +643,17 @@ export function buildCommandTree(): CommandNode {
     },
   };
 
+  const verifyNode: CommandNode = {
+    name: "verify",
+    summary: "Re-score a saved trajectory",
+    printHelp: async () =>
+      (await import("./commands/verify.js")).printVerifyHelp(),
+    handler: async (args) => {
+      const { handleVerify } = await import("./commands/verify.js");
+      await handleVerify(args);
+    },
+  };
+
   const root: CommandNode = {
     name: "evals",
     summary: "Stagehand evals CLI",
@@ -653,6 +664,7 @@ export function buildCommandTree(): CommandNode {
       configNode,
       experimentsNode,
       newNode,
+      verifyNode,
       doctorNode,
     ],
   };
diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index 439c2f61b..14ca82810 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -28,6 +28,10 @@ export function printHelp(): void {
       `${cyan("experiments")} ${dim("[subcommand]")}`,
       "Inspect and compare Braintrust experiment runs",
     ),
+    row(
+      `${cyan("verify")} ${dim("<trajectory-dir> [options]")}`,
+      "Re-score a saved trajectory",
+    ),
     row(`${cyan("new")} ${dim("<tier> <cat> <name>")}`, "Scaffold a new task"),
     row(cyan("help"), "Show this help"),
     row(cyan("clear"), "Clear the screen"),
diff --git a/packages/evals/tui/commands/parse.ts b/packages/evals/tui/commands/parse.ts
index f793d3414..06a679d26 100644
--- a/packages/evals/tui/commands/parse.ts
+++ b/packages/evals/tui/commands/parse.ts
@@ -40,7 +40,7 @@ export interface RunFlags {
   preview?: boolean;
   /**
    * Rubric success mode for the verifier — outcome | process | both.
-   *   outcome (default): binary Verdict.outcomeSuccess (matches fara-7b's reported metric).
+   *   outcome (default): binary Verdict.outcomeSuccess.
    *   process: Verdict.processScore ≥ threshold.
    *   both: outcome AND process.
    * Plumbed to bench tasks via the EVAL_SUCCESS_MODE env override.
@@ -456,7 +456,7 @@ export function resolveRunOptions(
   }
 
   // Success mode resolves from --success first, then EVAL_SUCCESS_MODE env,
-  // then "outcome" (matches fara-7b's reported metric).
+  // then "outcome".
   const envSuccess = (env.EVAL_SUCCESS_MODE ?? "").toLowerCase();
   const successMode: SuccessMode =
     flags.success ??
diff --git a/packages/evals/tui/commands/verify.ts b/packages/evals/tui/commands/verify.ts
index 2468d1786..b7a36f116 100644
--- a/packages/evals/tui/commands/verify.ts
+++ b/packages/evals/tui/commands/verify.ts
@@ -3,8 +3,7 @@
  *
  * The verifier is browser-free: it consumes a hydrated Trajectory + TaskSpec
  * and returns a Verdict. This command reads the on-disk layout written by
- * `TrajectoryRecorder.persist()` (matching microsoft/fara's
- * example_trajectory shape) and feeds it through V3Evaluator.verify().
+ * `TrajectoryRecorder.persist()` and feeds it through V3Evaluator.verify().
  *
  * Output: writes a new verdict file under `scores/mmrubric_<label>.json` so
  * re-runs don't clobber the original live-run verdict at `mmrubric_v1.json`.

From 68583d04238003c58103986899b74eae662ea4fa Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:44:04 -0700
Subject: [PATCH 5/7] fix(evals): include doctor in restored help

---
 packages/evals/tui/commands/help.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index 14ca82810..087e34d08 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -32,6 +32,7 @@ export function printHelp(): void {
       `${cyan("verify")} ${dim("<trajectory-dir> [options]")}`,
       "Re-score a saved trajectory",
     ),
+    row(`${cyan("doctor")} ${dim("| health")}`, "Health report"),
     row(`${cyan("new")} ${dim("<tier> <cat> <name>")}`, "Scaffold a new task"),
     row(cyan("help"), "Show this help"),
     row(cyan("clear"), "Clear the screen"),

From 42a81b96658619b213fbff48719a01a8e7d0f5a8 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 15:20:12 -0700
Subject: [PATCH 6/7] docs(evals): remove rollout comments from offline
 verifier

---
 packages/evals/framework/rubricCache.ts          | 5 ++---
 packages/evals/scripts/verify-live-trajectory.ts | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/packages/evals/framework/rubricCache.ts b/packages/evals/framework/rubricCache.ts
index 622f0dfc9..d7817fc93 100644
--- a/packages/evals/framework/rubricCache.ts
+++ b/packages/evals/framework/rubricCache.ts
@@ -1,7 +1,6 @@
 /**
- * Rubric cache — persists AI-generated rubrics so we run Step 0a once per
- * task id and hydrate from disk thereafter. Honors plan §10 Q3 (resolved:
- * generate per-task on first run + cache).
+ * Rubric cache — persists AI-generated rubrics so each task id can hydrate
+ * from disk once its rubric has been generated.
  *
  * Used for any task whose dataset doesn't ship a precomputed_rubric
  * (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its
diff --git a/packages/evals/scripts/verify-live-trajectory.ts b/packages/evals/scripts/verify-live-trajectory.ts
index c14c96b7f..7d8ed7102 100644
--- a/packages/evals/scripts/verify-live-trajectory.ts
+++ b/packages/evals/scripts/verify-live-trajectory.ts
@@ -1,5 +1,5 @@
 /**
- * Wave 0 end-to-end verification — runs a tiny live agent task and asserts the
+ * End-to-end verification — runs a tiny live agent task and asserts the
  * TrajectoryRecorder captures bus events from the real v3AgentHandler.
  *
  * Deliberately minimal: env=LOCAL (no Browserbase costs), 3 max steps, a stable
@@ -133,7 +133,7 @@ async function main(): Promise<void> {
     `  ✓ on-disk: trajectory.json + task_data.json + times.json + ${screenshotFiles.length} screenshots`,
   );
 
-  // ── verify() runs Wave 1 pipeline on the live trajectory ──────────────
+  // ── verify() runs the verifier pipeline on the live trajectory ────────
   console.log("\n▸ running V3Evaluator.verify() (Step 0a + Step 8)…");
   const verdict = await new V3Evaluator(v3, { backend: "verifier" }).verify(
     trajectory,
@@ -160,11 +160,11 @@ async function main(): Promise<void> {
   );
   console.log(`    primaryIntent: "${raw.primaryIntent.slice(0, 120)}"`);
 
-  console.log(`\n✅ Wave 0 live verification OK — trajectory at ${taskDir}`);
+  console.log(`\n✅ Live verification OK — trajectory at ${taskDir}`);
   // Keep tmpdir for inspection; user can rm -rf if needed.
 }
 
 main().catch((err) => {
-  console.error("\n❌ Wave 0 live verification FAILED:", err);
+  console.error("\n❌ Live verification FAILED:", err);
   process.exit(1);
 });

From 4f141e7cd8c1b366836d75b12283f8b224eb71c9 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:40:50 -0700
Subject: [PATCH 7/7] fix(evals): align offline verifier result naming

---
 .../core/lib/v3/verifier/prompts/index.ts     |   4 +-
 .../evals/scripts/verify-live-trajectory.ts   | 170 ------------------
 packages/evals/tui/commands/parse.ts          |   4 +-
 packages/evals/tui/commands/verify.ts         |  55 +++---
 4 files changed, 32 insertions(+), 201 deletions(-)
 delete mode 100644 packages/evals/scripts/verify-live-trajectory.ts

diff --git a/packages/core/lib/v3/verifier/prompts/index.ts b/packages/core/lib/v3/verifier/prompts/index.ts
index 8e15d9556..dd0b3ba75 100644
--- a/packages/core/lib/v3/verifier/prompts/index.ts
+++ b/packages/core/lib/v3/verifier/prompts/index.ts
@@ -1,4 +1,6 @@
-/** Verifier prompts used by the rubric-based verification pipeline. */
+/**
+ * Verifier prompts used by the rubric-based verification pipeline.
+ */
 export { RUBRIC_GENERATION_PROMPT } from "./rubricGeneration.js";
 export { OUTCOME_VERIFICATION_PROMPT } from "./outcomeVerification.js";
 export { RUBRIC_RESCORING_PROMPT } from "./rubricRescoring.js";
diff --git a/packages/evals/scripts/verify-live-trajectory.ts b/packages/evals/scripts/verify-live-trajectory.ts
deleted file mode 100644
index 7d8ed7102..000000000
--- a/packages/evals/scripts/verify-live-trajectory.ts
+++ /dev/null
@@ -1,170 +0,0 @@
-/**
- * End-to-end verification — runs a tiny live agent task and asserts the
- * TrajectoryRecorder captures bus events from the real v3AgentHandler.
- *
- * Deliberately minimal: env=LOCAL (no Browserbase costs), 3 max steps, a stable
- * destination, and a DOM-mode agent. The goal is to confirm bus event wiring,
- * not to test agent capability.
- *
- *   pnpm tsx packages/evals/scripts/verify-live-trajectory.ts
- *
- * Requires one of GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env.
- */
-import "dotenv/config";
-import assert from "node:assert/strict";
-import fs from "node:fs/promises";
-import os from "node:os";
-import path from "node:path";
-
-import { V3, V3Evaluator } from "@browserbasehq/stagehand";
-import type { TaskSpec } from "@browserbasehq/stagehand";
-import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js";
-
-async function main(): Promise<void> {
-  const tmpRoot = await fs.mkdtemp(
-    path.join(os.tmpdir(), "verifier-rewrite-live-"),
-  );
-  console.log(`▸ tmpdir: ${tmpRoot}`);
-
-  const v3 = new V3({
-    env: "LOCAL",
-    verbose: 0,
-    model: "google/gemini-2.5-flash",
-  });
-  await v3.init();
-  console.log("  ✓ V3 initialized");
-
-  const page = v3.context.pages()[0];
-  await page.goto("https://example.com", { timeoutMs: 60_000 });
-  console.log("  ✓ navigated to example.com");
-
-  const taskSpec: TaskSpec = {
-    id: "live-example-com",
-    instruction: "Extract the heading text from example.com",
-    initUrl: "https://example.com",
-  };
-
-  const recorder = new TrajectoryRecorder({
-    v3,
-    taskSpec,
-    outputRoot: tmpRoot,
-    runId: "live-run",
-    persist: true,
-  });
-  recorder.start();
-  console.log("  ✓ TrajectoryRecorder subscribed to bus");
-
-  const agent = v3.agent({
-    model: "google/gemini-2.5-flash",
-    mode: "dom",
-  });
-
-  const start = Date.now();
-  const result = await agent.execute({
-    instruction:
-      "Extract the main heading text on the current page using the extract tool, then call done with that text as the reasoning.",
-    maxSteps: 3,
-  });
-  console.log(`  ✓ agent.execute completed in ${Date.now() - start}ms`);
-  console.log(`    final message: "${result.message}"`);
-  console.log(`    actions: ${result.actions.length}`);
-
-  const trajectory = await recorder.finish({
-    status: "complete",
-    finalAnswer: result.message,
-    usage: result.usage,
-  });
-
-  await v3.close();
-  console.log("  ✓ V3 closed");
-
-  // ── Assertions ──────────────────────────────────────────────────────────
-  assert.ok(
-    trajectory.steps.length > 0,
-    `expected at least 1 trajectory step, got ${trajectory.steps.length}`,
-  );
-  console.log(`  ✓ trajectory has ${trajectory.steps.length} steps`);
-
-  const stepsWithScreenshot = trajectory.steps.filter(
-    (s) => s.probeEvidence.screenshotPath || s.probeEvidence.screenshot,
-  );
-  assert.ok(
-    stepsWithScreenshot.length > 0,
-    "expected at least one step with a probe screenshot",
-  );
-  console.log(
-    `  ✓ ${stepsWithScreenshot.length}/${trajectory.steps.length} steps carry a probe screenshot`,
-  );
-
-  const stepsWithUrl = trajectory.steps.filter(
-    (s) => typeof s.probeEvidence.url === "string" && s.probeEvidence.url,
-  );
-  assert.ok(
-    stepsWithUrl.length > 0,
-    "expected at least one step with a probe url",
-  );
-  console.log(
-    `  ✓ ${stepsWithUrl.length}/${trajectory.steps.length} steps carry a probe url`,
-  );
-
-  const stepsWithEvidence = trajectory.steps.filter(
-    (s) => s.agentEvidence.modalities.length > 0,
-  );
-  assert.ok(
-    stepsWithEvidence.length > 0,
-    "expected at least one step with tier-1 agent evidence modalities",
-  );
-  console.log(
-    `  ✓ ${stepsWithEvidence.length}/${trajectory.steps.length} steps carry tier-1 evidence`,
-  );
-
-  // ── On-disk layout ─────────────────────────────────────────────────────
-  const taskDir = path.join(tmpRoot, "live-run", "live-example-com");
-  const files = await fs.readdir(taskDir);
-  assert.ok(files.includes("trajectory.json"), "trajectory.json missing");
-  assert.ok(files.includes("task_data.json"), "task_data.json missing");
-  assert.ok(files.includes("times.json"), "times.json missing");
-  const screenshotFiles = files.filter((f) => f.startsWith("screenshot_"));
-  assert.ok(
-    screenshotFiles.length > 0,
-    "expected at least one persisted screenshot",
-  );
-  console.log(
-    `  ✓ on-disk: trajectory.json + task_data.json + times.json + ${screenshotFiles.length} screenshots`,
-  );
-
-  // ── verify() runs the verifier pipeline on the live trajectory ────────
-  console.log("\n▸ running V3Evaluator.verify() (Step 0a + Step 8)…");
-  const verdict = await new V3Evaluator(v3, { backend: "verifier" }).verify(
-    trajectory,
-    taskSpec,
-  );
-  console.log(
-    `  ✓ generated rubric with ${verdict.perCriterion.length} criteria`,
-  );
-  console.log(
-    `  ✓ outcomeSuccess=${verdict.outcomeSuccess}, processScore=${verdict.processScore}`,
-  );
-  assert.equal(typeof verdict.outcomeSuccess, "boolean");
-  assert.ok(
-    verdict.perCriterion.length > 0,
-    "expected generated rubric to have at least one criterion",
-  );
-  const raw = verdict.rawSteps as
-    | { primaryIntent?: string; rubricSource?: string }
-    | undefined;
-  assert.equal(raw?.rubricSource, "generated");
-  assert.ok(
-    typeof raw?.primaryIntent === "string" && raw.primaryIntent.length > 0,
-    "expected outcome verifier to populate primaryIntent",
-  );
-  console.log(`    primaryIntent: "${raw.primaryIntent.slice(0, 120)}"`);
-
-  console.log(`\n✅ Live verification OK — trajectory at ${taskDir}`);
-  // Keep tmpdir for inspection; user can rm -rf if needed.
-}
-
-main().catch((err) => {
-  console.error("\n❌ Live verification FAILED:", err);
-  process.exit(1);
-});
diff --git a/packages/evals/tui/commands/parse.ts b/packages/evals/tui/commands/parse.ts
index 06a679d26..d5d723403 100644
--- a/packages/evals/tui/commands/parse.ts
+++ b/packages/evals/tui/commands/parse.ts
@@ -40,8 +40,8 @@ export interface RunFlags {
   preview?: boolean;
   /**
    * Rubric success mode for the verifier — outcome | process | both.
-   *   outcome (default): binary Verdict.outcomeSuccess.
-   *   process: Verdict.processScore ≥ threshold.
+   *   outcome (default): binary EvaluationResult.outcomeSuccess.
+   *   process: EvaluationResult.processScore ≥ threshold.
    *   both: outcome AND process.
    * Plumbed to bench tasks via the EVAL_SUCCESS_MODE env override.
    */
diff --git a/packages/evals/tui/commands/verify.ts b/packages/evals/tui/commands/verify.ts
index b7a36f116..130d0694e 100644
--- a/packages/evals/tui/commands/verify.ts
+++ b/packages/evals/tui/commands/verify.ts
@@ -2,15 +2,10 @@
  * `evals verify <trajectory-dir>` — re-score a saved trajectory offline.
  *
  * The verifier is browser-free: it consumes a hydrated Trajectory + TaskSpec
- * and returns a Verdict. This command reads the on-disk layout written by
+ * and returns an EvaluationResult. This command reads the on-disk layout written by
  * `TrajectoryRecorder.persist()` and feeds it through V3Evaluator.verify().
  *
- * Output: writes a new verdict file under `scores/mmrubric_<label>.json` so
- * re-runs don't clobber the original live-run verdict at `mmrubric_v1.json`.
- *
- * Velocity unlock: iterating on Step 6 prompts goes from "re-run a full
- * agent loop on Browserbase" to "re-run one LLM call against a saved
- * trajectory." Roughly 100× faster for prompt-tuning work.
+ * Output: writes a new result file under `scores/result_<label>.json`.
  */
 import fs from "node:fs/promises";
 import path from "node:path";
@@ -19,7 +14,7 @@ import {
   V3,
   V3Evaluator,
   loadTrajectoryFromDisk,
-  nextVerdictFilename,
+  nextResultFilename,
   type AvailableModel,
 } from "@browserbasehq/stagehand";
 
@@ -30,11 +25,11 @@ export interface VerifyOptions {
   trajectoryDir: string;
   /** Override the verifier model. Defaults to whatever V3Evaluator picks. */
   model?: string;
-  /** Label appended to the output verdict filename (default: timestamp). */
+  /** Label appended to the output result filename (default: rescore-<ISO timestamp>). */
   label?: string;
   /** Emit machine-readable JSON to stdout instead of human summary. */
   jsonOutput?: boolean;
-  /** Don't write to disk — print the verdict and exit. */
+  /** Don't write to disk — print the result and exit. */
   dryRun?: boolean;
 }
 
@@ -54,15 +49,15 @@ ${bold("evals verify")} ${dim("— re-score a saved trajectory offline")}
                            currently google/gemini-2.5-flash).
     --label <text>         Label appended to the output filename
                            (default: rescore-<ISO timestamp>).
-                           File written to scores/mmrubric_<label>.json.
-    --json                 Emit the Verdict as JSON to stdout instead of a human summary.
-    --dry-run              Don't write to disk; print verdict and exit.
+                           File written to scores/result_<label>.json.
+    --json                 Emit the result as JSON to stdout instead of a human summary.
+    --dry-run              Don't write to disk; print result and exit.
     --help, -h             This message.
 
   ${cyan("Examples")}
     evals verify .trajectories/2026-05-11T06-47-09-697Z/united_13
     evals verify .trajectories/<run>/<task> --model anthropic/claude-haiku-4-5 --label tuning-pass-1
-    evals verify .trajectories/<run>/<task> --json > verdict.json
+    evals verify .trajectories/<run>/<task> --json > result.json
 `);
 }
 
@@ -145,43 +140,47 @@ export async function handleVerify(args: string[]): Promise<void> {
     );
   }
   const startMs = Date.now();
-  const verdict = await evaluator.verify(trajectory, trajectory.task);
+  const result = await evaluator.verify(trajectory, trajectory.task);
   const elapsedMs = Date.now() - startMs;
 
   if (parsed.json) {
-    process.stdout.write(JSON.stringify(verdict, null, 2) + "\n");
+    process.stdout.write(JSON.stringify(result, null, 2) + "\n");
     return;
   }
 
   // ── Human summary ──────────────────────────────────────────────────────
   console.log(`  ${green("✓")} verified in ${(elapsedMs / 1000).toFixed(1)}s`);
   console.log();
+  const processScore =
+    result.processScore === undefined ? "n/a" : result.processScore.toFixed(3);
   console.log(
-    `${bold("Verdict")}  outcomeSuccess=${verdict.outcomeSuccess}  processScore=${verdict.processScore.toFixed(3)}`,
+    `${bold("Result")}  outcomeSuccess=${result.outcomeSuccess}  processScore=${processScore}`,
   );
+  const perCriterion = result.perCriterion ?? [];
+  const evidenceInsufficient = result.evidenceInsufficient ?? [];
   console.log(
-    `${dim("        ")} criteria=${verdict.perCriterion.length}  evidenceInsufficient=${verdict.evidenceInsufficient.length}`,
+    `${dim("       ")} criteria=${perCriterion.length}  evidenceInsufficient=${evidenceInsufficient.length}`,
   );
 
-  if (verdict.perCriterion.length > 0) {
+  if (perCriterion.length > 0) {
     console.log();
     console.log(bold("Per-criterion"));
-    for (const c of verdict.perCriterion) {
+    for (const c of perCriterion) {
       const earned = c.earnedPoints === null ? "—" : c.earnedPoints.toFixed(1);
       const flag = c.evidenceInsufficient
         ? ` ${yellow("[evidence_insufficient]")}`
         : "";
       console.log(`  ${cyan(earned)}/${c.maxPoints}  ${c.criterion}${flag}`);
-      if (c.justification) {
-        console.log(`    ${dim(c.justification.slice(0, 220))}`);
+      if (c.explanation) {
+        console.log(`    ${dim(c.explanation.slice(0, 220))}`);
       }
     }
   }
 
-  if (verdict.findings && verdict.findings.length > 0) {
+  if (result.findings && result.findings.length > 0) {
     console.log();
-    console.log(bold(`Findings (${verdict.findings.length})`));
-    for (const f of verdict.findings) {
+    console.log(bold(`Findings (${result.findings.length})`));
+    for (const f of result.findings) {
       const sev =
         f.severity === "blocking"
           ? red(`[${f.severity}]`)
@@ -202,13 +201,13 @@ export async function handleVerify(args: string[]): Promise<void> {
   // ── Persist ────────────────────────────────────────────────────────────
   if (parsed.dryRun) {
     console.log();
-    console.log(dim("dry-run: verdict not written to disk"));
+    console.log(dim("dry-run: result not written to disk"));
     return;
   }
-  const filename = nextVerdictFilename(parsed.label);
+  const filename = nextResultFilename(parsed.label);
   const outPath = path.join(dir, "scores", filename);
   await fs.mkdir(path.dirname(outPath), { recursive: true });
-  await fs.writeFile(outPath, JSON.stringify(verdict, null, 2));
+  await fs.writeFile(outPath, JSON.stringify(result, null, 2));
   console.log();
   console.log(
     `${green("✓")} wrote ${cyan(path.relative(process.cwd(), outPath))}`,