From 1265dca00c11bf353e6c0fd3e1eae4738d478c62 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:37:54 -0700 Subject: [PATCH 1/7] fix(verifier): bound failure step parsing --- packages/core/lib/v3/verifier/prompts/index.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/core/lib/v3/verifier/prompts/index.ts b/packages/core/lib/v3/verifier/prompts/index.ts index dd0b3ba75..8e15d9556 100644 --- a/packages/core/lib/v3/verifier/prompts/index.ts +++ b/packages/core/lib/v3/verifier/prompts/index.ts @@ -1,6 +1,4 @@ -/** - * Verifier prompts used by the rubric-based verification pipeline. - */ +/** Verifier prompts used by the rubric-based verification pipeline. */ export { RUBRIC_GENERATION_PROMPT } from "./rubricGeneration.js"; export { OUTCOME_VERIFICATION_PROMPT } from "./outcomeVerification.js"; export { RUBRIC_RESCORING_PROMPT } from "./rubricRescoring.js"; From 3d1a1b1f0d923c7b722baaf8c3adf9daac00c3a2 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:49:04 -0700 Subject: [PATCH 2/7] feat(evals): add offline verifier CLI --- .gitignore | 1 + packages/evals/cli.ts | 181 ++++++++----- packages/evals/framework/rubricCache.ts | 129 ++++++++++ .../scripts/backfill-webtailbench-rubrics.ts | 163 ++++++++++++ .../evals/scripts/verify-live-trajectory.ts | 170 +++++++++++++ packages/evals/tests/tui/run.test.ts | 8 + packages/evals/tui/commands/help.ts | 8 +- packages/evals/tui/commands/parse.ts | 39 +++ packages/evals/tui/commands/verify.ts | 238 ++++++++++++++++++ 9 files changed, 872 insertions(+), 65 deletions(-) create mode 100644 packages/evals/framework/rubricCache.ts create mode 100644 packages/evals/scripts/backfill-webtailbench-rubrics.ts create mode 100644 packages/evals/scripts/verify-live-trajectory.ts create mode 100644 packages/evals/tui/commands/verify.ts diff --git a/.gitignore b/.gitignore index ec7d09add..a09d13c0a 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ ctrf/ **/.playwright*/ packages/evals/playwright-mcp-screenshot-*.png packages/evals/chrome-devtools-mcp-screenshot-*.png +.trajectories/ diff --git a/packages/evals/cli.ts b/packages/evals/cli.ts index 1cd9f8552..45226cabc 100644 --- a/packages/evals/cli.ts +++ b/packages/evals/cli.ts @@ -2,18 +2,13 @@ * Evals CLI entry point. * * Modes: - * - `evals` (no args) → interactive REPL - * - `evals --quiet` / `evals -q` → REPL with no banner / welcome / inline warnings - * - `evals run …` → single-shot run with rich progress - * - `evals list [tier]` → list discovered tasks - * - `evals config [sub]` → print / get / set defaults - * - `evals experiments [sub]` → inspect / compare Braintrust runs - * - `evals doctor` / `health` → env-key + config + discovery health report - * - `evals new `→ scaffold a task file - * - `evals help` / `-h` → help - * - * Env vars: - * - EVALS_NO_WELCOME=1 → suppress first-run welcome panel (REPL only) + * - `evals` (no args) → interactive REPL + * - `evals run …` → single-shot run with rich progress + * - `evals list [tier]` → list discovered tasks + * - `evals config [sub]` → print / get / set defaults + * - `evals experiments [sub]` → inspect / compare Braintrust runs + * - `evals new ` → scaffold a task file + * - `evals help` / `-h` → help * * No child processes. All runs flow through framework/runEvals in-process. * @@ -55,7 +50,6 @@ await (async () => { import { red } from "./tui/format.js"; import { getCurrentDirPath, getRuntimeTasksRoot } from "./runtimePaths.js"; -import type { TaskRegistry } from "./framework/types.js"; /** * Directory of the running entry module. Differs between source and @@ -66,6 +60,13 @@ const ENTRY_DIR = getCurrentDirPath(); const args = process.argv.slice(2); (async () => { + // Keep heavy command modules behind their command branches. The run stack + // imports Braintrust transitively, and importing it for `help`/`config path` + // makes quiet commands print optional OpenTelemetry warnings. + const { printHelp, printRunHelp, printListHelp, printNewHelp } = await import( + "./tui/commands/help.js" + ); + // Best-effort shutdown: flush Braintrust telemetry and exit with the // conventional signal code. Does not guarantee in-flight task // cancellation upstream; the goal is clean process shutdown with no @@ -94,11 +95,6 @@ const args = process.argv.slice(2); process.on("SIGINT", () => void handleSignal("SIGINT")); process.on("SIGTERM", () => void handleSignal("SIGTERM")); - // REPL launch: zero args, or only `--quiet`/`-q` flags. Quiet flags are - // REPL-only (they suppress chrome); other args route to the argv switch. - const isQuietFlag = (a: string): boolean => a === "--quiet" || a === "-q"; - const replLaunch = args.length === 0 || args.every(isQuietFlag); - // Argv mode: Esc behaves like Ctrl+C. The REPL has its own keypress // handler that does cooperative-then-aggressive abort instead — this // path is only active when no arg-less REPL is running. @@ -106,7 +102,7 @@ const args = process.argv.slice(2); // Note: raw mode disables the OS-level Ctrl+C → SIGINT translation, // so we forward it ourselves. let cleanupArgvInput = (): void => {}; - if (!replLaunch && args.length > 0 && process.stdin.isTTY) { + if (args.length > 0 && process.stdin.isTTY) { const readline = await import("node:readline"); const wasRaw = process.stdin.isRaw; readline.emitKeypressEvents(process.stdin); @@ -127,63 +123,126 @@ const args = process.argv.slice(2); }; } - // Whether to write the first-run marker in `finally`. Help-only paths and - // the doctor command don't count as "first uses" — they're discovery - // actions. The REPL marks itself. Set by the dispatch outcome below. - let shouldMarkFirstRun = false; + async function executeRun(tokens: string[]): Promise { + const { readConfig } = await import("./tui/commands/config.js"); + const { runCommand } = await import("./tui/commands/run.js"); + const { parseRunArgs, resolveRunOptions } = await import( + "./tui/commands/parse.js" + ); + const flags = parseRunArgs(tokens); + const configFile = readConfig(ENTRY_DIR); + const resolved = resolveRunOptions( + flags, + configFile.defaults, + process.env, + configFile.core, + ); + + if (flags.legacy) { + const { runLegacy } = await import("./tui/commands/legacy.js"); + const { discoverTasks } = await import("./framework/discovery.js"); + const registry = await discoverTasks(getRuntimeTasksRoot(), false); + await runLegacy(resolved, flags, registry); + return; // unreachable — runLegacy calls process.exit + } + + await runCommand(resolved); + } try { - if (replLaunch) { + if (args.length === 0) { const { startRepl } = await import("./tui/repl.js"); - const quiet = args.some(isQuietFlag); - await startRepl(ENTRY_DIR, { quiet }); + await startRepl(ENTRY_DIR); return; } - const { buildCommandTree, dispatch, tokenizeArgv } = await import( - "./tui/commandTree.js" - ); + const command = args[0].toLowerCase(); + const subArgs = args.slice(1); + // Help is only triggered when `--help`/`-h`/`help` sits immediately + // after the command. Later positions are arguments or flag values and + // must not be swallowed (e.g. `evals run act --help` would otherwise + // print run help instead of erroring on the unknown `--help` flag). + const wantsHelp = + subArgs[0] === "--help" || subArgs[0] === "-h" || subArgs[0] === "help"; + + switch (command) { + case "run": { + if (wantsHelp) { + printRunHelp(); + return; + } + await executeRun(subArgs); + return; + } - let registry: TaskRegistry | null = null; - const getRegistry = async (): Promise => { - if (!registry) { + case "list": { + if (wantsHelp) { + printListHelp(); + return; + } + const detailed = + subArgs.includes("--detailed") || subArgs.includes("-d"); + const tierFilter = subArgs.find((a) => !a.startsWith("-")); + const tasksRoot = getRuntimeTasksRoot(); const { discoverTasks } = await import("./framework/discovery.js"); - registry = await discoverTasks(getRuntimeTasksRoot(), false); + const { printList } = await import("./tui/commands/list.js"); + const registry = await discoverTasks(tasksRoot, false); + printList(registry, tierFilter, detailed); + return; } - return registry; - }; - const tree = buildCommandTree(); - - const tokens = tokenizeArgv(args); - const outcome = await dispatch(tree, tokens, { - entryDir: ENTRY_DIR, - getRegistry, - setRegistry: (r) => { - registry = r; - }, - abortRef: null, - contextPath: null, - }); - - // Only count real handler invocations as "first use". Doctor is a - // diagnostic, not a first use; help/meta paths are discovery. - if (outcome.kind === "ran") { - const top = outcome.absolutePath[0]; - shouldMarkFirstRun = top !== "doctor"; + case "config": { + const { handleConfig } = await import("./tui/commands/config.js"); + await handleConfig(subArgs, ENTRY_DIR); + return; + } + + case "experiments": { + const { handleExperiments } = await import( + "./tui/commands/experiments.js" + ); + await handleExperiments(subArgs); + return; + } + + case "new": { + if (wantsHelp) { + printNewHelp(); + return; + } + const { scaffoldTask } = await import("./tui/commands/new.js"); + scaffoldTask(subArgs); + return; + } + + case "verify": { + const { handleVerify, printVerifyHelp } = await import( + "./tui/commands/verify.js" + ); + if (wantsHelp) { + printVerifyHelp(); + return; + } + await handleVerify(subArgs); + return; + } + + case "help": + case "--help": + case "-h": + printHelp(); + return; + + default: { + // Unknown first arg → treat as run target: `evals act` == `evals run act` + await executeRun(args); + return; + } } } catch (err) { console.error(red(`Error: ${(err as Error).message}`)); process.exitCode = 1; } finally { - if (shouldMarkFirstRun) { - try { - const { markFirstRunComplete } = await import("./tui/welcomeState.js"); - markFirstRunComplete(ENTRY_DIR); - } catch { - // best-effort - } - } cleanupArgvInput(); } })(); diff --git a/packages/evals/framework/rubricCache.ts b/packages/evals/framework/rubricCache.ts new file mode 100644 index 000000000..88bdf8430 --- /dev/null +++ b/packages/evals/framework/rubricCache.ts @@ -0,0 +1,129 @@ +/** + * Rubric cache — persists AI-generated rubrics so we run Step 0a once per + * task id and hydrate from disk thereafter. Honors plan §10 Q3 (resolved: + * generate per-task on first run + cache). + * + * Used for any task whose dataset doesn't ship a precomputed_rubric + * (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its + * upstream dataset already carries rubrics. + * + * Cache layout: + * packages/evals/.rubric-cache//.json + * + * The cache key includes the task instruction hash to detect drift — if the + * instruction changes for the same task id, the rubric is regenerated rather + * than served from a stale cache. + */ +import fs from "node:fs/promises"; +import path from "node:path"; +import crypto from "node:crypto"; + +import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand"; + +export interface RubricCacheOptions { + /** + * Root directory for cached rubrics. Defaults to + * `/.rubric-cache`. + */ + cacheRoot?: string; + /** + * Dataset name, used as a subdirectory under cacheRoot to keep different + * datasets' rubrics separate (e.g., "onlineMind2Web"). + */ + dataset: string; +} + +interface CacheEntry { + taskId: string; + instructionHash: string; + generatedAt: string; + rubric: Rubric; +} + +function hashInstruction(instruction: string): string { + return crypto + .createHash("sha256") + .update(instruction) + .digest("hex") + .slice(0, 16); +} + +export class RubricCache { + private readonly cacheDir: string; + + constructor(opts: RubricCacheOptions) { + const root = + opts.cacheRoot ?? + path.join(process.cwd(), "packages/evals/.rubric-cache"); + this.cacheDir = path.join(root, opts.dataset); + } + + /** + * Get or generate a rubric for the task. If a fresh cache entry exists + * (same instruction hash), returns it. Otherwise runs Step 0a and persists. + */ + async getOrGenerate( + taskSpec: TaskSpec, + evaluator: V3Evaluator, + ): Promise { + const cached = await this.read(taskSpec); + if (cached) return cached; + + const rubric = await evaluator.generateRubric(taskSpec); + await this.write(taskSpec, rubric); + return rubric; + } + + /** + * Read a cached rubric. Returns undefined on miss or instruction-hash drift. + */ + async read(taskSpec: TaskSpec): Promise { + const file = this.entryPath(taskSpec.id); + let raw: string; + try { + raw = await fs.readFile(file, "utf8"); + } catch { + return undefined; + } + let parsed: CacheEntry; + try { + parsed = JSON.parse(raw) as CacheEntry; + } catch { + return undefined; + } + const expectedHash = hashInstruction(taskSpec.instruction); + if (parsed.instructionHash !== expectedHash) { + // Drift detected — surface a clear log and miss. + console.warn( + `[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`, + ); + return undefined; + } + return parsed.rubric; + } + + async write(taskSpec: TaskSpec, rubric: Rubric): Promise { + await fs.mkdir(this.cacheDir, { recursive: true }); + const entry: CacheEntry = { + taskId: taskSpec.id, + instructionHash: hashInstruction(taskSpec.instruction), + generatedAt: new Date().toISOString(), + rubric, + }; + await fs.writeFile( + this.entryPath(taskSpec.id), + JSON.stringify(entry, null, 2), + ); + } + + /** Wipe the cache directory (used by tests / `bench cache clear`). */ + async clear(): Promise { + await fs.rm(this.cacheDir, { recursive: true, force: true }); + } + + private entryPath(taskId: string): string { + // Sanitize task id for filesystem safety. + const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_"); + return path.join(this.cacheDir, `${safe}.json`); + } +} diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts new file mode 100644 index 000000000..c50dcce29 --- /dev/null +++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts @@ -0,0 +1,163 @@ +/** + * Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with + * the upstream microsoft/WebTailBench `precomputed_rubric` field. + * + * The local JSONL was authored before fara released v1 rubrics. This script + * fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins by `id`, + * writing back a JSONL where each row carries a `precomputed_rubric` field + * (parsed JSON object) alongside the existing `ques` / `web` / `category` / + * `id` fields. + * + * Run once after pulling the branch: + * pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts + * + * Idempotent — safe to re-run; an existing precomputed_rubric on a row is + * overwritten with the latest upstream version. + */ +import fs from "node:fs/promises"; +import path from "node:path"; + +const HF_URL = + "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv"; + +const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", ".."); +const JSONL_PATH = path.join( + REPO_ROOT, + "packages", + "evals", + "datasets", + "webtailbench", + "WebTailBench_data.jsonl", +); + +interface Rubric { + items: Array>; +} + +interface LocalRow { + id: string; + category?: string; + ques: string; + web?: string; + precomputed_rubric?: Rubric; +} + +/** + * Parse a TSV file with simple double-quote escaping (the WebTailBench files + * use `""` for literal quotes inside quoted fields). Returns rows as arrays + * of column values; the caller maps to a schema. + */ +function parseTsv(text: string): string[][] { + const rows: string[][] = []; + const lines = text.split(/\r?\n/); + for (const raw of lines) { + if (!raw) continue; + // Each column is either quoted (with "" escapes) or unquoted plain text. + const cols: string[] = []; + let i = 0; + while (i < raw.length) { + if (raw[i] === "\t") { + cols.push(""); + i++; + continue; + } + let col = ""; + if (raw[i] === '"') { + i++; + while (i < raw.length) { + if (raw[i] === '"') { + if (raw[i + 1] === '"') { + col += '"'; + i += 2; + } else { + i++; + break; + } + } else { + col += raw[i]; + i++; + } + } + } else { + const tabIdx = raw.indexOf("\t", i); + if (tabIdx === -1) { + col = raw.slice(i); + i = raw.length; + } else { + col = raw.slice(i, tabIdx); + i = tabIdx; + } + } + cols.push(col); + if (raw[i] === "\t") i++; + } + rows.push(cols); + } + return rows; +} + +async function main(): Promise { + console.log(`▸ fetching ${HF_URL}`); + const res = await fetch(HF_URL); + if (!res.ok) { + throw new Error(`HF fetch failed: ${res.status} ${res.statusText}`); + } + const tsv = await res.text(); + console.log(` ✓ downloaded ${tsv.length} bytes`); + + const rows = parseTsv(tsv); + const header = rows[0]; + const idIdx = header.indexOf("id"); + const rubricIdx = header.indexOf("precomputed_rubric"); + if (idIdx === -1 || rubricIdx === -1) { + throw new Error( + `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`, + ); + } + + const rubricsById = new Map(); + for (let i = 1; i < rows.length; i++) { + const cols = rows[i]; + if (!cols[idIdx]) continue; + try { + const parsed = JSON.parse(cols[rubricIdx]) as Rubric; + rubricsById.set(cols[idIdx], parsed); + } catch (e) { + console.warn( + ` ! row ${i} (id=${cols[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`, + ); + } + } + console.log(` ✓ parsed ${rubricsById.size} rubrics`); + + const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8"); + const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0); + console.log(`▸ joining into ${inLines.length} local rows`); + + let matched = 0; + let missing = 0; + const out: string[] = []; + for (const line of inLines) { + const row = JSON.parse(line) as LocalRow; + const rubric = rubricsById.get(row.id); + if (rubric) { + row.precomputed_rubric = rubric; + matched++; + } else { + missing++; + } + out.push(JSON.stringify(row)); + } + + console.log( + ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`, + ); + + await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8"); + console.log(`✅ wrote ${JSONL_PATH}`); +} + +main().catch((err) => { + console.error("❌ backfill failed:", err); + process.exit(1); +}); diff --git a/packages/evals/scripts/verify-live-trajectory.ts b/packages/evals/scripts/verify-live-trajectory.ts new file mode 100644 index 000000000..4f3acdc95 --- /dev/null +++ b/packages/evals/scripts/verify-live-trajectory.ts @@ -0,0 +1,170 @@ +/** + * Wave 0 end-to-end verification — runs a tiny live agent task and asserts the + * TrajectoryRecorder captures bus events from the real v3AgentHandler. + * + * Deliberately minimal: env=LOCAL (no Browserbase costs), 3 max steps, a stable + * destination, and a DOM-mode agent. The goal is to confirm bus event wiring, + * not to test agent capability. + * + * pnpm tsx packages/evals/scripts/verify-live-trajectory.ts + * + * Requires one of GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env. + */ +import "dotenv/config"; +import assert from "node:assert/strict"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; + +import { V3, V3Evaluator } from "@browserbasehq/stagehand"; +import type { TaskSpec } from "@browserbasehq/stagehand"; +import { TrajectoryRecorder } from "../framework/trajectoryRecorder.js"; + +async function main(): Promise { + const tmpRoot = await fs.mkdtemp( + path.join(os.tmpdir(), "verifier-rewrite-live-"), + ); + console.log(`▸ tmpdir: ${tmpRoot}`); + + const v3 = new V3({ + env: "LOCAL", + verbose: 0, + model: "google/gemini-2.5-flash", + }); + await v3.init(); + console.log(" ✓ V3 initialized"); + + const page = v3.context.pages()[0]; + await page.goto("https://example.com", { timeoutMs: 60_000 }); + console.log(" ✓ navigated to example.com"); + + const taskSpec: TaskSpec = { + id: "live-example-com", + instruction: "Extract the heading text from example.com", + initUrl: "https://example.com", + }; + + const recorder = new TrajectoryRecorder({ + v3, + taskSpec, + outputRoot: tmpRoot, + runId: "live-run", + persist: true, + }); + recorder.start(); + console.log(" ✓ TrajectoryRecorder subscribed to bus"); + + const agent = v3.agent({ + model: "google/gemini-2.5-flash", + mode: "dom", + }); + + const start = Date.now(); + const result = await agent.execute({ + instruction: + "Extract the main heading text on the current page using the extract tool, then call done with that text as the reasoning.", + maxSteps: 3, + }); + console.log(` ✓ agent.execute completed in ${Date.now() - start}ms`); + console.log(` final message: "${result.message}"`); + console.log(` actions: ${result.actions.length}`); + + const trajectory = await recorder.finish({ + status: "complete", + finalAnswer: result.message, + usage: result.usage, + }); + + await v3.close(); + console.log(" ✓ V3 closed"); + + // ── Assertions ────────────────────────────────────────────────────────── + assert.ok( + trajectory.steps.length > 0, + `expected at least 1 trajectory step, got ${trajectory.steps.length}`, + ); + console.log(` ✓ trajectory has ${trajectory.steps.length} steps`); + + const stepsWithScreenshot = trajectory.steps.filter( + (s) => s.probeEvidence.screenshotPath || s.probeEvidence.screenshot, + ); + assert.ok( + stepsWithScreenshot.length > 0, + "expected at least one step with a probe screenshot", + ); + console.log( + ` ✓ ${stepsWithScreenshot.length}/${trajectory.steps.length} steps carry a probe screenshot`, + ); + + const stepsWithUrl = trajectory.steps.filter( + (s) => typeof s.probeEvidence.url === "string" && s.probeEvidence.url, + ); + assert.ok( + stepsWithUrl.length > 0, + "expected at least one step with a probe url", + ); + console.log( + ` ✓ ${stepsWithUrl.length}/${trajectory.steps.length} steps carry a probe url`, + ); + + const stepsWithEvidence = trajectory.steps.filter( + (s) => s.agentEvidence.modalities.length > 0, + ); + assert.ok( + stepsWithEvidence.length > 0, + "expected at least one step with tier-1 agent evidence modalities", + ); + console.log( + ` ✓ ${stepsWithEvidence.length}/${trajectory.steps.length} steps carry tier-1 evidence`, + ); + + // ── On-disk layout ───────────────────────────────────────────────────── + const taskDir = path.join(tmpRoot, "live-run", "live-example-com"); + const files = await fs.readdir(taskDir); + assert.ok(files.includes("trajectory.json"), "trajectory.json missing"); + assert.ok(files.includes("task_data.json"), "task_data.json missing"); + assert.ok(files.includes("times.json"), "times.json missing"); + const screenshotFiles = files.filter((f) => f.startsWith("screenshot_")); + assert.ok( + screenshotFiles.length > 0, + "expected at least one persisted screenshot", + ); + console.log( + ` ✓ on-disk: trajectory.json + task_data.json + times.json + ${screenshotFiles.length} screenshots`, + ); + + // ── verify() runs Wave 1 pipeline on the live trajectory ────────────── + console.log("\n▸ running V3Evaluator.verify() (Step 0a + Step 8)…"); + const verdict = await new V3Evaluator(v3, { backend: "verifier" }).verify( + trajectory, + taskSpec, + ); + console.log( + ` ✓ generated rubric with ${verdict.perCriterion.length} criteria`, + ); + console.log( + ` ✓ outcomeSuccess=${verdict.outcomeSuccess}, processScore=${verdict.processScore}`, + ); + assert.equal(typeof verdict.outcomeSuccess, "boolean"); + assert.ok( + verdict.perCriterion.length > 0, + "expected generated rubric to have at least one criterion", + ); + const raw = verdict.rawSteps as + | { primaryIntent?: string; rubricSource?: string } + | undefined; + assert.equal(raw?.rubricSource, "generated"); + assert.ok( + typeof raw?.primaryIntent === "string" && raw.primaryIntent.length > 0, + "expected outcome verifier to populate primary_intent", + ); + console.log(` primary_intent: "${raw.primaryIntent.slice(0, 120)}"`); + + console.log(`\n✅ Wave 0 live verification OK — trajectory at ${taskDir}`); + // Keep tmpdir for inspection; user can rm -rf if needed. +} + +main().catch((err) => { + console.error("\n❌ Wave 0 live verification FAILED:", err); + process.exit(1); +}); diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts index 36be3e1aa..9b2d3aa83 100644 --- a/packages/evals/tests/tui/run.test.ts +++ b/packages/evals/tests/tui/run.test.ts @@ -119,6 +119,7 @@ describe("deriveCategoryFilter", () => { envOverrides: {}, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -157,6 +158,7 @@ describe("deriveCategoryFilter", () => { }, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -204,6 +206,7 @@ describe("deriveCategoryFilter", () => { }, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -254,6 +257,7 @@ describe("deriveCategoryFilter", () => { }, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -315,6 +319,7 @@ describe("deriveCategoryFilter", () => { }, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -371,6 +376,7 @@ describe("deriveCategoryFilter", () => { envOverrides: {}, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -405,6 +411,7 @@ describe("deriveCategoryFilter", () => { }, dryRun: true, preview: false, + successMode: "outcome", verbose: false, }, registry, @@ -447,6 +454,7 @@ describe("deriveCategoryFilter", () => { envOverrides: {}, dryRun: false, preview: false, + successMode: "outcome", verbose: false, }, registry, diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts index d9cc20738..439c2f61b 100644 --- a/packages/evals/tui/commands/help.ts +++ b/packages/evals/tui/commands/help.ts @@ -29,10 +29,6 @@ export function printHelp(): void { "Inspect and compare Braintrust experiment runs", ), row(`${cyan("new")} ${dim(" ")}`, "Scaffold a new task"), - row( - `${cyan("doctor")} ${dim("[--json]")}`, - "Health report (env keys, config, discovery)", - ), row(cyan("help"), "Show this help"), row(cyan("clear"), "Clear the screen"), row(cyan("exit"), "Exit the REPL"), @@ -101,6 +97,10 @@ export function printRunHelp(): void { `${cyan("--agent-modes")} ${dim("")}`, `Stagehand mode matrix ${gray("(dom,hybrid,cua)")}`, ), + row( + `${cyan("--success")} ${dim("")}`, + `Rubric success mode ${gray("(outcome | process | both)")}`, + ), row(`${cyan("-l, --limit")} ${dim("")}`, "Max cases to run"), row(`${cyan("-s, --sample")} ${dim("")}`, "Random sample before limit"), row( diff --git a/packages/evals/tui/commands/parse.ts b/packages/evals/tui/commands/parse.ts index 28c842e43..f793d3414 100644 --- a/packages/evals/tui/commands/parse.ts +++ b/packages/evals/tui/commands/parse.ts @@ -38,10 +38,25 @@ export interface RunFlags { filter?: Array<[string, string]>; dryRun?: boolean; preview?: boolean; + /** + * Rubric success mode for the verifier — outcome | process | both. + * outcome (default): binary Verdict.outcomeSuccess (matches fara-7b's reported metric). + * process: Verdict.processScore ≥ threshold. + * both: outcome AND process. + * Plumbed to bench tasks via the EVAL_SUCCESS_MODE env override. + */ + success?: SuccessMode; /** Spawn the pre-refactor index.eval.ts runner instead of the unified path. */ legacy?: boolean; } +export type SuccessMode = "outcome" | "process" | "both"; +const SUCCESS_MODES: ReadonlySet = new Set([ + "outcome", + "process", + "both", +]); + export interface ConfigDefaults { env?: string; trials?: number; @@ -68,6 +83,8 @@ export interface ResolvedRunOptions { agentMode?: AgentToolMode; agentModes?: AgentToolMode[]; datasetFilter?: string; + /** Rubric success mode forwarded to bench tasks via EVAL_SUCCESS_MODE. */ + successMode: SuccessMode; envOverrides: Record; dryRun: boolean; preview: boolean; @@ -101,6 +118,7 @@ const VALUE_FLAGS = new Set([ "agent-mode", "agent-modes", "filter", + "success", ]); const FLAG_ALIASES: Record = { @@ -261,6 +279,16 @@ export function parseRunArgs(tokens: string[]): RunFlags { filters.push(parseFilter(value)); break; } + case "success": { + const v = value.toLowerCase() as SuccessMode; + if (!SUCCESS_MODES.has(v)) { + throw new Error( + `--success must be one of: outcome, process, both (got "${value}")`, + ); + } + flags.success = v; + break; + } default: break; } @@ -427,6 +455,16 @@ export function resolveRunOptions( envOverrides.EVAL_MODEL_OVERRIDE = model; } + // Success mode resolves from --success first, then EVAL_SUCCESS_MODE env, + // then "outcome" (matches fara-7b's reported metric). + const envSuccess = (env.EVAL_SUCCESS_MODE ?? "").toLowerCase(); + const successMode: SuccessMode = + flags.success ?? + (SUCCESS_MODES.has(envSuccess as SuccessMode) + ? (envSuccess as SuccessMode) + : "outcome"); + envOverrides.EVAL_SUCCESS_MODE = successMode; + return { target: flags.target, normalizedTarget: target, @@ -442,6 +480,7 @@ export function resolveRunOptions( agentMode, agentModes, datasetFilter, + successMode, envOverrides, dryRun: flags.dryRun ?? false, preview: flags.preview ?? false, diff --git a/packages/evals/tui/commands/verify.ts b/packages/evals/tui/commands/verify.ts new file mode 100644 index 000000000..2468d1786 --- /dev/null +++ b/packages/evals/tui/commands/verify.ts @@ -0,0 +1,238 @@ +/** + * `evals verify ` — re-score a saved trajectory offline. + * + * The verifier is browser-free: it consumes a hydrated Trajectory + TaskSpec + * and returns a Verdict. This command reads the on-disk layout written by + * `TrajectoryRecorder.persist()` (matching microsoft/fara's + * example_trajectory shape) and feeds it through V3Evaluator.verify(). + * + * Output: writes a new verdict file under `scores/mmrubric_