diff --git a/.gitignore b/.gitignore
index ec7d09add..a09d13c0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ ctrf/
 **/.playwright*/
 packages/evals/playwright-mcp-screenshot-*.png
 packages/evals/chrome-devtools-mcp-screenshot-*.png
+.trajectories/
diff --git a/packages/evals/framework/rubricCache.ts b/packages/evals/framework/rubricCache.ts
new file mode 100644
index 000000000..d7817fc93
--- /dev/null
+++ b/packages/evals/framework/rubricCache.ts
@@ -0,0 +1,132 @@
+/**
+ * Rubric cache — persists AI-generated rubrics so each task id generates its
+ * rubric once and hydrates it from disk on later runs.
+ *
+ * Used for any task whose dataset doesn't ship a precomputed_rubric
+ * (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its
+ * upstream dataset already carries rubrics.
+ *
+ * Cache layout:
+ *   packages/evals/.rubric-cache/<dataset>/<taskId>.json
+ *
+ * The cache key includes the task id and instruction hash to detect drift —
+ * if either changes, the rubric is regenerated rather than served from a
+ * stale cache.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import crypto from "node:crypto";
+
+import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand";
+
+export interface RubricCacheOptions {
+  /**
+   * Root directory for cached rubrics. Defaults to
+   * `packages/evals/.rubric-cache` under the current working directory.
+   */
+  cacheRoot?: string;
+  /**
+   * Dataset name, used as a subdirectory under cacheRoot to keep different
+   * datasets' rubrics separate (e.g., "onlineMind2Web").
+   */
+  dataset: string;
+}
+
+interface CacheEntry {
+  taskId: string;
+  instructionHash: string;
+  generatedAt: string;
+  rubric: Rubric;
+}
+
+function hashInstruction(instruction: string): string {
+  return crypto
+    .createHash("sha256")
+    .update(instruction)
+    .digest("hex")
+    .slice(0, 16);
+}
+
+export class RubricCache {
+  private readonly cacheDir: string;
+
+  constructor(opts: RubricCacheOptions) {
+    const root =
+      opts.cacheRoot ??
+      path.join(process.cwd(), "packages/evals/.rubric-cache");
+    this.cacheDir = path.join(root, opts.dataset);
+  }
+
+  /**
+   * Get or generate a rubric for the task. If a fresh cache entry exists
+   * (same instruction hash), returns it. Otherwise runs Step 0a and persists.
+   */
+  async getOrGenerate(
+    taskSpec: TaskSpec,
+    evaluator: V3Evaluator,
+  ): Promise<Rubric> {
+    const cached = await this.read(taskSpec);
+    if (cached) return cached;
+
+    const rubric = await evaluator.generateRubric(taskSpec);
+    await this.write(taskSpec, rubric);
+    return rubric;
+  }
+
+  /** Read a cached rubric. Returns undefined on miss or cache-key drift. */
+  async read(taskSpec: TaskSpec): Promise<Rubric | undefined> {
+    const file = this.entryPath(taskSpec.id);
+    let raw: string;
+    try {
+      raw = await fs.readFile(file, "utf8");
+    } catch {
+      return undefined;
+    }
+    let parsed: CacheEntry;
+    try {
+      parsed = JSON.parse(raw) as CacheEntry;
+    } catch {
+      return undefined;
+    }
+    if (parsed.taskId !== taskSpec.id) {
+      console.warn(
+        `[rubric-cache] task-id mismatch for ${taskSpec.id}; regenerating`,
+      );
+      return undefined;
+    }
+    const expectedHash = hashInstruction(taskSpec.instruction);
+    if (parsed.instructionHash !== expectedHash) {
+      // Drift detected — surface a clear log and miss.
+      console.warn(
+        `[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`,
+      );
+      return undefined;
+    }
+    return parsed.rubric;
+  }
+
+  async write(taskSpec: TaskSpec, rubric: Rubric): Promise<void> {
+    await fs.mkdir(this.cacheDir, { recursive: true });
+    const entry: CacheEntry = {
+      taskId: taskSpec.id,
+      instructionHash: hashInstruction(taskSpec.instruction),
+      generatedAt: new Date().toISOString(),
+      rubric,
+    };
+    await fs.writeFile(
+      this.entryPath(taskSpec.id),
+      JSON.stringify(entry, null, 2),
+    );
+  }
+
+  /** Wipe the cache directory (used by tests / `bench cache clear`). */
+  async clear(): Promise<void> {
+    await fs.rm(this.cacheDir, { recursive: true, force: true });
+  }
+
+  private entryPath(taskId: string): string {
+    // Sanitize task id for filesystem safety.
+    const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_");
+    return path.join(this.cacheDir, `${safe}.json`);
+  }
+}
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
new file mode 100644
index 000000000..5a6763390
--- /dev/null
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -0,0 +1,162 @@
+/**
+ * Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with
+ * the published WebTailBench `precomputed_rubric` field.
+ *
+ * This script fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins
+ * by `id`, writing back a JSONL where each row carries a
+ * `precomputed_rubric` field (parsed JSON object) alongside the existing
+ * `ques` / `web` / `category` / `id` fields.
+ *
+ * Run once after pulling the branch:
+ *   pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts
+ *
+ * Idempotent — safe to re-run; an existing precomputed_rubric on a row is
+ * overwritten with the latest upstream version.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+
+const HF_URL =
+  "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv";
+
+const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", "..");
+const JSONL_PATH = path.join(
+  REPO_ROOT,
+  "packages",
+  "evals",
+  "datasets",
+  "webtailbench",
+  "WebTailBench_data.jsonl",
+);
+
+interface Rubric {
+  items: Array<Record<string, unknown>>;
+}
+
+interface LocalRow {
+  id: string;
+  category?: string;
+  ques: string;
+  web?: string;
+  precomputed_rubric?: Rubric;
+}
+
+/**
+ * Parse a TSV file with simple double-quote escaping (the WebTailBench files
+ * use `""` for literal quotes inside quoted fields). Returns rows as arrays
+ * of column values; the caller maps to a schema.
+ */
+function parseTsv(text: string): string[][] {
+  const rows: string[][] = [];
+  const lines = text.split(/\r?\n/);
+  for (const raw of lines) {
+    if (!raw) continue;
+    // Each column is either quoted (with "" escapes) or unquoted plain text.
+    const cols: string[] = [];
+    let i = 0;
+    while (i < raw.length) {
+      if (raw[i] === "\t") {
+        cols.push("");
+        i++;
+        continue;
+      }
+      let col = "";
+      if (raw[i] === '"') {
+        i++;
+        while (i < raw.length) {
+          if (raw[i] === '"') {
+            if (raw[i + 1] === '"') {
+              col += '"';
+              i += 2;
+            } else {
+              i++;
+              break;
+            }
+          } else {
+            col += raw[i];
+            i++;
+          }
+        }
+      } else {
+        const tabIdx = raw.indexOf("\t", i);
+        if (tabIdx === -1) {
+          col = raw.slice(i);
+          i = raw.length;
+        } else {
+          col = raw.slice(i, tabIdx);
+          i = tabIdx;
+        }
+      }
+      cols.push(col);
+      if (raw[i] === "\t") i++;
+    }
+    rows.push(cols);
+  }
+  return rows;
+}
+
+async function main(): Promise<void> {
+  console.log(`▸ fetching ${HF_URL}`);
+  const res = await fetch(HF_URL);
+  if (!res.ok) {
+    throw new Error(`HF fetch failed: ${res.status} ${res.statusText}`);
+  }
+  const tsv = await res.text();
+  console.log(`  ✓ downloaded ${tsv.length} bytes`);
+
+  const rows = parseTsv(tsv);
+  const header = rows[0];
+  const idIdx = header.indexOf("id");
+  const rubricIdx = header.indexOf("precomputed_rubric");
+  if (idIdx === -1 || rubricIdx === -1) {
+    throw new Error(
+      `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`,
+    );
+  }
+
+  const rubricsById = new Map<string, Rubric>();
+  for (let i = 1; i < rows.length; i++) {
+    const cols = rows[i];
+    if (!cols[idIdx]) continue;
+    try {
+      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      rubricsById.set(cols[idIdx], parsed);
+    } catch (e) {
+      console.warn(
+        `  ! row ${i} (id=${cols[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`,
+      );
+    }
+  }
+  console.log(`  ✓ parsed ${rubricsById.size} rubrics`);
+
+  const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8");
+  const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0);
+  console.log(`▸ joining into ${inLines.length} local rows`);
+
+  let matched = 0;
+  let missing = 0;
+  const out: string[] = [];
+  for (const line of inLines) {
+    const row = JSON.parse(line) as LocalRow;
+    const rubric = rubricsById.get(row.id);
+    if (rubric) {
+      row.precomputed_rubric = rubric;
+      matched++;
+    } else {
+      missing++;
+    }
+    out.push(JSON.stringify(row));
+  }
+
+  console.log(
+    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+  );
+
+  await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
+  console.log(`✅ wrote ${JSONL_PATH}`);
+}
+
+main().catch((err) => {
+  console.error("❌ backfill failed:", err);
+  process.exit(1);
+});
diff --git a/packages/evals/tests/framework/rubricCache.test.ts b/packages/evals/tests/framework/rubricCache.test.ts
new file mode 100644
index 000000000..62afeee3d
--- /dev/null
+++ b/packages/evals/tests/framework/rubricCache.test.ts
@@ -0,0 +1,47 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+
+import { RubricCache } from "../../framework/rubricCache.js";
+
+describe("RubricCache", () => {
+  let tmpRoot = "";
+  let warn: ReturnType<typeof vi.spyOn>;
+
+  const rubric: Rubric = {
+    items: [
+      {
+        criterion: "criterion",
+        description: "description",
+        maxPoints: 1,
+      },
+    ],
+  };
+
+  beforeEach(async () => {
+    tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rubric-cache-test-"));
+    warn = vi.spyOn(console, "warn").mockImplementation(() => {});
+  });
+
+  afterEach(async () => {
+    warn.mockRestore();
+    await fs.rm(tmpRoot, { recursive: true, force: true });
+  });
+
+  it("misses when sanitized task ids collide but the stored task id differs", async () => {
+    const cache = new RubricCache({ cacheRoot: tmpRoot, dataset: "test" });
+    const taskA: TaskSpec = { id: "task/a", instruction: "same instruction" };
+    const taskB: TaskSpec = { id: "task:a", instruction: "same instruction" };
+
+    await cache.write(taskA, rubric);
+
+    await expect(cache.read(taskB)).resolves.toBeUndefined();
+    await expect(cache.read(taskA)).resolves.toEqual(rubric);
+    expect(warn).toHaveBeenCalledWith(
+      "[rubric-cache] task-id mismatch for task:a; regenerating",
+    );
+  });
+});
diff --git a/packages/evals/tests/tui/commandTree.test.ts b/packages/evals/tests/tui/commandTree.test.ts
index d70006c1b..378499711 100644
--- a/packages/evals/tests/tui/commandTree.test.ts
+++ b/packages/evals/tests/tui/commandTree.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it, vi } from "vitest";
 import {
+  buildCommandTree,
   dispatch,
   findChild,
   resolveCommand,
@@ -207,6 +208,15 @@ describe("findChild + walkPath", () => {
   });
 });
 
+describe("buildCommandTree", () => {
+  it("exposes verify as a root command", () => {
+    const tree = buildCommandTree();
+    expect(findChild(tree, "verify")?.summary).toBe(
+      "Re-score a saved trajectory",
+    );
+  });
+});
+
 // ---------------------------------------------------------------------------
 // resolveCommand
 // ---------------------------------------------------------------------------
diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts
index 36be3e1aa..9b2d3aa83 100644
--- a/packages/evals/tests/tui/run.test.ts
+++ b/packages/evals/tests/tui/run.test.ts
@@ -119,6 +119,7 @@ describe("deriveCategoryFilter", () => {
         envOverrides: {},
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -157,6 +158,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -204,6 +206,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -254,6 +257,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -315,6 +319,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -371,6 +376,7 @@
         envOverrides: {},
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -405,6 +411,7 @@
         },
         dryRun: true,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
@@ -447,6 +454,7 @@
         envOverrides: {},
         dryRun: false,
         preview: false,
+        successMode: "outcome",
         verbose: false,
       },
       registry,
diff --git a/packages/evals/tui/commandTree.ts b/packages/evals/tui/commandTree.ts
index 7cdc6deca..c9350eff0 100644
--- a/packages/evals/tui/commandTree.ts
+++ b/packages/evals/tui/commandTree.ts
@@ -3,7 +3,7 @@
  *
  * Models the user-visible command surface as a tree:
  *   root → run, list, new, config{path,set,reset,core{path,set,reset,setup}},
- *          experiments{list,show,open,compare}
+ *          experiments{list,show,open,compare}, verify, doctor
  *
  * Both the REPL (tui/repl.ts) and argv mode (cli.ts) build the same tree
 * via `buildCommandTree()` and dispatch user input through it. This is the
@@ -643,6 +643,17 @@ export function buildCommandTree(): CommandNode {
     },
   };
 
+  const verifyNode: CommandNode = {
+    name: "verify",
+    summary: "Re-score a saved trajectory",
+    printHelp: async () =>
+      (await import("./commands/verify.js")).printVerifyHelp(),
+    handler: async (args) => {
+      const { handleVerify } = await import("./commands/verify.js");
+      await handleVerify(args);
+    },
+  };
+
   const root: CommandNode = {
     name: "evals",
     summary: "Stagehand evals CLI",
@@ -653,6 +664,7 @@ export function buildCommandTree(): CommandNode {
       configNode,
       experimentsNode,
       newNode,
+      verifyNode,
       doctorNode,
     ],
   };
diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index d9cc20738..087e34d08 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -28,11 +28,12 @@ export function printHelp(): void {
       `${cyan("experiments")} ${dim("[subcommand]")}`,
       "Inspect and compare Braintrust experiment runs",
     ),
-    row(`${cyan("new")} ${dim("<category> <name>")}`, "Scaffold a new task"),
     row(
-      `${cyan("doctor")} ${dim("[--json]")}`,
-      "Health report (env keys, config, discovery)",
+      `${cyan("verify")} ${dim("<trajectory> [options]")}`,
+      "Re-score a saved trajectory",
     ),
+    row(`${cyan("doctor")} ${dim("| health")}`, "Health report"),
+    row(`${cyan("new")} ${dim("<category> <name>")}`, "Scaffold a new task"),
     row(cyan("help"), "Show this help"),
     row(cyan("clear"), "Clear the screen"),
     row(cyan("exit"), "Exit the REPL"),
@@ -101,6 +102,10 @@ export function printRunHelp(): void {
       `${cyan("--agent-modes")} ${dim("<list>")}`,
       `Stagehand mode matrix ${gray("(dom,hybrid,cua)")}`,
     ),
+    row(
+      `${cyan("--success")} ${dim("<mode>")}`,
+      `Rubric success mode ${gray("(outcome | process | both)")}`,
+    ),
     row(`${cyan("-l, --limit")} ${dim("<n>")}`, "Max cases to run"),
     row(`${cyan("-s, --sample")} ${dim("<n>")}`, "Random sample before limit"),
     row(
diff --git a/packages/evals/tui/commands/parse.ts b/packages/evals/tui/commands/parse.ts
index 28c842e43..d5d723403 100644
--- a/packages/evals/tui/commands/parse.ts
+++ b/packages/evals/tui/commands/parse.ts
@@ -38,10 +38,25 @@ export interface RunFlags {
   filter?: Array<[string, string]>;
   dryRun?: boolean;
   preview?: boolean;
+  /**
+   * Rubric success mode for the verifier — outcome | process | both.
+   * outcome (default): binary EvaluationResult.outcomeSuccess.
+   * process: EvaluationResult.processScore ≥ threshold.
+   * both: outcome AND process.
+   * Plumbed to bench tasks via the EVAL_SUCCESS_MODE env override.
+   */
+  success?: SuccessMode;
   /** Spawn the pre-refactor index.eval.ts runner instead of the unified path. */
   legacy?: boolean;
 }
 
+export type SuccessMode = "outcome" | "process" | "both";
+const SUCCESS_MODES: ReadonlySet<SuccessMode> = new Set([
+  "outcome",
+  "process",
+  "both",
+]);
+
 export interface ConfigDefaults {
   env?: string;
   trials?: number;
@@ -68,6 +83,8 @@ export interface ResolvedRunOptions {
   agentMode?: AgentToolMode;
   agentModes?: AgentToolMode[];
   datasetFilter?: string;
+  /** Rubric success mode forwarded to bench tasks via EVAL_SUCCESS_MODE. */
+  successMode: SuccessMode;
   envOverrides: Record<string, string>;
   dryRun: boolean;
   preview: boolean;
@@ -101,6 +118,7 @@ const VALUE_FLAGS = new Set([
   "agent-mode",
   "agent-modes",
   "filter",
+  "success",
 ]);
 
 const FLAG_ALIASES: Record<string, string> = {
@@ -261,6 +279,16 @@ export function parseRunArgs(tokens: string[]): RunFlags {
         filters.push(parseFilter(value));
         break;
       }
+      case "success": {
+        const v = value.toLowerCase() as SuccessMode;
+        if (!SUCCESS_MODES.has(v)) {
+          throw new Error(
+            `--success must be one of: outcome, process, both (got "${value}")`,
+          );
+        }
+        flags.success = v;
+        break;
+      }
       default:
         break;
     }
@@ -427,6 +455,16 @@ export function resolveRunOptions(
     envOverrides.EVAL_MODEL_OVERRIDE = model;
   }
 
+  // Success mode resolves from --success first, then EVAL_SUCCESS_MODE env,
+  // then "outcome".
+  const envSuccess = (env.EVAL_SUCCESS_MODE ?? "").toLowerCase();
+  const successMode: SuccessMode =
+    flags.success ??
+    (SUCCESS_MODES.has(envSuccess as SuccessMode)
+      ? (envSuccess as SuccessMode)
+      : "outcome");
+  envOverrides.EVAL_SUCCESS_MODE = successMode;
+
   return {
     target: flags.target,
     normalizedTarget: target,
@@ -442,6 +480,7 @@ export function resolveRunOptions(
     agentMode,
     agentModes,
     datasetFilter,
+    successMode,
     envOverrides,
     dryRun: flags.dryRun ?? false,
     preview: flags.preview ?? false,
diff --git a/packages/evals/tui/commands/verify.ts b/packages/evals/tui/commands/verify.ts
new file mode 100644
index 000000000..130d0694e
--- /dev/null
+++ b/packages/evals/tui/commands/verify.ts
@@ -0,0 +1,236 @@
+/**
+ * `evals verify <trajectory>` — re-score a saved trajectory offline.
+ *
+ * The verifier is browser-free: it consumes a hydrated Trajectory + TaskSpec
+ * and returns an EvaluationResult. This command reads the on-disk layout written by
+ * `TrajectoryRecorder.persist()` and feeds it through V3Evaluator.verify().
+ *
+ * Output: writes a new result file under `scores/result_