Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ ctrf/
**/.playwright*/
packages/evals/playwright-mcp-screenshot-*.png
packages/evals/chrome-devtools-mcp-screenshot-*.png
.trajectories/
132 changes: 132 additions & 0 deletions packages/evals/framework/rubricCache.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/**
* Rubric cache — persists AI-generated rubrics so each task id can hydrate
* from disk after its first generated rubric.
*
* Used for any task whose dataset doesn't ship a precomputed_rubric
* (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its
* upstream dataset already carries rubrics.
*
* Cache layout:
* packages/evals/.rubric-cache/<dataset>/<task-id>.json
*
* The cache key includes the task id and instruction hash to detect drift —
* if either changes, the rubric is regenerated rather than served from a
* stale cache.
*/
import fs from "node:fs/promises";
import path from "node:path";
import crypto from "node:crypto";

import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand";

export interface RubricCacheOptions {
  /**
   * Root directory for cached rubrics. Defaults to
   * `<packages/evals>/.rubric-cache`.
   */
  cacheRoot?: string;
  /**
   * Dataset name, used as a subdirectory under cacheRoot to keep different
   * datasets' rubrics separate (e.g., "onlineMind2Web").
   */
  dataset: string;
}

/** On-disk shape of one cached rubric entry (serialized as pretty JSON). */
interface CacheEntry {
  // Task id the rubric was generated for; guards against two distinct ids
  // sanitizing to the same file name.
  taskId: string;
  // Truncated SHA-256 of the instruction text; a mismatch means the task's
  // instruction changed since the rubric was cached (drift).
  instructionHash: string;
  // ISO-8601 timestamp of when the rubric was generated (informational).
  generatedAt: string;
  // The cached rubric payload itself.
  rubric: Rubric;
}

/** First 16 hex characters of the SHA-256 digest of the instruction text. */
function hashInstruction(instruction: string): string {
  const digest = crypto.createHash("sha256").update(instruction).digest("hex");
  return digest.slice(0, 16);
}

export class RubricCache {
private readonly cacheDir: string;

constructor(opts: RubricCacheOptions) {
const root =
opts.cacheRoot ??
path.join(process.cwd(), "packages/evals/.rubric-cache");
this.cacheDir = path.join(root, opts.dataset);
}

/**
* Get or generate a rubric for the task. If a fresh cache entry exists
* (same instruction hash), returns it. Otherwise runs Step 0a and persists.
*/
async getOrGenerate(
taskSpec: TaskSpec,
evaluator: V3Evaluator,
): Promise<Rubric> {
const cached = await this.read(taskSpec);
if (cached) return cached;

const rubric = await evaluator.generateRubric(taskSpec);
await this.write(taskSpec, rubric);
return rubric;
}

/** Read a cached rubric. Returns undefined on miss or cache-key drift. */
async read(taskSpec: TaskSpec): Promise<Rubric | undefined> {
const file = this.entryPath(taskSpec.id);
let raw: string;
try {
raw = await fs.readFile(file, "utf8");
} catch {
return undefined;
}
let parsed: CacheEntry;
try {
parsed = JSON.parse(raw) as CacheEntry;
} catch {
return undefined;
}
if (parsed.taskId !== taskSpec.id) {
console.warn(
`[rubric-cache] task-id mismatch for ${taskSpec.id}; regenerating`,
);
return undefined;
}
const expectedHash = hashInstruction(taskSpec.instruction);
if (parsed.instructionHash !== expectedHash) {
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
// Drift detected — surface a clear log and miss.
console.warn(
`[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`,
);
return undefined;
}
return parsed.rubric;
}

async write(taskSpec: TaskSpec, rubric: Rubric): Promise<void> {
await fs.mkdir(this.cacheDir, { recursive: true });
const entry: CacheEntry = {
taskId: taskSpec.id,
instructionHash: hashInstruction(taskSpec.instruction),
generatedAt: new Date().toISOString(),
rubric,
};
await fs.writeFile(
this.entryPath(taskSpec.id),
JSON.stringify(entry, null, 2),
);
}

/** Wipe the cache directory (used by tests / `bench cache clear`). */
async clear(): Promise<void> {
await fs.rm(this.cacheDir, { recursive: true, force: true });
}

private entryPath(taskId: string): string {
// Sanitize task id for filesystem safety.
const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_");
return path.join(this.cacheDir, `${safe}.json`);
}
}
162 changes: 162 additions & 0 deletions packages/evals/scripts/backfill-webtailbench-rubrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/**
* Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with
* the published WebTailBench `precomputed_rubric` field.
*
* This script fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins
* by `id`, writing back a JSONL where each row carries a
* `precomputed_rubric` field (parsed JSON object) alongside the existing
* `ques` / `web` / `category` / `id` fields.
*
* Run once after pulling the branch:
* pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts
*
* Idempotent — safe to re-run; an existing precomputed_rubric on a row is
* overwritten with the latest upstream version.
*/
import fs from "node:fs/promises";
import path from "node:path";

// Upstream rubrics TSV published with WebTailBench on HuggingFace.
const HF_URL =
  "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv";

// Repository root, resolved relative to this script's directory
// (packages/evals/scripts → up three levels).
const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", "..");
// Local JSONL dataset that this script rewrites in place.
const JSONL_PATH = path.join(
  REPO_ROOT,
  "packages",
  "evals",
  "datasets",
  "webtailbench",
  "WebTailBench_data.jsonl",
);

// Parsed shape of a precomputed_rubric value; item fields are defined
// upstream, so they are kept as opaque records here.
interface Rubric {
  items: Array<Record<string, unknown>>;
}

// One row of the local WebTailBench JSONL dataset.
interface LocalRow {
  id: string;
  category?: string;
  ques: string;
  web?: string;
  // Filled in (or overwritten) by this backfill script.
  precomputed_rubric?: Rubric;
}

/**
* Parse a TSV file with simple double-quote escaping (the WebTailBench files
* use `""` for literal quotes inside quoted fields). Returns rows as arrays
* of column values; the caller maps to a schema.
*/
/**
 * Parse a TSV file with simple double-quote escaping (the WebTailBench files
 * use `""` for literal quotes inside quoted fields). Returns rows as arrays
 * of column values; the caller maps to a schema.
 *
 * Empty lines are skipped. A line ending in a tab yields a trailing empty
 * column (previously that final empty field was silently dropped, which made
 * an empty last column — e.g. a blank precomputed_rubric — disappear).
 */
function parseTsv(text: string): string[][] {
  const rows: string[][] = [];
  const lines = text.split(/\r?\n/);
  for (const raw of lines) {
    if (!raw) continue;
    // Each column is either quoted (with "" escapes) or unquoted plain text.
    const cols: string[] = [];
    let i = 0;
    // True when the last consumed character was a separator, so a trailing
    // "\t" still produces its final empty column after the loop.
    let endedOnTab = false;
    while (i < raw.length) {
      if (raw[i] === "\t") {
        // Empty column directly at the cursor.
        cols.push("");
        endedOnTab = true;
        i++;
        continue;
      }
      let col = "";
      if (raw[i] === '"') {
        // Quoted column: consume to the closing quote, turning "" into ".
        i++;
        while (i < raw.length) {
          if (raw[i] === '"') {
            if (raw[i + 1] === '"') {
              col += '"';
              i += 2;
            } else {
              i++;
              break;
            }
          } else {
            col += raw[i];
            i++;
          }
        }
      } else {
        // Unquoted column: runs to the next tab (or end of line).
        const tabIdx = raw.indexOf("\t", i);
        if (tabIdx === -1) {
          col = raw.slice(i);
          i = raw.length;
        } else {
          col = raw.slice(i, tabIdx);
          i = tabIdx;
        }
      }
      cols.push(col);
      endedOnTab = raw[i] === "\t";
      if (endedOnTab) i++;
    }
    if (endedOnTab) cols.push("");
    rows.push(cols);
  }
  return rows;
}

/**
 * Fetch the upstream rubric TSV, index rubrics by task id, and rewrite the
 * local JSONL so every matched row carries a `precomputed_rubric`.
 * Throws on network failure or an unexpected TSV header.
 */
async function main(): Promise<void> {
  console.log(`▸ fetching ${HF_URL}`);
  const response = await fetch(HF_URL);
  if (!response.ok) {
    throw new Error(`HF fetch failed: ${response.status} ${response.statusText}`);
  }
  const tsv = await response.text();
  console.log(` ✓ downloaded ${tsv.length} bytes`);

  const rows = parseTsv(tsv);
  const [header] = rows;
  const idIdx = header.indexOf("id");
  const rubricIdx = header.indexOf("precomputed_rubric");
  if (idIdx === -1 || rubricIdx === -1) {
    throw new Error(
      `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`,
    );
  }

  // Join key → parsed rubric. Rows with malformed JSON are logged and skipped.
  const byId = new Map<string, Rubric>();
  for (let i = 1; i < rows.length; i++) {
    const fields = rows[i];
    if (!fields[idIdx]) continue;
    try {
      byId.set(fields[idIdx], JSON.parse(fields[rubricIdx]) as Rubric);
    } catch (e) {
      console.warn(
        ` ! row ${i} (id=${fields[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`,
      );
    }
  }
  console.log(` ✓ parsed ${byId.size} rubrics`);

  const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8");
  const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0);
  console.log(`▸ joining into ${inLines.length} local rows`);

  // Rewrite each local row, attaching its rubric when the ids join.
  let matched = 0;
  const out = inLines.map((line) => {
    const row = JSON.parse(line) as LocalRow;
    const rubric = byId.get(row.id);
    if (rubric !== undefined) {
      row.precomputed_rubric = rubric;
      matched += 1;
    }
    return JSON.stringify(row);
  });
  const missing = inLines.length - matched;

  console.log(
    ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
  );

  await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
  console.log(`✅ wrote ${JSONL_PATH}`);
}

main().catch((err) => {
  console.error("❌ backfill failed:", err);
  process.exit(1);
});
47 changes: 47 additions & 0 deletions packages/evals/tests/framework/rubricCache.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";

import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";

import { RubricCache } from "../../framework/rubricCache.js";

describe("RubricCache", () => {
  let tempRoot = "";
  let warnSpy: ReturnType<typeof vi.spyOn>;

  // Minimal one-item rubric fixture shared across cases.
  const rubric: Rubric = {
    items: [
      {
        criterion: "criterion",
        description: "description",
        maxPoints: 1,
      },
    ],
  };

  beforeEach(async () => {
    tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rubric-cache-test-"));
    warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
  });

  afterEach(async () => {
    warnSpy.mockRestore();
    await fs.rm(tempRoot, { recursive: true, force: true });
  });

  it("misses when sanitized task ids collide but the stored task id differs", async () => {
    const cache = new RubricCache({ cacheRoot: tempRoot, dataset: "test" });
    // "task/a" and "task:a" both sanitize to the same cache file name.
    const written: TaskSpec = { id: "task/a", instruction: "same instruction" };
    const collider: TaskSpec = { id: "task:a", instruction: "same instruction" };

    await cache.write(written, rubric);

    await expect(cache.read(collider)).resolves.toBeUndefined();
    await expect(cache.read(written)).resolves.toEqual(rubric);
    expect(warnSpy).toHaveBeenCalledWith(
      "[rubric-cache] task-id mismatch for task:a; regenerating",
    );
  });
});
10 changes: 10 additions & 0 deletions packages/evals/tests/tui/commandTree.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { describe, expect, it, vi } from "vitest";
import {
buildCommandTree,
dispatch,
findChild,
resolveCommand,
Expand Down Expand Up @@ -207,6 +208,15 @@ describe("findChild + walkPath", () => {
});
});

describe("buildCommandTree", () => {
  it("exposes verify as a root command", () => {
    const tree = buildCommandTree();
    const verify = findChild(tree, "verify");
    expect(verify?.summary).toBe("Re-score a saved trajectory");
  });
});

// ---------------------------------------------------------------------------
// resolveCommand
// ---------------------------------------------------------------------------
Expand Down
Loading
Loading