-
Notifications
You must be signed in to change notification settings - Fork 1.5k
feat(evals): add offline verifier CLI #2134
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
miguelg719
wants to merge
7
commits into
miguelgonzalez/verifier-05-core-engine
Choose a base branch
from
miguelgonzalez/verifier-06-offline-cli
base: miguelgonzalez/verifier-05-core-engine
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
1265dca
fix(verifier): bound failure step parsing
miguelg719 3d1a1b1
feat(evals): add offline verifier CLI
miguelg719 3069b2f
fix(evals): use camel raw verifier metadata
miguelg719 fd4b797
fix(evals): restore command tree verifier cli
miguelg719 68583d0
fix(evals): include doctor in restored help
miguelg719 42a81b9
docs(evals): remove rollout comments from offline verifier
miguelg719 4f141e7
fix(evals): align offline verifier result naming
miguelg719 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| /** | ||
| * Rubric cache — persists AI-generated rubrics so each task id can hydrate | ||
| * from disk after its first generated rubric. | ||
| * | ||
| * Used for any task whose dataset doesn't ship a precomputed_rubric | ||
| * (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its | ||
| * upstream dataset already carries rubrics. | ||
| * | ||
| * Cache layout: | ||
| * packages/evals/.rubric-cache/<dataset>/<task-id>.json | ||
| * | ||
| * The cache key includes the task id and instruction hash to detect drift — | ||
| * if either changes, the rubric is regenerated rather than served from a | ||
| * stale cache. | ||
| */ | ||
| import fs from "node:fs/promises"; | ||
| import path from "node:path"; | ||
| import crypto from "node:crypto"; | ||
|
|
||
| import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand"; | ||
|
|
||
/** Options controlling where a RubricCache stores its entries on disk. */
export interface RubricCacheOptions {
  /**
   * Root directory for cached rubrics. Defaults to
   * `<packages/evals>/.rubric-cache` (resolved relative to process.cwd()).
   */
  cacheRoot?: string;
  /**
   * Dataset name, used as a subdirectory under cacheRoot to keep different
   * datasets' rubrics separate (e.g., "onlineMind2Web").
   */
  dataset: string;
}
|
|
||
/** On-disk JSON shape of one cached rubric entry. */
interface CacheEntry {
  // Original (unsanitized) task id; checked on read to catch filename
  // collisions introduced by entryPath's sanitization.
  taskId: string;
  // Truncated SHA-256 of the task instruction; a mismatch on read means the
  // instruction drifted and the rubric must be regenerated.
  instructionHash: string;
  // ISO-8601 timestamp recording when the rubric was generated.
  generatedAt: string;
  // The cached rubric payload itself.
  rubric: Rubric;
}
|
|
||
| function hashInstruction(instruction: string): string { | ||
| return crypto | ||
| .createHash("sha256") | ||
| .update(instruction) | ||
| .digest("hex") | ||
| .slice(0, 16); | ||
| } | ||
|
|
||
| export class RubricCache { | ||
| private readonly cacheDir: string; | ||
|
|
||
| constructor(opts: RubricCacheOptions) { | ||
| const root = | ||
| opts.cacheRoot ?? | ||
| path.join(process.cwd(), "packages/evals/.rubric-cache"); | ||
| this.cacheDir = path.join(root, opts.dataset); | ||
| } | ||
|
|
||
| /** | ||
| * Get or generate a rubric for the task. If a fresh cache entry exists | ||
| * (same instruction hash), returns it. Otherwise runs Step 0a and persists. | ||
| */ | ||
| async getOrGenerate( | ||
| taskSpec: TaskSpec, | ||
| evaluator: V3Evaluator, | ||
| ): Promise<Rubric> { | ||
| const cached = await this.read(taskSpec); | ||
| if (cached) return cached; | ||
|
|
||
| const rubric = await evaluator.generateRubric(taskSpec); | ||
| await this.write(taskSpec, rubric); | ||
| return rubric; | ||
| } | ||
|
|
||
| /** Read a cached rubric. Returns undefined on miss or cache-key drift. */ | ||
| async read(taskSpec: TaskSpec): Promise<Rubric | undefined> { | ||
| const file = this.entryPath(taskSpec.id); | ||
| let raw: string; | ||
| try { | ||
| raw = await fs.readFile(file, "utf8"); | ||
| } catch { | ||
| return undefined; | ||
| } | ||
| let parsed: CacheEntry; | ||
| try { | ||
| parsed = JSON.parse(raw) as CacheEntry; | ||
| } catch { | ||
| return undefined; | ||
| } | ||
| if (parsed.taskId !== taskSpec.id) { | ||
| console.warn( | ||
| `[rubric-cache] task-id mismatch for ${taskSpec.id}; regenerating`, | ||
| ); | ||
| return undefined; | ||
| } | ||
| const expectedHash = hashInstruction(taskSpec.instruction); | ||
| if (parsed.instructionHash !== expectedHash) { | ||
| // Drift detected — surface a clear log and miss. | ||
| console.warn( | ||
| `[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`, | ||
| ); | ||
| return undefined; | ||
| } | ||
| return parsed.rubric; | ||
| } | ||
|
|
||
| async write(taskSpec: TaskSpec, rubric: Rubric): Promise<void> { | ||
| await fs.mkdir(this.cacheDir, { recursive: true }); | ||
| const entry: CacheEntry = { | ||
| taskId: taskSpec.id, | ||
| instructionHash: hashInstruction(taskSpec.instruction), | ||
| generatedAt: new Date().toISOString(), | ||
| rubric, | ||
| }; | ||
| await fs.writeFile( | ||
| this.entryPath(taskSpec.id), | ||
| JSON.stringify(entry, null, 2), | ||
| ); | ||
| } | ||
|
|
||
| /** Wipe the cache directory (used by tests / `bench cache clear`). */ | ||
| async clear(): Promise<void> { | ||
| await fs.rm(this.cacheDir, { recursive: true, force: true }); | ||
| } | ||
|
|
||
| private entryPath(taskId: string): string { | ||
| // Sanitize task id for filesystem safety. | ||
| const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_"); | ||
| return path.join(this.cacheDir, `${safe}.json`); | ||
| } | ||
| } | ||
162 changes: 162 additions & 0 deletions
162
packages/evals/scripts/backfill-webtailbench-rubrics.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| /** | ||
| * Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with | ||
| * the published WebTailBench `precomputed_rubric` field. | ||
| * | ||
| * This script fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins | ||
| * by `id`, writing back a JSONL where each row carries a | ||
| * `precomputed_rubric` field (parsed JSON object) alongside the existing | ||
| * `ques` / `web` / `category` / `id` fields. | ||
| * | ||
| * Run once after pulling the branch: | ||
| * pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts | ||
| * | ||
| * Idempotent — safe to re-run; an existing precomputed_rubric on a row is | ||
| * overwritten with the latest upstream version. | ||
| */ | ||
| import fs from "node:fs/promises"; | ||
| import path from "node:path"; | ||
|
|
||
// Upstream rubric TSV published alongside WebTailBench on HuggingFace.
const HF_URL =
  "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv";

// This script lives three directories below the repo root.
// NOTE: `import.meta.dirname` requires Node >= 20.11.
const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", "..");
// Local JSONL dataset that the script rewrites in place.
const JSONL_PATH = path.join(
  REPO_ROOT,
  "packages",
  "evals",
  "datasets",
  "webtailbench",
  "WebTailBench_data.jsonl",
);
|
|
||
/** Minimal rubric shape — items are kept opaque and passed through verbatim. */
interface Rubric {
  items: Array<Record<string, unknown>>;
}

/** One row of the local WebTailBench JSONL dataset. */
interface LocalRow {
  // Join key against the upstream TSV's `id` column.
  id: string;
  category?: string;
  // Task instruction text.
  ques: string;
  // Target website, when the dataset provides one.
  web?: string;
  // Filled in (or overwritten) by this script from the upstream rubric.
  precomputed_rubric?: Rubric;
}
|
|
||
| /** | ||
| * Parse a TSV file with simple double-quote escaping (the WebTailBench files | ||
| * use `""` for literal quotes inside quoted fields). Returns rows as arrays | ||
| * of column values; the caller maps to a schema. | ||
| */ | ||
| function parseTsv(text: string): string[][] { | ||
| const rows: string[][] = []; | ||
| const lines = text.split(/\r?\n/); | ||
| for (const raw of lines) { | ||
| if (!raw) continue; | ||
| // Each column is either quoted (with "" escapes) or unquoted plain text. | ||
| const cols: string[] = []; | ||
| let i = 0; | ||
| while (i < raw.length) { | ||
| if (raw[i] === "\t") { | ||
| cols.push(""); | ||
| i++; | ||
| continue; | ||
| } | ||
| let col = ""; | ||
| if (raw[i] === '"') { | ||
| i++; | ||
| while (i < raw.length) { | ||
| if (raw[i] === '"') { | ||
| if (raw[i + 1] === '"') { | ||
| col += '"'; | ||
| i += 2; | ||
| } else { | ||
| i++; | ||
| break; | ||
| } | ||
| } else { | ||
| col += raw[i]; | ||
| i++; | ||
| } | ||
| } | ||
| } else { | ||
| const tabIdx = raw.indexOf("\t", i); | ||
| if (tabIdx === -1) { | ||
| col = raw.slice(i); | ||
| i = raw.length; | ||
| } else { | ||
| col = raw.slice(i, tabIdx); | ||
| i = tabIdx; | ||
| } | ||
| } | ||
| cols.push(col); | ||
| if (raw[i] === "\t") i++; | ||
| } | ||
| rows.push(cols); | ||
| } | ||
| return rows; | ||
| } | ||
|
|
||
| async function main(): Promise<void> { | ||
| console.log(`▸ fetching ${HF_URL}`); | ||
| const res = await fetch(HF_URL); | ||
| if (!res.ok) { | ||
| throw new Error(`HF fetch failed: ${res.status} ${res.statusText}`); | ||
| } | ||
| const tsv = await res.text(); | ||
| console.log(` ✓ downloaded ${tsv.length} bytes`); | ||
|
|
||
| const rows = parseTsv(tsv); | ||
| const header = rows[0]; | ||
| const idIdx = header.indexOf("id"); | ||
| const rubricIdx = header.indexOf("precomputed_rubric"); | ||
| if (idIdx === -1 || rubricIdx === -1) { | ||
| throw new Error( | ||
| `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`, | ||
| ); | ||
| } | ||
|
|
||
| const rubricsById = new Map<string, Rubric>(); | ||
| for (let i = 1; i < rows.length; i++) { | ||
| const cols = rows[i]; | ||
| if (!cols[idIdx]) continue; | ||
| try { | ||
| const parsed = JSON.parse(cols[rubricIdx]) as Rubric; | ||
| rubricsById.set(cols[idIdx], parsed); | ||
| } catch (e) { | ||
| console.warn( | ||
| ` ! row ${i} (id=${cols[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`, | ||
| ); | ||
| } | ||
| } | ||
| console.log(` ✓ parsed ${rubricsById.size} rubrics`); | ||
|
|
||
| const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8"); | ||
| const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0); | ||
| console.log(`▸ joining into ${inLines.length} local rows`); | ||
|
|
||
| let matched = 0; | ||
| let missing = 0; | ||
| const out: string[] = []; | ||
| for (const line of inLines) { | ||
| const row = JSON.parse(line) as LocalRow; | ||
| const rubric = rubricsById.get(row.id); | ||
| if (rubric) { | ||
| row.precomputed_rubric = rubric; | ||
| matched++; | ||
| } else { | ||
| missing++; | ||
| } | ||
| out.push(JSON.stringify(row)); | ||
| } | ||
|
|
||
| console.log( | ||
| ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`, | ||
| ); | ||
|
|
||
| await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8"); | ||
| console.log(`✅ wrote ${JSONL_PATH}`); | ||
| } | ||
|
|
||
| main().catch((err) => { | ||
| console.error("❌ backfill failed:", err); | ||
| process.exit(1); | ||
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| import fs from "node:fs/promises"; | ||
| import os from "node:os"; | ||
| import path from "node:path"; | ||
|
|
||
| import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; | ||
| import type { Rubric, TaskSpec } from "@browserbasehq/stagehand"; | ||
|
|
||
| import { RubricCache } from "../../framework/rubricCache.js"; | ||
|
|
||
| describe("RubricCache", () => { | ||
| let tmpRoot = ""; | ||
| let warn: ReturnType<typeof vi.spyOn>; | ||
|
|
||
| const rubric: Rubric = { | ||
| items: [ | ||
| { | ||
| criterion: "criterion", | ||
| description: "description", | ||
| maxPoints: 1, | ||
| }, | ||
| ], | ||
| }; | ||
|
|
||
| beforeEach(async () => { | ||
| tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rubric-cache-test-")); | ||
| warn = vi.spyOn(console, "warn").mockImplementation(() => {}); | ||
| }); | ||
|
|
||
| afterEach(async () => { | ||
| warn.mockRestore(); | ||
| await fs.rm(tmpRoot, { recursive: true, force: true }); | ||
| }); | ||
|
|
||
| it("misses when sanitized task ids collide but the stored task id differs", async () => { | ||
| const cache = new RubricCache({ cacheRoot: tmpRoot, dataset: "test" }); | ||
| const taskA: TaskSpec = { id: "task/a", instruction: "same instruction" }; | ||
| const taskB: TaskSpec = { id: "task:a", instruction: "same instruction" }; | ||
|
|
||
| await cache.write(taskA, rubric); | ||
|
|
||
| await expect(cache.read(taskB)).resolves.toBeUndefined(); | ||
| await expect(cache.read(taskA)).resolves.toEqual(rubric); | ||
| expect(warn).toHaveBeenCalledWith( | ||
| "[rubric-cache] task-id mismatch for task:a; regenerating", | ||
| ); | ||
| }); | ||
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.