Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ ctrf/
**/.playwright*/
packages/evals/playwright-mcp-screenshot-*.png
packages/evals/chrome-devtools-mcp-screenshot-*.png
.trajectories/
132 changes: 132 additions & 0 deletions packages/evals/framework/rubricCache.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/**
* Rubric cache — persists AI-generated rubrics so each task id can hydrate
* from disk after its first generated rubric.
*
* Used for any task whose dataset doesn't ship a precomputed_rubric
* (Mind2Web, ad-hoc bench tasks, etc.). WebTailBench is exempt — its
* upstream dataset already carries rubrics.
*
* Cache layout:
* packages/evals/.rubric-cache/<dataset>/<task-id>.json
*
* The cache key includes the task id and instruction hash to detect drift —
* if either changes, the rubric is regenerated rather than served from a
* stale cache.
*/
import fs from "node:fs/promises";
import path from "node:path";
import crypto from "node:crypto";

import type { Rubric, TaskSpec, V3Evaluator } from "@browserbasehq/stagehand";

export interface RubricCacheOptions {
  /**
   * Root directory for cached rubrics. Defaults to
   * `<packages/evals>/.rubric-cache`.
   */
  cacheRoot?: string;
  /**
   * Dataset name, used as a subdirectory under cacheRoot to keep different
   * datasets' rubrics separate (e.g., "onlineMind2Web").
   */
  dataset: string;
}

/** On-disk shape of one cached rubric entry (serialized as pretty JSON). */
interface CacheEntry {
  // Task id the rubric was generated for; guards against two distinct ids
  // sanitizing to the same file name.
  taskId: string;
  // Truncated SHA-256 of the instruction text; a mismatch means the task's
  // instruction changed since the rubric was cached (drift).
  instructionHash: string;
  // ISO-8601 timestamp of when the rubric was generated (informational).
  generatedAt: string;
  // The cached rubric payload itself.
  rubric: Rubric;
}

/** First 16 hex characters of the SHA-256 digest of the instruction text. */
function hashInstruction(instruction: string): string {
  const digest = crypto.createHash("sha256").update(instruction).digest("hex");
  return digest.slice(0, 16);
}

export class RubricCache {
private readonly cacheDir: string;

constructor(opts: RubricCacheOptions) {
const root =
opts.cacheRoot ??
path.join(process.cwd(), "packages/evals/.rubric-cache");
this.cacheDir = path.join(root, opts.dataset);
}

/**
* Get or generate a rubric for the task. If a fresh cache entry exists
* (same instruction hash), returns it. Otherwise runs Step 0a and persists.
*/
async getOrGenerate(
taskSpec: TaskSpec,
evaluator: V3Evaluator,
): Promise<Rubric> {
const cached = await this.read(taskSpec);
if (cached) return cached;

const rubric = await evaluator.generateRubric(taskSpec);
await this.write(taskSpec, rubric);
return rubric;
}

/** Read a cached rubric. Returns undefined on miss or cache-key drift. */
async read(taskSpec: TaskSpec): Promise<Rubric | undefined> {
const file = this.entryPath(taskSpec.id);
let raw: string;
try {
raw = await fs.readFile(file, "utf8");
} catch {
return undefined;
}
let parsed: CacheEntry;
try {
parsed = JSON.parse(raw) as CacheEntry;
} catch {
return undefined;
}
if (parsed.taskId !== taskSpec.id) {
console.warn(
`[rubric-cache] task-id mismatch for ${taskSpec.id}; regenerating`,
);
return undefined;
}
const expectedHash = hashInstruction(taskSpec.instruction);
if (parsed.instructionHash !== expectedHash) {
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
// Drift detected — surface a clear log and miss.
console.warn(
`[rubric-cache] instruction-hash drift for ${taskSpec.id}; regenerating`,
);
return undefined;
}
return parsed.rubric;
}

async write(taskSpec: TaskSpec, rubric: Rubric): Promise<void> {
await fs.mkdir(this.cacheDir, { recursive: true });
const entry: CacheEntry = {
taskId: taskSpec.id,
instructionHash: hashInstruction(taskSpec.instruction),
generatedAt: new Date().toISOString(),
rubric,
};
await fs.writeFile(
this.entryPath(taskSpec.id),
JSON.stringify(entry, null, 2),
);
}

/** Wipe the cache directory (used by tests / `bench cache clear`). */
async clear(): Promise<void> {
await fs.rm(this.cacheDir, { recursive: true, force: true });
}

private entryPath(taskId: string): string {
// Sanitize task id for filesystem safety.
const safe = taskId.replace(/[^A-Za-z0-9._-]/g, "_");
return path.join(this.cacheDir, `${safe}.json`);
}
}
162 changes: 162 additions & 0 deletions packages/evals/scripts/backfill-webtailbench-rubrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/**
* Backfill packages/evals/datasets/webtailbench/WebTailBench_data.jsonl with
* the published WebTailBench `precomputed_rubric` field.
*
* This script fetches WebTailBench-v1-rubrics.tsv from HuggingFace and joins
* by `id`, writing back a JSONL where each row carries a
* `precomputed_rubric` field (parsed JSON object) alongside the existing
* `ques` / `web` / `category` / `id` fields.
*
* Run once after pulling the branch:
* pnpm tsx packages/evals/scripts/backfill-webtailbench-rubrics.ts
*
* Idempotent — safe to re-run; an existing precomputed_rubric on a row is
* overwritten with the latest upstream version.
*/
import fs from "node:fs/promises";
import path from "node:path";

// Upstream rubrics TSV published with WebTailBench on HuggingFace.
const HF_URL =
  "https://huggingface.co/datasets/microsoft/WebTailBench/resolve/main/WebTailBench-v1-rubrics.tsv";

// Repository root, resolved relative to this script's directory
// (packages/evals/scripts → up three levels).
const REPO_ROOT = path.resolve(import.meta.dirname, "..", "..", "..");
// Local JSONL dataset that this script rewrites in place.
const JSONL_PATH = path.join(
  REPO_ROOT,
  "packages",
  "evals",
  "datasets",
  "webtailbench",
  "WebTailBench_data.jsonl",
);

// Parsed shape of a precomputed_rubric value; item fields are defined
// upstream, so they are kept as opaque records here.
interface Rubric {
  items: Array<Record<string, unknown>>;
}

// One row of the local WebTailBench JSONL dataset.
interface LocalRow {
  id: string;
  category?: string;
  ques: string;
  web?: string;
  // Filled in (or overwritten) by this backfill script.
  precomputed_rubric?: Rubric;
}

/**
* Parse a TSV file with simple double-quote escaping (the WebTailBench files
* use `""` for literal quotes inside quoted fields). Returns rows as arrays
* of column values; the caller maps to a schema.
*/
/**
 * Parse a TSV file with simple double-quote escaping (the WebTailBench files
 * use `""` for literal quotes inside quoted fields). Returns rows as arrays
 * of column values; the caller maps to a schema.
 *
 * Empty lines are skipped. A line ending in a tab yields a trailing empty
 * column (previously that final empty field was silently dropped, which made
 * an empty last column — e.g. a blank precomputed_rubric — disappear).
 */
function parseTsv(text: string): string[][] {
  const rows: string[][] = [];
  const lines = text.split(/\r?\n/);
  for (const raw of lines) {
    if (!raw) continue;
    // Each column is either quoted (with "" escapes) or unquoted plain text.
    const cols: string[] = [];
    let i = 0;
    // True when the last consumed character was a separator, so a trailing
    // "\t" still produces its final empty column after the loop.
    let endedOnTab = false;
    while (i < raw.length) {
      if (raw[i] === "\t") {
        // Empty column directly at the cursor.
        cols.push("");
        endedOnTab = true;
        i++;
        continue;
      }
      let col = "";
      if (raw[i] === '"') {
        // Quoted column: consume to the closing quote, turning "" into ".
        i++;
        while (i < raw.length) {
          if (raw[i] === '"') {
            if (raw[i + 1] === '"') {
              col += '"';
              i += 2;
            } else {
              i++;
              break;
            }
          } else {
            col += raw[i];
            i++;
          }
        }
      } else {
        // Unquoted column: runs to the next tab (or end of line).
        const tabIdx = raw.indexOf("\t", i);
        if (tabIdx === -1) {
          col = raw.slice(i);
          i = raw.length;
        } else {
          col = raw.slice(i, tabIdx);
          i = tabIdx;
        }
      }
      cols.push(col);
      endedOnTab = raw[i] === "\t";
      if (endedOnTab) i++;
    }
    if (endedOnTab) cols.push("");
    rows.push(cols);
  }
  return rows;
}

/**
 * Fetch the upstream rubric TSV, index rubrics by task id, and rewrite the
 * local JSONL so every matched row carries a `precomputed_rubric`.
 * Throws on network failure or an unexpected TSV header.
 */
async function main(): Promise<void> {
  console.log(`▸ fetching ${HF_URL}`);
  const response = await fetch(HF_URL);
  if (!response.ok) {
    throw new Error(`HF fetch failed: ${response.status} ${response.statusText}`);
  }
  const tsv = await response.text();
  console.log(` ✓ downloaded ${tsv.length} bytes`);

  const rows = parseTsv(tsv);
  const [header] = rows;
  const idIdx = header.indexOf("id");
  const rubricIdx = header.indexOf("precomputed_rubric");
  if (idIdx === -1 || rubricIdx === -1) {
    throw new Error(
      `unexpected TSV header: ${header.join(", ")} (need 'id' and 'precomputed_rubric')`,
    );
  }

  // Join key → parsed rubric. Rows with malformed JSON are logged and skipped.
  const byId = new Map<string, Rubric>();
  for (let i = 1; i < rows.length; i++) {
    const fields = rows[i];
    if (!fields[idIdx]) continue;
    try {
      byId.set(fields[idIdx], JSON.parse(fields[rubricIdx]) as Rubric);
    } catch (e) {
      console.warn(
        ` ! row ${i} (id=${fields[idIdx]}) — invalid JSON in precomputed_rubric: ${e instanceof Error ? e.message : e}`,
      );
    }
  }
  console.log(` ✓ parsed ${byId.size} rubrics`);

  const jsonlRaw = await fs.readFile(JSONL_PATH, "utf8");
  const inLines = jsonlRaw.split(/\r?\n/).filter((l) => l.trim().length > 0);
  console.log(`▸ joining into ${inLines.length} local rows`);

  // Rewrite each local row, attaching its rubric when the ids join.
  let matched = 0;
  const out = inLines.map((line) => {
    const row = JSON.parse(line) as LocalRow;
    const rubric = byId.get(row.id);
    if (rubric !== undefined) {
      row.precomputed_rubric = rubric;
      matched += 1;
    }
    return JSON.stringify(row);
  });
  const missing = inLines.length - matched;

  console.log(
    ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
  );

  await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
  console.log(`✅ wrote ${JSONL_PATH}`);
}

main().catch((err) => {
  console.error("❌ backfill failed:", err);
  process.exit(1);
});
47 changes: 47 additions & 0 deletions packages/evals/tests/framework/rubricCache.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";

import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";

import { RubricCache } from "../../framework/rubricCache.js";

describe("RubricCache", () => {
  let tempRoot = "";
  let warnSpy: ReturnType<typeof vi.spyOn>;

  // Minimal one-item rubric fixture shared across cases.
  const rubric: Rubric = {
    items: [
      {
        criterion: "criterion",
        description: "description",
        maxPoints: 1,
      },
    ],
  };

  beforeEach(async () => {
    tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rubric-cache-test-"));
    warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
  });

  afterEach(async () => {
    warnSpy.mockRestore();
    await fs.rm(tempRoot, { recursive: true, force: true });
  });

  it("misses when sanitized task ids collide but the stored task id differs", async () => {
    const cache = new RubricCache({ cacheRoot: tempRoot, dataset: "test" });
    // "task/a" and "task:a" both sanitize to the same cache file name.
    const written: TaskSpec = { id: "task/a", instruction: "same instruction" };
    const collider: TaskSpec = { id: "task:a", instruction: "same instruction" };

    await cache.write(written, rubric);

    await expect(cache.read(collider)).resolves.toBeUndefined();
    await expect(cache.read(written)).resolves.toEqual(rubric);
    expect(warnSpy).toHaveBeenCalledWith(
      "[rubric-cache] task-id mismatch for task:a; regenerating",
    );
  });
});
10 changes: 10 additions & 0 deletions packages/evals/tests/tui/commandTree.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { describe, expect, it, vi } from "vitest";
import {
buildCommandTree,
dispatch,
findChild,
resolveCommand,
Expand Down Expand Up @@ -207,6 +208,15 @@ describe("findChild + walkPath", () => {
});
});

describe("buildCommandTree", () => {
  it("exposes verify as a root command", () => {
    const tree = buildCommandTree();
    const verify = findChild(tree, "verify");
    expect(verify?.summary).toBe("Re-score a saved trajectory");
  });
});

// ---------------------------------------------------------------------------
// resolveCommand
// ---------------------------------------------------------------------------
Expand Down
Loading
Loading