browserbase · miguelg719 · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
@@ -0,0 +1,177 @@
+/**
+ * verifierAdapter — runs a bench task through the verifier pipeline.
+ *
+ * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
+ * with one call:
+ *
+ *   const { evaluationResult, trajectory } = await runWithVerifier({
+ *     v3,
+ *     agent,
+ *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
+ *     dataset: "webtailbench",
+ *     agentOptions: { maxSteps: 50 },
+ *   });
+ *
+ * Behavior:
+ *   1. Resolves the rubric from the task, cache, or evaluator.
+ *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
+ *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
+ *   4. Returns { trajectory, evaluationResult, agentResult, rubric, trajectoryDir }.
+ *
+ * Persistence and rubric caching are gated by env vars:
+ *   VERIFIER_PERSIST_TRAJECTORIES   — on locally, off in CI by default.
+ *   VERIFIER_DISABLE_RUBRIC_CACHE   — set to "1" to bypass the cache (forces
+ *                                     fresh rubric generation every time).
+ */
+import {
+  V3Evaluator,
+  normalizeRubric,
+  type AgentInstance,
+  type AgentExecuteOptions,
+  type AgentResult,
+  type EvaluationResult,
+  type Rubric,
+  type TaskSpec,
+  type Trajectory,
+  type V3,
+} from "@browserbasehq/stagehand";
+
+import { RubricCache } from "./rubricCache.js";
+import { TrajectoryRecorder } from "./trajectoryRecorder.js";
+
+/** Inputs accepted by `runWithVerifier`. */
+export interface RunWithVerifierOptions {
+  /** Stagehand instance; handed to both the V3Evaluator and the TrajectoryRecorder. */
+  v3: V3;
+  /** Agent whose `execute()` call is recorded and then verified. */
+  agent: AgentInstance;
+  /**
+   * Task under test. Its `instruction` is what the agent is asked to do, and
+   * its optional `precomputedRubric` takes precedence over cached/generated ones.
+   */
+  taskSpec: TaskSpec;
+  /**
+   * Dataset name for rubric cache partitioning. Each task lives under
+   * `.rubric-cache/<dataset>/<task-id>.json`.
+   */
+  dataset: string;
+  /** Agent execute options. `instruction` is filled from taskSpec.instruction. */
+  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
+  /** Override the run id (defaults to ISO timestamp). Forwarded to the recorder. */
+  runId?: string;
+  /** Override trajectory persistence root. Forwarded to the recorder as `outputRoot`. */
+  trajectoryRoot?: string;
+}
+
+/** Everything `runWithVerifier` hands back after a completed run. */
+export interface RunWithVerifierResult {
+  /** Trajectory returned by the recorder once the agent run finished. */
+  trajectory: Trajectory;
+  /** Verdict produced by `V3Evaluator.verify()` for that trajectory. */
+  evaluationResult: EvaluationResult;
+  /** Raw value returned by `agent.execute()`. */
+  agentResult: AgentResult;
+  /** Resolved rubric (precomputed, cached, or freshly generated). */
+  rubric: Rubric;
+  /** Where the trajectory was persisted (or would have been, if disabled). */
+  trajectoryDir: string;
+}
+
+/**
+ * Run one bench task end to end: resolve its rubric, record the agent run as
+ * a trajectory, then score that trajectory with the verifier backend.
+ *
+ * @param opts - See {@link RunWithVerifierOptions}.
+ * @returns The recorded trajectory, the verifier's evaluation, the raw agent
+ *   result, the rubric that was used, and the recorder's output directory.
+ * @throws The error raised by `agent.execute()` (non-Error values are wrapped
+ *   in an `Error`). It is tagged with `trajectoryDir` and, when the recorder
+ *   could be finalized, `trajectory`; otherwise with `recorderError`.
+ */
+export async function runWithVerifier(
+  opts: RunWithVerifierOptions,
+): Promise<RunWithVerifierResult> {
+  const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
+    opts;
+  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
+
+  // ── Resolve rubric ──────────────────────────────────────────────────────
+  // A precomputed rubric is only trusted if it survives normalization. When
+  // it does not, fall back to generation/cache instead of asserting non-null
+  // and returning a nullish value typed as `Rubric`.
+  // NOTE(review): on that fallback path `taskSpec` still carries the rejected
+  // rubric — confirm the generator/cache ignore it.
+  const precomputed = taskSpec.precomputedRubric
+    ? normalizeRubric(taskSpec.precomputedRubric)
+    : undefined;
+  let resolvedRubric: Rubric;
+  if (precomputed) {
+    resolvedRubric = precomputed;
+  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+    resolvedRubric = await evaluator.generateRubric(taskSpec);
+  } else {
+    const cache = new RubricCache({ dataset });
+    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
+  }
+
+  // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
+  const hydratedTaskSpec: TaskSpec = {
+    ...taskSpec,
+    precomputedRubric: resolvedRubric,
+  };
+
+  // ── Record trajectory around agent.execute() ───────────────────────────
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec: hydratedTaskSpec,
+    runId,
+    outputRoot: trajectoryRoot,
+  });
+  recorder.start();
+
+  let agentResult: AgentResult;
+  try {
+    agentResult = await agent.execute({
+      ...agentOptions,
+      instruction: taskSpec.instruction,
+    });
+  } catch (e) {
+    // Re-throw after persisting so the bench task can decide how to report.
+    const wrapped = e instanceof Error ? e : new Error(String(e));
+    try {
+      const trajectory = await recorder.finish({ status: "error" });
+      Object.assign(wrapped, { trajectoryDir: recorder.directory, trajectory });
+    } catch (recorderError) {
+      // Finalizing the recorder is secondary: its failure must not replace
+      // the agent's error, so attach it and keep throwing the original.
+      Object.assign(wrapped, {
+        trajectoryDir: recorder.directory,
+        recorderError,
+      });
+    }
+    throw wrapped;
+  }
+
+  const trajectory = await recorder.finish({
+    status: "complete",
+    finalAnswer: agentResult.message,
+    usage: agentResult.usage,
+  });
+
+  // ── Verify ──────────────────────────────────────────────────────────────
+  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistResult(evaluationResult);
+
+  return {
+    trajectory,
+    evaluationResult,
+    agentResult,
+    rubric: resolvedRubric,
+    trajectoryDir: recorder.directory,
+  };
+}
+
+/**
+ * Decide bench task success from an EvaluationResult using the --success flag's
+ * semantics.
+ *
+ * `outcome` (default) — strict binary outcome.
+ * `process`           — rubric process score ≥ threshold (default 0.8).
+ * `both`              — both conditions must hold.
+ */
+export type EvalSuccessMode = "outcome" | "process" | "both";
+
+/**
+ * Coerce an arbitrary flag value into a known success mode.
+ *
+ * Strings are trimmed and lower-cased before matching. Non-strings and
+ * unrecognized strings both resolve to `"outcome"`.
+ *
+ * @param mode - Raw value, typically taken from a CLI flag or env var.
+ * @returns One of `"outcome"`, `"process"` or `"both"`.
+ */
+export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode {
+  const fallback: EvalSuccessMode = "outcome";
+  if (typeof mode !== "string") {
+    return fallback;
+  }
+  switch (mode.trim().toLowerCase()) {
+    case "process":
+      return "process";
+    case "both":
+      return "both";
+    default:
+      // Covers the literal "outcome" as well as anything unrecognized.
+      return fallback;
+  }
+}
+
+/**
+ * Reduce an EvaluationResult to a single pass/fail for the bench task.
+ *
+ * @param result - Verifier output; `outcomeSuccess` and `processScore` are read.
+ * @param mode - Raw success mode, resolved via `resolveEvalSuccessMode`.
+ * @param processThreshold - Minimum `processScore` counted as a process pass.
+ * @returns `outcomeSuccess` for "outcome", the threshold check for "process",
+ *   and the conjunction of the two for "both". A non-numeric `processScore`
+ *   never passes the threshold check.
+ */
+export function evaluationResultToSuccess(
+  result: EvaluationResult,
+  mode: unknown = "outcome",
+  processThreshold = 0.8,
+): boolean {
+  const successMode = resolveEvalSuccessMode(mode);
+  const outcomePassed = result.outcomeSuccess;
+  const score = result.processScore;
+  const processPassed = typeof score === "number" && score >= processThreshold;
+
+  if (successMode === "process") {
+    return processPassed;
+  }
+  if (successMode === "both") {
+    return outcomePassed && processPassed;
+  }
+  return outcomePassed;
+}
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@ const JSONL_PATH = path.join(
   "WebTailBench_data.jsonl",
 );
 
-interface Rubric {
+interface RawRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@ interface LocalRow {
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: Rubric;
+  precomputed_rubric?: RawRubric;
 }
 
 /**
@@ -114,12 +114,12 @@ async function main(): Promise<void> {
     );
   }
 
-  const rubricsById = new Map<string, Rubric>();
+  const rubricsById = new Map<string, RawRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
@@ -149,7 +149,7 @@ async function main(): Promise<void> {
   }
 
   console.log(
-    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
   );
 
   await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");

diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts
@@ -1,5 +1,5 @@
 import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js";
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand";
 import { tasksConfig } from "../taskConfig.js";
 import { getPackageRootDir } from "../runtimePaths.js";
 import {
@@ -32,6 +32,12 @@ export const buildWebTailBenchTestcases = (
     ques: string;
     category?: string;
     web?: string;
+    /**
+     * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv
+     * via packages/evals/scripts/backfill-webtailbench-rubrics.ts.
+     * When present, the verifier uses these upstream criteria directly.
+     */
+    precomputed_rubric?: unknown;
     [key: string]: unknown;
   };
 
@@ -42,7 +48,23 @@ export const buildWebTailBenchTestcases = (
   }
 
   const candidates = parseJsonlRows(lines, isWebTailBenchRow);
-  const rows = applySampling(candidates, sampleCount, maxCases);
+
+  // EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs,
+  // preserving the order given and ignoring sampling / limit knobs.
+  const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS
+    ? process.env.EVAL_WEBTAILBENCH_IDS.split(",")
+        .map((s) => s.trim())
+        .filter(Boolean)
+    : null;
+  let rows: WebTailBenchRow[];
+  if (explicitIds && explicitIds.length > 0) {
+    const byId = new Map(candidates.map((r) => [r.id, r]));
+    rows = explicitIds
+      .map((id) => byId.get(id))
+      .filter((r): r is WebTailBenchRow => Boolean(r));
+  } else {
+    rows = applySampling(candidates, sampleCount, maxCases);
+  }
 
   const allTestcases: Testcase[] = [];
   for (const modelEntry of normalizeAgentModelEntries(models)) {
@@ -57,6 +79,7 @@ export const buildWebTailBenchTestcases = (
           category: row.category,
           ques: row.ques,
           web: row.web,
+          precomputed_rubric: normalizeRubric(row.precomputed_rubric),
         },
       };
       const taskCategories =