From adca1438f266c6cc2e10c3878afbf43cf76010ab Mon Sep 17 00:00:00 2001
From: miguel
Date: Fri, 15 May 2026 13:50:01 -0700
Subject: [PATCH 1/5] feat(evals): wire WebTailBench through verifier

---
 packages/evals/framework/verifierAdapter.ts   | 160 ++++++++++++++
 .../evals/scripts/verify-webtailbench-task.ts | 200 ++++++++++++++++++
 packages/evals/suites/webtailbench.ts         |  29 ++-
 .../evals/tasks/bench/agent/webtailbench.ts   | 109 +++++----
 4 files changed, 445 insertions(+), 53 deletions(-)
 create mode 100644 packages/evals/framework/verifierAdapter.ts
 create mode 100644 packages/evals/scripts/verify-webtailbench-task.ts

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
new file mode 100644
index 000000000..5351b28b3
--- /dev/null
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -0,0 +1,160 @@
+/**
+ * verifierAdapter — runs a bench task through the verifier pipeline.
+ *
+ * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
+ * with one call:
+ *
+ *   const { verdict, trajectory } = await runWithVerifier({
+ *     v3,
+ *     agent,
+ *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
+ *     dataset, agentOptions: { maxSteps: 50 },
+ *   });
+ *
+ * Behavior:
+ *   1. Resolves the rubric — precomputedRubric (e.g., upstream WebTailBench),
+ *      or generates via Step 0a and caches under .rubric-cache/<dataset>/.
+ *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
+ *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
+ *   4. Returns { trajectory, verdict, agentResult }.
+ *
+ * Persistence and rubric caching are gated by env vars (plan §10 Q2 + Q3):
+ *   VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
+ *   VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
+ *     a fresh Step 0a call every time).
+ */
+import {
+  V3Evaluator,
+  type AgentInstance,
+  type AgentExecuteOptions,
+  type AgentResult,
+  type Rubric,
+  type TaskSpec,
+  type Trajectory,
+  type V3,
+  type Verdict,
+} from "@browserbasehq/stagehand";
+
+import { RubricCache } from "./rubricCache.js";
+import { TrajectoryRecorder } from "./trajectoryRecorder.js";
+
+export interface RunWithVerifierOptions {
+  v3: V3;
+  agent: AgentInstance;
+  taskSpec: TaskSpec;
+  /**
+   * Dataset name for rubric cache partitioning. Each task lives under
+   * `.rubric-cache/<dataset>/<id>.json`.
+   */
+  dataset: string;
+  /** Agent execute options. `instruction` is filled from taskSpec.instruction. */
+  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
+  /** Override the run id (defaults to ISO timestamp). */
+  runId?: string;
+  /** Override trajectory persistence root. */
+  trajectoryRoot?: string;
+}
+
+export interface RunWithVerifierResult {
+  trajectory: Trajectory;
+  verdict: Verdict;
+  agentResult: AgentResult;
+  /** Resolved rubric (precomputed, cached, or freshly generated). */
+  rubric: Rubric;
+  /** Where the trajectory was persisted (or would have been, if disabled). */
+  trajectoryDir: string;
+}
+
+export async function runWithVerifier(
+  opts: RunWithVerifierOptions,
+): Promise<RunWithVerifierResult> {
+  const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
+    opts;
+  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
+
+  // ── Resolve rubric ──────────────────────────────────────────────────────
+  let resolvedRubric: Rubric;
+  if (taskSpec.precomputedRubric) {
+    resolvedRubric = taskSpec.precomputedRubric;
+  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+    resolvedRubric = await evaluator.generateRubric(taskSpec);
+  } else {
+    const cache = new RubricCache({ dataset });
+    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
+  }
+
+  // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
+  const hydratedTaskSpec: TaskSpec = {
+    ...taskSpec,
+    precomputedRubric: resolvedRubric,
+  };
+
+  // ── Record trajectory around agent.execute() ───────────────────────────
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec: hydratedTaskSpec,
+    runId,
+    outputRoot: trajectoryRoot,
+  });
+  recorder.start();
+
+  let agentResult: AgentResult;
+  let recorderStatus: "complete" | "aborted" | "error" = "complete";
+  try {
+    agentResult = await agent.execute({
+      ...agentOptions,
+      instruction: taskSpec.instruction,
+    });
+  } catch (e) {
+    recorderStatus = "error";
+    const trajectory = await recorder.finish({ status: recorderStatus });
+    // Re-throw after persisting so the bench task can decide how to report.
+    const wrapped = e instanceof Error ? e : new Error(String(e));
+    Object.assign(wrapped, { trajectoryDir: recorder.directory, trajectory });
+    throw wrapped;
+  }
+
+  const trajectory = await recorder.finish({
+    status: recorderStatus,
+    finalAnswer: agentResult.message,
+    usage: agentResult.usage,
+  });
+
+  // ── Verify ──────────────────────────────────────────────────────────────
+  const verdict = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistVerdict(verdict);
+
+  return {
+    trajectory,
+    verdict,
+    agentResult,
+    rubric: resolvedRubric,
+    trajectoryDir: recorder.directory,
+  };
+}
+
+/**
+ * Decide bench task success from a Verdict using the --success flag's
+ * semantics (mirrors fara's CLI knob, plan §03).
+ *
+ * `outcome` (default) — strict binary outcome. Matches fara-7b's reported
+ *   metric.
+ * `process` — rubric process score ≥ threshold (default 0.8).
+ * `both`    — both conditions must hold.
+ */
+export function verdictToSuccess(
+  verdict: Verdict,
+  mode: "outcome" | "process" | "both" = "outcome",
+  processThreshold = 0.8,
+): boolean {
+  const outcomeOk = verdict.outcomeSuccess;
+  const processOk = verdict.processScore >= processThreshold;
+  switch (mode) {
+    case "outcome":
+      return outcomeOk;
+    case "process":
+      return processOk;
+    case "both":
+      return outcomeOk && processOk;
+  }
+}
diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts
new file mode 100644
index 000000000..4e1848e84
--- /dev/null
+++ b/packages/evals/scripts/verify-webtailbench-task.ts
@@ -0,0 +1,200 @@
+/**
+ * End-to-end Wave 1 verification on a real WebTailBench task.
+ *
+ * Loads one row from packages/evals/datasets/webtailbench/WebTailBench_data.jsonl
+ * (which carries upstream precomputed_rubric), runs the agent on Browserbase
+ * via runWithVerifier, and asserts:
+ *   1. Recorder captures a non-trivial trajectory.
+ *   2. Verifier uses the upstream rubric (rubricSource = "precomputed").
+ *   3. Step 6 rescoring produces per-criterion scores (no evidence_insufficient).
+ *   4. Step 8 outcome returns a boolean verdict with reasoning.
+ *
+ *   pnpm tsx packages/evals/scripts/verify-webtailbench-task.ts [task_id]
+ *
+ * Defaults to united_13. Requires BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID
+ * and a GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env.
+ */
+import "dotenv/config";
+import assert from "node:assert/strict";
+import fs from "node:fs/promises";
+import path from "node:path";
+
+import { V3 } from "@browserbasehq/stagehand";
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+import { runWithVerifier } from "../framework/verifierAdapter.js";
+
+interface WebTailBenchRow {
+  id: string;
+  category?: string;
+  ques: string;
+  web?: string;
+  precomputed_rubric?: Rubric;
+}
+
+const DEFAULT_TASK_ID = "united_13";
+const JSONL = path.resolve(
+  import.meta.dirname,
+  "..",
+  "datasets",
+  "webtailbench",
+  "WebTailBench_data.jsonl",
+);
+
+async function loadRow(taskId: string): Promise<WebTailBenchRow> {
+  const raw = await fs.readFile(JSONL, "utf8");
+  for (const line of raw.split(/\r?\n/)) {
+    if (!line.trim()) continue;
+    const row = JSON.parse(line) as WebTailBenchRow;
+    if (row.id === taskId) return row;
+  }
+  throw new Error(`task id ${taskId} not found in ${JSONL}`);
+}
+
+async function main(): Promise<void> {
+  const taskId = process.argv[2] ?? DEFAULT_TASK_ID;
+  const mode = (process.env.AGENT_MODE ?? "hybrid") as "dom" | "hybrid" | "cua";
+  const model =
+    process.env.AGENT_MODEL ??
+    (mode === "cua" ? "anthropic/claude-haiku-4-5" : "google/gemini-2.5-flash");
+  console.log(`▸ loading WebTailBench task: ${taskId}`);
+  console.log(`  mode=${mode} model=${model}`);
+  const row = await loadRow(taskId);
+  console.log(`  ✓ ${row.ques.slice(0, 100)}`);
+  console.log(
+    `  ✓ rubric: ${row.precomputed_rubric ? `${row.precomputed_rubric.items.length} criteria` : "MISSING"}`,
+  );
+  assert.ok(
+    row.precomputed_rubric && row.precomputed_rubric.items.length > 0,
+    "task should carry a precomputed rubric (run backfill-webtailbench-rubrics.ts first)",
+  );
+
+  // Most WebTailBench sites block local browser traffic; ideally this runs on
+  // BROWSERBASE. Defaults to LOCAL when Browserbase creds aren't configured —
+  // the verifier still exercises end-to-end on whatever trajectory we capture,
+  // even if the agent fails fast against anti-bot.
+  const useBrowserbase =
+    process.env.BROWSERBASE_API_KEY && process.env.BROWSERBASE_PROJECT_ID;
+  const env = useBrowserbase ? "BROWSERBASE" : "LOCAL";
+  console.log(`▸ initializing V3 on ${env}`);
+  const v3 = new V3({
+    env,
+    verbose: 1,
+    model,
+    // Keep the agent loop local even on env=BROWSERBASE — without this V3
+    // would auto-create an apiClient and dispatch agent.execute() to the
+    // remote server-side loop, which doesn't emit on our local bus. The
+    // evals framework does this same opt-out in packages/evals/initV3.ts:121
+    // via process.env.USE_API. disableAPI is the targeted flag; we used
+    // experimental: true previously as a heavier-handed equivalent.
+ disableAPI: true, + }); + await v3.init(); + + const page = v3.context.pages()[0]; + const startUrl = row.web || "https://www.google.com"; + await page.goto(startUrl, { timeoutMs: 120_000 }); + console.log(` ✓ navigated to ${startUrl}`); + + const agent = v3.agent({ + mode, + model, + }); + + const taskSpec: TaskSpec = { + id: row.id, + instruction: row.ques, + initUrl: startUrl, + precomputedRubric: row.precomputed_rubric, + }; + + console.log("▸ running agent + verifier pipeline"); + const startMs = Date.now(); + const result = await runWithVerifier({ + v3, + agent, + taskSpec, + dataset: "webtailbench", + agentOptions: { maxSteps: 30 }, + }); + console.log( + ` ✓ completed in ${((Date.now() - startMs) / 1000).toFixed(1)}s`, + ); + + // Diagnostic: show what the agent did internally vs what reached the bus. + console.log(` agent.actions: ${result.agentResult.actions.length}`); + console.log(` agent.completed: ${result.agentResult.completed}`); + console.log( + ` agent.usage: ${JSON.stringify(result.agentResult.usage ?? {})}`, + ); + if (result.agentResult.actions.length > 0) { + console.log(" first 5 internal actions:"); + for (const a of result.agentResult.actions.slice(0, 5)) { + console.log(` - ${a.type ?? "?"} ${(a.action ?? "").slice(0, 80)}`); + } + } + + await v3.close(); + + // ── Assertions ────────────────────────────────────────────────────────── + const { trajectory, verdict, rubric, trajectoryDir } = result; + console.log(`\n▸ trajectory: ${trajectory.steps.length} steps`); + console.log(` directory: ${trajectoryDir}`); + console.log(`\n▸ verdict:`); + console.log( + ` outcomeSuccess=${verdict.outcomeSuccess} processScore=${verdict.processScore.toFixed(3)}`, + ); + console.log( + ` per-criterion (${verdict.perCriterion.length}/${rubric.items.length}):`, + ); + for (const c of verdict.perCriterion) { + const earned = c.earnedPoints === null ? "—" : c.earnedPoints.toFixed(1); + const flag = c.evidenceInsufficient ? " [evidence_insufficient]" : ""; + console.log(` - ${earned}/${c.maxPoints} ${c.criterion}${flag}`); + if (c.justification) { + console.log(` ${c.justification.slice(0, 200)}`); + } + } + const raw = verdict.rawSteps as + | { primaryIntent?: string; reasoning?: string; rubricSource?: string } + | undefined; + console.log(`\n▸ rubric source: ${raw?.rubricSource}`); + console.log(`▸ primary intent: ${raw?.primaryIntent}`); + + if (verdict.findings && verdict.findings.length > 0) { + console.log(`\n▸ findings (${verdict.findings.length}):`); + for (const f of verdict.findings) { + const steps = f.relatedSteps?.length + ? 
` steps=[${f.relatedSteps.join(",")}]` + : ""; + console.log(` [${f.severity}] ${f.category}${steps}`); + console.log(` ${f.description}`); + if (f.suggestedAction) { + console.log(` → ${f.suggestedAction}`); + } + } + } else { + console.log(`\n▸ findings: (none)`); + } + + assert.equal( + raw?.rubricSource, + "precomputed", + "expected verifier to use the upstream precomputed rubric", + ); + assert.equal(verdict.perCriterion.length, rubric.items.length); + const fullySufficient = verdict.perCriterion.every( + (c) => !c.evidenceInsufficient, + ); + assert.ok( + fullySufficient, + "expected Step 6 to score every criterion (no evidence_insufficient flags)", + ); + assert.equal(typeof verdict.outcomeSuccess, "boolean"); + + console.log(`\n✅ Wave 1 WebTailBench verification OK`); +} + +main().catch((err) => { + console.error("\n❌ Wave 1 WebTailBench verification FAILED:", err); + process.exit(1); +}); diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts index 4bd2015fa..eb20029e4 100644 --- a/packages/evals/suites/webtailbench.ts +++ b/packages/evals/suites/webtailbench.ts @@ -1,5 +1,5 @@ import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js"; -import type { AvailableModel } from "@browserbasehq/stagehand"; +import type { AvailableModel, Rubric } from "@browserbasehq/stagehand"; import { tasksConfig } from "../taskConfig.js"; import { getPackageRootDir } from "../runtimePaths.js"; import { @@ -32,6 +32,13 @@ export const buildWebTailBenchTestcases = ( ques: string; category?: string; web?: string; + /** + * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv + * via packages/evals/scripts/backfill-webtailbench-rubrics.ts. + * When present, the verifier skips Step 0a generation and uses these + * upstream criteria directly. + */ + precomputed_rubric?: Rubric; [key: string]: unknown; }; @@ -42,7 +49,24 @@ export const buildWebTailBenchTestcases = ( } const candidates = parseJsonlRows(lines, isWebTailBenchRow); - const rows = applySampling(candidates, sampleCount, maxCases); + + // EVAL_WEBTAILBENCH_IDS — comma-separated task IDs. When set, restricts the + // suite to exactly those IDs (in the order given) and ignores sampling / + // limit knobs. Used by verifier-A/B experiments to pin a deterministic slice. + const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS + ? 
process.env.EVAL_WEBTAILBENCH_IDS.split(",")
+        .map((s) => s.trim())
+        .filter(Boolean)
+    : null;
+  let rows: WebTailBenchRow[];
+  if (explicitIds && explicitIds.length > 0) {
+    const byId = new Map(candidates.map((r) => [r.id, r]));
+    rows = explicitIds
+      .map((id) => byId.get(id))
+      .filter((r): r is WebTailBenchRow => Boolean(r));
+  } else {
+    rows = applySampling(candidates, sampleCount, maxCases);
+  }
 
   const allTestcases: Testcase[] = [];
   for (const modelEntry of normalizeAgentModelEntries(models)) {
@@ -57,6 +81,7 @@
           category: row.category,
           ques: row.ques,
           web: row.web,
+          precomputed_rubric: row.precomputed_rubric,
         },
       };
       const taskCategories =
diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts
index 7f60e5775..a5b3433cf 100644
--- a/packages/evals/tasks/bench/agent/webtailbench.ts
+++ b/packages/evals/tasks/bench/agent/webtailbench.ts
@@ -1,19 +1,37 @@
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+
 import { defineBenchTask } from "../../../framework/defineTask.js";
-import { V3Evaluator } from "@browserbasehq/stagehand";
-import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
-import { imageResize } from "../../../utils/imageResize.js";
+import {
+  runWithVerifier,
+  verdictToSuccess,
+} from "../../../framework/verifierAdapter.js";
 
+/**
+ * WebTailBench bench task.
+ *
+ * Wave 1 MVP: runs the agent through the new TrajectoryRecorder +
+ * V3Evaluator.verify() pipeline (process + outcome scoring grounded in the
+ * paper's MMRubricAgent). The previous polling-based ScreenshotCollector +
+ * V3Evaluator.ask() flow is gone.
+ *
+ * The local WebTailBench JSONL doesn't carry precomputed_rubric (the
+ * upstream HF dataset does — Wave 2 dataset swap pending). Until then the
+ * verifier generates a rubric via Step 0a on first encounter per task id
+ * and caches under packages/evals/.rubric-cache/webtailbench/.
+ *
+ * --success knob: defaults to "outcome" (matches fara-7b's reported metric).
+ * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both.
+ */
 export default defineBenchTask(
   { name: "agent/webtailbench" },
   async ({ v3, logger, debugUrl, sessionUrl, modelName, input }) => {
-    let screenshotCollector: ScreenshotCollector | null = null;
-
     try {
       const params = ((input && input.params) || {}) as {
         id?: string;
         category?: string;
         ques?: string;
        web?: string;
+        precomputed_rubric?: Rubric;
      };
 
      if (!params.ques) {
@@ -27,11 +45,8 @@
      }
 
      const page = v3.context.pages()[0];
-      // web field is always empty in WebTailBench; start from Google
      const startUrl = params.web || "https://www.google.com";
-      await page.goto(startUrl, {
-        timeoutMs: 120_000,
-      });
+      await page.goto(startUrl, { timeoutMs: 120_000 });
 
      const systemPrompt = `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. You will need to navigate to the appropriate website to complete the task.`;
      const agentMode = input.agentMode ?? (input.isCUA ? "cua" : "hybrid");
@@ -41,70 +56,62 @@
        systemPrompt,
      });
 
-      screenshotCollector = new ScreenshotCollector(v3, {
-        interval: 3000,
-        maxScreenshots: 8,
-      });
-      screenshotCollector.start();
-
-      const agentResult = await agent.execute({
+      const taskSpec: TaskSpec = {
+        id: params.id ??
`webtailbench/${input.name}`, instruction: params.ques, - maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50, - }); + initUrl: startUrl, + precomputedRubric: params.precomputed_rubric, + }; - // Stop collecting and get all screenshots - let screenshots = await screenshotCollector.stop(); + const { verdict, trajectory, trajectoryDir, rubric } = + await runWithVerifier({ + v3, + agent, + taskSpec, + dataset: "webtailbench", + agentOptions: { + maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50, + }, + }); - // Resize screenshots if we have any - if (screenshots.length > 0) { - screenshots = await Promise.all( - screenshots.map(async (screenshot) => { - return await imageResize(screenshot, 0.7); - }), - ); - } + const successMode = + (process.env.EVAL_SUCCESS_MODE as "outcome" | "process" | "both") || + "outcome"; logger.log({ category: "evaluation", - message: `Collected ${screenshots.length} screenshots for evaluation`, + message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`, level: 1, }); - const evaluator = new V3Evaluator(v3); - const evalResult = await evaluator.ask({ - question: `Did the agent successfully complete this task: "${params.ques}"? Note that the agent does not have purchasing/booking capabilities; mark as pass if the agent has successfully performed all necessary steps for the task up to the point of purchasing/booking/entering payment/user information`, - screenshot: screenshots, - agentReasoning: - agentResult.message || - "no reasoning available, agent potentially hit step limit", - }); - - // Clear screenshot buffers to free memory - screenshots.length = 0; - return { - _success: evalResult.evaluation === "YES", - reasoning: evalResult.reasoning, + _success: verdictToSuccess(verdict, successMode), + outcomeSuccess: verdict.outcomeSuccess, + processScore: verdict.processScore, + evidenceInsufficient: verdict.evidenceInsufficient, + criterionCount: rubric.items.length, + stepCount: trajectory.steps.length, + trajectoryDir, + primaryIntent: + (verdict.rawSteps as { primaryIntent?: string } | undefined) + ?.primaryIntent ?? undefined, + reasoning: + (verdict.rawSteps as { reasoning?: string } | undefined)?.reasoning ?? 
+        undefined,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
   } catch (error) {
+    const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
     return {
       _success: false,
       error,
+      trajectoryDir,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
-  } finally {
-    if (screenshotCollector) {
-      try {
-        await screenshotCollector.stop();
-      } catch {
-        // Ignore errors during cleanup
-      }
-    }
   }
 },
);

From 986624ecec76b70dfe12f39c12c3d7bdf69ca5ae Mon Sep 17 00:00:00 2001
From: miguel
Date: Fri, 15 May 2026 14:18:34 -0700
Subject: [PATCH 2/5] fix(evals): normalize verifier rubric inputs

---
 packages/evals/framework/verifierAdapter.ts        |  3 ++-
 .../scripts/backfill-webtailbench-rubrics.ts       |  8 ++++----
 .../evals/scripts/verify-webtailbench-task.ts      | 12 +++++-------
 packages/evals/suites/webtailbench.ts              | 10 +++++++---
 .../evals/tasks/bench/agent/webtailbench.ts        | 19 ++++++++++---------
 5 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
index 5351b28b3..971205f6b 100644
--- a/packages/evals/framework/verifierAdapter.ts
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -25,6 +25,7 @@
  */
 import {
   V3Evaluator,
+  normalizeRubric,
   type AgentInstance,
   type AgentExecuteOptions,
   type AgentResult,
@@ -75,7 +76,7 @@
   // ── Resolve rubric ──────────────────────────────────────────────────────
   let resolvedRubric: Rubric;
   if (taskSpec.precomputedRubric) {
-    resolvedRubric = taskSpec.precomputedRubric;
+    resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
   } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
     resolvedRubric = await evaluator.generateRubric(taskSpec);
   } else {
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index 5a6763390..965722fa4 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@
   "WebTailBench_data.jsonl",
 );
 
-interface Rubric {
+interface SerializedRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: Rubric;
+  precomputed_rubric?: SerializedRubric;
 }
 
 /**
@@ -114,12 +114,12 @@
     );
   }
 
-  const rubricsById = new Map<string, Rubric>();
+  const rubricsById = new Map<string, SerializedRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as SerializedRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts
index 4e1848e84..84666a74b 100644
--- a/packages/evals/scripts/verify-webtailbench-task.ts
+++ b/packages/evals/scripts/verify-webtailbench-task.ts
@@ -19,8 +19,8 @@
 import assert from "node:assert/strict";
 import fs from "node:fs/promises";
 import path from "node:path";
 
-import { V3 } from "@browserbasehq/stagehand";
-import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+import { V3, normalizeRubric } from "@browserbasehq/stagehand";
+import type { SerializedRubric, TaskSpec } from "@browserbasehq/stagehand";
 import { runWithVerifier } from "../framework/verifierAdapter.js";
 
 interface WebTailBenchRow {
   id: string;
   category?:
string; ques: string; web?: string; - precomputed_rubric?: Rubric; + precomputed_rubric?: SerializedRubric; } const DEFAULT_TASK_ID = "united_13"; @@ -104,7 +104,7 @@ async function main(): Promise { id: row.id, instruction: row.ques, initUrl: startUrl, - precomputedRubric: row.precomputed_rubric, + precomputedRubric: normalizeRubric(row.precomputed_rubric), }; console.log("▸ running agent + verifier pipeline"); @@ -154,9 +154,7 @@ async function main(): Promise { console.log(` ${c.justification.slice(0, 200)}`); } } - const raw = verdict.rawSteps as - | { primaryIntent?: string; reasoning?: string; rubricSource?: string } - | undefined; + const raw = verdict.rawSteps; console.log(`\n▸ rubric source: ${raw?.rubricSource}`); console.log(`▸ primary intent: ${raw?.primaryIntent}`); diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts index eb20029e4..bf8c5c919 100644 --- a/packages/evals/suites/webtailbench.ts +++ b/packages/evals/suites/webtailbench.ts @@ -1,5 +1,9 @@ import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js"; -import type { AvailableModel, Rubric } from "@browserbasehq/stagehand"; +import { + normalizeRubric, + type AvailableModel, + type SerializedRubric, +} from "@browserbasehq/stagehand"; import { tasksConfig } from "../taskConfig.js"; import { getPackageRootDir } from "../runtimePaths.js"; import { @@ -38,7 +42,7 @@ export const buildWebTailBenchTestcases = ( * When present, the verifier skips Step 0a generation and uses these * upstream criteria directly. */ - precomputed_rubric?: Rubric; + precomputed_rubric?: SerializedRubric; [key: string]: unknown; }; @@ -81,7 +85,7 @@ export const buildWebTailBenchTestcases = ( category: row.category, ques: row.ques, web: row.web, - precomputed_rubric: row.precomputed_rubric, + precomputed_rubric: normalizeRubric(row.precomputed_rubric), }, }; const taskCategories = diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts index a5b3433cf..33224713f 100644 --- a/packages/evals/tasks/bench/agent/webtailbench.ts +++ b/packages/evals/tasks/bench/agent/webtailbench.ts @@ -1,4 +1,9 @@ -import type { Rubric, TaskSpec } from "@browserbasehq/stagehand"; +import { + normalizeRubric, + type Rubric, + type SerializedRubric, + type TaskSpec, +} from "@browserbasehq/stagehand"; import { defineBenchTask } from "../../../framework/defineTask.js"; import { @@ -31,7 +36,7 @@ export default defineBenchTask( category?: string; ques?: string; web?: string; - precomputed_rubric?: Rubric; + precomputed_rubric?: Rubric | SerializedRubric; }; if (!params.ques) { @@ -60,7 +65,7 @@ export default defineBenchTask( id: params.id ?? `webtailbench/${input.name}`, instruction: params.ques, initUrl: startUrl, - precomputedRubric: params.precomputed_rubric, + precomputedRubric: normalizeRubric(params.precomputed_rubric), }; const { verdict, trajectory, trajectoryDir, rubric } = @@ -92,12 +97,8 @@ export default defineBenchTask( criterionCount: rubric.items.length, stepCount: trajectory.steps.length, trajectoryDir, - primaryIntent: - (verdict.rawSteps as { primaryIntent?: string } | undefined) - ?.primaryIntent ?? undefined, - reasoning: - (verdict.rawSteps as { reasoning?: string } | undefined)?.reasoning ?? 
- undefined, + primaryIntent: verdict.rawSteps?.primaryIntent, + reasoning: verdict.rawSteps?.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), From 92afba0bdc110a2a3ec5b8a6a87732e429d0fd7b Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:40:59 -0700 Subject: [PATCH 3/5] fix(evals): validate verifier success mode --- packages/evals/framework/verifierAdapter.ts | 25 ++++++++++++--- .../evals/tasks/bench/agent/webtailbench.ts | 10 +++--- .../tests/framework/verifierAdapter.test.ts | 32 +++++++++++++++++++ 3 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 packages/evals/tests/framework/verifierAdapter.test.ts diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts index 971205f6b..5ecfc7da9 100644 --- a/packages/evals/framework/verifierAdapter.ts +++ b/packages/evals/framework/verifierAdapter.ts @@ -136,21 +136,36 @@ export async function runWithVerifier( /** * Decide bench task success from a Verdict using the --success flag's - * semantics (mirrors fara's CLI knob, plan §03). + * semantics. * - * `outcome` (default) — strict binary outcome. Matches fara-7b's reported - * metric. + * `outcome` (default) — strict binary outcome. * `process` — rubric process score ≥ threshold (default 0.8). * `both` — both conditions must hold. */ +export type EvalSuccessMode = "outcome" | "process" | "both"; + +export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode { + if (typeof mode !== "string") return "outcome"; + const normalized = mode.trim().toLowerCase(); + if ( + normalized === "outcome" || + normalized === "process" || + normalized === "both" + ) { + return normalized; + } + return "outcome"; +} + export function verdictToSuccess( verdict: Verdict, - mode: "outcome" | "process" | "both" = "outcome", + mode: unknown = "outcome", processThreshold = 0.8, ): boolean { + const resolvedMode = resolveEvalSuccessMode(mode); const outcomeOk = verdict.outcomeSuccess; const processOk = verdict.processScore >= processThreshold; - switch (mode) { + switch (resolvedMode) { case "outcome": return outcomeOk; case "process": diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts index 33224713f..9081171be 100644 --- a/packages/evals/tasks/bench/agent/webtailbench.ts +++ b/packages/evals/tasks/bench/agent/webtailbench.ts @@ -15,8 +15,8 @@ import { * WebTailBench bench task. * * Wave 1 MVP: runs the agent through the new TrajectoryRecorder + - * V3Evaluator.verify() pipeline (process + outcome scoring grounded in the - * paper's MMRubricAgent). The previous polling-based ScreenshotCollector + + * V3Evaluator.verify() pipeline (process + outcome scoring grounded in saved + * trajectory evidence). The previous polling-based ScreenshotCollector + * V3Evaluator.ask() flow is gone. * * The local WebTailBench JSONL doesn't carry precomputed_rubric (the @@ -24,7 +24,7 @@ import { * verifier generates a rubric via Step 0a on first encounter per task id * and caches under packages/evals/.rubric-cache/webtailbench/. * - * --success knob: defaults to "outcome" (matches fara-7b's reported metric). + * --success knob: defaults to "outcome". * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both. 
*/ export default defineBenchTask( @@ -79,9 +79,7 @@ export default defineBenchTask( }, }); - const successMode = - (process.env.EVAL_SUCCESS_MODE as "outcome" | "process" | "both") || - "outcome"; + const successMode = process.env.EVAL_SUCCESS_MODE; logger.log({ category: "evaluation", diff --git a/packages/evals/tests/framework/verifierAdapter.test.ts b/packages/evals/tests/framework/verifierAdapter.test.ts new file mode 100644 index 000000000..6f446d1ff --- /dev/null +++ b/packages/evals/tests/framework/verifierAdapter.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from "vitest"; +import type { Verdict } from "@browserbasehq/stagehand"; + +import { + resolveEvalSuccessMode, + verdictToSuccess, +} from "../../framework/verifierAdapter.js"; + +const baseVerdict: Verdict = { + outcomeSuccess: true, + processScore: 0.5, + perCriterion: [], + taskValidity: { isAmbiguous: false, isInvalid: false }, + evidenceInsufficient: [], +}; + +describe("resolveEvalSuccessMode", () => { + it("defaults invalid env/config values to outcome", () => { + expect(resolveEvalSuccessMode(undefined)).toBe("outcome"); + expect(resolveEvalSuccessMode("bad-value")).toBe("outcome"); + expect(resolveEvalSuccessMode(" PROCESS ")).toBe("process"); + }); +}); + +describe("verdictToSuccess", () => { + it("uses validated success modes", () => { + expect(verdictToSuccess(baseVerdict, "outcome")).toBe(true); + expect(verdictToSuccess(baseVerdict, "process")).toBe(false); + expect(verdictToSuccess(baseVerdict, "both")).toBe(false); + expect(verdictToSuccess(baseVerdict, "invalid")).toBe(true); + }); +}); From 7652e3226573a7ae533925a634e1009e8db37322 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:20:50 -0700 Subject: [PATCH 4/5] docs(evals): remove rollout comments from verifier adapter --- packages/evals/framework/verifierAdapter.ts | 2 +- packages/evals/scripts/verify-webtailbench-task.ts | 8 ++++---- packages/evals/tasks/bench/agent/webtailbench.ts | 13 +++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts index 5ecfc7da9..2e6c8a8a2 100644 --- a/packages/evals/framework/verifierAdapter.ts +++ b/packages/evals/framework/verifierAdapter.ts @@ -18,7 +18,7 @@ * 3. Runs V3Evaluator.verify() on the recorded Trajectory. * 4. Returns { trajectory, verdict, agentResult }. * - * Persistence and rubric caching are gated by env vars (plan §10 Q2 + Q3): + * Persistence and rubric caching are gated by env vars: * VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default. * VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces * a fresh Step 0a call every time). diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts index 84666a74b..c670edf4c 100644 --- a/packages/evals/scripts/verify-webtailbench-task.ts +++ b/packages/evals/scripts/verify-webtailbench-task.ts @@ -1,8 +1,8 @@ /** - * End-to-end Wave 1 verification on a real WebTailBench task. + * End-to-end verification on a real WebTailBench task. * * Loads one row from packages/evals/datasets/webtailbench/WebTailBench_data.jsonl - * (which carries upstream precomputed_rubric), runs the agent on Browserbase + * (which carries `precomputed_rubric`), runs the agent on Browserbase * via runWithVerifier, and asserts: * 1. Recorder captures a non-trivial trajectory. * 2. Verifier uses the upstream rubric (rubricSource = "precomputed"). 
@@ -189,10 +189,10 @@
   );
   assert.equal(typeof verdict.outcomeSuccess, "boolean");
 
-  console.log(`\n✅ Wave 1 WebTailBench verification OK`);
+  console.log(`\n✅ WebTailBench verification OK`);
 }
 
 main().catch((err) => {
-  console.error("\n❌ Wave 1 WebTailBench verification FAILED:", err);
+  console.error("\n❌ WebTailBench verification FAILED:", err);
   process.exit(1);
 });
diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts
index 9081171be..052bc1b26 100644
--- a/packages/evals/tasks/bench/agent/webtailbench.ts
+++ b/packages/evals/tasks/bench/agent/webtailbench.ts
@@ -14,15 +14,12 @@
 /**
  * WebTailBench bench task.
  *
- * Wave 1 MVP: runs the agent through the new TrajectoryRecorder +
- * V3Evaluator.verify() pipeline (process + outcome scoring grounded in saved
- * trajectory evidence). The previous polling-based ScreenshotCollector +
- * V3Evaluator.ask() flow is gone.
+ * Runs the agent through TrajectoryRecorder + V3Evaluator.verify() so process
+ * and outcome scoring are grounded in saved trajectory evidence.
  *
- * The local WebTailBench JSONL doesn't carry precomputed_rubric (the
- * upstream HF dataset does — Wave 2 dataset swap pending). Until then the
- * verifier generates a rubric via Step 0a on first encounter per task id
- * and caches under packages/evals/.rubric-cache/webtailbench/.
+ * If a row does not carry `precomputed_rubric`, the verifier generates a
+ * rubric on first encounter per task id and caches it under
+ * packages/evals/.rubric-cache/webtailbench/.
  *
  * --success knob: defaults to "outcome".
  * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both.

From 47dc1d582dad2c17d64a26aa076824b351964107 Mon Sep 17 00:00:00 2001
From: miguel
Date: Fri, 15 May 2026 22:42:34 -0700
Subject: [PATCH 5/5] fix(evals): align verifier adapter result API

---
 packages/evals/framework/verifierAdapter.ts   |  31 +--
 .../scripts/backfill-webtailbench-rubrics.ts  |  10 +-
 .../evals/scripts/verify-webtailbench-task.ts | 198 ------------------
 packages/evals/suites/webtailbench.ts         |  16 +-
 .../evals/tasks/bench/agent/webtailbench.ts   |  31 ++-
 .../tests/framework/verifierAdapter.test.ts   |  23 +-
 6 files changed, 56 insertions(+), 253 deletions(-)
 delete mode 100644 packages/evals/scripts/verify-webtailbench-task.ts

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
index 2e6c8a8a2..8dc40bd2b 100644
--- a/packages/evals/framework/verifierAdapter.ts
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -4,7 +4,7 @@
  * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
  * with one call:
  *
- *   const { verdict, trajectory } = await runWithVerifier({
+ *   const { evaluationResult, trajectory } = await runWithVerifier({
  *     v3,
 *     agent,
 *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
@@ -12,16 +12,15 @@
 *   });
 *
 * Behavior:
- *   1. Resolves the rubric — precomputedRubric (e.g., upstream WebTailBench),
- *      or generates via Step 0a and caches under .rubric-cache/<dataset>/.
+ *   1. Resolves the rubric from the task, cache, or evaluator.
 *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
 *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
- *   4. Returns { trajectory, verdict, agentResult }.
+ *   4. Returns { trajectory, evaluationResult, agentResult }.
 *
 * Persistence and rubric caching are gated by env vars:
 *   VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
 *   VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
- *     a fresh Step 0a call every time).
+ *     fresh rubric generation every time).
  */
 import {
   V3Evaluator,
   normalizeRubric,
   type AgentInstance,
   type AgentExecuteOptions,
   type AgentResult,
+  type EvaluationResult,
   type Rubric,
   type TaskSpec,
   type Trajectory,
   type V3,
-  type Verdict,
 } from "@browserbasehq/stagehand";
 
 import { RubricCache } from "./rubricCache.js";
@@ -58,7 +57,7 @@
 
 export interface RunWithVerifierResult {
   trajectory: Trajectory;
-  verdict: Verdict;
+  evaluationResult: EvaluationResult;
   agentResult: AgentResult;
   /** Resolved rubric (precomputed, cached, or freshly generated). */
   rubric: Rubric;
@@ -122,12 +121,12 @@
   });
 
   // ── Verify ──────────────────────────────────────────────────────────────
-  const verdict = await evaluator.verify(trajectory, hydratedTaskSpec);
-  await recorder.persistVerdict(verdict);
+  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistResult(evaluationResult);
 
   return {
     trajectory,
-    verdict,
+    evaluationResult,
     agentResult,
     rubric: resolvedRubric,
     trajectoryDir: recorder.directory,
@@ -135,7 +134,7 @@
 }
 
 /**
- * Decide bench task success from a Verdict using the --success flag's
+ * Decide bench task success from an EvaluationResult using the --success flag's
  * semantics.
  *
  * `outcome` (default) — strict binary outcome.
@@ -157,14 +156,16 @@
   return "outcome";
 }
 
-export function verdictToSuccess(
-  verdict: Verdict,
+export function evaluationResultToSuccess(
+  result: EvaluationResult,
   mode: unknown = "outcome",
   processThreshold = 0.8,
 ): boolean {
   const resolvedMode = resolveEvalSuccessMode(mode);
-  const outcomeOk = verdict.outcomeSuccess;
-  const processOk = verdict.processScore >= processThreshold;
+  const outcomeOk = result.outcomeSuccess;
+  const processOk =
+    typeof result.processScore === "number" &&
+    result.processScore >= processThreshold;
   switch (resolvedMode) {
     case "outcome":
       return outcomeOk;
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index 965722fa4..22fb87de1 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@
   "WebTailBench_data.jsonl",
 );
 
-interface SerializedRubric {
+interface RawRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: SerializedRubric;
+  precomputed_rubric?: RawRubric;
 }
 
 /**
@@ -114,12 +114,12 @@
     );
   }
 
-  const rubricsById = new Map<string, SerializedRubric>();
+  const rubricsById = new Map<string, RawRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as SerializedRubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
@@ -149,7 +149,7 @@
   }
 
   console.log(
-    ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+    ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
   );
 
   await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts
deleted file mode 100644
index c670edf4c..000000000
--- a/packages/evals/scripts/verify-webtailbench-task.ts
+++ /dev/null
@@ -1,198 +0,0 @@
-/**
- * End-to-end verification on a real WebTailBench task.
- *
- * Loads one row from packages/evals/datasets/webtailbench/WebTailBench_data.jsonl
- * (which carries `precomputed_rubric`), runs the agent on Browserbase
- * via runWithVerifier, and asserts:
- *   1. Recorder captures a non-trivial trajectory.
- *   2. Verifier uses the upstream rubric (rubricSource = "precomputed").
- *   3. Step 6 rescoring produces per-criterion scores (no evidence_insufficient).
- *   4. Step 8 outcome returns a boolean verdict with reasoning.
- *
- *   pnpm tsx packages/evals/scripts/verify-webtailbench-task.ts [task_id]
- *
- * Defaults to united_13. Requires BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID
- * and a GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env.
- */
-import "dotenv/config";
-import assert from "node:assert/strict";
-import fs from "node:fs/promises";
-import path from "node:path";
-
-import { V3, normalizeRubric } from "@browserbasehq/stagehand";
-import type { SerializedRubric, TaskSpec } from "@browserbasehq/stagehand";
-import { runWithVerifier } from "../framework/verifierAdapter.js";
-
-interface WebTailBenchRow {
-  id: string;
-  category?: string;
-  ques: string;
-  web?: string;
-  precomputed_rubric?: SerializedRubric;
-}
-
-const DEFAULT_TASK_ID = "united_13";
-const JSONL = path.resolve(
-  import.meta.dirname,
-  "..",
-  "datasets",
-  "webtailbench",
-  "WebTailBench_data.jsonl",
-);
-
-async function loadRow(taskId: string): Promise<WebTailBenchRow> {
-  const raw = await fs.readFile(JSONL, "utf8");
-  for (const line of raw.split(/\r?\n/)) {
-    if (!line.trim()) continue;
-    const row = JSON.parse(line) as WebTailBenchRow;
-    if (row.id === taskId) return row;
-  }
-  throw new Error(`task id ${taskId} not found in ${JSONL}`);
-}
-
-async function main(): Promise<void> {
-  const taskId = process.argv[2] ?? DEFAULT_TASK_ID;
-  const mode = (process.env.AGENT_MODE ?? "hybrid") as "dom" | "hybrid" | "cua";
-  const model =
-    process.env.AGENT_MODEL ??
-    (mode === "cua" ? "anthropic/claude-haiku-4-5" : "google/gemini-2.5-flash");
-  console.log(`▸ loading WebTailBench task: ${taskId}`);
-  console.log(`  mode=${mode} model=${model}`);
-  const row = await loadRow(taskId);
-  console.log(`  ✓ ${row.ques.slice(0, 100)}`);
-  console.log(
-    `  ✓ rubric: ${row.precomputed_rubric ? `${row.precomputed_rubric.items.length} criteria` : "MISSING"}`,
-  );
-  assert.ok(
-    row.precomputed_rubric && row.precomputed_rubric.items.length > 0,
-    "task should carry a precomputed rubric (run backfill-webtailbench-rubrics.ts first)",
-  );
-
-  // Most WebTailBench sites block local browser traffic; ideally this runs on
-  // BROWSERBASE. Defaults to LOCAL when Browserbase creds aren't configured —
-  // the verifier still exercises end-to-end on whatever trajectory we capture,
-  // even if the agent fails fast against anti-bot.
-  const useBrowserbase =
-    process.env.BROWSERBASE_API_KEY && process.env.BROWSERBASE_PROJECT_ID;
-  const env = useBrowserbase ?
"BROWSERBASE" : "LOCAL"; - console.log(`▸ initializing V3 on ${env}`); - const v3 = new V3({ - env, - verbose: 1, - model, - // Keep the agent loop local even on env=BROWSERBASE — without this V3 - // would auto-create an apiClient and dispatch agent.execute() to the - // remote server-side loop, which doesn't emit on our local bus. The - // evals framework does this same opt-out in packages/evals/initV3.ts:121 - // via process.env.USE_API. disableAPI is the targeted flag; we used - // experimental: true previously as a heavier-handed equivalent. - disableAPI: true, - }); - await v3.init(); - - const page = v3.context.pages()[0]; - const startUrl = row.web || "https://www.google.com"; - await page.goto(startUrl, { timeoutMs: 120_000 }); - console.log(` ✓ navigated to ${startUrl}`); - - const agent = v3.agent({ - mode, - model, - }); - - const taskSpec: TaskSpec = { - id: row.id, - instruction: row.ques, - initUrl: startUrl, - precomputedRubric: normalizeRubric(row.precomputed_rubric), - }; - - console.log("▸ running agent + verifier pipeline"); - const startMs = Date.now(); - const result = await runWithVerifier({ - v3, - agent, - taskSpec, - dataset: "webtailbench", - agentOptions: { maxSteps: 30 }, - }); - console.log( - ` ✓ completed in ${((Date.now() - startMs) / 1000).toFixed(1)}s`, - ); - - // Diagnostic: show what the agent did internally vs what reached the bus. - console.log(` agent.actions: ${result.agentResult.actions.length}`); - console.log(` agent.completed: ${result.agentResult.completed}`); - console.log( - ` agent.usage: ${JSON.stringify(result.agentResult.usage ?? {})}`, - ); - if (result.agentResult.actions.length > 0) { - console.log(" first 5 internal actions:"); - for (const a of result.agentResult.actions.slice(0, 5)) { - console.log(` - ${a.type ?? "?"} ${(a.action ?? "").slice(0, 80)}`); - } - } - - await v3.close(); - - // ── Assertions ────────────────────────────────────────────────────────── - const { trajectory, verdict, rubric, trajectoryDir } = result; - console.log(`\n▸ trajectory: ${trajectory.steps.length} steps`); - console.log(` directory: ${trajectoryDir}`); - console.log(`\n▸ verdict:`); - console.log( - ` outcomeSuccess=${verdict.outcomeSuccess} processScore=${verdict.processScore.toFixed(3)}`, - ); - console.log( - ` per-criterion (${verdict.perCriterion.length}/${rubric.items.length}):`, - ); - for (const c of verdict.perCriterion) { - const earned = c.earnedPoints === null ? "—" : c.earnedPoints.toFixed(1); - const flag = c.evidenceInsufficient ? " [evidence_insufficient]" : ""; - console.log(` - ${earned}/${c.maxPoints} ${c.criterion}${flag}`); - if (c.justification) { - console.log(` ${c.justification.slice(0, 200)}`); - } - } - const raw = verdict.rawSteps; - console.log(`\n▸ rubric source: ${raw?.rubricSource}`); - console.log(`▸ primary intent: ${raw?.primaryIntent}`); - - if (verdict.findings && verdict.findings.length > 0) { - console.log(`\n▸ findings (${verdict.findings.length}):`); - for (const f of verdict.findings) { - const steps = f.relatedSteps?.length - ? 
` steps=[${f.relatedSteps.join(",")}]` - : ""; - console.log(` [${f.severity}] ${f.category}${steps}`); - console.log(` ${f.description}`); - if (f.suggestedAction) { - console.log(` → ${f.suggestedAction}`); - } - } - } else { - console.log(`\n▸ findings: (none)`); - } - - assert.equal( - raw?.rubricSource, - "precomputed", - "expected verifier to use the upstream precomputed rubric", - ); - assert.equal(verdict.perCriterion.length, rubric.items.length); - const fullySufficient = verdict.perCriterion.every( - (c) => !c.evidenceInsufficient, - ); - assert.ok( - fullySufficient, - "expected Step 6 to score every criterion (no evidence_insufficient flags)", - ); - assert.equal(typeof verdict.outcomeSuccess, "boolean"); - - console.log(`\n✅ WebTailBench verification OK`); -} - -main().catch((err) => { - console.error("\n❌ WebTailBench verification FAILED:", err); - process.exit(1); -}); diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts index bf8c5c919..d704449c2 100644 --- a/packages/evals/suites/webtailbench.ts +++ b/packages/evals/suites/webtailbench.ts @@ -1,9 +1,5 @@ import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js"; -import { - normalizeRubric, - type AvailableModel, - type SerializedRubric, -} from "@browserbasehq/stagehand"; +import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand"; import { tasksConfig } from "../taskConfig.js"; import { getPackageRootDir } from "../runtimePaths.js"; import { @@ -39,10 +35,9 @@ export const buildWebTailBenchTestcases = ( /** * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv * via packages/evals/scripts/backfill-webtailbench-rubrics.ts. - * When present, the verifier skips Step 0a generation and uses these - * upstream criteria directly. + * When present, the verifier uses these upstream criteria directly. */ - precomputed_rubric?: SerializedRubric; + precomputed_rubric?: unknown; [key: string]: unknown; }; @@ -54,9 +49,8 @@ export const buildWebTailBenchTestcases = ( const candidates = parseJsonlRows(lines, isWebTailBenchRow); - // EVAL_WEBTAILBENCH_IDS — comma-separated task IDs. When set, restricts the - // suite to exactly those IDs (in the order given) and ignores sampling / - // limit knobs. Used by verifier-A/B experiments to pin a deterministic slice. + // EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs, + // preserving the order given and ignoring sampling / limit knobs. const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS ? 
process.env.EVAL_WEBTAILBENCH_IDS.split(",") .map((s) => s.trim()) diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts index 052bc1b26..e3791348b 100644 --- a/packages/evals/tasks/bench/agent/webtailbench.ts +++ b/packages/evals/tasks/bench/agent/webtailbench.ts @@ -1,14 +1,9 @@ -import { - normalizeRubric, - type Rubric, - type SerializedRubric, - type TaskSpec, -} from "@browserbasehq/stagehand"; +import { normalizeRubric, type TaskSpec } from "@browserbasehq/stagehand"; import { defineBenchTask } from "../../../framework/defineTask.js"; import { + evaluationResultToSuccess, runWithVerifier, - verdictToSuccess, } from "../../../framework/verifierAdapter.js"; /** @@ -33,7 +28,7 @@ export default defineBenchTask( category?: string; ques?: string; web?: string; - precomputed_rubric?: Rubric | SerializedRubric; + precomputed_rubric?: unknown; }; if (!params.ques) { @@ -65,7 +60,7 @@ export default defineBenchTask( precomputedRubric: normalizeRubric(params.precomputed_rubric), }; - const { verdict, trajectory, trajectoryDir, rubric } = + const { evaluationResult, trajectory, trajectoryDir, rubric } = await runWithVerifier({ v3, agent, @@ -80,20 +75,20 @@ export default defineBenchTask( logger.log({ category: "evaluation", - message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`, + message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`, level: 1, }); return { - _success: verdictToSuccess(verdict, successMode), - outcomeSuccess: verdict.outcomeSuccess, - processScore: verdict.processScore, - evidenceInsufficient: verdict.evidenceInsufficient, + _success: evaluationResultToSuccess(evaluationResult, successMode), + outcomeSuccess: evaluationResult.outcomeSuccess, + processScore: evaluationResult.processScore, + evidenceInsufficient: evaluationResult.evidenceInsufficient, criterionCount: rubric.items.length, stepCount: trajectory.steps.length, trajectoryDir, - primaryIntent: verdict.rawSteps?.primaryIntent, - reasoning: verdict.rawSteps?.reasoning, + primaryIntent: evaluationResult.rawSteps?.primaryIntent, + reasoning: evaluationResult.rawSteps?.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), @@ -111,3 +106,7 @@ export default defineBenchTask( } }, ); + +function formatProcessScore(score: number | undefined): string { + return typeof score === "number" ? 
score.toFixed(2) : "n/a"; +} diff --git a/packages/evals/tests/framework/verifierAdapter.test.ts b/packages/evals/tests/framework/verifierAdapter.test.ts index 6f446d1ff..fc21cdd3f 100644 --- a/packages/evals/tests/framework/verifierAdapter.test.ts +++ b/packages/evals/tests/framework/verifierAdapter.test.ts @@ -1,12 +1,12 @@ import { describe, expect, it } from "vitest"; -import type { Verdict } from "@browserbasehq/stagehand"; +import type { EvaluationResult } from "@browserbasehq/stagehand"; import { + evaluationResultToSuccess, resolveEvalSuccessMode, - verdictToSuccess, } from "../../framework/verifierAdapter.js"; -const baseVerdict: Verdict = { +const baseResult: EvaluationResult = { outcomeSuccess: true, processScore: 0.5, perCriterion: [], @@ -22,11 +22,18 @@ describe("resolveEvalSuccessMode", () => { }); }); -describe("verdictToSuccess", () => { +describe("evaluationResultToSuccess", () => { it("uses validated success modes", () => { - expect(verdictToSuccess(baseVerdict, "outcome")).toBe(true); - expect(verdictToSuccess(baseVerdict, "process")).toBe(false); - expect(verdictToSuccess(baseVerdict, "both")).toBe(false); - expect(verdictToSuccess(baseVerdict, "invalid")).toBe(true); + expect(evaluationResultToSuccess(baseResult, "outcome")).toBe(true); + expect(evaluationResultToSuccess(baseResult, "process")).toBe(false); + expect(evaluationResultToSuccess(baseResult, "both")).toBe(false); + expect(evaluationResultToSuccess(baseResult, "invalid")).toBe(true); + }); + + it("treats missing process score as a failed process gate", () => { + const outcomeOnly: EvaluationResult = { outcomeSuccess: true }; + expect(evaluationResultToSuccess(outcomeOnly, "outcome")).toBe(true); + expect(evaluationResultToSuccess(outcomeOnly, "process")).toBe(false); + expect(evaluationResultToSuccess(outcomeOnly, "both")).toBe(false); }); });
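
---
Usage after this series — a minimal sketch, not part of the patches themselves. It assumes what the deleted verify script assumed: a configured V3 (LOCAL here, BROWSERBASE if creds are set) and a WebTailBench-style task row; the instruction text and the relative import path are placeholders.

import { V3 } from "@browserbasehq/stagehand";
import {
  evaluationResultToSuccess,
  runWithVerifier,
} from "../framework/verifierAdapter.js"; // path as seen from packages/evals/scripts/

async function demo(): Promise<void> {
  const model = "google/gemini-2.5-flash";
  // disableAPI keeps the agent loop local so the TrajectoryRecorder can
  // observe agent events on the local bus (same opt-out as the verify script).
  const v3 = new V3({ env: "LOCAL", verbose: 1, model, disableAPI: true });
  await v3.init();
  const agent = v3.agent({ mode: "hybrid", model });

  const { evaluationResult, trajectory, trajectoryDir } = await runWithVerifier({
    v3,
    agent,
    taskSpec: {
      id: "united_13",
      instruction: "…", // placeholder: the dataset row's `ques` text goes here
      initUrl: "https://www.google.com",
      // Rows that carry precomputed_rubric would pass
      // precomputedRubric: normalizeRubric(row.precomputed_rubric) here;
      // without it, the adapter generates and caches a rubric per task id.
    },
    dataset: "webtailbench",
    agentOptions: { maxSteps: 50 },
  });

  // EVAL_SUCCESS_MODE accepts outcome | process | both; anything else
  // resolves to "outcome" via resolveEvalSuccessMode().
  const ok = evaluationResultToSuccess(evaluationResult, process.env.EVAL_SUCCESS_MODE);
  console.log(`success=${ok} steps=${trajectory.steps.length} dir=${trajectoryDir}`);
  await v3.close();
}

demo().catch((err) => {
  console.error(err);
  process.exit(1);
});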