Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,218 changes: 609 additions & 609 deletions packages/evals/datasets/webtailbench/WebTailBench_data.jsonl

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions packages/evals/framework/adHocRubric.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/**
* adHocRubric — synthesize a Rubric from one or more natural-language
* criteria without invoking the LLM-based rubric generator.
*
* Used by migrated custom agent tasks whose original verification was a
* single `V3Evaluator.ask({question})` YES/NO call. Each criterion becomes
* a 1-point rubric item.
*
* For tasks that already have a concrete predicate ("Does the page show
* flights from SF to NY?"), pass the predicate verbatim. For the lazy
* "did the agent complete this task successfully? <instruction>" pattern,
* pass the instruction.
*/
import type { Rubric } from "@browserbasehq/stagehand";

/**
 * Build a Rubric directly from one or more natural-language criteria,
 * assigning each criterion its own 1-point rubric item. Skips the
 * LLM-based rubric generator entirely.
 *
 * @param criteria - Plain-language YES/NO predicates; each becomes one item.
 * @returns A Rubric whose items mirror the criteria in the order given.
 * @throws Error when called with no criteria at all.
 */
export function adHocRubric(...criteria: string[]): Rubric {
  if (!criteria.length) {
    throw new Error("adHocRubric requires at least one criterion");
  }
  // Each criterion doubles as its own description; every item is worth 1 point.
  const items = criteria.map((criterion) => ({
    criterion,
    description: criterion,
    maxPoints: 1,
  }));
  return { items };
}
18 changes: 17 additions & 1 deletion packages/evals/suites/onlineMind2Web.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,23 @@ export const buildOnlineMind2WebTestcases = (
}

const candidates = parseJsonlRows(lines, isMind2WebRow);
const rows = applySampling(candidates, sampleCount, maxCases);

// EVAL_ONLINEMIND2WEB_IDS restricts the suite to exactly those task ids,
// preserving the order given and ignoring sampling / limit knobs.
const explicitIds = process.env.EVAL_ONLINEMIND2WEB_IDS
? process.env.EVAL_ONLINEMIND2WEB_IDS.split(",")
.map((s) => s.trim())
.filter(Boolean)
: null;
let rows: Mind2WebRow[];
if (explicitIds && explicitIds.length > 0) {
const byId = new Map(candidates.map((r) => [r.task_id, r]));
rows = explicitIds
.map((id) => byId.get(id))
.filter((r): r is Mind2WebRow => Boolean(r));
} else {
rows = applySampling(candidates, sampleCount, maxCases);
}

const allTestcases: Testcase[] = [];
for (const modelEntry of normalizeAgentModelEntries(models)) {
Expand Down
76 changes: 33 additions & 43 deletions packages/evals/tasks/bench/agent/alibaba_supplier_search.ts
Original file line number Diff line number Diff line change
@@ -1,69 +1,59 @@
import type { TaskSpec } from "@browserbasehq/stagehand";

import { defineBenchTask } from "../../../framework/defineTask.js";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
import { adHocRubric } from "../../../framework/adHocRubric.js";
import {
runWithVerifier,
evaluationResultToSuccess,
} from "../../../framework/verifierAdapter.js";

export default defineBenchTask(
{ name: "agent/alibaba_supplier_search" },
async ({ debugUrl, sessionUrl, logger, agent, v3 }) => {
try {
const initUrl = "https://www.alibaba.com/";
const page = v3.context.pages()[0];
await page.goto("https://www.alibaba.com/");

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
await page.goto(initUrl);

const instruction =
"Search for 'solar panels' on Alibaba and find 3 suppliers. For each supplier, tell me their company name, minimum order quantity, and price range if available.";
const agentResult = await agent.execute({
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
});

// Stop and collect all screenshots from the journey
const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});
const taskSpec: TaskSpec = {
id: "agent/alibaba_supplier_search",
instruction,
initUrl,
precomputedRubric: adHocRubric(
`did the agent complete this task successfully? ${instruction}`,
),
};

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
const { evaluationResult, trajectoryDir } = await runWithVerifier({
v3,
agent,
taskSpec,
dataset: "agent-custom",
agentOptions: {
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
},
});

console.log(`reasoning: ${reasoning}`);

const success = evaluation === "YES";
const successMode = process.env.EVAL_SUCCESS_MODE;

if (!success) {
return {
_success: false,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
}
return {
_success: true,
_success: evaluationResultToSuccess(evaluationResult, successMode),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
return {
_success: false,
message: errorMessage,
error,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
58 changes: 35 additions & 23 deletions packages/evals/tasks/bench/agent/all_recipes.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,59 @@
import { V3Evaluator } from "@browserbasehq/stagehand";
import type { TaskSpec } from "@browserbasehq/stagehand";

import { defineBenchTask } from "../../../framework/defineTask.js";
import { adHocRubric } from "../../../framework/adHocRubric.js";
import {
runWithVerifier,
evaluationResultToSuccess,
} from "../../../framework/verifierAdapter.js";

export default defineBenchTask(
{ name: "agent/all_recipes" },
async ({ debugUrl, sessionUrl, logger, agent, v3 }) => {
try {
const initUrl = "https://www.allrecipes.com/";
const page = v3.context.pages()[0];
await page.goto("https://www.allrecipes.com/");
const evaluator = new V3Evaluator(v3);
const agentResult = await agent.execute({
instruction:
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.",
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
});
await page.goto(initUrl);

const { evaluation, reasoning } = await evaluator.ask({
question: "Did the agent find a recipe for Beef Wellington",
});
const instruction =
"Search for a recipe for Beef Wellington on Allrecipes that has at least 200 reviews and an average rating of 4.5 stars or higher. List the main ingredients required for the dish.";

logger.log(agentResult);
const taskSpec: TaskSpec = {
id: "agent/all_recipes",
instruction,
initUrl,
precomputedRubric: adHocRubric(
"Did the agent find a recipe for Beef Wellington",
),
};

const success = evaluation === "YES";
const { evaluationResult, trajectoryDir } = await runWithVerifier({
v3,
agent,
taskSpec,
dataset: "agent-custom",
agentOptions: {
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 30,
},
});

if (!success) {
return {
_success: false,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
}
const successMode = process.env.EVAL_SUCCESS_MODE;

return {
_success: true,
_success: evaluationResultToSuccess(evaluationResult, successMode),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
return {
_success: false,
error,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
76 changes: 33 additions & 43 deletions packages/evals/tasks/bench/agent/amazon_shoes_cart.ts
Original file line number Diff line number Diff line change
@@ -1,69 +1,59 @@
import type { TaskSpec } from "@browserbasehq/stagehand";

import { defineBenchTask } from "../../../framework/defineTask.js";
import { V3Evaluator } from "@browserbasehq/stagehand";
import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
import { adHocRubric } from "../../../framework/adHocRubric.js";
import {
runWithVerifier,
evaluationResultToSuccess,
} from "../../../framework/verifierAdapter.js";

export default defineBenchTask(
{ name: "agent/amazon_shoes_cart" },
async ({ debugUrl, sessionUrl, logger, agent, v3 }) => {
try {
const initUrl = "https://www.amazon.com";
const page = v3.context.pages()[0];
await page.goto("https://www.amazon.com");

// Start collecting screenshots throughout the agent's journey
const screenshotCollector = new ScreenshotCollector(v3, {
interval: 3000,
maxScreenshots: 15,
});
screenshotCollector.start();
await page.goto(initUrl);

const instruction =
"go to amazon, and add a pair of black running shoes to cart in size 14. stop after you add the item to cart, and reach the login page";
const agentResult = await agent.execute({
instruction,
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 40,
});

// Stop and collect all screenshots from the journey
const screenshots = await screenshotCollector.stop();

logger.log({
category: "evaluation",
message: `Collected ${screenshots.length} screenshots for evaluation`,
level: 1,
});
const taskSpec: TaskSpec = {
id: "agent/amazon_shoes_cart",
instruction,
initUrl,
precomputedRubric: adHocRubric(
`did the agent complete this task successfully? ${instruction}`,
),
};

const evaluator = new V3Evaluator(v3);
const { evaluation, reasoning } = await evaluator.ask({
question: `did the agent complete this task successfully? ${instruction}`,
screenshot: screenshots,
agentReasoning: agentResult.message,
const { evaluationResult, trajectoryDir } = await runWithVerifier({
v3,
agent,
taskSpec,
dataset: "agent-custom",
agentOptions: {
maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 40,
},
});

console.log(`reasoning: ${reasoning}`);

const success = evaluation === "YES";
const successMode = process.env.EVAL_SUCCESS_MODE;

if (!success) {
return {
_success: false,
message: reasoning,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
}
return {
_success: true,
_success: evaluationResultToSuccess(evaluationResult, successMode),
outcomeSuccess: evaluationResult.outcomeSuccess,
processScore: evaluationResult.processScore,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
};
} catch (error) {
const errorMessage =
error instanceof Error ? error.message : String(error);
const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
return {
_success: false,
message: errorMessage,
error,
trajectoryDir,
debugUrl,
sessionUrl,
logs: logger.getLogs(),
Expand Down
Loading
Loading