diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
new file mode 100644
index 000000000..8dc40bd2b
--- /dev/null
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -0,0 +1,177 @@
+/**
+ * verifierAdapter — runs a bench task through the verifier pipeline.
+ *
+ * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
+ * with one call:
+ *
+ *   const { evaluationResult, trajectory } = await runWithVerifier({
+ *     v3,
+ *     agent,
+ *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
+ *     dataset: "webtailbench", agentOptions: { maxSteps: 50 },
+ *   });
+ *
+ * Behavior:
+ *   1. Resolves the rubric from the task, cache, or evaluator.
+ *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
+ *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
+ *   4. Returns { trajectory, evaluationResult, agentResult }.
+ *
+ * Persistence and rubric caching are gated by env vars:
+ *   VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
+ *   VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
+ *     fresh rubric generation every time).
+ */
+import {
+  V3Evaluator,
+  normalizeRubric,
+  type AgentInstance,
+  type AgentExecuteOptions,
+  type AgentResult,
+  type EvaluationResult,
+  type Rubric,
+  type TaskSpec,
+  type Trajectory,
+  type V3,
+} from "@browserbasehq/stagehand";
+
+import { RubricCache } from "./rubricCache.js";
+import { TrajectoryRecorder } from "./trajectoryRecorder.js";
+
+export interface RunWithVerifierOptions {
+  v3: V3;
+  agent: AgentInstance;
+  taskSpec: TaskSpec;
+  /**
+   * Dataset name for rubric cache partitioning. Each task lives under
+   * `.rubric-cache/<dataset>/<taskId>.json`.
+   */
+  dataset: string;
+  /** Agent execute options. `instruction` is filled from taskSpec.instruction. */
+  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
+  /** Override the run id (defaults to ISO timestamp). */
+  runId?: string;
+  /** Override trajectory persistence root. */
+  trajectoryRoot?: string;
+}
+
+export interface RunWithVerifierResult {
+  trajectory: Trajectory;
+  evaluationResult: EvaluationResult;
+  agentResult: AgentResult;
+  /** Resolved rubric (precomputed, cached, or freshly generated). */
+  rubric: Rubric;
+  /** Where the trajectory was persisted (or would have been, if disabled). */
+  trajectoryDir: string;
+}
+
+export async function runWithVerifier(
+  opts: RunWithVerifierOptions,
+): Promise<RunWithVerifierResult> {
+  const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
+    opts;
+  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
+
+  // ── Resolve rubric ──────────────────────────────────────────────────────
+  let resolvedRubric: Rubric;
+  if (taskSpec.precomputedRubric) {
+    resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
+  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+    resolvedRubric = await evaluator.generateRubric(taskSpec);
+  } else {
+    const cache = new RubricCache({ dataset });
+    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
+  }
+
+  // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
+  const hydratedTaskSpec: TaskSpec = {
+    ...taskSpec,
+    precomputedRubric: resolvedRubric,
+  };
+
+  // ── Record trajectory around agent.execute() ────────────────────────────
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec: hydratedTaskSpec,
+    runId,
+    outputRoot: trajectoryRoot,
+  });
+  recorder.start();
+
+  let agentResult: AgentResult;
+  let recorderStatus: "complete" | "aborted" | "error" = "complete";
+  try {
+    agentResult = await agent.execute({
+      ...agentOptions,
+      instruction: taskSpec.instruction,
+    });
+  } catch (e) {
+    recorderStatus = "error";
+    const trajectory = await recorder.finish({ status: recorderStatus });
+    // Re-throw after persisting so the bench task can decide how to report.
+    const wrapped = e instanceof Error ? e : new Error(String(e));
+    Object.assign(wrapped, { trajectoryDir: recorder.directory, trajectory });
+    throw wrapped;
+  }
+
+  const trajectory = await recorder.finish({
+    status: recorderStatus,
+    finalAnswer: agentResult.message,
+    usage: agentResult.usage,
+  });
+
+  // ── Verify ──────────────────────────────────────────────────────────────
+  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistResult(evaluationResult);
+
+  return {
+    trajectory,
+    evaluationResult,
+    agentResult,
+    rubric: resolvedRubric,
+    trajectoryDir: recorder.directory,
+  };
+}
+
+/**
+ * Decide bench task success from an EvaluationResult using the --success flag's
+ * semantics.
+ *
+ *   `outcome` (default) — strict binary outcome.
+ *   `process`           — rubric process score ≥ threshold (default 0.8).
+ *   `both`              — both conditions must hold.
+ */
+export type EvalSuccessMode = "outcome" | "process" | "both";
+
+export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode {
+  if (typeof mode !== "string") return "outcome";
+  const normalized = mode.trim().toLowerCase();
+  if (
+    normalized === "outcome" ||
+    normalized === "process" ||
+    normalized === "both"
+  ) {
+    return normalized;
+  }
+  return "outcome";
+}
+
+export function evaluationResultToSuccess(
+  result: EvaluationResult,
+  mode: unknown = "outcome",
+  processThreshold = 0.8,
+): boolean {
+  const resolvedMode = resolveEvalSuccessMode(mode);
+  const outcomeOk = result.outcomeSuccess;
+  const processOk =
+    typeof result.processScore === "number" &&
+    result.processScore >= processThreshold;
+  switch (resolvedMode) {
+    case "outcome":
+      return outcomeOk;
+    case "process":
+      return processOk;
+    case "both":
+      return outcomeOk && processOk;
+  }
+}
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index 5a6763390..22fb87de1 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@ const JSONL_PATH = path.join(
   "WebTailBench_data.jsonl",
 );
 
-interface Rubric {
+interface RawRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@ interface LocalRow {
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: Rubric;
+  precomputed_rubric?: RawRubric;
 }
 
 /**
@@ -114,12 +114,12 @@ async function main(): Promise<void> {
     );
   }
 
-  const rubricsById = new Map<string, Rubric>();
+  const rubricsById = new Map<string, RawRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
@@ -149,7 +149,7 @@ async function main(): Promise<void> {
   }
 
   console.log(
-    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
   );
 
   await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts
index 4bd2015fa..d704449c2 100644
--- a/packages/evals/suites/webtailbench.ts
+++ b/packages/evals/suites/webtailbench.ts
@@ -1,5 +1,5 @@
 import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js";
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand";
 import { tasksConfig } from "../taskConfig.js";
 import { getPackageRootDir } from "../runtimePaths.js";
 import {
@@ -32,6 +32,12 @@ export const buildWebTailBenchTestcases = (
     ques: string;
     category?: string;
     web?: string;
+    /**
+     * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv
+     * via packages/evals/scripts/backfill-webtailbench-rubrics.ts.
+     * When present, the verifier uses these upstream criteria directly.
+     */
+    precomputed_rubric?: unknown;
     [key: string]: unknown;
   };
 
@@ -42,7 +48,23 @@ export const buildWebTailBenchTestcases = (
   }
 
   const candidates = parseJsonlRows(lines, isWebTailBenchRow);
-  const rows = applySampling(candidates, sampleCount, maxCases);
+
+  // EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs,
+  // preserving the order given and ignoring sampling / limit knobs.
+  const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS
+    ? process.env.EVAL_WEBTAILBENCH_IDS.split(",")
+        .map((s) => s.trim())
+        .filter(Boolean)
+    : null;
+  let rows: WebTailBenchRow[];
+  if (explicitIds && explicitIds.length > 0) {
+    const byId = new Map(candidates.map((r) => [r.id, r]));
+    rows = explicitIds
+      .map((id) => byId.get(id))
+      .filter((r): r is WebTailBenchRow => Boolean(r));
+  } else {
+    rows = applySampling(candidates, sampleCount, maxCases);
+  }
 
   const allTestcases: Testcase[] = [];
   for (const modelEntry of normalizeAgentModelEntries(models)) {
@@ -57,6 +79,7 @@ export const buildWebTailBenchTestcases = (
           category: row.category,
           ques: row.ques,
           web: row.web,
+          precomputed_rubric: normalizeRubric(row.precomputed_rubric),
         },
       },
     };
     const taskCategories =
diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts
index 7f60e5775..e3791348b 100644
--- a/packages/evals/tasks/bench/agent/webtailbench.ts
+++ b/packages/evals/tasks/bench/agent/webtailbench.ts
@@ -1,19 +1,34 @@
+import { normalizeRubric, type TaskSpec } from "@browserbasehq/stagehand";
+
 import { defineBenchTask } from "../../../framework/defineTask.js";
-import { V3Evaluator } from "@browserbasehq/stagehand";
-import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
-import { imageResize } from "../../../utils/imageResize.js";
+import {
+  evaluationResultToSuccess,
+  runWithVerifier,
+} from "../../../framework/verifierAdapter.js";
 
+/**
+ * WebTailBench bench task.
+ *
+ * Runs the agent through TrajectoryRecorder + V3Evaluator.verify() so process
+ * and outcome scoring are grounded in saved trajectory evidence.
+ *
+ * If a row does not carry `precomputed_rubric`, the verifier generates a
+ * rubric on first encounter per task id and caches it under
+ * packages/evals/.rubric-cache/webtailbench/.
+ *
+ * --success knob: defaults to "outcome".
+ * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both.
+ */
 export default defineBenchTask(
   { name: "agent/webtailbench" },
   async ({ v3, logger, debugUrl, sessionUrl, modelName, input }) => {
-    let screenshotCollector: ScreenshotCollector | null = null;
-
     try {
       const params = ((input && input.params) || {}) as {
         id?: string;
         category?: string;
         ques?: string;
         web?: string;
+        precomputed_rubric?: unknown;
       };
 
       if (!params.ques) {
@@ -27,11 +42,8 @@ export default defineBenchTask(
       }
 
       const page = v3.context.pages()[0];
-      // web field is always empty in WebTailBench; start from Google
      const startUrl = params.web || "https://www.google.com";
-      await page.goto(startUrl, {
-        timeoutMs: 120_000,
-      });
+      await page.goto(startUrl, { timeoutMs: 120_000 });
 
       const systemPrompt = `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. You will need to navigate to the appropriate website to complete the task.`;
       const agentMode = input.agentMode ?? (input.isCUA ? "cua" : "hybrid");
@@ -41,70 +53,60 @@ export default defineBenchTask(
         systemPrompt,
       });
 
-      screenshotCollector = new ScreenshotCollector(v3, {
-        interval: 3000,
-        maxScreenshots: 8,
-      });
-      screenshotCollector.start();
-
-      const agentResult = await agent.execute({
+      const taskSpec: TaskSpec = {
+        id: params.id ?? `webtailbench/${input.name}`,
         instruction: params.ques,
-        maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
-      });
+        initUrl: startUrl,
+        precomputedRubric: normalizeRubric(params.precomputed_rubric),
+      };
 
-      // Stop collecting and get all screenshots
-      let screenshots = await screenshotCollector.stop();
+      const { evaluationResult, trajectory, trajectoryDir, rubric } =
+        await runWithVerifier({
+          v3,
+          agent,
+          taskSpec,
+          dataset: "webtailbench",
+          agentOptions: {
+            maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+          },
+        });
 
-      // Resize screenshots if we have any
-      if (screenshots.length > 0) {
-        screenshots = await Promise.all(
-          screenshots.map(async (screenshot) => {
-            return await imageResize(screenshot, 0.7);
-          }),
-        );
-      }
+      const successMode = process.env.EVAL_SUCCESS_MODE;
 
       logger.log({
         category: "evaluation",
-        message: `Collected ${screenshots.length} screenshots for evaluation`,
+        message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`,
         level: 1,
       });
 
-      const evaluator = new V3Evaluator(v3);
-      const evalResult = await evaluator.ask({
-        question: `Did the agent successfully complete this task: "${params.ques}"? Note that the agent does not have purchasing/booking capabilities; mark as pass if the agent has successfully performed all necessary steps for the task up to the point of purchasing/booking/entering payment/user information`,
-        screenshot: screenshots,
-        agentReasoning:
-          agentResult.message ||
-          "no reasoning available, agent potentially hit step limit",
-      });
-
-      // Clear screenshot buffers to free memory
-      screenshots.length = 0;
-
       return {
-        _success: evalResult.evaluation === "YES",
-        reasoning: evalResult.reasoning,
+        _success: evaluationResultToSuccess(evaluationResult, successMode),
+        outcomeSuccess: evaluationResult.outcomeSuccess,
+        processScore: evaluationResult.processScore,
+        evidenceInsufficient: evaluationResult.evidenceInsufficient,
+        criterionCount: rubric.items.length,
+        stepCount: trajectory.steps.length,
+        trajectoryDir,
+        primaryIntent: evaluationResult.rawSteps?.primaryIntent,
+        reasoning: evaluationResult.rawSteps?.reasoning,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
       };
     } catch (error) {
+      const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
       return {
         _success: false,
         error,
+        trajectoryDir,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
       };
-    } finally {
-      if (screenshotCollector) {
-        try {
-          await screenshotCollector.stop();
-        } catch {
-          // Ignore errors during cleanup
-        }
-      }
     }
   },
 );
+
+function formatProcessScore(score: number | undefined): string {
+  return typeof score === "number" ? score.toFixed(2) : "n/a";
+}
diff --git a/packages/evals/tests/framework/verifierAdapter.test.ts b/packages/evals/tests/framework/verifierAdapter.test.ts
new file mode 100644
index 000000000..fc21cdd3f
--- /dev/null
+++ b/packages/evals/tests/framework/verifierAdapter.test.ts
@@ -0,0 +1,49 @@
+import { describe, expect, it } from "vitest";
+import type { EvaluationResult } from "@browserbasehq/stagehand";
+
+import {
+  evaluationResultToSuccess,
+  resolveEvalSuccessMode,
+} from "../../framework/verifierAdapter.js";
+
+const baseResult: EvaluationResult = {
+  outcomeSuccess: true,
+  processScore: 0.5,
+  perCriterion: [],
+  taskValidity: { isAmbiguous: false, isInvalid: false },
+  evidenceInsufficient: [],
+};
+
+describe("resolveEvalSuccessMode", () => {
+  it("defaults invalid env/config values to outcome", () => {
+    expect(resolveEvalSuccessMode(undefined)).toBe("outcome");
+    expect(resolveEvalSuccessMode("bad-value")).toBe("outcome");
+    expect(resolveEvalSuccessMode("  PROCESS  ")).toBe("process");
+  });
+});
+
+describe("evaluationResultToSuccess", () => {
+  it("uses validated success modes", () => {
+    expect(evaluationResultToSuccess(baseResult, "outcome")).toBe(true);
+    expect(evaluationResultToSuccess(baseResult, "process")).toBe(false);
+    expect(evaluationResultToSuccess(baseResult, "both")).toBe(false);
+    expect(evaluationResultToSuccess(baseResult, "invalid")).toBe(true);
+  });
+
+  it("treats missing process score as a failed process gate", () => {
+    const outcomeOnly: EvaluationResult = { outcomeSuccess: true };
+    expect(evaluationResultToSuccess(outcomeOnly, "outcome")).toBe(true);
+    expect(evaluationResultToSuccess(outcomeOnly, "process")).toBe(false);
+    expect(evaluationResultToSuccess(outcomeOnly, "both")).toBe(false);
+  });
+});
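+
+// Illustrative boundary check (editor-added sketch, not part of the original
+// change set): evaluationResultToSuccess compares processScore with `>=`
+// against a default threshold of 0.8, so a score exactly at 0.8 passes.
+describe("evaluationResultToSuccess process threshold", () => {
+  it("counts a process score equal to the 0.8 default as passing", () => {
+    const atThreshold: EvaluationResult = { ...baseResult, processScore: 0.8 };
+    expect(evaluationResultToSuccess(atThreshold, "process")).toBe(true);
+    expect(evaluationResultToSuccess(atThreshold, "both")).toBe(true);
+  });
+});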