diff --git a/.changeset/verifier-evaluator-shell.md b/.changeset/verifier-evaluator-shell.md
new file mode 100644
index 000000000..4cac71a83
--- /dev/null
+++ b/.changeset/verifier-evaluator-shell.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add verifier trajectory, rubric, and evaluation-result types with normalized public naming.
diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index ffb6726df..8e21fb030 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -24,6 +24,11 @@ import { tool } from "ai";
 import { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 import { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
 import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js";
+import {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./verifier/index.js";
 
 export { V3 } from "./v3.js";
 export { V3 as Stagehand } from "./v3.js";
@@ -64,6 +69,31 @@ export type {
   V3EvaluatorConstructorOptions,
   V3EvaluatorOptions,
 } from "../v3Evaluator.js";
+export type {
+  Trajectory,
+  TrajectoryStep,
+  TrajectoryStatus,
+  TrajectoryUsage,
+  TaskSpec,
+  Rubric,
+  RubricCriterion,
+  AgentEvidence,
+  AgentEvidenceModality,
+  ProbeEvidence,
+  ToolOutput,
+  Verifier,
+  EvaluationResult,
+  CriterionScore,
+  FirstPointOfFailure,
+  TaskValidity,
+  VerifierFinding,
+  VerifierRawSteps,
+} from "./verifier/index.js";
+export {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./verifier/index.js";
 export { tool } from "ai";
 export { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 export { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
@@ -114,6 +144,9 @@ const StagehandDefault = {
   toJsonSchema,
   connectToMCPServer,
   V3Evaluator,
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
   tool,
   getAISDKLanguageModel,
   __internalCreateInMemoryAgentCacheHandle,
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
new file mode 100644
index 000000000..4061533ab
--- /dev/null
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -0,0 +1,28 @@
+/**
+ * Public re-exports for the verifier subsystem.
+ */
+export type {
+  AgentEvidence,
+  AgentEvidenceModality,
+  CriterionScore,
+  EvaluationResult,
+  FirstPointOfFailure,
+  ProbeEvidence,
+  Rubric,
+  RubricCriterion,
+  TaskSpec,
+  TaskValidity,
+  ToolOutput,
+  Trajectory,
+  TrajectoryStatus,
+  TrajectoryStep,
+  TrajectoryUsage,
+  Verifier,
+  VerifierFinding,
+  VerifierRawSteps,
+} from "./types.js";
+export {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./trajectory.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
new file mode 100644
index 000000000..a18f025c3
--- /dev/null
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -0,0 +1,189 @@
+import type {
+  AgentEvidenceModality,
+  ProbeEvidence,
+  Rubric,
+  Trajectory,
+  TrajectoryStep,
+} from "./types.js";
+
+type RawRubricCriterion = {
+  criterion: unknown;
+  description: unknown;
+  max_points?: unknown;
+  maxPoints?: unknown;
+  condition?: unknown;
+};
+
+type RawRubric = {
+  items?: unknown;
+};
+
+/**
+ * Convert dataset or generated rubric JSON into the public Stagehand shape.
+ * Snake-case dataset fields are accepted here so serialized quirks do not leak
+ * into the canonical rubric type.
+ */
+export function normalizeRubric(rubric: unknown): Rubric | undefined {
+  if (rubric == null) return undefined;
+  if (typeof rubric !== "object") {
+    throw new TypeError("Rubric must be an object");
+  }
+
+  const rawRubric = rubric as RawRubric;
+  if (!Array.isArray(rawRubric.items)) {
+    throw new TypeError("Rubric is missing an items array");
+  }
+
+  return {
+    items: rawRubric.items.map((item) => {
+      const criterion = normalizeRequiredString(item.criterion, "criterion");
+      const description = normalizeRequiredString(
+        item.description,
+        "description",
+      );
+      const maxPoints = normalizeMaxPoints(item);
+
+      if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) {
+        throw new TypeError(
+          `Rubric criterion "${criterion}" is missing a numeric maxPoints value`,
+        );
+      }
+
+      return {
+        criterion,
+        description,
+        maxPoints,
+        ...(typeof item.condition === "string" && {
+          condition: item.condition,
+        }),
+      };
+    }),
+  };
+}
+
+function normalizeRequiredString(value: unknown, fieldName: string): string {
+  if (typeof value === "string" && value.length) {
+    return value;
+  }
+
+  throw new TypeError(`Rubric criterion is missing a ${fieldName} value`);
+}
+
+function normalizeMaxPoints(item: RawRubricCriterion): unknown {
+  return item.maxPoints ?? item.max_points;
+}
+
+function normalizeResultLabel(label?: string): string {
+  return (label ?? `rescore-${new Date().toISOString()}`).replace(
+    /[^A-Za-z0-9._-]/g,
+    "_",
+  );
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// On-disk loader
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Hydrate a Trajectory from the on-disk directory layout written by
+ * TrajectoryRecorder.persist(). Used by the offline re-scoring CLI (`bench
+ * verify`) and by any consumer that wants to feed a saved trajectory back
+ * into V3Evaluator.verify() without running an agent.
+ *
+ * Reverses the recorder's serialization tweaks:
+ * - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`.
+ * - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on
+ *   disk (human-readable JSON) instead of raw Buffer; we decode back.
+ *
+ * @param dir absolute or cwd-relative path to a single task's trajectory
+ *   directory (the directory containing `trajectory.json`).
+ */
+export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
+  const fs = await import("node:fs/promises");
+  const path = await import("node:path");
+  const trajectoryDir = path.resolve(dir);
+
+  const trajectoryPath = path.join(trajectoryDir, "trajectory.json");
+  const raw = await fs.readFile(trajectoryPath, "utf8");
+  const parsed = JSON.parse(raw) as Trajectory & {
+    steps: Array<
+      TrajectoryStep & {
+        agentEvidence: {
+          modalities: Array<
+            | { type: "text"; content: string }
+            | {
+                type: "image";
+                mediaType: string;
+                // On-disk form (recorder writes base64); accept either to
+                // tolerate hand-edited fixtures.
+                bytes?: unknown;
+                bytesBase64?: string;
+              }
+            | { type: "json"; content: unknown }
+          >;
+        };
+        probeEvidence: ProbeEvidence;
+      }
+    >;
+  };
+
+  const resolveWithinTrajectoryDir = (candidate: string): string => {
+    const resolved = path.resolve(trajectoryDir, candidate);
+    const relative = path.relative(trajectoryDir, resolved);
+    const outside =
+      relative === ".." ||
+      relative.startsWith(`..${path.sep}`) ||
+      path.isAbsolute(relative);
+
+    if (outside) {
+      throw new Error(
+        `Trajectory screenshotPath escapes trajectory directory: ${candidate}`,
+      );
+    }
+
+    return resolved;
+  };
+
+  for (const step of parsed.steps) {
+    // Rehydrate tier-2 probe screenshot from its on-disk file reference.
+    const probe = step.probeEvidence;
+    if (probe?.screenshotPath && !probe.screenshot) {
+      const resolved = resolveWithinTrajectoryDir(probe.screenshotPath);
+      try {
+        probe.screenshot = await fs.readFile(resolved);
+      } catch {
+        // Missing screenshot file: leave probe.screenshot unset. The verifier's
+        // evidence_insufficient path will handle it.
+      }
+    }
+
+    // Decode image modalities from base64 back to Buffer.
+    if (step.agentEvidence?.modalities) {
+      step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => {
+        // The on-disk shape carries bytesBase64 instead of bytes, so we look
+        // through `unknown` here rather than rely on the typed union.
+        const raw = m as unknown as { bytesBase64?: string };
+        if (m.type === "image" && typeof raw.bytesBase64 === "string") {
+          return {
+            type: "image" as const,
+            bytes: Buffer.from(raw.bytesBase64, "base64"),
+            mediaType: m.mediaType,
+          };
+        }
+        return m as AgentEvidenceModality;
+      });
+    }
+  }
+
+  return parsed;
+}
+
+/**
+ * Build a `result*.json` filename for persisted evaluator output.
+ *
+ * Convention: the live run writes `result.json`; offline re-score attempts use
+ * a label-based name (e.g., `result_rescore-2026-05-11.json`) so they coexist
+ * without collisions and remain easy to diff.
+ */
+export function nextResultFilename(label?: string): string {
+  return `result_${normalizeResultLabel(label)}.json`;
+}
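Taken together, these helpers cover the offline path from dataset JSON to a rehydrated run. A minimal usage sketch, assuming the package-level exports added in `index.ts` above; the rubric contents and the trajectory path are illustrative:

```ts
import {
  loadTrajectoryFromDisk,
  nextResultFilename,
  normalizeRubric,
} from "@browserbasehq/stagehand";

// Dataset rubrics often arrive in snake_case; normalizeRubric converts them to
// the public camelCase shape and drops serialized scoring fields like
// earned_points.
const rubric = normalizeRubric({
  items: [
    {
      criterion: "Add ground beef to cart",
      description: "Full credit if the cart contains ground beef.",
      max_points: 2,
    },
  ],
});
// -> { items: [{ criterion, description, maxPoints: 2 }] }

// Rehydrate a saved run: screenshotPath references are read back into Buffers
// and base64 image modalities are decoded to bytes.
const trajectory = await loadTrajectoryFromDisk(
  ".trajectories/webtailbench/united_13", // illustrative path
);
console.log(rubric?.items.length, trajectory.steps.length);

// Label-based result filenames keep offline re-scores from clobbering the
// live run's result.json.
nextResultFilename("rescore smoke test"); // -> "result_rescore_smoke_test.json"
```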
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
new file mode 100644
index 000000000..88b7e275e
--- /dev/null
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -0,0 +1,306 @@
+/**
+ * Shared verifier types for trajectories, rubrics, evidence, and results.
+ *
+ * The verifier consumes saved trajectories instead of a live browser. DOM and
+ * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve
+ * screenshots sent to the provider plus independent harness probes.
+ */
+
+/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */
+export interface TrajectoryUsage {
+  input_tokens: number;
+  output_tokens: number;
+  reasoning_tokens?: number;
+  cached_input_tokens?: number;
+  inference_time_ms?: number;
+}
+
+/** A single criterion in a Stagehand rubric. */
+export interface RubricCriterion {
+  /** Short name of the criterion (e.g., "Add ground beef to cart"). */
+  criterion: string;
+  /** What to evaluate and how to award partial credit. */
+  description: string;
+  /** Maximum points for this criterion. */
+  maxPoints: number;
+  /**
+   * Applicability rule for situational criteria. When this condition is not
+   * met, the criterion is excluded from scoring rather than counted as failed.
+   */
+  condition?: string;
+}
+
+/** A rubric — list of criteria for a task. */
+export interface Rubric {
+  items: RubricCriterion[];
+}
+
+/**
+ * Spec for a single task being verified. Carried both at runtime and into the
+ * verifier alongside the trajectory.
+ */
+export interface TaskSpec {
+  /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */
+  id: string;
+  /** Task instruction shown to the agent. */
+  instruction: string;
+  /** Starting URL, if any. */
+  initUrl?: string;
+  /** Rubric carried by the dataset or generated by a verifier backend. */
+  precomputedRubric?: Rubric;
+  /** Optional reference answer (set when dataset ships one). */
+  expectedAnswer?: string;
+}
+
+/**
+ * A single modality unit in tier-1 agent evidence. Mirrors the shape of
+ * ModelMessage content parts so we can reproduce what the LLM ingested.
+ */
+export type AgentEvidenceModality =
+  | { type: "text"; content: string }
+  | { type: "image"; bytes: Buffer; mediaType: string }
+  | { type: "json"; content: unknown };
+
+/**
+ * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the
+ * tool result for this step.
+ *
+ * Modes:
+ * - CUA: usually a single image modality (the screenshot sent to the provider).
+ * - Hybrid: tool result with optional screenshotBase64 → one image + one text.
+ * - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.
+ */
+export interface AgentEvidence {
+  modalities: AgentEvidenceModality[];
+}
+
+/**
+ * Tier 2 — independent harness probes around this step.
+ *
+ * If a probe wasn't captured, the field is absent (not null).
+ */
+export interface ProbeEvidence {
+  /** URL after the step's tool execution. */
+  url?: string;
+  /**
+   * Bus screenshot captured after the step. Path on disk is preferred once
+   * persisted; in-memory Buffer is used during a live run.
+   */
+  screenshot?: Buffer;
+  /** Reference to the persisted screenshot file under the trajectory dir. */
+  screenshotPath?: string;
+  /** Viewport scroll context. Lets the verifier reason about whether the agent saw the full page. */
+  scroll?: { top: number; pageHeight: number };
+  /** Accessibility tree snapshot. */
+  ariaTree?: string;
+  /** Verifier-requested probes, keyed by criterion id. */
+  onDemand?: Record<string, unknown>;
+}
+
+/** Outcome of a single tool execution as seen by the harness. */
+export interface ToolOutput {
+  ok: boolean;
+  /**
+   * The tool's return value. Same payload that flowed into agentEvidence
+   * modalities, but in its native shape (e.g., the extract result, the act
+   * describe-string) rather than serialized for the LLM.
+   */
+  result: unknown;
+  error?: string;
+}
+
+/** One step in a trajectory: action + reasoning + evidence + outcome. */
+export interface TrajectoryStep {
+  index: number;
+  actionName: string;
+  actionArgs: Record<string, unknown>;
+  /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
+  reasoning: string;
+  agentEvidence: AgentEvidence;
+  probeEvidence: ProbeEvidence;
+  toolOutput: ToolOutput;
+  /** ISO 8601 timestamp when the step's tool execution started. */
+  startedAt: string;
+  /** ISO 8601 timestamp when the step's tool execution finished. */
+  finishedAt: string;
+}
+
+/** Terminal status of the agent run. */
+export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
+
+/**
+ * Full trajectory for one task run.
+ *
+ * The on-disk layout is one directory per task:
+ *
+ *   .trajectories/<benchmark>/<task>/
+ *   ├── task_data.json — TaskSpec + result metadata
+ *   ├── trajectory.json — this object, with screenshotPath instead of bytes
+ *   ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc.
+ *   ├── scores/
+ *   │   └── result.json — Result from V3Evaluator.verify()
+ *   ├── core.log — captured action log
+ *   └── times.json — step timing + token usage
+ */
+export interface Trajectory {
+  task: TaskSpec;
+  steps: TrajectoryStep[];
+  finalAnswer?: string;
+  status: TrajectoryStatus;
+  usage: TrajectoryUsage;
+  timing: { startedAt: string; endedAt: string };
+}
+
+/** Score for a single rubric criterion after evidence analysis + rescoring. */
+export interface CriterionScore {
+  /** Matches RubricCriterion.criterion (the criterion's short name). */
+  criterion: string;
+  /** Maximum possible points for this criterion. */
+  maxPoints: number;
+  /**
+   * Points earned post-evidence-analysis (paper's post_image_earned_points).
+   * Null if the criterion was conditional and its condition wasn't met (excluded
+   * from both numerator and denominator in the process score).
+   */
+  earnedPoints: number | null;
+  /** Verifier's explanation for the score. */
+  explanation: string;
+  /**
+   * True if the criterion is conditional and its condition was determined to
+   * be met. Absent for non-conditional criteria.
+   */
+  conditionMet?: boolean;
+  /**
+   * Set when the verifier had no evidence to ground this criterion in either
+   * tier. Per paper §2, treated as uncontrollable failure → full credit, but
+   * surfaced here so dashboards can flag low-confidence results.
+   */
+  evidenceInsufficient?: boolean;
+}
+
+/**
+ * First-point-of-failure analysis (paper Step 9a). Identifies the earliest
+ * step where the agent's trajectory went off-track, using a structured error
+ * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).
+ */
+export interface FirstPointOfFailure {
+  stepIndex: number;
+  /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */
+  errorCode: string;
+  /** Top-level category name (Selection, Hallucination, etc.). */
+  category: string;
+  /** Verifier's reasoning for selecting this point. */
+  description?: string;
+}
+
+/**
+ * Structured observation surfaced by the verifier that another agent or
+ * tooling could act on. Findings are emitted opportunistically by Step 8
+ * (outcome verification) when the verifier notices actionable patterns —
+ * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
+ *
+ * Not produced for every task: when nothing actionable surfaces, the
+ * `findings` array on the EvaluationResult is empty. Consumers should treat the
+ * field as advisory, not as part of the formal score.
+ */
+export interface VerifierFinding {
+  /**
+   * Category of the observation. Open-ended enum — additional categories may
+   * be added as verifier backends surface new failure modes.
+   */
+  category:
+    | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries)
+    | "agent_strategy" // higher-level planning / decision-making problems
+    | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory
+    | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps)
+    | "task_specification" // task instruction was ambiguous / under- or over-specified
+    | "verifier_uncertainty" // verifier itself couldn't confidently decide
+    | "other";
+  /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */
+  severity: "info" | "warning" | "blocking";
+  /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */
+  description: string;
+  /**
+   * Optional concrete next action another agent could take. Should be
+   * specific enough that it can be acted on without further reasoning —
+   * e.g., "Try double_click instead of triple_click to clear placeholder
+   * text on this form field."
+   */
+  suggestedAction?: string;
+  /** Step indices in the trajectory where this pattern showed up. */
+  relatedSteps?: number[];
+}
+
+/** Stable debugging summary emitted by verifier backends. */
+export interface VerifierRawSteps {
+  backend?: "legacy" | "verifier";
+  primaryIntent?: string;
+  reasoning?: string;
+  rubricSource?: "precomputed" | "generated" | "none";
+  approach?: "a" | "b";
+  optionalsMode?: "folded" | "separate" | "skip";
+  totalEarned?: number;
+  totalMax?: number;
+  evidenceImages?: number;
+  evidenceTexts?: number;
+  evidenceOriginalScreenshots?: number;
+  legacyEvaluation?: string;
+  screenshotCount?: number;
+}
+
+/** Task-validity classification (paper Step 10). */
+export interface TaskValidity {
+  /** True if the task is underspecified / has multiple valid interpretations. */
+  isAmbiguous: boolean;
+  /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */
+  isInvalid: boolean;
+  /** Optional sub-codes from the task-classification taxonomy. */
+  ambiguityCodes?: string[];
+  invalidTaskCodes?: string[];
+}
+
+/**
+ * Evaluator output. Legacy evaluation may only populate outcome fields; richer
+ * verifier backends can also populate process scoring and diagnostics.
+ *
+ * Process and outcome are deliberately independent when both are present:
+ * an agent can follow the right steps but get blocked (high process, low
+ * outcome), or succeed through an unexpected path (variable process, high
+ * outcome).
+ */
+export interface EvaluationResult {
+  /** Did the agent accomplish the task from the user's perspective? */
+  outcomeSuccess: boolean;
+  /** Human-readable explanation for the outcome. */
+  explanation?: string;
+  /** Aggregated earned/max across applicable criteria, in [0, 1]. */
+  processScore?: number;
+  /** Per-criterion breakdown after rescoring. */
+  perCriterion?: CriterionScore[];
+  /** Step 9a — first step where the trajectory went off-track, if any. */
+  firstPointOfFailure?: FirstPointOfFailure;
+  /** Step 10 — task-itself ambiguity / validity. */
+  taskValidity?: TaskValidity;
+  /**
+   * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
+   * evidence resolved the question. Treated as uncontrollable → full credit,
+   * but flagged here so consumers can decide whether to discount the score.
+   */
+  evidenceInsufficient?: string[];
+  /**
+   * Structured observations from the verifier that a downstream tool or
+   * follow-up agent could act on. Opportunistic — empty when the verifier
+   * doesn't notice anything actionable. Not part of the score; advisory.
+   */
+  findings?: VerifierFinding[];
+  /** Debugging summary from the active evaluator backend. */
+  rawSteps?: VerifierRawSteps;
+}
+
+/**
+ * Verifier interface. Implementations consume a Trajectory + TaskSpec and
+ * return an EvaluationResult — they MUST NOT touch a live browser.
+ */ +export interface Verifier { + verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; +} diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index e1d384f8c..379cf4589 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -9,11 +9,20 @@ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js"; import type { EvaluateOptions, BatchAskOptions, - EvaluationResult, + EvaluationResult as LegacyEvaluationResult, } from "./v3/types/private/evaluator.js"; import { V3 } from "./v3/v3.js"; import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js"; +import type { + Trajectory, + TaskSpec, + EvaluationResult, + Rubric, + Verifier, + AgentEvidenceModality, + VerifierFinding, +} from "./v3/verifier/index.js"; const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND"; const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy"; @@ -43,7 +52,7 @@ type NormalizedConstructorOptions = { backend?: V3EvaluatorBackend; }; -export class V3Evaluator { +export class V3Evaluator implements Verifier { private readonly backend: V3EvaluatorBackend; private readonly legacyEvaluator: LegacyV3Evaluator; @@ -67,23 +76,80 @@ export class V3Evaluator { ); } - async ask(options: EvaluateOptions): Promise { + async ask(options: EvaluateOptions): Promise { return this.getLegacyBackend("ask").ask(options); } - async batchAsk(options: BatchAskOptions): Promise { + async batchAsk(options: BatchAskOptions): Promise { return this.getLegacyBackend("batchAsk").batchAsk(options); } + async verify( + trajectory: Trajectory, + taskSpec: TaskSpec, + ): Promise { + assertVerifierInput(trajectory, taskSpec); + + if (this.backend === "legacy") { + return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec); + } + + return this.unavailableVerifierBackend("verify"); + } + + async generateRubric(taskSpec: TaskSpec): Promise { + if (!taskSpec?.id) { + throw new StagehandInvalidArgumentError( + "TaskSpec.id is required for rubric generation", + ); + } + + if (this.backend === "verifier") { + return this.unavailableVerifierBackend("generateRubric"); + } + + return { + items: [legacyTaskCompletionCriterion(taskSpec)], + }; + } + private getLegacyBackend(methodName: string): LegacyV3Evaluator { if (this.backend === "legacy") { return this.legacyEvaluator; } + return this.unavailableVerifierBackend(methodName); + } + + private unavailableVerifierBackend(methodName: string): never { throw new StagehandInvalidArgumentError( `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`, ); } + + private async verifyTrajectoryWithLegacyEvaluator( + trajectory: Trajectory, + taskSpec: TaskSpec, + ): Promise { + const screenshots = collectLegacyScreenshots(trajectory); + const agentReasoning = renderLegacyAgentReasoning(trajectory); + const answer = trajectory.finalAnswer; + + if (!screenshots.length && !answer) { + return legacyInsufficientEvidenceResult( + "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.", + ); + } + + const result = await this.legacyEvaluator.ask({ + question: taskSpec.instruction, + screenshot: screenshots.length ? 
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index e1d384f8c..379cf4589 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -9,11 +9,20 @@ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
 import type {
   EvaluateOptions,
   BatchAskOptions,
-  EvaluationResult,
+  EvaluationResult as LegacyEvaluationResult,
 } from "./v3/types/private/evaluator.js";
 import { V3 } from "./v3/v3.js";
 import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
 import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js";
+import type {
+  Trajectory,
+  TaskSpec,
+  EvaluationResult,
+  Rubric,
+  Verifier,
+  AgentEvidenceModality,
+  VerifierFinding,
+} from "./v3/verifier/index.js";
 
 const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND";
 const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy";
@@ -43,7 +52,7 @@ type NormalizedConstructorOptions = {
   backend?: V3EvaluatorBackend;
 };
 
-export class V3Evaluator {
+export class V3Evaluator implements Verifier {
   private readonly backend: V3EvaluatorBackend;
   private readonly legacyEvaluator: LegacyV3Evaluator;
 
@@ -67,23 +76,80 @@ export class V3Evaluator {
     );
   }
 
-  async ask(options: EvaluateOptions): Promise<EvaluationResult> {
+  async ask(options: EvaluateOptions): Promise<LegacyEvaluationResult> {
     return this.getLegacyBackend("ask").ask(options);
   }
 
-  async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
+  async batchAsk(options: BatchAskOptions): Promise<LegacyEvaluationResult[]> {
     return this.getLegacyBackend("batchAsk").batchAsk(options);
   }
 
+  async verify(
+    trajectory: Trajectory,
+    taskSpec: TaskSpec,
+  ): Promise<EvaluationResult> {
+    assertVerifierInput(trajectory, taskSpec);
+
+    if (this.backend === "legacy") {
+      return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec);
+    }
+
+    return this.unavailableVerifierBackend("verify");
+  }
+
+  async generateRubric(taskSpec: TaskSpec): Promise<Rubric> {
+    if (!taskSpec?.id) {
+      throw new StagehandInvalidArgumentError(
+        "TaskSpec.id is required for rubric generation",
+      );
+    }
+
+    if (this.backend === "verifier") {
+      return this.unavailableVerifierBackend("generateRubric");
+    }
+
+    return {
+      items: [legacyTaskCompletionCriterion(taskSpec)],
+    };
+  }
+
   private getLegacyBackend(methodName: string): LegacyV3Evaluator {
     if (this.backend === "legacy") {
       return this.legacyEvaluator;
     }
 
+    return this.unavailableVerifierBackend(methodName);
+  }
+
+  private unavailableVerifierBackend(methodName: string): never {
     throw new StagehandInvalidArgumentError(
       `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`,
     );
   }
+
+  private async verifyTrajectoryWithLegacyEvaluator(
+    trajectory: Trajectory,
+    taskSpec: TaskSpec,
+  ): Promise<EvaluationResult> {
+    const screenshots = collectLegacyScreenshots(trajectory);
+    const agentReasoning = renderLegacyAgentReasoning(trajectory);
+    const answer = trajectory.finalAnswer;
+
+    if (!screenshots.length && !answer) {
+      return legacyInsufficientEvidenceResult(
+        "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
+      );
+    }
+
+    const result = await this.legacyEvaluator.ask({
+      question: taskSpec.instruction,
+      screenshot: screenshots.length ? screenshots : false,
+      answer,
+      agentReasoning,
+    });
+
+    return legacyEvaluationToResult(result, screenshots.length);
+  }
 }
 
 function normalizeConstructorOptions(
@@ -127,3 +193,142 @@ function resolveEvaluatorBackend(
     `Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`,
   );
 }
+
+function assertVerifierInput(trajectory: Trajectory, taskSpec: TaskSpec): void {
+  if (!taskSpec?.id) {
+    throw new StagehandInvalidArgumentError(
+      "TaskSpec.id is required for verification",
+    );
+  }
+  if (!trajectory) {
+    throw new StagehandInvalidArgumentError(
+      "Trajectory is required for verification",
+    );
+  }
+}
+
+function legacyTaskCompletionCriterion(taskSpec: TaskSpec) {
+  return {
+    criterion: "legacy-task-completion",
+    description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`,
+    maxPoints: 1,
+  };
+}
+
+function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] {
+  const screenshots: Buffer[] = [];
+
+  for (const step of trajectory.steps ?? []) {
+    if (Buffer.isBuffer(step.probeEvidence?.screenshot)) {
+      screenshots.push(step.probeEvidence.screenshot);
+      continue;
+    }
+
+    const agentImage = step.agentEvidence?.modalities?.find(
+      (
+        modality,
+      ): modality is Extract<AgentEvidenceModality, { type: "image" }> =>
+        modality.type === "image" && Buffer.isBuffer(modality.bytes),
+    );
+
+    if (agentImage) {
+      screenshots.push(agentImage.bytes);
+    }
+  }
+
+  return screenshots;
+}
+
+function renderLegacyAgentReasoning(
+  trajectory: Trajectory,
+): string | undefined {
+  const stepLines = (trajectory.steps ?? []).map((step) => {
+    const output = step.toolOutput?.error
+      ? `Tool error: ${step.toolOutput.error}`
+      : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;
+    return [
+      `Step ${step.index}: ${step.actionName}`,
+      step.reasoning ? `Reasoning: ${step.reasoning}` : undefined,
+      output,
+    ]
+      .filter(Boolean)
+      .join("\n");
+  });
+
+  if (!stepLines.length) {
+    return undefined;
+  }
+
+  return truncateForPrompt(
+    `Agent trajectory:\n${stepLines.join("\n\n")}`,
+    16000,
+  );
+}
+
+function stringifyForPrompt(value: unknown): string {
+  if (typeof value === "string") {
+    return value;
+  }
+
+  try {
+    const serialized = JSON.stringify(value);
+    return serialized ?? String(value);
+  } catch {
+    return String(value);
+  }
+}
+
+function truncateForPrompt(value: string, maxLength: number): string {
+  if (value.length <= maxLength) {
+    return value;
+  }
+
+  return `${value.slice(0, maxLength)}... [truncated]`;
+}
+
+function legacyEvaluationToResult(
+  result: LegacyEvaluationResult,
+  screenshotCount: number,
+): EvaluationResult {
+  const outcomeSuccess = result.evaluation === "YES";
+  const invalid = result.evaluation === "INVALID";
+  const findings: VerifierFinding[] = invalid
+    ? [
+        {
+          category: "verifier_uncertainty",
+          severity: "warning",
+          description: result.reasoning,
+        },
+      ]
+    : [];
+
+  return {
+    outcomeSuccess,
+    explanation: result.reasoning,
+    ...(findings.length ? { findings } : {}),
+    rawSteps: {
+      backend: "legacy",
+      legacyEvaluation: result.evaluation,
+      screenshotCount,
+    },
+  };
+}
+
+function legacyInsufficientEvidenceResult(reason: string): EvaluationResult {
+  return {
+    outcomeSuccess: false,
+    explanation: reason,
+    findings: [
+      {
+        category: "trajectory_capture",
+        severity: "blocking",
+        description: reason,
+      },
+    ],
+    rawSteps: {
+      backend: "legacy",
+      legacyEvaluation: "INVALID",
+      screenshotCount: 0,
+    },
+  };
+}
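For reference, the legacy bridge collapses the old YES/NO/INVALID verdict into the richer result shape: YES maps to `outcomeSuccess: true`, INVALID additionally surfaces a `verifier_uncertainty` finding, and the process-scoring fields stay unset. A YES verdict over two screenshots comes out roughly as follows (values illustrative):

```ts
import type { EvaluationResult } from "@browserbasehq/stagehand";

// Shape produced by the legacy bridge for a YES verdict; `findings` is
// omitted entirely when the verdict is not INVALID, and processScore /
// perCriterion are never set by this backend.
const bridged: EvaluationResult = {
  outcomeSuccess: true,
  explanation: "The screenshots show the task reaching completion.",
  rawSteps: {
    backend: "legacy",
    legacyEvaluation: "YES",
    screenshotCount: 2,
  },
};
```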
[{ type: "text" as const, text: `the answer is ${answer}` }] + : []), ...imageContents, ], }, diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index eda61d500..e73cde417 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -43,8 +43,11 @@ const publicApiShape = { isZod4Schema: Stagehand.isZod4Schema, jsonSchemaToZod: Stagehand.jsonSchemaToZod, loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv, + loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk, localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema, modelToAgentProviderMap: Stagehand.modelToAgentProviderMap, + nextResultFilename: Stagehand.nextResultFilename, + normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, toGeminiSchema: Stagehand.toGeminiSchema, diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts index 8d710da4d..2c2524238 100644 --- a/packages/core/tests/unit/public-api/v3-core.test.ts +++ b/packages/core/tests/unit/public-api/v3-core.test.ts @@ -134,6 +134,18 @@ describe("V3 Core public API types", () => { >(); }); + it("has verifier facade methods", () => { + expectTypeOf().toExtend< + ( + trajectory: Stagehand.Trajectory, + taskSpec: Stagehand.TaskSpec, + ) => Promise + >(); + expectTypeOf().toExtend< + (taskSpec: Stagehand.TaskSpec) => Promise + >(); + }); + it("accepts legacy evaluator backend options", () => { const mockV3 = {} as Stagehand.Stagehand; expectTypeOf().toBeConstructibleWith( @@ -143,35 +155,6 @@ describe("V3 Core public API types", () => { } satisfies Stagehand.V3EvaluatorConstructorOptions, ); }); - - it("rejects verifier backend before the verifier PR is installed", async () => { - const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, { - backend: "verifier", - }); - - await expect( - evaluator.ask({ question: "Was the task completed?" }), - ).rejects.toThrow( - "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", - ); - }); - - it("rejects invalid evaluator backend env values", () => { - const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; - process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; - - try { - expect( - () => new Stagehand.V3Evaluator({} as Stagehand.Stagehand), - ).toThrow('Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"'); - } finally { - if (previousBackend === undefined) { - delete process.env.STAGEHAND_EVALUATOR_BACKEND; - } else { - process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; - } - } - }); }); describe("V3FunctionName", () => { diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts new file mode 100644 index 000000000..b97c93ba2 --- /dev/null +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -0,0 +1,210 @@ +import { describe, expect, it, vi } from "vitest"; + +import { V3Evaluator } from "../../lib/v3Evaluator.js"; +import type { V3 } from "../../lib/v3/v3.js"; +import type { TaskSpec, Trajectory } from "../../lib/v3/verifier/index.js"; + +describe("V3Evaluator verifier facade", () => { + it("rejects verifier backend before the verifier PR is installed", async () => { + const evaluator = new V3Evaluator({} as V3, { + backend: "verifier", + }); + + await expect( + evaluator.ask({ question: "Was the task completed?" 
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("rejects verify when the verifier backend is selected before the verifier PR is installed", async () => {
+    const taskSpec: TaskSpec = {
+      id: "verifier-unavailable",
+      instruction: "Complete the task",
+    };
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.verify(makeTrajectory(taskSpec), taskSpec),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("rejects generateRubric when the verifier backend is selected before the verifier PR is installed", async () => {
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.generateRubric({
+        id: "rubric-unavailable",
+        instruction: "Complete the task",
+      }),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("maps legacy YES evaluations with trajectory screenshots to a successful result", async () => {
+    const taskSpec: TaskSpec = {
+      id: "success",
+      instruction: "Complete the task",
+    };
+    const screenshot = Buffer.from("screenshot");
+    const trajectory = makeTrajectory(taskSpec, {
+      screenshot,
+      finalAnswer: "The task is complete.",
+    });
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The screenshot shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    const result = await evaluator.verify(trajectory, taskSpec);
+
+    expect(ask).toHaveBeenCalledWith(
+      expect.objectContaining({
+        question: taskSpec.instruction,
+        screenshot: [screenshot],
+        answer: "The task is complete.",
+      }),
+    );
+    expect(result.outcomeSuccess).toBe(true);
+    expect(result.explanation).toBe("The screenshot shows completion.");
+    expect(result.processScore).toBeUndefined();
+    expect(result.perCriterion).toBeUndefined();
+  });
+
+  it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => {
+    const taskSpec: TaskSpec = {
+      id: "reasoning-budget",
+      instruction: "Complete the task",
+    };
+    const longToolOutput = "x".repeat(3000);
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The trajectory shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    await evaluator.verify(
+      makeTrajectory(taskSpec, {
+        finalAnswer: "The task is complete.",
+        toolResult: longToolOutput,
+      }),
+      taskSpec,
+    );
+
+    const firstCall = ask.mock.calls[0]?.[0];
+    expect(firstCall?.agentReasoning).toContain(longToolOutput);
+    expect(firstCall?.agentReasoning).not.toContain("Final answer:");
+    expect(firstCall?.answer).toBe("The task is complete.");
+  });
+
+  it("returns an evidence-insufficient legacy result for empty trajectories", async () => {
+    const taskSpec: TaskSpec = {
+      id: "empty",
+      instruction: "Complete the task",
+    };
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+
+    const result = await evaluator.verify(
+      makeEmptyTrajectory(taskSpec),
+      taskSpec,
+    );
+
+    expect(result).toMatchObject({
+      outcomeSuccess: false,
+      explanation:
+        "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
+      rawSteps: {
backend: "legacy", + legacyEvaluation: "INVALID", + screenshotCount: 0, + }, + }); + expect(result.processScore).toBeUndefined(); + expect(result.perCriterion).toBeUndefined(); + }); + + it("rejects invalid evaluator backend env values", () => { + const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; + process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; + + try { + expect(() => new V3Evaluator({} as V3)).toThrow( + 'Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"', + ); + } finally { + if (previousBackend === undefined) { + delete process.env.STAGEHAND_EVALUATOR_BACKEND; + } else { + process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; + } + } + }); +}); + +function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory { + return { + task: taskSpec, + steps: [], + status: "complete", + usage: { + input_tokens: 0, + output_tokens: 0, + }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + }; +} + +function makeTrajectory( + taskSpec: TaskSpec, + options: { + screenshot?: Buffer; + finalAnswer?: string; + toolResult?: unknown; + } = {}, +): Trajectory { + return { + ...makeEmptyTrajectory(taskSpec), + steps: [ + { + index: 0, + actionName: "act", + actionArgs: {}, + reasoning: "I completed the task.", + agentEvidence: { modalities: [] }, + probeEvidence: options.screenshot + ? { screenshot: options.screenshot } + : {}, + toolOutput: { + ok: true, + result: options.toolResult ?? "done", + }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + finalAnswer: options.finalAnswer, + }; +} diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts new file mode 100644 index 000000000..4b09e53a1 --- /dev/null +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -0,0 +1,150 @@ +import { mkdtemp, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import { + loadTrajectoryFromDisk, + nextResultFilename, + normalizeRubric, +} from "../../lib/v3/verifier/trajectory.js"; + +describe("verifier trajectory utilities", () => { + it("normalizes serialized empty earned points out of public rubrics", () => { + expect( + normalizeRubric({ + items: [ + { + criterion: "Criterion", + description: "Description", + max_points: 1, + earned_points: "", + }, + ], + }), + ).toEqual({ + items: [ + { + criterion: "Criterion", + description: "Description", + maxPoints: 1, + }, + ], + }); + }); + + it("round-trips serialized snake_case rubrics to public camelCase rubrics", () => { + expect( + normalizeRubric({ + items: [ + { + criterion: "Criterion", + description: "Description", + max_points: 3, + earned_points: "2", + condition: "Only if relevant", + justification: "Partial credit.", + }, + ], + }), + ).toEqual({ + items: [ + { + criterion: "Criterion", + description: "Description", + maxPoints: 3, + condition: "Only if relevant", + }, + ], + }); + }); + + it("loads trajectory screenshots and image modalities from disk", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const screenshot = Buffer.from("probe screenshot"); + const agentImage = Buffer.from("agent image"); + await writeFile(path.join(dir, "screenshot_1.png"), screenshot); + await writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify({ + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { 
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: {
+              modalities: [
+                {
+                  type: "image",
+                  mediaType: "image/png",
+                  bytesBase64: agentImage.toString("base64"),
+                },
+              ],
+            },
+            probeEvidence: { screenshotPath: "screenshot_1.png" },
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    const trajectory = await loadTrajectoryFromDisk(dir);
+    const modality = trajectory.steps[0].agentEvidence.modalities[0];
+
+    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(modality.type).toBe("image");
+    if (modality.type === "image") {
+      expect(modality.bytes).toEqual(agentImage);
+    }
+  });
+
+  it("rejects screenshot paths outside the trajectory directory", async () => {
+    const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
+    await writeFile(
+      path.join(dir, "trajectory.json"),
+      JSON.stringify({
+        task: { id: "task", instruction: "Do the task" },
+        status: "complete",
+        usage: { input_tokens: 0, output_tokens: 0 },
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: { modalities: [] },
+            probeEvidence: { screenshotPath: "../../../etc/passwd" },
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    await expect(loadTrajectoryFromDisk(dir)).rejects.toThrow(
+      "escapes trajectory directory",
+    );
+  });
+
+  it("sanitizes result filename labels", () => {
+    expect(nextResultFilename("rescore / task:one?")).toBe(
+      "result_rescore___task_one_.json",
+    );
+  });
+});
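End to end, the new surface supports the offline re-scoring loop described in the changeset: load a persisted run, verify it without a browser, and write the new score next to the original. A sketch under the assumption that the evaluator is wired to a configured Stagehand instance; the path and label are illustrative:

```ts
import { promises as fs } from "node:fs";
import path from "node:path";
import {
  Stagehand,
  V3Evaluator,
  loadTrajectoryFromDisk,
  nextResultFilename,
} from "@browserbasehq/stagehand";

// Offline re-score: no agent or browser runs; the evaluator reads only the
// saved trajectory. The directory layout matches the tree in types.ts.
const dir = ".trajectories/webtailbench/united_13"; // illustrative
const trajectory = await loadTrajectoryFromDisk(dir);

const stagehand = new Stagehand(/* ...project config... */);
const evaluator = new V3Evaluator(stagehand, { backend: "legacy" });
const result = await evaluator.verify(trajectory, trajectory.task);

// Live runs own scores/result.json; re-scores get a label-based sibling.
await fs.writeFile(
  path.join(dir, "scores", nextResultFilename("manual-rescore")),
  JSON.stringify(result, null, 2),
);
```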