From 5a7af3ca60b00ee34f0e61be969d87b7c3a2835e Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 13:42:51 -0700
Subject: [PATCH 01/14] feat(verifier): add verifier evaluator shell

---
 packages/core/lib/v3/index.ts                 |  23 ++
 packages/core/lib/v3/verifier/index.ts        |  30 ++
 packages/core/lib/v3/verifier/trajectory.ts   | 282 ++++++++++++++++++
 packages/core/lib/v3/verifier/verifier.ts     | 157 ++++++++++
 packages/core/lib/v3Evaluator.ts              | 245 ++++++++++++++-
 .../tests/unit/public-api/v3-core.test.ts     |  40 +++
 6 files changed, 776 insertions(+), 1 deletion(-)
 create mode 100644 packages/core/lib/v3/verifier/index.ts
 create mode 100644 packages/core/lib/v3/verifier/trajectory.ts
 create mode 100644 packages/core/lib/v3/verifier/verifier.ts

diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index ffb6726df..88f999e16 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -64,6 +64,29 @@ export type {
   V3EvaluatorConstructorOptions,
   V3EvaluatorOptions,
 } from "../v3Evaluator.js";
+export type {
+  Trajectory,
+  TrajectoryStep,
+  TrajectoryStatus,
+  TrajectoryUsage,
+  TaskSpec,
+  Rubric,
+  RubricCriterion,
+  AgentEvidence,
+  AgentEvidenceModality,
+  ProbeEvidence,
+  ToolOutput,
+  Verifier,
+  Verdict,
+  CriterionScore,
+  FirstPointOfFailure,
+  TaskValidity,
+  VerifierFinding,
+} from "./verifier/index.js";
+export {
+  loadTrajectoryFromDisk,
+  nextVerdictFilename,
+} from "./verifier/index.js";
 export { tool } from "ai";
 export { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 export { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
new file mode 100644
index 000000000..b04363458
--- /dev/null
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -0,0 +1,30 @@
+/**
+ * Public re-exports for the verifier subsystem.
+ *
+ * Wave 0 ships the trajectory + verdict types and a stub verifier. The
+ * RubricVerifier port (Wave 1+) stays internal until the prompts stabilize.
+ */
+export type {
+  Trajectory,
+  TrajectoryStep,
+  TrajectoryStatus,
+  TrajectoryUsage,
+  TaskSpec,
+  Rubric,
+  RubricCriterion,
+  AgentEvidence,
+  AgentEvidenceModality,
+  ProbeEvidence,
+  ToolOutput,
+} from "./trajectory.js";
+export { loadTrajectoryFromDisk, nextVerdictFilename } from "./trajectory.js";
+
+export type {
+  Verifier,
+  Verdict,
+  CriterionScore,
+  FirstPointOfFailure,
+  TaskValidity,
+  VerifierFinding,
+  StubVerdictReason,
+} from "./verifier.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
new file mode 100644
index 000000000..6912dbc74
--- /dev/null
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -0,0 +1,282 @@
+/**
+ * Trajectory — structured record of an agent run, consumed by the verifier.
+ *
+ * Trajectories are produced by the harness (TrajectoryRecorder in
+ * packages/evals) from the bus events emitted by v3AgentHandler /
+ * v3CuaAgentHandler. They are persisted on-disk in a layout matching
+ * microsoft/fara's example_trajectory/ (task_data.json + trajectory.json +
+ * screenshot_N.png + scores/) so we can cross-validate against
+ * CUAVerifierBench's verify_trajectories.py without format-conversion.
+ *
+ * Two evidence channels per step:
+ *   - agentEvidence ("tier 1") — what the agent's LLM consumed as the tool
+ *     result. For DOM/hybrid agents these are the tool returns (extract JSON,
+ *     ariaTree text, act describe-string, goto URL). For CUA this is the
+ *     screenshot the provider received.
+ *   - probeEvidence ("tier 2") — independent observations the harness took
+ *     around each step (page.screenshot, page.url, optionally a11y).
+ *
+ * The verifier consumes both. They can disagree; conflict resolution is the
+ * verifier's job (see Verdict.evidenceInsufficient + per-criterion logging).
+ */
+
+/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */
+export interface TrajectoryUsage {
+  input_tokens: number;
+  output_tokens: number;
+  reasoning_tokens?: number;
+  cached_input_tokens?: number;
+  inference_time_ms?: number;
+}
+
+/**
+ * A single criterion in a rubric. Mirrors fara's per-item schema:
+ *   { criterion, description, max_points, justification, earned_points }
+ * Conditional criteria carry an extra "condition" field; only counted when met.
+ */
+export interface RubricCriterion {
+  /** Short name of the criterion (e.g., "Add ground beef to cart"). */
+  criterion: string;
+  /** What to evaluate and how to award partial credit. */
+  description: string;
+  /** Maximum points for this criterion. */
+  max_points: number;
+  /**
+   * Triggering condition for conditional criteria. Only counted when met
+   * (paper's "Mutually Exclusive Conditionals" pattern).
+   */
+  condition?: string;
+  /** Filled by the verifier during scoring; empty in precomputed rubrics. */
+  justification?: string;
+  /**
+   * Filled by the verifier during scoring; empty string in unscored rubrics.
+   * Loose type to mirror fara's data, where unscored items carry "" and scored
+   * items carry a number.
+   */
+  earned_points?: number | string;
+}
+
+/** A rubric — list of criteria for a task. */
+export interface Rubric {
+  items: RubricCriterion[];
+}
+
+/**
+ * Spec for a single task being verified. Carried both at runtime (handed to
+ * agent.execute) and into the verifier alongside the trajectory.
+ */
+export interface TaskSpec {
+  /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */
+  id: string;
+  /** Task instruction shown to the agent. */
+  instruction: string;
+  /** Starting URL, if any. */
+  initUrl?: string;
+  /**
+   * Rubric carried by the dataset (e.g., WebTailBench's precomputed_rubric).
+   * If absent, the verifier generates one via Step 0a and caches under
+   * packages/evals/.rubric-cache/.
+   */
+  precomputedRubric?: Rubric;
+  /** Optional reference answer (set when dataset ships one). */
+  expectedAnswer?: string;
+}
+
+/**
+ * A single modality unit in tier-1 agent evidence. Mirrors the shape of
+ * ModelMessage content parts so we can reproduce what the LLM ingested.
+ */
+export type AgentEvidenceModality =
+  | { type: "text"; content: string }
+  | { type: "image"; bytes: Buffer; mediaType: string }
+  | { type: "json"; content: unknown };
+
+/**
+ * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the
+ * tool result for this step.
+ *
+ * Modes:
+ *   - CUA: usually a single image modality (the screenshot sent to the provider).
+ *   - Hybrid: tool result with optional screenshotBase64 → one image + one text.
+ *   - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.
+ */
+export interface AgentEvidence {
+  modalities: AgentEvidenceModality[];
+}
+
+/**
+ * Tier 2 — independent harness probes around this step. Cheap and always-on
+ * for v0 (screenshot), v0.5 (+url) and v1 (+a11y, +scroll). v2 adds
+ * verifier-requested probes keyed on the criterion that requested them.
+ *
+ * If a probe wasn't captured, the field is absent (not null).
+ */
+export interface ProbeEvidence {
+  /** v0.5 — URL after the step's tool execution. */
+  url?: string;
+  /**
+   * v0 — bus screenshot (page.screenshot post-step). Path on disk is preferred
+   * once persisted; in-memory Buffer is used during a live run.
+   */
+  screenshot?: Buffer;
+  /** Reference to the persisted screenshot file under the trajectory dir. */
+  screenshotPath?: string;
+  /** v1 — viewport scroll context. Lets the verifier reason about "did the agent see the full page". */
+  scroll?: { top: number; pageHeight: number };
+  /** v1 — accessibility tree snapshot. */
+  ariaTree?: string;
+  /** v2 — verifier-requested probes, keyed by criterion id. */
+  onDemand?: Record<string, unknown>;
+}
+
+/** Outcome of a single tool execution as seen by the harness. */
+export interface ToolOutput {
+  ok: boolean;
+  /**
+   * The tool's return value. Same payload that flowed into agentEvidence
+   * modalities, but in its native shape (e.g., the extract result, the act
+   * describe-string) rather than serialized for the LLM.
+   */
+  result: unknown;
+  error?: string;
+}
+
+/** One step in a trajectory: action + reasoning + evidence + outcome. */
+export interface TrajectoryStep {
+  index: number;
+  actionName: string;
+  actionArgs: Record<string, unknown>;
+  /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
+  reasoning: string;
+  agentEvidence: AgentEvidence;
+  probeEvidence: ProbeEvidence;
+  toolOutput: ToolOutput;
+  /** ISO 8601 timestamp when the step's tool execution started. */
+  startedAt: string;
+  /** ISO 8601 timestamp when the step's tool execution finished. */
+  finishedAt: string;
+}
+
+/** Terminal status of the agent run. */
+export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
+
+/**
+ * Full trajectory for one task run.
+ *
+ * The on-disk layout is one directory per task:
+ *
+ *   .trajectories/<run-id>/<task-id>/
+ *     ├── task_data.json    — TaskSpec + Verdict (filled on completion)
+ *     ├── trajectory.json   — this object, with screenshotPath instead of bytes
+ *     ├── screenshot_1.png  — probeEvidence.screenshot for step 1, etc.
+ *     ├── scores/
+ *     │   └── mmrubric_v1.json  — Verdict from V3Evaluator.verify()
+ *     ├── core.log          — action log mirroring fara's core.log
+ *     └── times.json        — step timing + token usage
+ */
+export interface Trajectory {
+  task: TaskSpec;
+  steps: TrajectoryStep[];
+  finalAnswer?: string;
+  status: TrajectoryStatus;
+  usage: TrajectoryUsage;
+  timing: { startedAt: string; endedAt: string };
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// On-disk loader
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Hydrate a Trajectory from the on-disk directory layout written by
+ * TrajectoryRecorder.persist(). Used by the offline re-scoring CLI (`bench
+ * verify`) and by any consumer that wants to feed a saved trajectory back
+ * into V3Evaluator.verify() without running an agent.
+ *
+ * Reverses the recorder's serialization tweaks:
+ *   - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`.
+ *   - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on
+ *     disk (human-readable JSON) instead of raw Buffer; we decode back.
+ *
+ * @param dir absolute or cwd-relative path to a `<run-id>/<task-id>/` directory.
+ */
+export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
+  const fs = await import("node:fs/promises");
+  const path = await import("node:path");
+
+  const trajectoryPath = path.join(dir, "trajectory.json");
+  const raw = await fs.readFile(trajectoryPath, "utf8");
+  const parsed = JSON.parse(raw) as Trajectory & {
+    steps: Array<
+      TrajectoryStep & {
+        agentEvidence: {
+          modalities: Array<
+            | { type: "text"; content: string }
+            | {
+                type: "image";
+                mediaType: string;
+                // On-disk form (recorder writes base64). `bytes` is tolerated
+                // for hand-edited fixtures, but only bytesBase64 is decoded.
+                bytes?: unknown;
+                bytesBase64?: string;
+              }
+            | { type: "json"; content: unknown }
+          >;
+        };
+        probeEvidence: ProbeEvidence;
+      }
+    >;
+  };
+
+  for (const step of parsed.steps) {
+    // Rehydrate tier-2 probe screenshot from its on-disk file reference.
+    const probe = step.probeEvidence;
+    if (probe?.screenshotPath && !probe.screenshot) {
+      const resolved = path.isAbsolute(probe.screenshotPath)
+        ? probe.screenshotPath
+        : path.join(dir, probe.screenshotPath);
+      try {
+        probe.screenshot = await fs.readFile(resolved);
+      } catch {
+        // Missing screenshot file: leave probe.screenshot unset. The verifier's
+        // evidence_insufficient path will handle it.
+      }
+    }
+
+    // Decode image modalities from base64 back to Buffer.
+    if (step.agentEvidence?.modalities) {
+      step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => {
+        // The on-disk shape carries bytesBase64 instead of bytes, so we look
+        // through `unknown` here rather than rely on the typed union.
+        const raw = m as unknown as { bytesBase64?: string };
+        if (m.type === "image" && typeof raw.bytesBase64 === "string") {
+          return {
+            type: "image" as const,
+            bytes: Buffer.from(raw.bytesBase64, "base64"),
+            mediaType: m.mediaType,
+          };
+        }
+        return m as AgentEvidenceModality;
+      });
+    }
+  }
+
+  return parsed;
+}
+
+/**
+ * Build a `mmrubric_*.json` filename for a new verdict; no directory is
+ * inspected. Used by offline re-scoring to avoid overwriting prior verdicts.
+ *
+ * Convention: prefer a label-based name (e.g., `mmrubric_rescore-2026-05-11.json`)
+ * over numeric versioning so multiple offline rescore attempts coexist without
+ * collisions and remain easy to diff. Falls back to `rescore-<ISO timestamp>`
+ * if no label is given; characters outside `[A-Za-z0-9._-]` become `_`.
+ */
+export function nextVerdictFilename(label?: string): string {
+  const safeLabel = (label ?? `rescore-${new Date().toISOString()}`).replace(
+    /[^A-Za-z0-9._-]/g,
+    "_",
+  );
+  return `mmrubric_${safeLabel}.json`;
+}
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts
new file mode 100644
index 000000000..49994774c
--- /dev/null
+++ b/packages/core/lib/v3/verifier/verifier.ts
@@ -0,0 +1,157 @@
+/**
+ * Verifier — interface and result types for the rubric-based verifier that
+ * replaces V3Evaluator's single-pass YES/NO judge.
+ *
+ * Modeled on microsoft/fara's MMRubricAgent (arxiv 2511.19663, "The Art of
+ * Building Verifiers for Computer Use Agents"). The verifier never touches a
+ * live browser — it consumes a Trajectory + TaskSpec and returns a structured
+ * Verdict. That property is what lets us re-score saved trajectories offline.
+ *
+ * Wave 0 ships only the types and a stub implementation (`evidence_insufficient`
+ * for everything). Wave 1 ports the MMRubricAgent pipeline (Steps 1–6 + Step 8).
+ */
+
+import type { Trajectory, TaskSpec } from "./trajectory.js";
+
+/** Score for a single rubric criterion after evidence analysis + rescoring. */
+export interface CriterionScore {
+  /** Matches RubricCriterion.criterion (the criterion's short name). */
+  criterion: string;
+  /** Maximum possible points for this criterion. */
+  maxPoints: number;
+  /**
+   * Points earned post-evidence-analysis (paper's post_image_earned_points).
+   * Null if the criterion was conditional and its condition wasn't met (excluded
+   * from both numerator and denominator in the process score).
+   */
+  earnedPoints: number | null;
+  /** Verifier's free-text justification for the score. */
+  justification: string;
+  /**
+   * True if the criterion is conditional and its condition was determined to
+   * be met. Absent for non-conditional criteria.
+   */
+  conditionMet?: boolean;
+  /**
+   * Set when the verifier had no evidence to ground this criterion in either
+   * tier. Per paper §2, treated as uncontrollable failure → full credit, but
+   * surfaced here so dashboards can flag low-confidence verdicts.
+   */
+  evidenceInsufficient?: boolean;
+}
+
+/**
+ * First-point-of-failure analysis (paper Step 9a). Identifies the earliest
+ * step where the agent's trajectory went off-track, using a structured error
+ * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).
+ */
+export interface FirstPointOfFailure {
+  stepIndex: number;
+  /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */
+  errorCode: string;
+  /** Top-level category name (Selection, Hallucination, etc.). */
+  category: string;
+  /** Verifier's reasoning for selecting this point. */
+  description?: string;
+}
+
+/**
+ * Structured observation surfaced by the verifier that another agent or
+ * tooling could act on. Findings are emitted opportunistically by Step 8
+ * (outcome verification) when the verifier notices actionable patterns —
+ * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
+ *
+ * Not produced for every task: when nothing actionable surfaces, the
+ * `findings` array on the Verdict is empty. Consumers should treat the
+ * field as advisory, not as part of the formal score.
+ */
+export interface VerifierFinding {
+  /**
+   * Category of the observation. Open-ended enum — additional categories may
+   * be added as Wave 2/3 verifier steps surface new failure modes.
+   */
+  category:
+    | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries)
+    | "agent_strategy" // higher-level planning / decision-making problems
+    | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory
+    | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps)
+    | "task_specification" // task instruction was ambiguous / under- or over-specified
+    | "verifier_uncertainty" // verifier itself couldn't confidently decide
+    | "other";
+  /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */
+  severity: "info" | "warning" | "blocking";
+  /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */
+  description: string;
+  /**
+   * Optional concrete next action another agent could take. Should be
+   * specific enough that it can be acted on without further reasoning —
+   * e.g., "Try double_click instead of triple_click to clear placeholder
+   * text on this form field."
+   */
+  suggestedAction?: string;
+  /** Step indices in the trajectory where this pattern showed up. */
+  relatedSteps?: number[];
+}
+
+/** Task-validity classification (paper Step 10). */
+export interface TaskValidity {
+  /** True if the task is underspecified / has multiple valid interpretations. */
+  isAmbiguous: boolean;
+  /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */
+  isInvalid: boolean;
+  /** Optional sub-codes from the task-classification taxonomy. */
+  ambiguityCodes?: string[];
+  invalidTaskCodes?: string[];
+}
+
+/**
+ * The verifier's output. Process score + outcome verdict + diagnostic signals.
+ *
+ * Process and outcome are deliberately independent (paper §2): an agent can
+ * follow the right steps but get blocked (high process, low outcome), or
+ * succeed through an unexpected path (variable process, high outcome).
+ */
+export interface Verdict {
+  /** Step 8 — did the agent accomplish the task from the user's perspective? */
+  outcomeSuccess: boolean;
+  /** Aggregated earned/max across applicable criteria, in [0, 1]. */
+  processScore: number;
+  /** Per-criterion breakdown after rescoring. */
+  perCriterion: CriterionScore[];
+  /** Step 9a — first step where the trajectory went off-track, if any. */
+  firstPointOfFailure?: FirstPointOfFailure;
+  /** Step 10 — task-itself ambiguity / validity. */
+  taskValidity: TaskValidity;
+  /**
+   * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
+   * evidence resolved the question. Treated as uncontrollable → full credit,
+   * but flagged here so consumers can decide whether to discount the score.
+   */
+  evidenceInsufficient: string[];
+  /**
+   * Structured observations from the verifier that a downstream tool or
+   * follow-up agent could act on. Opportunistic — empty when the verifier
+   * doesn't notice anything actionable. Not part of the score; advisory.
+   */
+  findings?: VerifierFinding[];
+  /**
+   * Intermediate per-step data — the paper's intermediate_mm_rubric_steps
+   * payload. Opaque shape; useful for debugging and prompt iteration, but not
+   * part of the stable contract.
+   */
+  rawSteps?: unknown;
+}
+
+/** Reason a stub verifier emits when the rubric pipeline hasn't shipped yet. */
+export type StubVerdictReason =
+  | "wave-0-stub"
+  | "no-rubric"
+  | "empty-trajectory";
+
+/**
+ * Verifier interface. Implementations consume a Trajectory + TaskSpec and
+ * return a Verdict — they MUST NOT touch a live browser.
+ */
+export interface Verifier {
+  verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<Verdict>;
+}
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index e1d384f8c..73009b0ad 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -14,6 +14,15 @@ import type {
 import { V3 } from "./v3/v3.js";
 import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
 import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js";
+import type {
+  Trajectory,
+  TaskSpec,
+  Verdict,
+  Rubric,
+  Verifier,
+  AgentEvidenceModality,
+  VerifierFinding,
+} from "./v3/verifier/index.js";
 
 const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND";
 const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy";
@@ -43,7 +52,7 @@ type NormalizedConstructorOptions = {
   backend?: V3EvaluatorBackend;
 };
 
-export class V3Evaluator {
+export class V3Evaluator implements Verifier {
   private readonly backend: V3EvaluatorBackend;
   private readonly legacyEvaluator: LegacyV3Evaluator;
 
@@ -75,15 +84,70 @@ export class V3Evaluator {
     return this.getLegacyBackend("batchAsk").batchAsk(options);
   }
 
+  async verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<Verdict> {
+    assertVerifierInput(trajectory, taskSpec);
+
+    if (this.backend === "legacy") {
+      return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec);
+    }
+
+    return this.unavailableVerifierBackend("verify");
+  }
+
+  async generateRubric(taskSpec: TaskSpec): Promise<Rubric> {
+    if (!taskSpec?.id) {
+      throw new StagehandInvalidArgumentError(
+        "TaskSpec.id is required for rubric generation",
+      );
+    }
+
+    if (this.backend === "verifier") {
+      return this.unavailableVerifierBackend("generateRubric");
+    }
+
+    return {
+      items: [legacyTaskCompletionCriterion(taskSpec)],
+    };
+  }
+
   private getLegacyBackend(methodName: string): LegacyV3Evaluator {
     if (this.backend === "legacy") {
       return this.legacyEvaluator;
     }
 
+    return this.unavailableVerifierBackend(methodName);
+  }
+
+  private unavailableVerifierBackend(methodName: string): never {
     throw new StagehandInvalidArgumentError(
       `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`,
     );
   }
+
+  private async verifyTrajectoryWithLegacyEvaluator(
+    trajectory: Trajectory,
+    taskSpec: TaskSpec,
+  ): Promise<Verdict> {
+    const screenshots = collectLegacyScreenshots(trajectory);
+    const agentReasoning = renderLegacyAgentReasoning(trajectory);
+    const answer = trajectory.finalAnswer;
+
+    if (!screenshots.length && !answer) {
+      return legacyInsufficientEvidenceVerdict(
+        taskSpec,
+        "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
+      );
+    }
+
+    const result = await this.legacyEvaluator.ask({
+      question: taskSpec.instruction,
+      screenshot: screenshots.length ? screenshots : false,
+      answer,
+      agentReasoning,
+    });
+
+    return legacyEvaluationToVerdict(result, taskSpec, screenshots.length);
+  }
 }
 
 function normalizeConstructorOptions(
@@ -127,3 +191,182 @@ function resolveEvaluatorBackend(
     `Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`,
   );
 }
+
+function assertVerifierInput(trajectory: Trajectory, taskSpec: TaskSpec): void {
+  if (!taskSpec?.id) {
+    throw new StagehandInvalidArgumentError(
+      "TaskSpec.id is required for verification",
+    );
+  }
+  if (!trajectory) {
+    throw new StagehandInvalidArgumentError(
+      "Trajectory is required for verification",
+    );
+  }
+}
+
+function legacyTaskCompletionCriterion(taskSpec: TaskSpec) {
+  return {
+    criterion: "legacy-task-completion",
+    description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`,
+    max_points: 1,
+  };
+}
+
+function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] {
+  const screenshots: Buffer[] = [];
+
+  for (const step of trajectory.steps ?? []) {
+    if (Buffer.isBuffer(step.probeEvidence?.screenshot)) {
+      screenshots.push(step.probeEvidence.screenshot);
+      continue;
+    }
+
+    const agentImage = step.agentEvidence?.modalities?.find(
+      (
+        modality,
+      ): modality is Extract<AgentEvidenceModality, { type: "image" }> =>
+        modality.type === "image" && Buffer.isBuffer(modality.bytes),
+    );
+
+    if (agentImage) {
+      screenshots.push(agentImage.bytes);
+    }
+  }
+
+  return screenshots;
+}
+
+function renderLegacyAgentReasoning(
+  trajectory: Trajectory,
+): string | undefined {
+  const stepLines = (trajectory.steps ?? []).map((step) => {
+    const output = step.toolOutput?.error
+      ? `Tool error: ${step.toolOutput.error}`
+      : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;
+    return [
+      `Step ${step.index}: ${step.actionName}`,
+      step.reasoning ? `Reasoning: ${step.reasoning}` : undefined,
+      output,
+    ]
+      .filter(Boolean)
+      .join("\n");
+  });
+
+  const sections = [
+    stepLines.length
+      ? `Agent trajectory:\n${stepLines.join("\n\n")}`
+      : undefined,
+    trajectory.finalAnswer
+      ? `Final answer:\n${trajectory.finalAnswer}`
+      : undefined,
+  ].filter(Boolean);
+
+  if (!sections.length) {
+    return undefined;
+  }
+
+  return truncateForPrompt(sections.join("\n\n"), 16000);
+}
+
+function stringifyForPrompt(value: unknown): string {
+  if (typeof value === "string") {
+    return truncateForPrompt(value, 2000);
+  }
+
+  try {
+    return truncateForPrompt(JSON.stringify(value), 2000);
+  } catch {
+    return String(value);
+  }
+}
+
+function truncateForPrompt(value: string, maxLength: number): string {
+  if (value.length <= maxLength) {
+    return value;
+  }
+
+  return `${value.slice(0, maxLength)}... [truncated]`;
+}
+
+function legacyEvaluationToVerdict(
+  result: EvaluationResult,
+  taskSpec: TaskSpec,
+  screenshotCount: number,
+): Verdict {
+  const outcomeSuccess = result.evaluation === "YES";
+  const invalid = result.evaluation === "INVALID";
+  const criterion = legacyTaskCompletionCriterion(taskSpec);
+  const findings: VerifierFinding[] = invalid
+    ? [
+        {
+          category: "verifier_uncertainty",
+          severity: "warning",
+          description: result.reasoning,
+        },
+      ]
+    : [];
+
+  return {
+    outcomeSuccess,
+    processScore: outcomeSuccess ? 1 : 0,
+    perCriterion: [
+      {
+        criterion: criterion.criterion,
+        maxPoints: criterion.max_points,
+        earnedPoints: outcomeSuccess ? 1 : 0,
+        justification: result.reasoning,
+        evidenceInsufficient: invalid,
+      },
+    ],
+    taskValidity: {
+      isAmbiguous: false,
+      isInvalid: false,
+    },
+    evidenceInsufficient: invalid ? [criterion.criterion] : [],
+    findings,
+    rawSteps: {
+      backend: "legacy",
+      legacyEvaluation: result.evaluation,
+      screenshotCount,
+    },
+  };
+}
+
+function legacyInsufficientEvidenceVerdict(
+  taskSpec: TaskSpec,
+  reason: string,
+): Verdict {
+  const criterion = legacyTaskCompletionCriterion(taskSpec);
+
+  return {
+    outcomeSuccess: false,
+    processScore: 0,
+    perCriterion: [
+      {
+        criterion: criterion.criterion,
+        maxPoints: criterion.max_points,
+        earnedPoints: 0,
+        justification: reason,
+        evidenceInsufficient: true,
+      },
+    ],
+    taskValidity: {
+      isAmbiguous: false,
+      isInvalid: false,
+    },
+    evidenceInsufficient: [criterion.criterion],
+    findings: [
+      {
+        category: "trajectory_capture",
+        severity: "blocking",
+        description: reason,
+      },
+    ],
+    rawSteps: {
+      backend: "legacy",
+      legacyEvaluation: "INVALID",
+      screenshotCount: 0,
+    },
+  };
+}
diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts
index 8d710da4d..d1975a48c 100644
--- a/packages/core/tests/unit/public-api/v3-core.test.ts
+++ b/packages/core/tests/unit/public-api/v3-core.test.ts
@@ -134,6 +134,18 @@ describe("V3 Core public API types", () => {
       >();
     });
 
+    it("has verifier facade methods", () => {
+      expectTypeOf<V3EvaluatorInstance["verify"]>().toExtend<
+        (
+          trajectory: Stagehand.Trajectory,
+          taskSpec: Stagehand.TaskSpec,
+        ) => Promise<Stagehand.Verdict>
+      >();
+      expectTypeOf<V3EvaluatorInstance["generateRubric"]>().toExtend<
+        (taskSpec: Stagehand.TaskSpec) => Promise<Stagehand.Rubric>
+      >();
+    });
+
     it("accepts legacy evaluator backend options", () => {
       const mockV3 = {} as Stagehand.Stagehand;
       expectTypeOf<typeof Stagehand.V3Evaluator>().toBeConstructibleWith(
@@ -156,6 +168,34 @@ describe("V3 Core public API types", () => {
       );
     });
 
+    it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => {
+      const taskSpec: Stagehand.TaskSpec = {
+        id: "empty",
+        instruction: "Complete the task",
+      };
+      const trajectory: Stagehand.Trajectory = {
+        task: taskSpec,
+        steps: [],
+        status: "complete",
+        usage: {
+          input_tokens: 0,
+          output_tokens: 0,
+        },
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+      };
+      const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, {
+        backend: "legacy",
+      });
+
+      const verdict = await evaluator.verify(trajectory, taskSpec);
+
+      expect(verdict.outcomeSuccess).toBe(false);
+      expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]);
+    });
+
     it("rejects invalid evaluator backend env values", () => {
       const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND;
       process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend";

From fadc5a8b4246497a31c6ba9ffdaf35764bacfbc0 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:16:05 -0700
Subject: [PATCH 02/14] fix(verifier): normalize public rubric naming

---
 .changeset/verifier-evaluator-shell.md      |  5 ++
 packages/core/lib/v3/index.ts               |  5 ++
 packages/core/lib/v3/verifier/index.ts      | 10 ++-
 packages/core/lib/v3/verifier/trajectory.ts | 75 +++++++++++++++++----
 packages/core/lib/v3/verifier/verifier.ts   | 25 +++++--
 packages/core/lib/v3Evaluator.ts            |  6 +-
 6 files changed, 104 insertions(+), 22 deletions(-)
 create mode 100644 .changeset/verifier-evaluator-shell.md

diff --git a/.changeset/verifier-evaluator-shell.md b/.changeset/verifier-evaluator-shell.md
new file mode 100644
index 000000000..8e603b499
--- /dev/null
+++ b/.changeset/verifier-evaluator-shell.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add verifier trajectory, rubric, and verdict types with normalized public naming.
diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index 88f999e16..8fdcc6b75 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -72,6 +72,9 @@ export type {
   TaskSpec,
   Rubric,
   RubricCriterion,
+  SerializedRubric,
+  SerializedRubricCriterion,
+  RubricInput,
   AgentEvidence,
   AgentEvidenceModality,
   ProbeEvidence,
@@ -82,10 +85,12 @@ export type {
   FirstPointOfFailure,
   TaskValidity,
   VerifierFinding,
+  VerifierRawSteps,
 } from "./verifier/index.js";
 export {
   loadTrajectoryFromDisk,
   nextVerdictFilename,
+  normalizeRubric,
 } from "./verifier/index.js";
 export { tool } from "ai";
 export { getAISDKLanguageModel } from "./llm/LLMProvider.js";
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index b04363458..b39c94488 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -12,12 +12,19 @@ export type {
   TaskSpec,
   Rubric,
   RubricCriterion,
+  SerializedRubric,
+  SerializedRubricCriterion,
+  RubricInput,
   AgentEvidence,
   AgentEvidenceModality,
   ProbeEvidence,
   ToolOutput,
 } from "./trajectory.js";
-export { loadTrajectoryFromDisk, nextVerdictFilename } from "./trajectory.js";
+export {
+  loadTrajectoryFromDisk,
+  nextVerdictFilename,
+  normalizeRubric,
+} from "./trajectory.js";
 
 export type {
   Verifier,
@@ -26,5 +33,6 @@ export type {
   FirstPointOfFailure,
   TaskValidity,
   VerifierFinding,
+  VerifierRawSteps,
   StubVerdictReason,
 } from "./verifier.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 6912dbc74..d8228c287 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -3,10 +3,8 @@
  *
  * Trajectories are produced by the harness (TrajectoryRecorder in
  * packages/evals) from the bus events emitted by v3AgentHandler /
- * v3CuaAgentHandler. They are persisted on-disk in a layout matching
- * microsoft/fara's example_trajectory/ (task_data.json + trajectory.json +
- * screenshot_N.png + scores/) so we can cross-validate against
- * CUAVerifierBench's verify_trajectories.py without format-conversion.
+ * v3CuaAgentHandler. They can be persisted on disk and reloaded for offline
+ * verifier scoring.
  *
  * Two evidence channels per step:
  *   - agentEvidence ("tier 1") — what the agent's LLM consumed as the tool
@@ -30,9 +28,9 @@ export interface TrajectoryUsage {
 }
 
 /**
- * A single criterion in a rubric. Mirrors fara's per-item schema:
- *   { criterion, description, max_points, justification, earned_points }
- * Conditional criteria carry an extra "condition" field; only counted when met.
+ * A single criterion in a Stagehand rubric. Dataset and model wire formats may
+ * use fara-style `max_points` / `earned_points`; normalize those with
+ * `normalizeRubric()` at the boundary.
  */
 export interface RubricCriterion {
   /** Short name of the criterion (e.g., "Add ground beef to cart"). */
@@ -40,7 +38,7 @@ export interface RubricCriterion {
   /** What to evaluate and how to award partial credit. */
   description: string;
   /** Maximum points for this criterion. */
-  max_points: number;
+  maxPoints: number;
   /**
    * Triggering condition for conditional criteria. Only counted when met
    * (paper's "Mutually Exclusive Conditionals" pattern).
@@ -49,11 +47,10 @@ export interface RubricCriterion {
   /** Filled by the verifier during scoring; empty in precomputed rubrics. */
   justification?: string;
   /**
-   * Filled by the verifier during scoring; empty string in unscored rubrics.
-   * Loose type to mirror fara's data, where unscored items carry "" and scored
-   * items carry a number.
+   * Filled by the verifier during scoring; empty string in some serialized
+   * upstream rubrics and a number in scored rubrics.
    */
-  earned_points?: number | string;
+  earnedPoints?: number | string;
 }
 
 /** A rubric — list of criteria for a task. */
@@ -61,6 +58,60 @@ export interface Rubric {
   items: RubricCriterion[];
 }
 
+/**
+ * FARA/upstream rubric item shape as stored in datasets and prompt responses.
+ * Keep this at IO boundaries; core verifier types use camelCase.
+ */
+export interface SerializedRubricCriterion {
+  criterion: string;
+  description: string;
+  max_points: number;
+  condition?: string;
+  justification?: string;
+  earned_points?: number | string;
+}
+
+/** Serialized rubric shape used by upstream datasets and generated JSON. */
+export interface SerializedRubric {
+  items: SerializedRubricCriterion[];
+}
+
+export type RubricInput = Rubric | SerializedRubric;
+
+/** Convert a Stagehand or serialized rubric into the public Stagehand shape. */
+export function normalizeRubric(
+  rubric: RubricInput | null | undefined,
+): Rubric | undefined {
+  if (!rubric) return undefined;
+
+  return {
+    items: rubric.items.map((item) => {
+      const raw = item as RubricCriterion &
+        Partial<SerializedRubricCriterion>;
+      const maxPoints =
+        typeof raw.maxPoints === "number" ? raw.maxPoints : raw.max_points;
+
+      if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) {
+        throw new TypeError(
+          `Rubric criterion "${raw.criterion}" is missing a numeric maxPoints value`,
+        );
+      }
+
+      const earnedPoints = raw.earnedPoints ?? raw.earned_points;
+      return {
+        criterion: raw.criterion,
+        description: raw.description,
+        maxPoints,
+        ...(raw.condition !== undefined && { condition: raw.condition }),
+        ...(raw.justification !== undefined && {
+          justification: raw.justification,
+        }),
+        ...(earnedPoints !== undefined && { earnedPoints }),
+      };
+    }),
+  };
+}
+
 /**
  * Spec for a single task being verified. Carried both at runtime (handed to
  * agent.execute) and into the verifier alongside the trajectory.
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts
index 49994774c..659675486 100644
--- a/packages/core/lib/v3/verifier/verifier.ts
+++ b/packages/core/lib/v3/verifier/verifier.ts
@@ -93,6 +93,23 @@ export interface VerifierFinding {
   relatedSteps?: number[];
 }
 
+/** Stable debugging summary emitted by verifier backends. */
+export interface VerifierRawSteps {
+  backend?: "legacy" | "verifier";
+  primaryIntent?: string;
+  reasoning?: string;
+  rubricSource?: "precomputed" | "generated" | "none";
+  approach?: "a" | "b";
+  optionalsMode?: "folded" | "separate" | "skip";
+  totalEarned?: number;
+  totalMax?: number;
+  evidenceImages?: number;
+  evidenceTexts?: number;
+  evidenceOriginalScreenshots?: number;
+  legacyEvaluation?: string;
+  screenshotCount?: number;
+}
+
 /** Task-validity classification (paper Step 10). */
 export interface TaskValidity {
   /** True if the task is underspecified / has multiple valid interpretations. */
@@ -134,12 +151,8 @@ export interface Verdict {
    * doesn't notice anything actionable. Not part of the score; advisory.
    */
   findings?: VerifierFinding[];
-  /**
-   * Intermediate per-step data — the paper's intermediate_mm_rubric_steps
-   * payload. Opaque shape; useful for debugging and prompt iteration, but not
-   * part of the stable contract.
-   */
-  rawSteps?: unknown;
+  /** Debugging summary from the active evaluator backend. */
+  rawSteps?: VerifierRawSteps;
 }
 
 /** Reason a stub verifier emits when the rubric pipeline hasn't shipped yet. */
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index 73009b0ad..8adc7d5f1 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -209,7 +209,7 @@ function legacyTaskCompletionCriterion(taskSpec: TaskSpec) {
   return {
     criterion: "legacy-task-completion",
     description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`,
-    max_points: 1,
+    maxPoints: 1,
   };
 }
 
@@ -313,7 +313,7 @@ function legacyEvaluationToVerdict(
     perCriterion: [
       {
         criterion: criterion.criterion,
-        maxPoints: criterion.max_points,
+        maxPoints: criterion.maxPoints,
         earnedPoints: outcomeSuccess ? 1 : 0,
         justification: result.reasoning,
         evidenceInsufficient: invalid,
@@ -345,7 +345,7 @@ function legacyInsufficientEvidenceVerdict(
     perCriterion: [
       {
         criterion: criterion.criterion,
-        maxPoints: criterion.max_points,
+        maxPoints: criterion.maxPoints,
         earnedPoints: 0,
         justification: reason,
         evidenceInsufficient: true,

From 2765781ad3aa90175578f9518bd51a78b1b42f0b Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:21:01 -0700
Subject: [PATCH 03/14] style(verifier): format rubric normalizer

---
 packages/core/lib/v3/verifier/trajectory.ts | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index d8228c287..d992ce9ee 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -86,8 +86,7 @@ export function normalizeRubric(
 
   return {
     items: rubric.items.map((item) => {
-      const raw = item as RubricCriterion &
-        Partial<SerializedRubricCriterion>;
+      const raw = item as RubricCriterion & Partial<SerializedRubricCriterion>;
       const maxPoints =
         typeof raw.maxPoints === "number" ? raw.maxPoints : raw.max_points;
 

From 0088a3c3c62fa4d8ffa18afbc6cd1e863716f256 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:34:48 -0700
Subject: [PATCH 04/14] chore(verifier): remove upstream verifier references

---
 packages/core/lib/v3/verifier/trajectory.ts |  6 +++---
 packages/core/lib/v3/verifier/verifier.ts   | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index d992ce9ee..6fb604d11 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -29,7 +29,7 @@ export interface TrajectoryUsage {
 
 /**
  * A single criterion in a Stagehand rubric. Dataset and model wire formats may
- * use fara-style `max_points` / `earned_points`; normalize those with
+ * use serialized `max_points` / `earned_points`; normalize those with
  * `normalizeRubric()` at the boundary.
  */
 export interface RubricCriterion {
@@ -59,7 +59,7 @@ export interface Rubric {
 }
 
 /**
- * FARA/upstream rubric item shape as stored in datasets and prompt responses.
+ * Serialized rubric item shape as stored in datasets and prompt responses.
  * Keep this at IO boundaries; core verifier types use camelCase.
  */
 export interface SerializedRubricCriterion {
@@ -221,7 +221,7 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
  *     ├── screenshot_1.png  — probeEvidence.screenshot for step 1, etc.
  *     ├── scores/
  *     │   └── mmrubric_v1.json  — Verdict from V3Evaluator.verify()
- *     ├── core.log          — action log mirroring fara's core.log
+ *     ├── core.log          — captured action log
  *     └── times.json        — step timing + token usage
  */
 export interface Trajectory {
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts
index 659675486..02461d77e 100644
--- a/packages/core/lib/v3/verifier/verifier.ts
+++ b/packages/core/lib/v3/verifier/verifier.ts
@@ -2,13 +2,13 @@
  * Verifier — interface and result types for the rubric-based verifier that
  * replaces V3Evaluator's single-pass YES/NO judge.
  *
- * Modeled on microsoft/fara's MMRubricAgent (arxiv 2511.19663, "The Art of
- * Building Verifiers for Computer Use Agents"). The verifier never touches a
- * live browser — it consumes a Trajectory + TaskSpec and returns a structured
- * Verdict. That property is what lets us re-score saved trajectories offline.
+ * Modeled on rubric-based verifier pipelines for computer-use agents. The
+ * verifier never touches a live browser — it consumes a Trajectory + TaskSpec
+ * and returns a structured Verdict. That property is what lets us re-score
+ * saved trajectories offline.
  *
  * Wave 0 ships only the types and a stub implementation (`evidence_insufficient`
- * for everything). Wave 1 ports the MMRubricAgent pipeline (Steps 1–6 + Step 8).
+ * for everything). Wave 1 adds the rubric generation/scoring pipeline.
  */
 
 import type { Trajectory, TaskSpec } from "./trajectory.js";

From 5b4479549fbf79e7207da4b850fab4062c6d5695 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 15:18:50 -0700
Subject: [PATCH 05/14] docs(verifier): remove rollout comments from public
 types

---
 packages/core/lib/v3/verifier/index.ts    | 3 ---
 packages/core/lib/v3/verifier/verifier.ts | 5 +----
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index b39c94488..144ab033b 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -1,8 +1,5 @@
 /**
  * Public re-exports for the verifier subsystem.
- *
- * Wave 0 ships the trajectory + verdict types and a stub verifier. The
- * RubricVerifier port (Wave 1+) stays internal until the prompts stabilize.
  */
 export type {
   Trajectory,
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts
index 02461d77e..3ae764b2a 100644
--- a/packages/core/lib/v3/verifier/verifier.ts
+++ b/packages/core/lib/v3/verifier/verifier.ts
@@ -6,9 +6,6 @@
  * verifier never touches a live browser — it consumes a Trajectory + TaskSpec
  * and returns a structured Verdict. That property is what lets us re-score
  * saved trajectories offline.
- *
- * Wave 0 ships only the types and a stub implementation (`evidence_insufficient`
- * for everything). Wave 1 adds the rubric generation/scoring pipeline.
  */
 
 import type { Trajectory, TaskSpec } from "./trajectory.js";
@@ -68,7 +65,7 @@ export interface FirstPointOfFailure {
 export interface VerifierFinding {
   /**
    * Category of the observation. Open-ended enum — additional categories may
-   * be added as Wave 2/3 verifier steps surface new failure modes.
+   * be added as verifier backends surface new failure modes.
    */
   category:
     | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries)

From 5e883781bc5e5b925e6a56929b38eaa26beab83e Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 15:22:43 -0700
Subject: [PATCH 06/14] refactor(verifier): consolidate public types

---
 packages/core/lib/v3/verifier/index.ts      |  39 ++-
 packages/core/lib/v3/verifier/trajectory.ts | 227 ++-----------
 packages/core/lib/v3/verifier/types.ts      | 339 ++++++++++++++++++++
 packages/core/lib/v3/verifier/verifier.ts   | 177 +---------
 4 files changed, 393 insertions(+), 389 deletions(-)
 create mode 100644 packages/core/lib/v3/verifier/types.ts

diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index 144ab033b..1b76eb388 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -2,34 +2,31 @@
  * Public re-exports for the verifier subsystem.
  */
 export type {
-  Trajectory,
-  TrajectoryStep,
-  TrajectoryStatus,
-  TrajectoryUsage,
-  TaskSpec,
+  AgentEvidence,
+  AgentEvidenceModality,
+  CriterionScore,
+  FirstPointOfFailure,
+  ProbeEvidence,
   Rubric,
   RubricCriterion,
+  RubricInput,
   SerializedRubric,
   SerializedRubricCriterion,
-  RubricInput,
-  AgentEvidence,
-  AgentEvidenceModality,
-  ProbeEvidence,
+  StubVerdictReason,
+  TaskSpec,
+  TaskValidity,
   ToolOutput,
-} from "./trajectory.js";
+  Trajectory,
+  TrajectoryStatus,
+  TrajectoryStep,
+  TrajectoryUsage,
+  Verdict,
+  Verifier,
+  VerifierFinding,
+  VerifierRawSteps,
+} from "./types.js";
 export {
   loadTrajectoryFromDisk,
   nextVerdictFilename,
   normalizeRubric,
 } from "./trajectory.js";
-
-export type {
-  Verifier,
-  Verdict,
-  CriterionScore,
-  FirstPointOfFailure,
-  TaskValidity,
-  VerifierFinding,
-  VerifierRawSteps,
-  StubVerdictReason,
-} from "./verifier.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 6fb604d11..3dbb7e5a6 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -1,82 +1,29 @@
-/**
- * Trajectory — structured record of an agent run, consumed by the verifier.
- *
- * Trajectories are produced by the harness (TrajectoryRecorder in
- * packages/evals) from the bus events emitted by v3AgentHandler /
- * v3CuaAgentHandler. They can be persisted on disk and reloaded for offline
- * verifier scoring.
- *
- * Two evidence channels per step:
- *   - agentEvidence ("tier 1") — what the agent's LLM consumed as the tool
- *     result. For DOM/hybrid agents these are the tool returns (extract JSON,
- *     ariaTree text, act describe-string, goto URL). For CUA this is the
- *     screenshot the provider received.
- *   - probeEvidence ("tier 2") — independent observations the harness took
- *     around each step (page.screenshot, page.url, optionally a11y).
- *
- * The verifier consumes both. They can disagree; conflict resolution is the
- * verifier's job (see Verdict.evidenceInsufficient + per-criterion logging).
- */
-
-/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */
-export interface TrajectoryUsage {
-  input_tokens: number;
-  output_tokens: number;
-  reasoning_tokens?: number;
-  cached_input_tokens?: number;
-  inference_time_ms?: number;
-}
-
-/**
- * A single criterion in a Stagehand rubric. Dataset and model wire formats may
- * use serialized `max_points` / `earned_points`; normalize those with
- * `normalizeRubric()` at the boundary.
- */
-export interface RubricCriterion {
-  /** Short name of the criterion (e.g., "Add ground beef to cart"). */
-  criterion: string;
-  /** What to evaluate and how to award partial credit. */
-  description: string;
-  /** Maximum points for this criterion. */
-  maxPoints: number;
-  /**
-   * Triggering condition for conditional criteria. Only counted when met
-   * (paper's "Mutually Exclusive Conditionals" pattern).
-   */
-  condition?: string;
-  /** Filled by the verifier during scoring; empty in precomputed rubrics. */
-  justification?: string;
-  /**
-   * Filled by the verifier during scoring; empty string in some serialized
-   * upstream rubrics and a number in scored rubrics.
-   */
-  earnedPoints?: number | string;
-}
-
-/** A rubric — list of criteria for a task. */
-export interface Rubric {
-  items: RubricCriterion[];
-}
-
-/**
- * Serialized rubric item shape as stored in datasets and prompt responses.
- * Keep this at IO boundaries; core verifier types use camelCase.
- */
-export interface SerializedRubricCriterion {
-  criterion: string;
-  description: string;
-  max_points: number;
-  condition?: string;
-  justification?: string;
-  earned_points?: number | string;
-}
-
-/** Serialized rubric shape used by upstream datasets and generated JSON. */
-export interface SerializedRubric {
-  items: SerializedRubricCriterion[];
-}
-
-export type RubricInput = Rubric | SerializedRubric;
+import type {
+  ProbeEvidence,
+  Rubric,
+  RubricCriterion,
+  RubricInput,
+  SerializedRubricCriterion,
+  Trajectory,
+  TrajectoryStep,
+} from "./types.js";
+
+export type {
+  AgentEvidence,
+  AgentEvidenceModality,
+  ProbeEvidence,
+  Rubric,
+  RubricCriterion,
+  RubricInput,
+  SerializedRubric,
+  SerializedRubricCriterion,
+  TaskSpec,
+  ToolOutput,
+  Trajectory,
+  TrajectoryStatus,
+  TrajectoryStep,
+  TrajectoryUsage,
+} from "./types.js";
 
 /** Convert a Stagehand or serialized rubric into the public Stagehand shape. */
 export function normalizeRubric(
@@ -111,128 +58,6 @@ export function normalizeRubric(
   };
 }
 
-/**
- * Spec for a single task being verified. Carried both at runtime (handed to
- * agent.execute) and into the verifier alongside the trajectory.
- */
-export interface TaskSpec {
-  /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */
-  id: string;
-  /** Task instruction shown to the agent. */
-  instruction: string;
-  /** Starting URL, if any. */
-  initUrl?: string;
-  /**
-   * Rubric carried by the dataset (e.g., WebTailBench's precomputed_rubric).
-   * If absent, the verifier generates one via Step 0a and caches under
-   * packages/evals/.rubric-cache/.
-   */
-  precomputedRubric?: Rubric;
-  /** Optional reference answer (set when dataset ships one). */
-  expectedAnswer?: string;
-}
-
-/**
- * A single modality unit in tier-1 agent evidence. Mirrors the shape of
- * ModelMessage content parts so we can reproduce what the LLM ingested.
- */
-export type AgentEvidenceModality =
-  | { type: "text"; content: string }
-  | { type: "image"; bytes: Buffer; mediaType: string }
-  | { type: "json"; content: unknown };
-
-/**
- * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the
- * tool result for this step.
- *
- * Modes:
- *   - CUA: usually a single image modality (the screenshot sent to the provider).
- *   - Hybrid: tool result with optional screenshotBase64 → one image + one text.
- *   - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.
- */
-export interface AgentEvidence {
-  modalities: AgentEvidenceModality[];
-}
-
-/**
- * Tier 2 — independent harness probes around this step. Cheap and always-on
- * for v0 (just url) and v1 (+a11y, +scroll). v2 adds verifier-requested probes
- * keyed on the criterion that requested them.
- *
- * If a probe wasn't captured, the field is absent (not null).
- */
-export interface ProbeEvidence {
-  /** v0.5 — URL after the step's tool execution. */
-  url?: string;
-  /**
-   * v0 — bus screenshot (page.screenshot post-step). Path on disk is preferred
-   * once persisted; in-memory Buffer is used during a live run.
-   */
-  screenshot?: Buffer;
-  /** Reference to the persisted screenshot file under the trajectory dir. */
-  screenshotPath?: string;
-  /** v1 — viewport scroll context. Lets the verifier reason about "did the agent see the full page". */
-  scroll?: { top: number; pageHeight: number };
-  /** v1 — accessibility tree snapshot. */
-  ariaTree?: string;
-  /** v2 — verifier-requested probes, keyed by criterion id. */
-  onDemand?: Record<string, unknown>;
-}
-
-/** Outcome of a single tool execution as seen by the harness. */
-export interface ToolOutput {
-  ok: boolean;
-  /**
-   * The tool's return value. Same payload that flowed into agentEvidence
-   * modalities, but in its native shape (e.g., the extract result, the act
-   * describe-string) rather than serialized for the LLM.
-   */
-  result: unknown;
-  error?: string;
-}
-
-/** One step in a trajectory: action + reasoning + evidence + outcome. */
-export interface TrajectoryStep {
-  index: number;
-  actionName: string;
-  actionArgs: Record<string, unknown>;
-  /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
-  reasoning: string;
-  agentEvidence: AgentEvidence;
-  probeEvidence: ProbeEvidence;
-  toolOutput: ToolOutput;
-  /** ISO 8601 timestamp when the step's tool execution started. */
-  startedAt: string;
-  /** ISO 8601 timestamp when the step's tool execution finished. */
-  finishedAt: string;
-}
-
-/** Terminal status of the agent run. */
-export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
-
-/**
- * Full trajectory for one task run.
- *
- * The on-disk layout is one directory per task:
- *
- *   .trajectories/<run-id>/<task-id>/
- *     ├── task_data.json    — TaskSpec + Verdict (filled on completion)
- *     ├── trajectory.json   — this object, with screenshotPath instead of bytes
- *     ├── screenshot_1.png  — probeEvidence.screenshot for step 1, etc.
- *     ├── scores/
- *     │   └── mmrubric_v1.json  — Verdict from V3Evaluator.verify()
- *     ├── core.log          — captured action log
- *     └── times.json        — step timing + token usage
- */
-export interface Trajectory {
-  task: TaskSpec;
-  steps: TrajectoryStep[];
-  finalAnswer?: string;
-  status: TrajectoryStatus;
-  usage: TrajectoryUsage;
-  timing: { startedAt: string; endedAt: string };
-}
-
 // ─────────────────────────────────────────────────────────────────────────────
 // On-disk loader
 // ─────────────────────────────────────────────────────────────────────────────
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
new file mode 100644
index 000000000..fb0901f60
--- /dev/null
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -0,0 +1,339 @@
+/**
+ * Shared verifier types for trajectories, rubrics, evidence, and verdicts.
+ *
+ * The verifier consumes saved trajectories instead of a live browser. DOM and
+ * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve
+ * screenshots sent to the provider plus independent harness probes.
+ */
+
+/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */
+export interface TrajectoryUsage {
+  input_tokens: number;
+  output_tokens: number;
+  reasoning_tokens?: number;
+  cached_input_tokens?: number;
+  inference_time_ms?: number;
+}
+
+/**
+ * A single criterion in a Stagehand rubric. Dataset and model wire formats may
+ * use serialized `max_points` / `earned_points`; normalize those with
+ * `normalizeRubric()` at the boundary.
+ */
+export interface RubricCriterion {
+  /** Short name of the criterion (e.g., "Add ground beef to cart"). */
+  criterion: string;
+  /** What to evaluate and how to award partial credit. */
+  description: string;
+  /** Maximum points for this criterion. */
+  maxPoints: number;
+  /**
+   * Triggering condition for conditional criteria. Only counted when met
+   * (paper's "Mutually Exclusive Conditionals" pattern).
+   */
+  condition?: string;
+  /** Filled by the verifier during scoring; empty in precomputed rubrics. */
+  justification?: string;
+  /**
+   * Filled by the verifier during scoring; empty string in some serialized
+   * upstream rubrics and a number in scored rubrics.
+   */
+  earnedPoints?: number | string;
+}
+
+/** A rubric — list of criteria for a task. */
+export interface Rubric {
+  items: RubricCriterion[];
+}
+
+/**
+ * Serialized rubric item shape as stored in datasets and prompt responses.
+ * Keep this at IO boundaries; core verifier types use camelCase.
+ */
+export interface SerializedRubricCriterion {
+  criterion: string;
+  description: string;
+  max_points: number;
+  condition?: string;
+  justification?: string;
+  earned_points?: number | string;
+}
+
+/** Serialized rubric shape used by upstream datasets and generated JSON. */
+export interface SerializedRubric {
+  items: SerializedRubricCriterion[];
+}
+
+export type RubricInput = Rubric | SerializedRubric;
+
+/**
+ * Spec for a single task being verified. Carried both at runtime and into the
+ * verifier alongside the trajectory.
+ */
+export interface TaskSpec {
+  /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */
+  id: string;
+  /** Task instruction shown to the agent. */
+  instruction: string;
+  /** Starting URL, if any. */
+  initUrl?: string;
+  /** Rubric carried by the dataset or generated by a verifier backend. */
+  precomputedRubric?: Rubric;
+  /** Optional reference answer (set when dataset ships one). */
+  expectedAnswer?: string;
+}
+
+/**
+ * A single modality unit in tier-1 agent evidence. Mirrors the shape of
+ * ModelMessage content parts so we can reproduce what the LLM ingested.
+ */
+export type AgentEvidenceModality =
+  | { type: "text"; content: string }
+  | { type: "image"; bytes: Buffer; mediaType: string }
+  | { type: "json"; content: unknown };
+
+/**
+ * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the
+ * tool result for this step.
+ *
+ * Modes:
+ *   - CUA: usually a single image modality (the screenshot sent to the provider).
+ *   - Hybrid: tool result with optional screenshotBase64 → one image + one text.
+ *   - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.
+ */
+export interface AgentEvidence {
+  modalities: AgentEvidenceModality[];
+}
+
+/**
+ * Tier 2 — independent harness probes around this step.
+ *
+ * If a probe wasn't captured, the field is absent (not null).
+ */
+export interface ProbeEvidence {
+  /** URL after the step's tool execution. */
+  url?: string;
+  /**
+   * Bus screenshot captured after the step. Path on disk is preferred once
+   * persisted; in-memory Buffer is used during a live run.
+   */
+  screenshot?: Buffer;
+  /** Reference to the persisted screenshot file under the trajectory dir. */
+  screenshotPath?: string;
+  /** Viewport scroll context. Lets the verifier reason about whether the agent saw the full page. */
+  scroll?: { top: number; pageHeight: number };
+  /** Accessibility tree snapshot. */
+  ariaTree?: string;
+  /** Verifier-requested probes, keyed by criterion id. */
+  onDemand?: Record<string, unknown>;
+}
+
+/** Outcome of a single tool execution as seen by the harness. */
+export interface ToolOutput {
+  ok: boolean;
+  /**
+   * The tool's return value. Same payload that flowed into agentEvidence
+   * modalities, but in its native shape (e.g., the extract result, the act
+   * describe-string) rather than serialized for the LLM.
+   */
+  result: unknown;
+  error?: string;
+}
+
+/** One step in a trajectory: action + reasoning + evidence + outcome. */
+export interface TrajectoryStep {
+  index: number;
+  actionName: string;
+  actionArgs: Record<string, unknown>;
+  /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
+  reasoning: string;
+  agentEvidence: AgentEvidence;
+  probeEvidence: ProbeEvidence;
+  toolOutput: ToolOutput;
+  /** ISO 8601 timestamp when the step's tool execution started. */
+  startedAt: string;
+  /** ISO 8601 timestamp when the step's tool execution finished. */
+  finishedAt: string;
+}
+
+/** Terminal status of the agent run. */
+export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
+
+/**
+ * Full trajectory for one task run.
+ *
+ * The on-disk layout is one directory per task:
+ *
+ *   .trajectories/<run-id>/<task-id>/
+ *     ├── task_data.json    — TaskSpec + Verdict (filled on completion)
+ *     ├── trajectory.json   — this object, with screenshotPath instead of bytes
+ *     ├── screenshot_1.png  — probeEvidence.screenshot for step 1, etc.
+ *     ├── scores/
+ *     │   └── mmrubric_v1.json  — Verdict from V3Evaluator.verify()
+ *     ├── core.log          — captured action log
+ *     └── times.json        — step timing + token usage
+ */
+export interface Trajectory {
+  task: TaskSpec;
+  steps: TrajectoryStep[];
+  finalAnswer?: string;
+  status: TrajectoryStatus;
+  usage: TrajectoryUsage;
+  timing: { startedAt: string; endedAt: string };
+}
+
+/** Score for a single rubric criterion after evidence analysis + rescoring. */
+export interface CriterionScore {
+  /** Matches RubricCriterion.criterion (the criterion's short name). */
+  criterion: string;
+  /** Maximum possible points for this criterion. */
+  maxPoints: number;
+  /**
+   * Points earned post-evidence-analysis (paper's post_image_earned_points).
+   * Null if the criterion was conditional and its condition wasn't met (excluded
+   * from both numerator and denominator in the process score).
+   */
+  earnedPoints: number | null;
+  /** Verifier's free-text justification for the score. */
+  justification: string;
+  /**
+   * True if the criterion is conditional and its condition was determined to
+   * be met. Absent for non-conditional criteria.
+   */
+  conditionMet?: boolean;
+  /**
+   * Set when the verifier had no evidence to ground this criterion in either
+   * tier. Per paper §2, treated as uncontrollable failure → full credit, but
+   * surfaced here so dashboards can flag low-confidence verdicts.
+   */
+  evidenceInsufficient?: boolean;
+}
+
+/**
+ * First-point-of-failure analysis (paper Step 9a). Identifies the earliest
+ * step where the agent's trajectory went off-track, using a structured error
+ * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).
+ */
+export interface FirstPointOfFailure {
+  stepIndex: number;
+  /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */
+  errorCode: string;
+  /** Top-level category name (Selection, Hallucination, etc.). */
+  category: string;
+  /** Verifier's reasoning for selecting this point. */
+  description?: string;
+}
+
+/**
+ * Structured observation surfaced by the verifier that another agent or
+ * tooling could act on. Findings are emitted opportunistically by Step 8
+ * (outcome verification) when the verifier notices actionable patterns —
+ * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
+ *
+ * Not produced for every task: when nothing actionable surfaces, the
+ * `findings` array on the Verdict is empty or omitted. Consumers should
+ * treat the field as advisory, not as part of the formal score.
+ */
+export interface VerifierFinding {
+  /**
+   * Category of the observation. Open-ended enum — additional categories may
+   * be added as verifier backends surface new failure modes.
+   */
+  category:
+    | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries)
+    | "agent_strategy" // higher-level planning / decision-making problems
+    | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory
+    | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps)
+    | "task_specification" // task instruction was ambiguous / under- or over-specified
+    | "verifier_uncertainty" // verifier itself couldn't confidently decide
+    | "other";
+  /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */
+  severity: "info" | "warning" | "blocking";
+  /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */
+  description: string;
+  /**
+   * Optional concrete next action another agent could take. Should be
+   * specific enough that it can be acted on without further reasoning —
+   * e.g., "Try double_click instead of triple_click to clear placeholder
+   * text on this form field."
+   */
+  suggestedAction?: string;
+  /** Step indices in the trajectory where this pattern showed up. */
+  relatedSteps?: number[];
+}
+
+/** Stable debugging summary emitted by verifier backends. */
+export interface VerifierRawSteps {
+  backend?: "legacy" | "verifier";
+  primaryIntent?: string;
+  reasoning?: string;
+  rubricSource?: "precomputed" | "generated" | "none";
+  approach?: "a" | "b";
+  optionalsMode?: "folded" | "separate" | "skip";
+  totalEarned?: number;
+  totalMax?: number;
+  evidenceImages?: number;
+  evidenceTexts?: number;
+  evidenceOriginalScreenshots?: number;
+  legacyEvaluation?: string;
+  screenshotCount?: number;
+}
+
+/** Task-validity classification (paper Step 10). */
+export interface TaskValidity {
+  /** True if the task is underspecified / has multiple valid interpretations. */
+  isAmbiguous: boolean;
+  /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */
+  isInvalid: boolean;
+  /** Optional sub-codes from the task-classification taxonomy. */
+  ambiguityCodes?: string[];
+  invalidTaskCodes?: string[];
+}
+
+/**
+ * The verifier's output. Process score + outcome verdict + diagnostic signals.
+ *
+ * Process and outcome are deliberately independent (paper §2): an agent can
+ * follow the right steps but get blocked (high process, low outcome), or
+ * succeed through an unexpected path (variable process, high outcome).
+ */
+export interface Verdict {
+  /** Step 8 — did the agent accomplish the task from the user's perspective? */
+  outcomeSuccess: boolean;
+  /** Aggregated earned/max across applicable criteria, in [0, 1]. */
+  processScore: number;
+  /** Per-criterion breakdown after rescoring. */
+  perCriterion: CriterionScore[];
+  /** Step 9a — first step where the trajectory went off-track, if any. */
+  firstPointOfFailure?: FirstPointOfFailure;
+  /** Step 10 — task-itself ambiguity / validity. */
+  taskValidity: TaskValidity;
+  /**
+   * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
+   * evidence resolved the question. Treated as uncontrollable → full credit,
+   * but flagged here so consumers can decide whether to discount the score.
+   */
+  evidenceInsufficient: string[];
+  /**
+   * Structured observations from the verifier that a downstream tool or
+   * follow-up agent could act on. Opportunistic — empty or omitted when the
+   * verifier notices nothing actionable. Not part of the score; advisory.
+   */
+  findings?: VerifierFinding[];
+  /** Debugging summary from the active evaluator backend. */
+  rawSteps?: VerifierRawSteps;
+}
+
+/** Reason a stub verifier can emit. */
+export type StubVerdictReason =
+  | "wave-0-stub"
+  | "no-rubric"
+  | "empty-trajectory";
+
+/**
+ * Verifier interface. Implementations consume a Trajectory + TaskSpec and
+ * return a Verdict — they MUST NOT touch a live browser.
+ */
+export interface Verifier {
+  verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<Verdict>;
+}
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts
index 3ae764b2a..57167f386 100644
--- a/packages/core/lib/v3/verifier/verifier.ts
+++ b/packages/core/lib/v3/verifier/verifier.ts
@@ -1,167 +1,10 @@
-/**
- * Verifier — interface and result types for the rubric-based verifier that
- * replaces V3Evaluator's single-pass YES/NO judge.
- *
- * Modeled on rubric-based verifier pipelines for computer-use agents. The
- * verifier never touches a live browser — it consumes a Trajectory + TaskSpec
- * and returns a structured Verdict. That property is what lets us re-score
- * saved trajectories offline.
- */
-
-import type { Trajectory, TaskSpec } from "./trajectory.js";
-
-/** Score for a single rubric criterion after evidence analysis + rescoring. */
-export interface CriterionScore {
-  /** Matches RubricCriterion.criterion (the criterion's short name). */
-  criterion: string;
-  /** Maximum possible points for this criterion. */
-  maxPoints: number;
-  /**
-   * Points earned post-evidence-analysis (paper's post_image_earned_points).
-   * Null if the criterion was conditional and its condition wasn't met (excluded
-   * from both numerator and denominator in the process score).
-   */
-  earnedPoints: number | null;
-  /** Verifier's free-text justification for the score. */
-  justification: string;
-  /**
-   * True if the criterion is conditional and its condition was determined to
-   * be met. Absent for non-conditional criteria.
-   */
-  conditionMet?: boolean;
-  /**
-   * Set when the verifier had no evidence to ground this criterion in either
-   * tier. Per paper §2, treated as uncontrollable failure → full credit, but
-   * surfaced here so dashboards can flag low-confidence verdicts.
-   */
-  evidenceInsufficient?: boolean;
-}
-
-/**
- * First-point-of-failure analysis (paper Step 9a). Identifies the earliest
- * step where the agent's trajectory went off-track, using a structured error
- * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).
- */
-export interface FirstPointOfFailure {
-  stepIndex: number;
-  /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */
-  errorCode: string;
-  /** Top-level category name (Selection, Hallucination, etc.). */
-  category: string;
-  /** Verifier's reasoning for selecting this point. */
-  description?: string;
-}
-
-/**
- * Structured observation surfaced by the verifier that another agent or
- * tooling could act on. Findings are emitted opportunistically by Step 8
- * (outcome verification) when the verifier notices actionable patterns —
- * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
- *
- * Not produced for every task: when nothing actionable surfaces, the
- * `findings` array on the Verdict is empty. Consumers should treat the
- * field as advisory, not as part of the formal score.
- */
-export interface VerifierFinding {
-  /**
-   * Category of the observation. Open-ended enum — additional categories may
-   * be added as verifier backends surface new failure modes.
-   */
-  category:
-    | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries)
-    | "agent_strategy" // higher-level planning / decision-making problems
-    | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory
-    | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps)
-    | "task_specification" // task instruction was ambiguous / under- or over-specified
-    | "verifier_uncertainty" // verifier itself couldn't confidently decide
-    | "other";
-  /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */
-  severity: "info" | "warning" | "blocking";
-  /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */
-  description: string;
-  /**
-   * Optional concrete next action another agent could take. Should be
-   * specific enough that it can be acted on without further reasoning —
-   * e.g., "Try double_click instead of triple_click to clear placeholder
-   * text on this form field."
-   */
-  suggestedAction?: string;
-  /** Step indices in the trajectory where this pattern showed up. */
-  relatedSteps?: number[];
-}
-
-/** Stable debugging summary emitted by verifier backends. */
-export interface VerifierRawSteps {
-  backend?: "legacy" | "verifier";
-  primaryIntent?: string;
-  reasoning?: string;
-  rubricSource?: "precomputed" | "generated" | "none";
-  approach?: "a" | "b";
-  optionalsMode?: "folded" | "separate" | "skip";
-  totalEarned?: number;
-  totalMax?: number;
-  evidenceImages?: number;
-  evidenceTexts?: number;
-  evidenceOriginalScreenshots?: number;
-  legacyEvaluation?: string;
-  screenshotCount?: number;
-}
-
-/** Task-validity classification (paper Step 10). */
-export interface TaskValidity {
-  /** True if the task is underspecified / has multiple valid interpretations. */
-  isAmbiguous: boolean;
-  /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */
-  isInvalid: boolean;
-  /** Optional sub-codes from the task-classification taxonomy. */
-  ambiguityCodes?: string[];
-  invalidTaskCodes?: string[];
-}
-
-/**
- * The verifier's output. Process score + outcome verdict + diagnostic signals.
- *
- * Process and outcome are deliberately independent (paper §2): an agent can
- * follow the right steps but get blocked (high process, low outcome), or
- * succeed through an unexpected path (variable process, high outcome).
- */
-export interface Verdict {
-  /** Step 8 — did the agent accomplish the task from the user's perspective? */
-  outcomeSuccess: boolean;
-  /** Aggregated earned/max across applicable criteria, in [0, 1]. */
-  processScore: number;
-  /** Per-criterion breakdown after rescoring. */
-  perCriterion: CriterionScore[];
-  /** Step 9a — first step where the trajectory went off-track, if any. */
-  firstPointOfFailure?: FirstPointOfFailure;
-  /** Step 10 — task-itself ambiguity / validity. */
-  taskValidity: TaskValidity;
-  /**
-   * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
-   * evidence resolved the question. Treated as uncontrollable → full credit,
-   * but flagged here so consumers can decide whether to discount the score.
-   */
-  evidenceInsufficient: string[];
-  /**
-   * Structured observations from the verifier that a downstream tool or
-   * follow-up agent could act on. Opportunistic — empty when the verifier
-   * doesn't notice anything actionable. Not part of the score; advisory.
-   */
-  findings?: VerifierFinding[];
-  /** Debugging summary from the active evaluator backend. */
-  rawSteps?: VerifierRawSteps;
-}
-
-/** Reason a stub verifier emits when the rubric pipeline hasn't shipped yet. */
-export type StubVerdictReason =
-  | "wave-0-stub"
-  | "no-rubric"
-  | "empty-trajectory";
-
-/**
- * Verifier interface. Implementations consume a Trajectory + TaskSpec and
- * return a Verdict — they MUST NOT touch a live browser.
- */
-export interface Verifier {
-  verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<Verdict>;
-}
+export type {
+  CriterionScore,
+  FirstPointOfFailure,
+  StubVerdictReason,
+  TaskValidity,
+  Verdict,
+  Verifier,
+  VerifierFinding,
+  VerifierRawSteps,
+} from "./types.js";

From d68ada609bce75b78e8b5282e2c3bf3cda159a7b Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 15:29:32 -0700
Subject: [PATCH 07/14] refactor(verifier): remove rollout stub reason

---
 packages/core/lib/v3/verifier/types.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
index fb0901f60..71f2d26af 100644
--- a/packages/core/lib/v3/verifier/types.ts
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -326,7 +326,7 @@ export interface Verdict {
 
 /** Reason a stub verifier can emit. */
 export type StubVerdictReason =
-  | "wave-0-stub"
+  | "stub-verifier"
   | "no-rubric"
   | "empty-trajectory";
 

From 41708e182348f000daf53997a3b195388704c4b4 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 16:20:20 -0700
Subject: [PATCH 08/14] refactor(verifier): remove proxy type barrels

---
 packages/core/lib/v3/verifier/trajectory.ts | 16 ----------------
 packages/core/lib/v3/verifier/verifier.ts   | 10 ----------
 2 files changed, 26 deletions(-)
 delete mode 100644 packages/core/lib/v3/verifier/verifier.ts

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 3dbb7e5a6..e0ca79401 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -1,28 +1,12 @@
 import type {
-  ProbeEvidence,
-  Rubric,
-  RubricCriterion,
-  RubricInput,
-  SerializedRubricCriterion,
-  Trajectory,
-  TrajectoryStep,
-} from "./types.js";
-
-export type {
-  AgentEvidence,
   AgentEvidenceModality,
   ProbeEvidence,
   Rubric,
   RubricCriterion,
   RubricInput,
-  SerializedRubric,
   SerializedRubricCriterion,
-  TaskSpec,
-  ToolOutput,
   Trajectory,
-  TrajectoryStatus,
   TrajectoryStep,
-  TrajectoryUsage,
 } from "./types.js";
 
 /** Convert a Stagehand or serialized rubric into the public Stagehand shape. */
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts
deleted file mode 100644
index 57167f386..000000000
--- a/packages/core/lib/v3/verifier/verifier.ts
+++ /dev/null
@@ -1,10 +0,0 @@
-export type {
-  CriterionScore,
-  FirstPointOfFailure,
-  StubVerdictReason,
-  TaskValidity,
-  Verdict,
-  Verifier,
-  VerifierFinding,
-  VerifierRawSteps,
-} from "./types.js";

From 356f48172eeb3ac30a129565d33ca5529fb6407c Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 16:21:01 -0700
Subject: [PATCH 09/14] fix(verifier): keep rubric earned points numeric

---
 packages/core/lib/v3/verifier/trajectory.ts | 24 ++++++++++++++++++++-
 packages/core/lib/v3/verifier/types.ts      |  7 ++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index e0ca79401..40cd27dff 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -27,7 +27,10 @@ export function normalizeRubric(
         );
       }
 
-      const earnedPoints = raw.earnedPoints ?? raw.earned_points;
+      const earnedPoints = normalizeEarnedPoints(
+        raw.earnedPoints ?? raw.earned_points,
+        raw.criterion,
+      );
       return {
         criterion: raw.criterion,
         description: raw.description,
@@ -42,6 +45,25 @@ export function normalizeRubric(
   };
 }
 
+function normalizeEarnedPoints(
+  value: number | string | undefined,
+  criterion: string,
+): number | undefined {
+  if (value === undefined) return undefined;
+  if (typeof value === "number") {
+    if (Number.isFinite(value)) return value;
+  } else {
+    const trimmed = value.trim();
+    if (trimmed === "") return undefined;
+    const parsed = Number(trimmed);
+    if (Number.isFinite(parsed)) return parsed;
+  }
+
+  throw new TypeError(
+    `Rubric criterion "${criterion}" has a non-numeric earnedPoints value`,
+  );
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // On-disk loader
 // ─────────────────────────────────────────────────────────────────────────────
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
index 71f2d26af..b3f1011f0 100644
--- a/packages/core/lib/v3/verifier/types.ts
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -34,11 +34,8 @@ export interface RubricCriterion {
   condition?: string;
   /** Filled by the verifier during scoring; empty in precomputed rubrics. */
   justification?: string;
-  /**
-   * Filled by the verifier during scoring; empty string in some serialized
-   * upstream rubrics and a number in scored rubrics.
-   */
-  earnedPoints?: number | string;
+  /** Filled by the verifier during scoring; omitted in precomputed rubrics. */
+  earnedPoints?: number;
 }
 
 /** A rubric — list of criteria for a task. */

From b8d195f22552c61910c60aec957105843a944ee3 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 16:21:58 -0700
Subject: [PATCH 10/14] fix(verifier): constrain trajectory screenshot paths

---
 packages/core/lib/v3/verifier/trajectory.ts | 22 ++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 40cd27dff..6ce491115 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -84,6 +84,7 @@ function normalizeEarnedPoints(
 export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
   const fs = await import("node:fs/promises");
   const path = await import("node:path");
+  const trajectoryDir = path.resolve(dir);
 
   const trajectoryPath = path.join(dir, "trajectory.json");
   const raw = await fs.readFile(trajectoryPath, "utf8");
@@ -109,13 +110,28 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
     >;
   };
 
+  const resolveWithinTrajectoryDir = (candidate: string): string => {
+    const resolved = path.resolve(trajectoryDir, candidate);
+    const relative = path.relative(trajectoryDir, resolved);
+    const outside =
+      relative === ".." ||
+      relative.startsWith(`..${path.sep}`) ||
+      path.isAbsolute(relative);
+
+    if (outside) {
+      throw new Error(
+        `Trajectory screenshotPath escapes trajectory directory: ${candidate}`,
+      );
+    }
+
+    return resolved;
+  };
+
   for (const step of parsed.steps) {
     // Rehydrate tier-2 probe screenshot from its on-disk file reference.
     const probe = step.probeEvidence;
     if (probe?.screenshotPath && !probe.screenshot) {
-      const resolved = path.isAbsolute(probe.screenshotPath)
-        ? probe.screenshotPath
-        : path.join(dir, probe.screenshotPath);
+      const resolved = resolveWithinTrajectoryDir(probe.screenshotPath);
       try {
         probe.screenshot = await fs.readFile(resolved);
       } catch {

From 4fc1400ce90b39a8dab131b378c6b7f450eb2af6 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 16:25:11 -0700
Subject: [PATCH 11/14] test(verifier): cover trajectory normalization
 boundaries

---
 .../tests/unit/verifier-trajectory.test.ts    | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 packages/core/tests/unit/verifier-trajectory.test.ts

diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
new file mode 100644
index 000000000..0ed4fc596
--- /dev/null
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -0,0 +1,68 @@
+import { mkdtemp, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import path from "node:path";
+
+import { describe, expect, it } from "vitest";
+
+import {
+  loadTrajectoryFromDisk,
+  normalizeRubric,
+} from "../../lib/v3/verifier/trajectory.js";
+
+describe("verifier trajectory utilities", () => {
+  it("normalizes serialized empty earned points out of public rubrics", () => {
+    expect(
+      normalizeRubric({
+        items: [
+          {
+            criterion: "Criterion",
+            description: "Description",
+            max_points: 1,
+            earned_points: "",
+          },
+        ],
+      }),
+    ).toEqual({
+      items: [
+        {
+          criterion: "Criterion",
+          description: "Description",
+          maxPoints: 1,
+        },
+      ],
+    });
+  });
+
+  it("rejects screenshot paths outside the trajectory directory", async () => {
+    const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
+    await writeFile(
+      path.join(dir, "trajectory.json"),
+      JSON.stringify({
+        task: { id: "task", instruction: "Do the task" },
+        status: "complete",
+        usage: { input_tokens: 0, output_tokens: 0 },
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: { modalities: [] },
+            probeEvidence: { screenshotPath: "../../../etc/passwd" },
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    await expect(loadTrajectoryFromDisk(dir)).rejects.toThrow(
+      "escapes trajectory directory",
+    );
+  });
+});

From d87b5e72dc0746599e05f88fdd4df1e4ee97b343 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 21:28:45 -0700
Subject: [PATCH 12/14] test(verifier): cover evaluator facade helpers

---
 packages/core/lib/v3/verifier/index.ts        |   1 -
 packages/core/lib/v3/verifier/trajectory.ts   |  29 +--
 packages/core/lib/v3Evaluator.ts              |   5 +-
 .../tests/unit/public-api/v3-core.test.ts     |  57 -----
 packages/core/tests/unit/v3-evaluator.test.ts | 201 ++++++++++++++++++
 .../tests/unit/verifier-trajectory.test.ts    |  84 ++++++++
 6 files changed, 305 insertions(+), 72 deletions(-)
 create mode 100644 packages/core/tests/unit/v3-evaluator.test.ts

diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index 1b76eb388..ce62b4d0a 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -12,7 +12,6 @@ export type {
   RubricInput,
   SerializedRubric,
   SerializedRubricCriterion,
-  StubVerdictReason,
   TaskSpec,
   TaskValidity,
   ToolOutput,
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index 6ce491115..f5e300403 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -17,27 +17,26 @@ export function normalizeRubric(
 
   return {
     items: rubric.items.map((item) => {
-      const raw = item as RubricCriterion & Partial<SerializedRubricCriterion>;
-      const maxPoints =
-        typeof raw.maxPoints === "number" ? raw.maxPoints : raw.max_points;
+      const serialized = isSerializedRubricCriterion(item);
+      const maxPoints = serialized ? item.max_points : item.maxPoints;
 
       if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) {
         throw new TypeError(
-          `Rubric criterion "${raw.criterion}" is missing a numeric maxPoints value`,
+          `Rubric criterion "${item.criterion}" is missing a numeric maxPoints value`,
         );
       }
 
       const earnedPoints = normalizeEarnedPoints(
-        raw.earnedPoints ?? raw.earned_points,
-        raw.criterion,
+        serialized ? item.earned_points : item.earnedPoints,
+        item.criterion,
       );
       return {
-        criterion: raw.criterion,
-        description: raw.description,
+        criterion: item.criterion,
+        description: item.description,
         maxPoints,
-        ...(raw.condition !== undefined && { condition: raw.condition }),
-        ...(raw.justification !== undefined && {
-          justification: raw.justification,
+        ...(item.condition !== undefined && { condition: item.condition }),
+        ...(item.justification !== undefined && {
+          justification: item.justification,
         }),
         ...(earnedPoints !== undefined && { earnedPoints }),
       };
@@ -45,6 +44,12 @@ export function normalizeRubric(
   };
 }
 
+function isSerializedRubricCriterion(
+  item: RubricCriterion | SerializedRubricCriterion,
+): item is SerializedRubricCriterion {
+  return "max_points" in item;
+}
+
 function normalizeEarnedPoints(
   value: number | string | undefined,
   criterion: string,
@@ -86,7 +91,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
   const path = await import("node:path");
   const trajectoryDir = path.resolve(dir);
 
-  const trajectoryPath = path.join(dir, "trajectory.json");
+  const trajectoryPath = path.join(trajectoryDir, "trajectory.json");
   const raw = await fs.readFile(trajectoryPath, "utf8");
   const parsed = JSON.parse(raw) as Trajectory & {
     steps: Array<
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index 8adc7d5f1..a25c9cf44 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -271,11 +271,12 @@ function renderLegacyAgentReasoning(
 
 function stringifyForPrompt(value: unknown): string {
   if (typeof value === "string") {
-    return truncateForPrompt(value, 2000);
+    return value;
   }
 
   try {
-    return truncateForPrompt(JSON.stringify(value), 2000);
+    const serialized = JSON.stringify(value);
+    return serialized ?? String(value);
   } catch {
     return String(value);
   }
diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts
index d1975a48c..5e767ae57 100644
--- a/packages/core/tests/unit/public-api/v3-core.test.ts
+++ b/packages/core/tests/unit/public-api/v3-core.test.ts
@@ -155,63 +155,6 @@ describe("V3 Core public API types", () => {
         } satisfies Stagehand.V3EvaluatorConstructorOptions,
       );
     });
-
-    it("rejects verifier backend before the verifier PR is installed", async () => {
-      const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, {
-        backend: "verifier",
-      });
-
-      await expect(
-        evaluator.ask({ question: "Was the task completed?" }),
-      ).rejects.toThrow(
-        "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
-      );
-    });
-
-    it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => {
-      const taskSpec: Stagehand.TaskSpec = {
-        id: "empty",
-        instruction: "Complete the task",
-      };
-      const trajectory: Stagehand.Trajectory = {
-        task: taskSpec,
-        steps: [],
-        status: "complete",
-        usage: {
-          input_tokens: 0,
-          output_tokens: 0,
-        },
-        timing: {
-          startedAt: new Date(0).toISOString(),
-          endedAt: new Date(0).toISOString(),
-        },
-      };
-      const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, {
-        backend: "legacy",
-      });
-
-      const verdict = await evaluator.verify(trajectory, taskSpec);
-
-      expect(verdict.outcomeSuccess).toBe(false);
-      expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]);
-    });
-
-    it("rejects invalid evaluator backend env values", () => {
-      const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND;
-      process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend";
-
-      try {
-        expect(
-          () => new Stagehand.V3Evaluator({} as Stagehand.Stagehand),
-        ).toThrow('Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"');
-      } finally {
-        if (previousBackend === undefined) {
-          delete process.env.STAGEHAND_EVALUATOR_BACKEND;
-        } else {
-          process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend;
-        }
-      }
-    });
   });
 
   describe("V3FunctionName", () => {
diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts
new file mode 100644
index 000000000..c755c86b7
--- /dev/null
+++ b/packages/core/tests/unit/v3-evaluator.test.ts
@@ -0,0 +1,201 @@
+import { describe, expect, it, vi } from "vitest";
+
+import { V3Evaluator } from "../../lib/v3Evaluator.js";
+import type { V3 } from "../../lib/v3/v3.js";
+import type { TaskSpec, Trajectory } from "../../lib/v3/verifier/index.js";
+
+describe("V3Evaluator verifier facade", () => {
+  it("rejects verifier backend before the verifier PR is installed", async () => {
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.ask({ question: "Was the task completed?" }),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("rejects verify when the verifier backend is selected before the verifier PR is installed", async () => {
+    const taskSpec: TaskSpec = {
+      id: "verifier-unavailable",
+      instruction: "Complete the task",
+    };
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.verify(makeTrajectory(taskSpec), taskSpec),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("rejects generateRubric when the verifier backend is selected before the verifier PR is installed", async () => {
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.generateRubric({
+        id: "rubric-unavailable",
+        instruction: "Complete the task",
+      }),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("maps legacy YES evaluations with trajectory screenshots to a successful verdict", async () => {
+    const taskSpec: TaskSpec = {
+      id: "success",
+      instruction: "Complete the task",
+    };
+    const screenshot = Buffer.from("screenshot");
+    const trajectory = makeTrajectory(taskSpec, {
+      screenshot,
+      finalAnswer: "The task is complete.",
+    });
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The screenshot shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    const verdict = await evaluator.verify(trajectory, taskSpec);
+
+    expect(ask).toHaveBeenCalledWith(
+      expect.objectContaining({
+        question: taskSpec.instruction,
+        screenshot: [screenshot],
+        answer: "The task is complete.",
+      }),
+    );
+    expect(verdict.outcomeSuccess).toBe(true);
+    expect(verdict.processScore).toBe(1);
+    expect(verdict.perCriterion[0]).toMatchObject({
+      criterion: "legacy-task-completion",
+      earnedPoints: 1,
+      evidenceInsufficient: false,
+    });
+  });
+
+  it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => {
+    const taskSpec: TaskSpec = {
+      id: "reasoning-budget",
+      instruction: "Complete the task",
+    };
+    const longToolOutput = "x".repeat(3000);
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The trajectory shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    await evaluator.verify(
+      makeTrajectory(taskSpec, {
+        finalAnswer: "The task is complete.",
+        toolResult: longToolOutput,
+      }),
+      taskSpec,
+    );
+
+    const firstCall = ask.mock.calls[0]?.[0];
+    expect(firstCall?.agentReasoning).toContain(longToolOutput);
+  });
+
+  it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => {
+    const taskSpec: TaskSpec = {
+      id: "empty",
+      instruction: "Complete the task",
+    };
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+
+    const verdict = await evaluator.verify(
+      makeEmptyTrajectory(taskSpec),
+      taskSpec,
+    );
+
+    expect(verdict.outcomeSuccess).toBe(false);
+    expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]);
+  });
+
+  it("rejects invalid evaluator backend env values", () => {
+    const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND;
+    process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend";
+
+    try {
+      expect(() => new V3Evaluator({} as V3)).toThrow(
+        'Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"',
+      );
+    } finally {
+      if (previousBackend === undefined) {
+        delete process.env.STAGEHAND_EVALUATOR_BACKEND;
+      } else {
+        process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend;
+      }
+    }
+  });
+});
+
+function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory {
+  return {
+    task: taskSpec,
+    steps: [],
+    status: "complete",
+    usage: {
+      input_tokens: 0,
+      output_tokens: 0,
+    },
+    timing: {
+      startedAt: new Date(0).toISOString(),
+      endedAt: new Date(0).toISOString(),
+    },
+  };
+}
+
+function makeTrajectory(
+  taskSpec: TaskSpec,
+  options: {
+    screenshot?: Buffer;
+    finalAnswer?: string;
+    toolResult?: unknown;
+  } = {},
+): Trajectory {
+  return {
+    ...makeEmptyTrajectory(taskSpec),
+    steps: [
+      {
+        index: 0,
+        actionName: "act",
+        actionArgs: {},
+        reasoning: "I completed the task.",
+        agentEvidence: { modalities: [] },
+        probeEvidence: options.screenshot
+          ? { screenshot: options.screenshot }
+          : {},
+        toolOutput: {
+          ok: true,
+          result: options.toolResult ?? "done",
+        },
+        startedAt: new Date(0).toISOString(),
+        finishedAt: new Date(0).toISOString(),
+      },
+    ],
+    finalAnswer: options.finalAnswer,
+  };
+}
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index 0ed4fc596..51f9c0b8b 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -6,6 +6,7 @@ import { describe, expect, it } from "vitest";
 
 import {
   loadTrajectoryFromDisk,
+  nextVerdictFilename,
   normalizeRubric,
 } from "../../lib/v3/verifier/trajectory.js";
 
@@ -33,6 +34,83 @@ describe("verifier trajectory utilities", () => {
     });
   });
 
+  it("round-trips serialized snake_case rubrics to public camelCase rubrics", () => {
+    expect(
+      normalizeRubric({
+        items: [
+          {
+            criterion: "Criterion",
+            description: "Description",
+            max_points: 3,
+            earned_points: "2",
+            condition: "Only if relevant",
+            justification: "Partial credit.",
+          },
+        ],
+      }),
+    ).toEqual({
+      items: [
+        {
+          criterion: "Criterion",
+          description: "Description",
+          maxPoints: 3,
+          earnedPoints: 2,
+          condition: "Only if relevant",
+          justification: "Partial credit.",
+        },
+      ],
+    });
+  });
+
+  it("loads trajectory screenshots and image modalities from disk", async () => {
+    const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
+    const screenshot = Buffer.from("probe screenshot");
+    const agentImage = Buffer.from("agent image");
+    await writeFile(path.join(dir, "screenshot_1.png"), screenshot);
+    await writeFile(
+      path.join(dir, "trajectory.json"),
+      JSON.stringify({
+        task: { id: "task", instruction: "Do the task" },
+        status: "complete",
+        usage: { input_tokens: 0, output_tokens: 0 },
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: {
+              modalities: [
+                {
+                  type: "image",
+                  mediaType: "image/png",
+                  bytesBase64: agentImage.toString("base64"),
+                },
+              ],
+            },
+            probeEvidence: { screenshotPath: "screenshot_1.png" },
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    const trajectory = await loadTrajectoryFromDisk(dir);
+    const modality = trajectory.steps[0].agentEvidence.modalities[0];
+
+    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(modality.type).toBe("image");
+    if (modality.type === "image") {
+      expect(modality.bytes).toEqual(agentImage);
+    }
+  });
+
   it("rejects screenshot paths outside the trajectory directory", async () => {
     const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
     await writeFile(
@@ -65,4 +143,10 @@ describe("verifier trajectory utilities", () => {
       "escapes trajectory directory",
     );
   });
+
+  it("sanitizes verdict filename labels", () => {
+    expect(nextVerdictFilename("rescore / task:one?")).toBe(
+      "mmrubric_rescore___task_one_.json",
+    );
+  });
 });

From 18265cac01d284be1b0bbab03970bef4dfe23475 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:07:10 -0700
Subject: [PATCH 13/14] fix(verifier): clean public result API

---
 packages/core/lib/v3/index.ts                 |  15 ++-
 packages/core/lib/v3/verifier/index.ts        |   7 +-
 packages/core/lib/v3/verifier/trajectory.ts   | 111 +++++++++---------
 packages/core/lib/v3/verifier/types.ts        |  82 ++++---------
 packages/core/lib/v3Evaluator.ts              |  85 ++++----------
 packages/core/lib/v3LegacyEvaluator.ts        |   6 +
 .../unit/public-api/export-surface.test.ts    |   3 +
 .../tests/unit/public-api/v3-core.test.ts     |   2 +-
 packages/core/tests/unit/v3-evaluator.test.ts |  35 ++++--
 .../tests/unit/verifier-trajectory.test.ts    |  10 +-
 10 files changed, 155 insertions(+), 201 deletions(-)

diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index 8fdcc6b75..8e21fb030 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -24,6 +24,11 @@ import { tool } from "ai";
 import { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 import { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
 import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js";
+import {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./verifier/index.js";
 
 export { V3 } from "./v3.js";
 export { V3 as Stagehand } from "./v3.js";
@@ -72,15 +77,12 @@ export type {
   TaskSpec,
   Rubric,
   RubricCriterion,
-  SerializedRubric,
-  SerializedRubricCriterion,
-  RubricInput,
   AgentEvidence,
   AgentEvidenceModality,
   ProbeEvidence,
   ToolOutput,
   Verifier,
-  Verdict,
+  EvaluationResult,
   CriterionScore,
   FirstPointOfFailure,
   TaskValidity,
@@ -89,7 +91,7 @@ export type {
 } from "./verifier/index.js";
 export {
   loadTrajectoryFromDisk,
-  nextVerdictFilename,
+  nextResultFilename,
   normalizeRubric,
 } from "./verifier/index.js";
 export { tool } from "ai";
@@ -142,6 +144,9 @@ const StagehandDefault = {
   toJsonSchema,
   connectToMCPServer,
   V3Evaluator,
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
   tool,
   getAISDKLanguageModel,
   __internalCreateInMemoryAgentCacheHandle,
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
index ce62b4d0a..4061533ab 100644
--- a/packages/core/lib/v3/verifier/index.ts
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -5,13 +5,11 @@ export type {
   AgentEvidence,
   AgentEvidenceModality,
   CriterionScore,
+  EvaluationResult,
   FirstPointOfFailure,
   ProbeEvidence,
   Rubric,
   RubricCriterion,
-  RubricInput,
-  SerializedRubric,
-  SerializedRubricCriterion,
   TaskSpec,
   TaskValidity,
   ToolOutput,
@@ -19,13 +17,12 @@ export type {
   TrajectoryStatus,
   TrajectoryStep,
   TrajectoryUsage,
-  Verdict,
   Verifier,
   VerifierFinding,
   VerifierRawSteps,
 } from "./types.js";
 export {
   loadTrajectoryFromDisk,
-  nextVerdictFilename,
+  nextResultFilename,
   normalizeRubric,
 } from "./trajectory.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
index f5e300403..a18f025c3 100644
--- a/packages/core/lib/v3/verifier/trajectory.ts
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -2,70 +2,81 @@ import type {
   AgentEvidenceModality,
   ProbeEvidence,
   Rubric,
-  RubricCriterion,
-  RubricInput,
-  SerializedRubricCriterion,
   Trajectory,
   TrajectoryStep,
 } from "./types.js";
 
-/** Convert a Stagehand or serialized rubric into the public Stagehand shape. */
-export function normalizeRubric(
-  rubric: RubricInput | null | undefined,
-): Rubric | undefined {
-  if (!rubric) return undefined;
+type RawRubricCriterion = {
+  criterion: unknown;
+  description: unknown;
+  max_points?: unknown;
+  maxPoints?: unknown;
+  condition?: unknown;
+};
+
+type RawRubric = {
+  items?: unknown;
+};
+
+/**
+ * Convert dataset or generated rubric JSON into the public Stagehand shape.
+ * Snake-case dataset fields are accepted here so serialized quirks do not leak
+ * into the canonical rubric type.
+ */
+export function normalizeRubric(rubric: unknown): Rubric | undefined {
+  if (rubric == null) return undefined;
+  if (typeof rubric !== "object") {
+    throw new TypeError("Rubric must be an object");
+  }
+
+  const rawRubric = rubric as RawRubric;
+  if (!Array.isArray(rawRubric.items)) {
+    throw new TypeError("Rubric is missing an items array");
+  }
 
   return {
-    items: rubric.items.map((item) => {
-      const serialized = isSerializedRubricCriterion(item);
-      const maxPoints = serialized ? item.max_points : item.maxPoints;
+    items: rawRubric.items.map((item) => {
+      const criterion = normalizeRequiredString(item.criterion, "criterion");
+      const description = normalizeRequiredString(
+        item.description,
+        "description",
+      );
+      const maxPoints = normalizeMaxPoints(item);
 
       if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) {
         throw new TypeError(
-          `Rubric criterion "${item.criterion}" is missing a numeric maxPoints value`,
+          `Rubric criterion "${criterion}" is missing a numeric maxPoints value`,
         );
       }
 
-      const earnedPoints = normalizeEarnedPoints(
-        serialized ? item.earned_points : item.earnedPoints,
-        item.criterion,
-      );
       return {
-        criterion: item.criterion,
-        description: item.description,
+        criterion,
+        description,
         maxPoints,
-        ...(item.condition !== undefined && { condition: item.condition }),
-        ...(item.justification !== undefined && {
-          justification: item.justification,
+        ...(typeof item.condition === "string" && {
+          condition: item.condition,
         }),
-        ...(earnedPoints !== undefined && { earnedPoints }),
       };
     }),
   };
 }
 
-function isSerializedRubricCriterion(
-  item: RubricCriterion | SerializedRubricCriterion,
-): item is SerializedRubricCriterion {
-  return "max_points" in item;
+function normalizeRequiredString(value: unknown, fieldName: string): string {
+  if (typeof value === "string" && value.length) {
+    return value;
+  }
+
+  throw new TypeError(`Rubric criterion is missing a ${fieldName} value`);
 }
 
-function normalizeEarnedPoints(
-  value: number | string | undefined,
-  criterion: string,
-): number | undefined {
-  if (value === undefined) return undefined;
-  if (typeof value === "number") {
-    if (Number.isFinite(value)) return value;
-  } else {
-    const trimmed = value.trim();
-    if (trimmed === "") return undefined;
-    const parsed = Number(trimmed);
-    if (Number.isFinite(parsed)) return parsed;
-  }
+function normalizeMaxPoints(item: RawRubricCriterion): unknown {
+  return item.maxPoints ?? item.max_points;
+}
 
-  throw new TypeError(
-    `Rubric criterion "${criterion}" has a non-numeric earnedPoints value`,
+function normalizeResultLabel(label?: string): string {
+  return (label ?? `rescore-${new Date().toISOString()}`).replace(
+    /[^A-Za-z0-9._-]/g,
+    "_",
   );
 }
 
@@ -167,18 +178,12 @@ export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
 }
 
 /**
- * Locate the next available `mmrubric_*.json` filename for a given trajectory
- * directory. Used by offline re-scoring to avoid overwriting prior verdicts.
+ * Build a `result_<label>.json` filename for persisted evaluator output.
  *
- * Convention: prefer a label-based name (e.g., `mmrubric_rescore-2026-05-11.json`)
- * over numeric versioning so multiple offline rescore attempts coexist without
- * collisions and remain easy to diff. Falls back to a timestamp if the caller
- * doesn't provide a label.
+ * Convention: the live run writes `result.json`; offline re-score attempts use
+ * a sanitized label (e.g., `result_rescore-2026-05-11.json`) to coexist
+ * without collisions and diff easily; default label: `rescore-<timestamp>`.
  */
-export function nextVerdictFilename(label?: string): string {
-  const safeLabel = (label ?? `rescore-${new Date().toISOString()}`).replace(
-    /[^A-Za-z0-9._-]/g,
-    "_",
-  );
-  return `mmrubric_${safeLabel}.json`;
+export function nextResultFilename(label?: string): string {
+  return `result_${normalizeResultLabel(label)}.json`;
 }
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
index b3f1011f0..88b7e275e 100644
--- a/packages/core/lib/v3/verifier/types.ts
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -1,5 +1,5 @@
 /**
- * Shared verifier types for trajectories, rubrics, evidence, and verdicts.
+ * Shared verifier types for trajectories, rubrics, evidence, and results.
  *
  * The verifier consumes saved trajectories instead of a live browser. DOM and
  * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve
@@ -15,11 +15,7 @@ export interface TrajectoryUsage {
   inference_time_ms?: number;
 }
 
-/**
- * A single criterion in a Stagehand rubric. Dataset and model wire formats may
- * use serialized `max_points` / `earned_points`; normalize those with
- * `normalizeRubric()` at the boundary.
- */
+/** A single criterion in a Stagehand rubric. */
 export interface RubricCriterion {
   /** Short name of the criterion (e.g., "Add ground beef to cart"). */
   criterion: string;
@@ -28,14 +24,10 @@ export interface RubricCriterion {
   /** Maximum points for this criterion. */
   maxPoints: number;
   /**
-   * Triggering condition for conditional criteria. Only counted when met
-   * (paper's "Mutually Exclusive Conditionals" pattern).
+   * Applicability rule for situational criteria. When this condition is not
+   * met, the criterion is excluded from scoring rather than counted as failed.
    */
   condition?: string;
-  /** Filled by the verifier during scoring; empty in precomputed rubrics. */
-  justification?: string;
-  /** Filled by the verifier during scoring; omitted in precomputed rubrics. */
-  earnedPoints?: number;
 }
 
 /** A rubric — list of criteria for a task. */
@@ -43,26 +35,6 @@ export interface Rubric {
   items: RubricCriterion[];
 }
 
-/**
- * Serialized rubric item shape as stored in datasets and prompt responses.
- * Keep this at IO boundaries; core verifier types use camelCase.
- */
-export interface SerializedRubricCriterion {
-  criterion: string;
-  description: string;
-  max_points: number;
-  condition?: string;
-  justification?: string;
-  earned_points?: number | string;
-}
-
-/** Serialized rubric shape used by upstream datasets and generated JSON. */
-export interface SerializedRubric {
-  items: SerializedRubricCriterion[];
-}
-
-export type RubricInput = Rubric | SerializedRubric;
-
 /**
  * Spec for a single task being verified. Carried both at runtime and into the
  * verifier alongside the trajectory.
@@ -162,11 +134,11 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
  * The on-disk layout is one directory per task:
  *
  *   .trajectories/<run-id>/<task-id>/
- *     ├── task_data.json    — TaskSpec + Verdict (filled on completion)
+ *     ├── task_data.json    — TaskSpec + result metadata
  *     ├── trajectory.json   — this object, with screenshotPath instead of bytes
  *     ├── screenshot_1.png  — probeEvidence.screenshot for step 1, etc.
  *     ├── scores/
- *     │   └── mmrubric_v1.json  — Verdict from V3Evaluator.verify()
+ *     │   └── result.json       — EvaluationResult from V3Evaluator.verify()
  *     ├── core.log          — captured action log
  *     └── times.json        — step timing + token usage
  */
@@ -191,8 +163,8 @@ export interface CriterionScore {
    * from both numerator and denominator in the process score).
    */
   earnedPoints: number | null;
-  /** Verifier's free-text justification for the score. */
-  justification: string;
+  /** Verifier's explanation for the score. */
+  explanation: string;
   /**
    * True if the criterion is conditional and its condition was determined to
    * be met. Absent for non-conditional criteria.
@@ -201,7 +173,7 @@ export interface CriterionScore {
   /**
    * Set when the verifier had no evidence to ground this criterion in either
    * tier. Per paper §2, treated as uncontrollable failure → full credit, but
-   * surfaced here so dashboards can flag low-confidence verdicts.
+   * surfaced here so dashboards can flag low-confidence results.
    */
   evidenceInsufficient?: boolean;
 }
@@ -228,7 +200,7 @@ export interface FirstPointOfFailure {
  * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
  *
  * Not produced for every task: when nothing actionable surfaces, the
- * `findings` array on the Verdict is empty. Consumers should treat the
+ * `findings` array on the EvaluationResult is empty. Consumers should treat the
  * field as advisory, not as part of the formal score.
  */
 export interface VerifierFinding {
@@ -288,29 +260,33 @@ export interface TaskValidity {
 }
 
 /**
- * The verifier's output. Process score + outcome verdict + diagnostic signals.
+ * Evaluator output. Legacy evaluation may only populate outcome fields; richer
+ * verifier backends can also populate process scoring and diagnostics.
  *
- * Process and outcome are deliberately independent (paper §2): an agent can
- * follow the right steps but get blocked (high process, low outcome), or
- * succeed through an unexpected path (variable process, high outcome).
+ * Process and outcome are deliberately independent when both are present:
+ * an agent can follow the right steps but get blocked (high process, low
+ * outcome), or succeed through an unexpected path (variable process, high
+ * outcome).
  */
-export interface Verdict {
-  /** Step 8 — did the agent accomplish the task from the user's perspective? */
+export interface EvaluationResult {
+  /** Did the agent accomplish the task from the user's perspective? */
   outcomeSuccess: boolean;
+  /** Human-readable explanation for the outcome. */
+  explanation?: string;
   /** Aggregated earned/max across applicable criteria, in [0, 1]. */
-  processScore: number;
+  processScore?: number;
   /** Per-criterion breakdown after rescoring. */
-  perCriterion: CriterionScore[];
+  perCriterion?: CriterionScore[];
   /** Step 9a — first step where the trajectory went off-track, if any. */
   firstPointOfFailure?: FirstPointOfFailure;
   /** Step 10 — task-itself ambiguity / validity. */
-  taskValidity: TaskValidity;
+  taskValidity?: TaskValidity;
   /**
    * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
    * evidence resolved the question. Treated as uncontrollable → full credit,
    * but flagged here so consumers can decide whether to discount the score.
    */
-  evidenceInsufficient: string[];
+  evidenceInsufficient?: string[];
   /**
    * Structured observations from the verifier that a downstream tool or
    * follow-up agent could act on. Opportunistic — empty when the verifier
@@ -321,16 +297,10 @@ export interface Verdict {
   rawSteps?: VerifierRawSteps;
 }
 
-/** Reason a stub verifier can emit. */
-export type StubVerdictReason =
-  | "stub-verifier"
-  | "no-rubric"
-  | "empty-trajectory";
-
 /**
  * Verifier interface. Implementations consume a Trajectory + TaskSpec and
- * return a Verdict — they MUST NOT touch a live browser.
+ * return an EvaluationResult — they MUST NOT touch a live browser.
  */
 export interface Verifier {
-  verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<Verdict>;
+  verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<EvaluationResult>;
 }
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index a25c9cf44..379cf4589 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -9,7 +9,7 @@ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
 import type {
   EvaluateOptions,
   BatchAskOptions,
-  EvaluationResult,
+  EvaluationResult as LegacyEvaluationResult,
 } from "./v3/types/private/evaluator.js";
 import { V3 } from "./v3/v3.js";
 import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
@@ -17,7 +17,7 @@ import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js";
 import type {
   Trajectory,
   TaskSpec,
-  Verdict,
+  EvaluationResult,
   Rubric,
   Verifier,
   AgentEvidenceModality,
@@ -76,15 +76,18 @@ export class V3Evaluator implements Verifier {
     );
   }
 
-  async ask(options: EvaluateOptions): Promise<EvaluationResult> {
+  async ask(options: EvaluateOptions): Promise<LegacyEvaluationResult> {
     return this.getLegacyBackend("ask").ask(options);
   }
 
-  async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
+  async batchAsk(options: BatchAskOptions): Promise<LegacyEvaluationResult[]> {
     return this.getLegacyBackend("batchAsk").batchAsk(options);
   }
 
-  async verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise<Verdict> {
+  async verify(
+    trajectory: Trajectory,
+    taskSpec: TaskSpec,
+  ): Promise<EvaluationResult> {
     assertVerifierInput(trajectory, taskSpec);
 
     if (this.backend === "legacy") {
@@ -127,14 +130,13 @@ export class V3Evaluator implements Verifier {
   private async verifyTrajectoryWithLegacyEvaluator(
     trajectory: Trajectory,
     taskSpec: TaskSpec,
-  ): Promise<Verdict> {
+  ): Promise<EvaluationResult> {
     const screenshots = collectLegacyScreenshots(trajectory);
     const agentReasoning = renderLegacyAgentReasoning(trajectory);
     const answer = trajectory.finalAnswer;
 
     if (!screenshots.length && !answer) {
-      return legacyInsufficientEvidenceVerdict(
-        taskSpec,
+      return legacyInsufficientEvidenceResult(
         "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
       );
     }
@@ -146,7 +148,7 @@ export class V3Evaluator implements Verifier {
       agentReasoning,
     });
 
-    return legacyEvaluationToVerdict(result, taskSpec, screenshots.length);
+    return legacyEvaluationToResult(result, screenshots.length);
   }
 }
 
@@ -253,20 +255,14 @@ function renderLegacyAgentReasoning(
       .join("\n");
   });
 
-  const sections = [
-    stepLines.length
-      ? `Agent trajectory:\n${stepLines.join("\n\n")}`
-      : undefined,
-    trajectory.finalAnswer
-      ? `Final answer:\n${trajectory.finalAnswer}`
-      : undefined,
-  ].filter(Boolean);
-
-  if (!sections.length) {
+  if (!stepLines.length) {
     return undefined;
   }
 
-  return truncateForPrompt(sections.join("\n\n"), 16000);
+  return truncateForPrompt(
+    `Agent trajectory:\n${stepLines.join("\n\n")}`,
+    16000,
+  );
 }
 
 function stringifyForPrompt(value: unknown): string {
@@ -290,14 +286,12 @@ function truncateForPrompt(value: string, maxLength: number): string {
   return `${value.slice(0, maxLength)}... [truncated]`;
 }
 
-function legacyEvaluationToVerdict(
-  result: EvaluationResult,
-  taskSpec: TaskSpec,
+function legacyEvaluationToResult(
+  result: LegacyEvaluationResult,
   screenshotCount: number,
-): Verdict {
+): EvaluationResult {
   const outcomeSuccess = result.evaluation === "YES";
   const invalid = result.evaluation === "INVALID";
-  const criterion = legacyTaskCompletionCriterion(taskSpec);
   const findings: VerifierFinding[] = invalid
     ? [
         {
@@ -310,22 +304,8 @@ function legacyEvaluationToVerdict(
 
   return {
     outcomeSuccess,
-    processScore: outcomeSuccess ? 1 : 0,
-    perCriterion: [
-      {
-        criterion: criterion.criterion,
-        maxPoints: criterion.maxPoints,
-        earnedPoints: outcomeSuccess ? 1 : 0,
-        justification: result.reasoning,
-        evidenceInsufficient: invalid,
-      },
-    ],
-    taskValidity: {
-      isAmbiguous: false,
-      isInvalid: false,
-    },
-    evidenceInsufficient: invalid ? [criterion.criterion] : [],
-    findings,
+    explanation: result.reasoning,
+    ...(findings.length ? { findings } : {}),
     rawSteps: {
       backend: "legacy",
       legacyEvaluation: result.evaluation,
@@ -334,29 +314,10 @@ function legacyEvaluationToVerdict(
   };
 }
 
-function legacyInsufficientEvidenceVerdict(
-  taskSpec: TaskSpec,
-  reason: string,
-): Verdict {
-  const criterion = legacyTaskCompletionCriterion(taskSpec);
-
+function legacyInsufficientEvidenceResult(reason: string): EvaluationResult {
   return {
     outcomeSuccess: false,
-    processScore: 0,
-    perCriterion: [
-      {
-        criterion: criterion.criterion,
-        maxPoints: criterion.maxPoints,
-        earnedPoints: 0,
-        justification: reason,
-        evidenceInsufficient: true,
-      },
-    ],
-    taskValidity: {
-      isAmbiguous: false,
-      isInvalid: false,
-    },
-    evidenceInsufficient: [criterion.criterion],
+    explanation: reason,
     findings: [
       {
         category: "trajectory_capture",
diff --git a/packages/core/lib/v3LegacyEvaluator.ts b/packages/core/lib/v3LegacyEvaluator.ts
index 64ec89ef2..5662d25ac 100644
--- a/packages/core/lib/v3LegacyEvaluator.ts
+++ b/packages/core/lib/v3LegacyEvaluator.ts
@@ -74,6 +74,7 @@ export class LegacyV3Evaluator {
     if (Array.isArray(screenshot)) {
       return this._evaluateWithMultipleScreenshots({
         question,
+        answer,
         screenshots: screenshot,
         systemPrompt,
         agentReasoning,
@@ -224,12 +225,14 @@ export class LegacyV3Evaluator {
 
   private async _evaluateWithMultipleScreenshots(options: {
     question: string;
+    answer?: string;
     screenshots: Buffer[];
     systemPrompt?: string;
     agentReasoning?: string;
   }): Promise<EvaluationResult> {
     const {
       question,
+      answer,
       screenshots,
       agentReasoning,
       systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
@@ -272,6 +275,9 @@ export class LegacyV3Evaluator {
                   ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
                   : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
               },
+              ...(answer
+                ? [{ type: "text" as const, text: `the answer is ${answer}` }]
+                : []),
               ...imageContents,
             ],
           },
diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts
index eda61d500..e73cde417 100644
--- a/packages/core/tests/unit/public-api/export-surface.test.ts
+++ b/packages/core/tests/unit/public-api/export-surface.test.ts
@@ -43,8 +43,11 @@ const publicApiShape = {
   isZod4Schema: Stagehand.isZod4Schema,
   jsonSchemaToZod: Stagehand.jsonSchemaToZod,
   loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv,
+  loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk,
   localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema,
   modelToAgentProviderMap: Stagehand.modelToAgentProviderMap,
+  nextResultFilename: Stagehand.nextResultFilename,
+  normalizeRubric: Stagehand.normalizeRubric,
   pageTextSchema: Stagehand.pageTextSchema,
   providerEnvVarMap: Stagehand.providerEnvVarMap,
   toGeminiSchema: Stagehand.toGeminiSchema,
diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts
index 5e767ae57..2c2524238 100644
--- a/packages/core/tests/unit/public-api/v3-core.test.ts
+++ b/packages/core/tests/unit/public-api/v3-core.test.ts
@@ -139,7 +139,7 @@ describe("V3 Core public API types", () => {
         (
           trajectory: Stagehand.Trajectory,
           taskSpec: Stagehand.TaskSpec,
-        ) => Promise<Stagehand.Verdict>
+        ) => Promise<Stagehand.EvaluationResult>
       >();
       expectTypeOf<V3EvaluatorInstance["generateRubric"]>().toExtend<
         (taskSpec: Stagehand.TaskSpec) => Promise<Stagehand.Rubric>
diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts
index c755c86b7..b97c93ba2 100644
--- a/packages/core/tests/unit/v3-evaluator.test.ts
+++ b/packages/core/tests/unit/v3-evaluator.test.ts
@@ -48,7 +48,7 @@ describe("V3Evaluator verifier facade", () => {
     );
   });
 
-  it("maps legacy YES evaluations with trajectory screenshots to a successful verdict", async () => {
+  it("maps legacy YES evaluations with trajectory screenshots to a successful result", async () => {
     const taskSpec: TaskSpec = {
       id: "success",
       instruction: "Complete the task",
@@ -69,7 +69,7 @@ describe("V3Evaluator verifier facade", () => {
       value: { ask },
     });
 
-    const verdict = await evaluator.verify(trajectory, taskSpec);
+    const result = await evaluator.verify(trajectory, taskSpec);
 
     expect(ask).toHaveBeenCalledWith(
       expect.objectContaining({
@@ -78,13 +78,10 @@ describe("V3Evaluator verifier facade", () => {
         answer: "The task is complete.",
       }),
     );
-    expect(verdict.outcomeSuccess).toBe(true);
-    expect(verdict.processScore).toBe(1);
-    expect(verdict.perCriterion[0]).toMatchObject({
-      criterion: "legacy-task-completion",
-      earnedPoints: 1,
-      evidenceInsufficient: false,
-    });
+    expect(result.outcomeSuccess).toBe(true);
+    expect(result.explanation).toBe("The screenshot shows completion.");
+    expect(result.processScore).toBeUndefined();
+    expect(result.perCriterion).toBeUndefined();
   });
 
   it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => {
@@ -114,9 +111,11 @@ describe("V3Evaluator verifier facade", () => {
 
     const firstCall = ask.mock.calls[0]?.[0];
     expect(firstCall?.agentReasoning).toContain(longToolOutput);
+    expect(firstCall?.agentReasoning).not.toContain("Final answer:");
+    expect(firstCall?.answer).toBe("The task is complete.");
   });
 
-  it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => {
+  it("returns an evidence-insufficient legacy result for empty trajectories", async () => {
     const taskSpec: TaskSpec = {
       id: "empty",
       instruction: "Complete the task",
@@ -125,13 +124,23 @@ describe("V3Evaluator verifier facade", () => {
       backend: "legacy",
     });
 
-    const verdict = await evaluator.verify(
+    const result = await evaluator.verify(
       makeEmptyTrajectory(taskSpec),
       taskSpec,
     );
 
-    expect(verdict.outcomeSuccess).toBe(false);
-    expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]);
+    expect(result).toMatchObject({
+      outcomeSuccess: false,
+      explanation:
+        "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
+      rawSteps: {
+        backend: "legacy",
+        legacyEvaluation: "INVALID",
+        screenshotCount: 0,
+      },
+    });
+    expect(result.processScore).toBeUndefined();
+    expect(result.perCriterion).toBeUndefined();
   });
 
   it("rejects invalid evaluator backend env values", () => {
diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts
index 51f9c0b8b..4b09e53a1 100644
--- a/packages/core/tests/unit/verifier-trajectory.test.ts
+++ b/packages/core/tests/unit/verifier-trajectory.test.ts
@@ -6,7 +6,7 @@ import { describe, expect, it } from "vitest";
 
 import {
   loadTrajectoryFromDisk,
-  nextVerdictFilename,
+  nextResultFilename,
   normalizeRubric,
 } from "../../lib/v3/verifier/trajectory.js";
 
@@ -54,9 +54,7 @@ describe("verifier trajectory utilities", () => {
           criterion: "Criterion",
           description: "Description",
           maxPoints: 3,
-          earnedPoints: 2,
           condition: "Only if relevant",
-          justification: "Partial credit.",
         },
       ],
     });
@@ -144,9 +142,9 @@ describe("verifier trajectory utilities", () => {
     );
   });
 
-  it("sanitizes verdict filename labels", () => {
-    expect(nextVerdictFilename("rescore / task:one?")).toBe(
-      "mmrubric_rescore___task_one_.json",
+  it("sanitizes result filename labels", () => {
+    expect(nextResultFilename("rescore / task:one?")).toBe(
+      "result_rescore___task_one_.json",
     );
   });
 });

From 60e43217aad0449ce258071d11809b0f04178bbc Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:47:31 -0700
Subject: [PATCH 14/14] docs(verifier): align evaluator changeset wording

---
 .changeset/verifier-evaluator-shell.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.changeset/verifier-evaluator-shell.md b/.changeset/verifier-evaluator-shell.md
index 8e603b499..4cac71a83 100644
--- a/.changeset/verifier-evaluator-shell.md
+++ b/.changeset/verifier-evaluator-shell.md
@@ -2,4 +2,4 @@
 "@browserbasehq/stagehand": patch
 ---
 
-Add verifier trajectory, rubric, and verdict types with normalized public naming.
+Add verifier trajectory, rubric, and evaluation-result types with normalized public naming.