From 5a7af3ca60b00ee34f0e61be969d87b7c3a2835e Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:42:51 -0700 Subject: [PATCH 01/14] feat(verifier): add verifier evaluator shell --- packages/core/lib/v3/index.ts | 23 ++ packages/core/lib/v3/verifier/index.ts | 30 ++ packages/core/lib/v3/verifier/trajectory.ts | 282 ++++++++++++++++++ packages/core/lib/v3/verifier/verifier.ts | 157 ++++++++++ packages/core/lib/v3Evaluator.ts | 245 ++++++++++++++- .../tests/unit/public-api/v3-core.test.ts | 40 +++ 6 files changed, 776 insertions(+), 1 deletion(-) create mode 100644 packages/core/lib/v3/verifier/index.ts create mode 100644 packages/core/lib/v3/verifier/trajectory.ts create mode 100644 packages/core/lib/v3/verifier/verifier.ts diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index ffb6726df..88f999e16 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -64,6 +64,29 @@ export type { V3EvaluatorConstructorOptions, V3EvaluatorOptions, } from "../v3Evaluator.js"; +export type { + Trajectory, + TrajectoryStep, + TrajectoryStatus, + TrajectoryUsage, + TaskSpec, + Rubric, + RubricCriterion, + AgentEvidence, + AgentEvidenceModality, + ProbeEvidence, + ToolOutput, + Verifier, + Verdict, + CriterionScore, + FirstPointOfFailure, + TaskValidity, + VerifierFinding, +} from "./verifier/index.js"; +export { + loadTrajectoryFromDisk, + nextVerdictFilename, +} from "./verifier/index.js"; export { tool } from "ai"; export { getAISDKLanguageModel } from "./llm/LLMProvider.js"; export { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js"; diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts new file mode 100644 index 000000000..b04363458 --- /dev/null +++ b/packages/core/lib/v3/verifier/index.ts @@ -0,0 +1,30 @@ +/** + * Public re-exports for the verifier subsystem. + * + * Wave 0 ships the trajectory + verdict types and a stub verifier. 
The + * RubricVerifier port (Wave 1+) stays internal until the prompts stabilize. + */ +export type { + Trajectory, + TrajectoryStep, + TrajectoryStatus, + TrajectoryUsage, + TaskSpec, + Rubric, + RubricCriterion, + AgentEvidence, + AgentEvidenceModality, + ProbeEvidence, + ToolOutput, +} from "./trajectory.js"; +export { loadTrajectoryFromDisk, nextVerdictFilename } from "./trajectory.js"; + +export type { + Verifier, + Verdict, + CriterionScore, + FirstPointOfFailure, + TaskValidity, + VerifierFinding, + StubVerdictReason, +} from "./verifier.js"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts new file mode 100644 index 000000000..6912dbc74 --- /dev/null +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -0,0 +1,282 @@ +/** + * Trajectory — structured record of an agent run, consumed by the verifier. + * + * Trajectories are produced by the harness (TrajectoryRecorder in + * packages/evals) from the bus events emitted by v3AgentHandler / + * v3CuaAgentHandler. They are persisted on-disk in a layout matching + * microsoft/fara's example_trajectory/ (task_data.json + trajectory.json + + * screenshot_N.png + scores/) so we can cross-validate against + * CUAVerifierBench's verify_trajectories.py without format-conversion. + * + * Two evidence channels per step: + * - agentEvidence ("tier 1") — what the agent's LLM consumed as the tool + * result. For DOM/hybrid agents these are the tool returns (extract JSON, + * ariaTree text, act describe-string, goto URL). For CUA this is the + * screenshot the provider received. + * - probeEvidence ("tier 2") — independent observations the harness took + * around each step (page.screenshot, page.url, optionally a11y). + * + * The verifier consumes both. They can disagree; conflict resolution is the + * verifier's job (see Verdict.evidenceInsufficient + per-criterion logging). + */ + +/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. 
*/ +export interface TrajectoryUsage { + input_tokens: number; + output_tokens: number; + reasoning_tokens?: number; + cached_input_tokens?: number; + inference_time_ms?: number; +} + +/** + * A single criterion in a rubric. Mirrors fara's per-item schema: + * { criterion, description, max_points, justification, earned_points } + * Conditional criteria carry an extra "condition" field; only counted when met. + */ +export interface RubricCriterion { + /** Short name of the criterion (e.g., "Add ground beef to cart"). */ + criterion: string; + /** What to evaluate and how to award partial credit. */ + description: string; + /** Maximum points for this criterion. */ + max_points: number; + /** + * Triggering condition for conditional criteria. Only counted when met + * (paper's "Mutually Exclusive Conditionals" pattern). + */ + condition?: string; + /** Filled by the verifier during scoring; empty in precomputed rubrics. */ + justification?: string; + /** + * Filled by the verifier during scoring; empty string in unscored rubrics. + * Loose type to mirror fara's data, where unscored items carry "" and scored + * items carry a number. + */ + earned_points?: number | string; +} + +/** A rubric — list of criteria for a task. */ +export interface Rubric { + items: RubricCriterion[]; +} + +/** + * Spec for a single task being verified. Carried both at runtime (handed to + * agent.execute) and into the verifier alongside the trajectory. + */ +export interface TaskSpec { + /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */ + id: string; + /** Task instruction shown to the agent. */ + instruction: string; + /** Starting URL, if any. */ + initUrl?: string; + /** + * Rubric carried by the dataset (e.g., WebTailBench's precomputed_rubric). + * If absent, the verifier generates one via Step 0a and caches under + * packages/evals/.rubric-cache/. + */ + precomputedRubric?: Rubric; + /** Optional reference answer (set when dataset ships one). 
*/ + expectedAnswer?: string; +} + +/** + * A single modality unit in tier-1 agent evidence. Mirrors the shape of + * ModelMessage content parts so we can reproduce what the LLM ingested. + */ +export type AgentEvidenceModality = + | { type: "text"; content: string } + | { type: "image"; bytes: Buffer; mediaType: string } + | { type: "json"; content: unknown }; + +/** + * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the + * tool result for this step. + * + * Modes: + * - CUA: usually a single image modality (the screenshot sent to the provider). + * - Hybrid: tool result with optional screenshotBase64 → one image + one text. + * - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities. + */ +export interface AgentEvidence { + modalities: AgentEvidenceModality[]; +} + +/** + * Tier 2 — independent harness probes around this step. Cheap and always-on + * for v0 (just url) and v1 (+a11y, +scroll). v2 adds verifier-requested probes + * keyed on the criterion that requested them. + * + * If a probe wasn't captured, the field is absent (not null). + */ +export interface ProbeEvidence { + /** v0.5 — URL after the step's tool execution. */ + url?: string; + /** + * v0 — bus screenshot (page.screenshot post-step). Path on disk is preferred + * once persisted; in-memory Buffer is used during a live run. + */ + screenshot?: Buffer; + /** Reference to the persisted screenshot file under the trajectory dir. */ + screenshotPath?: string; + /** v1 — viewport scroll context. Lets the verifier reason about "did the agent see the full page". */ + scroll?: { top: number; pageHeight: number }; + /** v1 — accessibility tree snapshot. */ + ariaTree?: string; + /** v2 — verifier-requested probes, keyed by criterion id. */ + onDemand?: Record; +} + +/** Outcome of a single tool execution as seen by the harness. */ +export interface ToolOutput { + ok: boolean; + /** + * The tool's return value. 
Same payload that flowed into agentEvidence + * modalities, but in its native shape (e.g., the extract result, the act + * describe-string) rather than serialized for the LLM. + */ + result: unknown; + error?: string; +} + +/** One step in a trajectory: action + reasoning + evidence + outcome. */ +export interface TrajectoryStep { + index: number; + actionName: string; + actionArgs: Record; + /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */ + reasoning: string; + agentEvidence: AgentEvidence; + probeEvidence: ProbeEvidence; + toolOutput: ToolOutput; + /** ISO 8601 timestamp when the step's tool execution started. */ + startedAt: string; + /** ISO 8601 timestamp when the step's tool execution finished. */ + finishedAt: string; +} + +/** Terminal status of the agent run. */ +export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; + +/** + * Full trajectory for one task run. + * + * The on-disk layout is one directory per task: + * + * .trajectories/// + * ├── task_data.json — TaskSpec + Verdict (filled on completion) + * ├── trajectory.json — this object, with screenshotPath instead of bytes + * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. + * ├── scores/ + * │ └── mmrubric_v1.json — Verdict from V3Evaluator.verify() + * ├── core.log — action log mirroring fara's core.log + * └── times.json — step timing + token usage + */ +export interface Trajectory { + task: TaskSpec; + steps: TrajectoryStep[]; + finalAnswer?: string; + status: TrajectoryStatus; + usage: TrajectoryUsage; + timing: { startedAt: string; endedAt: string }; +} + +// ───────────────────────────────────────────────────────────────────────────── +// On-disk loader +// ───────────────────────────────────────────────────────────────────────────── + +/** + * Hydrate a Trajectory from the on-disk directory layout written by + * TrajectoryRecorder.persist(). 
Used by the offline re-scoring CLI (`bench + * verify`) and by any consumer that wants to feed a saved trajectory back + * into V3Evaluator.verify() without running an agent. + * + * Reverses the recorder's serialization tweaks: + * - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`. + * - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on + * disk (human-readable JSON) instead of raw Buffer; we decode back. + * + * @param dir absolute or cwd-relative path to a `//` directory. + */ +export async function loadTrajectoryFromDisk(dir: string): Promise { + const fs = await import("node:fs/promises"); + const path = await import("node:path"); + + const trajectoryPath = path.join(dir, "trajectory.json"); + const raw = await fs.readFile(trajectoryPath, "utf8"); + const parsed = JSON.parse(raw) as Trajectory & { + steps: Array< + TrajectoryStep & { + agentEvidence: { + modalities: Array< + | { type: "text"; content: string } + | { + type: "image"; + mediaType: string; + // On-disk form (recorder writes base64); accept either to + // tolerate hand-edited fixtures. + bytes?: unknown; + bytesBase64?: string; + } + | { type: "json"; content: unknown } + >; + }; + probeEvidence: ProbeEvidence; + } + >; + }; + + for (const step of parsed.steps) { + // Rehydrate tier-2 probe screenshot from its on-disk file reference. + const probe = step.probeEvidence; + if (probe?.screenshotPath && !probe.screenshot) { + const resolved = path.isAbsolute(probe.screenshotPath) + ? probe.screenshotPath + : path.join(dir, probe.screenshotPath); + try { + probe.screenshot = await fs.readFile(resolved); + } catch { + // Missing screenshot file: leave probe.screenshot unset. The verifier's + // evidence_insufficient path will handle it. + } + } + + // Decode image modalities from base64 back to Buffer. 
+ if (step.agentEvidence?.modalities) { + step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => { + // The on-disk shape carries bytesBase64 instead of bytes, so we look + // through `unknown` here rather than rely on the typed union. + const raw = m as unknown as { bytesBase64?: string }; + if (m.type === "image" && typeof raw.bytesBase64 === "string") { + return { + type: "image" as const, + bytes: Buffer.from(raw.bytesBase64, "base64"), + mediaType: m.mediaType, + }; + } + return m as AgentEvidenceModality; + }); + } + } + + return parsed; +} + +/** + * Build a `mmrubric_*.json` verdict filename for an offline re-scoring run. + * Pure string formatting: no directory is read, so reusing a label reuses the name. + * + * Convention: prefer a label-based name (e.g., `mmrubric_rescore-2026-05-11.json`) + * over numeric versioning so multiple offline rescore attempts coexist without + * collisions and remain easy to diff. Falls back to a timestamp if the caller + * doesn't provide a label. + */ +export function nextVerdictFilename(label?: string): string { + const safeLabel = (label ?? `rescore-${new Date().toISOString()}`).replace( + /[^A-Za-z0-9._-]/g, + "_", + ); + return `mmrubric_${safeLabel}.json`; +} diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts new file mode 100644 index 000000000..49994774c --- /dev/null +++ b/packages/core/lib/v3/verifier/verifier.ts @@ -0,0 +1,157 @@ +/** + * Verifier — interface and result types for the rubric-based verifier that + * replaces V3Evaluator's single-pass YES/NO judge. + * + * Modeled on microsoft/fara's MMRubricAgent (arxiv 2511.19663, "The Art of + * Building Verifiers for Computer Use Agents"). The verifier never touches a + * live browser — it consumes a Trajectory + TaskSpec and returns a structured + * Verdict. That property is what lets us re-score saved trajectories offline. 
+ * + * Wave 0 ships only the types and a stub implementation (`evidence_insufficient` + * for everything). Wave 1 ports the MMRubricAgent pipeline (Steps 1–6 + Step 8). + */ + +import type { Trajectory, TaskSpec } from "./trajectory.js"; + +/** Score for a single rubric criterion after evidence analysis + rescoring. */ +export interface CriterionScore { + /** Matches RubricCriterion.criterion (the criterion's short name). */ + criterion: string; + /** Maximum possible points for this criterion. */ + maxPoints: number; + /** + * Points earned post-evidence-analysis (paper's post_image_earned_points). + * Null if the criterion was conditional and its condition wasn't met (excluded + * from both numerator and denominator in the process score). + */ + earnedPoints: number | null; + /** Verifier's free-text justification for the score. */ + justification: string; + /** + * True if the criterion is conditional and its condition was determined to + * be met. Absent for non-conditional criteria. + */ + conditionMet?: boolean; + /** + * Set when the verifier had no evidence to ground this criterion in either + * tier. Per paper §2, treated as uncontrollable failure → full credit, but + * surfaced here so dashboards can flag low-confidence verdicts. + */ + evidenceInsufficient?: boolean; +} + +/** + * First-point-of-failure analysis (paper Step 9a). Identifies the earliest + * step where the agent's trajectory went off-track, using a structured error + * taxonomy (7 top-level categories, 1.1–7.4 sub-codes). + */ +export interface FirstPointOfFailure { + stepIndex: number; + /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */ + errorCode: string; + /** Top-level category name (Selection, Hallucination, etc.). */ + category: string; + /** Verifier's reasoning for selecting this point. */ + description?: string; +} + +/** + * Structured observation surfaced by the verifier that another agent or + * tooling could act on. 
Findings are emitted opportunistically by Step 8 + * (outcome verification) when the verifier notices actionable patterns — + * repeated tool-call failures, ambiguous task specs, evidence gaps, etc. + * + * Not produced for every task: when nothing actionable surfaces, the + * `findings` array on the Verdict is empty. Consumers should treat the + * field as advisory, not as part of the formal score. + */ +export interface VerifierFinding { + /** + * Category of the observation. Open-ended enum — additional categories may + * be added as Wave 2/3 verifier steps surface new failure modes. + */ + category: + | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries) + | "agent_strategy" // higher-level planning / decision-making problems + | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory + | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps) + | "task_specification" // task instruction was ambiguous / under- or over-specified + | "verifier_uncertainty" // verifier itself couldn't confidently decide + | "other"; + /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */ + severity: "info" | "warning" | "blocking"; + /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */ + description: string; + /** + * Optional concrete next action another agent could take. Should be + * specific enough that it can be acted on without further reasoning — + * e.g., "Try double_click instead of triple_click to clear placeholder + * text on this form field." + */ + suggestedAction?: string; + /** Step indices in the trajectory where this pattern showed up. */ + relatedSteps?: number[]; +} + +/** Task-validity classification (paper Step 10). */ +export interface TaskValidity { + /** True if the task is underspecified / has multiple valid interpretations. 
*/ + isAmbiguous: boolean; + /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */ + isInvalid: boolean; + /** Optional sub-codes from the task-classification taxonomy. */ + ambiguityCodes?: string[]; + invalidTaskCodes?: string[]; +} + +/** + * The verifier's output. Process score + outcome verdict + diagnostic signals. + * + * Process and outcome are deliberately independent (paper §2): an agent can + * follow the right steps but get blocked (high process, low outcome), or + * succeed through an unexpected path (variable process, high outcome). + */ +export interface Verdict { + /** Step 8 — did the agent accomplish the task from the user's perspective? */ + outcomeSuccess: boolean; + /** Aggregated earned/max across applicable criteria, in [0, 1]. */ + processScore: number; + /** Per-criterion breakdown after rescoring. */ + perCriterion: CriterionScore[]; + /** Step 9a — first step where the trajectory went off-track, if any. */ + firstPointOfFailure?: FirstPointOfFailure; + /** Step 10 — task-itself ambiguity / validity. */ + taskValidity: TaskValidity; + /** + * Ids (RubricCriterion.criterion strings) of criteria where neither tier of + * evidence resolved the question. Treated as uncontrollable → full credit, + * but flagged here so consumers can decide whether to discount the score. + */ + evidenceInsufficient: string[]; + /** + * Structured observations from the verifier that a downstream tool or + * follow-up agent could act on. Opportunistic — empty when the verifier + * doesn't notice anything actionable. Not part of the score; advisory. + */ + findings?: VerifierFinding[]; + /** + * Intermediate per-step data — the paper's intermediate_mm_rubric_steps + * payload. Opaque shape; useful for debugging and prompt iteration, but not + * part of the stable contract. + */ + rawSteps?: unknown; +} + +/** Reason a stub verifier emits when the rubric pipeline hasn't shipped yet. 
*/ +export type StubVerdictReason = + | "wave-0-stub" + | "no-rubric" + | "empty-trajectory"; + +/** + * Verifier interface. Implementations consume a Trajectory + TaskSpec and + * return a Verdict — they MUST NOT touch a live browser. + */ +export interface Verifier { + verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; +} diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index e1d384f8c..73009b0ad 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -14,6 +14,15 @@ import type { import { V3 } from "./v3/v3.js"; import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js"; +import type { + Trajectory, + TaskSpec, + Verdict, + Rubric, + Verifier, + AgentEvidenceModality, + VerifierFinding, +} from "./v3/verifier/index.js"; const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND"; const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy"; @@ -43,7 +52,7 @@ type NormalizedConstructorOptions = { backend?: V3EvaluatorBackend; }; -export class V3Evaluator { +export class V3Evaluator implements Verifier { private readonly backend: V3EvaluatorBackend; private readonly legacyEvaluator: LegacyV3Evaluator; @@ -75,15 +84,70 @@ export class V3Evaluator { return this.getLegacyBackend("batchAsk").batchAsk(options); } + async verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise { + assertVerifierInput(trajectory, taskSpec); + + if (this.backend === "legacy") { + return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec); + } + + return this.unavailableVerifierBackend("verify"); + } + + async generateRubric(taskSpec: TaskSpec): Promise { + if (!taskSpec?.id) { + throw new StagehandInvalidArgumentError( + "TaskSpec.id is required for rubric generation", + ); + } + + if (this.backend === "verifier") { + return this.unavailableVerifierBackend("generateRubric"); + } + + return { + items: 
[legacyTaskCompletionCriterion(taskSpec)], + }; + } + private getLegacyBackend(methodName: string): LegacyV3Evaluator { if (this.backend === "legacy") { return this.legacyEvaluator; } + return this.unavailableVerifierBackend(methodName); + } + + private unavailableVerifierBackend(methodName: string): never { throw new StagehandInvalidArgumentError( `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`, ); } + + private async verifyTrajectoryWithLegacyEvaluator( + trajectory: Trajectory, + taskSpec: TaskSpec, + ): Promise { + const screenshots = collectLegacyScreenshots(trajectory); + const agentReasoning = renderLegacyAgentReasoning(trajectory); + const answer = trajectory.finalAnswer; + + if (!screenshots.length && !answer) { + return legacyInsufficientEvidenceVerdict( + taskSpec, + "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.", + ); + } + + const result = await this.legacyEvaluator.ask({ + question: taskSpec.instruction, + screenshot: screenshots.length ? screenshots : false, + answer, + agentReasoning, + }); + + return legacyEvaluationToVerdict(result, taskSpec, screenshots.length); + } } function normalizeConstructorOptions( @@ -127,3 +191,182 @@ function resolveEvaluatorBackend( `Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". 
Expected "legacy" or "verifier".`, ); } + +function assertVerifierInput(trajectory: Trajectory, taskSpec: TaskSpec): void { + if (!taskSpec?.id) { + throw new StagehandInvalidArgumentError( + "TaskSpec.id is required for verification", + ); + } + if (!trajectory) { + throw new StagehandInvalidArgumentError( + "Trajectory is required for verification", + ); + } +} + +function legacyTaskCompletionCriterion(taskSpec: TaskSpec) { + return { + criterion: "legacy-task-completion", + description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`, + max_points: 1, + }; +} + +function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] { + const screenshots: Buffer[] = []; + + for (const step of trajectory.steps ?? []) { + if (Buffer.isBuffer(step.probeEvidence?.screenshot)) { + screenshots.push(step.probeEvidence.screenshot); + continue; + } + + const agentImage = step.agentEvidence?.modalities?.find( + ( + modality, + ): modality is Extract => + modality.type === "image" && Buffer.isBuffer(modality.bytes), + ); + + if (agentImage) { + screenshots.push(agentImage.bytes); + } + } + + return screenshots; +} + +function renderLegacyAgentReasoning( + trajectory: Trajectory, +): string | undefined { + const stepLines = (trajectory.steps ?? []).map((step) => { + const output = step.toolOutput?.error + ? `Tool error: ${step.toolOutput.error}` + : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`; + return [ + `Step ${step.index}: ${step.actionName}`, + step.reasoning ? `Reasoning: ${step.reasoning}` : undefined, + output, + ] + .filter(Boolean) + .join("\n"); + }); + + const sections = [ + stepLines.length + ? `Agent trajectory:\n${stepLines.join("\n\n")}` + : undefined, + trajectory.finalAnswer + ? 
`Final answer:\n${trajectory.finalAnswer}` + : undefined, + ].filter(Boolean); + + if (!sections.length) { + return undefined; + } + + return truncateForPrompt(sections.join("\n\n"), 16000); +} + +function stringifyForPrompt(value: unknown): string { + if (typeof value === "string") { + return truncateForPrompt(value, 2000); + } + + try { + return truncateForPrompt(JSON.stringify(value), 2000); + } catch { + return String(value); + } +} + +function truncateForPrompt(value: string, maxLength: number): string { + if (value.length <= maxLength) { + return value; + } + + return `${value.slice(0, maxLength)}... [truncated]`; +} + +function legacyEvaluationToVerdict( + result: EvaluationResult, + taskSpec: TaskSpec, + screenshotCount: number, +): Verdict { + const outcomeSuccess = result.evaluation === "YES"; + const invalid = result.evaluation === "INVALID"; + const criterion = legacyTaskCompletionCriterion(taskSpec); + const findings: VerifierFinding[] = invalid + ? [ + { + category: "verifier_uncertainty", + severity: "warning", + description: result.reasoning, + }, + ] + : []; + + return { + outcomeSuccess, + processScore: outcomeSuccess ? 1 : 0, + perCriterion: [ + { + criterion: criterion.criterion, + maxPoints: criterion.max_points, + earnedPoints: outcomeSuccess ? 1 : 0, + justification: result.reasoning, + evidenceInsufficient: invalid, + }, + ], + taskValidity: { + isAmbiguous: false, + isInvalid: false, + }, + evidenceInsufficient: invalid ? 
[criterion.criterion] : [], + findings, + rawSteps: { + backend: "legacy", + legacyEvaluation: result.evaluation, + screenshotCount, + }, + }; +} + +function legacyInsufficientEvidenceVerdict( + taskSpec: TaskSpec, + reason: string, +): Verdict { + const criterion = legacyTaskCompletionCriterion(taskSpec); + + return { + outcomeSuccess: false, + processScore: 0, + perCriterion: [ + { + criterion: criterion.criterion, + maxPoints: criterion.max_points, + earnedPoints: 0, + justification: reason, + evidenceInsufficient: true, + }, + ], + taskValidity: { + isAmbiguous: false, + isInvalid: false, + }, + evidenceInsufficient: [criterion.criterion], + findings: [ + { + category: "trajectory_capture", + severity: "blocking", + description: reason, + }, + ], + rawSteps: { + backend: "legacy", + legacyEvaluation: "INVALID", + screenshotCount: 0, + }, + }; +} diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts index 8d710da4d..d1975a48c 100644 --- a/packages/core/tests/unit/public-api/v3-core.test.ts +++ b/packages/core/tests/unit/public-api/v3-core.test.ts @@ -134,6 +134,18 @@ describe("V3 Core public API types", () => { >(); }); + it("has verifier facade methods", () => { + expectTypeOf().toExtend< + ( + trajectory: Stagehand.Trajectory, + taskSpec: Stagehand.TaskSpec, + ) => Promise + >(); + expectTypeOf().toExtend< + (taskSpec: Stagehand.TaskSpec) => Promise + >(); + }); + it("accepts legacy evaluator backend options", () => { const mockV3 = {} as Stagehand.Stagehand; expectTypeOf().toBeConstructibleWith( @@ -156,6 +168,34 @@ describe("V3 Core public API types", () => { ); }); + it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => { + const taskSpec: Stagehand.TaskSpec = { + id: "empty", + instruction: "Complete the task", + }; + const trajectory: Stagehand.Trajectory = { + task: taskSpec, + steps: [], + status: "complete", + usage: { + input_tokens: 0, + output_tokens: 
0, + }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + }; + const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, { + backend: "legacy", + }); + + const verdict = await evaluator.verify(trajectory, taskSpec); + + expect(verdict.outcomeSuccess).toBe(false); + expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]); + }); + it("rejects invalid evaluator backend env values", () => { const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; From fadc5a8b4246497a31c6ba9ffdaf35764bacfbc0 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:16:05 -0700 Subject: [PATCH 02/14] fix(verifier): normalize public rubric naming --- .changeset/verifier-evaluator-shell.md | 5 ++ packages/core/lib/v3/index.ts | 5 ++ packages/core/lib/v3/verifier/index.ts | 10 ++- packages/core/lib/v3/verifier/trajectory.ts | 75 +++++++++++++++++---- packages/core/lib/v3/verifier/verifier.ts | 25 +++++-- packages/core/lib/v3Evaluator.ts | 6 +- 6 files changed, 104 insertions(+), 22 deletions(-) create mode 100644 .changeset/verifier-evaluator-shell.md diff --git a/.changeset/verifier-evaluator-shell.md b/.changeset/verifier-evaluator-shell.md new file mode 100644 index 000000000..8e603b499 --- /dev/null +++ b/.changeset/verifier-evaluator-shell.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add verifier trajectory, rubric, and verdict types with normalized public naming. 
diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 88f999e16..8fdcc6b75 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -72,6 +72,9 @@ export type { TaskSpec, Rubric, RubricCriterion, + SerializedRubric, + SerializedRubricCriterion, + RubricInput, AgentEvidence, AgentEvidenceModality, ProbeEvidence, @@ -82,10 +85,12 @@ export type { FirstPointOfFailure, TaskValidity, VerifierFinding, + VerifierRawSteps, } from "./verifier/index.js"; export { loadTrajectoryFromDisk, nextVerdictFilename, + normalizeRubric, } from "./verifier/index.js"; export { tool } from "ai"; export { getAISDKLanguageModel } from "./llm/LLMProvider.js"; diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index b04363458..b39c94488 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -12,12 +12,19 @@ export type { TaskSpec, Rubric, RubricCriterion, + SerializedRubric, + SerializedRubricCriterion, + RubricInput, AgentEvidence, AgentEvidenceModality, ProbeEvidence, ToolOutput, } from "./trajectory.js"; -export { loadTrajectoryFromDisk, nextVerdictFilename } from "./trajectory.js"; +export { + loadTrajectoryFromDisk, + nextVerdictFilename, + normalizeRubric, +} from "./trajectory.js"; export type { Verifier, @@ -26,5 +33,6 @@ export type { FirstPointOfFailure, TaskValidity, VerifierFinding, + VerifierRawSteps, StubVerdictReason, } from "./verifier.js"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 6912dbc74..d8228c287 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -3,10 +3,8 @@ * * Trajectories are produced by the harness (TrajectoryRecorder in * packages/evals) from the bus events emitted by v3AgentHandler / - * v3CuaAgentHandler. 
They are persisted on-disk in a layout matching - * microsoft/fara's example_trajectory/ (task_data.json + trajectory.json + - * screenshot_N.png + scores/) so we can cross-validate against - * CUAVerifierBench's verify_trajectories.py without format-conversion. + * v3CuaAgentHandler. They can be persisted on disk and reloaded for offline + * verifier scoring. * * Two evidence channels per step: * - agentEvidence ("tier 1") — what the agent's LLM consumed as the tool @@ -30,9 +28,9 @@ export interface TrajectoryUsage { } /** - * A single criterion in a rubric. Mirrors fara's per-item schema: - * { criterion, description, max_points, justification, earned_points } - * Conditional criteria carry an extra "condition" field; only counted when met. + * A single criterion in a Stagehand rubric. Dataset and model wire formats may + * use fara-style `max_points` / `earned_points`; normalize those with + * `normalizeRubric()` at the boundary. */ export interface RubricCriterion { /** Short name of the criterion (e.g., "Add ground beef to cart"). */ @@ -40,7 +38,7 @@ export interface RubricCriterion { /** What to evaluate and how to award partial credit. */ description: string; /** Maximum points for this criterion. */ - max_points: number; + maxPoints: number; /** * Triggering condition for conditional criteria. Only counted when met * (paper's "Mutually Exclusive Conditionals" pattern). @@ -49,11 +47,10 @@ export interface RubricCriterion { /** Filled by the verifier during scoring; empty in precomputed rubrics. */ justification?: string; /** - * Filled by the verifier during scoring; empty string in unscored rubrics. - * Loose type to mirror fara's data, where unscored items carry "" and scored - * items carry a number. + * Filled by the verifier during scoring; empty string in some serialized + * upstream rubrics and a number in scored rubrics. */ - earned_points?: number | string; + earnedPoints?: number | string; } /** A rubric — list of criteria for a task. 
*/ @@ -61,6 +58,60 @@ export interface Rubric { items: RubricCriterion[]; } +/** + * FARA/upstream rubric item shape as stored in datasets and prompt responses. + * Keep this at IO boundaries; core verifier types use camelCase. + */ +export interface SerializedRubricCriterion { + criterion: string; + description: string; + max_points: number; + condition?: string; + justification?: string; + earned_points?: number | string; +} + +/** Serialized rubric shape used by upstream datasets and generated JSON. */ +export interface SerializedRubric { + items: SerializedRubricCriterion[]; +} + +export type RubricInput = Rubric | SerializedRubric; + +/** Convert a Stagehand or serialized rubric into the public Stagehand shape. */ +export function normalizeRubric( + rubric: RubricInput | null | undefined, +): Rubric | undefined { + if (!rubric) return undefined; + + return { + items: rubric.items.map((item) => { + const raw = item as RubricCriterion & + Partial; + const maxPoints = + typeof raw.maxPoints === "number" ? raw.maxPoints : raw.max_points; + + if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) { + throw new TypeError( + `Rubric criterion "${raw.criterion}" is missing a numeric maxPoints value`, + ); + } + + const earnedPoints = raw.earnedPoints ?? raw.earned_points; + return { + criterion: raw.criterion, + description: raw.description, + maxPoints, + ...(raw.condition !== undefined && { condition: raw.condition }), + ...(raw.justification !== undefined && { + justification: raw.justification, + }), + ...(earnedPoints !== undefined && { earnedPoints }), + }; + }), + }; +} + /** * Spec for a single task being verified. Carried both at runtime (handed to * agent.execute) and into the verifier alongside the trajectory. 
diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts index 49994774c..659675486 100644 --- a/packages/core/lib/v3/verifier/verifier.ts +++ b/packages/core/lib/v3/verifier/verifier.ts @@ -93,6 +93,23 @@ export interface VerifierFinding { relatedSteps?: number[]; } +/** Stable debugging summary emitted by verifier backends. */ +export interface VerifierRawSteps { + backend?: "legacy" | "verifier"; + primaryIntent?: string; + reasoning?: string; + rubricSource?: "precomputed" | "generated" | "none"; + approach?: "a" | "b"; + optionalsMode?: "folded" | "separate" | "skip"; + totalEarned?: number; + totalMax?: number; + evidenceImages?: number; + evidenceTexts?: number; + evidenceOriginalScreenshots?: number; + legacyEvaluation?: string; + screenshotCount?: number; +} + /** Task-validity classification (paper Step 10). */ export interface TaskValidity { /** True if the task is underspecified / has multiple valid interpretations. */ @@ -134,12 +151,8 @@ export interface Verdict { * doesn't notice anything actionable. Not part of the score; advisory. */ findings?: VerifierFinding[]; - /** - * Intermediate per-step data — the paper's intermediate_mm_rubric_steps - * payload. Opaque shape; useful for debugging and prompt iteration, but not - * part of the stable contract. - */ - rawSteps?: unknown; + /** Debugging summary from the active evaluator backend. */ + rawSteps?: VerifierRawSteps; } /** Reason a stub verifier emits when the rubric pipeline hasn't shipped yet. 
*/ diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index 73009b0ad..8adc7d5f1 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -209,7 +209,7 @@ function legacyTaskCompletionCriterion(taskSpec: TaskSpec) { return { criterion: "legacy-task-completion", description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`, - max_points: 1, + maxPoints: 1, }; } @@ -313,7 +313,7 @@ function legacyEvaluationToVerdict( perCriterion: [ { criterion: criterion.criterion, - maxPoints: criterion.max_points, + maxPoints: criterion.maxPoints, earnedPoints: outcomeSuccess ? 1 : 0, justification: result.reasoning, evidenceInsufficient: invalid, @@ -345,7 +345,7 @@ function legacyInsufficientEvidenceVerdict( perCriterion: [ { criterion: criterion.criterion, - maxPoints: criterion.max_points, + maxPoints: criterion.maxPoints, earnedPoints: 0, justification: reason, evidenceInsufficient: true, From 2765781ad3aa90175578f9518bd51a78b1b42f0b Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:21:01 -0700 Subject: [PATCH 03/14] style(verifier): format rubric normalizer --- packages/core/lib/v3/verifier/trajectory.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index d8228c287..d992ce9ee 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -86,8 +86,7 @@ export function normalizeRubric( return { items: rubric.items.map((item) => { - const raw = item as RubricCriterion & - Partial; + const raw = item as RubricCriterion & Partial; const maxPoints = typeof raw.maxPoints === "number" ? 
raw.maxPoints : raw.max_points; From 0088a3c3c62fa4d8ffa18afbc6cd1e863716f256 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:34:48 -0700 Subject: [PATCH 04/14] chore(verifier): remove upstream verifier references --- packages/core/lib/v3/verifier/trajectory.ts | 6 +++--- packages/core/lib/v3/verifier/verifier.ts | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index d992ce9ee..6fb604d11 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -29,7 +29,7 @@ export interface TrajectoryUsage { /** * A single criterion in a Stagehand rubric. Dataset and model wire formats may - * use fara-style `max_points` / `earned_points`; normalize those with + * use serialized `max_points` / `earned_points`; normalize those with * `normalizeRubric()` at the boundary. */ export interface RubricCriterion { @@ -59,7 +59,7 @@ export interface Rubric { } /** - * FARA/upstream rubric item shape as stored in datasets and prompt responses. + * Serialized rubric item shape as stored in datasets and prompt responses. * Keep this at IO boundaries; core verifier types use camelCase. */ export interface SerializedRubricCriterion { @@ -221,7 +221,7 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. 
* ├── scores/ * │ └── mmrubric_v1.json — Verdict from V3Evaluator.verify() - * ├── core.log — action log mirroring fara's core.log + * ├── core.log — captured action log * └── times.json — step timing + token usage */ export interface Trajectory { diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts index 659675486..02461d77e 100644 --- a/packages/core/lib/v3/verifier/verifier.ts +++ b/packages/core/lib/v3/verifier/verifier.ts @@ -2,13 +2,13 @@ * Verifier — interface and result types for the rubric-based verifier that * replaces V3Evaluator's single-pass YES/NO judge. * - * Modeled on microsoft/fara's MMRubricAgent (arxiv 2511.19663, "The Art of - * Building Verifiers for Computer Use Agents"). The verifier never touches a - * live browser — it consumes a Trajectory + TaskSpec and returns a structured - * Verdict. That property is what lets us re-score saved trajectories offline. + * Modeled on rubric-based verifier pipelines for computer-use agents. The + * verifier never touches a live browser — it consumes a Trajectory + TaskSpec + * and returns a structured Verdict. That property is what lets us re-score + * saved trajectories offline. * * Wave 0 ships only the types and a stub implementation (`evidence_insufficient` - * for everything). Wave 1 ports the MMRubricAgent pipeline (Steps 1–6 + Step 8). + * for everything). Wave 1 adds the rubric generation/scoring pipeline. 
*/ import type { Trajectory, TaskSpec } from "./trajectory.js"; From 5b4479549fbf79e7207da4b850fab4062c6d5695 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:18:50 -0700 Subject: [PATCH 05/14] docs(verifier): remove rollout comments from public types --- packages/core/lib/v3/verifier/index.ts | 3 --- packages/core/lib/v3/verifier/verifier.ts | 5 +---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index b39c94488..144ab033b 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -1,8 +1,5 @@ /** * Public re-exports for the verifier subsystem. - * - * Wave 0 ships the trajectory + verdict types and a stub verifier. The - * RubricVerifier port (Wave 1+) stays internal until the prompts stabilize. */ export type { Trajectory, diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts index 02461d77e..3ae764b2a 100644 --- a/packages/core/lib/v3/verifier/verifier.ts +++ b/packages/core/lib/v3/verifier/verifier.ts @@ -6,9 +6,6 @@ * verifier never touches a live browser — it consumes a Trajectory + TaskSpec * and returns a structured Verdict. That property is what lets us re-score * saved trajectories offline. - * - * Wave 0 ships only the types and a stub implementation (`evidence_insufficient` - * for everything). Wave 1 adds the rubric generation/scoring pipeline. */ import type { Trajectory, TaskSpec } from "./trajectory.js"; @@ -68,7 +65,7 @@ export interface FirstPointOfFailure { export interface VerifierFinding { /** * Category of the observation. Open-ended enum — additional categories may - * be added as Wave 2/3 verifier steps surface new failure modes. + * be added as verifier backends surface new failure modes. 
*/ category: | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries) From 5e883781bc5e5b925e6a56929b38eaa26beab83e Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:22:43 -0700 Subject: [PATCH 06/14] refactor(verifier): consolidate public types --- packages/core/lib/v3/verifier/index.ts | 39 ++- packages/core/lib/v3/verifier/trajectory.ts | 227 ++----------- packages/core/lib/v3/verifier/types.ts | 339 ++++++++++++++++++++ packages/core/lib/v3/verifier/verifier.ts | 177 +--------- 4 files changed, 393 insertions(+), 389 deletions(-) create mode 100644 packages/core/lib/v3/verifier/types.ts diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 144ab033b..1b76eb388 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -2,34 +2,31 @@ * Public re-exports for the verifier subsystem. */ export type { - Trajectory, - TrajectoryStep, - TrajectoryStatus, - TrajectoryUsage, - TaskSpec, + AgentEvidence, + AgentEvidenceModality, + CriterionScore, + FirstPointOfFailure, + ProbeEvidence, Rubric, RubricCriterion, + RubricInput, SerializedRubric, SerializedRubricCriterion, - RubricInput, - AgentEvidence, - AgentEvidenceModality, - ProbeEvidence, + StubVerdictReason, + TaskSpec, + TaskValidity, ToolOutput, -} from "./trajectory.js"; + Trajectory, + TrajectoryStatus, + TrajectoryStep, + TrajectoryUsage, + Verdict, + Verifier, + VerifierFinding, + VerifierRawSteps, +} from "./types.js"; export { loadTrajectoryFromDisk, nextVerdictFilename, normalizeRubric, } from "./trajectory.js"; - -export type { - Verifier, - Verdict, - CriterionScore, - FirstPointOfFailure, - TaskValidity, - VerifierFinding, - VerifierRawSteps, - StubVerdictReason, -} from "./verifier.js"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 6fb604d11..3dbb7e5a6 100644 --- 
a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -1,82 +1,29 @@ -/** - * Trajectory — structured record of an agent run, consumed by the verifier. - * - * Trajectories are produced by the harness (TrajectoryRecorder in - * packages/evals) from the bus events emitted by v3AgentHandler / - * v3CuaAgentHandler. They can be persisted on disk and reloaded for offline - * verifier scoring. - * - * Two evidence channels per step: - * - agentEvidence ("tier 1") — what the agent's LLM consumed as the tool - * result. For DOM/hybrid agents these are the tool returns (extract JSON, - * ariaTree text, act describe-string, goto URL). For CUA this is the - * screenshot the provider received. - * - probeEvidence ("tier 2") — independent observations the harness took - * around each step (page.screenshot, page.url, optionally a11y). - * - * The verifier consumes both. They can disagree; conflict resolution is the - * verifier's job (see Verdict.evidenceInsufficient + per-criterion logging). - */ - -/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */ -export interface TrajectoryUsage { - input_tokens: number; - output_tokens: number; - reasoning_tokens?: number; - cached_input_tokens?: number; - inference_time_ms?: number; -} - -/** - * A single criterion in a Stagehand rubric. Dataset and model wire formats may - * use serialized `max_points` / `earned_points`; normalize those with - * `normalizeRubric()` at the boundary. - */ -export interface RubricCriterion { - /** Short name of the criterion (e.g., "Add ground beef to cart"). */ - criterion: string; - /** What to evaluate and how to award partial credit. */ - description: string; - /** Maximum points for this criterion. */ - maxPoints: number; - /** - * Triggering condition for conditional criteria. Only counted when met - * (paper's "Mutually Exclusive Conditionals" pattern). 
- */ - condition?: string; - /** Filled by the verifier during scoring; empty in precomputed rubrics. */ - justification?: string; - /** - * Filled by the verifier during scoring; empty string in some serialized - * upstream rubrics and a number in scored rubrics. - */ - earnedPoints?: number | string; -} - -/** A rubric — list of criteria for a task. */ -export interface Rubric { - items: RubricCriterion[]; -} - -/** - * Serialized rubric item shape as stored in datasets and prompt responses. - * Keep this at IO boundaries; core verifier types use camelCase. - */ -export interface SerializedRubricCriterion { - criterion: string; - description: string; - max_points: number; - condition?: string; - justification?: string; - earned_points?: number | string; -} - -/** Serialized rubric shape used by upstream datasets and generated JSON. */ -export interface SerializedRubric { - items: SerializedRubricCriterion[]; -} - -export type RubricInput = Rubric | SerializedRubric; +import type { + ProbeEvidence, + Rubric, + RubricCriterion, + RubricInput, + SerializedRubricCriterion, + Trajectory, + TrajectoryStep, +} from "./types.js"; + +export type { + AgentEvidence, + AgentEvidenceModality, + ProbeEvidence, + Rubric, + RubricCriterion, + RubricInput, + SerializedRubric, + SerializedRubricCriterion, + TaskSpec, + ToolOutput, + Trajectory, + TrajectoryStatus, + TrajectoryStep, + TrajectoryUsage, +} from "./types.js"; /** Convert a Stagehand or serialized rubric into the public Stagehand shape. */ export function normalizeRubric( @@ -111,128 +58,6 @@ export function normalizeRubric( }; } -/** - * Spec for a single task being verified. Carried both at runtime (handed to - * agent.execute) and into the verifier alongside the trajectory. - */ -export interface TaskSpec { - /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */ - id: string; - /** Task instruction shown to the agent. */ - instruction: string; - /** Starting URL, if any. 
*/ - initUrl?: string; - /** - * Rubric carried by the dataset (e.g., WebTailBench's precomputed_rubric). - * If absent, the verifier generates one via Step 0a and caches under - * packages/evals/.rubric-cache/. - */ - precomputedRubric?: Rubric; - /** Optional reference answer (set when dataset ships one). */ - expectedAnswer?: string; -} - -/** - * A single modality unit in tier-1 agent evidence. Mirrors the shape of - * ModelMessage content parts so we can reproduce what the LLM ingested. - */ -export type AgentEvidenceModality = - | { type: "text"; content: string } - | { type: "image"; bytes: Buffer; mediaType: string } - | { type: "json"; content: unknown }; - -/** - * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the - * tool result for this step. - * - * Modes: - * - CUA: usually a single image modality (the screenshot sent to the provider). - * - Hybrid: tool result with optional screenshotBase64 → one image + one text. - * - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities. - */ -export interface AgentEvidence { - modalities: AgentEvidenceModality[]; -} - -/** - * Tier 2 — independent harness probes around this step. Cheap and always-on - * for v0 (just url) and v1 (+a11y, +scroll). v2 adds verifier-requested probes - * keyed on the criterion that requested them. - * - * If a probe wasn't captured, the field is absent (not null). - */ -export interface ProbeEvidence { - /** v0.5 — URL after the step's tool execution. */ - url?: string; - /** - * v0 — bus screenshot (page.screenshot post-step). Path on disk is preferred - * once persisted; in-memory Buffer is used during a live run. - */ - screenshot?: Buffer; - /** Reference to the persisted screenshot file under the trajectory dir. */ - screenshotPath?: string; - /** v1 — viewport scroll context. Lets the verifier reason about "did the agent see the full page". */ - scroll?: { top: number; pageHeight: number }; - /** v1 — accessibility tree snapshot. 
*/ - ariaTree?: string; - /** v2 — verifier-requested probes, keyed by criterion id. */ - onDemand?: Record; -} - -/** Outcome of a single tool execution as seen by the harness. */ -export interface ToolOutput { - ok: boolean; - /** - * The tool's return value. Same payload that flowed into agentEvidence - * modalities, but in its native shape (e.g., the extract result, the act - * describe-string) rather than serialized for the LLM. - */ - result: unknown; - error?: string; -} - -/** One step in a trajectory: action + reasoning + evidence + outcome. */ -export interface TrajectoryStep { - index: number; - actionName: string; - actionArgs: Record; - /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */ - reasoning: string; - agentEvidence: AgentEvidence; - probeEvidence: ProbeEvidence; - toolOutput: ToolOutput; - /** ISO 8601 timestamp when the step's tool execution started. */ - startedAt: string; - /** ISO 8601 timestamp when the step's tool execution finished. */ - finishedAt: string; -} - -/** Terminal status of the agent run. */ -export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; - -/** - * Full trajectory for one task run. - * - * The on-disk layout is one directory per task: - * - * .trajectories/// - * ├── task_data.json — TaskSpec + Verdict (filled on completion) - * ├── trajectory.json — this object, with screenshotPath instead of bytes - * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. 
- * ├── scores/ - * │ └── mmrubric_v1.json — Verdict from V3Evaluator.verify() - * ├── core.log — captured action log - * └── times.json — step timing + token usage - */ -export interface Trajectory { - task: TaskSpec; - steps: TrajectoryStep[]; - finalAnswer?: string; - status: TrajectoryStatus; - usage: TrajectoryUsage; - timing: { startedAt: string; endedAt: string }; -} - // ───────────────────────────────────────────────────────────────────────────── // On-disk loader // ───────────────────────────────────────────────────────────────────────────── diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts new file mode 100644 index 000000000..fb0901f60 --- /dev/null +++ b/packages/core/lib/v3/verifier/types.ts @@ -0,0 +1,339 @@ +/** + * Shared verifier types for trajectories, rubrics, evidence, and verdicts. + * + * The verifier consumes saved trajectories instead of a live browser. DOM and + * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve + * screenshots sent to the provider plus independent harness probes. + */ + +/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */ +export interface TrajectoryUsage { + input_tokens: number; + output_tokens: number; + reasoning_tokens?: number; + cached_input_tokens?: number; + inference_time_ms?: number; +} + +/** + * A single criterion in a Stagehand rubric. Dataset and model wire formats may + * use serialized `max_points` / `earned_points`; normalize those with + * `normalizeRubric()` at the boundary. + */ +export interface RubricCriterion { + /** Short name of the criterion (e.g., "Add ground beef to cart"). */ + criterion: string; + /** What to evaluate and how to award partial credit. */ + description: string; + /** Maximum points for this criterion. */ + maxPoints: number; + /** + * Triggering condition for conditional criteria. Only counted when met + * (paper's "Mutually Exclusive Conditionals" pattern). 
+ */ + condition?: string; + /** Filled by the verifier during scoring; empty in precomputed rubrics. */ + justification?: string; + /** + * Filled by the verifier during scoring; empty string in some serialized + * upstream rubrics and a number in scored rubrics. + */ + earnedPoints?: number | string; +} + +/** A rubric — list of criteria for a task. */ +export interface Rubric { + items: RubricCriterion[]; +} + +/** + * Serialized rubric item shape as stored in datasets and prompt responses. + * Keep this at IO boundaries; core verifier types use camelCase. + */ +export interface SerializedRubricCriterion { + criterion: string; + description: string; + max_points: number; + condition?: string; + justification?: string; + earned_points?: number | string; +} + +/** Serialized rubric shape used by upstream datasets and generated JSON. */ +export interface SerializedRubric { + items: SerializedRubricCriterion[]; +} + +export type RubricInput = Rubric | SerializedRubric; + +/** + * Spec for a single task being verified. Carried both at runtime and into the + * verifier alongside the trajectory. + */ +export interface TaskSpec { + /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */ + id: string; + /** Task instruction shown to the agent. */ + instruction: string; + /** Starting URL, if any. */ + initUrl?: string; + /** Rubric carried by the dataset or generated by a verifier backend. */ + precomputedRubric?: Rubric; + /** Optional reference answer (set when dataset ships one). */ + expectedAnswer?: string; +} + +/** + * A single modality unit in tier-1 agent evidence. Mirrors the shape of + * ModelMessage content parts so we can reproduce what the LLM ingested. 
+ */ +export type AgentEvidenceModality = + | { type: "text"; content: string } + | { type: "image"; bytes: Buffer; mediaType: string } + | { type: "json"; content: unknown }; + +/** + * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the + * tool result for this step. + * + * Modes: + * - CUA: usually a single image modality (the screenshot sent to the provider). + * - Hybrid: tool result with optional screenshotBase64 → one image + one text. + * - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities. + */ +export interface AgentEvidence { + modalities: AgentEvidenceModality[]; +} + +/** + * Tier 2 — independent harness probes around this step. + * + * If a probe wasn't captured, the field is absent (not null). + */ +export interface ProbeEvidence { + /** URL after the step's tool execution. */ + url?: string; + /** + * Bus screenshot captured after the step. Path on disk is preferred once + * persisted; in-memory Buffer is used during a live run. + */ + screenshot?: Buffer; + /** Reference to the persisted screenshot file under the trajectory dir. */ + screenshotPath?: string; + /** Viewport scroll context. Lets the verifier reason about whether the agent saw the full page. */ + scroll?: { top: number; pageHeight: number }; + /** Accessibility tree snapshot. */ + ariaTree?: string; + /** Verifier-requested probes, keyed by criterion id. */ + onDemand?: Record; +} + +/** Outcome of a single tool execution as seen by the harness. */ +export interface ToolOutput { + ok: boolean; + /** + * The tool's return value. Same payload that flowed into agentEvidence + * modalities, but in its native shape (e.g., the extract result, the act + * describe-string) rather than serialized for the LLM. + */ + result: unknown; + error?: string; +} + +/** One step in a trajectory: action + reasoning + evidence + outcome. 
*/ +export interface TrajectoryStep { + index: number; + actionName: string; + actionArgs: Record; + /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */ + reasoning: string; + agentEvidence: AgentEvidence; + probeEvidence: ProbeEvidence; + toolOutput: ToolOutput; + /** ISO 8601 timestamp when the step's tool execution started. */ + startedAt: string; + /** ISO 8601 timestamp when the step's tool execution finished. */ + finishedAt: string; +} + +/** Terminal status of the agent run. */ +export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; + +/** + * Full trajectory for one task run. + * + * The on-disk layout is one directory per task: + * + * .trajectories/// + * ├── task_data.json — TaskSpec + Verdict (filled on completion) + * ├── trajectory.json — this object, with screenshotPath instead of bytes + * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. + * ├── scores/ + * │ └── mmrubric_v1.json — Verdict from V3Evaluator.verify() + * ├── core.log — captured action log + * └── times.json — step timing + token usage + */ +export interface Trajectory { + task: TaskSpec; + steps: TrajectoryStep[]; + finalAnswer?: string; + status: TrajectoryStatus; + usage: TrajectoryUsage; + timing: { startedAt: string; endedAt: string }; +} + +/** Score for a single rubric criterion after evidence analysis + rescoring. */ +export interface CriterionScore { + /** Matches RubricCriterion.criterion (the criterion's short name). */ + criterion: string; + /** Maximum possible points for this criterion. */ + maxPoints: number; + /** + * Points earned post-evidence-analysis (paper's post_image_earned_points). + * Null if the criterion was conditional and its condition wasn't met (excluded + * from both numerator and denominator in the process score). + */ + earnedPoints: number | null; + /** Verifier's free-text justification for the score. 
*/ + justification: string; + /** + * True if the criterion is conditional and its condition was determined to + * be met. Absent for non-conditional criteria. + */ + conditionMet?: boolean; + /** + * Set when the verifier had no evidence to ground this criterion in either + * tier. Per paper §2, treated as uncontrollable failure → full credit, but + * surfaced here so dashboards can flag low-confidence verdicts. + */ + evidenceInsufficient?: boolean; +} + +/** + * First-point-of-failure analysis (paper Step 9a). Identifies the earliest + * step where the agent's trajectory went off-track, using a structured error + * taxonomy (7 top-level categories, 1.1–7.4 sub-codes). + */ +export interface FirstPointOfFailure { + stepIndex: number; + /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */ + errorCode: string; + /** Top-level category name (Selection, Hallucination, etc.). */ + category: string; + /** Verifier's reasoning for selecting this point. */ + description?: string; +} + +/** + * Structured observation surfaced by the verifier that another agent or + * tooling could act on. Findings are emitted opportunistically by Step 8 + * (outcome verification) when the verifier notices actionable patterns — + * repeated tool-call failures, ambiguous task specs, evidence gaps, etc. + * + * Not produced for every task: when nothing actionable surfaces, the + * `findings` array on the Verdict is empty. Consumers should treat the + * field as advisory, not as part of the formal score. + */ +export interface VerifierFinding { + /** + * Category of the observation. Open-ended enum — additional categories may + * be added as verifier backends surface new failure modes. 
+ */ + category: + | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries) + | "agent_strategy" // higher-level planning / decision-making problems + | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory + | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps) + | "task_specification" // task instruction was ambiguous / under- or over-specified + | "verifier_uncertainty" // verifier itself couldn't confidently decide + | "other"; + /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */ + severity: "info" | "warning" | "blocking"; + /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */ + description: string; + /** + * Optional concrete next action another agent could take. Should be + * specific enough that it can be acted on without further reasoning — + * e.g., "Try double_click instead of triple_click to clear placeholder + * text on this form field." + */ + suggestedAction?: string; + /** Step indices in the trajectory where this pattern showed up. */ + relatedSteps?: number[]; +} + +/** Stable debugging summary emitted by verifier backends. */ +export interface VerifierRawSteps { + backend?: "legacy" | "verifier"; + primaryIntent?: string; + reasoning?: string; + rubricSource?: "precomputed" | "generated" | "none"; + approach?: "a" | "b"; + optionalsMode?: "folded" | "separate" | "skip"; + totalEarned?: number; + totalMax?: number; + evidenceImages?: number; + evidenceTexts?: number; + evidenceOriginalScreenshots?: number; + legacyEvaluation?: string; + screenshotCount?: number; +} + +/** Task-validity classification (paper Step 10). */ +export interface TaskValidity { + /** True if the task is underspecified / has multiple valid interpretations. */ + isAmbiguous: boolean; + /** True if the task is impossible / illegal / NSFW / otherwise infeasible. 
*/ + isInvalid: boolean; + /** Optional sub-codes from the task-classification taxonomy. */ + ambiguityCodes?: string[]; + invalidTaskCodes?: string[]; +} + +/** + * The verifier's output. Process score + outcome verdict + diagnostic signals. + * + * Process and outcome are deliberately independent (paper §2): an agent can + * follow the right steps but get blocked (high process, low outcome), or + * succeed through an unexpected path (variable process, high outcome). + */ +export interface Verdict { + /** Step 8 — did the agent accomplish the task from the user's perspective? */ + outcomeSuccess: boolean; + /** Aggregated earned/max across applicable criteria, in [0, 1]. */ + processScore: number; + /** Per-criterion breakdown after rescoring. */ + perCriterion: CriterionScore[]; + /** Step 9a — first step where the trajectory went off-track, if any. */ + firstPointOfFailure?: FirstPointOfFailure; + /** Step 10 — task-itself ambiguity / validity. */ + taskValidity: TaskValidity; + /** + * Ids (RubricCriterion.criterion strings) of criteria where neither tier of + * evidence resolved the question. Treated as uncontrollable → full credit, + * but flagged here so consumers can decide whether to discount the score. + */ + evidenceInsufficient: string[]; + /** + * Structured observations from the verifier that a downstream tool or + * follow-up agent could act on. Opportunistic — empty when the verifier + * doesn't notice anything actionable. Not part of the score; advisory. + */ + findings?: VerifierFinding[]; + /** Debugging summary from the active evaluator backend. */ + rawSteps?: VerifierRawSteps; +} + +/** Reason a stub verifier can emit. */ +export type StubVerdictReason = + | "wave-0-stub" + | "no-rubric" + | "empty-trajectory"; + +/** + * Verifier interface. Implementations consume a Trajectory + TaskSpec and + * return a Verdict — they MUST NOT touch a live browser. 
+ */ +export interface Verifier { + verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; +} diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts index 3ae764b2a..57167f386 100644 --- a/packages/core/lib/v3/verifier/verifier.ts +++ b/packages/core/lib/v3/verifier/verifier.ts @@ -1,167 +1,10 @@ -/** - * Verifier — interface and result types for the rubric-based verifier that - * replaces V3Evaluator's single-pass YES/NO judge. - * - * Modeled on rubric-based verifier pipelines for computer-use agents. The - * verifier never touches a live browser — it consumes a Trajectory + TaskSpec - * and returns a structured Verdict. That property is what lets us re-score - * saved trajectories offline. - */ - -import type { Trajectory, TaskSpec } from "./trajectory.js"; - -/** Score for a single rubric criterion after evidence analysis + rescoring. */ -export interface CriterionScore { - /** Matches RubricCriterion.criterion (the criterion's short name). */ - criterion: string; - /** Maximum possible points for this criterion. */ - maxPoints: number; - /** - * Points earned post-evidence-analysis (paper's post_image_earned_points). - * Null if the criterion was conditional and its condition wasn't met (excluded - * from both numerator and denominator in the process score). - */ - earnedPoints: number | null; - /** Verifier's free-text justification for the score. */ - justification: string; - /** - * True if the criterion is conditional and its condition was determined to - * be met. Absent for non-conditional criteria. - */ - conditionMet?: boolean; - /** - * Set when the verifier had no evidence to ground this criterion in either - * tier. Per paper §2, treated as uncontrollable failure → full credit, but - * surfaced here so dashboards can flag low-confidence verdicts. - */ - evidenceInsufficient?: boolean; -} - -/** - * First-point-of-failure analysis (paper Step 9a). 
Identifies the earliest - * step where the agent's trajectory went off-track, using a structured error - * taxonomy (7 top-level categories, 1.1–7.4 sub-codes). - */ -export interface FirstPointOfFailure { - stepIndex: number; - /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */ - errorCode: string; - /** Top-level category name (Selection, Hallucination, etc.). */ - category: string; - /** Verifier's reasoning for selecting this point. */ - description?: string; -} - -/** - * Structured observation surfaced by the verifier that another agent or - * tooling could act on. Findings are emitted opportunistically by Step 8 - * (outcome verification) when the verifier notices actionable patterns — - * repeated tool-call failures, ambiguous task specs, evidence gaps, etc. - * - * Not produced for every task: when nothing actionable surfaces, the - * `findings` array on the Verdict is empty. Consumers should treat the - * field as advisory, not as part of the formal score. - */ -export interface VerifierFinding { - /** - * Category of the observation. Open-ended enum — additional categories may - * be added as verifier backends surface new failure modes. - */ - category: - | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries) - | "agent_strategy" // higher-level planning / decision-making problems - | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory - | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps) - | "task_specification" // task instruction was ambiguous / under- or over-specified - | "verifier_uncertainty" // verifier itself couldn't confidently decide - | "other"; - /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */ - severity: "info" | "warning" | "blocking"; - /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. 
*/ - description: string; - /** - * Optional concrete next action another agent could take. Should be - * specific enough that it can be acted on without further reasoning — - * e.g., "Try double_click instead of triple_click to clear placeholder - * text on this form field." - */ - suggestedAction?: string; - /** Step indices in the trajectory where this pattern showed up. */ - relatedSteps?: number[]; -} - -/** Stable debugging summary emitted by verifier backends. */ -export interface VerifierRawSteps { - backend?: "legacy" | "verifier"; - primaryIntent?: string; - reasoning?: string; - rubricSource?: "precomputed" | "generated" | "none"; - approach?: "a" | "b"; - optionalsMode?: "folded" | "separate" | "skip"; - totalEarned?: number; - totalMax?: number; - evidenceImages?: number; - evidenceTexts?: number; - evidenceOriginalScreenshots?: number; - legacyEvaluation?: string; - screenshotCount?: number; -} - -/** Task-validity classification (paper Step 10). */ -export interface TaskValidity { - /** True if the task is underspecified / has multiple valid interpretations. */ - isAmbiguous: boolean; - /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */ - isInvalid: boolean; - /** Optional sub-codes from the task-classification taxonomy. */ - ambiguityCodes?: string[]; - invalidTaskCodes?: string[]; -} - -/** - * The verifier's output. Process score + outcome verdict + diagnostic signals. - * - * Process and outcome are deliberately independent (paper §2): an agent can - * follow the right steps but get blocked (high process, low outcome), or - * succeed through an unexpected path (variable process, high outcome). - */ -export interface Verdict { - /** Step 8 — did the agent accomplish the task from the user's perspective? */ - outcomeSuccess: boolean; - /** Aggregated earned/max across applicable criteria, in [0, 1]. */ - processScore: number; - /** Per-criterion breakdown after rescoring. 
*/ - perCriterion: CriterionScore[]; - /** Step 9a — first step where the trajectory went off-track, if any. */ - firstPointOfFailure?: FirstPointOfFailure; - /** Step 10 — task-itself ambiguity / validity. */ - taskValidity: TaskValidity; - /** - * Ids (RubricCriterion.criterion strings) of criteria where neither tier of - * evidence resolved the question. Treated as uncontrollable → full credit, - * but flagged here so consumers can decide whether to discount the score. - */ - evidenceInsufficient: string[]; - /** - * Structured observations from the verifier that a downstream tool or - * follow-up agent could act on. Opportunistic — empty when the verifier - * doesn't notice anything actionable. Not part of the score; advisory. - */ - findings?: VerifierFinding[]; - /** Debugging summary from the active evaluator backend. */ - rawSteps?: VerifierRawSteps; -} - -/** Reason a stub verifier emits when the rubric pipeline hasn't shipped yet. */ -export type StubVerdictReason = - | "wave-0-stub" - | "no-rubric" - | "empty-trajectory"; - -/** - * Verifier interface. Implementations consume a Trajectory + TaskSpec and - * return a Verdict — they MUST NOT touch a live browser. 
- */ -export interface Verifier { - verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; -} +export type { + CriterionScore, + FirstPointOfFailure, + StubVerdictReason, + TaskValidity, + Verdict, + Verifier, + VerifierFinding, + VerifierRawSteps, +} from "./types.js"; From d68ada609bce75b78e8b5282e2c3bf3cda159a7b Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:29:32 -0700 Subject: [PATCH 07/14] refactor(verifier): remove rollout stub reason --- packages/core/lib/v3/verifier/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index fb0901f60..71f2d26af 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -326,7 +326,7 @@ export interface Verdict { /** Reason a stub verifier can emit. */ export type StubVerdictReason = - | "wave-0-stub" + | "stub-verifier" | "no-rubric" | "empty-trajectory"; From 41708e182348f000daf53997a3b195388704c4b4 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 16:20:20 -0700 Subject: [PATCH 08/14] refactor(verifier): remove proxy type barrels --- packages/core/lib/v3/verifier/trajectory.ts | 16 ---------------- packages/core/lib/v3/verifier/verifier.ts | 10 ---------- 2 files changed, 26 deletions(-) delete mode 100644 packages/core/lib/v3/verifier/verifier.ts diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 3dbb7e5a6..e0ca79401 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -1,28 +1,12 @@ import type { - ProbeEvidence, - Rubric, - RubricCriterion, - RubricInput, - SerializedRubricCriterion, - Trajectory, - TrajectoryStep, -} from "./types.js"; - -export type { - AgentEvidence, AgentEvidenceModality, ProbeEvidence, Rubric, RubricCriterion, RubricInput, - SerializedRubric, SerializedRubricCriterion, - TaskSpec, - ToolOutput, Trajectory, - 
TrajectoryStatus, TrajectoryStep, - TrajectoryUsage, } from "./types.js"; /** Convert a Stagehand or serialized rubric into the public Stagehand shape. */ diff --git a/packages/core/lib/v3/verifier/verifier.ts b/packages/core/lib/v3/verifier/verifier.ts deleted file mode 100644 index 57167f386..000000000 --- a/packages/core/lib/v3/verifier/verifier.ts +++ /dev/null @@ -1,10 +0,0 @@ -export type { - CriterionScore, - FirstPointOfFailure, - StubVerdictReason, - TaskValidity, - Verdict, - Verifier, - VerifierFinding, - VerifierRawSteps, -} from "./types.js"; From 356f48172eeb3ac30a129565d33ca5529fb6407c Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 16:21:01 -0700 Subject: [PATCH 09/14] fix(verifier): keep rubric earned points numeric --- packages/core/lib/v3/verifier/trajectory.ts | 24 ++++++++++++++++++++- packages/core/lib/v3/verifier/types.ts | 7 ++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index e0ca79401..40cd27dff 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -27,7 +27,10 @@ export function normalizeRubric( ); } - const earnedPoints = raw.earnedPoints ?? raw.earned_points; + const earnedPoints = normalizeEarnedPoints( + raw.earnedPoints ?? 
raw.earned_points, + raw.criterion, + ); return { criterion: raw.criterion, description: raw.description, @@ -42,6 +45,25 @@ export function normalizeRubric( }; } +function normalizeEarnedPoints( + value: number | string | undefined, + criterion: string, +): number | undefined { + if (value === undefined) return undefined; + if (typeof value === "number") { + if (Number.isFinite(value)) return value; + } else { + const trimmed = value.trim(); + if (trimmed === "") return undefined; + const parsed = Number(trimmed); + if (Number.isFinite(parsed)) return parsed; + } + + throw new TypeError( + `Rubric criterion "${criterion}" has a non-numeric earnedPoints value`, + ); +} + // ───────────────────────────────────────────────────────────────────────────── // On-disk loader // ───────────────────────────────────────────────────────────────────────────── diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index 71f2d26af..b3f1011f0 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -34,11 +34,8 @@ export interface RubricCriterion { condition?: string; /** Filled by the verifier during scoring; empty in precomputed rubrics. */ justification?: string; - /** - * Filled by the verifier during scoring; empty string in some serialized - * upstream rubrics and a number in scored rubrics. - */ - earnedPoints?: number | string; + /** Filled by the verifier during scoring; omitted in precomputed rubrics. */ + earnedPoints?: number; } /** A rubric — list of criteria for a task. 
*/ From b8d195f22552c61910c60aec957105843a944ee3 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 16:21:58 -0700 Subject: [PATCH 10/14] fix(verifier): constrain trajectory screenshot paths --- packages/core/lib/v3/verifier/trajectory.ts | 22 ++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 40cd27dff..6ce491115 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -84,6 +84,7 @@ function normalizeEarnedPoints( export async function loadTrajectoryFromDisk(dir: string): Promise { const fs = await import("node:fs/promises"); const path = await import("node:path"); + const trajectoryDir = path.resolve(dir); const trajectoryPath = path.join(dir, "trajectory.json"); const raw = await fs.readFile(trajectoryPath, "utf8"); @@ -109,13 +110,28 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { >; }; + const resolveWithinTrajectoryDir = (candidate: string): string => { + const resolved = path.resolve(trajectoryDir, candidate); + const relative = path.relative(trajectoryDir, resolved); + const outside = + relative === ".." || + relative.startsWith(`..${path.sep}`) || + path.isAbsolute(relative); + + if (outside) { + throw new Error( + `Trajectory screenshotPath escapes trajectory directory: ${candidate}`, + ); + } + + return resolved; + }; + for (const step of parsed.steps) { // Rehydrate tier-2 probe screenshot from its on-disk file reference. const probe = step.probeEvidence; if (probe?.screenshotPath && !probe.screenshot) { - const resolved = path.isAbsolute(probe.screenshotPath) - ? 
probe.screenshotPath - : path.join(dir, probe.screenshotPath); + const resolved = resolveWithinTrajectoryDir(probe.screenshotPath); try { probe.screenshot = await fs.readFile(resolved); } catch { From 4fc1400ce90b39a8dab131b378c6b7f450eb2af6 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 16:25:11 -0700 Subject: [PATCH 11/14] test(verifier): cover trajectory normalization boundaries --- .../tests/unit/verifier-trajectory.test.ts | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 packages/core/tests/unit/verifier-trajectory.test.ts diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts new file mode 100644 index 000000000..0ed4fc596 --- /dev/null +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -0,0 +1,68 @@ +import { mkdtemp, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import { + loadTrajectoryFromDisk, + normalizeRubric, +} from "../../lib/v3/verifier/trajectory.js"; + +describe("verifier trajectory utilities", () => { + it("normalizes serialized empty earned points out of public rubrics", () => { + expect( + normalizeRubric({ + items: [ + { + criterion: "Criterion", + description: "Description", + max_points: 1, + earned_points: "", + }, + ], + }), + ).toEqual({ + items: [ + { + criterion: "Criterion", + description: "Description", + maxPoints: 1, + }, + ], + }); + }); + + it("rejects screenshot paths outside the trajectory directory", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + await writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify({ + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { input_tokens: 0, output_tokens: 0 }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + steps: [ + { + index: 0, + 
actionName: "act", + actionArgs: {}, + reasoning: "", + agentEvidence: { modalities: [] }, + probeEvidence: { screenshotPath: "../../../etc/passwd" }, + toolOutput: { ok: true, result: null }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + }), + ); + + await expect(loadTrajectoryFromDisk(dir)).rejects.toThrow( + "escapes trajectory directory", + ); + }); +}); From d87b5e72dc0746599e05f88fdd4df1e4ee97b343 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 21:28:45 -0700 Subject: [PATCH 12/14] test(verifier): cover evaluator facade helpers --- packages/core/lib/v3/verifier/index.ts | 1 - packages/core/lib/v3/verifier/trajectory.ts | 29 +-- packages/core/lib/v3Evaluator.ts | 5 +- .../tests/unit/public-api/v3-core.test.ts | 57 ----- packages/core/tests/unit/v3-evaluator.test.ts | 201 ++++++++++++++++++ .../tests/unit/verifier-trajectory.test.ts | 84 ++++++++ 6 files changed, 305 insertions(+), 72 deletions(-) create mode 100644 packages/core/tests/unit/v3-evaluator.test.ts diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 1b76eb388..ce62b4d0a 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -12,7 +12,6 @@ export type { RubricInput, SerializedRubric, SerializedRubricCriterion, - StubVerdictReason, TaskSpec, TaskValidity, ToolOutput, diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index 6ce491115..f5e300403 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -17,27 +17,26 @@ export function normalizeRubric( return { items: rubric.items.map((item) => { - const raw = item as RubricCriterion & Partial; - const maxPoints = - typeof raw.maxPoints === "number" ? raw.maxPoints : raw.max_points; + const serialized = isSerializedRubricCriterion(item); + const maxPoints = serialized ? 
item.max_points : item.maxPoints; if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) { throw new TypeError( - `Rubric criterion "${raw.criterion}" is missing a numeric maxPoints value`, + `Rubric criterion "${item.criterion}" is missing a numeric maxPoints value`, ); } const earnedPoints = normalizeEarnedPoints( - raw.earnedPoints ?? raw.earned_points, - raw.criterion, + serialized ? item.earned_points : item.earnedPoints, + item.criterion, ); return { - criterion: raw.criterion, - description: raw.description, + criterion: item.criterion, + description: item.description, maxPoints, - ...(raw.condition !== undefined && { condition: raw.condition }), - ...(raw.justification !== undefined && { - justification: raw.justification, + ...(item.condition !== undefined && { condition: item.condition }), + ...(item.justification !== undefined && { + justification: item.justification, }), ...(earnedPoints !== undefined && { earnedPoints }), }; @@ -45,6 +44,12 @@ export function normalizeRubric( }; } +function isSerializedRubricCriterion( + item: RubricCriterion | SerializedRubricCriterion, +): item is SerializedRubricCriterion { + return "max_points" in item; +} + function normalizeEarnedPoints( value: number | string | undefined, criterion: string, @@ -86,7 +91,7 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { const path = await import("node:path"); const trajectoryDir = path.resolve(dir); - const trajectoryPath = path.join(dir, "trajectory.json"); + const trajectoryPath = path.join(trajectoryDir, "trajectory.json"); const raw = await fs.readFile(trajectoryPath, "utf8"); const parsed = JSON.parse(raw) as Trajectory & { steps: Array< diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index 8adc7d5f1..a25c9cf44 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -271,11 +271,12 @@ function renderLegacyAgentReasoning( function stringifyForPrompt(value: unknown): string { if 
(typeof value === "string") { - return truncateForPrompt(value, 2000); + return value; } try { - return truncateForPrompt(JSON.stringify(value), 2000); + const serialized = JSON.stringify(value); + return serialized ?? String(value); } catch { return String(value); } diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts index d1975a48c..5e767ae57 100644 --- a/packages/core/tests/unit/public-api/v3-core.test.ts +++ b/packages/core/tests/unit/public-api/v3-core.test.ts @@ -155,63 +155,6 @@ describe("V3 Core public API types", () => { } satisfies Stagehand.V3EvaluatorConstructorOptions, ); }); - - it("rejects verifier backend before the verifier PR is installed", async () => { - const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, { - backend: "verifier", - }); - - await expect( - evaluator.ask({ question: "Was the task completed?" }), - ).rejects.toThrow( - "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", - ); - }); - - it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => { - const taskSpec: Stagehand.TaskSpec = { - id: "empty", - instruction: "Complete the task", - }; - const trajectory: Stagehand.Trajectory = { - task: taskSpec, - steps: [], - status: "complete", - usage: { - input_tokens: 0, - output_tokens: 0, - }, - timing: { - startedAt: new Date(0).toISOString(), - endedAt: new Date(0).toISOString(), - }, - }; - const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, { - backend: "legacy", - }); - - const verdict = await evaluator.verify(trajectory, taskSpec); - - expect(verdict.outcomeSuccess).toBe(false); - expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]); - }); - - it("rejects invalid evaluator backend env values", () => { - const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; - process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; - - try { - expect( 
- () => new Stagehand.V3Evaluator({} as Stagehand.Stagehand), - ).toThrow('Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"'); - } finally { - if (previousBackend === undefined) { - delete process.env.STAGEHAND_EVALUATOR_BACKEND; - } else { - process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; - } - } - }); }); describe("V3FunctionName", () => { diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts new file mode 100644 index 000000000..c755c86b7 --- /dev/null +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -0,0 +1,201 @@ +import { describe, expect, it, vi } from "vitest"; + +import { V3Evaluator } from "../../lib/v3Evaluator.js"; +import type { V3 } from "../../lib/v3/v3.js"; +import type { TaskSpec, Trajectory } from "../../lib/v3/verifier/index.js"; + +describe("V3Evaluator verifier facade", () => { + it("rejects verifier backend before the verifier PR is installed", async () => { + const evaluator = new V3Evaluator({} as V3, { + backend: "verifier", + }); + + await expect( + evaluator.ask({ question: "Was the task completed?" 
}), + ).rejects.toThrow( + "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", + ); + }); + + it("rejects verify when the verifier backend is selected before the verifier PR is installed", async () => { + const taskSpec: TaskSpec = { + id: "verifier-unavailable", + instruction: "Complete the task", + }; + const evaluator = new V3Evaluator({} as V3, { + backend: "verifier", + }); + + await expect( + evaluator.verify(makeTrajectory(taskSpec), taskSpec), + ).rejects.toThrow( + "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", + ); + }); + + it("rejects generateRubric when the verifier backend is selected before the verifier PR is installed", async () => { + const evaluator = new V3Evaluator({} as V3, { + backend: "verifier", + }); + + await expect( + evaluator.generateRubric({ + id: "rubric-unavailable", + instruction: "Complete the task", + }), + ).rejects.toThrow( + "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", + ); + }); + + it("maps legacy YES evaluations with trajectory screenshots to a successful verdict", async () => { + const taskSpec: TaskSpec = { + id: "success", + instruction: "Complete the task", + }; + const screenshot = Buffer.from("screenshot"); + const trajectory = makeTrajectory(taskSpec, { + screenshot, + finalAnswer: "The task is complete.", + }); + const ask = vi.fn().mockResolvedValue({ + evaluation: "YES", + reasoning: "The screenshot shows completion.", + }); + const evaluator = new V3Evaluator({} as V3, { + backend: "legacy", + }); + Object.defineProperty(evaluator, "legacyEvaluator", { + value: { ask }, + }); + + const verdict = await evaluator.verify(trajectory, taskSpec); + + expect(ask).toHaveBeenCalledWith( + expect.objectContaining({ + question: taskSpec.instruction, + screenshot: [screenshot], + answer: "The task is complete.", + }), + ); + expect(verdict.outcomeSuccess).toBe(true); + expect(verdict.processScore).toBe(1); + 
expect(verdict.perCriterion[0]).toMatchObject({ + criterion: "legacy-task-completion", + earnedPoints: 1, + evidenceInsufficient: false, + }); + }); + + it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => { + const taskSpec: TaskSpec = { + id: "reasoning-budget", + instruction: "Complete the task", + }; + const longToolOutput = "x".repeat(3000); + const ask = vi.fn().mockResolvedValue({ + evaluation: "YES", + reasoning: "The trajectory shows completion.", + }); + const evaluator = new V3Evaluator({} as V3, { + backend: "legacy", + }); + Object.defineProperty(evaluator, "legacyEvaluator", { + value: { ask }, + }); + + await evaluator.verify( + makeTrajectory(taskSpec, { + finalAnswer: "The task is complete.", + toolResult: longToolOutput, + }), + taskSpec, + ); + + const firstCall = ask.mock.calls[0]?.[0]; + expect(firstCall?.agentReasoning).toContain(longToolOutput); + }); + + it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => { + const taskSpec: TaskSpec = { + id: "empty", + instruction: "Complete the task", + }; + const evaluator = new V3Evaluator({} as V3, { + backend: "legacy", + }); + + const verdict = await evaluator.verify( + makeEmptyTrajectory(taskSpec), + taskSpec, + ); + + expect(verdict.outcomeSuccess).toBe(false); + expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]); + }); + + it("rejects invalid evaluator backend env values", () => { + const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; + process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; + + try { + expect(() => new V3Evaluator({} as V3)).toThrow( + 'Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"', + ); + } finally { + if (previousBackend === undefined) { + delete process.env.STAGEHAND_EVALUATOR_BACKEND; + } else { + process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; + } + } + }); +}); + +function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory { + return { + 
task: taskSpec, + steps: [], + status: "complete", + usage: { + input_tokens: 0, + output_tokens: 0, + }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + }; +} + +function makeTrajectory( + taskSpec: TaskSpec, + options: { + screenshot?: Buffer; + finalAnswer?: string; + toolResult?: unknown; + } = {}, +): Trajectory { + return { + ...makeEmptyTrajectory(taskSpec), + steps: [ + { + index: 0, + actionName: "act", + actionArgs: {}, + reasoning: "I completed the task.", + agentEvidence: { modalities: [] }, + probeEvidence: options.screenshot + ? { screenshot: options.screenshot } + : {}, + toolOutput: { + ok: true, + result: options.toolResult ?? "done", + }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + finalAnswer: options.finalAnswer, + }; +} diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index 0ed4fc596..51f9c0b8b 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -6,6 +6,7 @@ import { describe, expect, it } from "vitest"; import { loadTrajectoryFromDisk, + nextVerdictFilename, normalizeRubric, } from "../../lib/v3/verifier/trajectory.js"; @@ -33,6 +34,83 @@ describe("verifier trajectory utilities", () => { }); }); + it("round-trips serialized snake_case rubrics to public camelCase rubrics", () => { + expect( + normalizeRubric({ + items: [ + { + criterion: "Criterion", + description: "Description", + max_points: 3, + earned_points: "2", + condition: "Only if relevant", + justification: "Partial credit.", + }, + ], + }), + ).toEqual({ + items: [ + { + criterion: "Criterion", + description: "Description", + maxPoints: 3, + earnedPoints: 2, + condition: "Only if relevant", + justification: "Partial credit.", + }, + ], + }); + }); + + it("loads trajectory screenshots and image modalities from disk", async () => { + const dir 
= await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const screenshot = Buffer.from("probe screenshot"); + const agentImage = Buffer.from("agent image"); + await writeFile(path.join(dir, "screenshot_1.png"), screenshot); + await writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify({ + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { input_tokens: 0, output_tokens: 0 }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + steps: [ + { + index: 0, + actionName: "act", + actionArgs: {}, + reasoning: "", + agentEvidence: { + modalities: [ + { + type: "image", + mediaType: "image/png", + bytesBase64: agentImage.toString("base64"), + }, + ], + }, + probeEvidence: { screenshotPath: "screenshot_1.png" }, + toolOutput: { ok: true, result: null }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + }), + ); + + const trajectory = await loadTrajectoryFromDisk(dir); + const modality = trajectory.steps[0].agentEvidence.modalities[0]; + + expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot); + expect(modality.type).toBe("image"); + if (modality.type === "image") { + expect(modality.bytes).toEqual(agentImage); + } + }); + it("rejects screenshot paths outside the trajectory directory", async () => { const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); await writeFile( @@ -65,4 +143,10 @@ describe("verifier trajectory utilities", () => { "escapes trajectory directory", ); }); + + it("sanitizes verdict filename labels", () => { + expect(nextVerdictFilename("rescore / task:one?")).toBe( + "mmrubric_rescore___task_one_.json", + ); + }); }); From 18265cac01d284be1b0bbab03970bef4dfe23475 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:07:10 -0700 Subject: [PATCH 13/14] fix(verifier): clean public result API --- packages/core/lib/v3/index.ts | 15 ++- packages/core/lib/v3/verifier/index.ts | 7 +- 
packages/core/lib/v3/verifier/trajectory.ts | 111 +++++++++--------- packages/core/lib/v3/verifier/types.ts | 82 ++++--------- packages/core/lib/v3Evaluator.ts | 85 ++++---------- packages/core/lib/v3LegacyEvaluator.ts | 6 + .../unit/public-api/export-surface.test.ts | 3 + .../tests/unit/public-api/v3-core.test.ts | 2 +- packages/core/tests/unit/v3-evaluator.test.ts | 35 ++++-- .../tests/unit/verifier-trajectory.test.ts | 10 +- 10 files changed, 155 insertions(+), 201 deletions(-) diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 8fdcc6b75..8e21fb030 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -24,6 +24,11 @@ import { tool } from "ai"; import { getAISDKLanguageModel } from "./llm/LLMProvider.js"; import { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js"; import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js"; +import { + loadTrajectoryFromDisk, + nextResultFilename, + normalizeRubric, +} from "./verifier/index.js"; export { V3 } from "./v3.js"; export { V3 as Stagehand } from "./v3.js"; @@ -72,15 +77,12 @@ export type { TaskSpec, Rubric, RubricCriterion, - SerializedRubric, - SerializedRubricCriterion, - RubricInput, AgentEvidence, AgentEvidenceModality, ProbeEvidence, ToolOutput, Verifier, - Verdict, + EvaluationResult, CriterionScore, FirstPointOfFailure, TaskValidity, @@ -89,7 +91,7 @@ export type { } from "./verifier/index.js"; export { loadTrajectoryFromDisk, - nextVerdictFilename, + nextResultFilename, normalizeRubric, } from "./verifier/index.js"; export { tool } from "ai"; @@ -142,6 +144,9 @@ const StagehandDefault = { toJsonSchema, connectToMCPServer, V3Evaluator, + loadTrajectoryFromDisk, + nextResultFilename, + normalizeRubric, tool, getAISDKLanguageModel, __internalCreateInMemoryAgentCacheHandle, diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index ce62b4d0a..4061533ab 100644 --- 
a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -5,13 +5,11 @@ export type { AgentEvidence, AgentEvidenceModality, CriterionScore, + EvaluationResult, FirstPointOfFailure, ProbeEvidence, Rubric, RubricCriterion, - RubricInput, - SerializedRubric, - SerializedRubricCriterion, TaskSpec, TaskValidity, ToolOutput, @@ -19,13 +17,12 @@ export type { TrajectoryStatus, TrajectoryStep, TrajectoryUsage, - Verdict, Verifier, VerifierFinding, VerifierRawSteps, } from "./types.js"; export { loadTrajectoryFromDisk, - nextVerdictFilename, + nextResultFilename, normalizeRubric, } from "./trajectory.js"; diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts index f5e300403..a18f025c3 100644 --- a/packages/core/lib/v3/verifier/trajectory.ts +++ b/packages/core/lib/v3/verifier/trajectory.ts @@ -2,70 +2,81 @@ import type { AgentEvidenceModality, ProbeEvidence, Rubric, - RubricCriterion, - RubricInput, - SerializedRubricCriterion, Trajectory, TrajectoryStep, } from "./types.js"; -/** Convert a Stagehand or serialized rubric into the public Stagehand shape. */ -export function normalizeRubric( - rubric: RubricInput | null | undefined, -): Rubric | undefined { - if (!rubric) return undefined; +type RawRubricCriterion = { + criterion: unknown; + description: unknown; + max_points?: unknown; + maxPoints?: unknown; + condition?: unknown; +}; + +type RawRubric = { + items?: unknown; +}; + +/** + * Convert dataset or generated rubric JSON into the public Stagehand shape. + * Snake-case dataset fields are accepted here so serialized quirks do not leak + * into the canonical rubric type. 
+ */ +export function normalizeRubric(rubric: unknown): Rubric | undefined { + if (rubric == null) return undefined; + if (typeof rubric !== "object") { + throw new TypeError("Rubric must be an object"); + } + + const rawRubric = rubric as RawRubric; + if (!Array.isArray(rawRubric.items)) { + throw new TypeError("Rubric is missing an items array"); + } return { - items: rubric.items.map((item) => { - const serialized = isSerializedRubricCriterion(item); - const maxPoints = serialized ? item.max_points : item.maxPoints; + items: rawRubric.items.map((item) => { + const criterion = normalizeRequiredString(item.criterion, "criterion"); + const description = normalizeRequiredString( + item.description, + "description", + ); + const maxPoints = normalizeMaxPoints(item); if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) { throw new TypeError( - `Rubric criterion "${item.criterion}" is missing a numeric maxPoints value`, + `Rubric criterion "${criterion}" is missing a numeric maxPoints value`, ); } - const earnedPoints = normalizeEarnedPoints( - serialized ? 
item.earned_points : item.earnedPoints, - item.criterion, - ); return { - criterion: item.criterion, - description: item.description, + criterion, + description, maxPoints, - ...(item.condition !== undefined && { condition: item.condition }), - ...(item.justification !== undefined && { - justification: item.justification, + ...(typeof item.condition === "string" && { + condition: item.condition, }), - ...(earnedPoints !== undefined && { earnedPoints }), }; }), }; } -function isSerializedRubricCriterion( - item: RubricCriterion | SerializedRubricCriterion, -): item is SerializedRubricCriterion { - return "max_points" in item; +function normalizeRequiredString(value: unknown, fieldName: string): string { + if (typeof value === "string" && value.length) { + return value; + } + + throw new TypeError(`Rubric criterion is missing a ${fieldName} value`); } -function normalizeEarnedPoints( - value: number | string | undefined, - criterion: string, -): number | undefined { - if (value === undefined) return undefined; - if (typeof value === "number") { - if (Number.isFinite(value)) return value; - } else { - const trimmed = value.trim(); - if (trimmed === "") return undefined; - const parsed = Number(trimmed); - if (Number.isFinite(parsed)) return parsed; - } +function normalizeMaxPoints(item: RawRubricCriterion): unknown { + return item.maxPoints ?? item.max_points; +} - throw new TypeError( - `Rubric criterion "${criterion}" has a non-numeric earnedPoints value`, +function normalizeResultLabel(label?: string): string { + return (label ?? `rescore-${new Date().toISOString()}`).replace( + /[^A-Za-z0-9._-]/g, + "_", ); } @@ -167,18 +178,12 @@ export async function loadTrajectoryFromDisk(dir: string): Promise { } /** - * Locate the next available `mmrubric_*.json` filename for a given trajectory - * directory. Used by offline re-scoring to avoid overwriting prior verdicts. + * Build a `result*.json` filename for persisted evaluator output. 
* - * Convention: prefer a label-based name (e.g., `mmrubric_rescore-2026-05-11.json`) - * over numeric versioning so multiple offline rescore attempts coexist without - * collisions and remain easy to diff. Falls back to a timestamp if the caller - * doesn't provide a label. + * Convention: the live run writes `result.json`; offline re-score attempts use + * a label-based name (e.g., `result_rescore-2026-05-11.json`) so they coexist + * without collisions and remain easy to diff. */ -export function nextVerdictFilename(label?: string): string { - const safeLabel = (label ?? `rescore-${new Date().toISOString()}`).replace( - /[^A-Za-z0-9._-]/g, - "_", - ); - return `mmrubric_${safeLabel}.json`; +export function nextResultFilename(label?: string): string { + return `result_${normalizeResultLabel(label)}.json`; } diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index b3f1011f0..88b7e275e 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -1,5 +1,5 @@ /** - * Shared verifier types for trajectories, rubrics, evidence, and verdicts. + * Shared verifier types for trajectories, rubrics, evidence, and results. * * The verifier consumes saved trajectories instead of a live browser. DOM and * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve @@ -15,11 +15,7 @@ export interface TrajectoryUsage { inference_time_ms?: number; } -/** - * A single criterion in a Stagehand rubric. Dataset and model wire formats may - * use serialized `max_points` / `earned_points`; normalize those with - * `normalizeRubric()` at the boundary. - */ +/** A single criterion in a Stagehand rubric. */ export interface RubricCriterion { /** Short name of the criterion (e.g., "Add ground beef to cart"). */ criterion: string; @@ -28,14 +24,10 @@ export interface RubricCriterion { /** Maximum points for this criterion. 
*/ maxPoints: number; /** - * Triggering condition for conditional criteria. Only counted when met - * (paper's "Mutually Exclusive Conditionals" pattern). + * Applicability rule for situational criteria. When this condition is not + * met, the criterion is excluded from scoring rather than counted as failed. */ condition?: string; - /** Filled by the verifier during scoring; empty in precomputed rubrics. */ - justification?: string; - /** Filled by the verifier during scoring; omitted in precomputed rubrics. */ - earnedPoints?: number; } /** A rubric — list of criteria for a task. */ @@ -43,26 +35,6 @@ export interface Rubric { items: RubricCriterion[]; } -/** - * Serialized rubric item shape as stored in datasets and prompt responses. - * Keep this at IO boundaries; core verifier types use camelCase. - */ -export interface SerializedRubricCriterion { - criterion: string; - description: string; - max_points: number; - condition?: string; - justification?: string; - earned_points?: number | string; -} - -/** Serialized rubric shape used by upstream datasets and generated JSON. */ -export interface SerializedRubric { - items: SerializedRubricCriterion[]; -} - -export type RubricInput = Rubric | SerializedRubric; - /** * Spec for a single task being verified. Carried both at runtime and into the * verifier alongside the trajectory. @@ -162,11 +134,11 @@ export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error"; * The on-disk layout is one directory per task: * * .trajectories/// - * ├── task_data.json — TaskSpec + Verdict (filled on completion) + * ├── task_data.json — TaskSpec + result metadata * ├── trajectory.json — this object, with screenshotPath instead of bytes * ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc. 
* ├── scores/ - * │ └── mmrubric_v1.json — Verdict from V3Evaluator.verify() + * │ └── result.json — Result from V3Evaluator.verify() * ├── core.log — captured action log * └── times.json — step timing + token usage */ @@ -191,8 +163,8 @@ export interface CriterionScore { * from both numerator and denominator in the process score). */ earnedPoints: number | null; - /** Verifier's free-text justification for the score. */ - justification: string; + /** Verifier's explanation for the score. */ + explanation: string; /** * True if the criterion is conditional and its condition was determined to * be met. Absent for non-conditional criteria. @@ -201,7 +173,7 @@ export interface CriterionScore { /** * Set when the verifier had no evidence to ground this criterion in either * tier. Per paper §2, treated as uncontrollable failure → full credit, but - * surfaced here so dashboards can flag low-confidence verdicts. + * surfaced here so dashboards can flag low-confidence results. */ evidenceInsufficient?: boolean; } @@ -228,7 +200,7 @@ export interface FirstPointOfFailure { * repeated tool-call failures, ambiguous task specs, evidence gaps, etc. * * Not produced for every task: when nothing actionable surfaces, the - * `findings` array on the Verdict is empty. Consumers should treat the + * `findings` array on the EvaluationResult is empty. Consumers should treat the * field as advisory, not as part of the formal score. */ export interface VerifierFinding { @@ -288,29 +260,33 @@ export interface TaskValidity { } /** - * The verifier's output. Process score + outcome verdict + diagnostic signals. + * Evaluator output. Legacy evaluation may only populate outcome fields; richer + * verifier backends can also populate process scoring and diagnostics. * - * Process and outcome are deliberately independent (paper §2): an agent can - * follow the right steps but get blocked (high process, low outcome), or - * succeed through an unexpected path (variable process, high outcome). 
+ * Process and outcome are deliberately independent when both are present: + * an agent can follow the right steps but get blocked (high process, low + * outcome), or succeed through an unexpected path (variable process, high + * outcome). */ -export interface Verdict { - /** Step 8 — did the agent accomplish the task from the user's perspective? */ +export interface EvaluationResult { + /** Did the agent accomplish the task from the user's perspective? */ outcomeSuccess: boolean; + /** Human-readable explanation for the outcome. */ + explanation?: string; /** Aggregated earned/max across applicable criteria, in [0, 1]. */ - processScore: number; + processScore?: number; /** Per-criterion breakdown after rescoring. */ - perCriterion: CriterionScore[]; + perCriterion?: CriterionScore[]; /** Step 9a — first step where the trajectory went off-track, if any. */ firstPointOfFailure?: FirstPointOfFailure; /** Step 10 — task-itself ambiguity / validity. */ - taskValidity: TaskValidity; + taskValidity?: TaskValidity; /** * Ids (RubricCriterion.criterion strings) of criteria where neither tier of * evidence resolved the question. Treated as uncontrollable → full credit, * but flagged here so consumers can decide whether to discount the score. */ - evidenceInsufficient: string[]; + evidenceInsufficient?: string[]; /** * Structured observations from the verifier that a downstream tool or * follow-up agent could act on. Opportunistic — empty when the verifier @@ -321,16 +297,10 @@ export interface Verdict { rawSteps?: VerifierRawSteps; } -/** Reason a stub verifier can emit. */ -export type StubVerdictReason = - | "stub-verifier" - | "no-rubric" - | "empty-trajectory"; - /** * Verifier interface. Implementations consume a Trajectory + TaskSpec and - * return a Verdict — they MUST NOT touch a live browser. + * return an EvaluationResult — they MUST NOT touch a live browser. 
*/ export interface Verifier { - verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; + verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; } diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index a25c9cf44..379cf4589 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -9,7 +9,7 @@ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js"; import type { EvaluateOptions, BatchAskOptions, - EvaluationResult, + EvaluationResult as LegacyEvaluationResult, } from "./v3/types/private/evaluator.js"; import { V3 } from "./v3/v3.js"; import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; @@ -17,7 +17,7 @@ import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js"; import type { Trajectory, TaskSpec, - Verdict, + EvaluationResult, Rubric, Verifier, AgentEvidenceModality, @@ -76,15 +76,18 @@ export class V3Evaluator implements Verifier { ); } - async ask(options: EvaluateOptions): Promise { + async ask(options: EvaluateOptions): Promise { return this.getLegacyBackend("ask").ask(options); } - async batchAsk(options: BatchAskOptions): Promise { + async batchAsk(options: BatchAskOptions): Promise { return this.getLegacyBackend("batchAsk").batchAsk(options); } - async verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise { + async verify( + trajectory: Trajectory, + taskSpec: TaskSpec, + ): Promise { assertVerifierInput(trajectory, taskSpec); if (this.backend === "legacy") { @@ -127,14 +130,13 @@ export class V3Evaluator implements Verifier { private async verifyTrajectoryWithLegacyEvaluator( trajectory: Trajectory, taskSpec: TaskSpec, - ): Promise { + ): Promise { const screenshots = collectLegacyScreenshots(trajectory); const agentReasoning = renderLegacyAgentReasoning(trajectory); const answer = trajectory.finalAnswer; if (!screenshots.length && !answer) { - return legacyInsufficientEvidenceVerdict( - taskSpec, + return 
legacyInsufficientEvidenceResult( "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.", ); } @@ -146,7 +148,7 @@ export class V3Evaluator implements Verifier { agentReasoning, }); - return legacyEvaluationToVerdict(result, taskSpec, screenshots.length); + return legacyEvaluationToResult(result, screenshots.length); } } @@ -253,20 +255,14 @@ function renderLegacyAgentReasoning( .join("\n"); }); - const sections = [ - stepLines.length - ? `Agent trajectory:\n${stepLines.join("\n\n")}` - : undefined, - trajectory.finalAnswer - ? `Final answer:\n${trajectory.finalAnswer}` - : undefined, - ].filter(Boolean); - - if (!sections.length) { + if (!stepLines.length) { return undefined; } - return truncateForPrompt(sections.join("\n\n"), 16000); + return truncateForPrompt( + `Agent trajectory:\n${stepLines.join("\n\n")}`, + 16000, + ); } function stringifyForPrompt(value: unknown): string { @@ -290,14 +286,12 @@ function truncateForPrompt(value: string, maxLength: number): string { return `${value.slice(0, maxLength)}... [truncated]`; } -function legacyEvaluationToVerdict( - result: EvaluationResult, - taskSpec: TaskSpec, +function legacyEvaluationToResult( + result: LegacyEvaluationResult, screenshotCount: number, -): Verdict { +): EvaluationResult { const outcomeSuccess = result.evaluation === "YES"; const invalid = result.evaluation === "INVALID"; - const criterion = legacyTaskCompletionCriterion(taskSpec); const findings: VerifierFinding[] = invalid ? [ { @@ -310,22 +304,8 @@ function legacyEvaluationToVerdict( return { outcomeSuccess, - processScore: outcomeSuccess ? 1 : 0, - perCriterion: [ - { - criterion: criterion.criterion, - maxPoints: criterion.maxPoints, - earnedPoints: outcomeSuccess ? 1 : 0, - justification: result.reasoning, - evidenceInsufficient: invalid, - }, - ], - taskValidity: { - isAmbiguous: false, - isInvalid: false, - }, - evidenceInsufficient: invalid ? 
[criterion.criterion] : [], - findings, + explanation: result.reasoning, + ...(findings.length ? { findings } : {}), rawSteps: { backend: "legacy", legacyEvaluation: result.evaluation, @@ -334,29 +314,10 @@ function legacyEvaluationToVerdict( }; } -function legacyInsufficientEvidenceVerdict( - taskSpec: TaskSpec, - reason: string, -): Verdict { - const criterion = legacyTaskCompletionCriterion(taskSpec); - +function legacyInsufficientEvidenceResult(reason: string): EvaluationResult { return { outcomeSuccess: false, - processScore: 0, - perCriterion: [ - { - criterion: criterion.criterion, - maxPoints: criterion.maxPoints, - earnedPoints: 0, - justification: reason, - evidenceInsufficient: true, - }, - ], - taskValidity: { - isAmbiguous: false, - isInvalid: false, - }, - evidenceInsufficient: [criterion.criterion], + explanation: reason, findings: [ { category: "trajectory_capture", diff --git a/packages/core/lib/v3LegacyEvaluator.ts b/packages/core/lib/v3LegacyEvaluator.ts index 64ec89ef2..5662d25ac 100644 --- a/packages/core/lib/v3LegacyEvaluator.ts +++ b/packages/core/lib/v3LegacyEvaluator.ts @@ -74,6 +74,7 @@ export class LegacyV3Evaluator { if (Array.isArray(screenshot)) { return this._evaluateWithMultipleScreenshots({ question, + answer, screenshots: screenshot, systemPrompt, agentReasoning, @@ -224,12 +225,14 @@ export class LegacyV3Evaluator { private async _evaluateWithMultipleScreenshots(options: { question: string; + answer?: string; screenshots: Buffer[]; systemPrompt?: string; agentReasoning?: string; }): Promise { const { question, + answer, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task. @@ -272,6 +275,9 @@ export class LegacyV3Evaluator { ? 
`Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.` : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`, }, + ...(answer + ? [{ type: "text" as const, text: `the answer is ${answer}` }] + : []), ...imageContents, ], }, diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index eda61d500..e73cde417 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -43,8 +43,11 @@ const publicApiShape = { isZod4Schema: Stagehand.isZod4Schema, jsonSchemaToZod: Stagehand.jsonSchemaToZod, loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv, + loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk, localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema, modelToAgentProviderMap: Stagehand.modelToAgentProviderMap, + nextResultFilename: Stagehand.nextResultFilename, + normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, toGeminiSchema: Stagehand.toGeminiSchema, diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts index 5e767ae57..2c2524238 100644 --- a/packages/core/tests/unit/public-api/v3-core.test.ts +++ b/packages/core/tests/unit/public-api/v3-core.test.ts @@ -139,7 +139,7 @@ describe("V3 Core public API types", () => { ( trajectory: Stagehand.Trajectory, taskSpec: Stagehand.TaskSpec, - ) => Promise + ) => Promise >(); expectTypeOf().toExtend< (taskSpec: Stagehand.TaskSpec) => Promise 
diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts index c755c86b7..b97c93ba2 100644 --- a/packages/core/tests/unit/v3-evaluator.test.ts +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -48,7 +48,7 @@ describe("V3Evaluator verifier facade", () => { ); }); - it("maps legacy YES evaluations with trajectory screenshots to a successful verdict", async () => { + it("maps legacy YES evaluations with trajectory screenshots to a successful result", async () => { const taskSpec: TaskSpec = { id: "success", instruction: "Complete the task", @@ -69,7 +69,7 @@ describe("V3Evaluator verifier facade", () => { value: { ask }, }); - const verdict = await evaluator.verify(trajectory, taskSpec); + const result = await evaluator.verify(trajectory, taskSpec); expect(ask).toHaveBeenCalledWith( expect.objectContaining({ @@ -78,13 +78,10 @@ describe("V3Evaluator verifier facade", () => { answer: "The task is complete.", }), ); - expect(verdict.outcomeSuccess).toBe(true); - expect(verdict.processScore).toBe(1); - expect(verdict.perCriterion[0]).toMatchObject({ - criterion: "legacy-task-completion", - earnedPoints: 1, - evidenceInsufficient: false, - }); + expect(result.outcomeSuccess).toBe(true); + expect(result.explanation).toBe("The screenshot shows completion."); + expect(result.processScore).toBeUndefined(); + expect(result.perCriterion).toBeUndefined(); }); it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => { @@ -114,9 +111,11 @@ describe("V3Evaluator verifier facade", () => { const firstCall = ask.mock.calls[0]?.[0]; expect(firstCall?.agentReasoning).toContain(longToolOutput); + expect(firstCall?.agentReasoning).not.toContain("Final answer:"); + expect(firstCall?.answer).toBe("The task is complete."); }); - it("returns an evidence-insufficient legacy verdict for empty trajectories", async () => { + it("returns an evidence-insufficient legacy result for empty trajectories", 
async () => { const taskSpec: TaskSpec = { id: "empty", instruction: "Complete the task", @@ -125,13 +124,23 @@ describe("V3Evaluator verifier facade", () => { backend: "legacy", }); - const verdict = await evaluator.verify( + const result = await evaluator.verify( makeEmptyTrajectory(taskSpec), taskSpec, ); - expect(verdict.outcomeSuccess).toBe(false); - expect(verdict.evidenceInsufficient).toEqual(["legacy-task-completion"]); + expect(result).toMatchObject({ + outcomeSuccess: false, + explanation: + "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.", + rawSteps: { + backend: "legacy", + legacyEvaluation: "INVALID", + screenshotCount: 0, + }, + }); + expect(result.processScore).toBeUndefined(); + expect(result.perCriterion).toBeUndefined(); }); it("rejects invalid evaluator backend env values", () => { diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts index 51f9c0b8b..4b09e53a1 100644 --- a/packages/core/tests/unit/verifier-trajectory.test.ts +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -6,7 +6,7 @@ import { describe, expect, it } from "vitest"; import { loadTrajectoryFromDisk, - nextVerdictFilename, + nextResultFilename, normalizeRubric, } from "../../lib/v3/verifier/trajectory.js"; @@ -54,9 +54,7 @@ describe("verifier trajectory utilities", () => { criterion: "Criterion", description: "Description", maxPoints: 3, - earnedPoints: 2, condition: "Only if relevant", - justification: "Partial credit.", }, ], }); @@ -144,9 +142,9 @@ describe("verifier trajectory utilities", () => { ); }); - it("sanitizes verdict filename labels", () => { - expect(nextVerdictFilename("rescore / task:one?")).toBe( - "mmrubric_rescore___task_one_.json", + it("sanitizes result filename labels", () => { + expect(nextResultFilename("rescore / task:one?")).toBe( + "result_rescore___task_one_.json", ); }); }); From 60e43217aad0449ce258071d11809b0f04178bbc Mon Sep 17 
00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:47:31 -0700 Subject: [PATCH 14/14] docs(verifier): align evaluator changeset wording --- .changeset/verifier-evaluator-shell.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/verifier-evaluator-shell.md b/.changeset/verifier-evaluator-shell.md index 8e603b499..4cac71a83 100644 --- a/.changeset/verifier-evaluator-shell.md +++ b/.changeset/verifier-evaluator-shell.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -Add verifier trajectory, rubric, and verdict types with normalized public naming. +Add verifier trajectory, rubric, and evaluation-result types with normalized public naming.