diff --git a/.changeset/verifier-evaluator-shell.md b/.changeset/verifier-evaluator-shell.md
new file mode 100644
index 000000000..4cac71a83
--- /dev/null
+++ b/.changeset/verifier-evaluator-shell.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add verifier trajectory, rubric, and evaluation-result types with normalized public naming.
diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts
index ffb6726df..8e21fb030 100644
--- a/packages/core/lib/v3/index.ts
+++ b/packages/core/lib/v3/index.ts
@@ -24,6 +24,11 @@ import { tool } from "ai";
 import { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 import { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
 import { maybeRunShutdownSupervisorFromArgv } from "./shutdown/supervisor.js";
+import {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./verifier/index.js";
 
 export { V3 } from "./v3.js";
 export { V3 as Stagehand } from "./v3.js";
@@ -64,6 +69,31 @@ export type {
   V3EvaluatorConstructorOptions,
   V3EvaluatorOptions,
 } from "../v3Evaluator.js";
+export type {
+  Trajectory,
+  TrajectoryStep,
+  TrajectoryStatus,
+  TrajectoryUsage,
+  TaskSpec,
+  Rubric,
+  RubricCriterion,
+  AgentEvidence,
+  AgentEvidenceModality,
+  ProbeEvidence,
+  ToolOutput,
+  Verifier,
+  EvaluationResult,
+  CriterionScore,
+  FirstPointOfFailure,
+  TaskValidity,
+  VerifierFinding,
+  VerifierRawSteps,
+} from "./verifier/index.js";
+export {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./verifier/index.js";
 export { tool } from "ai";
 export { getAISDKLanguageModel } from "./llm/LLMProvider.js";
 export { __internalCreateInMemoryAgentCacheHandle } from "./cache/serverAgentCache.js";
@@ -114,6 +144,9 @@ const StagehandDefault = {
   toJsonSchema,
   connectToMCPServer,
   V3Evaluator,
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
   tool,
   getAISDKLanguageModel,
   __internalCreateInMemoryAgentCacheHandle,
diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts
new file mode 100644
index 000000000..4061533ab
--- /dev/null
+++ b/packages/core/lib/v3/verifier/index.ts
@@ -0,0 +1,28 @@
+/**
+ * Public re-exports for the verifier subsystem.
+ */
+export type {
+  AgentEvidence,
+  AgentEvidenceModality,
+  CriterionScore,
+  EvaluationResult,
+  FirstPointOfFailure,
+  ProbeEvidence,
+  Rubric,
+  RubricCriterion,
+  TaskSpec,
+  TaskValidity,
+  ToolOutput,
+  Trajectory,
+  TrajectoryStatus,
+  TrajectoryStep,
+  TrajectoryUsage,
+  Verifier,
+  VerifierFinding,
+  VerifierRawSteps,
+} from "./types.js";
+export {
+  loadTrajectoryFromDisk,
+  nextResultFilename,
+  normalizeRubric,
+} from "./trajectory.js";
diff --git a/packages/core/lib/v3/verifier/trajectory.ts b/packages/core/lib/v3/verifier/trajectory.ts
new file mode 100644
index 000000000..a18f025c3
--- /dev/null
+++ b/packages/core/lib/v3/verifier/trajectory.ts
@@ -0,0 +1,189 @@
+import type {
+  AgentEvidenceModality,
+  ProbeEvidence,
+  Rubric,
+  Trajectory,
+  TrajectoryStep,
+} from "./types.js";
+
+type RawRubricCriterion = {
+  criterion: unknown;
+  description: unknown;
+  max_points?: unknown;
+  maxPoints?: unknown;
+  condition?: unknown;
+};
+
+type RawRubric = {
+  items?: unknown;
+};
+
+/**
+ * Convert dataset or generated rubric JSON into the public Stagehand shape.
+ * Snake-case dataset fields are accepted here so serialized quirks do not leak
+ * into the canonical rubric type.
+ */
+export function normalizeRubric(rubric: unknown): Rubric | undefined {
+  if (rubric == null) return undefined;
+  if (typeof rubric !== "object") {
+    throw new TypeError("Rubric must be an object");
+  }
+
+  const rawRubric = rubric as RawRubric;
+  if (!Array.isArray(rawRubric.items)) {
+    throw new TypeError("Rubric is missing an items array");
+  }
+
+  return {
+    items: rawRubric.items.map((item) => {
+      const criterion = normalizeRequiredString(item.criterion, "criterion");
+      const description = normalizeRequiredString(
+        item.description,
+        "description",
+      );
+      const maxPoints = normalizeMaxPoints(item);
+
+      if (typeof maxPoints !== "number" || !Number.isFinite(maxPoints)) {
+        throw new TypeError(
+          `Rubric criterion "${criterion}" is missing a numeric maxPoints value`,
+        );
+      }
+
+      return {
+        criterion,
+        description,
+        maxPoints,
+        ...(typeof item.condition === "string" && {
+          condition: item.condition,
+        }),
+      };
+    }),
+  };
+}
+
+function normalizeRequiredString(value: unknown, fieldName: string): string {
+  if (typeof value === "string" && value.length) {
+    return value;
+  }
+
+  throw new TypeError(`Rubric criterion is missing a ${fieldName} value`);
+}
+
+function normalizeMaxPoints(item: RawRubricCriterion): unknown {
+  return item.maxPoints ?? item.max_points;
+}
+
+function normalizeResultLabel(label?: string): string {
+  return (label ?? `rescore-${new Date().toISOString()}`).replace(
+    /[^A-Za-z0-9._-]/g,
+    "_",
+  );
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// On-disk loader
+// ─────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Hydrate a Trajectory from the on-disk directory layout written by
+ * TrajectoryRecorder.persist(). Used by the offline re-scoring CLI (`bench
+ * verify`) and by any consumer that wants to feed a saved trajectory back
+ * into V3Evaluator.verify() without running an agent.
+ *
+ * Reverses the recorder's serialization tweaks:
+ * - `probeEvidence.screenshotPath` → read file into `probeEvidence.screenshot`.
+ * - Image modalities in `agentEvidence.modalities` carry `bytesBase64` on
+ *   disk (human-readable JSON) instead of raw Buffer; we decode back.
+ *
+ * @param dir absolute or cwd-relative path to a single task's trajectory
+ *   directory (the directory containing `trajectory.json`).
+ */
+export async function loadTrajectoryFromDisk(dir: string): Promise<Trajectory> {
+  const fs = await import("node:fs/promises");
+  const path = await import("node:path");
+  const trajectoryDir = path.resolve(dir);
+
+  const trajectoryPath = path.join(trajectoryDir, "trajectory.json");
+  const raw = await fs.readFile(trajectoryPath, "utf8");
+  const parsed = JSON.parse(raw) as Trajectory & {
+    steps: Array<
+      TrajectoryStep & {
+        agentEvidence: {
+          modalities: Array<
+            | { type: "text"; content: string }
+            | {
+                type: "image";
+                mediaType: string;
+                // On-disk form (recorder writes base64); accept either to
+                // tolerate hand-edited fixtures.
+                bytes?: unknown;
+                bytesBase64?: string;
+              }
+            | { type: "json"; content: unknown }
+          >;
+        };
+        probeEvidence: ProbeEvidence;
+      }
+    >;
+  };
+
+  const resolveWithinTrajectoryDir = (candidate: string): string => {
+    const resolved = path.resolve(trajectoryDir, candidate);
+    const relative = path.relative(trajectoryDir, resolved);
+    const outside =
+      relative === ".." ||
+      relative.startsWith(`..${path.sep}`) ||
+      path.isAbsolute(relative);
+
+    if (outside) {
+      throw new Error(
+        `Trajectory screenshotPath escapes trajectory directory: ${candidate}`,
+      );
+    }
+
+    return resolved;
+  };
+
+  for (const step of parsed.steps) {
+    // Rehydrate tier-2 probe screenshot from its on-disk file reference.
+    const probe = step.probeEvidence;
+    if (probe?.screenshotPath && !probe.screenshot) {
+      const resolved = resolveWithinTrajectoryDir(probe.screenshotPath);
+      try {
+        probe.screenshot = await fs.readFile(resolved);
+      } catch {
+        // Missing screenshot file: leave probe.screenshot unset. The verifier's
+        // evidence_insufficient path will handle it.
+      }
+    }
+
+    // Decode image modalities from base64 back to Buffer.
+    if (step.agentEvidence?.modalities) {
+      step.agentEvidence.modalities = step.agentEvidence.modalities.map((m) => {
+        // The on-disk shape carries bytesBase64 instead of bytes, so we look
+        // through `unknown` here rather than rely on the typed union.
+        const raw = m as unknown as { bytesBase64?: string };
+        if (m.type === "image" && typeof raw.bytesBase64 === "string") {
+          return {
+            type: "image" as const,
+            bytes: Buffer.from(raw.bytesBase64, "base64"),
+            mediaType: m.mediaType,
+          };
+        }
+        return m as AgentEvidenceModality;
+      });
+    }
+  }
+
+  return parsed;
+}
+
+/**
+ * Build a `result*.json` filename for persisted evaluator output.
+ *
+ * Convention: the live run writes `result.json`; offline re-score attempts use
+ * a label-based name (e.g., `result_rescore-2026-05-11.json`) so they coexist
+ * without collisions and remain easy to diff.
+ */
+export function nextResultFilename(label?: string): string {
+  return `result_${normalizeResultLabel(label)}.json`;
+}
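Taken together, these helpers cover the offline path from dataset JSON to a rehydrated run. A minimal usage sketch, assuming the package-level exports added in `index.ts` above; the rubric contents and the trajectory path are illustrative:

```ts
import {
  loadTrajectoryFromDisk,
  nextResultFilename,
  normalizeRubric,
} from "@browserbasehq/stagehand";

// Dataset rubrics often arrive in snake_case; normalizeRubric converts them to
// the public camelCase shape and drops serialized scoring fields like
// earned_points.
const rubric = normalizeRubric({
  items: [
    {
      criterion: "Add ground beef to cart",
      description: "Full credit if the cart contains ground beef.",
      max_points: 2,
    },
  ],
});
// -> { items: [{ criterion, description, maxPoints: 2 }] }

// Rehydrate a saved run: screenshotPath references are read back into Buffers
// and base64 image modalities are decoded to bytes.
const trajectory = await loadTrajectoryFromDisk(
  ".trajectories/webtailbench/united_13", // illustrative path
);
console.log(rubric?.items.length, trajectory.steps.length);

// Label-based result filenames keep offline re-scores from clobbering the
// live run's result.json.
nextResultFilename("rescore smoke test"); // -> "result_rescore_smoke_test.json"
```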
diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts
new file mode 100644
index 000000000..88b7e275e
--- /dev/null
+++ b/packages/core/lib/v3/verifier/types.ts
@@ -0,0 +1,306 @@
+/**
+ * Shared verifier types for trajectories, rubrics, evidence, and results.
+ *
+ * The verifier consumes saved trajectories instead of a live browser. DOM and
+ * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve
+ * screenshots sent to the provider plus independent harness probes.
+ */
+
+/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */
+export interface TrajectoryUsage {
+  input_tokens: number;
+  output_tokens: number;
+  reasoning_tokens?: number;
+  cached_input_tokens?: number;
+  inference_time_ms?: number;
+}
+
+/** A single criterion in a Stagehand rubric. */
+export interface RubricCriterion {
+  /** Short name of the criterion (e.g., "Add ground beef to cart"). */
+  criterion: string;
+  /** What to evaluate and how to award partial credit. */
+  description: string;
+  /** Maximum points for this criterion. */
+  maxPoints: number;
+  /**
+   * Applicability rule for situational criteria. When this condition is not
+   * met, the criterion is excluded from scoring rather than counted as failed.
+   */
+  condition?: string;
+}
+
+/** A rubric — list of criteria for a task. */
+export interface Rubric {
+  items: RubricCriterion[];
+}
+
+/**
+ * Spec for a single task being verified. Carried both at runtime and into the
+ * verifier alongside the trajectory.
+ */
+export interface TaskSpec {
+  /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */
+  id: string;
+  /** Task instruction shown to the agent. */
+  instruction: string;
+  /** Starting URL, if any. */
+  initUrl?: string;
+  /** Rubric carried by the dataset or generated by a verifier backend. */
+  precomputedRubric?: Rubric;
+  /** Optional reference answer (set when dataset ships one). */
+  expectedAnswer?: string;
+}
+
+/**
+ * A single modality unit in tier-1 agent evidence. Mirrors the shape of
+ * ModelMessage content parts so we can reproduce what the LLM ingested.
+ */
+export type AgentEvidenceModality =
+  | { type: "text"; content: string }
+  | { type: "image"; bytes: Buffer; mediaType: string }
+  | { type: "json"; content: unknown };
+
+/**
+ * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the
+ * tool result for this step.
+ *
+ * Modes:
+ * - CUA: usually a single image modality (the screenshot sent to the provider).
+ * - Hybrid: tool result with optional screenshotBase64 → one image + one text.
+ * - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.
+ */
+export interface AgentEvidence {
+  modalities: AgentEvidenceModality[];
+}
+
+/**
+ * Tier 2 — independent harness probes around this step.
+ *
+ * If a probe wasn't captured, the field is absent (not null).
+ */
+export interface ProbeEvidence {
+  /** URL after the step's tool execution. */
+  url?: string;
+  /**
+   * Bus screenshot captured after the step. Path on disk is preferred once
+   * persisted; in-memory Buffer is used during a live run.
+   */
+  screenshot?: Buffer;
+  /** Reference to the persisted screenshot file under the trajectory dir. */
+  screenshotPath?: string;
+  /** Viewport scroll context. Lets the verifier reason about whether the agent saw the full page. */
+  scroll?: { top: number; pageHeight: number };
+  /** Accessibility tree snapshot. */
+  ariaTree?: string;
+  /** Verifier-requested probes, keyed by criterion id. */
+  onDemand?: Record<string, unknown>;
+}
+
+/** Outcome of a single tool execution as seen by the harness. */
+export interface ToolOutput {
+  ok: boolean;
+  /**
+   * The tool's return value. Same payload that flowed into agentEvidence
+   * modalities, but in its native shape (e.g., the extract result, the act
+   * describe-string) rather than serialized for the LLM.
+   */
+  result: unknown;
+  error?: string;
+}
+
+/** One step in a trajectory: action + reasoning + evidence + outcome. */
+export interface TrajectoryStep {
+  index: number;
+  actionName: string;
+  actionArgs: Record<string, unknown>;
+  /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
+  reasoning: string;
+  agentEvidence: AgentEvidence;
+  probeEvidence: ProbeEvidence;
+  toolOutput: ToolOutput;
+  /** ISO 8601 timestamp when the step's tool execution started. */
+  startedAt: string;
+  /** ISO 8601 timestamp when the step's tool execution finished. */
+  finishedAt: string;
+}
+
+/** Terminal status of the agent run. */
+export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
+
+/**
+ * Full trajectory for one task run.
+ *
+ * The on-disk layout is one directory per task:
+ *
+ *   .trajectories/<benchmark>/<task>/
+ *   ├── task_data.json — TaskSpec + result metadata
+ *   ├── trajectory.json — this object, with screenshotPath instead of bytes
+ *   ├── screenshot_1.png — probeEvidence.screenshot for step 1, etc.
+ *   ├── scores/
+ *   │   └── result.json — Result from V3Evaluator.verify()
+ *   ├── core.log — captured action log
+ *   └── times.json — step timing + token usage
+ */
+export interface Trajectory {
+  task: TaskSpec;
+  steps: TrajectoryStep[];
+  finalAnswer?: string;
+  status: TrajectoryStatus;
+  usage: TrajectoryUsage;
+  timing: { startedAt: string; endedAt: string };
+}
+
+/** Score for a single rubric criterion after evidence analysis + rescoring. */
+export interface CriterionScore {
+  /** Matches RubricCriterion.criterion (the criterion's short name). */
+  criterion: string;
+  /** Maximum possible points for this criterion. */
+  maxPoints: number;
+  /**
+   * Points earned post-evidence-analysis (paper's post_image_earned_points).
+   * Null if the criterion was conditional and its condition wasn't met (excluded
+   * from both numerator and denominator in the process score).
+   */
+  earnedPoints: number | null;
+  /** Verifier's explanation for the score. */
+  explanation: string;
+  /**
+   * True if the criterion is conditional and its condition was determined to
+   * be met. Absent for non-conditional criteria.
+   */
+  conditionMet?: boolean;
+  /**
+   * Set when the verifier had no evidence to ground this criterion in either
+   * tier. Per paper §2, treated as uncontrollable failure → full credit, but
+   * surfaced here so dashboards can flag low-confidence results.
+   */
+  evidenceInsufficient?: boolean;
+}
+
+/**
+ * First-point-of-failure analysis (paper Step 9a). Identifies the earliest
+ * step where the agent's trajectory went off-track, using a structured error
+ * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).
+ */
+export interface FirstPointOfFailure {
+  stepIndex: number;
+  /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */
+  errorCode: string;
+  /** Top-level category name (Selection, Hallucination, etc.). */
+  category: string;
+  /** Verifier's reasoning for selecting this point. */
+  description?: string;
+}
+
+/**
+ * Structured observation surfaced by the verifier that another agent or
+ * tooling could act on. Findings are emitted opportunistically by Step 8
+ * (outcome verification) when the verifier notices actionable patterns —
+ * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
+ *
+ * Not produced for every task: when nothing actionable surfaces, the
+ * `findings` array on the EvaluationResult is empty. Consumers should treat the
+ * field as advisory, not as part of the formal score.
+ */
+export interface VerifierFinding {
+  /**
+   * Category of the observation. Open-ended enum — additional categories may
+   * be added as verifier backends surface new failure modes.
+   */
+  category:
+    | "agent_tool_usage" // agent's tool calls had repeated issues (misclicks, wrong args, retries)
+    | "agent_strategy" // higher-level planning / decision-making problems
+    | "rubric_quality" // criteria were overly strict, ambiguous, or contradictory
+    | "trajectory_capture" // gaps in evidence (missing screenshots, empty steps)
+    | "task_specification" // task instruction was ambiguous / under- or over-specified
+    | "verifier_uncertainty" // verifier itself couldn't confidently decide
+    | "other";
+  /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */
+  severity: "info" | "warning" | "blocking";
+  /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */
+  description: string;
+  /**
+   * Optional concrete next action another agent could take. Should be
+   * specific enough that it can be acted on without further reasoning —
+   * e.g., "Try double_click instead of triple_click to clear placeholder
+   * text on this form field."
+   */
+  suggestedAction?: string;
+  /** Step indices in the trajectory where this pattern showed up. */
+  relatedSteps?: number[];
+}
+
+/** Stable debugging summary emitted by verifier backends. */
+export interface VerifierRawSteps {
+  backend?: "legacy" | "verifier";
+  primaryIntent?: string;
+  reasoning?: string;
+  rubricSource?: "precomputed" | "generated" | "none";
+  approach?: "a" | "b";
+  optionalsMode?: "folded" | "separate" | "skip";
+  totalEarned?: number;
+  totalMax?: number;
+  evidenceImages?: number;
+  evidenceTexts?: number;
+  evidenceOriginalScreenshots?: number;
+  legacyEvaluation?: string;
+  screenshotCount?: number;
+}
+
+/** Task-validity classification (paper Step 10). */
+export interface TaskValidity {
+  /** True if the task is underspecified / has multiple valid interpretations. */
+  isAmbiguous: boolean;
+  /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */
+  isInvalid: boolean;
+  /** Optional sub-codes from the task-classification taxonomy. */
+  ambiguityCodes?: string[];
+  invalidTaskCodes?: string[];
+}
+
+/**
+ * Evaluator output. Legacy evaluation may only populate outcome fields; richer
+ * verifier backends can also populate process scoring and diagnostics.
+ *
+ * Process and outcome are deliberately independent when both are present:
+ * an agent can follow the right steps but get blocked (high process, low
+ * outcome), or succeed through an unexpected path (variable process, high
+ * outcome).
+ */
+export interface EvaluationResult {
+  /** Did the agent accomplish the task from the user's perspective? */
+  outcomeSuccess: boolean;
+  /** Human-readable explanation for the outcome. */
+  explanation?: string;
+  /** Aggregated earned/max across applicable criteria, in [0, 1]. */
+  processScore?: number;
+  /** Per-criterion breakdown after rescoring. */
+  perCriterion?: CriterionScore[];
+  /** Step 9a — first step where the trajectory went off-track, if any. */
+  firstPointOfFailure?: FirstPointOfFailure;
+  /** Step 10 — task-itself ambiguity / validity. */
+  taskValidity?: TaskValidity;
+  /**
+   * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
+   * evidence resolved the question. Treated as uncontrollable → full credit,
+   * but flagged here so consumers can decide whether to discount the score.
+   */
+  evidenceInsufficient?: string[];
+  /**
+   * Structured observations from the verifier that a downstream tool or
+   * follow-up agent could act on. Opportunistic — empty when the verifier
+   * doesn't notice anything actionable. Not part of the score; advisory.
+   */
+  findings?: VerifierFinding[];
+  /** Debugging summary from the active evaluator backend. */
+  rawSteps?: VerifierRawSteps;
+}
+
+/**
+ * Verifier interface. Implementations consume a Trajectory + TaskSpec and
+ * return an EvaluationResult — they MUST NOT touch a live browser.
+ */ +export interface Verifier { + verify(trajectory: Trajectory, taskSpec: TaskSpec): Promise; +} diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts index e1d384f8c..379cf4589 100644 --- a/packages/core/lib/v3Evaluator.ts +++ b/packages/core/lib/v3Evaluator.ts @@ -9,11 +9,20 @@ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js"; import type { EvaluateOptions, BatchAskOptions, - EvaluationResult, + EvaluationResult as LegacyEvaluationResult, } from "./v3/types/private/evaluator.js"; import { V3 } from "./v3/v3.js"; import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js"; import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js"; +import type { + Trajectory, + TaskSpec, + EvaluationResult, + Rubric, + Verifier, + AgentEvidenceModality, + VerifierFinding, +} from "./v3/verifier/index.js"; const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND"; const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy"; @@ -43,7 +52,7 @@ type NormalizedConstructorOptions = { backend?: V3EvaluatorBackend; }; -export class V3Evaluator { +export class V3Evaluator implements Verifier { private readonly backend: V3EvaluatorBackend; private readonly legacyEvaluator: LegacyV3Evaluator; @@ -67,23 +76,80 @@ export class V3Evaluator { ); } - async ask(options: EvaluateOptions): Promise { + async ask(options: EvaluateOptions): Promise { return this.getLegacyBackend("ask").ask(options); } - async batchAsk(options: BatchAskOptions): Promise { + async batchAsk(options: BatchAskOptions): Promise { return this.getLegacyBackend("batchAsk").batchAsk(options); } + async verify( + trajectory: Trajectory, + taskSpec: TaskSpec, + ): Promise { + assertVerifierInput(trajectory, taskSpec); + + if (this.backend === "legacy") { + return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec); + } + + return this.unavailableVerifierBackend("verify"); + } + + async generateRubric(taskSpec: TaskSpec): Promise { + if (!taskSpec?.id) { + throw new StagehandInvalidArgumentError( + "TaskSpec.id is required for rubric generation", + ); + } + + if (this.backend === "verifier") { + return this.unavailableVerifierBackend("generateRubric"); + } + + return { + items: [legacyTaskCompletionCriterion(taskSpec)], + }; + } + private getLegacyBackend(methodName: string): LegacyV3Evaluator { if (this.backend === "legacy") { return this.legacyEvaluator; } + return this.unavailableVerifierBackend(methodName); + } + + private unavailableVerifierBackend(methodName: string): never { throw new StagehandInvalidArgumentError( `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`, ); } + + private async verifyTrajectoryWithLegacyEvaluator( + trajectory: Trajectory, + taskSpec: TaskSpec, + ): Promise { + const screenshots = collectLegacyScreenshots(trajectory); + const agentReasoning = renderLegacyAgentReasoning(trajectory); + const answer = trajectory.finalAnswer; + + if (!screenshots.length && !answer) { + return legacyInsufficientEvidenceResult( + "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.", + ); + } + + const result = await this.legacyEvaluator.ask({ + question: taskSpec.instruction, + screenshot: screenshots.length ? 
diff --git a/packages/core/lib/v3Evaluator.ts b/packages/core/lib/v3Evaluator.ts
index e1d384f8c..379cf4589 100644
--- a/packages/core/lib/v3Evaluator.ts
+++ b/packages/core/lib/v3Evaluator.ts
@@ -9,11 +9,20 @@ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
 import type {
   EvaluateOptions,
   BatchAskOptions,
-  EvaluationResult,
+  EvaluationResult as LegacyEvaluationResult,
 } from "./v3/types/private/evaluator.js";
 import { V3 } from "./v3/v3.js";
 import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
 import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js";
+import type {
+  Trajectory,
+  TaskSpec,
+  EvaluationResult,
+  Rubric,
+  Verifier,
+  AgentEvidenceModality,
+  VerifierFinding,
+} from "./v3/verifier/index.js";
 
 const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND";
 const DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = "legacy";
@@ -43,7 +52,7 @@ type NormalizedConstructorOptions = {
   backend?: V3EvaluatorBackend;
 };
 
-export class V3Evaluator {
+export class V3Evaluator implements Verifier {
   private readonly backend: V3EvaluatorBackend;
   private readonly legacyEvaluator: LegacyV3Evaluator;
 
@@ -67,23 +76,80 @@ export class V3Evaluator {
     );
   }
 
-  async ask(options: EvaluateOptions): Promise<EvaluationResult> {
+  async ask(options: EvaluateOptions): Promise<LegacyEvaluationResult> {
     return this.getLegacyBackend("ask").ask(options);
   }
 
-  async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {
+  async batchAsk(options: BatchAskOptions): Promise<LegacyEvaluationResult[]> {
     return this.getLegacyBackend("batchAsk").batchAsk(options);
   }
 
+  async verify(
+    trajectory: Trajectory,
+    taskSpec: TaskSpec,
+  ): Promise<EvaluationResult> {
+    assertVerifierInput(trajectory, taskSpec);
+
+    if (this.backend === "legacy") {
+      return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec);
+    }
+
+    return this.unavailableVerifierBackend("verify");
+  }
+
+  async generateRubric(taskSpec: TaskSpec): Promise<Rubric> {
+    if (!taskSpec?.id) {
+      throw new StagehandInvalidArgumentError(
+        "TaskSpec.id is required for rubric generation",
+      );
+    }
+
+    if (this.backend === "verifier") {
+      return this.unavailableVerifierBackend("generateRubric");
+    }
+
+    return {
+      items: [legacyTaskCompletionCriterion(taskSpec)],
+    };
+  }
+
   private getLegacyBackend(methodName: string): LegacyV3Evaluator {
     if (this.backend === "legacy") {
       return this.legacyEvaluator;
     }
 
+    return this.unavailableVerifierBackend(methodName);
+  }
+
+  private unavailableVerifierBackend(methodName: string): never {
     throw new StagehandInvalidArgumentError(
       `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`,
     );
   }
+
+  private async verifyTrajectoryWithLegacyEvaluator(
+    trajectory: Trajectory,
+    taskSpec: TaskSpec,
+  ): Promise<EvaluationResult> {
+    const screenshots = collectLegacyScreenshots(trajectory);
+    const agentReasoning = renderLegacyAgentReasoning(trajectory);
+    const answer = trajectory.finalAnswer;
+
+    if (!screenshots.length && !answer) {
+      return legacyInsufficientEvidenceResult(
+        "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
+      );
+    }
+
+    const result = await this.legacyEvaluator.ask({
+      question: taskSpec.instruction,
+      screenshot: screenshots.length ? screenshots : false,
+      answer,
+      agentReasoning,
+    });
+
+    return legacyEvaluationToResult(result, screenshots.length);
+  }
 }
 
 function normalizeConstructorOptions(
@@ -127,3 +193,142 @@ function resolveEvaluatorBackend(
     `Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`,
   );
 }
+
+function assertVerifierInput(trajectory: Trajectory, taskSpec: TaskSpec): void {
+  if (!taskSpec?.id) {
+    throw new StagehandInvalidArgumentError(
+      "TaskSpec.id is required for verification",
+    );
+  }
+  if (!trajectory) {
+    throw new StagehandInvalidArgumentError(
+      "Trajectory is required for verification",
+    );
+  }
+}
+
+function legacyTaskCompletionCriterion(taskSpec: TaskSpec) {
+  return {
+    criterion: "legacy-task-completion",
+    description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`,
+    maxPoints: 1,
+  };
+}
+
+function collectLegacyScreenshots(trajectory: Trajectory): Buffer[] {
+  const screenshots: Buffer[] = [];
+
+  for (const step of trajectory.steps ?? []) {
+    if (Buffer.isBuffer(step.probeEvidence?.screenshot)) {
+      screenshots.push(step.probeEvidence.screenshot);
+      continue;
+    }
+
+    const agentImage = step.agentEvidence?.modalities?.find(
+      (
+        modality,
+      ): modality is Extract<AgentEvidenceModality, { type: "image" }> =>
+        modality.type === "image" && Buffer.isBuffer(modality.bytes),
+    );
+
+    if (agentImage) {
+      screenshots.push(agentImage.bytes);
+    }
+  }
+
+  return screenshots;
+}
+
+function renderLegacyAgentReasoning(
+  trajectory: Trajectory,
+): string | undefined {
+  const stepLines = (trajectory.steps ?? []).map((step) => {
+    const output = step.toolOutput?.error
+      ? `Tool error: ${step.toolOutput.error}`
+      : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;
+    return [
+      `Step ${step.index}: ${step.actionName}`,
+      step.reasoning ? `Reasoning: ${step.reasoning}` : undefined,
+      output,
+    ]
+      .filter(Boolean)
+      .join("\n");
+  });
+
+  if (!stepLines.length) {
+    return undefined;
+  }
+
+  return truncateForPrompt(
+    `Agent trajectory:\n${stepLines.join("\n\n")}`,
+    16000,
+  );
+}
+
+function stringifyForPrompt(value: unknown): string {
+  if (typeof value === "string") {
+    return value;
+  }
+
+  try {
+    const serialized = JSON.stringify(value);
+    return serialized ?? String(value);
+  } catch {
+    return String(value);
+  }
+}
+
+function truncateForPrompt(value: string, maxLength: number): string {
+  if (value.length <= maxLength) {
+    return value;
+  }
+
+  return `${value.slice(0, maxLength)}... [truncated]`;
+}
+
+function legacyEvaluationToResult(
+  result: LegacyEvaluationResult,
+  screenshotCount: number,
+): EvaluationResult {
+  const outcomeSuccess = result.evaluation === "YES";
+  const invalid = result.evaluation === "INVALID";
+  const findings: VerifierFinding[] = invalid
+    ? [
+        {
+          category: "verifier_uncertainty",
+          severity: "warning",
+          description: result.reasoning,
+        },
+      ]
+    : [];
+
+  return {
+    outcomeSuccess,
+    explanation: result.reasoning,
+    ...(findings.length ? { findings } : {}),
+    rawSteps: {
+      backend: "legacy",
+      legacyEvaluation: result.evaluation,
+      screenshotCount,
+    },
+  };
+}
+
+function legacyInsufficientEvidenceResult(reason: string): EvaluationResult {
+  return {
+    outcomeSuccess: false,
+    explanation: reason,
+    findings: [
+      {
+        category: "trajectory_capture",
+        severity: "blocking",
+        description: reason,
+      },
+    ],
+    rawSteps: {
+      backend: "legacy",
+      legacyEvaluation: "INVALID",
+      screenshotCount: 0,
+    },
+  };
+}
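For reference, the legacy bridge collapses the old YES/NO/INVALID verdict into the richer result shape: YES maps to `outcomeSuccess: true`, INVALID additionally surfaces a `verifier_uncertainty` finding, and the process-scoring fields stay unset. A YES verdict over two screenshots comes out roughly as follows (values illustrative):

```ts
import type { EvaluationResult } from "@browserbasehq/stagehand";

// Shape produced by the legacy bridge for a YES verdict; `findings` is
// omitted entirely when the verdict is not INVALID, and processScore /
// perCriterion are never set by this backend.
const bridged: EvaluationResult = {
  outcomeSuccess: true,
  explanation: "The screenshots show the task reaching completion.",
  rawSteps: {
    backend: "legacy",
    legacyEvaluation: "YES",
    screenshotCount: 2,
  },
};
```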
[{ type: "text" as const, text: `the answer is ${answer}` }] + : []), ...imageContents, ], }, diff --git a/packages/core/tests/unit/public-api/export-surface.test.ts b/packages/core/tests/unit/public-api/export-surface.test.ts index eda61d500..e73cde417 100644 --- a/packages/core/tests/unit/public-api/export-surface.test.ts +++ b/packages/core/tests/unit/public-api/export-surface.test.ts @@ -43,8 +43,11 @@ const publicApiShape = { isZod4Schema: Stagehand.isZod4Schema, jsonSchemaToZod: Stagehand.jsonSchemaToZod, loadApiKeyFromEnv: Stagehand.loadApiKeyFromEnv, + loadTrajectoryFromDisk: Stagehand.loadTrajectoryFromDisk, localBrowserLaunchOptionsSchema: Stagehand.localBrowserLaunchOptionsSchema, modelToAgentProviderMap: Stagehand.modelToAgentProviderMap, + nextResultFilename: Stagehand.nextResultFilename, + normalizeRubric: Stagehand.normalizeRubric, pageTextSchema: Stagehand.pageTextSchema, providerEnvVarMap: Stagehand.providerEnvVarMap, toGeminiSchema: Stagehand.toGeminiSchema, diff --git a/packages/core/tests/unit/public-api/v3-core.test.ts b/packages/core/tests/unit/public-api/v3-core.test.ts index 8d710da4d..2c2524238 100644 --- a/packages/core/tests/unit/public-api/v3-core.test.ts +++ b/packages/core/tests/unit/public-api/v3-core.test.ts @@ -134,6 +134,18 @@ describe("V3 Core public API types", () => { >(); }); + it("has verifier facade methods", () => { + expectTypeOf().toExtend< + ( + trajectory: Stagehand.Trajectory, + taskSpec: Stagehand.TaskSpec, + ) => Promise + >(); + expectTypeOf().toExtend< + (taskSpec: Stagehand.TaskSpec) => Promise + >(); + }); + it("accepts legacy evaluator backend options", () => { const mockV3 = {} as Stagehand.Stagehand; expectTypeOf().toBeConstructibleWith( @@ -143,35 +155,6 @@ describe("V3 Core public API types", () => { } satisfies Stagehand.V3EvaluatorConstructorOptions, ); }); - - it("rejects verifier backend before the verifier PR is installed", async () => { - const evaluator = new Stagehand.V3Evaluator({} as Stagehand.Stagehand, { - backend: "verifier", - }); - - await expect( - evaluator.ask({ question: "Was the task completed?" }), - ).rejects.toThrow( - "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available", - ); - }); - - it("rejects invalid evaluator backend env values", () => { - const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; - process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; - - try { - expect( - () => new Stagehand.V3Evaluator({} as Stagehand.Stagehand), - ).toThrow('Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"'); - } finally { - if (previousBackend === undefined) { - delete process.env.STAGEHAND_EVALUATOR_BACKEND; - } else { - process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; - } - } - }); }); describe("V3FunctionName", () => { diff --git a/packages/core/tests/unit/v3-evaluator.test.ts b/packages/core/tests/unit/v3-evaluator.test.ts new file mode 100644 index 000000000..b97c93ba2 --- /dev/null +++ b/packages/core/tests/unit/v3-evaluator.test.ts @@ -0,0 +1,210 @@ +import { describe, expect, it, vi } from "vitest"; + +import { V3Evaluator } from "../../lib/v3Evaluator.js"; +import type { V3 } from "../../lib/v3/v3.js"; +import type { TaskSpec, Trajectory } from "../../lib/v3/verifier/index.js"; + +describe("V3Evaluator verifier facade", () => { + it("rejects verifier backend before the verifier PR is installed", async () => { + const evaluator = new V3Evaluator({} as V3, { + backend: "verifier", + }); + + await expect( + evaluator.ask({ question: "Was the task completed?" 
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("rejects verify when the verifier backend is selected before the verifier PR is installed", async () => {
+    const taskSpec: TaskSpec = {
+      id: "verifier-unavailable",
+      instruction: "Complete the task",
+    };
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.verify(makeTrajectory(taskSpec), taskSpec),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("rejects generateRubric when the verifier backend is selected before the verifier PR is installed", async () => {
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "verifier",
+    });
+
+    await expect(
+      evaluator.generateRubric({
+        id: "rubric-unavailable",
+        instruction: "Complete the task",
+      }),
+    ).rejects.toThrow(
+      "STAGEHAND_EVALUATOR_BACKEND=verifier, but the verifier backend is not available",
+    );
+  });
+
+  it("maps legacy YES evaluations with trajectory screenshots to a successful result", async () => {
+    const taskSpec: TaskSpec = {
+      id: "success",
+      instruction: "Complete the task",
+    };
+    const screenshot = Buffer.from("screenshot");
+    const trajectory = makeTrajectory(taskSpec, {
+      screenshot,
+      finalAnswer: "The task is complete.",
+    });
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The screenshot shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    const result = await evaluator.verify(trajectory, taskSpec);
+
+    expect(ask).toHaveBeenCalledWith(
+      expect.objectContaining({
+        question: taskSpec.instruction,
+        screenshot: [screenshot],
+        answer: "The task is complete.",
+      }),
+    );
+    expect(result.outcomeSuccess).toBe(true);
+    expect(result.explanation).toBe("The screenshot shows completion.");
+    expect(result.processScore).toBeUndefined();
+    expect(result.perCriterion).toBeUndefined();
+  });
+
+  it("keeps legacy tool output detail until the overall reasoning budget is reached", async () => {
+    const taskSpec: TaskSpec = {
+      id: "reasoning-budget",
+      instruction: "Complete the task",
+    };
+    const longToolOutput = "x".repeat(3000);
+    const ask = vi.fn().mockResolvedValue({
+      evaluation: "YES",
+      reasoning: "The trajectory shows completion.",
+    });
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+    Object.defineProperty(evaluator, "legacyEvaluator", {
+      value: { ask },
+    });
+
+    await evaluator.verify(
+      makeTrajectory(taskSpec, {
+        finalAnswer: "The task is complete.",
+        toolResult: longToolOutput,
+      }),
+      taskSpec,
+    );
+
+    const firstCall = ask.mock.calls[0]?.[0];
+    expect(firstCall?.agentReasoning).toContain(longToolOutput);
+    expect(firstCall?.agentReasoning).not.toContain("Final answer:");
+    expect(firstCall?.answer).toBe("The task is complete.");
+  });
+
+  it("returns an evidence-insufficient legacy result for empty trajectories", async () => {
+    const taskSpec: TaskSpec = {
+      id: "empty",
+      instruction: "Complete the task",
+    };
+    const evaluator = new V3Evaluator({} as V3, {
+      backend: "legacy",
+    });
+
+    const result = await evaluator.verify(
+      makeEmptyTrajectory(taskSpec),
+      taskSpec,
+    );
+
+    expect(result).toMatchObject({
+      outcomeSuccess: false,
+      explanation:
+        "Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.",
+      rawSteps: {
backend: "legacy", + legacyEvaluation: "INVALID", + screenshotCount: 0, + }, + }); + expect(result.processScore).toBeUndefined(); + expect(result.perCriterion).toBeUndefined(); + }); + + it("rejects invalid evaluator backend env values", () => { + const previousBackend = process.env.STAGEHAND_EVALUATOR_BACKEND; + process.env.STAGEHAND_EVALUATOR_BACKEND = "not-a-backend"; + + try { + expect(() => new V3Evaluator({} as V3)).toThrow( + 'Invalid STAGEHAND_EVALUATOR_BACKEND="not-a-backend"', + ); + } finally { + if (previousBackend === undefined) { + delete process.env.STAGEHAND_EVALUATOR_BACKEND; + } else { + process.env.STAGEHAND_EVALUATOR_BACKEND = previousBackend; + } + } + }); +}); + +function makeEmptyTrajectory(taskSpec: TaskSpec): Trajectory { + return { + task: taskSpec, + steps: [], + status: "complete", + usage: { + input_tokens: 0, + output_tokens: 0, + }, + timing: { + startedAt: new Date(0).toISOString(), + endedAt: new Date(0).toISOString(), + }, + }; +} + +function makeTrajectory( + taskSpec: TaskSpec, + options: { + screenshot?: Buffer; + finalAnswer?: string; + toolResult?: unknown; + } = {}, +): Trajectory { + return { + ...makeEmptyTrajectory(taskSpec), + steps: [ + { + index: 0, + actionName: "act", + actionArgs: {}, + reasoning: "I completed the task.", + agentEvidence: { modalities: [] }, + probeEvidence: options.screenshot + ? { screenshot: options.screenshot } + : {}, + toolOutput: { + ok: true, + result: options.toolResult ?? "done", + }, + startedAt: new Date(0).toISOString(), + finishedAt: new Date(0).toISOString(), + }, + ], + finalAnswer: options.finalAnswer, + }; +} diff --git a/packages/core/tests/unit/verifier-trajectory.test.ts b/packages/core/tests/unit/verifier-trajectory.test.ts new file mode 100644 index 000000000..4b09e53a1 --- /dev/null +++ b/packages/core/tests/unit/verifier-trajectory.test.ts @@ -0,0 +1,150 @@ +import { mkdtemp, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; + +import { describe, expect, it } from "vitest"; + +import { + loadTrajectoryFromDisk, + nextResultFilename, + normalizeRubric, +} from "../../lib/v3/verifier/trajectory.js"; + +describe("verifier trajectory utilities", () => { + it("normalizes serialized empty earned points out of public rubrics", () => { + expect( + normalizeRubric({ + items: [ + { + criterion: "Criterion", + description: "Description", + max_points: 1, + earned_points: "", + }, + ], + }), + ).toEqual({ + items: [ + { + criterion: "Criterion", + description: "Description", + maxPoints: 1, + }, + ], + }); + }); + + it("round-trips serialized snake_case rubrics to public camelCase rubrics", () => { + expect( + normalizeRubric({ + items: [ + { + criterion: "Criterion", + description: "Description", + max_points: 3, + earned_points: "2", + condition: "Only if relevant", + justification: "Partial credit.", + }, + ], + }), + ).toEqual({ + items: [ + { + criterion: "Criterion", + description: "Description", + maxPoints: 3, + condition: "Only if relevant", + }, + ], + }); + }); + + it("loads trajectory screenshots and image modalities from disk", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-")); + const screenshot = Buffer.from("probe screenshot"); + const agentImage = Buffer.from("agent image"); + await writeFile(path.join(dir, "screenshot_1.png"), screenshot); + await writeFile( + path.join(dir, "trajectory.json"), + JSON.stringify({ + task: { id: "task", instruction: "Do the task" }, + status: "complete", + usage: { 
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: {
+              modalities: [
+                {
+                  type: "image",
+                  mediaType: "image/png",
+                  bytesBase64: agentImage.toString("base64"),
+                },
+              ],
+            },
+            probeEvidence: { screenshotPath: "screenshot_1.png" },
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    const trajectory = await loadTrajectoryFromDisk(dir);
+    const modality = trajectory.steps[0].agentEvidence.modalities[0];
+
+    expect(trajectory.steps[0].probeEvidence.screenshot).toEqual(screenshot);
+    expect(modality.type).toBe("image");
+    if (modality.type === "image") {
+      expect(modality.bytes).toEqual(agentImage);
+    }
+  });
+
+  it("rejects screenshot paths outside the trajectory directory", async () => {
+    const dir = await mkdtemp(path.join(tmpdir(), "stagehand-verifier-"));
+    await writeFile(
+      path.join(dir, "trajectory.json"),
+      JSON.stringify({
+        task: { id: "task", instruction: "Do the task" },
+        status: "complete",
+        usage: { input_tokens: 0, output_tokens: 0 },
+        timing: {
+          startedAt: new Date(0).toISOString(),
+          endedAt: new Date(0).toISOString(),
+        },
+        steps: [
+          {
+            index: 0,
+            actionName: "act",
+            actionArgs: {},
+            reasoning: "",
+            agentEvidence: { modalities: [] },
+            probeEvidence: { screenshotPath: "../../../etc/passwd" },
+            toolOutput: { ok: true, result: null },
+            startedAt: new Date(0).toISOString(),
+            finishedAt: new Date(0).toISOString(),
+          },
+        ],
+      }),
+    );
+
+    await expect(loadTrajectoryFromDisk(dir)).rejects.toThrow(
+      "escapes trajectory directory",
+    );
+  });
+
+  it("sanitizes result filename labels", () => {
+    expect(nextResultFilename("rescore / task:one?")).toBe(
+      "result_rescore___task_one_.json",
+    );
+  });
+});
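End to end, the new surface supports the offline re-scoring loop described in the changeset: load a persisted run, verify it without a browser, and write the new score next to the original. A sketch under the assumption that the evaluator is wired to a configured Stagehand instance; the path and label are illustrative:

```ts
import { promises as fs } from "node:fs";
import path from "node:path";
import {
  Stagehand,
  V3Evaluator,
  loadTrajectoryFromDisk,
  nextResultFilename,
} from "@browserbasehq/stagehand";

// Offline re-score: no agent or browser runs; the evaluator reads only the
// saved trajectory. The directory layout matches the tree in types.ts.
const dir = ".trajectories/webtailbench/united_13"; // illustrative
const trajectory = await loadTrajectoryFromDisk(dir);

const stagehand = new Stagehand(/* ...project config... */);
const evaluator = new V3Evaluator(stagehand, { backend: "legacy" });
const result = await evaluator.verify(trajectory, trajectory.task);

// Live runs own scores/result.json; re-scores get a label-based sibling.
await fs.writeFile(
  path.join(dir, "scores", nextResultFilename("manual-rescore")),
  JSON.stringify(result, null, 2),
);
```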