diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
new file mode 100644
index 000000000..8dc40bd2b
--- /dev/null
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -0,0 +1,177 @@
+/**
+ * verifierAdapter — runs a bench task through the verifier pipeline.
+ *
+ * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
+ * with one call:
+ *
+ *   const { evaluationResult, trajectory } = await runWithVerifier({
+ *     v3,
+ *     agent,
+ *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
+ *     dataset: "webtailbench", agentOptions: { maxSteps: 50 },
+ *   });
+ *
+ * Behavior:
+ *   1. Resolves the rubric from the task, cache, or evaluator.
+ *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
+ *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
+ *   4. Returns { trajectory, evaluationResult, agentResult }.
+ *
+ * Persistence and rubric caching are gated by env vars:
+ *   VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
+ *   VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
+ *     fresh rubric generation every time).
+ */
+import {
+  V3Evaluator,
+  normalizeRubric,
+  type AgentInstance,
+  type AgentExecuteOptions,
+  type AgentResult,
+  type EvaluationResult,
+  type Rubric,
+  type TaskSpec,
+  type Trajectory,
+  type V3,
+} from "@browserbasehq/stagehand";
+
+import { RubricCache } from "./rubricCache.js";
+import { TrajectoryRecorder } from "./trajectoryRecorder.js";
+
+export interface RunWithVerifierOptions {
+  v3: V3;
+  agent: AgentInstance;
+  taskSpec: TaskSpec;
+  /**
+   * Dataset name for rubric cache partitioning. Each task lives under
+   * `.rubric-cache/<dataset>/<taskId>.json`.
+   */
+  dataset: string;
+  /** Agent execute options. `instruction` is filled from taskSpec.instruction. */
+  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
+  /** Override the run id (defaults to ISO timestamp). */
+  runId?: string;
+  /** Override trajectory persistence root. */
+  trajectoryRoot?: string;
+}
+
+export interface RunWithVerifierResult {
+  trajectory: Trajectory;
+  evaluationResult: EvaluationResult;
+  agentResult: AgentResult;
+  /** Resolved rubric (precomputed, cached, or freshly generated). */
+  rubric: Rubric;
+  /** Where the trajectory was persisted (or would have been, if disabled). */
+  trajectoryDir: string;
+}
+
+export async function runWithVerifier(
+  opts: RunWithVerifierOptions,
+): Promise<RunWithVerifierResult> {
+  const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
+    opts;
+  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
+
+  // ── Resolve rubric ──────────────────────────────────────────────────────
+  let resolvedRubric: Rubric;
+  if (taskSpec.precomputedRubric) {
+    resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
+  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+    resolvedRubric = await evaluator.generateRubric(taskSpec);
+  } else {
+    const cache = new RubricCache({ dataset });
+    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
+  }
+
+  // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
+  const hydratedTaskSpec: TaskSpec = {
+    ...taskSpec,
+    precomputedRubric: resolvedRubric,
+  };
+
+  // ── Record trajectory around agent.execute() ────────────────────────────
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec: hydratedTaskSpec,
+    runId,
+    outputRoot: trajectoryRoot,
+  });
+  recorder.start();
+
+  let agentResult: AgentResult;
+  let recorderStatus: "complete" | "aborted" | "error" = "complete";
+  try {
+    agentResult = await agent.execute({
+      ...agentOptions,
+      instruction: taskSpec.instruction,
+    });
+  } catch (e) {
+    recorderStatus = "error";
+    const trajectory = await recorder.finish({ status: recorderStatus });
+    // Re-throw after persisting so the bench task can decide how to report.
+    const wrapped = e instanceof Error ? e : new Error(String(e));
+    Object.assign(wrapped, { trajectoryDir: recorder.directory, trajectory });
+    throw wrapped;
+  }
+
+  const trajectory = await recorder.finish({
+    status: recorderStatus,
+    finalAnswer: agentResult.message,
+    usage: agentResult.usage,
+  });
+
+  // ── Verify ──────────────────────────────────────────────────────────────
+  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistResult(evaluationResult);
+
+  return {
+    trajectory,
+    evaluationResult,
+    agentResult,
+    rubric: resolvedRubric,
+    trajectoryDir: recorder.directory,
+  };
+}
+
+/**
+ * Decide bench task success from an EvaluationResult using the --success flag's
+ * semantics.
+ *
+ *   `outcome` (default) — strict binary outcome.
+ *   `process`           — rubric process score ≥ threshold (default 0.8).
+ *   `both`              — both conditions must hold.
+ */
+export type EvalSuccessMode = "outcome" | "process" | "both";
+
+export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode {
+  if (typeof mode !== "string") return "outcome";
+  const normalized = mode.trim().toLowerCase();
+  if (
+    normalized === "outcome" ||
+    normalized === "process" ||
+    normalized === "both"
+  ) {
+    return normalized;
+  }
+  return "outcome";
+}
+
+export function evaluationResultToSuccess(
+  result: EvaluationResult,
+  mode: unknown = "outcome",
+  processThreshold = 0.8,
+): boolean {
+  const resolvedMode = resolveEvalSuccessMode(mode);
+  const outcomeOk = result.outcomeSuccess;
+  const processOk =
+    typeof result.processScore === "number" &&
+    result.processScore >= processThreshold;
+  switch (resolvedMode) {
+    case "outcome":
+      return outcomeOk;
+    case "process":
+      return processOk;
+    case "both":
+      return outcomeOk && processOk;
+  }
+}
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index 5a6763390..22fb87de1 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@ const JSONL_PATH = path.join(
   "WebTailBench_data.jsonl",
 );
 
-interface Rubric {
+interface RawRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@ interface LocalRow {
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: Rubric;
+  precomputed_rubric?: RawRubric;
 }
 
 /**
@@ -114,12 +114,12 @@ async function main(): Promise<void> {
     );
   }
 
-  const rubricsById = new Map<string, Rubric>();
+  const rubricsById = new Map<string, RawRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
@@ -149,7 +149,7 @@ async function main(): Promise<void> {
   }
 
   console.log(
-    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+    `  ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
   );
 
   await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts
index 4bd2015fa..d704449c2 100644
--- a/packages/evals/suites/webtailbench.ts
+++ b/packages/evals/suites/webtailbench.ts
@@ -1,5 +1,5 @@
 import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js";
-import type { AvailableModel } from "@browserbasehq/stagehand";
+import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand";
 import { tasksConfig } from "../taskConfig.js";
 import { getPackageRootDir } from "../runtimePaths.js";
 import {
@@ -32,6 +32,12 @@ export const buildWebTailBenchTestcases = (
     ques: string;
     category?: string;
     web?: string;
+    /**
+     * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv
+     * via packages/evals/scripts/backfill-webtailbench-rubrics.ts.
+     * When present, the verifier uses these upstream criteria directly.
+     */
+    precomputed_rubric?: unknown;
     [key: string]: unknown;
   };
 
@@ -42,7 +48,23 @@ export const buildWebTailBenchTestcases = (
   }
 
   const candidates = parseJsonlRows(lines, isWebTailBenchRow);
-  const rows = applySampling(candidates, sampleCount, maxCases);
+
+  // EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs,
+  // preserving the order given and ignoring sampling / limit knobs.
+  const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS
+    ? process.env.EVAL_WEBTAILBENCH_IDS.split(",")
+        .map((s) => s.trim())
+        .filter(Boolean)
+    : null;
+  let rows: WebTailBenchRow[];
+  if (explicitIds && explicitIds.length > 0) {
+    const byId = new Map(candidates.map((r) => [r.id, r]));
+    rows = explicitIds
+      .map((id) => byId.get(id))
+      .filter((r): r is WebTailBenchRow => Boolean(r));
+  } else {
+    rows = applySampling(candidates, sampleCount, maxCases);
+  }
 
   const allTestcases: Testcase[] = [];
   for (const modelEntry of normalizeAgentModelEntries(models)) {
@@ -57,6 +79,7 @@ export const buildWebTailBenchTestcases = (
           category: row.category,
           ques: row.ques,
           web: row.web,
+          precomputed_rubric: normalizeRubric(row.precomputed_rubric),
         },
       },
     };
     const taskCategories =
diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts
index 7f60e5775..e3791348b 100644
--- a/packages/evals/tasks/bench/agent/webtailbench.ts
+++ b/packages/evals/tasks/bench/agent/webtailbench.ts
@@ -1,19 +1,34 @@
+import { normalizeRubric, type TaskSpec } from "@browserbasehq/stagehand";
+
 import { defineBenchTask } from "../../../framework/defineTask.js";
-import { V3Evaluator } from "@browserbasehq/stagehand";
-import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
-import { imageResize } from "../../../utils/imageResize.js";
+import {
+  evaluationResultToSuccess,
+  runWithVerifier,
+} from "../../../framework/verifierAdapter.js";
 
+/**
+ * WebTailBench bench task.
+ *
+ * Runs the agent through TrajectoryRecorder + V3Evaluator.verify() so process
+ * and outcome scoring are grounded in saved trajectory evidence.
+ *
+ * If a row does not carry `precomputed_rubric`, the verifier generates a
+ * rubric on first encounter per task id and caches it under
+ * packages/evals/.rubric-cache/webtailbench/.
+ *
+ * --success knob: defaults to "outcome".
+ * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both.
+ */
 export default defineBenchTask(
   { name: "agent/webtailbench" },
   async ({ v3, logger, debugUrl, sessionUrl, modelName, input }) => {
-    let screenshotCollector: ScreenshotCollector | null = null;
-
     try {
       const params = ((input && input.params) || {}) as {
         id?: string;
         category?: string;
         ques?: string;
         web?: string;
+        precomputed_rubric?: unknown;
       };
 
       if (!params.ques) {
@@ -27,11 +42,8 @@ export default defineBenchTask(
       }
 
       const page = v3.context.pages()[0];
-      // web field is always empty in WebTailBench; start from Google
      const startUrl = params.web || "https://www.google.com";
-      await page.goto(startUrl, {
-        timeoutMs: 120_000,
-      });
+      await page.goto(startUrl, { timeoutMs: 120_000 });
 
       const systemPrompt = `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. You will need to navigate to the appropriate website to complete the task.`;
       const agentMode = input.agentMode ?? (input.isCUA ? "cua" : "hybrid");
@@ -41,70 +53,60 @@ export default defineBenchTask(
         systemPrompt,
       });
 
-      screenshotCollector = new ScreenshotCollector(v3, {
-        interval: 3000,
-        maxScreenshots: 8,
-      });
-      screenshotCollector.start();
-
-      const agentResult = await agent.execute({
+      const taskSpec: TaskSpec = {
+        id: params.id ?? `webtailbench/${input.name}`,
         instruction: params.ques,
-        maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
-      });
+        initUrl: startUrl,
+        precomputedRubric: normalizeRubric(params.precomputed_rubric),
+      };
 
-      // Stop collecting and get all screenshots
-      let screenshots = await screenshotCollector.stop();
+      const { evaluationResult, trajectory, trajectoryDir, rubric } =
+        await runWithVerifier({
+          v3,
+          agent,
+          taskSpec,
+          dataset: "webtailbench",
+          agentOptions: {
+            maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50,
+          },
+        });
 
-      // Resize screenshots if we have any
-      if (screenshots.length > 0) {
-        screenshots = await Promise.all(
-          screenshots.map(async (screenshot) => {
-            return await imageResize(screenshot, 0.7);
-          }),
-        );
-      }
+      const successMode = process.env.EVAL_SUCCESS_MODE;
 
       logger.log({
         category: "evaluation",
-        message: `Collected ${screenshots.length} screenshots for evaluation`,
+        message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`,
         level: 1,
       });
 
-      const evaluator = new V3Evaluator(v3);
-      const evalResult = await evaluator.ask({
-        question: `Did the agent successfully complete this task: "${params.ques}"? Note that the agent does not have purchasing/booking capabilities; mark as pass if the agent has successfully performed all necessary steps for the task up to the point of purchasing/booking/entering payment/user information`,
-        screenshot: screenshots,
-        agentReasoning:
-          agentResult.message ||
-          "no reasoning available, agent potentially hit step limit",
-      });
-
-      // Clear screenshot buffers to free memory
-      screenshots.length = 0;
-
       return {
-        _success: evalResult.evaluation === "YES",
-        reasoning: evalResult.reasoning,
+        _success: evaluationResultToSuccess(evaluationResult, successMode),
+        outcomeSuccess: evaluationResult.outcomeSuccess,
+        processScore: evaluationResult.processScore,
+        evidenceInsufficient: evaluationResult.evidenceInsufficient,
+        criterionCount: rubric.items.length,
+        stepCount: trajectory.steps.length,
+        trajectoryDir,
+        primaryIntent: evaluationResult.rawSteps?.primaryIntent,
+        reasoning: evaluationResult.rawSteps?.reasoning,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
       };
     } catch (error) {
+      const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
       return {
         _success: false,
         error,
+        trajectoryDir,
         debugUrl,
         sessionUrl,
         logs: logger.getLogs(),
       };
-    } finally {
-      if (screenshotCollector) {
-        try {
-          await screenshotCollector.stop();
-        } catch {
-          // Ignore errors during cleanup
-        }
-      }
     }
   },
 );
+
+function formatProcessScore(score: number | undefined): string {
+  return typeof score === "number" ? score.toFixed(2) : "n/a";
+}
diff --git a/packages/evals/tests/framework/verifierAdapter.test.ts b/packages/evals/tests/framework/verifierAdapter.test.ts
new file mode 100644
index 000000000..fc21cdd3f
--- /dev/null
+++ b/packages/evals/tests/framework/verifierAdapter.test.ts
@@ -0,0 +1,49 @@
+import { describe, expect, it } from "vitest";
+import type { EvaluationResult } from "@browserbasehq/stagehand";
+
+import {
+  evaluationResultToSuccess,
+  resolveEvalSuccessMode,
+} from "../../framework/verifierAdapter.js";
+
+const baseResult: EvaluationResult = {
+  outcomeSuccess: true,
+  processScore: 0.5,
+  perCriterion: [],
+  taskValidity: { isAmbiguous: false, isInvalid: false },
+  evidenceInsufficient: [],
+};
+
+describe("resolveEvalSuccessMode", () => {
+  it("defaults invalid env/config values to outcome", () => {
+    expect(resolveEvalSuccessMode(undefined)).toBe("outcome");
+    expect(resolveEvalSuccessMode("bad-value")).toBe("outcome");
+    expect(resolveEvalSuccessMode("  PROCESS  ")).toBe("process");
+  });
+});
+
+describe("evaluationResultToSuccess", () => {
+  it("uses validated success modes", () => {
+    expect(evaluationResultToSuccess(baseResult, "outcome")).toBe(true);
+    expect(evaluationResultToSuccess(baseResult, "process")).toBe(false);
+    expect(evaluationResultToSuccess(baseResult, "both")).toBe(false);
+    expect(evaluationResultToSuccess(baseResult, "invalid")).toBe(true);
+  });
+
+  it("treats missing process score as a failed process gate", () => {
+    const outcomeOnly: EvaluationResult = { outcomeSuccess: true };
+    expect(evaluationResultToSuccess(outcomeOnly, "outcome")).toBe(true);
+    expect(evaluationResultToSuccess(outcomeOnly, "process")).toBe(false);
+    expect(evaluationResultToSuccess(outcomeOnly, "both")).toBe(false);
+  });
+});
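+
+// Illustrative boundary check (editor-added sketch, not part of the original
+// change set): evaluationResultToSuccess compares processScore with `>=`
+// against a default threshold of 0.8, so a score exactly at 0.8 passes.
+describe("evaluationResultToSuccess process threshold", () => {
+  it("counts a process score equal to the 0.8 default as passing", () => {
+    const atThreshold: EvaluationResult = { ...baseResult, processScore: 0.8 };
+    expect(evaluationResultToSuccess(atThreshold, "process")).toBe(true);
+    expect(evaluationResultToSuccess(atThreshold, "both")).toBe(true);
+  });
+});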