From adca1438f266c6cc2e10c3878afbf43cf76010ab Mon Sep 17 00:00:00 2001
From: miguel
Date: Fri, 15 May 2026 13:50:01 -0700
Subject: [PATCH 1/5] feat(evals): wire WebTailBench through verifier

---
 packages/evals/framework/verifierAdapter.ts   | 160 ++++++++++++++
 .../evals/scripts/verify-webtailbench-task.ts | 200 ++++++++++++++++++
 packages/evals/suites/webtailbench.ts         |  29 ++-
 .../evals/tasks/bench/agent/webtailbench.ts   | 109 +++++----
 4 files changed, 445 insertions(+), 53 deletions(-)
 create mode 100644 packages/evals/framework/verifierAdapter.ts
 create mode 100644 packages/evals/scripts/verify-webtailbench-task.ts

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
new file mode 100644
index 000000000..5351b28b3
--- /dev/null
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -0,0 +1,160 @@
+/**
+ * verifierAdapter — runs a bench task through the verifier pipeline.
+ *
+ * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
+ * with one call:
+ *
+ *   const { verdict, trajectory } = await runWithVerifier({
+ *     v3,
+ *     agent,
+ *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
+ *     dataset, agentOptions: { maxSteps: 50 },
+ *   });
+ *
+ * Behavior:
+ *   1. Resolves the rubric — precomputedRubric (e.g., upstream WebTailBench),
+ *      or generates via Step 0a and caches under .rubric-cache/<dataset>/.
+ *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
+ *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
+ *   4. Returns { trajectory, verdict, agentResult }.
+ *
+ * Persistence and rubric caching are gated by env vars (plan §10 Q2 + Q3):
+ *   VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
+ *   VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
+ *     a fresh Step 0a call every time).
+ */
+import {
+  V3Evaluator,
+  type AgentInstance,
+  type AgentExecuteOptions,
+  type AgentResult,
+  type Rubric,
+  type TaskSpec,
+  type Trajectory,
+  type V3,
+  type Verdict,
+} from "@browserbasehq/stagehand";
+
+import { RubricCache } from "./rubricCache.js";
+import { TrajectoryRecorder } from "./trajectoryRecorder.js";
+
+export interface RunWithVerifierOptions {
+  v3: V3;
+  agent: AgentInstance;
+  taskSpec: TaskSpec;
+  /**
+   * Dataset name for rubric cache partitioning. Each task lives under
+   * `.rubric-cache/<dataset>/<id>.json`.
+   */
+  dataset: string;
+  /** Agent execute options. `instruction` is filled from taskSpec.instruction. */
+  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
+  /** Override the run id (defaults to ISO timestamp). */
+  runId?: string;
+  /** Override trajectory persistence root. */
+  trajectoryRoot?: string;
+}
+
+export interface RunWithVerifierResult {
+  trajectory: Trajectory;
+  verdict: Verdict;
+  agentResult: AgentResult;
+  /** Resolved rubric (precomputed, cached, or freshly generated). */
+  rubric: Rubric;
+  /** Where the trajectory was persisted (or would have been, if disabled). */
+  trajectoryDir: string;
+}
+
+export async function runWithVerifier(
+  opts: RunWithVerifierOptions,
+): Promise<RunWithVerifierResult> {
+  const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
+    opts;
+  const evaluator = new V3Evaluator(v3, { backend: "verifier" });
+
+  // ── Resolve rubric ──────────────────────────────────────────────────────
+  let resolvedRubric: Rubric;
+  if (taskSpec.precomputedRubric) {
+    resolvedRubric = taskSpec.precomputedRubric;
+  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+    resolvedRubric = await evaluator.generateRubric(taskSpec);
+  } else {
+    const cache = new RubricCache({ dataset });
+    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
+  }
+
+  // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
+  const hydratedTaskSpec: TaskSpec = {
+    ...taskSpec,
+    precomputedRubric: resolvedRubric,
+  };
+
+  // ── Record trajectory around agent.execute() ───────────────────────────
+  const recorder = new TrajectoryRecorder({
+    v3,
+    taskSpec: hydratedTaskSpec,
+    runId,
+    outputRoot: trajectoryRoot,
+  });
+  recorder.start();
+
+  let agentResult: AgentResult;
+  let recorderStatus: "complete" | "aborted" | "error" = "complete";
+  try {
+    agentResult = await agent.execute({
+      ...agentOptions,
+      instruction: taskSpec.instruction,
+    });
+  } catch (e) {
+    recorderStatus = "error";
+    const trajectory = await recorder.finish({ status: recorderStatus });
+    // Re-throw after persisting so the bench task can decide how to report.
+    const wrapped = e instanceof Error ? e : new Error(String(e));
+    Object.assign(wrapped, { trajectoryDir: recorder.directory, trajectory });
+    throw wrapped;
+  }
+
+  const trajectory = await recorder.finish({
+    status: recorderStatus,
+    finalAnswer: agentResult.message,
+    usage: agentResult.usage,
+  });
+
+  // ── Verify ──────────────────────────────────────────────────────────────
+  const verdict = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistVerdict(verdict);
+
+  return {
+    trajectory,
+    verdict,
+    agentResult,
+    rubric: resolvedRubric,
+    trajectoryDir: recorder.directory,
+  };
+}
+
+/**
+ * Decide bench task success from a Verdict using the --success flag's
+ * semantics (mirrors fara's CLI knob, plan §03).
+ *
+ * `outcome` (default) — strict binary outcome. Matches fara-7b's reported
+ *   metric.
+ * `process` — rubric process score ≥ threshold (default 0.8).
+ * `both`    — both conditions must hold.
+ */
+export function verdictToSuccess(
+  verdict: Verdict,
+  mode: "outcome" | "process" | "both" = "outcome",
+  processThreshold = 0.8,
+): boolean {
+  const outcomeOk = verdict.outcomeSuccess;
+  const processOk = verdict.processScore >= processThreshold;
+  switch (mode) {
+    case "outcome":
+      return outcomeOk;
+    case "process":
+      return processOk;
+    case "both":
+      return outcomeOk && processOk;
+  }
+}
diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts
new file mode 100644
index 000000000..4e1848e84
--- /dev/null
+++ b/packages/evals/scripts/verify-webtailbench-task.ts
@@ -0,0 +1,200 @@
+/**
+ * End-to-end Wave 1 verification on a real WebTailBench task.
+ *
+ * Loads one row from packages/evals/datasets/webtailbench/WebTailBench_data.jsonl
+ * (which carries upstream precomputed_rubric), runs the agent on Browserbase
+ * via runWithVerifier, and asserts:
+ *   1. Recorder captures a non-trivial trajectory.
+ *   2. Verifier uses the upstream rubric (rubricSource = "precomputed").
+ *   3. Step 6 rescoring produces per-criterion scores (no evidence_insufficient).
+ *   4. Step 8 outcome returns a boolean verdict with reasoning.
+ *
+ *   pnpm tsx packages/evals/scripts/verify-webtailbench-task.ts [task_id]
+ *
+ * Defaults to united_13. Requires BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID
+ * and a GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env.
+ */
+import "dotenv/config";
+import assert from "node:assert/strict";
+import fs from "node:fs/promises";
+import path from "node:path";
+
+import { V3 } from "@browserbasehq/stagehand";
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+import { runWithVerifier } from "../framework/verifierAdapter.js";
+
+interface WebTailBenchRow {
+  id: string;
+  category?: string;
+  ques: string;
+  web?: string;
+  precomputed_rubric?: Rubric;
+}
+
+const DEFAULT_TASK_ID = "united_13";
+const JSONL = path.resolve(
+  import.meta.dirname,
+  "..",
+  "datasets",
+  "webtailbench",
+  "WebTailBench_data.jsonl",
+);
+
+async function loadRow(taskId: string): Promise<WebTailBenchRow> {
+  const raw = await fs.readFile(JSONL, "utf8");
+  for (const line of raw.split(/\r?\n/)) {
+    if (!line.trim()) continue;
+    const row = JSON.parse(line) as WebTailBenchRow;
+    if (row.id === taskId) return row;
+  }
+  throw new Error(`task id ${taskId} not found in ${JSONL}`);
+}
+
+async function main(): Promise<void> {
+  const taskId = process.argv[2] ?? DEFAULT_TASK_ID;
+  const mode = (process.env.AGENT_MODE ?? "hybrid") as "dom" | "hybrid" | "cua";
+  const model =
+    process.env.AGENT_MODEL ??
+    (mode === "cua" ? "anthropic/claude-haiku-4-5" : "google/gemini-2.5-flash");
+  console.log(`▸ loading WebTailBench task: ${taskId}`);
+  console.log(`  mode=${mode} model=${model}`);
+  const row = await loadRow(taskId);
+  console.log(`  ✓ ${row.ques.slice(0, 100)}`);
+  console.log(
+    `  ✓ rubric: ${row.precomputed_rubric ? `${row.precomputed_rubric.items.length} criteria` : "MISSING"}`,
+  );
+  assert.ok(
+    row.precomputed_rubric && row.precomputed_rubric.items.length > 0,
+    "task should carry a precomputed rubric (run backfill-webtailbench-rubrics.ts first)",
+  );
+
+  // Most WebTailBench sites block local browser traffic; ideally this runs on
+  // BROWSERBASE. Defaults to LOCAL when Browserbase creds aren't configured —
+  // the verifier still exercises end-to-end on whatever trajectory we capture,
+  // even if the agent fails fast against anti-bot.
+  const useBrowserbase =
+    process.env.BROWSERBASE_API_KEY && process.env.BROWSERBASE_PROJECT_ID;
+  const env = useBrowserbase ? "BROWSERBASE" : "LOCAL";
+  console.log(`▸ initializing V3 on ${env}`);
+  const v3 = new V3({
+    env,
+    verbose: 1,
+    model,
+    // Keep the agent loop local even on env=BROWSERBASE — without this V3
+    // would auto-create an apiClient and dispatch agent.execute() to the
+    // remote server-side loop, which doesn't emit on our local bus. The
+    // evals framework does this same opt-out in packages/evals/initV3.ts:121
+    // via process.env.USE_API. disableAPI is the targeted flag; we used
+    // experimental: true previously as a heavier-handed equivalent.
+ disableAPI: true, + }); + await v3.init(); + + const page = v3.context.pages()[0]; + const startUrl = row.web || "https://www.google.com"; + await page.goto(startUrl, { timeoutMs: 120_000 }); + console.log(` ✓ navigated to ${startUrl}`); + + const agent = v3.agent({ + mode, + model, + }); + + const taskSpec: TaskSpec = { + id: row.id, + instruction: row.ques, + initUrl: startUrl, + precomputedRubric: row.precomputed_rubric, + }; + + console.log("▸ running agent + verifier pipeline"); + const startMs = Date.now(); + const result = await runWithVerifier({ + v3, + agent, + taskSpec, + dataset: "webtailbench", + agentOptions: { maxSteps: 30 }, + }); + console.log( + ` ✓ completed in ${((Date.now() - startMs) / 1000).toFixed(1)}s`, + ); + + // Diagnostic: show what the agent did internally vs what reached the bus. + console.log(` agent.actions: ${result.agentResult.actions.length}`); + console.log(` agent.completed: ${result.agentResult.completed}`); + console.log( + ` agent.usage: ${JSON.stringify(result.agentResult.usage ?? {})}`, + ); + if (result.agentResult.actions.length > 0) { + console.log(" first 5 internal actions:"); + for (const a of result.agentResult.actions.slice(0, 5)) { + console.log(` - ${a.type ?? "?"} ${(a.action ?? "").slice(0, 80)}`); + } + } + + await v3.close(); + + // ── Assertions ────────────────────────────────────────────────────────── + const { trajectory, verdict, rubric, trajectoryDir } = result; + console.log(`\n▸ trajectory: ${trajectory.steps.length} steps`); + console.log(` directory: ${trajectoryDir}`); + console.log(`\n▸ verdict:`); + console.log( + ` outcomeSuccess=${verdict.outcomeSuccess} processScore=${verdict.processScore.toFixed(3)}`, + ); + console.log( + ` per-criterion (${verdict.perCriterion.length}/${rubric.items.length}):`, + ); + for (const c of verdict.perCriterion) { + const earned = c.earnedPoints === null ? "—" : c.earnedPoints.toFixed(1); + const flag = c.evidenceInsufficient ? " [evidence_insufficient]" : ""; + console.log(` - ${earned}/${c.maxPoints} ${c.criterion}${flag}`); + if (c.justification) { + console.log(` ${c.justification.slice(0, 200)}`); + } + } + const raw = verdict.rawSteps as + | { primaryIntent?: string; reasoning?: string; rubricSource?: string } + | undefined; + console.log(`\n▸ rubric source: ${raw?.rubricSource}`); + console.log(`▸ primary intent: ${raw?.primaryIntent}`); + + if (verdict.findings && verdict.findings.length > 0) { + console.log(`\n▸ findings (${verdict.findings.length}):`); + for (const f of verdict.findings) { + const steps = f.relatedSteps?.length + ? 
` steps=[${f.relatedSteps.join(",")}]` + : ""; + console.log(` [${f.severity}] ${f.category}${steps}`); + console.log(` ${f.description}`); + if (f.suggestedAction) { + console.log(` → ${f.suggestedAction}`); + } + } + } else { + console.log(`\n▸ findings: (none)`); + } + + assert.equal( + raw?.rubricSource, + "precomputed", + "expected verifier to use the upstream precomputed rubric", + ); + assert.equal(verdict.perCriterion.length, rubric.items.length); + const fullySufficient = verdict.perCriterion.every( + (c) => !c.evidenceInsufficient, + ); + assert.ok( + fullySufficient, + "expected Step 6 to score every criterion (no evidence_insufficient flags)", + ); + assert.equal(typeof verdict.outcomeSuccess, "boolean"); + + console.log(`\n✅ Wave 1 WebTailBench verification OK`); +} + +main().catch((err) => { + console.error("\n❌ Wave 1 WebTailBench verification FAILED:", err); + process.exit(1); +}); diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts index 4bd2015fa..eb20029e4 100644 --- a/packages/evals/suites/webtailbench.ts +++ b/packages/evals/suites/webtailbench.ts @@ -1,5 +1,5 @@ import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js"; -import type { AvailableModel } from "@browserbasehq/stagehand"; +import type { AvailableModel, Rubric } from "@browserbasehq/stagehand"; import { tasksConfig } from "../taskConfig.js"; import { getPackageRootDir } from "../runtimePaths.js"; import { @@ -32,6 +32,13 @@ export const buildWebTailBenchTestcases = ( ques: string; category?: string; web?: string; + /** + * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv + * via packages/evals/scripts/backfill-webtailbench-rubrics.ts. + * When present, the verifier skips Step 0a generation and uses these + * upstream criteria directly. + */ + precomputed_rubric?: Rubric; [key: string]: unknown; }; @@ -42,7 +49,24 @@ export const buildWebTailBenchTestcases = ( } const candidates = parseJsonlRows(lines, isWebTailBenchRow); - const rows = applySampling(candidates, sampleCount, maxCases); + + // EVAL_WEBTAILBENCH_IDS — comma-separated task IDs. When set, restricts the + // suite to exactly those IDs (in the order given) and ignores sampling / + // limit knobs. Used by verifier-A/B experiments to pin a deterministic slice. + const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS + ? 
process.env.EVAL_WEBTAILBENCH_IDS.split(",")
+        .map((s) => s.trim())
+        .filter(Boolean)
+    : null;
+  let rows: WebTailBenchRow[];
+  if (explicitIds && explicitIds.length > 0) {
+    const byId = new Map(candidates.map((r) => [r.id, r]));
+    rows = explicitIds
+      .map((id) => byId.get(id))
+      .filter((r): r is WebTailBenchRow => Boolean(r));
+  } else {
+    rows = applySampling(candidates, sampleCount, maxCases);
+  }
 
   const allTestcases: Testcase[] = [];
   for (const modelEntry of normalizeAgentModelEntries(models)) {
@@ -57,6 +81,7 @@
           category: row.category,
           ques: row.ques,
           web: row.web,
+          precomputed_rubric: row.precomputed_rubric,
         },
       };
       const taskCategories =
diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts
index 7f60e5775..a5b3433cf 100644
--- a/packages/evals/tasks/bench/agent/webtailbench.ts
+++ b/packages/evals/tasks/bench/agent/webtailbench.ts
@@ -1,19 +1,37 @@
+import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+
 import { defineBenchTask } from "../../../framework/defineTask.js";
-import { V3Evaluator } from "@browserbasehq/stagehand";
-import { ScreenshotCollector } from "../../../utils/ScreenshotCollector.js";
-import { imageResize } from "../../../utils/imageResize.js";
+import {
+  runWithVerifier,
+  verdictToSuccess,
+} from "../../../framework/verifierAdapter.js";
 
+/**
+ * WebTailBench bench task.
+ *
+ * Wave 1 MVP: runs the agent through the new TrajectoryRecorder +
+ * V3Evaluator.verify() pipeline (process + outcome scoring grounded in the
+ * paper's MMRubricAgent). The previous polling-based ScreenshotCollector +
+ * V3Evaluator.ask() flow is gone.
+ *
+ * The local WebTailBench JSONL doesn't carry precomputed_rubric (the
+ * upstream HF dataset does — Wave 2 dataset swap pending). Until then the
+ * verifier generates a rubric via Step 0a on first encounter per task id
+ * and caches under packages/evals/.rubric-cache/webtailbench/.
+ *
+ * --success knob: defaults to "outcome" (matches fara-7b's reported metric).
+ * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both.
+ */
 export default defineBenchTask(
   { name: "agent/webtailbench" },
   async ({ v3, logger, debugUrl, sessionUrl, modelName, input }) => {
-    let screenshotCollector: ScreenshotCollector | null = null;
-
     try {
       const params = ((input && input.params) || {}) as {
         id?: string;
         category?: string;
         ques?: string;
        web?: string;
+        precomputed_rubric?: Rubric;
      };
 
      if (!params.ques) {
@@ -27,11 +45,8 @@
      }
 
      const page = v3.context.pages()[0];
-      // web field is always empty in WebTailBench; start from Google
      const startUrl = params.web || "https://www.google.com";
-      await page.goto(startUrl, {
-        timeoutMs: 120_000,
-      });
+      await page.goto(startUrl, { timeoutMs: 120_000 });
 
      const systemPrompt = `You are a helpful assistant that must solve the task by browsing. At the end, produce a single line: "Final Answer: <answer>" summarizing the requested result (e.g., score, list, or text). Current page: ${await page.title()}. You will need to navigate to the appropriate website to complete the task.`;
      const agentMode = input.agentMode ?? (input.isCUA ? "cua" : "hybrid");
@@ -41,70 +56,62 @@
        systemPrompt,
      });
 
-      screenshotCollector = new ScreenshotCollector(v3, {
-        interval: 3000,
-        maxScreenshots: 8,
-      });
-      screenshotCollector.start();
-
-      const agentResult = await agent.execute({
+      const taskSpec: TaskSpec = {
+        id: params.id ??
`webtailbench/${input.name}`, instruction: params.ques, - maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50, - }); + initUrl: startUrl, + precomputedRubric: params.precomputed_rubric, + }; - // Stop collecting and get all screenshots - let screenshots = await screenshotCollector.stop(); + const { verdict, trajectory, trajectoryDir, rubric } = + await runWithVerifier({ + v3, + agent, + taskSpec, + dataset: "webtailbench", + agentOptions: { + maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 50, + }, + }); - // Resize screenshots if we have any - if (screenshots.length > 0) { - screenshots = await Promise.all( - screenshots.map(async (screenshot) => { - return await imageResize(screenshot, 0.7); - }), - ); - } + const successMode = + (process.env.EVAL_SUCCESS_MODE as "outcome" | "process" | "both") || + "outcome"; logger.log({ category: "evaluation", - message: `Collected ${screenshots.length} screenshots for evaluation`, + message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`, level: 1, }); - const evaluator = new V3Evaluator(v3); - const evalResult = await evaluator.ask({ - question: `Did the agent successfully complete this task: "${params.ques}"? Note that the agent does not have purchasing/booking capabilities; mark as pass if the agent has successfully performed all necessary steps for the task up to the point of purchasing/booking/entering payment/user information`, - screenshot: screenshots, - agentReasoning: - agentResult.message || - "no reasoning available, agent potentially hit step limit", - }); - - // Clear screenshot buffers to free memory - screenshots.length = 0; - return { - _success: evalResult.evaluation === "YES", - reasoning: evalResult.reasoning, + _success: verdictToSuccess(verdict, successMode), + outcomeSuccess: verdict.outcomeSuccess, + processScore: verdict.processScore, + evidenceInsufficient: verdict.evidenceInsufficient, + criterionCount: rubric.items.length, + stepCount: trajectory.steps.length, + trajectoryDir, + primaryIntent: + (verdict.rawSteps as { primaryIntent?: string } | undefined) + ?.primaryIntent ?? undefined, + reasoning: + (verdict.rawSteps as { reasoning?: string } | undefined)?.reasoning ?? 
+        undefined,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
   } catch (error) {
+    const trajectoryDir = (error as { trajectoryDir?: string }).trajectoryDir;
     return {
       _success: false,
       error,
+      trajectoryDir,
       debugUrl,
       sessionUrl,
       logs: logger.getLogs(),
     };
-  } finally {
-    if (screenshotCollector) {
-      try {
-        await screenshotCollector.stop();
-      } catch {
-        // Ignore errors during cleanup
-      }
-    }
   }
 },
);

From 986624ecec76b70dfe12f39c12c3d7bdf69ca5ae Mon Sep 17 00:00:00 2001
From: miguel
Date: Fri, 15 May 2026 14:18:34 -0700
Subject: [PATCH 2/5] fix(evals): normalize verifier rubric inputs

---
 packages/evals/framework/verifierAdapter.ts        |  3 ++-
 .../scripts/backfill-webtailbench-rubrics.ts       |  8 ++++----
 .../evals/scripts/verify-webtailbench-task.ts      | 12 +++++-------
 packages/evals/suites/webtailbench.ts              | 10 +++++++---
 .../evals/tasks/bench/agent/webtailbench.ts        | 19 ++++++++++---------
 5 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
index 5351b28b3..971205f6b 100644
--- a/packages/evals/framework/verifierAdapter.ts
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -25,6 +25,7 @@
  */
 import {
   V3Evaluator,
+  normalizeRubric,
   type AgentInstance,
   type AgentExecuteOptions,
   type AgentResult,
@@ -75,7 +76,7 @@
   // ── Resolve rubric ──────────────────────────────────────────────────────
   let resolvedRubric: Rubric;
   if (taskSpec.precomputedRubric) {
-    resolvedRubric = taskSpec.precomputedRubric;
+    resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
   } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
     resolvedRubric = await evaluator.generateRubric(taskSpec);
   } else {
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index 5a6763390..965722fa4 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@
   "WebTailBench_data.jsonl",
 );
 
-interface Rubric {
+interface SerializedRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: Rubric;
+  precomputed_rubric?: SerializedRubric;
 }
 
 /**
@@ -114,12 +114,12 @@
     );
   }
 
-  const rubricsById = new Map<string, Rubric>();
+  const rubricsById = new Map<string, SerializedRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as SerializedRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts
index 4e1848e84..84666a74b 100644
--- a/packages/evals/scripts/verify-webtailbench-task.ts
+++ b/packages/evals/scripts/verify-webtailbench-task.ts
@@ -19,8 +19,8 @@
 import assert from "node:assert/strict";
 import fs from "node:fs/promises";
 import path from "node:path";
 
-import { V3 } from "@browserbasehq/stagehand";
-import type { Rubric, TaskSpec } from "@browserbasehq/stagehand";
+import { V3, normalizeRubric } from "@browserbasehq/stagehand";
+import type { SerializedRubric, TaskSpec } from "@browserbasehq/stagehand";
 import { runWithVerifier } from "../framework/verifierAdapter.js";
 
 interface WebTailBenchRow {
   id: string;
   category?:
string; ques: string; web?: string; - precomputed_rubric?: Rubric; + precomputed_rubric?: SerializedRubric; } const DEFAULT_TASK_ID = "united_13"; @@ -104,7 +104,7 @@ async function main(): Promise { id: row.id, instruction: row.ques, initUrl: startUrl, - precomputedRubric: row.precomputed_rubric, + precomputedRubric: normalizeRubric(row.precomputed_rubric), }; console.log("▸ running agent + verifier pipeline"); @@ -154,9 +154,7 @@ async function main(): Promise { console.log(` ${c.justification.slice(0, 200)}`); } } - const raw = verdict.rawSteps as - | { primaryIntent?: string; reasoning?: string; rubricSource?: string } - | undefined; + const raw = verdict.rawSteps; console.log(`\n▸ rubric source: ${raw?.rubricSource}`); console.log(`▸ primary intent: ${raw?.primaryIntent}`); diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts index eb20029e4..bf8c5c919 100644 --- a/packages/evals/suites/webtailbench.ts +++ b/packages/evals/suites/webtailbench.ts @@ -1,5 +1,9 @@ import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js"; -import type { AvailableModel, Rubric } from "@browserbasehq/stagehand"; +import { + normalizeRubric, + type AvailableModel, + type SerializedRubric, +} from "@browserbasehq/stagehand"; import { tasksConfig } from "../taskConfig.js"; import { getPackageRootDir } from "../runtimePaths.js"; import { @@ -38,7 +42,7 @@ export const buildWebTailBenchTestcases = ( * When present, the verifier skips Step 0a generation and uses these * upstream criteria directly. */ - precomputed_rubric?: Rubric; + precomputed_rubric?: SerializedRubric; [key: string]: unknown; }; @@ -81,7 +85,7 @@ export const buildWebTailBenchTestcases = ( category: row.category, ques: row.ques, web: row.web, - precomputed_rubric: row.precomputed_rubric, + precomputed_rubric: normalizeRubric(row.precomputed_rubric), }, }; const taskCategories = diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts index a5b3433cf..33224713f 100644 --- a/packages/evals/tasks/bench/agent/webtailbench.ts +++ b/packages/evals/tasks/bench/agent/webtailbench.ts @@ -1,4 +1,9 @@ -import type { Rubric, TaskSpec } from "@browserbasehq/stagehand"; +import { + normalizeRubric, + type Rubric, + type SerializedRubric, + type TaskSpec, +} from "@browserbasehq/stagehand"; import { defineBenchTask } from "../../../framework/defineTask.js"; import { @@ -31,7 +36,7 @@ export default defineBenchTask( category?: string; ques?: string; web?: string; - precomputed_rubric?: Rubric; + precomputed_rubric?: Rubric | SerializedRubric; }; if (!params.ques) { @@ -60,7 +65,7 @@ export default defineBenchTask( id: params.id ?? `webtailbench/${input.name}`, instruction: params.ques, initUrl: startUrl, - precomputedRubric: params.precomputed_rubric, + precomputedRubric: normalizeRubric(params.precomputed_rubric), }; const { verdict, trajectory, trajectoryDir, rubric } = @@ -92,12 +97,8 @@ export default defineBenchTask( criterionCount: rubric.items.length, stepCount: trajectory.steps.length, trajectoryDir, - primaryIntent: - (verdict.rawSteps as { primaryIntent?: string } | undefined) - ?.primaryIntent ?? undefined, - reasoning: - (verdict.rawSteps as { reasoning?: string } | undefined)?.reasoning ?? 
- undefined, + primaryIntent: verdict.rawSteps?.primaryIntent, + reasoning: verdict.rawSteps?.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), From 92afba0bdc110a2a3ec5b8a6a87732e429d0fd7b Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:40:59 -0700 Subject: [PATCH 3/5] fix(evals): validate verifier success mode --- packages/evals/framework/verifierAdapter.ts | 25 ++++++++++++--- .../evals/tasks/bench/agent/webtailbench.ts | 10 +++--- .../tests/framework/verifierAdapter.test.ts | 32 +++++++++++++++++++ 3 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 packages/evals/tests/framework/verifierAdapter.test.ts diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts index 971205f6b..5ecfc7da9 100644 --- a/packages/evals/framework/verifierAdapter.ts +++ b/packages/evals/framework/verifierAdapter.ts @@ -136,21 +136,36 @@ export async function runWithVerifier( /** * Decide bench task success from a Verdict using the --success flag's - * semantics (mirrors fara's CLI knob, plan §03). + * semantics. * - * `outcome` (default) — strict binary outcome. Matches fara-7b's reported - * metric. + * `outcome` (default) — strict binary outcome. * `process` — rubric process score ≥ threshold (default 0.8). * `both` — both conditions must hold. */ +export type EvalSuccessMode = "outcome" | "process" | "both"; + +export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode { + if (typeof mode !== "string") return "outcome"; + const normalized = mode.trim().toLowerCase(); + if ( + normalized === "outcome" || + normalized === "process" || + normalized === "both" + ) { + return normalized; + } + return "outcome"; +} + export function verdictToSuccess( verdict: Verdict, - mode: "outcome" | "process" | "both" = "outcome", + mode: unknown = "outcome", processThreshold = 0.8, ): boolean { + const resolvedMode = resolveEvalSuccessMode(mode); const outcomeOk = verdict.outcomeSuccess; const processOk = verdict.processScore >= processThreshold; - switch (mode) { + switch (resolvedMode) { case "outcome": return outcomeOk; case "process": diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts index 33224713f..9081171be 100644 --- a/packages/evals/tasks/bench/agent/webtailbench.ts +++ b/packages/evals/tasks/bench/agent/webtailbench.ts @@ -15,8 +15,8 @@ import { * WebTailBench bench task. * * Wave 1 MVP: runs the agent through the new TrajectoryRecorder + - * V3Evaluator.verify() pipeline (process + outcome scoring grounded in the - * paper's MMRubricAgent). The previous polling-based ScreenshotCollector + + * V3Evaluator.verify() pipeline (process + outcome scoring grounded in saved + * trajectory evidence). The previous polling-based ScreenshotCollector + * V3Evaluator.ask() flow is gone. * * The local WebTailBench JSONL doesn't carry precomputed_rubric (the @@ -24,7 +24,7 @@ import { * verifier generates a rubric via Step 0a on first encounter per task id * and caches under packages/evals/.rubric-cache/webtailbench/. * - * --success knob: defaults to "outcome" (matches fara-7b's reported metric). + * --success knob: defaults to "outcome". * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both. 
*/ export default defineBenchTask( @@ -79,9 +79,7 @@ export default defineBenchTask( }, }); - const successMode = - (process.env.EVAL_SUCCESS_MODE as "outcome" | "process" | "both") || - "outcome"; + const successMode = process.env.EVAL_SUCCESS_MODE; logger.log({ category: "evaluation", diff --git a/packages/evals/tests/framework/verifierAdapter.test.ts b/packages/evals/tests/framework/verifierAdapter.test.ts new file mode 100644 index 000000000..6f446d1ff --- /dev/null +++ b/packages/evals/tests/framework/verifierAdapter.test.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from "vitest"; +import type { Verdict } from "@browserbasehq/stagehand"; + +import { + resolveEvalSuccessMode, + verdictToSuccess, +} from "../../framework/verifierAdapter.js"; + +const baseVerdict: Verdict = { + outcomeSuccess: true, + processScore: 0.5, + perCriterion: [], + taskValidity: { isAmbiguous: false, isInvalid: false }, + evidenceInsufficient: [], +}; + +describe("resolveEvalSuccessMode", () => { + it("defaults invalid env/config values to outcome", () => { + expect(resolveEvalSuccessMode(undefined)).toBe("outcome"); + expect(resolveEvalSuccessMode("bad-value")).toBe("outcome"); + expect(resolveEvalSuccessMode(" PROCESS ")).toBe("process"); + }); +}); + +describe("verdictToSuccess", () => { + it("uses validated success modes", () => { + expect(verdictToSuccess(baseVerdict, "outcome")).toBe(true); + expect(verdictToSuccess(baseVerdict, "process")).toBe(false); + expect(verdictToSuccess(baseVerdict, "both")).toBe(false); + expect(verdictToSuccess(baseVerdict, "invalid")).toBe(true); + }); +}); From 7652e3226573a7ae533925a634e1009e8db37322 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:20:50 -0700 Subject: [PATCH 4/5] docs(evals): remove rollout comments from verifier adapter --- packages/evals/framework/verifierAdapter.ts | 2 +- packages/evals/scripts/verify-webtailbench-task.ts | 8 ++++---- packages/evals/tasks/bench/agent/webtailbench.ts | 13 +++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts index 5ecfc7da9..2e6c8a8a2 100644 --- a/packages/evals/framework/verifierAdapter.ts +++ b/packages/evals/framework/verifierAdapter.ts @@ -18,7 +18,7 @@ * 3. Runs V3Evaluator.verify() on the recorded Trajectory. * 4. Returns { trajectory, verdict, agentResult }. * - * Persistence and rubric caching are gated by env vars (plan §10 Q2 + Q3): + * Persistence and rubric caching are gated by env vars: * VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default. * VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces * a fresh Step 0a call every time). diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts index 84666a74b..c670edf4c 100644 --- a/packages/evals/scripts/verify-webtailbench-task.ts +++ b/packages/evals/scripts/verify-webtailbench-task.ts @@ -1,8 +1,8 @@ /** - * End-to-end Wave 1 verification on a real WebTailBench task. + * End-to-end verification on a real WebTailBench task. * * Loads one row from packages/evals/datasets/webtailbench/WebTailBench_data.jsonl - * (which carries upstream precomputed_rubric), runs the agent on Browserbase + * (which carries `precomputed_rubric`), runs the agent on Browserbase * via runWithVerifier, and asserts: * 1. Recorder captures a non-trivial trajectory. * 2. Verifier uses the upstream rubric (rubricSource = "precomputed"). 
@@ -189,10 +189,10 @@
   );
   assert.equal(typeof verdict.outcomeSuccess, "boolean");
 
-  console.log(`\n✅ Wave 1 WebTailBench verification OK`);
+  console.log(`\n✅ WebTailBench verification OK`);
 }
 
 main().catch((err) => {
-  console.error("\n❌ Wave 1 WebTailBench verification FAILED:", err);
+  console.error("\n❌ WebTailBench verification FAILED:", err);
   process.exit(1);
 });
diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts
index 9081171be..052bc1b26 100644
--- a/packages/evals/tasks/bench/agent/webtailbench.ts
+++ b/packages/evals/tasks/bench/agent/webtailbench.ts
@@ -14,15 +14,12 @@
 /**
  * WebTailBench bench task.
  *
- * Wave 1 MVP: runs the agent through the new TrajectoryRecorder +
- * V3Evaluator.verify() pipeline (process + outcome scoring grounded in saved
- * trajectory evidence). The previous polling-based ScreenshotCollector +
- * V3Evaluator.ask() flow is gone.
+ * Runs the agent through TrajectoryRecorder + V3Evaluator.verify() so process
+ * and outcome scoring are grounded in saved trajectory evidence.
  *
- * The local WebTailBench JSONL doesn't carry precomputed_rubric (the
- * upstream HF dataset does — Wave 2 dataset swap pending). Until then the
- * verifier generates a rubric via Step 0a on first encounter per task id
- * and caches under packages/evals/.rubric-cache/webtailbench/.
+ * If a row does not carry `precomputed_rubric`, the verifier generates a
+ * rubric on first encounter per task id and caches it under
+ * packages/evals/.rubric-cache/webtailbench/.
  *
  * --success knob: defaults to "outcome".
  * Override via the EVAL_SUCCESS_MODE env var: outcome | process | both.

From 47dc1d582dad2c17d64a26aa076824b351964107 Mon Sep 17 00:00:00 2001
From: miguel
Date: Fri, 15 May 2026 22:42:34 -0700
Subject: [PATCH 5/5] fix(evals): align verifier adapter result API

---
 packages/evals/framework/verifierAdapter.ts   |  31 +--
 .../scripts/backfill-webtailbench-rubrics.ts  |  10 +-
 .../evals/scripts/verify-webtailbench-task.ts | 198 ------------------
 packages/evals/suites/webtailbench.ts         |  16 +-
 .../evals/tasks/bench/agent/webtailbench.ts   |  31 ++-
 .../tests/framework/verifierAdapter.test.ts   |  23 +-
 6 files changed, 56 insertions(+), 253 deletions(-)
 delete mode 100644 packages/evals/scripts/verify-webtailbench-task.ts

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
index 2e6c8a8a2..8dc40bd2b 100644
--- a/packages/evals/framework/verifierAdapter.ts
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -4,7 +4,7 @@
  * Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
  * with one call:
  *
- *   const { verdict, trajectory } = await runWithVerifier({
+ *   const { evaluationResult, trajectory } = await runWithVerifier({
  *     v3,
 *     agent,
 *     taskSpec: { id, instruction, initUrl, precomputedRubric? },
@@ -12,16 +12,15 @@
 *   });
 *
 * Behavior:
- *   1. Resolves the rubric — precomputedRubric (e.g., upstream WebTailBench),
- *      or generates via Step 0a and caches under .rubric-cache/<dataset>/.
+ *   1. Resolves the rubric from the task, cache, or evaluator.
 *   2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
 *   3. Runs V3Evaluator.verify() on the recorded Trajectory.
- *   4. Returns { trajectory, verdict, agentResult }.
+ *   4. Returns { trajectory, evaluationResult, agentResult }.
 *
 * Persistence and rubric caching are gated by env vars:
 *   VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
 *   VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
- *     a fresh Step 0a call every time).
+ *     fresh rubric generation every time).
  */
 import {
   V3Evaluator,
   normalizeRubric,
   type AgentInstance,
   type AgentExecuteOptions,
   type AgentResult,
+  type EvaluationResult,
   type Rubric,
   type TaskSpec,
   type Trajectory,
   type V3,
-  type Verdict,
 } from "@browserbasehq/stagehand";
 
 import { RubricCache } from "./rubricCache.js";
@@ -58,7 +57,7 @@
 
 export interface RunWithVerifierResult {
   trajectory: Trajectory;
-  verdict: Verdict;
+  evaluationResult: EvaluationResult;
   agentResult: AgentResult;
   /** Resolved rubric (precomputed, cached, or freshly generated). */
   rubric: Rubric;
@@ -122,12 +121,12 @@
   });
 
   // ── Verify ──────────────────────────────────────────────────────────────
-  const verdict = await evaluator.verify(trajectory, hydratedTaskSpec);
-  await recorder.persistVerdict(verdict);
+  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  await recorder.persistResult(evaluationResult);
 
   return {
     trajectory,
-    verdict,
+    evaluationResult,
     agentResult,
     rubric: resolvedRubric,
     trajectoryDir: recorder.directory,
@@ -135,7 +134,7 @@
 }
 
 /**
- * Decide bench task success from a Verdict using the --success flag's
+ * Decide bench task success from an EvaluationResult using the --success flag's
  * semantics.
  *
  * `outcome` (default) — strict binary outcome.
@@ -157,14 +156,16 @@
   return "outcome";
 }
 
-export function verdictToSuccess(
-  verdict: Verdict,
+export function evaluationResultToSuccess(
+  result: EvaluationResult,
   mode: unknown = "outcome",
   processThreshold = 0.8,
 ): boolean {
   const resolvedMode = resolveEvalSuccessMode(mode);
-  const outcomeOk = verdict.outcomeSuccess;
-  const processOk = verdict.processScore >= processThreshold;
+  const outcomeOk = result.outcomeSuccess;
+  const processOk =
+    typeof result.processScore === "number" &&
+    result.processScore >= processThreshold;
   switch (resolvedMode) {
     case "outcome":
       return outcomeOk;
diff --git a/packages/evals/scripts/backfill-webtailbench-rubrics.ts b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
index 965722fa4..22fb87de1 100644
--- a/packages/evals/scripts/backfill-webtailbench-rubrics.ts
+++ b/packages/evals/scripts/backfill-webtailbench-rubrics.ts
@@ -29,7 +29,7 @@
   "WebTailBench_data.jsonl",
 );
 
-interface SerializedRubric {
+interface RawRubric {
   items: Array<Record<string, unknown>>;
 }
 
@@ -38,7 +38,7 @@
   category?: string;
   ques: string;
   web?: string;
-  precomputed_rubric?: SerializedRubric;
+  precomputed_rubric?: RawRubric;
 }
 
 /**
@@ -114,12 +114,12 @@
     );
   }
 
-  const rubricsById = new Map<string, SerializedRubric>();
+  const rubricsById = new Map<string, RawRubric>();
   for (let i = 1; i < rows.length; i++) {
     const cols = rows[i];
     if (!cols[idIdx]) continue;
     try {
-      const parsed = JSON.parse(cols[rubricIdx]) as SerializedRubric;
+      const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
       rubricsById.set(cols[idIdx], parsed);
     } catch (e) {
       console.warn(
@@ -149,7 +149,7 @@
   }
 
   console.log(
-    ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
+    ` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
   );
 
   await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
diff --git a/packages/evals/scripts/verify-webtailbench-task.ts b/packages/evals/scripts/verify-webtailbench-task.ts
deleted file mode 100644
index c670edf4c..000000000
--- a/packages/evals/scripts/verify-webtailbench-task.ts
+++ /dev/null
@@ -1,198 +0,0 @@
-/**
- * End-to-end verification on a real WebTailBench task.
- *
- * Loads one row from packages/evals/datasets/webtailbench/WebTailBench_data.jsonl
- * (which carries `precomputed_rubric`), runs the agent on Browserbase
- * via runWithVerifier, and asserts:
- *   1. Recorder captures a non-trivial trajectory.
- *   2. Verifier uses the upstream rubric (rubricSource = "precomputed").
- *   3. Step 6 rescoring produces per-criterion scores (no evidence_insufficient).
- *   4. Step 8 outcome returns a boolean verdict with reasoning.
- *
- *   pnpm tsx packages/evals/scripts/verify-webtailbench-task.ts [task_id]
- *
- * Defaults to united_13. Requires BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID
- * and a GEMINI_API_KEY / GOOGLE_GENERATIVE_AI_API_KEY in env.
- */
-import "dotenv/config";
-import assert from "node:assert/strict";
-import fs from "node:fs/promises";
-import path from "node:path";
-
-import { V3, normalizeRubric } from "@browserbasehq/stagehand";
-import type { SerializedRubric, TaskSpec } from "@browserbasehq/stagehand";
-import { runWithVerifier } from "../framework/verifierAdapter.js";
-
-interface WebTailBenchRow {
-  id: string;
-  category?: string;
-  ques: string;
-  web?: string;
-  precomputed_rubric?: SerializedRubric;
-}
-
-const DEFAULT_TASK_ID = "united_13";
-const JSONL = path.resolve(
-  import.meta.dirname,
-  "..",
-  "datasets",
-  "webtailbench",
-  "WebTailBench_data.jsonl",
-);
-
-async function loadRow(taskId: string): Promise<WebTailBenchRow> {
-  const raw = await fs.readFile(JSONL, "utf8");
-  for (const line of raw.split(/\r?\n/)) {
-    if (!line.trim()) continue;
-    const row = JSON.parse(line) as WebTailBenchRow;
-    if (row.id === taskId) return row;
-  }
-  throw new Error(`task id ${taskId} not found in ${JSONL}`);
-}
-
-async function main(): Promise<void> {
-  const taskId = process.argv[2] ?? DEFAULT_TASK_ID;
-  const mode = (process.env.AGENT_MODE ?? "hybrid") as "dom" | "hybrid" | "cua";
-  const model =
-    process.env.AGENT_MODEL ??
-    (mode === "cua" ? "anthropic/claude-haiku-4-5" : "google/gemini-2.5-flash");
-  console.log(`▸ loading WebTailBench task: ${taskId}`);
-  console.log(`  mode=${mode} model=${model}`);
-  const row = await loadRow(taskId);
-  console.log(`  ✓ ${row.ques.slice(0, 100)}`);
-  console.log(
-    `  ✓ rubric: ${row.precomputed_rubric ? `${row.precomputed_rubric.items.length} criteria` : "MISSING"}`,
-  );
-  assert.ok(
-    row.precomputed_rubric && row.precomputed_rubric.items.length > 0,
-    "task should carry a precomputed rubric (run backfill-webtailbench-rubrics.ts first)",
-  );
-
-  // Most WebTailBench sites block local browser traffic; ideally this runs on
-  // BROWSERBASE. Defaults to LOCAL when Browserbase creds aren't configured —
-  // the verifier still exercises end-to-end on whatever trajectory we capture,
-  // even if the agent fails fast against anti-bot.
-  const useBrowserbase =
-    process.env.BROWSERBASE_API_KEY && process.env.BROWSERBASE_PROJECT_ID;
-  const env = useBrowserbase ?
"BROWSERBASE" : "LOCAL"; - console.log(`▸ initializing V3 on ${env}`); - const v3 = new V3({ - env, - verbose: 1, - model, - // Keep the agent loop local even on env=BROWSERBASE — without this V3 - // would auto-create an apiClient and dispatch agent.execute() to the - // remote server-side loop, which doesn't emit on our local bus. The - // evals framework does this same opt-out in packages/evals/initV3.ts:121 - // via process.env.USE_API. disableAPI is the targeted flag; we used - // experimental: true previously as a heavier-handed equivalent. - disableAPI: true, - }); - await v3.init(); - - const page = v3.context.pages()[0]; - const startUrl = row.web || "https://www.google.com"; - await page.goto(startUrl, { timeoutMs: 120_000 }); - console.log(` ✓ navigated to ${startUrl}`); - - const agent = v3.agent({ - mode, - model, - }); - - const taskSpec: TaskSpec = { - id: row.id, - instruction: row.ques, - initUrl: startUrl, - precomputedRubric: normalizeRubric(row.precomputed_rubric), - }; - - console.log("▸ running agent + verifier pipeline"); - const startMs = Date.now(); - const result = await runWithVerifier({ - v3, - agent, - taskSpec, - dataset: "webtailbench", - agentOptions: { maxSteps: 30 }, - }); - console.log( - ` ✓ completed in ${((Date.now() - startMs) / 1000).toFixed(1)}s`, - ); - - // Diagnostic: show what the agent did internally vs what reached the bus. - console.log(` agent.actions: ${result.agentResult.actions.length}`); - console.log(` agent.completed: ${result.agentResult.completed}`); - console.log( - ` agent.usage: ${JSON.stringify(result.agentResult.usage ?? {})}`, - ); - if (result.agentResult.actions.length > 0) { - console.log(" first 5 internal actions:"); - for (const a of result.agentResult.actions.slice(0, 5)) { - console.log(` - ${a.type ?? "?"} ${(a.action ?? "").slice(0, 80)}`); - } - } - - await v3.close(); - - // ── Assertions ────────────────────────────────────────────────────────── - const { trajectory, verdict, rubric, trajectoryDir } = result; - console.log(`\n▸ trajectory: ${trajectory.steps.length} steps`); - console.log(` directory: ${trajectoryDir}`); - console.log(`\n▸ verdict:`); - console.log( - ` outcomeSuccess=${verdict.outcomeSuccess} processScore=${verdict.processScore.toFixed(3)}`, - ); - console.log( - ` per-criterion (${verdict.perCriterion.length}/${rubric.items.length}):`, - ); - for (const c of verdict.perCriterion) { - const earned = c.earnedPoints === null ? "—" : c.earnedPoints.toFixed(1); - const flag = c.evidenceInsufficient ? " [evidence_insufficient]" : ""; - console.log(` - ${earned}/${c.maxPoints} ${c.criterion}${flag}`); - if (c.justification) { - console.log(` ${c.justification.slice(0, 200)}`); - } - } - const raw = verdict.rawSteps; - console.log(`\n▸ rubric source: ${raw?.rubricSource}`); - console.log(`▸ primary intent: ${raw?.primaryIntent}`); - - if (verdict.findings && verdict.findings.length > 0) { - console.log(`\n▸ findings (${verdict.findings.length}):`); - for (const f of verdict.findings) { - const steps = f.relatedSteps?.length - ? 
` steps=[${f.relatedSteps.join(",")}]` - : ""; - console.log(` [${f.severity}] ${f.category}${steps}`); - console.log(` ${f.description}`); - if (f.suggestedAction) { - console.log(` → ${f.suggestedAction}`); - } - } - } else { - console.log(`\n▸ findings: (none)`); - } - - assert.equal( - raw?.rubricSource, - "precomputed", - "expected verifier to use the upstream precomputed rubric", - ); - assert.equal(verdict.perCriterion.length, rubric.items.length); - const fullySufficient = verdict.perCriterion.every( - (c) => !c.evidenceInsufficient, - ); - assert.ok( - fullySufficient, - "expected Step 6 to score every criterion (no evidence_insufficient flags)", - ); - assert.equal(typeof verdict.outcomeSuccess, "boolean"); - - console.log(`\n✅ WebTailBench verification OK`); -} - -main().catch((err) => { - console.error("\n❌ WebTailBench verification FAILED:", err); - process.exit(1); -}); diff --git a/packages/evals/suites/webtailbench.ts b/packages/evals/suites/webtailbench.ts index bf8c5c919..d704449c2 100644 --- a/packages/evals/suites/webtailbench.ts +++ b/packages/evals/suites/webtailbench.ts @@ -1,9 +1,5 @@ import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js"; -import { - normalizeRubric, - type AvailableModel, - type SerializedRubric, -} from "@browserbasehq/stagehand"; +import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand"; import { tasksConfig } from "../taskConfig.js"; import { getPackageRootDir } from "../runtimePaths.js"; import { @@ -39,10 +35,9 @@ export const buildWebTailBenchTestcases = ( /** * Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv * via packages/evals/scripts/backfill-webtailbench-rubrics.ts. - * When present, the verifier skips Step 0a generation and uses these - * upstream criteria directly. + * When present, the verifier uses these upstream criteria directly. */ - precomputed_rubric?: SerializedRubric; + precomputed_rubric?: unknown; [key: string]: unknown; }; @@ -54,9 +49,8 @@ export const buildWebTailBenchTestcases = ( const candidates = parseJsonlRows(lines, isWebTailBenchRow); - // EVAL_WEBTAILBENCH_IDS — comma-separated task IDs. When set, restricts the - // suite to exactly those IDs (in the order given) and ignores sampling / - // limit knobs. Used by verifier-A/B experiments to pin a deterministic slice. + // EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs, + // preserving the order given and ignoring sampling / limit knobs. const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS ? 
process.env.EVAL_WEBTAILBENCH_IDS.split(",") .map((s) => s.trim()) diff --git a/packages/evals/tasks/bench/agent/webtailbench.ts b/packages/evals/tasks/bench/agent/webtailbench.ts index 052bc1b26..e3791348b 100644 --- a/packages/evals/tasks/bench/agent/webtailbench.ts +++ b/packages/evals/tasks/bench/agent/webtailbench.ts @@ -1,14 +1,9 @@ -import { - normalizeRubric, - type Rubric, - type SerializedRubric, - type TaskSpec, -} from "@browserbasehq/stagehand"; +import { normalizeRubric, type TaskSpec } from "@browserbasehq/stagehand"; import { defineBenchTask } from "../../../framework/defineTask.js"; import { + evaluationResultToSuccess, runWithVerifier, - verdictToSuccess, } from "../../../framework/verifierAdapter.js"; /** @@ -33,7 +28,7 @@ export default defineBenchTask( category?: string; ques?: string; web?: string; - precomputed_rubric?: Rubric | SerializedRubric; + precomputed_rubric?: unknown; }; if (!params.ques) { @@ -65,7 +60,7 @@ export default defineBenchTask( precomputedRubric: normalizeRubric(params.precomputed_rubric), }; - const { verdict, trajectory, trajectoryDir, rubric } = + const { evaluationResult, trajectory, trajectoryDir, rubric } = await runWithVerifier({ v3, agent, @@ -80,20 +75,20 @@ export default defineBenchTask( logger.log({ category: "evaluation", - message: `verdict: outcome=${verdict.outcomeSuccess} process=${verdict.processScore.toFixed(2)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`, + message: `result: outcome=${evaluationResult.outcomeSuccess} process=${formatProcessScore(evaluationResult.processScore)} criteria=${rubric.items.length} steps=${trajectory.steps.length}`, level: 1, }); return { - _success: verdictToSuccess(verdict, successMode), - outcomeSuccess: verdict.outcomeSuccess, - processScore: verdict.processScore, - evidenceInsufficient: verdict.evidenceInsufficient, + _success: evaluationResultToSuccess(evaluationResult, successMode), + outcomeSuccess: evaluationResult.outcomeSuccess, + processScore: evaluationResult.processScore, + evidenceInsufficient: evaluationResult.evidenceInsufficient, criterionCount: rubric.items.length, stepCount: trajectory.steps.length, trajectoryDir, - primaryIntent: verdict.rawSteps?.primaryIntent, - reasoning: verdict.rawSteps?.reasoning, + primaryIntent: evaluationResult.rawSteps?.primaryIntent, + reasoning: evaluationResult.rawSteps?.reasoning, debugUrl, sessionUrl, logs: logger.getLogs(), @@ -111,3 +106,7 @@ export default defineBenchTask( } }, ); + +function formatProcessScore(score: number | undefined): string { + return typeof score === "number" ? 
score.toFixed(2) : "n/a"; +} diff --git a/packages/evals/tests/framework/verifierAdapter.test.ts b/packages/evals/tests/framework/verifierAdapter.test.ts index 6f446d1ff..fc21cdd3f 100644 --- a/packages/evals/tests/framework/verifierAdapter.test.ts +++ b/packages/evals/tests/framework/verifierAdapter.test.ts @@ -1,12 +1,12 @@ import { describe, expect, it } from "vitest"; -import type { Verdict } from "@browserbasehq/stagehand"; +import type { EvaluationResult } from "@browserbasehq/stagehand"; import { + evaluationResultToSuccess, resolveEvalSuccessMode, - verdictToSuccess, } from "../../framework/verifierAdapter.js"; -const baseVerdict: Verdict = { +const baseResult: EvaluationResult = { outcomeSuccess: true, processScore: 0.5, perCriterion: [], @@ -22,11 +22,18 @@ describe("resolveEvalSuccessMode", () => { }); }); -describe("verdictToSuccess", () => { +describe("evaluationResultToSuccess", () => { it("uses validated success modes", () => { - expect(verdictToSuccess(baseVerdict, "outcome")).toBe(true); - expect(verdictToSuccess(baseVerdict, "process")).toBe(false); - expect(verdictToSuccess(baseVerdict, "both")).toBe(false); - expect(verdictToSuccess(baseVerdict, "invalid")).toBe(true); + expect(evaluationResultToSuccess(baseResult, "outcome")).toBe(true); + expect(evaluationResultToSuccess(baseResult, "process")).toBe(false); + expect(evaluationResultToSuccess(baseResult, "both")).toBe(false); + expect(evaluationResultToSuccess(baseResult, "invalid")).toBe(true); + }); + + it("treats missing process score as a failed process gate", () => { + const outcomeOnly: EvaluationResult = { outcomeSuccess: true }; + expect(evaluationResultToSuccess(outcomeOnly, "outcome")).toBe(true); + expect(evaluationResultToSuccess(outcomeOnly, "process")).toBe(false); + expect(evaluationResultToSuccess(outcomeOnly, "both")).toBe(false); }); });
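
---
Usage after this series — a minimal sketch, not part of the patches themselves. It assumes what the deleted verify script assumed: a configured V3 (LOCAL here, BROWSERBASE if creds are set) and a WebTailBench-style task row; the instruction text and the relative import path are placeholders.

import { V3 } from "@browserbasehq/stagehand";
import {
  evaluationResultToSuccess,
  runWithVerifier,
} from "../framework/verifierAdapter.js"; // path as seen from packages/evals/scripts/

async function demo(): Promise<void> {
  const model = "google/gemini-2.5-flash";
  // disableAPI keeps the agent loop local so the TrajectoryRecorder can
  // observe agent events on the local bus (same opt-out as the verify script).
  const v3 = new V3({ env: "LOCAL", verbose: 1, model, disableAPI: true });
  await v3.init();
  const agent = v3.agent({ mode: "hybrid", model });

  const { evaluationResult, trajectory, trajectoryDir } = await runWithVerifier({
    v3,
    agent,
    taskSpec: {
      id: "united_13",
      instruction: "…", // placeholder: the dataset row's `ques` text goes here
      initUrl: "https://www.google.com",
      // Rows that carry precomputed_rubric would pass
      // precomputedRubric: normalizeRubric(row.precomputed_rubric) here;
      // without it, the adapter generates and caches a rubric per task id.
    },
    dataset: "webtailbench",
    agentOptions: { maxSteps: 50 },
  });

  // EVAL_SUCCESS_MODE accepts outcome | process | both; anything else
  // resolves to "outcome" via resolveEvalSuccessMode().
  const ok = evaluationResultToSuccess(evaluationResult, process.env.EVAL_SUCCESS_MODE);
  console.log(`success=${ok} steps=${trajectory.steps.length} dir=${trajectoryDir}`);
  await v3.close();
}

demo().catch((err) => {
  console.error(err);
  process.exit(1);
});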