Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions packages/evals/framework/verifierAdapter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
/**
* verifierAdapter — runs a bench task through the verifier pipeline.
*
* Replaces the per-task ScreenshotCollector + V3Evaluator.ask() boilerplate
* with one call:
*
* const { evaluationResult, trajectory } = await runWithVerifier({
* v3,
* agent,
* taskSpec: { id, instruction, initUrl, precomputedRubric? },
* maxSteps: 50,
* });
*
* Behavior:
* 1. Resolves the rubric from the task, cache, or evaluator.
* 2. Wraps agent.execute() with a TrajectoryRecorder subscribed to the bus.
* 3. Runs V3Evaluator.verify() on the recorded Trajectory.
* 4. Returns { trajectory, evaluationResult, agentResult }.
*
* Persistence and rubric caching are gated by env vars:
* VERIFIER_PERSIST_TRAJECTORIES — on locally, off in CI by default.
* VERIFIER_DISABLE_RUBRIC_CACHE — set to "1" to bypass the cache (forces
* fresh rubric generation every time).
*/
import {
V3Evaluator,
normalizeRubric,
type AgentInstance,
type AgentExecuteOptions,
type AgentResult,
type EvaluationResult,
type Rubric,
type TaskSpec,
type Trajectory,
type V3,
} from "@browserbasehq/stagehand";

import { RubricCache } from "./rubricCache.js";
import { TrajectoryRecorder } from "./trajectoryRecorder.js";

export interface RunWithVerifierOptions {
  /** V3 session shared by the evaluator and the trajectory recorder. */
  v3: V3;
  /** Agent whose `execute()` run is recorded and then verified. */
  agent: AgentInstance;
  /**
   * Task to run. `instruction` drives the agent; when `precomputedRubric`
   * is set it is used directly and rubric generation/caching is skipped.
   */
  taskSpec: TaskSpec;
  /**
   * Dataset name for rubric cache partitioning. Each task lives under
   * `.rubric-cache/<dataset>/<task-id>.json`.
   */
  dataset: string;
  /** Agent execute options. `instruction` is filled from taskSpec.instruction. */
  agentOptions?: Omit<AgentExecuteOptions, "instruction">;
  /** Override the run id (defaults to ISO timestamp). */
  runId?: string;
  /** Override trajectory persistence root. */
  trajectoryRoot?: string;
}

export interface RunWithVerifierResult {
  /** Trajectory recorded around the agent run. */
  trajectory: Trajectory;
  /** Verdict produced by `V3Evaluator.verify()` on the recorded trajectory. */
  evaluationResult: EvaluationResult;
  /** Raw result returned by `agent.execute()`. */
  agentResult: AgentResult;
  /** Resolved rubric (precomputed, cached, or freshly generated). */
  rubric: Rubric;
  /** Where the trajectory was persisted (or would have been, if disabled). */
  trajectoryDir: string;
}

export async function runWithVerifier(
opts: RunWithVerifierOptions,
): Promise<RunWithVerifierResult> {
const { v3, agent, taskSpec, dataset, agentOptions, runId, trajectoryRoot } =
opts;
const evaluator = new V3Evaluator(v3, { backend: "verifier" });

// ── Resolve rubric ──────────────────────────────────────────────────────
let resolvedRubric: Rubric;
if (taskSpec.precomputedRubric) {
resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
} else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
resolvedRubric = await evaluator.generateRubric(taskSpec);
} else {
const cache = new RubricCache({ dataset });
resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
}

// Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
const hydratedTaskSpec: TaskSpec = {
...taskSpec,
precomputedRubric: resolvedRubric,
};

// ── Record trajectory around agent.execute() ───────────────────────────
const recorder = new TrajectoryRecorder({
v3,
taskSpec: hydratedTaskSpec,
runId,
outputRoot: trajectoryRoot,
});
recorder.start();

let agentResult: AgentResult;
let recorderStatus: "complete" | "aborted" | "error" = "complete";
try {
agentResult = await agent.execute({
...agentOptions,
instruction: taskSpec.instruction,
});
} catch (e) {
recorderStatus = "error";
const trajectory = await recorder.finish({ status: recorderStatus });
Copy link
Copy Markdown
Contributor

@cubic-dev-ai cubic-dev-ai Bot May 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: If recorder.finish() rejects inside the catch block, the original agent error is lost. Wrap the persistence call in its own try/catch so the original error is always rethrown.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At packages/evals/framework/verifierAdapter.ts, line 110:

<comment>If `recorder.finish()` rejects inside the catch block, the original agent error is lost. Wrap the persistence call in its own try/catch so the original error is always rethrown.</comment>

<file context>
@@ -0,0 +1,160 @@
+    });
+  } catch (e) {
+    recorderStatus = "error";
+    const trajectory = await recorder.finish({ status: recorderStatus });
+    // Re-throw after persisting so the bench task can decide how to report.
+    const wrapped = e instanceof Error ? e : new Error(String(e));
</file context>
Fix with Cubic

// Re-throw after persisting so the bench task can decide how to report.
const wrapped = e instanceof Error ? e : new Error(String(e));
Object.assign(wrapped, { trajectoryDir: recorder.directory, trajectory });
throw wrapped;
}

const trajectory = await recorder.finish({
status: recorderStatus,
finalAnswer: agentResult.message,
usage: agentResult.usage,
});

// ── Verify ──────────────────────────────────────────────────────────────
const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
await recorder.persistResult(evaluationResult);

return {
trajectory,
evaluationResult,
agentResult,
rubric: resolvedRubric,
trajectoryDir: recorder.directory,
};
}

/**
 * Decide bench task success from an EvaluationResult using the --success flag's
 * semantics.
 *
 * `outcome` (default) — strict binary outcome.
 * `process` — rubric process score ≥ threshold (default 0.8).
 * `both` — both conditions must hold.
 */
export type EvalSuccessMode = "outcome" | "process" | "both";

/**
 * Coerce an arbitrary flag value to a known success mode.
 *
 * Non-strings and unrecognized strings fall back to "outcome"; matching is
 * case-insensitive and ignores surrounding whitespace.
 */
export function resolveEvalSuccessMode(mode: unknown): EvalSuccessMode {
  if (typeof mode !== "string") return "outcome";
  const candidate = mode.trim().toLowerCase();
  const known: readonly EvalSuccessMode[] = ["outcome", "process", "both"];
  return (known as readonly string[]).includes(candidate)
    ? (candidate as EvalSuccessMode)
    : "outcome";
}

/**
 * Map an EvaluationResult to a boolean pass/fail under the given success mode.
 *
 * @param result Verifier output to judge.
 * @param mode Success-mode flag value; unrecognized values fall back to "outcome".
 * @param processThreshold Minimum processScore required for "process"/"both".
 * @returns true when the result satisfies the resolved mode's criteria.
 */
export function evaluationResultToSuccess(
  result: EvaluationResult,
  mode: unknown = "outcome",
  processThreshold = 0.8,
): boolean {
  const passedOutcome = result.outcomeSuccess;
  // processScore may be absent; a non-number never counts as a process pass.
  const passedProcess =
    typeof result.processScore === "number" &&
    result.processScore >= processThreshold;

  const resolved = resolveEvalSuccessMode(mode);
  if (resolved === "process") return passedProcess;
  if (resolved === "both") return passedOutcome && passedProcess;
  return passedOutcome;
}
10 changes: 5 additions & 5 deletions packages/evals/scripts/backfill-webtailbench-rubrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ const JSONL_PATH = path.join(
"WebTailBench_data.jsonl",
);

interface Rubric {
interface RawRubric {
items: Array<Record<string, unknown>>;
}

Expand All @@ -38,7 +38,7 @@ interface LocalRow {
category?: string;
ques: string;
web?: string;
precomputed_rubric?: Rubric;
precomputed_rubric?: RawRubric;
}

/**
Expand Down Expand Up @@ -114,12 +114,12 @@ async function main(): Promise<void> {
);
}

const rubricsById = new Map<string, Rubric>();
const rubricsById = new Map<string, RawRubric>();
for (let i = 1; i < rows.length; i++) {
const cols = rows[i];
if (!cols[idIdx]) continue;
try {
const parsed = JSON.parse(cols[rubricIdx]) as Rubric;
const parsed = JSON.parse(cols[rubricIdx]) as RawRubric;
rubricsById.set(cols[idIdx], parsed);
} catch (e) {
console.warn(
Expand Down Expand Up @@ -149,7 +149,7 @@ async function main(): Promise<void> {
}

console.log(
` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to Step 0a generation)`,
` ✓ matched ${matched}/${inLines.length} rows; ${missing} unmatched (will fall back to generated rubrics)`,
);

await fs.writeFile(JSONL_PATH, out.join("\n") + "\n", "utf8");
Expand Down
27 changes: 25 additions & 2 deletions packages/evals/suites/webtailbench.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js";
import type { AvailableModel } from "@browserbasehq/stagehand";
import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand";
import { tasksConfig } from "../taskConfig.js";
import { getPackageRootDir } from "../runtimePaths.js";
import {
Expand Down Expand Up @@ -32,6 +32,12 @@ export const buildWebTailBenchTestcases = (
ques: string;
category?: string;
web?: string;
/**
* Per-task rubric ported from microsoft/WebTailBench-v1-rubrics.tsv
* via packages/evals/scripts/backfill-webtailbench-rubrics.ts.
* When present, the verifier uses these upstream criteria directly.
*/
precomputed_rubric?: unknown;
[key: string]: unknown;
};

Expand All @@ -42,7 +48,23 @@ export const buildWebTailBenchTestcases = (
}

const candidates = parseJsonlRows(lines, isWebTailBenchRow);
const rows = applySampling(candidates, sampleCount, maxCases);

// EVAL_WEBTAILBENCH_IDS restricts the suite to exactly those task IDs,
// preserving the order given and ignoring sampling / limit knobs.
const explicitIds = process.env.EVAL_WEBTAILBENCH_IDS
? process.env.EVAL_WEBTAILBENCH_IDS.split(",")
.map((s) => s.trim())
.filter(Boolean)
: null;
let rows: WebTailBenchRow[];
if (explicitIds && explicitIds.length > 0) {
const byId = new Map(candidates.map((r) => [r.id, r]));
rows = explicitIds
.map((id) => byId.get(id))
.filter((r): r is WebTailBenchRow => Boolean(r));
} else {
rows = applySampling(candidates, sampleCount, maxCases);
}

const allTestcases: Testcase[] = [];
for (const modelEntry of normalizeAgentModelEntries(models)) {
Expand All @@ -57,6 +79,7 @@ export const buildWebTailBenchTestcases = (
category: row.category,
ques: row.ques,
web: row.web,
precomputed_rubric: normalizeRubric(row.precomputed_rubric),
},
};
const taskCategories =
Expand Down
Loading
Loading