diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md new file mode 100644 index 000000000..ea9fdb4d6 --- /dev/null +++ b/packages/evals/docs/verifier-benchmark-matrix.md @@ -0,0 +1,42 @@ +# Verifier Benchmark Matrix + +Use this matrix before changing `STAGEHAND_EVALUATOR_BACKEND` defaults. +`STAGEHAND_EVALUATOR_BACKEND` selects the public evaluator backend; `VERIFIER_*` +flags tune the verifier internals once that backend is selected. + +```bash +STAGEHAND_EVALUATOR_BACKEND=legacy +STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=outcome-only +STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=a +STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=b +``` + +Use `VERIFIER_APPROACH=outcome-only` as the verifier default for benchmarks +without curated rubrics. Use approaches `a` and `b` when evaluating the rubric +pipeline itself or datasets with trusted precomputed rubrics. + +For saved trajectories, run verifier approaches against the same agent outputs +so verifier quality is isolated from agent variance: + +```bash +TRAJECTORY_GLOB=".trajectories/*" scripts/cross-verify-parallel.sh +``` + +Optional environment: + +```bash +EVALS_ENV_FILE=~/.envs/prod-evals.env +PARALLEL=8 +VERIFIER_OPTIONAL_STEPS=folded +``` + +Report at least: + +- accuracy against manually reviewed labels +- false positives and false negatives +- invalid or ambiguous task handling +- evidence-insufficient count +- latency and model cost + +Do not flip the default backend until verifier results beat or match legacy on +the target datasets and failure analysis is reviewed. diff --git a/packages/evals/framework/braintrust.ts b/packages/evals/framework/braintrust.ts index 803f30d53..3f3967347 100644 --- a/packages/evals/framework/braintrust.ts +++ b/packages/evals/framework/braintrust.ts @@ -1,3 +1,14 @@ +/** + * Braintrust tracing helper. 
+ * + * Thin wrapper around `braintrust.traced` that lets callers carry a span into + * the work and `span.log({ output, scores, metrics, metadata })` along the + * way. Outside an active Braintrust experiment, `traced` no-ops and returns + * the callback's value unchanged, so this is safe to call from offline tools + * (e.g., `bench verify`). + */ +import type { Span, StartSpanArgs } from "braintrust"; + let braintrustPromise: Promise | undefined; export function loadBraintrust(): Promise { @@ -5,9 +16,14 @@ export function loadBraintrust(): Promise { return braintrustPromise; } +export type TracedFn = (span: Span) => Promise; + +/** Same shape as Braintrust's StartSpanArgs but `name` is required. */ +export type TracedSpanOptions = StartSpanArgs & { name: string }; + export async function tracedSpan( - fn: () => Promise, - options: { name: string }, + fn: TracedFn, + options: TracedSpanOptions, ): Promise { const { traced } = await loadBraintrust(); return traced(fn, options); diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts index 8dc40bd2b..85643e8a9 100644 --- a/packages/evals/framework/verifierAdapter.ts +++ b/packages/evals/framework/verifierAdapter.ts @@ -35,6 +35,7 @@ import { type V3, } from "@browserbasehq/stagehand"; +import { tracedSpan } from "./braintrust.js"; import { RubricCache } from "./rubricCache.js"; import { TrajectoryRecorder } from "./trajectoryRecorder.js"; @@ -73,15 +74,58 @@ export async function runWithVerifier( const evaluator = new V3Evaluator(v3, { backend: "verifier" }); // ── Resolve rubric ────────────────────────────────────────────────────── - let resolvedRubric: Rubric; - if (taskSpec.precomputedRubric) { - resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!; - } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { - resolvedRubric = await evaluator.generateRubric(taskSpec); - } else { - const cache = new RubricCache({ dataset }); - resolvedRubric = await 
cache.getOrGenerate(taskSpec, evaluator); - } + const { rubric: resolvedRubric } = await tracedSpan( + async (span) => { + let rubric: Rubric; + let source: "precomputed" | "cached" | "generated"; + + if (taskSpec.precomputedRubric) { + rubric = normalizeRubric(taskSpec.precomputedRubric)!; + source = "precomputed"; + } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { + rubric = await evaluator.generateRubric(taskSpec); + source = "generated"; + } else { + const cache = new RubricCache({ dataset }); + const cached = await cache.read(taskSpec); + if (cached) { + rubric = cached; + source = "cached"; + } else { + rubric = await evaluator.generateRubric(taskSpec); + await cache.write(taskSpec, rubric); + source = "generated"; + } + } + + span.log({ + output: { + source, + rubric, + }, + metadata: { + taskId: taskSpec.id, + dataset, + source, + criterionCount: rubric.items.length, + }, + }); + + return { rubric, source }; + }, + { + name: "verifier.rubric", + type: "eval", + event: { + input: { + taskId: taskSpec.id, + dataset, + hasPrecomputedRubric: Boolean(taskSpec.precomputedRubric), + cacheDisabled: process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1", + }, + }, + }, + ); // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate. 
const hydratedTaskSpec: TaskSpec = { @@ -101,10 +145,20 @@ export async function runWithVerifier( let agentResult: AgentResult; let recorderStatus: "complete" | "aborted" | "error" = "complete"; try { - agentResult = await agent.execute({ - ...agentOptions, - instruction: taskSpec.instruction, - }); + agentResult = await tracedSpan( + async (span) => { + const result = await agent.execute({ + ...agentOptions, + instruction: taskSpec.instruction, + }); + span.log({ + output: { message: result.message?.slice(0, 500) }, + metrics: usageMetrics(result.usage), + }); + return result; + }, + { name: "agent.execute", type: "task" }, + ); } catch (e) { recorderStatus = "error"; const trajectory = await recorder.finish({ status: recorderStatus }); @@ -121,7 +175,37 @@ export async function runWithVerifier( }); // ── Verify ────────────────────────────────────────────────────────────── - const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec); + const evaluationResult = await tracedSpan( + async (span) => { + const v = await evaluator.verify(trajectory, hydratedTaskSpec); + const rawSteps = asRecord(v.rawSteps); + span.log({ + output: v, + scores: { + outcome: v.outcomeSuccess ? 1 : 0, + process: v.processScore, + }, + metadata: { + taskId: taskSpec.id, + dataset, + stepCount: trajectory.steps.length, + criterionCount: v.perCriterion?.length ?? 0, + findingCount: v.findings?.length ?? 0, + evidenceInsufficientCount: v.evidenceInsufficient?.length ?? 
0, + firstFailStep: v.firstPointOfFailure?.stepIndex, + firstFailCode: v.firstPointOfFailure?.errorCode, + isAmbiguous: v.taskValidity?.isAmbiguous, + isInvalid: v.taskValidity?.isInvalid, + ambiguityReason: v.taskValidity?.ambiguityReason, + invalidReason: v.taskValidity?.invalidReason, + primaryIntent: rawSteps?.primaryIntent, + reasoning: rawSteps?.reasoning, + }, + }); + return v; + }, + { name: "verifier.verify", type: "eval" }, + ); await recorder.persistResult(evaluationResult); return { @@ -133,6 +217,23 @@ export async function runWithVerifier( }; } +function asRecord(value: unknown): Record | undefined { + return value && typeof value === "object" + ? (value as Record) + : undefined; +} + +function usageMetrics( + usage: AgentResult["usage"] | undefined, +): Record { + if (!usage) return {}; + return Object.fromEntries( + Object.entries(usage).filter( + (e): e is [string, number] => typeof e[1] === "number", + ), + ); +} + /** * Decide bench task success from an EvaluationResult using the --success flag's * semantics. diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh new file mode 100755 index 000000000..b992b3649 --- /dev/null +++ b/scripts/cross-verify-parallel.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Parallel cross-verify: 8 verifier processes in flight at once across +# outcome-only plus the rubric approaches. + +set -e +cd "$(dirname "$0")/.." 
+
+if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then
+  set -a
+  source "$EVALS_ENV_FILE"
+  set +a
+fi
+
+PARALLEL=${PARALLEL:-8}
+TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*}
+# Honor a caller-supplied VERIFIER_OPTIONAL_STEPS (from the environment or
+# from EVALS_ENV_FILE); default to "folded". Exported so the xargs children
+# that run run_one inherit it.
+export VERIFIER_OPTIONAL_STEPS=${VERIFIER_OPTIONAL_STEPS:-folded}
+
+# TRAJECTORY_GLOB is left unquoted on purpose so the shell expands the glob
+# before find sees it.
+DIRS=()
+while IFS= read -r d; do
+  DIRS+=("$d")
+done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort)
+
+echo "[$(date +%H:%M:%S)] Found ${#DIRS[@]} trajectory dirs; parallelism=$PARALLEL"
+if [[ ${#DIRS[@]} -eq 0 ]]; then
+  # Nothing to feed to xargs; avoid launching a job with empty arguments.
+  echo "[$(date +%H:%M:%S)] Nothing to verify."
+  exit 0
+fi
+
+# Verify one trajectory dir under one verifier approach.
+#   $1  trajectory directory
+#   $2  approach (outcome-only | a | b)
+# Skips the run when scores/result_cross-<approach>.json already exists.
+# Returns 1 when the verifier command fails so that xargs can surface it.
+run_one() {
+  local dir="$1"
+  local approach="$2"
+  local label="cross-${approach}"
+  local out_file="$dir/scores/result_${label}.json"
+  local task
+  task=$(basename "$dir")
+  if [[ -f "$out_file" ]]; then
+    echo "[$(date +%H:%M:%S)] [$approach] $task: skip (exists)"
+    return 0
+  fi
+  # Quoted below so a task name containing spaces is not an ambiguous redirect.
+  local log_file="/tmp/verify-$$-$task-$approach.log"
+  local start
+  start=$(date +%s)
+  if VERIFIER_APPROACH=$approach \
+    pnpm exec tsx packages/evals/cli.ts verify "$dir" --label "$label" > "$log_file" 2>&1; then
+    echo "[$(date +%H:%M:%S)] [$approach] $task: done in $(( $(date +%s) - start ))s"
+  else
+    echo "[$(date +%H:%M:%S)] [$approach] $task: FAILED in $(( $(date +%s) - start ))s; see $log_file"
+    return 1
+  fi
+}
+export -f run_one
+export PARALLEL
+
+# Emit NUL-delimited (dir, approach) pairs, all dirs for one approach before
+# the next, and let xargs run up to $PARALLEL of them at once. NUL delimiters
+# keep paths containing spaces or "|" intact, and -n 2 hands each child exactly
+# one pair. `|| status=$?` keeps `set -e` from aborting before the summary.
+status=0
+for approach in outcome-only b a; do
+  for d in "${DIRS[@]}"; do
+    printf '%s\0%s\0' "$d" "$approach"
+  done
+done | xargs -0 -n 2 -P "$PARALLEL" bash -c 'run_one "$1" "$2"' _ || status=$?
+
+if [[ $status -ne 0 ]]; then
+  echo "[$(date +%H:%M:%S)] Cross-verification finished with failures (xargs exit $status)." >&2
+  exit 1
+fi
+echo "[$(date +%H:%M:%S)] All cross-verifications complete."
diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh
new file mode 100755
index 000000000..186f1311c
--- /dev/null
+++ b/scripts/cross-verify.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Re-verify each stored trajectory under each verifier approach via `bench verify`.
+# Lets us isolate verifier disagreement from agent variance.
+#
+# Inputs: every trajectory dir matched by TRAJECTORY_GLOB.
+# Outputs: scores/result_cross-{outcome-only,a,b}.json next to each trajectory.
+# Exit status: 0 when every verification succeeded or was skipped, 1 otherwise.
+
+set -e
+cd "$(dirname "$0")/.."
+
+if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then
+  set -a
+  source "$EVALS_ENV_FILE"
+  set +a
+fi
+
+# Honor a caller-supplied VERIFIER_OPTIONAL_STEPS (from the environment or
+# from EVALS_ENV_FILE); default to "folded".
+export VERIFIER_OPTIONAL_STEPS=${VERIFIER_OPTIONAL_STEPS:-folded}
+
+# Collect trajectory dirs from persisted verifier runs. TRAJECTORY_GLOB is left
+# unquoted on purpose so the shell expands the glob before find sees it.
+TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*}
+DIRS=()
+while IFS= read -r d; do
+  DIRS+=("$d")
+done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort)
+
+echo "Found ${#DIRS[@]} trajectory dirs"
+failed=0
+for d in "${DIRS[@]}"; do
+  task=$(basename "$d")
+  echo "=== $(basename "$(dirname "$d")")/$task ==="
+  for approach in outcome-only b a; do
+    label="cross-${approach}"
+    out_file="$d/scores/result_${label}.json"
+    if [[ -f "$out_file" ]]; then
+      echo "  [$approach] already exists, skipping"
+      continue
+    fi
+    echo "  [$approach] verifying..."
+    log_file="/tmp/verify-$$-$task-$approach.log"
+    start=$(date +%s)
+    # Run the verifier as an `if` condition: under `set -e` a bare failing
+    # command would abort the whole sweep at the first bad trajectory, and
+    # with its output discarded there would be no hint as to why.
+    if VERIFIER_APPROACH=$approach \
+      pnpm exec tsx packages/evals/cli.ts verify "$d" --label "$label" > "$log_file" 2>&1; then
+      echo "  [$approach] done in $(( $(date +%s) - start ))s"
+    else
+      failed=$((failed + 1))
+      echo "  [$approach] FAILED in $(( $(date +%s) - start ))s; see $log_file" >&2
+    fi
+  done
+done
+
+if (( failed > 0 )); then
+  echo "Cross-verification finished with $failed failed run(s)." >&2
+  exit 1
+fi
+echo "All cross-verifications complete."