From 0eb6685469753058e831d25ac1d5315848b94eaa Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:57:45 -0700 Subject: [PATCH 1/4] feat(evals): add verifier benchmark instrumentation --- .../evals/docs/verifier-benchmark-matrix.md | 35 +++++ packages/evals/framework/braintrust.ts | 20 ++- packages/evals/framework/verifierAdapter.ts | 129 ++++++++++++++++-- scripts/cross-verify-parallel.sh | 63 +++++++++ scripts/cross-verify.sh | 44 ++++++ 5 files changed, 275 insertions(+), 16 deletions(-) create mode 100644 packages/evals/docs/verifier-benchmark-matrix.md create mode 100755 scripts/cross-verify-parallel.sh create mode 100755 scripts/cross-verify.sh diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md new file mode 100644 index 000000000..922c1d9cd --- /dev/null +++ b/packages/evals/docs/verifier-benchmark-matrix.md @@ -0,0 +1,35 @@ +# Verifier Benchmark Matrix + +Use this matrix before changing `STAGEHAND_EVALUATOR_BACKEND` defaults. + +```bash +STAGEHAND_EVALUATOR_BACKEND=legacy +STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=a +STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=b +``` + +For saved trajectories, run both verifier approaches against the same agent +outputs so verifier quality is isolated from agent variance: + +```bash +TRAJECTORY_GLOB=".trajectories/*" scripts/cross-verify-parallel.sh +``` + +Optional environment: + +```bash +EVALS_ENV_FILE=~/.envs/prod-evals.env +PARALLEL=8 +VERIFIER_OPTIONAL_STEPS=folded +``` + +Report at least: + +- accuracy against manually reviewed labels +- false positives and false negatives +- invalid or ambiguous task handling +- evidence-insufficient count +- latency and model cost + +Do not flip the default backend until verifier results beat or match legacy on +the target datasets and failure analysis is reviewed. diff --git a/packages/evals/framework/braintrust.ts b/packages/evals/framework/braintrust.ts index 803f30d53..3f3967347 100644 --- a/packages/evals/framework/braintrust.ts +++ b/packages/evals/framework/braintrust.ts @@ -1,3 +1,14 @@ +/** + * Braintrust tracing helper. + * + * Thin wrapper around `braintrust.traced` that lets callers carry a span into + * the work and `span.log({ output, scores, metrics, metadata })` along the + * way. Outside an active Braintrust experiment, `traced` no-ops and returns + * the callback's value unchanged, so this is safe to call from offline tools + * (e.g., `bench verify`). + */ +import type { Span, StartSpanArgs } from "braintrust"; + let braintrustPromise: Promise | undefined; export function loadBraintrust(): Promise { @@ -5,9 +16,14 @@ export function loadBraintrust(): Promise { return braintrustPromise; } +export type TracedFn = (span: Span) => Promise; + +/** Same shape as Braintrust's StartSpanArgs but `name` is required. */ +export type TracedSpanOptions = StartSpanArgs & { name: string }; + export async function tracedSpan( - fn: () => Promise, - options: { name: string }, + fn: TracedFn, + options: TracedSpanOptions, ): Promise { const { traced } = await loadBraintrust(); return traced(fn, options); diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts index 8dc40bd2b..85643e8a9 100644 --- a/packages/evals/framework/verifierAdapter.ts +++ b/packages/evals/framework/verifierAdapter.ts @@ -35,6 +35,7 @@ import { type V3, } from "@browserbasehq/stagehand"; +import { tracedSpan } from "./braintrust.js"; import { RubricCache } from "./rubricCache.js"; import { TrajectoryRecorder } from "./trajectoryRecorder.js"; @@ -73,15 +74,58 @@ export async function runWithVerifier( const evaluator = new V3Evaluator(v3, { backend: "verifier" }); // ── Resolve rubric ────────────────────────────────────────────────────── - let resolvedRubric: Rubric; - if (taskSpec.precomputedRubric) { - resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!; - } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { - resolvedRubric = await evaluator.generateRubric(taskSpec); - } else { - const cache = new RubricCache({ dataset }); - resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator); - } + const { rubric: resolvedRubric } = await tracedSpan( + async (span) => { + let rubric: Rubric; + let source: "precomputed" | "cached" | "generated"; + + if (taskSpec.precomputedRubric) { + rubric = normalizeRubric(taskSpec.precomputedRubric)!; + source = "precomputed"; + } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") { + rubric = await evaluator.generateRubric(taskSpec); + source = "generated"; + } else { + const cache = new RubricCache({ dataset }); + const cached = await cache.read(taskSpec); + if (cached) { + rubric = cached; + source = "cached"; + } else { + rubric = await evaluator.generateRubric(taskSpec); + await cache.write(taskSpec, rubric); + source = "generated"; + } + } + + span.log({ + output: { + source, + rubric, + }, + metadata: { + taskId: taskSpec.id, + dataset, + source, + criterionCount: rubric.items.length, + }, + }); + + return { rubric, source }; + }, + { + name: "verifier.rubric", + type: "eval", + event: { + input: { + taskId: taskSpec.id, + dataset, + hasPrecomputedRubric: Boolean(taskSpec.precomputedRubric), + cacheDisabled: process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1", + }, + }, + }, + ); // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate. const hydratedTaskSpec: TaskSpec = { @@ -101,10 +145,20 @@ export async function runWithVerifier( let agentResult: AgentResult; let recorderStatus: "complete" | "aborted" | "error" = "complete"; try { - agentResult = await agent.execute({ - ...agentOptions, - instruction: taskSpec.instruction, - }); + agentResult = await tracedSpan( + async (span) => { + const result = await agent.execute({ + ...agentOptions, + instruction: taskSpec.instruction, + }); + span.log({ + output: { message: result.message?.slice(0, 500) }, + metrics: usageMetrics(result.usage), + }); + return result; + }, + { name: "agent.execute", type: "task" }, + ); } catch (e) { recorderStatus = "error"; const trajectory = await recorder.finish({ status: recorderStatus }); @@ -121,7 +175,37 @@ export async function runWithVerifier( }); // ── Verify ────────────────────────────────────────────────────────────── - const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec); + const evaluationResult = await tracedSpan( + async (span) => { + const v = await evaluator.verify(trajectory, hydratedTaskSpec); + const rawSteps = asRecord(v.rawSteps); + span.log({ + output: v, + scores: { + outcome: v.outcomeSuccess ? 1 : 0, + process: v.processScore, + }, + metadata: { + taskId: taskSpec.id, + dataset, + stepCount: trajectory.steps.length, + criterionCount: v.perCriterion?.length ?? 0, + findingCount: v.findings?.length ?? 0, + evidenceInsufficientCount: v.evidenceInsufficient?.length ?? 0, + firstFailStep: v.firstPointOfFailure?.stepIndex, + firstFailCode: v.firstPointOfFailure?.errorCode, + isAmbiguous: v.taskValidity?.isAmbiguous, + isInvalid: v.taskValidity?.isInvalid, + ambiguityReason: v.taskValidity?.ambiguityReason, + invalidReason: v.taskValidity?.invalidReason, + primaryIntent: rawSteps?.primaryIntent, + reasoning: rawSteps?.reasoning, + }, + }); + return v; + }, + { name: "verifier.verify", type: "eval" }, + ); await recorder.persistResult(evaluationResult); return { @@ -133,6 +217,23 @@ export async function runWithVerifier( }; } +function asRecord(value: unknown): Record | undefined { + return value && typeof value === "object" + ? (value as Record) + : undefined; +} + +function usageMetrics( + usage: AgentResult["usage"] | undefined, +): Record { + if (!usage) return {}; + return Object.fromEntries( + Object.entries(usage).filter( + (e): e is [string, number] => typeof e[1] === "number", + ), + ); +} + /** * Decide bench task success from an EvaluationResult using the --success flag's * semantics. diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh new file mode 100755 index 000000000..75a23bfdb --- /dev/null +++ b/scripts/cross-verify-parallel.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Parallel cross-verify: 8 verifier processes per approach in flight at once. +# 20 trajectories x 2 approaches = 40 verifier runs, ~2.5 batches per approach. +# Expected wall: ~5 min for B (8 in flight, 30s each, ~3 batches), ~15 min for A. +# Total ~15-20 min vs 80 min sequential. + +set -e +cd "$(dirname "$0")/.." + +if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then + set -a + source "$EVALS_ENV_FILE" + set +a +fi + +PARALLEL=${PARALLEL:-8} +TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*} + +DIRS=() +while IFS= read -r d; do + DIRS+=("$d") +done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort) + +echo "[$(date +%H:%M:%S)] Found ${#DIRS[@]} trajectory dirs; parallelism=$PARALLEL" + +run_one() { + local dir="$1" + local approach="$2" + local label="cross-${approach}" + local out_file="$dir/scores/mmrubric_${label}.json" + local task + task=$(basename "$dir") + if [[ -f "$out_file" ]]; then + echo "[$(date +%H:%M:%S)] [$approach] $task: skip (exists)" + return 0 + fi + local start + start=$(date +%s) + if VERIFIER_APPROACH=$approach VERIFIER_OPTIONAL_STEPS=folded \ + pnpm exec tsx packages/evals/cli.ts verify "$dir" --label "$label" > /tmp/verify-$$-$task-$approach.log 2>&1; then + echo "[$(date +%H:%M:%S)] [$approach] $task: done in $(( $(date +%s) - start ))s" + else + echo "[$(date +%H:%M:%S)] [$approach] $task: FAILED in $(( $(date +%s) - start ))s; see /tmp/verify-$$-$task-$approach.log" + fi +} +export -f run_one +export PARALLEL + +# Build (dir, approach) job list and feed to xargs -P. +JOBS=() +for d in "${DIRS[@]}"; do + JOBS+=("$d|b") +done +for d in "${DIRS[@]}"; do + JOBS+=("$d|a") +done + +printf '%s\n' "${JOBS[@]}" | xargs -I {} -n 1 -P "$PARALLEL" bash -c ' + IFS="|" read -r dir approach <<< "$1" + run_one "$dir" "$approach" +' _ {} + +echo "[$(date +%H:%M:%S)] All cross-verifications complete." diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh new file mode 100755 index 000000000..dfb5d8037 --- /dev/null +++ b/scripts/cross-verify.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Re-verify each stored trajectory under BOTH approaches via `bench verify`. +# Lets us isolate verifier disagreement from agent variance. +# +# Inputs: every trajectory dir matched by TRAJECTORY_GLOB. +# Outputs: scores/mmrubric_cross-{a,b}.json next to each trajectory. + +set -e +cd "$(dirname "$0")/.." + +if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then + set -a + source "$EVALS_ENV_FILE" + set +a +fi + +# Collect trajectory dirs from persisted verifier runs. +TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*} +DIRS=() +while IFS= read -r d; do + DIRS+=("$d") +done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort) + +echo "Found ${#DIRS[@]} trajectory dirs" +for d in "${DIRS[@]}"; do + task=$(basename "$d") + echo "=== $(basename "$(dirname "$d")")/$task ===" + for approach in b a; do + label="cross-${approach}" + out_file="$d/scores/mmrubric_${label}.json" + if [[ -f "$out_file" ]]; then + echo " [$approach] already exists, skipping" + continue + fi + echo " [$approach] verifying..." + start=$(date +%s) + VERIFIER_APPROACH=$approach VERIFIER_OPTIONAL_STEPS=folded \ + pnpm exec tsx packages/evals/cli.ts verify "$d" --label "$label" > /dev/null 2>&1 + end=$(date +%s) + echo " [$approach] done in $((end - start))s" + done +done + +echo "All cross-verifications complete." From 06d5a4f6d8697aaea85266ccfc54e4ef2c33f751 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:20:15 -0700 Subject: [PATCH 2/4] docs(evals): clarify verifier env naming --- packages/evals/docs/verifier-benchmark-matrix.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md index 922c1d9cd..40cd61fcd 100644 --- a/packages/evals/docs/verifier-benchmark-matrix.md +++ b/packages/evals/docs/verifier-benchmark-matrix.md @@ -1,6 +1,8 @@ # Verifier Benchmark Matrix Use this matrix before changing `STAGEHAND_EVALUATOR_BACKEND` defaults. +`STAGEHAND_EVALUATOR_BACKEND` selects the public evaluator backend; `VERIFIER_*` +flags tune the verifier internals once that backend is selected. ```bash STAGEHAND_EVALUATOR_BACKEND=legacy From 457754b987f239b4782dea2a25dc15d5e2ae363e Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 21:34:34 -0700 Subject: [PATCH 3/4] docs(evals): include outcome-only verifier matrix --- packages/evals/docs/verifier-benchmark-matrix.md | 9 +++++++-- scripts/cross-verify-parallel.sh | 9 +++++---- scripts/cross-verify.sh | 6 +++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md index 40cd61fcd..ea9fdb4d6 100644 --- a/packages/evals/docs/verifier-benchmark-matrix.md +++ b/packages/evals/docs/verifier-benchmark-matrix.md @@ -6,12 +6,17 @@ flags tune the verifier internals once that backend is selected. ```bash STAGEHAND_EVALUATOR_BACKEND=legacy +STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=outcome-only STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=a STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=b ``` -For saved trajectories, run both verifier approaches against the same agent -outputs so verifier quality is isolated from agent variance: +Use `VERIFIER_APPROACH=outcome-only` as the verifier default for benchmarks +without curated rubrics. Use approaches `a` and `b` when evaluating the rubric +pipeline itself or datasets with trusted precomputed rubrics. + +For saved trajectories, run verifier approaches against the same agent outputs +so verifier quality is isolated from agent variance: ```bash TRAJECTORY_GLOB=".trajectories/*" scripts/cross-verify-parallel.sh diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh index 75a23bfdb..1617dc1dc 100755 --- a/scripts/cross-verify-parallel.sh +++ b/scripts/cross-verify-parallel.sh @@ -1,8 +1,6 @@ #!/usr/bin/env bash -# Parallel cross-verify: 8 verifier processes per approach in flight at once. -# 20 trajectories x 2 approaches = 40 verifier runs, ~2.5 batches per approach. -# Expected wall: ~5 min for B (8 in flight, 30s each, ~3 batches), ~15 min for A. -# Total ~15-20 min vs 80 min sequential. +# Parallel cross-verify: 8 verifier processes in flight at once across +# outcome-only plus the rubric approaches. set -e cd "$(dirname "$0")/.." @@ -48,6 +46,9 @@ export PARALLEL # Build (dir, approach) job list and feed to xargs -P. JOBS=() +for d in "${DIRS[@]}"; do + JOBS+=("$d|outcome-only") +done for d in "${DIRS[@]}"; do JOBS+=("$d|b") done diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh index dfb5d8037..b554768ba 100755 --- a/scripts/cross-verify.sh +++ b/scripts/cross-verify.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -# Re-verify each stored trajectory under BOTH approaches via `bench verify`. +# Re-verify each stored trajectory under each verifier approach via `bench verify`. # Lets us isolate verifier disagreement from agent variance. # # Inputs: every trajectory dir matched by TRAJECTORY_GLOB. -# Outputs: scores/mmrubric_cross-{a,b}.json next to each trajectory. +# Outputs: scores/mmrubric_cross-{outcome-only,a,b}.json next to each trajectory. set -e cd "$(dirname "$0")/.." @@ -25,7 +25,7 @@ echo "Found ${#DIRS[@]} trajectory dirs" for d in "${DIRS[@]}"; do task=$(basename "$d") echo "=== $(basename "$(dirname "$d")")/$task ===" - for approach in b a; do + for approach in outcome-only b a; do label="cross-${approach}" out_file="$d/scores/mmrubric_${label}.json" if [[ -f "$out_file" ]]; then From a04286ff78e0ea43efdbd3e9d92f0c0aaf4932b8 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 22:47:02 -0700 Subject: [PATCH 4/4] fix(evals): use result filenames in cross verification --- scripts/cross-verify-parallel.sh | 2 +- scripts/cross-verify.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh index 1617dc1dc..b992b3649 100755 --- a/scripts/cross-verify-parallel.sh +++ b/scripts/cross-verify-parallel.sh @@ -25,7 +25,7 @@ run_one() { local dir="$1" local approach="$2" local label="cross-${approach}" - local out_file="$dir/scores/mmrubric_${label}.json" + local out_file="$dir/scores/result_${label}.json" local task task=$(basename "$dir") if [[ -f "$out_file" ]]; then diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh index b554768ba..186f1311c 100755 --- a/scripts/cross-verify.sh +++ b/scripts/cross-verify.sh @@ -3,7 +3,7 @@ # Lets us isolate verifier disagreement from agent variance. # # Inputs: every trajectory dir matched by TRAJECTORY_GLOB. -# Outputs: scores/mmrubric_cross-{outcome-only,a,b}.json next to each trajectory. +# Outputs: scores/result_cross-{outcome-only,a,b}.json next to each trajectory. set -e cd "$(dirname "$0")/.." @@ -27,7 +27,7 @@ for d in "${DIRS[@]}"; do echo "=== $(basename "$(dirname "$d")")/$task ===" for approach in outcome-only b a; do label="cross-${approach}" - out_file="$d/scores/mmrubric_${label}.json" + out_file="$d/scores/result_${label}.json" if [[ -f "$out_file" ]]; then echo " [$approach] already exists, skipping" continue