browserbase · miguelg719 · May 15, 2026 · May 15, 2026 · May 16, 2026 · May 16, 2026
diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md
@@ -0,0 +1,42 @@
+# Verifier Benchmark Matrix
+
+Use this matrix before changing `STAGEHAND_EVALUATOR_BACKEND` defaults.
+`STAGEHAND_EVALUATOR_BACKEND` selects the public evaluator backend; `VERIFIER_*`
+flags tune the verifier internals once that backend is selected.
+
+```bash
+STAGEHAND_EVALUATOR_BACKEND=legacy
+STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=outcome-only
+STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=a
+STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=b
+```
+
+Use `VERIFIER_APPROACH=outcome-only` as the verifier default for benchmarks
+without curated rubrics. Use approaches `a` and `b` when evaluating the rubric
+pipeline itself, or when running datasets that ship trusted precomputed rubrics.
+
+For saved trajectories, run verifier approaches against the same agent outputs
+so verifier quality is isolated from agent variance:
+
+```bash
+TRAJECTORY_GLOB=".trajectories/<run-prefix>*" scripts/cross-verify-parallel.sh
+```
+
+Optional environment:
+
+```bash
+EVALS_ENV_FILE=~/.envs/prod-evals.env
+PARALLEL=8
+VERIFIER_OPTIONAL_STEPS=folded
+```
+
+Report at least:
+
+- accuracy against manually reviewed labels
+- false positives and false negatives
+- invalid or ambiguous task handling
+- evidence-insufficient count
+- latency and model cost
+
+Do not flip the default backend until verifier results match or beat the legacy
+backend on the target datasets and the failure analysis has been reviewed.
diff --git a/packages/evals/framework/braintrust.ts b/packages/evals/framework/braintrust.ts
@@ -1,13 +1,29 @@
+/**
+ * Braintrust tracing helper.
+ *
+ * Thin wrapper around `braintrust.traced` that lets callers carry a span into
+ * the work and `span.log({ output, scores, metrics, metadata })` along the
+ * way. Outside an active Braintrust experiment, `traced` no-ops and returns
+ * the callback's value unchanged, so this is safe to call from offline tools
+ * (e.g., `bench verify`).
+ */
+import type { Span, StartSpanArgs } from "braintrust";
+
 let braintrustPromise: Promise<typeof import("braintrust")> | undefined;
 
 export function loadBraintrust(): Promise<typeof import("braintrust")> {
   braintrustPromise ??= import("braintrust");
   return braintrustPromise;
 }
 
+/** Work to run inside a span; it receives the span so it can `span.log(...)`. */
+export type TracedFn<T> = (span: Span) => Promise<T>;
+
+/** Same shape as Braintrust's StartSpanArgs but `name` is required. */
+export type TracedSpanOptions = StartSpanArgs & { name: string };
+
 export async function tracedSpan<T>(
-  fn: () => Promise<T>,
-  options: { name: string },
+  fn: TracedFn<T>,
+  options: TracedSpanOptions,
 ): Promise<T> {
   const { traced } = await loadBraintrust();
   return traced(fn, options);

diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
@@ -35,6 +35,7 @@ import {
   type V3,
 } from "@browserbasehq/stagehand";
 
+import { tracedSpan } from "./braintrust.js";
 import { RubricCache } from "./rubricCache.js";
 import { TrajectoryRecorder } from "./trajectoryRecorder.js";
 
@@ -73,15 +74,58 @@ export async function runWithVerifier(
   const evaluator = new V3Evaluator(v3, { backend: "verifier" });
 
   // ── Resolve rubric ──────────────────────────────────────────────────────
-  let resolvedRubric: Rubric;
-  if (taskSpec.precomputedRubric) {
-    resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
-  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
-    resolvedRubric = await evaluator.generateRubric(taskSpec);
-  } else {
-    const cache = new RubricCache({ dataset });
-    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
-  }
+  const { rubric: resolvedRubric } = await tracedSpan(
+    async (span) => {
+      let rubric: Rubric;
+      let source: "precomputed" | "cached" | "generated";
+
+      if (taskSpec.precomputedRubric) {
+        rubric = normalizeRubric(taskSpec.precomputedRubric)!;
+        source = "precomputed";
+      } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+        rubric = await evaluator.generateRubric(taskSpec);
+        source = "generated";
+      } else {
+        const cache = new RubricCache({ dataset });
+        const cached = await cache.read(taskSpec);
+        if (cached) {
+          rubric = cached;
+          source = "cached";
+        } else {
+          rubric = await evaluator.generateRubric(taskSpec);
+          await cache.write(taskSpec, rubric);
+          source = "generated";
+        }
+      }
+
+      span.log({
+        output: {
+          source,
+          rubric,
+        },
+        metadata: {
+          taskId: taskSpec.id,
+          dataset,
+          source,
+          criterionCount: rubric.items.length,
+        },
+      });
+
+      return { rubric, source };
+    },
+    {
+      name: "verifier.rubric",
+      type: "eval",
+      event: {
+        input: {
+          taskId: taskSpec.id,
+          dataset,
+          hasPrecomputedRubric: Boolean(taskSpec.precomputedRubric),
+          cacheDisabled: process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1",
+        },
+      },
+    },
+  );
 
   // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
   const hydratedTaskSpec: TaskSpec = {
@@ -101,10 +145,20 @@ export async function runWithVerifier(
   let agentResult: AgentResult;
   let recorderStatus: "complete" | "aborted" | "error" = "complete";
   try {
-    agentResult = await agent.execute({
-      ...agentOptions,
-      instruction: taskSpec.instruction,
-    });
+    agentResult = await tracedSpan(
+      async (span) => {
+        const result = await agent.execute({
+          ...agentOptions,
+          instruction: taskSpec.instruction,
+        });
+        span.log({
+          output: { message: result.message?.slice(0, 500) },
+          metrics: usageMetrics(result.usage),
+        });
+        return result;
+      },
+      { name: "agent.execute", type: "task" },
+    );
   } catch (e) {
     recorderStatus = "error";
     const trajectory = await recorder.finish({ status: recorderStatus });
@@ -121,7 +175,37 @@ export async function runWithVerifier(
   });
 
   // ── Verify ──────────────────────────────────────────────────────────────
-  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  const evaluationResult = await tracedSpan(
+    async (span) => {
+      const v = await evaluator.verify(trajectory, hydratedTaskSpec);
+      const rawSteps = asRecord(v.rawSteps);
+      span.log({
+        output: v,
+        scores: {
+          outcome: v.outcomeSuccess ? 1 : 0,
+          process: v.processScore,
+        },
+        metadata: {
+          taskId: taskSpec.id,
+          dataset,
+          stepCount: trajectory.steps.length,
+          criterionCount: v.perCriterion?.length ?? 0,
+          findingCount: v.findings?.length ?? 0,
+          evidenceInsufficientCount: v.evidenceInsufficient?.length ?? 0,
+          firstFailStep: v.firstPointOfFailure?.stepIndex,
+          firstFailCode: v.firstPointOfFailure?.errorCode,
+          isAmbiguous: v.taskValidity?.isAmbiguous,
+          isInvalid: v.taskValidity?.isInvalid,
+          ambiguityReason: v.taskValidity?.ambiguityReason,
+          invalidReason: v.taskValidity?.invalidReason,
+          primaryIntent: rawSteps?.primaryIntent,
+          reasoning: rawSteps?.reasoning,
+        },
+      });
+      return v;
+    },
+    { name: "verifier.verify", type: "eval" },
+  );
   await recorder.persistResult(evaluationResult);
 
   return {
@@ -133,6 +217,23 @@ export async function runWithVerifier(
   };
 }
 
+/**
+ * View an arbitrary value as a string-keyed record.
+ *
+ * @returns the same value, typed as a record, when it is a non-null object
+ *   (arrays count as objects); `undefined` for anything else.
+ */
+function asRecord(value: unknown): Record<string, unknown> | undefined {
+  if (!value || typeof value !== "object") return undefined;
+  return value as Record<string, unknown>;
+}
+
+/**
+ * Reduce an agent usage report to the fields that can be logged as span
+ * metrics.
+ *
+ * @param usage - the agent result's usage object, if any.
+ * @returns a record holding only the entries whose value is a number; an
+ *   empty record when `usage` is missing.
+ */
+function usageMetrics(
+  usage: AgentResult["usage"] | undefined,
+): Record<string, number> {
+  if (!usage) return {};
+  const numeric: Array<[string, number]> = [];
+  for (const [key, value] of Object.entries(usage)) {
+    if (typeof value === "number") numeric.push([key, value]);
+  }
+  return Object.fromEntries(numeric);
+}
+
 /**
  * Decide bench task success from an EvaluationResult using the --success flag's
  * semantics.

diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# Parallel cross-verify: up to $PARALLEL (default 8) verifier processes in
+# flight at once across outcome-only plus the rubric approaches (a, b).
+
+# Exit on the first unhandled command failure.
+set -e
+# Work from the parent of this script's directory so the relative paths below
+# resolve regardless of the caller's working directory.
+cd "$(dirname "$0")/.."
+
+# Optionally load environment variables from EVALS_ENV_FILE. `set -a` exports
+# every variable the file assigns; a missing file is skipped silently.
+if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then
+  set -a
+  source "$EVALS_ENV_FILE"
+  set +a
+fi
+
+# Concurrency and trajectory selection; both can be overridden from the
+# environment.
+PARALLEL=${PARALLEL:-8}
+TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*}
+
+# Collect the immediate subdirectories of every path the glob matches, sorted.
+# $TRAJECTORY_GLOB is expanded unquoted so the shell globs it (which also means
+# it is word-split, so matched paths containing spaces will not work).
+DIRS=()
+while IFS= read -r d; do
+  DIRS+=("$d")
+done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort)
+
+echo "[$(date +%H:%M:%S)] Found ${#DIRS[@]} trajectory dirs; parallelism=$PARALLEL"
+
+run_one() {
+  local dir="$1"
+  local approach="$2"
+  local label="cross-${approach}"
+  local out_file="$dir/scores/result_${label}.json"
+  local task
+  task=$(basename "$dir")
+  if [[ -f "$out_file" ]]; then
+    echo "[$(date +%H:%M:%S)] [$approach] $task: skip (exists)"
+    return 0
+  fi
+  local start
+  start=$(date +%s)
+  if VERIFIER_APPROACH=$approach VERIFIER_OPTIONAL_STEPS=folded \
+       pnpm exec tsx packages/evals/cli.ts verify "$dir" --label "$label" > /tmp/verify-$$-$task-$approach.log 2>&1; then
+    echo "[$(date +%H:%M:%S)] [$approach] $task: done in $(( $(date +%s) - start ))s"
+  else
+    echo "[$(date +%H:%M:%S)] [$approach] $task: FAILED in $(( $(date +%s) - start ))s; see /tmp/verify-$$-$task-$approach.log"
+  fi
+}
+export -f run_one
+export PARALLEL
+
+# Build (dir, approach) job list and feed to xargs -P.
+JOBS=()
+for d in "${DIRS[@]}"; do
+  JOBS+=("$d|outcome-only")
+done
+for d in "${DIRS[@]}"; do
+  JOBS+=("$d|b")
+done
+for d in "${DIRS[@]}"; do
+  JOBS+=("$d|a")
+done
+
+printf '%s\n' "${JOBS[@]}" | xargs -I {} -n 1 -P "$PARALLEL" bash -c '
+  IFS="|" read -r dir approach <<< "$1"
+  run_one "$dir" "$approach"
+' _ {}
+
+echo "[$(date +%H:%M:%S)] All cross-verifications complete."
diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Re-verify each stored trajectory under each verifier approach via `bench verify`.
+# Lets us isolate verifier disagreement from agent variance.
+#
+# Inputs: every trajectory dir matched by TRAJECTORY_GLOB.
+# Outputs: scores/result_cross-{outcome-only,a,b}.json next to each trajectory.
+
+set -e
+cd "$(dirname "$0")/.."
+
+if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then
+  set -a
+  source "$EVALS_ENV_FILE"
+  set +a
+fi
+
+# Collect trajectory dirs from persisted verifier runs.
+TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*}
+DIRS=()
+while IFS= read -r d; do
+  DIRS+=("$d")
+done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort)
+
+echo "Found ${#DIRS[@]} trajectory dirs"
+for d in "${DIRS[@]}"; do
+  task=$(basename "$d")
+  echo "=== $(basename "$(dirname "$d")")/$task ==="
+  for approach in outcome-only b a; do
+    label="cross-${approach}"
+    out_file="$d/scores/result_${label}.json"
+    if [[ -f "$out_file" ]]; then
+      echo "  [$approach] already exists, skipping"
+      continue
+    fi
+    echo "  [$approach] verifying..."
+    start=$(date +%s)
+    VERIFIER_APPROACH=$approach VERIFIER_OPTIONAL_STEPS=folded \
+      pnpm exec tsx packages/evals/cli.ts verify "$d" --label "$label" > /dev/null 2>&1
+    end=$(date +%s)
+    echo "  [$approach] done in $((end - start))s"
+  done
+done
+
+echo "All cross-verifications complete."