From 0eb6685469753058e831d25ac1d5315848b94eaa Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 13:57:45 -0700
Subject: [PATCH 1/4] feat(evals): add verifier benchmark instrumentation

---
 .../evals/docs/verifier-benchmark-matrix.md   |  35 +++++
 packages/evals/framework/braintrust.ts        |  20 ++-
 packages/evals/framework/verifierAdapter.ts   | 129 ++++++++++++++++--
 scripts/cross-verify-parallel.sh              |  63 +++++++++
 scripts/cross-verify.sh                       |  44 ++++++
 5 files changed, 275 insertions(+), 16 deletions(-)
 create mode 100644 packages/evals/docs/verifier-benchmark-matrix.md
 create mode 100755 scripts/cross-verify-parallel.sh
 create mode 100755 scripts/cross-verify.sh
diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md
new file mode 100644
index 000000000..922c1d9cd
--- /dev/null
+++ b/packages/evals/docs/verifier-benchmark-matrix.md
@@ -0,0 +1,35 @@
+# Verifier Benchmark Matrix
+
+Use this matrix before changing `STAGEHAND_EVALUATOR_BACKEND` defaults.
+
+```bash
+STAGEHAND_EVALUATOR_BACKEND=legacy
+STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=a
+STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=b
+```
+
+For saved trajectories, run both verifier approaches against the same agent
+outputs so verifier quality is isolated from agent variance:
+
+```bash
+TRAJECTORY_GLOB=".trajectories/<run-prefix>*" scripts/cross-verify-parallel.sh
+```
+
+Optional environment variables:
+
+```bash
+EVALS_ENV_FILE=~/.envs/prod-evals.env
+PARALLEL=8
+VERIFIER_OPTIONAL_STEPS=folded
+```
+
+Report at least:
+
+- accuracy against manually reviewed labels
+- false positives and false negatives
+- invalid or ambiguous task handling
+- evidence-insufficient count
+- latency and model cost
+
+Do not flip the default backend until verifier results beat or match legacy on
+the target datasets and failure analysis is reviewed.
diff --git a/packages/evals/framework/braintrust.ts b/packages/evals/framework/braintrust.ts
index 803f30d53..3f3967347 100644
--- a/packages/evals/framework/braintrust.ts
+++ b/packages/evals/framework/braintrust.ts
@@ -1,3 +1,14 @@
+/**
+ * Braintrust tracing helper.
+ *
+ * Thin wrapper around `braintrust.traced` that passes the active span to the
+ * callback so it can call `span.log({ output, scores, metrics, metadata })` as
+ * it runs. Outside an active Braintrust experiment, `traced` no-ops and returns
+ * the callback's value unchanged, so this is safe to call from offline tools
+ * (e.g., `bench verify`).
+ */
+import type { Span, StartSpanArgs } from "braintrust";
+
 let braintrustPromise: Promise<typeof import("braintrust")> | undefined;
 
 export function loadBraintrust(): Promise<typeof import("braintrust")> {
@@ -5,9 +16,14 @@ export function loadBraintrust(): Promise<typeof import("braintrust")> {
   return braintrustPromise;
 }
 
+export type TracedFn<T> = (span: Span) => Promise<T>;
+
+/** Same shape as Braintrust's StartSpanArgs but `name` is required. */
+export type TracedSpanOptions = StartSpanArgs & { name: string };
+
 export async function tracedSpan<T>(
-  fn: () => Promise<T>,
-  options: { name: string },
+  fn: TracedFn<T>,
+  options: TracedSpanOptions,
 ): Promise<T> {
   const { traced } = await loadBraintrust();
   return traced(fn, options);
diff --git a/packages/evals/framework/verifierAdapter.ts b/packages/evals/framework/verifierAdapter.ts
index 8dc40bd2b..85643e8a9 100644
--- a/packages/evals/framework/verifierAdapter.ts
+++ b/packages/evals/framework/verifierAdapter.ts
@@ -35,6 +35,7 @@ import {
   type V3,
 } from "@browserbasehq/stagehand";
 
+import { tracedSpan } from "./braintrust.js";
 import { RubricCache } from "./rubricCache.js";
 import { TrajectoryRecorder } from "./trajectoryRecorder.js";
 
@@ -73,15 +74,58 @@ export async function runWithVerifier(
   const evaluator = new V3Evaluator(v3, { backend: "verifier" });
 
   // ── Resolve rubric ──────────────────────────────────────────────────────
-  let resolvedRubric: Rubric;
-  if (taskSpec.precomputedRubric) {
-    resolvedRubric = normalizeRubric(taskSpec.precomputedRubric)!;
-  } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
-    resolvedRubric = await evaluator.generateRubric(taskSpec);
-  } else {
-    const cache = new RubricCache({ dataset });
-    resolvedRubric = await cache.getOrGenerate(taskSpec, evaluator);
-  }
+  const { rubric: resolvedRubric } = await tracedSpan(
+    async (span) => {
+      let rubric: Rubric;
+      let source: "precomputed" | "cached" | "generated";
+
+      if (taskSpec.precomputedRubric) {
+        rubric = normalizeRubric(taskSpec.precomputedRubric)!;
+        source = "precomputed";
+      } else if (process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1") {
+        rubric = await evaluator.generateRubric(taskSpec);
+        source = "generated";
+      } else {
+        const cache = new RubricCache({ dataset });
+        const cached = await cache.read(taskSpec);
+        if (cached) {
+          rubric = cached;
+          source = "cached";
+        } else {
+          rubric = await evaluator.generateRubric(taskSpec);
+          await cache.write(taskSpec, rubric);
+          source = "generated";
+        }
+      }
+
+      span.log({
+        output: {
+          source,
+          rubric,
+        },
+        metadata: {
+          taskId: taskSpec.id,
+          dataset,
+          source,
+          criterionCount: rubric.items.length,
+        },
+      });
+
+      return { rubric, source };
+    },
+    {
+      name: "verifier.rubric",
+      type: "eval",
+      event: {
+        input: {
+          taskId: taskSpec.id,
+          dataset,
+          hasPrecomputedRubric: Boolean(taskSpec.precomputedRubric),
+          cacheDisabled: process.env.VERIFIER_DISABLE_RUBRIC_CACHE === "1",
+        },
+      },
+    },
+  );
 
   // Hand a fully-hydrated TaskSpec to the verifier so it doesn't regenerate.
   const hydratedTaskSpec: TaskSpec = {
@@ -101,10 +145,20 @@ export async function runWithVerifier(
   let agentResult: AgentResult;
   let recorderStatus: "complete" | "aborted" | "error" = "complete";
   try {
-    agentResult = await agent.execute({
-      ...agentOptions,
-      instruction: taskSpec.instruction,
-    });
+    agentResult = await tracedSpan(
+      async (span) => {
+        const result = await agent.execute({
+          ...agentOptions,
+          instruction: taskSpec.instruction,
+        });
+        span.log({
+          output: { message: result.message?.slice(0, 500) },
+          metrics: usageMetrics(result.usage),
+        });
+        return result;
+      },
+      { name: "agent.execute", type: "task" },
+    );
   } catch (e) {
     recorderStatus = "error";
     const trajectory = await recorder.finish({ status: recorderStatus });
@@ -121,7 +175,37 @@ export async function runWithVerifier(
   });
 
   // ── Verify ──────────────────────────────────────────────────────────────
-  const evaluationResult = await evaluator.verify(trajectory, hydratedTaskSpec);
+  const evaluationResult = await tracedSpan(
+    async (span) => {
+      const v = await evaluator.verify(trajectory, hydratedTaskSpec);
+      const rawSteps = asRecord(v.rawSteps);
+      span.log({
+        output: v,
+        scores: {
+          outcome: v.outcomeSuccess ? 1 : 0,
+          process: v.processScore,
+        },
+        metadata: {
+          taskId: taskSpec.id,
+          dataset,
+          stepCount: trajectory.steps.length,
+          criterionCount: v.perCriterion?.length ?? 0,
+          findingCount: v.findings?.length ?? 0,
+          evidenceInsufficientCount: v.evidenceInsufficient?.length ?? 0,
+          firstFailStep: v.firstPointOfFailure?.stepIndex,
+          firstFailCode: v.firstPointOfFailure?.errorCode,
+          isAmbiguous: v.taskValidity?.isAmbiguous,
+          isInvalid: v.taskValidity?.isInvalid,
+          ambiguityReason: v.taskValidity?.ambiguityReason,
+          invalidReason: v.taskValidity?.invalidReason,
+          primaryIntent: rawSteps?.primaryIntent,
+          reasoning: rawSteps?.reasoning,
+        },
+      });
+      return v;
+    },
+    { name: "verifier.verify", type: "eval" },
+  );
   await recorder.persistResult(evaluationResult);
 
   return {
@@ -133,6 +217,23 @@ export async function runWithVerifier(
   };
 }
 
+/** Narrow an unknown value to a string-keyed record; anything else yields undefined. */
+function asRecord(value: unknown): Record<string, unknown> | undefined {
+  if (!value || typeof value !== "object") return undefined;
+  return value as Record<string, unknown>;
+}
+
+/** Keep only finite numeric usage fields; NaN/Infinity do not survive JSON serialization. */
+function usageMetrics(
+  usage: AgentResult["usage"] | undefined,
+): Record<string, number> {
+  if (!usage) return {};
+  const numeric = Object.entries(usage).filter(
+    (e): e is [string, number] => Number.isFinite(e[1]),
+  );
+  return Object.fromEntries(numeric);
+}
+
 /**
  * Decide bench task success from an EvaluationResult using the --success flag's
  * semantics.
diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh
new file mode 100755
index 000000000..75a23bfdb
--- /dev/null
+++ b/scripts/cross-verify-parallel.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Parallel cross-verify: 8 verifier processes per approach in flight at once.
+# 20 trajectories x 2 approaches = 40 verifier runs, ~2.5 batches per approach.
+# Expected wall: ~5 min for B (8 in flight, 30s each, ~3 batches), ~15 min for A.
+# Total ~15-20 min vs 80 min sequential.
+
+set -e
+cd "$(dirname "$0")/.."
+
+if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then
+  set -a
+  source "$EVALS_ENV_FILE"
+  set +a
+fi
+
+PARALLEL=${PARALLEL:-8}
+TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*}
+
+DIRS=()
+while IFS= read -r d; do
+  DIRS+=("$d")
+done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort)
+
+echo "[$(date +%H:%M:%S)] Found ${#DIRS[@]} trajectory dirs; parallelism=$PARALLEL"
+
+run_one() {
+  local dir="$1"
+  local approach="$2"
+  local label="cross-${approach}"
+  local out_file="$dir/scores/mmrubric_${label}.json"
+  local task
+  task=$(basename "$dir")
+  if [[ -f "$out_file" ]]; then
+    echo "[$(date +%H:%M:%S)] [$approach] $task: skip (exists)"
+    return 0
+  fi
+  local start log="/tmp/verify-$$-$task-$approach.log"  # quoted at use: odd task names must not break the redirect
+  start=$(date +%s)
+  if VERIFIER_APPROACH=$approach VERIFIER_OPTIONAL_STEPS=folded \
+       pnpm exec tsx packages/evals/cli.ts verify "$dir" --label "$label" > "$log" 2>&1; then
+    echo "[$(date +%H:%M:%S)] [$approach] $task: done in $(( $(date +%s) - start ))s"
+  else
+    echo "[$(date +%H:%M:%S)] [$approach] $task: FAILED in $(( $(date +%s) - start ))s; see $log"
+  fi
+}
+export -f run_one
+export PARALLEL
+
+# Build (dir, approach) job list and feed to xargs -P.
+JOBS=()
+for d in "${DIRS[@]}"; do
+  JOBS+=("$d|b")
+done
+for d in "${DIRS[@]}"; do
+  JOBS+=("$d|a")
+done
+
+printf '%s\n' "${JOBS[@]}" | xargs -I {} -P "$PARALLEL" bash -c '
+  IFS="|" read -r dir approach <<< "$1"  # each input line is one "dir|approach" job
+  run_one "$dir" "$approach"
+' _ {}
+
+echo "[$(date +%H:%M:%S)] All cross-verifications complete."
diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh
new file mode 100755
index 000000000..dfb5d8037
--- /dev/null
+++ b/scripts/cross-verify.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Re-verify each stored trajectory under BOTH approaches via `bench verify`.
+# Lets us isolate verifier disagreement from agent variance.
+#
+# Inputs: every trajectory dir matched by TRAJECTORY_GLOB.
+# Outputs: scores/mmrubric_cross-{a,b}.json next to each trajectory.
+
+set -e
+cd "$(dirname "$0")/.."
+
+if [[ -n "${EVALS_ENV_FILE:-}" && -f "$EVALS_ENV_FILE" ]]; then
+  set -a
+  source "$EVALS_ENV_FILE"
+  set +a
+fi
+
+# Collect trajectory dirs from persisted verifier runs.
+TRAJECTORY_GLOB=${TRAJECTORY_GLOB:-.trajectories/*}
+DIRS=()
+while IFS= read -r d; do
+  DIRS+=("$d")
+done < <(find $TRAJECTORY_GLOB -mindepth 1 -maxdepth 1 -type d | sort)
+
+echo "Found ${#DIRS[@]} trajectory dirs"
+for d in "${DIRS[@]}"; do
+  task=$(basename "$d")
+  echo "=== $(basename "$(dirname "$d")")/$task ==="
+  for approach in b a; do
+    label="cross-${approach}"
+    out_file="$d/scores/mmrubric_${label}.json"
+    if [[ -f "$out_file" ]]; then
+      echo "  [$approach] already exists, skipping"
+      continue
+    fi
+    echo "  [$approach] verifying..."
+    start=$(date +%s); outcome=done  # flipped below if the verifier exits non-zero
+    VERIFIER_APPROACH=$approach VERIFIER_OPTIONAL_STEPS=folded \
+      pnpm exec tsx packages/evals/cli.ts verify "$d" --label "$label" > /dev/null 2>&1 || outcome=FAILED
+    end=$(date +%s)
+    echo "  [$approach] $outcome in $((end - start))s"
+  done
+done
+
+echo "All cross-verifications complete."

From 06d5a4f6d8697aaea85266ccfc54e4ef2c33f751 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 14:20:15 -0700
Subject: [PATCH 2/4] docs(evals): clarify verifier env naming

---
 packages/evals/docs/verifier-benchmark-matrix.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md
index 922c1d9cd..40cd61fcd 100644
--- a/packages/evals/docs/verifier-benchmark-matrix.md
+++ b/packages/evals/docs/verifier-benchmark-matrix.md
@@ -1,6 +1,8 @@
 # Verifier Benchmark Matrix
 
 Use this matrix before changing `STAGEHAND_EVALUATOR_BACKEND` defaults.
+`STAGEHAND_EVALUATOR_BACKEND` selects the public evaluator backend; `VERIFIER_*`
+flags tune the verifier internals once that backend is selected.
 
 ```bash
 STAGEHAND_EVALUATOR_BACKEND=legacy

From 457754b987f239b4782dea2a25dc15d5e2ae363e Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 21:34:34 -0700
Subject: [PATCH 3/4] docs(evals): include outcome-only verifier matrix

---
 packages/evals/docs/verifier-benchmark-matrix.md | 9 +++++++--
 scripts/cross-verify-parallel.sh                 | 9 +++++----
 scripts/cross-verify.sh                          | 6 +++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/packages/evals/docs/verifier-benchmark-matrix.md b/packages/evals/docs/verifier-benchmark-matrix.md
index 40cd61fcd..ea9fdb4d6 100644
--- a/packages/evals/docs/verifier-benchmark-matrix.md
+++ b/packages/evals/docs/verifier-benchmark-matrix.md
@@ -6,12 +6,17 @@ flags tune the verifier internals once that backend is selected.
 
 ```bash
 STAGEHAND_EVALUATOR_BACKEND=legacy
+STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=outcome-only
 STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=a
 STAGEHAND_EVALUATOR_BACKEND=verifier VERIFIER_APPROACH=b
 ```
 
-For saved trajectories, run both verifier approaches against the same agent
-outputs so verifier quality is isolated from agent variance:
+Use `VERIFIER_APPROACH=outcome-only` as the verifier default for benchmarks
+without curated rubrics. Use approaches `a` and `b` when evaluating the rubric
+pipeline itself or datasets with trusted precomputed rubrics.
+
+For saved trajectories, run verifier approaches against the same agent outputs
+so verifier quality is isolated from agent variance:
 
 ```bash
 TRAJECTORY_GLOB=".trajectories/<run-prefix>*" scripts/cross-verify-parallel.sh
diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh
index 75a23bfdb..1617dc1dc 100755
--- a/scripts/cross-verify-parallel.sh
+++ b/scripts/cross-verify-parallel.sh
@@ -1,8 +1,6 @@
 #!/usr/bin/env bash
-# Parallel cross-verify: 8 verifier processes per approach in flight at once.
-# 20 trajectories x 2 approaches = 40 verifier runs, ~2.5 batches per approach.
-# Expected wall: ~5 min for B (8 in flight, 30s each, ~3 batches), ~15 min for A.
-# Total ~15-20 min vs 80 min sequential.
+# Parallel cross-verify: up to $PARALLEL (default 8) verifier processes in flight
+# at once across outcome-only plus the rubric approaches.
 
 set -e
 cd "$(dirname "$0")/.."
@@ -48,6 +46,9 @@ export PARALLEL
 
 # Build (dir, approach) job list and feed to xargs -P.
 JOBS=()
+for d in "${DIRS[@]}"; do
+  JOBS+=("$d|outcome-only")
+done
 for d in "${DIRS[@]}"; do
   JOBS+=("$d|b")
 done
diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh
index dfb5d8037..b554768ba 100755
--- a/scripts/cross-verify.sh
+++ b/scripts/cross-verify.sh
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
-# Re-verify each stored trajectory under BOTH approaches via `bench verify`.
+# Re-verify each stored trajectory under each verifier approach via `bench verify`.
 # Lets us isolate verifier disagreement from agent variance.
 #
 # Inputs: every trajectory dir matched by TRAJECTORY_GLOB.
-# Outputs: scores/mmrubric_cross-{a,b}.json next to each trajectory.
+# Outputs: scores/mmrubric_cross-{outcome-only,a,b}.json next to each trajectory.
 
 set -e
 cd "$(dirname "$0")/.."
@@ -25,7 +25,7 @@ echo "Found ${#DIRS[@]} trajectory dirs"
 for d in "${DIRS[@]}"; do
   task=$(basename "$d")
   echo "=== $(basename "$(dirname "$d")")/$task ==="
-  for approach in b a; do
+  for approach in outcome-only b a; do
     label="cross-${approach}"
     out_file="$d/scores/mmrubric_${label}.json"
     if [[ -f "$out_file" ]]; then

From a04286ff78e0ea43efdbd3e9d92f0c0aaf4932b8 Mon Sep 17 00:00:00 2001
From: miguel <miguelg71921@gmail.com>
Date: Fri, 15 May 2026 22:47:02 -0700
Subject: [PATCH 4/4] fix(evals): use result filenames in cross verification

---
 scripts/cross-verify-parallel.sh | 2 +-
 scripts/cross-verify.sh          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/cross-verify-parallel.sh b/scripts/cross-verify-parallel.sh
index 1617dc1dc..b992b3649 100755
--- a/scripts/cross-verify-parallel.sh
+++ b/scripts/cross-verify-parallel.sh
@@ -25,7 +25,7 @@ run_one() {
   local dir="$1"
   local approach="$2"
   local label="cross-${approach}"
-  local out_file="$dir/scores/mmrubric_${label}.json"
+  local out_file="$dir/scores/result_${label}.json"
   local task
   task=$(basename "$dir")
   if [[ -f "$out_file" ]]; then
diff --git a/scripts/cross-verify.sh b/scripts/cross-verify.sh
index b554768ba..186f1311c 100755
--- a/scripts/cross-verify.sh
+++ b/scripts/cross-verify.sh
@@ -3,7 +3,7 @@
 # Lets us isolate verifier disagreement from agent variance.
 #
 # Inputs: every trajectory dir matched by TRAJECTORY_GLOB.
-# Outputs: scores/mmrubric_cross-{outcome-only,a,b}.json next to each trajectory.
+# Outputs: scores/result_cross-{outcome-only,a,b}.json next to each trajectory.
 
 set -e
 cd "$(dirname "$0")/.."
@@ -27,7 +27,7 @@ for d in "${DIRS[@]}"; do
   echo "=== $(basename "$(dirname "$d")")/$task ==="
   for approach in outcome-only b a; do
     label="cross-${approach}"
-    out_file="$d/scores/mmrubric_${label}.json"
+    out_file="$d/scores/result_${label}.json"
     if [[ -f "$out_file" ]]; then
       echo "  [$approach] already exists, skipping"
       continue