From 5fcdb16e052b82d98198298dc7f5b6127142d9b0 Mon Sep 17 00:00:00 2001 From: miguel Date: Wed, 6 May 2026 16:07:16 -0700 Subject: [PATCH 1/4] Publish eval results --- .github/workflows/publish-evals.yml | 130 +++ .../scripts/publish-braintrust-ui-data.ts | 744 ++++++++++++++++++ 2 files changed, 874 insertions(+) create mode 100644 .github/workflows/publish-evals.yml create mode 100644 packages/evals/scripts/publish-braintrust-ui-data.ts diff --git a/.github/workflows/publish-evals.yml b/.github/workflows/publish-evals.yml new file mode 100644 index 000000000..3d5e0b278 --- /dev/null +++ b/.github/workflows/publish-evals.yml @@ -0,0 +1,130 @@ +name: Publish Evals + +on: + workflow_dispatch: + inputs: + experiment: + description: Braintrust experiment name or UUID. + required: true + type: string + project: + description: Braintrust project containing the experiment. + required: false + default: stagehand + type: string + kv_key: + description: Upstash Redis key the UI reads. + required: false + default: stagehand:evals:latest + type: string + experiment_key_prefix: + description: Prefix for the secondary experiment-id key. + required: false + default: stagehand:evals:experiments + type: string + write_experiment_key: + description: Also write :. + required: false + default: true + type: boolean + dry_run: + description: Fetch and render payload without writing to Upstash. + required: false + default: false + type: boolean + +permissions: + contents: read + +env: + PUPPETEER_SKIP_DOWNLOAD: "1" + PLAYWRIGHT_SKIP_DOWNLOAD: "1" + TURBO_TELEMETRY_DISABLED: "1" + +jobs: + publish: + name: Publish Evals UI Data + runs-on: ubuntu-latest + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - uses: ./.github/actions/setup-node-pnpm-turbo + with: + use-prebuilt-artifacts: "false" + restore-turbo-cache: "false" + node-version: 20.x + + - name: Publish Braintrust experiment to Upstash + id: publish + env: + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + UPSTASH_REDIS_REST_URL: ${{ secrets.UPSTASH_REDIS_REST_URL }} + UPSTASH_REDIS_REST_TOKEN: ${{ secrets.UPSTASH_REDIS_REST_TOKEN }} + KV_REST_API_URL: ${{ secrets.KV_REST_API_URL }} + KV_REST_API_TOKEN: ${{ secrets.KV_REST_API_TOKEN }} + EXPERIMENT: ${{ inputs.experiment }} + PROJECT: ${{ inputs.project }} + KV_KEY: ${{ inputs.kv_key }} + EXPERIMENT_KEY_PREFIX: ${{ inputs.experiment_key_prefix }} + WRITE_EXPERIMENT_KEY: ${{ inputs.write_experiment_key }} + DRY_RUN: ${{ inputs.dry_run }} + shell: bash + run: | + set -euo pipefail + + args=( + --experiment "$EXPERIMENT" + --project "$PROJECT" + --key "$KV_KEY" + --experiment-key-prefix "$EXPERIMENT_KEY_PREFIX" + --out evals-ui-data.json + ) + + if [[ "$WRITE_EXPERIMENT_KEY" != "true" ]]; then + args+=(--no-experiment-key) + fi + + if [[ "$DRY_RUN" == "true" ]]; then + args+=(--dry-run) + fi + + pnpm --filter @browserbasehq/stagehand-evals exec tsx \ + scripts/publish-braintrust-ui-data.ts \ + "${args[@]}" | tee publish-evals-ui-data-output.json + + { + echo "summary<> "$GITHUB_OUTPUT" + + - name: Add publish summary + if: always() + env: + PUBLISH_SUMMARY: ${{ steps.publish.outputs.summary }} + shell: bash + run: | + if [ -n "${PUBLISH_SUMMARY:-}" ]; then + echo "### Published Evals UI Data" >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + printf '%s\n' "$PUBLISH_SUMMARY" >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Upload generated payload + if: always() + uses: actions/upload-artifact@v4 + with: + name: evals-ui-data + path: | + evals-ui-data.json + publish-evals-ui-data-output.json diff --git a/packages/evals/scripts/publish-braintrust-ui-data.ts b/packages/evals/scripts/publish-braintrust-ui-data.ts new file mode 100644 index 000000000..ab7e453fa --- /dev/null +++ b/packages/evals/scripts/publish-braintrust-ui-data.ts @@ -0,0 +1,744 @@ +import { writeFile } from "node:fs/promises"; +import { + fetchExperimentData, + type BenchCaseRow, + type ExperimentData, +} from "../lib/braintrust-report.js"; + +type ParsedArgs = { + experiment: string; + project: string; + key: string; + experimentKeyPrefix: string; + writeExperimentKey: boolean; + dryRun: boolean; + outputPath: string; + datasetId: string; +}; + +type UpstashCredentials = { + url: string; + token: string; +}; + +type UiBenchmarkRow = { + id: string; + modelName: string; + provider: string; + providerKey: string; + accuracy: number | null; + speedSeconds: number | null; + costPerTask: number | null; + totalCost: number | null; + timestamp?: number; + experimentName?: string; + experimentUrl?: string; + projectName?: string; + agentMode?: string; +}; + +type UiBenchmark = { + key: string; + label: string; + rows: UiBenchmarkRow[]; +}; + +type UiDataset = { + id: string; + timestamp: number; + benchmarks: UiBenchmark[]; +}; + +type BenchmarkUpdate = { + key: string; + label: string; + row: UiBenchmarkRow; + summary: { + passed: number; + total: number; + passPercent: number; + }; +}; + +type PublishResult = { + dryRun: boolean; + experimentName: string; + projectName: string; + keys: string[]; + benchmark: { + key: string; + label: string; + }; + row: UiBenchmarkRow; + summary: { + passed: number; + total: number; + passPercent: number; + }; +}; + +const DEFAULT_PROJECT = "stagehand"; +const DEFAULT_KEY = "stagehand:evals:latest"; +const DEFAULT_EXPERIMENT_KEY_PREFIX = "stagehand:evals:experiments"; +const DEFAULT_DATASET_ID = "stagehand-evals"; +const DEFAULT_OUTPUT_PATH = "evals-ui-data.json"; + +const BENCHMARK_LABELS = new Map([ + ["gaia", "GAIA"], + ["onlineMind2Web", "Online Mind2Web"], + ["online-mind2web", "Online Mind2Web"], + ["onlinemind2web", "Online Mind2Web"], + ["webtailbench", "WebTailBench"], + ["webvoyager", "WebVoyager"], +]); + +const PROVIDER_LABELS = new Map([ + ["anthropic", "Anthropic"], + ["browserbase", "Browserbase"], + ["google", "Google"], + ["moonshot", "Moonshot AI"], + ["openai", "OpenAI"], + ["xai", "xAI"], +]); + +const PROVIDER_PREFIXES = [ + "anthropic", + "browserbase", + "google", + "moonshot", + "openai", + "xai", +]; + +const COST_METRIC_KEYS = [ + "cost_usd", + "total_cost_usd", + "estimated_cost_usd", + "claude_code_cost_usd", + "cost", + "price_usd", +]; + +function usage(): string { + return [ + "Usage:", + " publish-braintrust-ui-data.ts --experiment [options]", + "", + "Options:", + " --experiment Braintrust experiment name or UUID", + ` --project Braintrust project (default: ${DEFAULT_PROJECT})`, + ` --key Upstash Redis UI key (default: ${DEFAULT_KEY})`, + ` --experiment-key-prefix Secondary key prefix (default: ${DEFAULT_EXPERIMENT_KEY_PREFIX})`, + " --no-experiment-key Do not write :", + ` --dataset-id Top-level dataset id (default: ${DEFAULT_DATASET_ID})`, + ` --out Write generated UI payload (default: ${DEFAULT_OUTPUT_PATH})`, + " --dry-run Fetch and render without writing to Upstash", + ].join("\n"); +} + +function parseArgs(argv: string[]): ParsedArgs { + const args = [...argv]; + let experiment = ""; + let project = DEFAULT_PROJECT; + let key = DEFAULT_KEY; + let experimentKeyPrefix = DEFAULT_EXPERIMENT_KEY_PREFIX; + let writeExperimentKey = true; + let dryRun = false; + let outputPath = DEFAULT_OUTPUT_PATH; + let datasetId = DEFAULT_DATASET_ID; + + while (args.length > 0) { + const arg = args.shift()!; + switch (arg) { + case "--help": + case "-h": + process.stdout.write(`${usage()}\n`); + process.exit(0); + case "--experiment": + experiment = requireValue(args, arg); + break; + case "--project": + project = requireValue(args, arg); + break; + case "--key": + key = requireValue(args, arg); + break; + case "--experiment-key-prefix": + experimentKeyPrefix = requireValue(args, arg); + break; + case "--no-experiment-key": + writeExperimentKey = false; + break; + case "--dry-run": + dryRun = true; + break; + case "--out": + outputPath = requireValue(args, arg); + break; + case "--dataset-id": + datasetId = requireValue(args, arg); + break; + default: + throw new Error(`Unknown argument "${arg}".\n\n${usage()}`); + } + } + + if (!experiment) throw new Error("Missing required --experiment."); + if (!project) throw new Error("Missing required --project."); + if (!key) throw new Error("Missing required --key."); + if (!datasetId) throw new Error("Missing required --dataset-id."); + + return { + experiment, + project, + key, + experimentKeyPrefix, + writeExperimentKey, + dryRun, + outputPath, + datasetId, + }; +} + +function requireValue(args: string[], flag: string): string { + const value = args.shift(); + if (!value) throw new Error(`Missing value for ${flag}`); + return value; +} + +function readEnv(name: string): string | undefined { + const value = process.env[name]?.trim(); + return value ? value : undefined; +} + +function getUpstashCredentials(required: boolean): UpstashCredentials | null { + const url = readEnv("UPSTASH_REDIS_REST_URL") ?? readEnv("KV_REST_API_URL"); + const token = + readEnv("UPSTASH_REDIS_REST_TOKEN") ?? readEnv("KV_REST_API_TOKEN"); + + if (url && token) { + return { + url: url.replace(/\/+$/, ""), + token, + }; + } + + if (required) { + throw new Error( + "Missing Upstash credentials. Set UPSTASH_REDIS_REST_URL/UPSTASH_REDIS_REST_TOKEN or KV_REST_API_URL/KV_REST_API_TOKEN.", + ); + } + + return null; +} + +function asRecord(value: unknown): Record | undefined { + return value && typeof value === "object" + ? (value as Record) + : undefined; +} + +function readString( + record: Record | undefined, + key: string, +): string | undefined { + const value = record?.[key]; + return typeof value === "string" && value.trim() ? value : undefined; +} + +function readNumber(value: unknown): number | undefined { + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string" && value.trim()) { + const numeric = Number(value); + if (Number.isFinite(numeric)) return numeric; + } + return undefined; +} + +function readNumberOrNull(value: unknown): number | null { + return readNumber(value) ?? null; +} + +function slugify(value: string): string { + return value + .trim() + .replace(/([a-z0-9])([A-Z])/g, "$1-$2") + .replace(/[^a-zA-Z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + .toLowerCase(); +} + +function humanize(value: string): string { + const known = BENCHMARK_LABELS.get(value); + if (known) return known; + + const words = value + .replace(/([a-z0-9])([A-Z])/g, "$1 $2") + .split(/[-_\s/]+/) + .filter(Boolean); + return words + .map((word) => + word.length <= 3 + ? word.toUpperCase() + : word.charAt(0).toUpperCase() + word.slice(1), + ) + .join(" "); +} + +function benchmarkSource(benchCase: BenchCaseRow): string | undefined { + return ( + benchCase.dataset ?? + benchCase.suite.replace(/^agent\//, "") ?? + benchCase.category + ); +} + +function uniqueValues(values: Array): string[] { + return [ + ...new Set(values.filter((value): value is string => Boolean(value))), + ]; +} + +function inferBenchmark(cases: BenchCaseRow[]): { key: string; label: string } { + const keys = uniqueValues( + cases + .map(benchmarkSource) + .filter((source): source is string => Boolean(source)) + .map((source) => source.trim()), + ); + + if (keys.length === 0) { + throw new Error( + "Could not infer benchmark key from Braintrust bench cases.", + ); + } + + if (keys.length > 1) { + throw new Error( + `Expected one benchmark per Braintrust experiment, found: ${keys.join(", ")}.`, + ); + } + + const key = keys[0]; + return { key, label: BENCHMARK_LABELS.get(key) ?? humanize(key) }; +} + +function inferProviderFromModel(model: string): string | undefined { + const normalized = model.toLowerCase(); + const slashPrefix = normalized.match(/^([a-z0-9_-]+)\//)?.[1]; + if (slashPrefix) return slugify(slashPrefix); + if (/claude|anthropic/.test(normalized)) return "anthropic"; + if (/gpt|openai|o[1-9]/.test(normalized)) return "openai"; + if (/gemini|google/.test(normalized)) return "google"; + if (/grok|xai/.test(normalized)) return "xai"; + if (/kimi|moonshot/.test(normalized)) return "moonshot"; + return undefined; +} + +function displayModelName(model: string): string { + const [prefix, ...rest] = model.split("/"); + if ( + rest.length > 0 && + PROVIDER_PREFIXES.includes(prefix.trim().toLowerCase()) + ) { + return rest.join("/"); + } + return model; +} + +function providerLabel(providerKey: string, explicitProvider?: string): string { + if (explicitProvider) { + return PROVIDER_LABELS.get(slugify(explicitProvider)) ?? explicitProvider; + } + return PROVIDER_LABELS.get(providerKey) ?? humanize(providerKey); +} + +function inferModel(cases: BenchCaseRow[]): { + modelName: string; + provider: string; + providerKey: string; +} { + const models = uniqueValues(cases.map((benchCase) => benchCase.model)); + if (models.length === 0) { + throw new Error("Could not infer model from Braintrust bench cases."); + } + if (models.length > 1) { + throw new Error( + `Expected one model per Braintrust experiment, found: ${models.join(", ")}.`, + ); + } + + const model = models[0]; + const explicitProviders = uniqueValues( + cases.map((benchCase) => benchCase.provider), + ); + const explicitProvider = + explicitProviders.length === 1 ? explicitProviders[0] : undefined; + const providerKey = slugify( + explicitProvider ?? inferProviderFromModel(model) ?? "unknown", + ); + + return { + modelName: displayModelName(model), + provider: providerLabel(providerKey, explicitProvider), + providerKey, + }; +} + +function mean(values: number[]): number | undefined { + if (values.length === 0) return undefined; + return values.reduce((sum, value) => sum + value, 0) / values.length; +} + +function round(value: number, places = 2): number { + const scale = 10 ** places; + return Math.round(value * scale) / scale; +} + +function caseCost(benchCase: BenchCaseRow): number | undefined { + for (const key of COST_METRIC_KEYS) { + const value = benchCase.metrics[key]; + if (typeof value === "number" && Number.isFinite(value)) return value; + } + + const matched = Object.entries(benchCase.metrics).find( + ([key, value]) => + /(cost|price|usd)/i.test(key) && + typeof value === "number" && + Number.isFinite(value), + ); + + return matched ? matched[1] : undefined; +} + +function experimentTimestamp(experiment: ExperimentData): number { + const createdAt = experiment.createdAt + ? Date.parse(experiment.createdAt) + : Number.NaN; + return Number.isFinite(createdAt) ? createdAt : Date.now(); +} + +function toBenchmarkUpdate(experiment: ExperimentData): BenchmarkUpdate { + if (experiment.mode !== "bench" || experiment.benchCases.length === 0) { + throw new Error( + `Experiment "${experiment.experimentName}" is not a benchmark experiment.`, + ); + } + + const benchmark = inferBenchmark(experiment.benchCases); + const model = inferModel(experiment.benchCases); + const total = experiment.benchCases.length; + const passed = experiment.benchCases.filter( + (benchCase) => benchCase.success, + ).length; + const passPercent = total > 0 ? round((passed / total) * 100) : 0; + const durations = experiment.benchCases + .map((benchCase) => benchCase.durationMs) + .filter((value): value is number => typeof value === "number"); + const costs = experiment.benchCases + .map(caseCost) + .filter((value): value is number => typeof value === "number"); + const totalCost = + costs.length > 0 + ? round( + costs.reduce((sum, value) => sum + value, 0), + 6, + ) + : null; + const agentModes = uniqueValues( + experiment.benchCases.map((benchCase) => benchCase.agentMode), + ); + const agentMode = agentModes.length === 1 ? agentModes[0] : undefined; + + return { + key: benchmark.key, + label: benchmark.label, + row: { + id: experiment.experimentId, + modelName: model.modelName, + provider: model.provider, + providerKey: model.providerKey, + accuracy: passPercent, + speedSeconds: + durations.length > 0 ? round((mean(durations) ?? 0) / 1000) : null, + costPerTask: + totalCost !== null && total > 0 ? round(totalCost / total, 6) : null, + totalCost, + timestamp: experimentTimestamp(experiment), + experimentName: experiment.experimentName, + experimentUrl: experiment.experimentUrl, + projectName: experiment.projectName, + ...(agentMode ? { agentMode } : {}), + }, + summary: { + passed, + total, + passPercent, + }, + }; +} + +function sanitizeRow(value: unknown): UiBenchmarkRow | undefined { + const record = asRecord(value); + if (!record) return undefined; + + const id = readString(record, "id") ?? readString(record, "experimentId"); + if (!id) return undefined; + + const modelName = readString(record, "modelName"); + const providerKey = readString(record, "providerKey"); + const provider = readString(record, "provider"); + if (!modelName || !providerKey || !provider) return undefined; + + const row: UiBenchmarkRow = { + id, + modelName, + provider, + providerKey, + accuracy: readNumberOrNull(record.accuracy), + speedSeconds: readNumberOrNull(record.speedSeconds), + costPerTask: readNumberOrNull(record.costPerTask), + totalCost: readNumberOrNull(record.totalCost), + }; + + const timestamp = readNumber(record.timestamp); + const experimentName = readString(record, "experimentName"); + const experimentUrl = readString(record, "experimentUrl"); + const projectName = readString(record, "projectName"); + const agentMode = readString(record, "agentMode"); + + if (timestamp !== undefined) row.timestamp = timestamp; + if (experimentName) row.experimentName = experimentName; + if (experimentUrl) row.experimentUrl = experimentUrl; + if (projectName) row.projectName = projectName; + if (agentMode) row.agentMode = agentMode; + + return row; +} + +function sanitizeBenchmark(value: unknown): UiBenchmark | undefined { + const record = asRecord(value); + if (!record) return undefined; + + const key = readString(record, "key"); + const label = readString(record, "label"); + const rows = Array.isArray(record.rows) + ? record.rows + .map(sanitizeRow) + .filter((row): row is UiBenchmarkRow => Boolean(row)) + : []; + + if (!key || !label) return undefined; + return { key, label, rows }; +} + +function sanitizeDataset(value: unknown, datasetId: string): UiDataset { + const record = asRecord(value); + if (!record) { + throw new Error("Existing Upstash value is not a JSON object."); + } + + if (!Array.isArray(record.benchmarks)) { + throw new Error("Existing Upstash value does not contain benchmarks[]."); + } + + const benchmarks = record.benchmarks + .map(sanitizeBenchmark) + .filter((benchmark): benchmark is UiBenchmark => Boolean(benchmark)); + const id = readString(record, "id") ?? datasetId; + const timestamp = readNumber(record.timestamp) ?? Date.now(); + + return { id, timestamp, benchmarks }; +} + +function sortRows(rows: UiBenchmarkRow[]): UiBenchmarkRow[] { + return [...rows].sort((a, b) => { + const accuracyA = a.accuracy ?? -1; + const accuracyB = b.accuracy ?? -1; + if (accuracyA !== accuracyB) return accuracyB - accuracyA; + + const speedA = a.speedSeconds ?? Number.POSITIVE_INFINITY; + const speedB = b.speedSeconds ?? Number.POSITIVE_INFINITY; + if (speedA !== speedB) return speedA - speedB; + + const provider = a.provider.localeCompare(b.provider); + if (provider !== 0) return provider; + return a.modelName.localeCompare(b.modelName); + }); +} + +function upsertResult( + existing: UiDataset | null, + update: BenchmarkUpdate, + datasetId: string, + timestamp: number, +): UiDataset { + const dataset: UiDataset = existing + ? { + id: existing.id, + timestamp, + benchmarks: existing.benchmarks.map((benchmark) => ({ + key: benchmark.key, + label: benchmark.label, + rows: benchmark.rows.filter((row) => row.id !== update.row.id), + })), + } + : { id: datasetId, timestamp, benchmarks: [] }; + + let benchmark = dataset.benchmarks.find( + (candidate) => candidate.key === update.key, + ); + if (!benchmark) { + benchmark = { key: update.key, label: update.label, rows: [] }; + dataset.benchmarks.push(benchmark); + } + + benchmark.label = update.label; + benchmark.rows.push(update.row); + dataset.benchmarks = dataset.benchmarks + .filter((candidate) => candidate.rows.length > 0) + .map((candidate) => ({ + ...candidate, + rows: sortRows(candidate.rows), + })) + .sort((a, b) => a.label.localeCompare(b.label)); + + return dataset; +} + +function experimentDataset(update: BenchmarkUpdate): UiDataset { + return { + id: update.row.id, + timestamp: update.row.timestamp ?? Date.now(), + benchmarks: [ + { + key: update.key, + label: update.label, + rows: [update.row], + }, + ], + }; +} + +function parseRedisValue(value: unknown): unknown { + if (typeof value !== "string") return value; + + try { + return JSON.parse(value); + } catch { + throw new Error("Existing Upstash value is not valid JSON."); + } +} + +async function upstashGet( + credentials: UpstashCredentials, + key: string, +): Promise { + const response = await fetch( + `${credentials.url}/get/${encodeURIComponent(key)}`, + { + headers: { + Authorization: `Bearer ${credentials.token}`, + }, + }, + ); + + if (!response.ok) { + throw new Error( + `Upstash GET failed for "${key}" (${response.status}): ${await response.text()}`, + ); + } + + const body = (await response.json()) as { result?: unknown }; + return body.result === undefined || body.result === null + ? null + : parseRedisValue(body.result); +} + +async function upstashSetMany( + credentials: UpstashCredentials, + entries: Array<{ key: string; value: UiDataset }>, +): Promise { + const commands = entries.map(({ key, value }) => [ + "SET", + key, + JSON.stringify(value), + ]); + const response = await fetch(`${credentials.url}/pipeline`, { + method: "POST", + headers: { + Authorization: `Bearer ${credentials.token}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(commands), + }); + + if (!response.ok) { + throw new Error( + `Upstash pipeline failed (${response.status}): ${await response.text()}`, + ); + } +} + +async function main(): Promise { + const args = parseArgs(process.argv.slice(2)); + const apiKey = readEnv("BRAINTRUST_API_KEY"); + if (!apiKey) throw new Error("Missing BRAINTRUST_API_KEY."); + + const credentials = getUpstashCredentials(!args.dryRun); + const experiment = await fetchExperimentData( + args.project, + { label: args.experiment, experiment: args.experiment }, + { apiKey }, + ); + const update = toBenchmarkUpdate(experiment); + + let existing: UiDataset | null = null; + if (credentials) { + const currentValue = await upstashGet(credentials, args.key); + if (currentValue !== null) { + existing = sanitizeDataset(currentValue, args.datasetId); + } + } + + const now = Date.now(); + const merged = upsertResult(existing, update, args.datasetId, now); + const keys = [args.key]; + const writes: Array<{ key: string; value: UiDataset }> = [ + { key: args.key, value: merged }, + ]; + + if (args.writeExperimentKey) { + const experimentKey = `${args.experimentKeyPrefix}:${update.row.id}`; + keys.push(experimentKey); + writes.push({ key: experimentKey, value: experimentDataset(update) }); + } + + await writeFile(args.outputPath, `${JSON.stringify(merged, null, 2)}\n`); + + if (!args.dryRun) { + if (!credentials) throw new Error("Missing Upstash credentials."); + await upstashSetMany(credentials, writes); + } + + const result: PublishResult = { + dryRun: args.dryRun, + experimentName: experiment.experimentName, + projectName: experiment.projectName, + keys, + benchmark: { + key: update.key, + label: update.label, + }, + row: update.row, + summary: update.summary, + }; + + process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); +} + +main().catch((error: unknown) => { + const message = error instanceof Error ? error.message : String(error); + process.stderr.write(`${message}\n`); + process.exitCode = 1; +}); From d06f816d62fab1550683fc7bc3176f6b349ea567 Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 7 May 2026 15:23:54 -0700 Subject: [PATCH 2/4] linting --- packages/evals/scripts/publish-braintrust-ui-data.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/evals/scripts/publish-braintrust-ui-data.ts b/packages/evals/scripts/publish-braintrust-ui-data.ts index ab7e453fa..e7641d44a 100644 --- a/packages/evals/scripts/publish-braintrust-ui-data.ts +++ b/packages/evals/scripts/publish-braintrust-ui-data.ts @@ -154,6 +154,7 @@ function parseArgs(argv: string[]): ParsedArgs { case "-h": process.stdout.write(`${usage()}\n`); process.exit(0); + break; case "--experiment": experiment = requireValue(args, arg); break; From 900b5041fc0291ea9aaf7d86f03b569990c34a33 Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 7 May 2026 16:47:28 -0700 Subject: [PATCH 3/4] update --- .../scripts/publish-braintrust-ui-data.ts | 170 +++++++++++++----- 1 file changed, 130 insertions(+), 40 deletions(-) diff --git a/packages/evals/scripts/publish-braintrust-ui-data.ts b/packages/evals/scripts/publish-braintrust-ui-data.ts index e7641d44a..1ccf93b46 100644 --- a/packages/evals/scripts/publish-braintrust-ui-data.ts +++ b/packages/evals/scripts/publish-braintrust-ui-data.ts @@ -52,7 +52,8 @@ type UiDataset = { type BenchmarkUpdate = { key: string; label: string; - row: UiBenchmarkRow; + experimentId: string; + rows: UiBenchmarkRow[]; summary: { passed: number; total: number; @@ -70,6 +71,7 @@ type PublishResult = { label: string; }; row: UiBenchmarkRow; + rows: UiBenchmarkRow[]; summary: { passed: number; total: number; @@ -84,6 +86,7 @@ const DEFAULT_DATASET_ID = "stagehand-evals"; const DEFAULT_OUTPUT_PATH = "evals-ui-data.json"; const BENCHMARK_LABELS = new Map([ + ["custom", "Custom"], ["gaia", "GAIA"], ["onlineMind2Web", "Online Mind2Web"], ["online-mind2web", "Online Mind2Web"], @@ -294,6 +297,17 @@ function benchmarkSource(benchCase: BenchCaseRow): string | undefined { ); } +function isKnownBenchmarkKey(value: string): boolean { + return BENCHMARK_LABELS.has(value); +} + +function isPlainAgentRun(cases: BenchCaseRow[]): boolean { + return cases.every( + (benchCase) => + benchCase.category === "agent" || benchCase.suite.startsWith("agent/"), + ); +} + function uniqueValues(values: Array): string[] { return [ ...new Set(values.filter((value): value is string => Boolean(value))), @@ -308,6 +322,23 @@ function inferBenchmark(cases: BenchCaseRow[]): { key: string; label: string } { .map((source) => source.trim()), ); + const knownKeys = uniqueValues(keys.filter(isKnownBenchmarkKey)); + + if (knownKeys.length === 1) { + const key = knownKeys[0]; + return { key, label: BENCHMARK_LABELS.get(key) ?? humanize(key) }; + } + + if (knownKeys.length > 1) { + throw new Error( + `Expected one benchmark per Braintrust experiment, found: ${knownKeys.join(", ")}.`, + ); + } + + if (isPlainAgentRun(cases)) { + return { key: "custom", label: BENCHMARK_LABELS.get("custom")! }; + } + if (keys.length === 0) { throw new Error( "Could not infer benchmark key from Braintrust bench cases.", @@ -386,6 +417,26 @@ function inferModel(cases: BenchCaseRow[]): { }; } +function modelGroupKey(benchCase: BenchCaseRow): string { + return [ + benchCase.model ?? "unknown-model", + benchCase.agentMode ?? "default", + ].join("\0"); +} + +function groupCasesByModel(cases: BenchCaseRow[]): BenchCaseRow[][] { + const groups = new Map(); + + for (const benchCase of cases) { + const key = modelGroupKey(benchCase); + const group = groups.get(key) ?? []; + group.push(benchCase); + groups.set(key, group); + } + + return [...groups.values()]; +} + function mean(values: number[]): number | undefined { if (values.length === 0) return undefined; return values.reduce((sum, value) => sum + value, 0) / values.length; @@ -419,24 +470,31 @@ function experimentTimestamp(experiment: ExperimentData): number { return Number.isFinite(createdAt) ? createdAt : Date.now(); } -function toBenchmarkUpdate(experiment: ExperimentData): BenchmarkUpdate { - if (experiment.mode !== "bench" || experiment.benchCases.length === 0) { - throw new Error( - `Experiment "${experiment.experimentName}" is not a benchmark experiment.`, - ); - } +function rowIdForModelGroup( + experimentId: string, + groups: BenchCaseRow[][], + cases: BenchCaseRow[], +): string { + if (groups.length === 1) return experimentId; - const benchmark = inferBenchmark(experiment.benchCases); - const model = inferModel(experiment.benchCases); - const total = experiment.benchCases.length; - const passed = experiment.benchCases.filter( - (benchCase) => benchCase.success, - ).length; + const model = cases[0]?.model ?? "unknown-model"; + const agentMode = cases[0]?.agentMode ?? "default"; + return `${experimentId}:${slugify(model)}:${slugify(agentMode)}`; +} + +function benchmarkRow( + experiment: ExperimentData, + cases: BenchCaseRow[], + rowId: string, +): UiBenchmarkRow { + const model = inferModel(cases); + const total = cases.length; + const passed = cases.filter((benchCase) => benchCase.success).length; const passPercent = total > 0 ? round((passed / total) * 100) : 0; - const durations = experiment.benchCases + const durations = cases .map((benchCase) => benchCase.durationMs) .filter((value): value is number => typeof value === "number"); - const costs = experiment.benchCases + const costs = cases .map(caseCost) .filter((value): value is number => typeof value === "number"); const totalCost = @@ -447,30 +505,55 @@ function toBenchmarkUpdate(experiment: ExperimentData): BenchmarkUpdate { ) : null; const agentModes = uniqueValues( - experiment.benchCases.map((benchCase) => benchCase.agentMode), + cases.map((benchCase) => benchCase.agentMode), ); const agentMode = agentModes.length === 1 ? agentModes[0] : undefined; + return { + id: rowId, + modelName: model.modelName, + provider: model.provider, + providerKey: model.providerKey, + accuracy: passPercent, + speedSeconds: + durations.length > 0 ? round((mean(durations) ?? 0) / 1000) : null, + costPerTask: + totalCost !== null && total > 0 ? round(totalCost / total, 6) : null, + totalCost, + timestamp: experimentTimestamp(experiment), + experimentName: experiment.experimentName, + experimentUrl: experiment.experimentUrl, + projectName: experiment.projectName, + ...(agentMode ? { agentMode } : {}), + }; +} + +function toBenchmarkUpdate(experiment: ExperimentData): BenchmarkUpdate { + if (experiment.mode !== "bench" || experiment.benchCases.length === 0) { + throw new Error( + `Experiment "${experiment.experimentName}" is not a benchmark experiment.`, + ); + } + + const benchmark = inferBenchmark(experiment.benchCases); + const total = experiment.benchCases.length; + const passed = experiment.benchCases.filter( + (benchCase) => benchCase.success, + ).length; + const passPercent = total > 0 ? round((passed / total) * 100) : 0; + const groups = groupCasesByModel(experiment.benchCases); + return { key: benchmark.key, label: benchmark.label, - row: { - id: experiment.experimentId, - modelName: model.modelName, - provider: model.provider, - providerKey: model.providerKey, - accuracy: passPercent, - speedSeconds: - durations.length > 0 ? round((mean(durations) ?? 0) / 1000) : null, - costPerTask: - totalCost !== null && total > 0 ? round(totalCost / total, 6) : null, - totalCost, - timestamp: experimentTimestamp(experiment), - experimentName: experiment.experimentName, - experimentUrl: experiment.experimentUrl, - projectName: experiment.projectName, - ...(agentMode ? { agentMode } : {}), - }, + experimentId: experiment.experimentId, + rows: groups.map((cases) => + benchmarkRow( + experiment, + cases, + rowIdForModelGroup(experiment.experimentId, groups, cases), + ), + ), summary: { passed, total, @@ -574,6 +657,12 @@ function upsertResult( datasetId: string, timestamp: number, ): UiDataset { + const updateRowIds = new Set(update.rows.map((row) => row.id)); + const isUpdatedExperimentRow = (row: UiBenchmarkRow): boolean => + updateRowIds.has(row.id) || + row.id === update.experimentId || + row.id.startsWith(`${update.experimentId}:`); + const dataset: UiDataset = existing ? { id: existing.id, @@ -581,7 +670,7 @@ function upsertResult( benchmarks: existing.benchmarks.map((benchmark) => ({ key: benchmark.key, label: benchmark.label, - rows: benchmark.rows.filter((row) => row.id !== update.row.id), + rows: benchmark.rows.filter((row) => !isUpdatedExperimentRow(row)), })), } : { id: datasetId, timestamp, benchmarks: [] }; @@ -595,7 +684,7 @@ function upsertResult( } benchmark.label = update.label; - benchmark.rows.push(update.row); + benchmark.rows.push(...update.rows); dataset.benchmarks = dataset.benchmarks .filter((candidate) => candidate.rows.length > 0) .map((candidate) => ({ @@ -609,13 +698,13 @@ function upsertResult( function experimentDataset(update: BenchmarkUpdate): UiDataset { return { - id: update.row.id, - timestamp: update.row.timestamp ?? Date.now(), + id: update.experimentId, + timestamp: update.rows[0]?.timestamp ?? Date.now(), benchmarks: [ { key: update.key, label: update.label, - rows: [update.row], + rows: update.rows, }, ], }; @@ -710,7 +799,7 @@ async function main(): Promise { ]; if (args.writeExperimentKey) { - const experimentKey = `${args.experimentKeyPrefix}:${update.row.id}`; + const experimentKey = `${args.experimentKeyPrefix}:${update.experimentId}`; keys.push(experimentKey); writes.push({ key: experimentKey, value: experimentDataset(update) }); } @@ -731,7 +820,8 @@ async function main(): Promise { key: update.key, label: update.label, }, - row: update.row, + row: update.rows[0]!, + rows: update.rows, summary: update.summary, }; From 32a0a0d9c3c7351f5ac5915c687934af55912b92 Mon Sep 17 00:00:00 2001 From: miguel Date: Thu, 14 May 2026 16:28:48 -0700 Subject: [PATCH 4/4] update workflow --- .../scripts/publish-braintrust-ui-data.ts | 188 +++++++++++++++++- 1 file changed, 186 insertions(+), 2 deletions(-) diff --git a/packages/evals/scripts/publish-braintrust-ui-data.ts b/packages/evals/scripts/publish-braintrust-ui-data.ts index 1ccf93b46..bac033832 100644 --- a/packages/evals/scripts/publish-braintrust-ui-data.ts +++ b/packages/evals/scripts/publish-braintrust-ui-data.ts @@ -114,14 +114,134 @@ const PROVIDER_PREFIXES = [ ]; const COST_METRIC_KEYS = [ + "agent_cost_usd", + "agent_total_cost_usd", + "agent_estimated_cost_usd", + "stagehand_agent_cost_usd", + "stagehand_agent_total_cost_usd", + "codex_cost_usd", + "claude_code_cost_usd", "cost_usd", "total_cost_usd", "estimated_cost_usd", - "claude_code_cost_usd", "cost", "price_usd", ]; +const TESTED_RUN_INPUT_TOKEN_KEYS = [ + "agent_input_tokens", + "total_input_tokens", + "codex_input_tokens", + "claude_code_input_tokens", +]; + +const TESTED_RUN_CACHED_INPUT_TOKEN_KEYS = [ + "agent_cached_input_tokens", + "total_cached_input_tokens", + "codex_cached_input_tokens", + "claude_code_cache_read_input_tokens", +]; + +const TESTED_RUN_OUTPUT_TOKEN_KEYS = [ + "agent_output_tokens", + "total_output_tokens", + "codex_output_tokens", + "claude_code_output_tokens", +]; + +const TESTED_RUN_INFERENCE_MS_KEYS = [ + "agent_inference_time_ms", + "total_inference_time_ms", + "codex_inference_time_ms", + "claude_code_inference_time_ms", +]; + +type ModelPricing = { + input: number; + cachedInput?: number; + output: number; + longContext?: { + thresholdTokens: number; + input: number; + cachedInput?: number; + output: number; + }; +}; + +const MODEL_PRICING_USD_PER_1M_TOKENS = new Map([ + ["anthropic/claude-opus-4-7", { input: 5, cachedInput: 0.5, output: 25 }], + ["claude-opus-4-7", { input: 5, cachedInput: 0.5, output: 25 }], + ["anthropic/claude-opus-4-6", { input: 5, cachedInput: 0.5, output: 25 }], + ["claude-opus-4-6", { input: 5, cachedInput: 0.5, output: 25 }], + ["anthropic/claude-opus-4-5", { input: 5, cachedInput: 0.5, output: 25 }], + ["claude-opus-4-5", { input: 5, cachedInput: 0.5, output: 25 }], + ["anthropic/claude-sonnet-4-6", { input: 3, cachedInput: 0.3, output: 15 }], + ["claude-sonnet-4-6", { input: 3, cachedInput: 0.3, output: 15 }], + ["anthropic/claude-sonnet-4-5", { input: 3, cachedInput: 0.3, output: 15 }], + ["claude-sonnet-4-5", { input: 3, cachedInput: 0.3, output: 15 }], + ["anthropic/claude-haiku-4-5", { input: 1, cachedInput: 0.1, output: 5 }], + ["claude-haiku-4-5", { input: 1, cachedInput: 0.1, output: 5 }], + ["openai/gpt-5.5", { input: 5, cachedInput: 0.5, output: 30 }], + ["gpt-5.5", { input: 5, cachedInput: 0.5, output: 30 }], + ["openai/gpt-5.4", { input: 2.5, cachedInput: 0.25, output: 15 }], + ["gpt-5.4", { input: 2.5, cachedInput: 0.25, output: 15 }], + ["openai/gpt-5.4-mini", { input: 0.75, cachedInput: 0.075, output: 4.5 }], + ["gpt-5.4-mini", { input: 0.75, cachedInput: 0.075, output: 4.5 }], + [ + "google/gemini-3-flash-preview", + { input: 0.5, cachedInput: 0.05, output: 3 }, + ], + ["gemini-3-flash-preview", { input: 0.5, cachedInput: 0.05, output: 3 }], + [ + "google/gemini-2.5-computer-use-preview-10-2025", + { + input: 1.25, + output: 10, + longContext: { + thresholdTokens: 200_000, + input: 2.5, + output: 15, + }, + }, + ], + [ + "gemini-2.5-computer-use-preview-10-2025", + { + input: 1.25, + output: 10, + longContext: { + thresholdTokens: 200_000, + input: 2.5, + output: 15, + }, + }, + ], + [ + "google/gemini-2.5-computer-use-preview", + { + input: 1.25, + output: 10, + longContext: { + thresholdTokens: 200_000, + input: 2.5, + output: 15, + }, + }, + ], + [ + "gemini-2.5-computer-use-preview", + { + input: 1.25, + output: 10, + longContext: { + thresholdTokens: 200_000, + input: 2.5, + output: 15, + }, + }, + ], +]); + function usage(): string { return [ "Usage:", @@ -453,9 +573,13 @@ function caseCost(benchCase: BenchCaseRow): number | undefined { if (typeof value === "number" && Number.isFinite(value)) return value; } + const estimated = estimatedTokenCost(benchCase); + if (estimated !== undefined) return estimated; + const matched = Object.entries(benchCase.metrics).find( ([key, value]) => /(cost|price|usd)/i.test(key) && + !/(verifier|evaluator|scorer|exactmatch|errormatch)/i.test(key) && typeof value === "number" && Number.isFinite(value), ); @@ -463,6 +587,66 @@ function caseCost(benchCase: BenchCaseRow): number | undefined { return matched ? matched[1] : undefined; } +function caseSpeedMs(benchCase: BenchCaseRow): number | undefined { + return firstMetric(benchCase.metrics, TESTED_RUN_INFERENCE_MS_KEYS); +} + +function firstMetric( + metrics: Record, + keys: string[], +): number | undefined { + for (const key of keys) { + const value = metrics[key]; + if (typeof value === "number" && Number.isFinite(value)) return value; + } + return undefined; +} + +function modelPricing(model: string | undefined): ModelPricing | undefined { + if (!model) return undefined; + const normalized = model.trim().toLowerCase(); + return MODEL_PRICING_USD_PER_1M_TOKENS.get(normalized); +} + +function estimatedTokenCost(benchCase: BenchCaseRow): number | undefined { + const basePricing = modelPricing(benchCase.model); + if (!basePricing) return undefined; + + // Only use metrics emitted by the tested agent/harness. Braintrust scorer + // spans and task logs can contain verifier model usage, usually Gemini, and + // must not be billed as the evaluated model. + const inputTokens = firstMetric( + benchCase.metrics, + TESTED_RUN_INPUT_TOKEN_KEYS, + ); + const cachedInputTokens = + firstMetric(benchCase.metrics, TESTED_RUN_CACHED_INPUT_TOKEN_KEYS) ?? 0; + const outputTokens = firstMetric( + benchCase.metrics, + TESTED_RUN_OUTPUT_TOKEN_KEYS, + ); + + if (inputTokens === undefined && outputTokens === undefined) { + return undefined; + } + + const billableInputTokens = Math.max( + (inputTokens ?? 0) - cachedInputTokens, + 0, + ); + const pricing = + basePricing.longContext && + (inputTokens ?? 0) > basePricing.longContext.thresholdTokens + ? basePricing.longContext + : basePricing; + const inputCost = + (billableInputTokens * pricing.input + + cachedInputTokens * (pricing.cachedInput ?? pricing.input)) / + 1_000_000; + const outputCost = ((outputTokens ?? 0) * pricing.output) / 1_000_000; + return inputCost + outputCost; +} + function experimentTimestamp(experiment: ExperimentData): number { const createdAt = experiment.createdAt ? Date.parse(experiment.createdAt) @@ -492,7 +676,7 @@ function benchmarkRow( const passed = cases.filter((benchCase) => benchCase.success).length; const passPercent = total > 0 ? round((passed / total) * 100) : 0; const durations = cases - .map((benchCase) => benchCase.durationMs) + .map((benchCase) => caseSpeedMs(benchCase) ?? benchCase.durationMs) .filter((value): value is number => typeof value === "number"); const costs = cases .map(caseCost)