From f90e86c2d43ab596ccccee62854b5e434cc77afa Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 03:26:55 -0700 Subject: [PATCH 01/12] Add Stagehand v4 eval harness --- packages/evals/core/contracts/tool.ts | 1 + .../evals/framework/ClaudeAgentHarness.ts | 52 ++ packages/evals/framework/CodexAgentHarness.ts | 52 ++ .../framework/StagehandAgentV3Harness.ts | 139 ++++ .../framework/StagehandAgentV4Harness.ts | 470 ++++++++++++ packages/evals/framework/UnderstudyV4Tools.ts | 712 ++++++++++++++++++ packages/evals/framework/benchHarness.ts | 237 +----- packages/evals/framework/benchPlanner.ts | 30 +- packages/evals/framework/benchRunner.ts | 15 +- packages/evals/framework/benchTypes.ts | 12 +- packages/evals/framework/context.ts | 2 +- packages/evals/framework/defineTask.ts | 27 +- packages/evals/framework/taskLoader.ts | 1 + packages/evals/framework/types.ts | 12 + packages/evals/lib/braintrust-report.ts | 4 +- packages/evals/tests/cli.test.ts | 2 +- .../tests/framework/benchHarness.test.ts | 24 +- .../tests/framework/benchPlanner.test.ts | 36 +- .../evals/tests/framework/benchRunner.test.ts | 2 +- .../evals/tests/framework/defineTask.test.ts | 19 + packages/evals/tests/tui/parse.test.ts | 2 +- packages/evals/tests/tui/run.test.ts | 17 +- packages/evals/tui/commands/run.ts | 16 +- 23 files changed, 1610 insertions(+), 274 deletions(-) create mode 100644 packages/evals/framework/ClaudeAgentHarness.ts create mode 100644 packages/evals/framework/CodexAgentHarness.ts create mode 100644 packages/evals/framework/StagehandAgentV3Harness.ts create mode 100644 packages/evals/framework/StagehandAgentV4Harness.ts create mode 100644 packages/evals/framework/UnderstudyV4Tools.ts diff --git a/packages/evals/core/contracts/tool.ts b/packages/evals/core/contracts/tool.ts index bd1d366d8..9b179121d 100644 --- a/packages/evals/core/contracts/tool.ts +++ b/packages/evals/core/contracts/tool.ts @@ -18,6 +18,7 @@ import type { export type ToolSurface = | "understudy_code" + 
| "understudy_v4" | "playwright_code" | "cdp_code" | "playwright_mcp" diff --git a/packages/evals/framework/ClaudeAgentHarness.ts b/packages/evals/framework/ClaudeAgentHarness.ts new file mode 100644 index 000000000..f13ecb305 --- /dev/null +++ b/packages/evals/framework/ClaudeAgentHarness.ts @@ -0,0 +1,52 @@ +import { EvalsError } from "../errors.js"; +import { runClaudeCodeAgent } from "./claudeCodeRunner.js"; +import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; +import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import type { + BenchHarness, + BenchHarnessExecuteInput, + StartedBenchHarness, +} from "./benchHarness.js"; +import type { TaskResult } from "./types.js"; + +export const ClaudeAgentHarness: BenchHarness = { + harness: "claude_code", + supportedTaskKinds: ["agent", "suite"], + supportsApi: false, + async execute({ + input, + row, + logger, + signal, + }: BenchHarnessExecuteInput): Promise { + const plan = buildExternalHarnessTaskPlan(input); + if (row.config.harness !== "claude_code") { + throw new EvalsError( + `Expected claude_code harness config, received "${row.config.harness}".`, + ); + } + const toolAdapter = await prepareClaudeCodeToolAdapter({ + toolSurface: row.config.toolSurface, + startupProfile: row.config.startupProfile, + environment: row.config.environment, + plan, + logger, + }); + try { + return await runClaudeCodeAgent({ + plan, + model: input.modelName, + logger, + toolAdapter, + signal, + }); + } finally { + await toolAdapter.cleanup(); + } + }, + async start(): Promise { + throw new EvalsError( + "Claude Code harness execution uses the external harness execute path. 
Use --dry-run to inspect its bench matrix, or run with --harness claude_code.", + ); + }, +}; diff --git a/packages/evals/framework/CodexAgentHarness.ts b/packages/evals/framework/CodexAgentHarness.ts new file mode 100644 index 000000000..fac8c2a31 --- /dev/null +++ b/packages/evals/framework/CodexAgentHarness.ts @@ -0,0 +1,52 @@ +import { EvalsError } from "../errors.js"; +import { runCodexAgent } from "./codexRunner.js"; +import { prepareCodexToolAdapter } from "./codexToolAdapter.js"; +import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import type { + BenchHarness, + BenchHarnessExecuteInput, + StartedBenchHarness, +} from "./benchHarness.js"; +import type { TaskResult } from "./types.js"; + +export const CodexAgentHarness: BenchHarness = { + harness: "codex", + supportedTaskKinds: ["agent", "suite"], + supportsApi: false, + async execute({ + input, + row, + logger, + signal, + }: BenchHarnessExecuteInput): Promise { + const plan = buildExternalHarnessTaskPlan(input); + if (row.config.harness !== "codex") { + throw new EvalsError( + `Expected codex harness config, received "${row.config.harness}".`, + ); + } + const toolAdapter = await prepareCodexToolAdapter({ + toolSurface: row.config.toolSurface, + startupProfile: row.config.startupProfile, + environment: row.config.environment, + plan, + logger, + }); + try { + return await runCodexAgent({ + plan, + model: input.modelName, + logger, + toolAdapter, + signal, + }); + } finally { + await toolAdapter.cleanup(); + } + }, + async start(): Promise { + throw new EvalsError( + "Codex harness execution uses the external harness execute path. 
Use --dry-run to inspect its bench matrix, or run with --harness codex.", + ); + }, +}; diff --git a/packages/evals/framework/StagehandAgentV3Harness.ts b/packages/evals/framework/StagehandAgentV3Harness.ts new file mode 100644 index 000000000..9d869fd48 --- /dev/null +++ b/packages/evals/framework/StagehandAgentV3Harness.ts @@ -0,0 +1,139 @@ +import { + AgentProvider, + getAISDKLanguageModel, + loadApiKeyFromEnv, + type AvailableModel, + type LLMClient, + type LogLine, +} from "@browserbasehq/stagehand"; +import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; +import { endBrowserbaseSession } from "../browserbaseCleanup.js"; +import { EvalsError } from "../errors.js"; +import type { V3InitResult } from "../initV3.js"; +import type { + BenchHarness, + BenchHarnessStartInput, + StartedBenchHarness, +} from "./benchHarness.js"; +import type { DiscoveredTask } from "./types.js"; + +function isAgentTask(task: DiscoveredTask): boolean { + return ( + task.primaryCategory === "agent" || + task.categories.includes("agent") || + task.categories.includes("external_agent_benchmarks") + ); +} + +function resolveProvider(modelName: AvailableModel): string | undefined { + if (modelName.includes("/")) { + return modelName.split("/")[0]; + } + + try { + return AgentProvider.getAgentProvider(modelName); + } catch { + return undefined; + } +} + +export const StagehandAgentV3Harness: BenchHarness = { + harness: "stagehand_v3", + supportedTaskKinds: [ + "act", + "extract", + "observe", + "agent", + "combination", + "suite", + ], + supportsApi: true, + async start({ + task, + input, + row, + logger, + verbose, + }: BenchHarnessStartInput): Promise { + let v3Result: V3InitResult | undefined; + const createAgent = isAgentTask(task); + if (row.config.harness !== "stagehand_v3") { + throw new EvalsError( + `Expected stagehand_v3 harness config, received "${row.config.harness}".`, + ); + } + const config = row.config; + const agentMode = config.agentMode ?? 
input.agentMode; + const isCUA = config.isCUA ?? input.isCUA; + + if (config.useApi) { + const provider = resolveProvider(input.modelName); + const logFn = (line: LogLine) => logger.log(line); + const apiKey = loadApiKeyFromEnv(provider, logFn); + if (!apiKey) { + throw new EvalsError( + `USE_API=true but no API key found for provider "${provider}".`, + ); + } + const { initV3 } = await import("../initV3.js"); + v3Result = await initV3({ + logger, + modelName: input.modelName, + modelClientOptions: { apiKey }, + createAgent, + agentMode, + isCUA, + verbose, + configOverrides: { env: config.environment }, + }); + } else { + let llmClient: LLMClient | undefined; + if (input.modelName.includes("/")) { + const firstSlashIndex = input.modelName.indexOf("/"); + llmClient = new AISdkClientWrapped({ + model: getAISDKLanguageModel( + input.modelName.substring(0, firstSlashIndex), + input.modelName.substring(firstSlashIndex + 1), + ), + }); + } + const { initV3 } = await import("../initV3.js"); + v3Result = await initV3({ + logger, + llmClient, + modelName: input.modelName, + createAgent, + agentMode, + isCUA, + verbose, + configOverrides: { env: config.environment }, + }); + } + + return { + ctx: { + harness: "stagehand_v3", + row, + logger, + v3: v3Result.v3, + agent: v3Result.agent, + page: v3Result.v3.context.pages()[0], + debugUrl: v3Result.debugUrl ?? "", + sessionUrl: v3Result.sessionUrl ?? 
"", + }, + cleanup: async () => { + if (v3Result?.v3) { + try { + await v3Result.v3.close(); + } catch (closeError) { + console.error( + `Warning: Error closing V3 instance for ${input.name}:`, + closeError, + ); + } + } + await endBrowserbaseSession(v3Result?.v3); + }, + }; + }, +}; diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts new file mode 100644 index 000000000..dff4c2078 --- /dev/null +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -0,0 +1,470 @@ +import { + getAISDKLanguageModel, + type AgentInstance, + type LLMClient, + type LocalBrowserLaunchOptions, + type V3, +} from "@browserbasehq/stagehand"; +import { z } from "zod"; +import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; +import { endBrowserbaseSession } from "../browserbaseCleanup.js"; +import { EvalsError } from "../errors.js"; +import type { V3InitResult } from "../initV3.js"; +import { startUnderstudyV4Tools, type UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js"; +import type { + BenchHarness, + BenchHarnessStartInput, + BenchHarnessContext, + StartedBenchHarness, +} from "./benchHarness.js"; + +type Page = ReturnType[number]; + +function isAgentTask(task: BenchHarnessStartInput["task"]): boolean { + return ( + task.primaryCategory === "agent" || + task.categories.includes("agent") || + task.categories.includes("external_agent_benchmarks") + ); +} + +export const StagehandAgentV4Harness: BenchHarness = { + harness: "stagehand_v4", + supportedTaskKinds: [ + "act", + "extract", + "observe", + "agent", + "combination", + "suite", + ], + supportsApi: false, + async start({ + task, + input, + row, + logger, + verbose, + }: BenchHarnessStartInput): Promise { + if (row.config.harness !== "stagehand_v4") { + throw new EvalsError( + `Expected stagehand_v4 harness config, received "${row.config.harness}".`, + ); + } + if (row.config.toolSurface !== "understudy_v4") { + throw new EvalsError( + 
`StagehandAgentV4Harness requires --tool understudy_v4; received "${row.config.toolSurface ?? "default"}".`, + ); + } + if (row.config.useApi) { + throw new EvalsError( + "stagehand_v4 must run locally so the v3 agent loop can call the live v4 SDK protocol tools.", + ); + } + + // This is intentionally still the v3 agent loop. The v4 part is the SDK + // launcher/tool catalog/dispatch surface that replaces the v3 agent tools. + const createAgent = isAgentTask(task); + const understudyV4Tools = await startUnderstudyV4Tools({ + environment: row.config.environment, + logger, + }); + let v3Result: V3InitResult | undefined; + let printedV4BusLogTree = false; + const printV4BusLogTree = async (): Promise => { + if (!verbose || printedV4BusLogTree) return; + printedV4BusLogTree = true; + try { + const result = (await understudyV4Tools.stagehandV4.cdp.Mod.evaluate({ + expression: `async () => { + const readLogTree = globalThis.__stagehandBusLogTree; + if (typeof readLogTree !== "function") { + return { error: "globalThis.__stagehandBusLogTree is not available" }; + } + return await readLogTree(params.stagehand_session_id); + }`, + params: { + stagehand_session_id: understudyV4Tools.stagehand_session_id, + }, + })) as { error?: unknown; logTree?: unknown }; + logger.log({ + category: "understudy_v4", + message: + typeof result.logTree === "string" + ? `v4 bus.logTree()\n${result.logTree}` + : `v4 bus.logTree() unavailable: ${String( + result.error ?? "Mod.evaluate did not return logTree.", + )}`, + level: 1, + }); + } catch (dashboardError) { + logger.warn({ + category: "understudy_v4", + message: `Unable to print v4 bus.logTree(): ${ + dashboardError instanceof Error + ? 
dashboardError.message + : String(dashboardError) + }`, + level: 1, + }); + } + }; + + try { + let llmClient: LLMClient | undefined; + if (input.modelName.includes("/")) { + const firstSlashIndex = input.modelName.indexOf("/"); + llmClient = new AISdkClientWrapped({ + model: getAISDKLanguageModel( + input.modelName.substring(0, firstSlashIndex), + input.modelName.substring(firstSlashIndex + 1), + ), + }); + } + + const localBrowserLaunchOptions = { + cdpUrl: understudyV4Tools.cdpUrl, + } satisfies Partial; + const { initV3 } = await import("../initV3.js"); + v3Result = await initV3({ + logger, + llmClient, + modelName: input.modelName, + createAgent: false, + agentMode: row.config.agentMode ?? input.agentMode, + isCUA: row.config.isCUA ?? input.isCUA, + verbose, + configOverrides: { + env: "LOCAL", + localBrowserLaunchOptions, + experimental: true, + }, + }); + const closeV3 = v3Result.v3.close.bind(v3Result.v3); + v3Result.v3.close = async () => { + await printV4BusLogTree(); + return await closeV3(); + }; + const v4Page = await installStagehandV4BenchFacade(v3Result.v3, understudyV4Tools.stagehandV4); + + if (createAgent) { + v3Result.agent = v3Result.v3.agent({ + model: input.modelName, + mode: "dom", + tools: understudyV4Tools.tools, + systemPrompt: buildStagehandAgentV4SystemPrompt( + understudyV4Tools.toolCatalog, + ), + }) as AgentInstance; + } + + const ctx: BenchHarnessContext = { + harness: "stagehand_v4", + row, + logger, + v3: v3Result.v3, + stagehandV4: understudyV4Tools.stagehandV4, + agent: v3Result.agent, + page: v4Page as unknown as Page, + debugUrl: v3Result.debugUrl ?? "", + sessionUrl: v3Result.sessionUrl ?? 
"", + }; + + return { + ctx, + cleanup: async () => { + await printV4BusLogTree(); + if (v3Result?.v3) { + try { + await v3Result.v3.close(); + } catch (closeError) { + console.error( + `Warning: Error closing V3 instance for ${input.name}:`, + closeError, + ); + } + } + await endBrowserbaseSession(v3Result?.v3); + await understudyV4Tools.cleanup(); + }, + }; + } catch (error) { + if (v3Result?.v3) await v3Result.v3.close().catch(() => {}); + await understudyV4Tools.cleanup().catch(() => {}); + throw error; + } + }, +}; + +function buildStagehandAgentV4SystemPrompt( + toolCatalog: Record[], +): string { + return [ + "You are using Stagehand v4 protocol tools through the existing Stagehand agent loop.", + "The callable tool schemas are the source of truth. They are v4 event payload schemas, not the older v3 agent wrapper schemas.", + "", + "Selector rules:", + "- Selectors are partial hints. You may pass only elementId, only xpath, only css, only text, only coordinates, or any useful subset.", + "- The browser hydrates selectors before use, so do not invent missing selector fields.", + "- Prefer elementId from the page summary tree when it is available. Coordinates are valid when they are the clearest available selector.", + "- Deep XPath can pierce frames and shadow roots, for example /body/div[3]/iframe[2]/body/iframe[2]/button.", + "", + "Page context:", + "- Use the derived page summary tool to get current DOM/accessibility context and element ids.", + "- Use the derived screenshot tool when visual confirmation or coordinates are needed.", + "- When you already have a selector and a concrete operation, prefer the direct browser action tool for that operation.", + "- If you use act with an action object, follow the action schema exactly.", + "", + "Available v4 tools:", + ...toolCatalog.map((definition) => { + const name = + typeof definition.name === "string" ? definition.name : "unknown"; + const description = + typeof definition.description === "string" + ? 
definition.description + : name; + return `- ${name}: ${description}`; + }), + ].join("\n"); +} + +async function installStagehandV4BenchFacade( + v3: V3, + stagehandV4: UnderstudyV4NativeRuntime, +): Promise> { + const pageState: { + targetId?: string; + title: string; + url: string; + } = { + title: "", + url: "about:blank", + }; + + const refreshPageInfo = async (): Promise => { + const info = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageRequestInfo({ + ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + }), + ); + if (!isRecord(info)) return; + if (typeof info.targetId === "string") pageState.targetId = info.targetId; + if (typeof info.title === "string") pageState.title = info.title; + if (typeof info.url === "string") pageState.url = info.url; + }; + + await refreshPageInfo().catch(() => {}); + + const updatePageStateFromBrowserEvent = (event: unknown): void => { + if (!isRecord(event)) return; + if (typeof event.targetId === "string") pageState.targetId = event.targetId; + if (typeof event.url === "string") pageState.url = event.url; + }; + stagehandV4.cdp.on("Stagehand.BrowserPageNavigated", updatePageStateFromBrowserEvent); + stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", updatePageStateFromBrowserEvent); + + const page = createStagehandV4PageFacade(stagehandV4, pageState, refreshPageInfo); + const pages = (): Record[] => [page]; + + const context = v3.context as unknown as Record; + context.pages = pages; + context.awaitActivePage = async () => { + await refreshPageInfo(); + return page; + }; + + v3.observe = (async (a?: string | Record, b?: Record) => { + const instruction = typeof a === "string" ? a : undefined; + const options = (typeof a === "string" ? b : a) as Record | undefined; + const result = await stagehandV4.cdp.Stagehand.AIObserve({ + ...(instruction != null ? 
{ instruction } : {}), + ...selectorParam(options), + ...workflowOptionsParam(options), + }); + const observed = unwrapStagehandV4Result(result); + return Array.isArray(observed) ? observed : []; + }) as V3["observe"]; + + v3.act = (async (input: string | Record, options?: Record) => { + const result = await stagehandV4.cdp.Stagehand.AIAct( + typeof input === "string" + ? { + instruction: input, + ...workflowOptionsParam(options), + } + : { + action: normalizeV4Action(input), + ...workflowOptionsParam(options), + }, + ); + const unwrapped = unwrapStagehandV4Result(result); + await refreshPageInfo().catch(() => {}); + return unwrapped; + }) as V3["act"]; + + v3.extract = (async ( + a?: string | Record, + b?: z.ZodType | Record, + c?: Record, + ) => { + const instruction = typeof a === "string" ? a : undefined; + const schema = isZodSchema(b) ? z.toJSONSchema(b) : undefined; + const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as Record | undefined; + const result = await stagehandV4.cdp.Stagehand.AIExtract({ + ...(instruction != null ? { instruction } : {}), + ...(schema != null ? 
{ schema: schema as Record } : {}), + ...selectorParam(options), + ...workflowOptionsParam(options), + }); + return unwrapStagehandV4Result(result); + }) as V3["extract"]; + + return page; +} + +function createStagehandV4PageFacade( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: { + targetId?: string; + title: string; + url: string; + }, + refreshPageInfo: () => Promise, +): Record { + return { + async goto(url: string) { + let timer: ReturnType; + const loaded = new Promise((resolve, reject) => { + const onLoaded = (): void => { + clearTimeout(timer); + stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); + resolve(); + }; + timer = setTimeout(() => { + stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); + reject(new Error("Timed out waiting for Stagehand.BrowserPageLoaded.")); + }, 30_000); + stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", onLoaded); + }); + const [rawResult] = await Promise.all([ + stagehandV4.cdp.Stagehand.BrowserPageGoto({ + url, + selector: pageState.targetId != null ? 
{ targetId: pageState.targetId } : { active: true }, + }), + loaded, + ]); + const result = unwrapStagehandV4Result(rawResult); + if (isRecord(result)) { + if (typeof result.targetId === "string") pageState.targetId = result.targetId; + if (typeof result.url === "string") pageState.url = result.url; + } + await refreshPageInfo(); + return { + ok: () => true, + status: () => 200, + url: () => pageState.url, + }; + }, + url() { + return pageState.url; + }, + async title() { + await refreshPageInfo(); + return pageState.title; + }, + async waitForLoadState() { + await new Promise((resolve, reject) => { + let timer: ReturnType; + const onLoaded = (): void => { + clearTimeout(timer); + stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); + resolve(); + }; + timer = setTimeout(() => { + stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); + reject(new Error("Timed out waiting for Stagehand.BrowserPageLoaded.")); + }, 30_000); + stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", onLoaded); + }); + await refreshPageInfo(); + }, + async evaluate(expressionOrFn: unknown, arg?: unknown) { + const expression = + typeof expressionOrFn === "function" + ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})` + : String(expressionOrFn); + const result = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ + ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + arg: isJsonValue(arg) ? arg : undefined, + awaitPromise: true, + expression, + returnByValue: true, + }), + ); + return isRecord(result) && "value" in result ? result.value : result; + }, + locator() { + throw new Error("stagehand_v4 evals must use v4 protocol actions instead of v3 page.locator()."); + }, + }; +} + +function normalizeV4Action(action: Record): Record { + return { + ...action, + selector: normalizeV4Selector(action.selector), + method: typeof action.method === "string" ? 
action.method : null, + arguments: Array.isArray(action.arguments) ? action.arguments : null, + }; +} + +function selectorParam(options: Record | undefined): Record { + const selector = normalizeV4Selector(options?.selector); + return selector == null ? {} : { selector }; +} + +function normalizeV4Selector(value: unknown): Record | undefined { + if (value == null) return undefined; + if (isRecord(value)) return value; + if (typeof value !== "string" || value.length === 0) return undefined; + if (value.startsWith("xpath=")) return { xpath: value.slice("xpath=".length) }; + if (value.startsWith("/") || value.startsWith("(")) return { xpath: value }; + return { css: value }; +} + +function workflowOptionsParam(options: Record | undefined): Record { + if (!options) return {}; + const workflowOptions: Record = {}; + if (typeof options.timeout === "number") workflowOptions.timeout = options.timeout; + if (options.variables !== undefined && isJsonValue(options.variables)) workflowOptions.variables = options.variables; // isJsonValue(undefined) is true; skip absent variables so the empty check below can fire + return Object.keys(workflowOptions).length === 0 ? 
{} : { options: workflowOptions }; +} + +function unwrapStagehandV4Result(value: unknown): unknown { + if (!isRecord(value)) return value; + if (isRecord(value.event_results)) { + for (const entry of Object.values(value.event_results)) { + if (!isRecord(entry)) continue; + if ("result" in entry) return entry.result; + } + } + if ("result" in value) return value.result; + return value; +} + +function isZodSchema(value: unknown): value is z.ZodType { + return isRecord(value) && typeof value.safeParse === "function"; +} + +function isJsonValue(value: unknown): boolean { + if (value == null) return true; + if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return true; + if (Array.isArray(value)) return value.every(isJsonValue); + if (!isRecord(value)) return false; + return Object.values(value).every(isJsonValue); +} + +function isRecord(value: unknown): value is Record { + return value != null && typeof value === "object" && !Array.isArray(value); +} diff --git a/packages/evals/framework/UnderstudyV4Tools.ts b/packages/evals/framework/UnderstudyV4Tools.ts new file mode 100644 index 000000000..272742213 --- /dev/null +++ b/packages/evals/framework/UnderstudyV4Tools.ts @@ -0,0 +1,712 @@ +import path from "node:path"; +import fs from "node:fs"; +import { createRequire } from "node:module"; +import { spawn, type ChildProcess } from "node:child_process"; +import { createInterface } from "node:readline"; +import { fileURLToPath, pathToFileURL } from "node:url"; +import type { ToolSet } from "ai"; +import type { EvalLogger } from "../logger.js"; +import { getRepoRootDir } from "../runtimePaths.js"; + +export type UnderstudyV4ToolDefinition = Record; + +type BridgeReadyMessage = { + type: "ready"; + cdpUrl: string; + browserbaseExtensionId?: string; + stagehand_session_id?: string; + toolCatalog: UnderstudyV4ToolDefinition[]; +}; + +type BridgeResultMessage = { + type: "result"; + id: number; + result?: unknown; + error?: string; +}; + 
+type BridgeEventMessage = { + type: "event"; + name: string; + event: unknown; +}; + +type BridgeErrorMessage = { + type: "error"; + error: string; +}; + +type UnderstudyV4Sdk = { + StagehandClient: new (options?: Record) => { + browserbase_extension_id?: string; + cdp_http_origin?: string; + connect(input?: unknown): Promise; + close(): Promise; + cdp: { + cdp_url?: string | null; + on(eventName: string, listener: (event: unknown) => void): unknown; + off(eventName: string, listener: (event: unknown) => void): unknown; + Stagehand: Record< + string, + (params?: Record) => Promise + >; + }; + stagehand_session_id?: string; + }; + StagehandProtocolEvents: Record; + aiBrowserToolDefinitions: () => UnderstudyV4ToolDefinition[]; +}; + +export interface UnderstudyV4Tools { + cdpUrl: string; + browserbaseExtensionId?: string; + stagehand_session_id?: string; + toolCatalog: UnderstudyV4ToolDefinition[]; + stagehandV4: UnderstudyV4NativeRuntime; + tools: ToolSet; + cleanup: () => Promise; +} + +export interface UnderstudyV4NativeRuntime { + cdp: { + on(eventName: string, listener: (event: unknown) => void): void; + off(eventName: string, listener: (event: unknown) => void): void; + Mod: Record< + string, + (params?: Record) => Promise + >; + Stagehand: Record< + string, + (params?: Record) => Promise + >; + }; +} + +type PendingCall = { + resolve: (value: unknown) => void; + reject: (error: Error) => void; +}; + +export async function startUnderstudyV4Tools(input: { + environment: "LOCAL" | "BROWSERBASE"; + logger: EvalLogger; +}): Promise { + const require = createRequire(import.meta.url); + const tsxCli = require.resolve("tsx/cli"); + const child = spawn( + process.execPath, + [tsxCli, fileURLToPath(import.meta.url)], + { + cwd: getRepoRootDir(), + env: { + ...process.env, + UNDERSTUDY_V4_TOOLS_CHILD: "1", + }, + stdio: ["pipe", "pipe", "pipe"], + }, + ); + + const pending = new Map(); + const eventListeners = new Map void>>(); + const subscribedEvents = new Set(); + 
let nextId = 1; + let readyResolve: (message: BridgeReadyMessage) => void; + let readyReject: (error: Error) => void; + const readyPromise = new Promise((resolve, reject) => { + readyResolve = resolve; + readyReject = reject; + }); + + const stdout = createInterface({ input: child.stdout }); + stdout.on("line", (line) => { + if (!line.trim()) return; + const message = parseBridgeMessage(line); + if (!message) { + input.logger.log({ + category: "understudy_v4", + message: line, + level: 1, + }); + return; + } + if (message.type === "ready") { + readyResolve(message); + return; + } + if (message.type === "event") { + for (const listener of eventListeners.get(message.name) ?? []) { + listener(message.event); + } + return; + } + if (message.type === "error") { + const error = new Error(message.error); + readyReject(error); + for (const call of pending.values()) call.reject(error); + pending.clear(); + return; + } + const call = pending.get(message.id); + if (!call) return; + pending.delete(message.id); + if (message.error) { + call.reject(new Error(message.error)); + } else { + call.resolve(message.result); + } + }); + + child.stderr.on("data", (chunk: Buffer) => { + for (const line of chunk.toString("utf8").split(/\r?\n/).filter(Boolean)) { + input.logger.warn({ + category: "understudy_v4", + message: line, + level: 1, + }); + } + }); + + child.on("error", (error) => { + readyReject(error); + for (const call of pending.values()) call.reject(error); + pending.clear(); + }); + child.on("exit", (code, signal) => { + const error = new Error( + `Understudy v4 tools process exited (${signal ?? code ?? 
"unknown"}).`, + ); + readyReject(error); + for (const call of pending.values()) call.reject(error); + pending.clear(); + }); + + child.stdin.write( + `${JSON.stringify({ type: "init", environment: input.environment })}\n`, + ); + + const ready = await readyPromise; + input.logger.log({ + category: "understudy_v4", + message: `Connected v4 tools at ${ready.cdpUrl}`, + level: 1, + }); + const callCommand = (name: string, args: Record) => + callBridge(child, pending, nextId++, "command", name, args); + const callTool = (name: string, args: Record) => + callBridge(child, pending, nextId++, "tool", name, args); + const { jsonSchema, tool } = await import("ai"); + + return { + cdpUrl: ready.cdpUrl, + browserbaseExtensionId: ready.browserbaseExtensionId, + stagehand_session_id: ready.stagehand_session_id, + toolCatalog: ready.toolCatalog, + stagehandV4: { + cdp: { + on(eventName, listener) { + let listeners = eventListeners.get(eventName); + if (!listeners) { + listeners = new Set(); + eventListeners.set(eventName, listeners); + } + listeners.add(listener); + if (!subscribedEvents.has(eventName)) { + subscribedEvents.add(eventName); + child.stdin.write(`${JSON.stringify({ type: "subscribe", name: eventName })}\n`); + } + }, + off(eventName, listener) { + const listeners = eventListeners.get(eventName); + listeners?.delete(listener); + if (listeners?.size === 0) eventListeners.delete(eventName); + }, + Mod: new Proxy( + {}, + { + get(_target, property) { + if (typeof property !== "string") return undefined; + return (params?: Record) => + callCommand(`Mod.${property}`, params ?? {}); + }, + }, + ) as UnderstudyV4NativeRuntime["cdp"]["Mod"], + Stagehand: new Proxy( + {}, + { + get(_target, property) { + if (typeof property !== "string") return undefined; + return (params?: Record) => + callCommand(`Stagehand.${property}`, params ?? 
{}); + }, + }, + ) as UnderstudyV4NativeRuntime["cdp"]["Stagehand"], + }, + }, + tools: buildUnderstudyV4ToolSet(ready.toolCatalog, callTool, input.logger, { + jsonSchema, + tool, + }), + cleanup: async () => { + await closeBridge(child, pending); + }, + }; +} + +function buildUnderstudyV4ToolSet( + catalog: UnderstudyV4ToolDefinition[], + callTool: (name: string, args: Record) => Promise, + logger: EvalLogger, + ai: Pick, +): ToolSet { + const tools: ToolSet = {}; + const selectorMap: Record> = {}; + for (const definition of catalog) { + const name = typeof definition.name === "string" ? definition.name : null; + const rawSchema = definition.inputSchema ?? definition.parameters; + const schema = + rawSchema != null && + typeof rawSchema === "object" && + !Array.isArray(rawSchema) + ? rawSchema + : null; + if (!name) continue; + if (!schema) continue; + tools[name] = ai.tool({ + description: + typeof definition.description === "string" + ? definition.description + : name, + inputSchema: ai.jsonSchema(schema), + execute: async (args) => { + logger.log({ + category: "understudy_v4", + message: `Agent calling v4 tool: ${name}`, + level: 1, + auxiliary: { + arguments: { + value: JSON.stringify(args), + type: "object", + }, + }, + }); + const hydratedArgs = hydrateSelectorReferences( + isRecord(args) ? args : {}, + selectorMap, + ); + return callTool(name, isRecord(hydratedArgs) ? hydratedArgs : {}); + }, + toModelOutput: (result) => modelOutputForToolResult(result, selectorMap), + }); + } + return tools; +} + +function modelOutputForToolResult( + result: unknown, + selectorMap: Record>, +) { + const payload = firstPayload(result); + const screenshot = stringField(payload, "screenshot"); + if (screenshot) { + return { + type: "content" as const, + value: [ + { + type: "media" as const, + mediaType: "image/png", + data: screenshot.replace(/^data:image\/\w+;base64,/, ""), + }, + ], + }; + } + const pageSummary = + stringField(payload, "formattedTree") ?? 
+ stringField(payload, "observationTree") ?? + stringField(payload, "pageText"); + if (pageSummary) { + updateSelectorMap(selectorMap, payload.elementSelectorMap); + return { + type: "content" as const, + value: [ + { + type: "text" as const, + text: [ + "Page Summary:", + pageSummary, + "", + 'Use an element square-bracket id as selector.elementId without brackets, for example {"selector":{"elementId":"0-3"}}.', + ].join("\n"), + }, + ], + }; + } + return { + type: "content" as const, + value: [ + { + type: "text" as const, + text: JSON.stringify(sanitizeForModel(payload)), + }, + ], + }; +} + +function callBridge( + child: ChildProcess, + pending: Map, + id: number, + type: "tool" | "command", + name: string, + args: Record, +): Promise { + return new Promise((resolve, reject) => { + pending.set(id, { resolve, reject }); + child.stdin.write(`${JSON.stringify({ type, id, name, args })}\n`); + }); +} + +async function closeBridge( + child: ChildProcess, + pending: Map, +): Promise { + if (child.exitCode != null) return; + await new Promise((resolve) => { + child.once("exit", () => resolve()); + child.stdin.write(`${JSON.stringify({ type: "close" })}\n`); + child.stdin.end(); + setTimeout(() => { + if (child.exitCode == null) child.kill("SIGTERM"); + resolve(); + }, 5000).unref(); + }); + for (const call of pending.values()) { + call.reject(new Error("Understudy v4 tools process closed.")); + } + pending.clear(); +} + +function parseBridgeMessage( + line: string, +): BridgeReadyMessage | BridgeResultMessage | BridgeEventMessage | BridgeErrorMessage | null { + try { + const parsed = JSON.parse(line) as BridgeReadyMessage | BridgeResultMessage | BridgeEventMessage | BridgeErrorMessage; + if (parsed.type === "ready" || parsed.type === "result" || parsed.type === "event" || parsed.type === "error") { + return parsed; + } + } catch { + return null; + } + return null; +} + +async function runBridgeChild(): Promise { + const sdk = await loadStagehandV4Sdk(); + const 
commandByToolName = buildCommandByToolName(sdk); + let client: InstanceType | null = null; + const eventSubscriptions = new Map void>(); + + const stdin = createInterface({ input: process.stdin }); + for await (const line of stdin) { + if (!line.trim()) continue; + const message = JSON.parse(line) as { + type: "init" | "tool" | "command" | "subscribe" | "close"; + environment?: "LOCAL" | "BROWSERBASE"; + id?: number; + name?: string; + args?: Record; + }; + + if (message.type === "init") { + client = new sdk.StagehandClient( + understudyV4ClientOptions(message.environment ?? "LOCAL"), + ); + await client.connect(); + let cdpUrl = client.cdp.cdp_url ?? client.cdp_http_origin ?? ""; + if (/^https?:\/\//i.test(cdpUrl)) { + const versionResponse = await fetch(`${cdpUrl}/json/version`); + if (!versionResponse.ok) { + throw new Error( + `Unable to resolve v4 browser websocket URL from ${cdpUrl}: GET /json/version -> ${versionResponse.status}`, + ); + } + const version = (await versionResponse.json()) as { + webSocketDebuggerUrl?: unknown; + }; + if (typeof version.webSocketDebuggerUrl !== "string") { + throw new Error( + `Unable to resolve v4 browser websocket URL from ${cdpUrl}: missing webSocketDebuggerUrl`, + ); + } + cdpUrl = version.webSocketDebuggerUrl; + } + writeBridgeMessage({ + type: "ready", + cdpUrl, + browserbaseExtensionId: client.browserbase_extension_id, + stagehand_session_id: client.stagehand_session_id, + toolCatalog: sdk.aiBrowserToolDefinitions(), + }); + continue; + } + + if (message.type === "subscribe") { + if (!client) throw new Error("Understudy v4 tools were not initialized."); + const name = message.name; + if (typeof name !== "string") throw new Error("Event subscription requires an event name."); + if (!eventSubscriptions.has(name)) { + const listener = (event: unknown): void => writeBridgeMessage({ type: "event", name, event }); + eventSubscriptions.set(name, listener); + client.cdp.on(name, listener); + } + continue; + } + + if 
(message.type === "tool" || message.type === "command") { + if (!client) throw new Error("Understudy v4 tools were not initialized."); + const id = message.id ?? 0; + try { + const commandName = + message.type === "tool" + ? commandByToolName.get(message.name ?? "") + : message.name; + if (!commandName) { + throw new Error( + message.type === "tool" + ? `No v4 protocol event is exposed for tool "${message.name}".` + : `No v4 protocol command was provided.`, + ); + } + const command = + message.type === "command" + ? commandForPath(client.cdp, commandName) + : client.cdp.Stagehand[commandName]; + if (!command) { + throw new Error( + `The v4 SDK does not expose ${ + message.type === "command" ? commandName : `Stagehand.${commandName}` + }.`, + ); + } + const result = await command(message.args ?? {}); + writeBridgeMessage({ type: "result", id, result }); + } catch (error) { + writeBridgeMessage({ + type: "result", + id, + error: error instanceof Error ? error.message : String(error), + }); + } + continue; + } + + if (message.type === "close") { + if (client) { + for (const [eventName, listener] of eventSubscriptions) { + client.cdp.off(eventName, listener); + } + } + await client?.close(); + process.exit(0); + } + } +} + +export function assertUnderstudyV4SdkAvailable(): string { + const sdkPath = + process.env.STAGEHAND_V4_SDK_PATH ?? 
+ path.join( + getRepoRootDir(), + "..", + "stagehand-driver", + "sdks", + "js", + "index.ts", + ); + if (!fs.existsSync(sdkPath)) { + throw new Error( + [ + "stagehand_v4 evals require a local Stagehand v4 SDK checkout.", + `Expected v4 SDK entrypoint at: ${sdkPath}`, + "Set STAGEHAND_V4_SDK_PATH to the v4 SDK entrypoint if your checkout lives somewhere else.", + ].join("\n"), + ); + } + return sdkPath; +} + +async function loadStagehandV4Sdk(): Promise { + const sdkPath = assertUnderstudyV4SdkAvailable(); + return (await import(pathToFileURL(sdkPath).href)) as UnderstudyV4Sdk; +} + +function understudyV4ClientOptions( + environment: "LOCAL" | "BROWSERBASE", +): Record { + if (process.env.STAGEHAND_V4_CDP_URL) { + return { cdp_url: process.env.STAGEHAND_V4_CDP_URL, rebuild_extension: false }; + } + if (environment === "BROWSERBASE") { + if (!process.env.BROWSERBASE_API_KEY) { + throw new Error("BROWSERBASE_API_KEY is required for understudy_v4."); + } + return { + rebuild_extension: false, + browserbase_session_create_params: { + browserbase_api_key: process.env.BROWSERBASE_API_KEY, + }, + }; + } + return { + rebuild_extension: false, + local_browser_launch_options: { + headless: process.env.EVAL_HEADLESS !== "false", + ...(process.env.CHROME_PATH + ? 
{ executable_path: process.env.CHROME_PATH } + : {}), + }, + }; +} + +function buildCommandByToolName(sdk: UnderstudyV4Sdk): Map { + const commandByToolName = new Map(); + for (const value of Object.values(sdk.StagehandProtocolEvents)) { + if (typeof value !== "function") continue; + const eventClass = value as { + event_type?: unknown; + llm_tool_name?: unknown; + }; + if ( + typeof eventClass.event_type !== "string" || + typeof eventClass.llm_tool_name !== "string" || + !eventClass.event_type.endsWith("Event") + ) { + continue; + } + commandByToolName.set( + eventClass.llm_tool_name, + eventClass.event_type.slice(0, -"Event".length), + ); + } + return commandByToolName; +} + +function commandForPath( + cdp: InstanceType["cdp"], + path: string, +): ((params?: Record) => Promise) | undefined { + const [domain, method] = path.split("."); + if (!domain || !method) return undefined; + const commands = (cdp as unknown as Record)[domain]; + if (!isRecord(commands)) return undefined; + const command = commands[method]; + return typeof command === "function" + ? (command as (params?: Record) => Promise) + : undefined; +} + +function writeBridgeMessage( + message: BridgeReadyMessage | BridgeResultMessage | BridgeEventMessage | BridgeErrorMessage, +): void { + process.stdout.write(`${JSON.stringify(message)}\n`); +} + +function firstPayload(value: unknown): Record { + if (!isRecord(value)) return {}; + const eventResults = + value.event_results ?? + (isRecord(value.event) ? value.event.event_results : undefined); + if (isRecord(eventResults)) { + const first = Object.values(eventResults)[0]; + if (isRecord(first)) { + if (isRecord(first.result)) return first.result; + return first; + } + } + return value; +} + +function stringField( + record: Record, + key: string, +): string | null { + const value = record[key]; + return typeof value === "string" && value.length > 0 ? 
value : null; +} + +function sanitizeForModel(value: unknown): unknown { + if (typeof value === "string") { + return value.length > 2000 + ? `${value.slice(0, 2000)}...[truncated]` + : value; + } + if (Array.isArray(value)) + return value.map((entry) => sanitizeForModel(entry)); + if (!isRecord(value)) return value; + const result: Record = {}; + for (const [key, entry] of Object.entries(value)) { + if ( + key.toLowerCase().includes("screenshot") || + key.toLowerCase().includes("image") + ) { + result[key] = + typeof entry === "string" && entry.length > 80 + ? `${entry.slice(0, 80)}...[truncated]` + : entry; + continue; + } + result[key] = sanitizeForModel(entry); + } + return result; +} + +function updateSelectorMap( + selectorMap: Record>, + value: unknown, +): void { + if (!isRecord(value)) return; + for (const [elementId, selector] of Object.entries(value)) { + if (isRecord(selector)) selectorMap[elementId] = selector; + } +} + +function hydrateSelectorReferences( + value: unknown, + selectorMap: Record>, +): unknown { + if (Array.isArray(value)) { + return value.map((entry) => hydrateSelectorReferences(entry, selectorMap)); + } + if (!isRecord(value)) return value; + const elementId = + typeof value.elementId === "string" ? value.elementId : null; + const mappedSelector = elementId == null ? null : selectorMap[elementId]; + const hydratedRecord = Object.fromEntries( + Object.entries(value) + .filter(([key]) => key !== "elementId") + .map(([key, entry]) => [ + key, + hydrateSelectorReferences(entry, selectorMap), + ]), + ); + return mappedSelector == null + ? 
hydratedRecord + : { ...mappedSelector, ...hydratedRecord }; +} + +function isRecord(value: unknown): value is Record { + return value != null && typeof value === "object" && !Array.isArray(value); +} + +if ( + process.env.UNDERSTUDY_V4_TOOLS_CHILD === "1" && + process.argv[1] && + path.resolve(process.argv[1]) === fileURLToPath(import.meta.url) +) { + void runBridgeChild().catch((error) => { + writeBridgeMessage({ + type: "error", + error: error instanceof Error ? error.message : String(error), + }); + process.exit(1); + }); +} diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index c2277ea36..38eae88c9 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -1,24 +1,11 @@ -import { - AgentProvider, - getAISDKLanguageModel, - loadApiKeyFromEnv, - type AgentInstance, - type AvailableModel, - type LLMClient, - type LogLine, - type V3, -} from "@browserbasehq/stagehand"; -import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; -import { endBrowserbaseSession } from "../browserbaseCleanup.js"; +import type { AgentInstance, V3 } from "@browserbasehq/stagehand"; import { EvalsError } from "../errors.js"; import type { EvalLogger } from "../logger.js"; -import type { V3InitResult } from "../initV3.js"; import type { EvalInput } from "../types/evals.js"; -import { runClaudeCodeAgent } from "./claudeCodeRunner.js"; -import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; -import { runCodexAgent } from "./codexRunner.js"; -import { prepareCodexToolAdapter } from "./codexToolAdapter.js"; -import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import { ClaudeAgentHarness } from "./ClaudeAgentHarness.js"; +import { CodexAgentHarness } from "./CodexAgentHarness.js"; +import { StagehandAgentV3Harness } from "./StagehandAgentV3Harness.js"; +import type { UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js"; import type { 
DiscoveredTask, TaskResult } from "./types.js"; import type { BenchMatrixRow, BenchTaskKind, Harness } from "./benchTypes.js"; @@ -41,6 +28,7 @@ export interface BenchHarnessContext { row: BenchMatrixRow; logger: EvalLogger; v3?: V3; + stagehandV4?: UnderstudyV4NativeRuntime; agent?: AgentInstance; page?: Page; debugUrl: string; @@ -60,28 +48,8 @@ export interface BenchHarness { start(input: BenchHarnessStartInput): Promise; } -function isAgentTask(task: DiscoveredTask): boolean { - return ( - task.primaryCategory === "agent" || - task.categories.includes("agent") || - task.categories.includes("external_agent_benchmarks") - ); -} - -function resolveProvider(modelName: AvailableModel): string | undefined { - if (modelName.includes("/")) { - return modelName.split("/")[0]; - } - - try { - return AgentProvider.getAgentProvider(modelName); - } catch { - return undefined; - } -} - -export const stagehandHarness: BenchHarness = { - harness: "stagehand", +export const StagehandAgentV4Harness: BenchHarness = { + harness: "stagehand_v4", supportedTaskKinds: [ "act", "extract", @@ -90,193 +58,28 @@ export const stagehandHarness: BenchHarness = { "combination", "suite", ], - supportsApi: true, - async start({ - task, - input, - row, - logger, - verbose, - }: BenchHarnessStartInput): Promise { - let v3Result: V3InitResult | undefined; - const createAgent = isAgentTask(task); - if (row.config.harness !== "stagehand") { - throw new EvalsError( - `Harness "${row.config.harness}" is not implemented yet. Use --harness stagehand for the current unified runner.`, - ); - } - const config = row.config; - const agentMode = config.agentMode ?? input.agentMode; - const isCUA = config.isCUA ?? 
input.isCUA; - - if (config.useApi) { - const provider = resolveProvider(input.modelName); - const logFn = (line: LogLine) => logger.log(line); - const apiKey = loadApiKeyFromEnv(provider, logFn); - if (!apiKey) { - throw new EvalsError( - `USE_API=true but no API key found for provider "${provider}".`, - ); - } - const { initV3 } = await import("../initV3.js"); - v3Result = await initV3({ - logger, - modelName: input.modelName, - modelClientOptions: { apiKey }, - createAgent, - agentMode, - isCUA, - verbose, - configOverrides: { env: config.environment }, - }); - } else { - let llmClient: LLMClient | undefined; - if (input.modelName.includes("/")) { - const firstSlashIndex = input.modelName.indexOf("/"); - llmClient = new AISdkClientWrapped({ - model: getAISDKLanguageModel( - input.modelName.substring(0, firstSlashIndex), - input.modelName.substring(firstSlashIndex + 1), - ), - }); - } - const { initV3 } = await import("../initV3.js"); - v3Result = await initV3({ - logger, - llmClient, - modelName: input.modelName, - createAgent, - agentMode, - isCUA, - verbose, - configOverrides: { env: config.environment }, - }); - } - - return { - ctx: { - harness: "stagehand", - row, - logger, - v3: v3Result.v3, - agent: v3Result.agent, - page: v3Result.v3.context.pages()[0], - debugUrl: v3Result.debugUrl ?? "", - sessionUrl: v3Result.sessionUrl ?? 
"", - }, - cleanup: async () => { - if (v3Result?.v3) { - try { - await v3Result.v3.close(); - } catch (closeError) { - console.error( - `Warning: Error closing V3 instance for ${input.name}:`, - closeError, - ); - } - } - await endBrowserbaseSession(v3Result?.v3); - }, - }; - }, -}; - -export const claudeCodeHarness: BenchHarness = { - harness: "claude_code", - supportedTaskKinds: ["agent", "suite"], supportsApi: false, - async execute({ - input, - row, - logger, - signal, - }: BenchHarnessExecuteInput): Promise { - const plan = buildExternalHarnessTaskPlan(input); - if (row.config.harness !== "claude_code") { - throw new EvalsError( - `Expected claude_code harness config, received "${row.config.harness}".`, - ); - } - const toolAdapter = await prepareClaudeCodeToolAdapter({ - toolSurface: row.config.toolSurface, - startupProfile: row.config.startupProfile, - environment: row.config.environment, - plan, - logger, - }); - try { - return await runClaudeCodeAgent({ - plan, - model: input.modelName, - logger, - toolAdapter, - signal, - }); - } finally { - await toolAdapter.cleanup(); - } - }, - async start(): Promise { - throw new EvalsError( - "Claude Code harness execution uses the external harness execute path. 
Use --dry-run to inspect its bench matrix, or run with --harness claude_code.", - ); - }, -}; - -export const codexHarness: BenchHarness = { - harness: "codex", - supportedTaskKinds: ["agent", "suite"], - supportsApi: false, - async execute({ - input, - row, - logger, - signal, - }: BenchHarnessExecuteInput): Promise { - const plan = buildExternalHarnessTaskPlan(input); - if (row.config.harness !== "codex") { - throw new EvalsError( - `Expected codex harness config, received "${row.config.harness}".`, - ); - } - const toolAdapter = await prepareCodexToolAdapter({ - toolSurface: row.config.toolSurface, - startupProfile: row.config.startupProfile, - environment: row.config.environment, - plan, - logger, - }); - try { - return await runCodexAgent({ - plan, - model: input.modelName, - logger, - toolAdapter, - signal, - }); - } finally { - await toolAdapter.cleanup(); - } - }, - async start(): Promise { - throw new EvalsError( - "Codex harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness codex.", - ); + async start(input: BenchHarnessStartInput): Promise { + const module = await import("./StagehandAgentV4Harness.js"); + return module.StagehandAgentV4Harness.start(input); }, }; const harnessRegistry = new Map([ - ["stagehand", stagehandHarness], - ["claude_code", claudeCodeHarness], - ["codex", codexHarness], + ["stagehand_v3", StagehandAgentV3Harness], + ["stagehand_v4", StagehandAgentV4Harness], + ["claude_code", ClaudeAgentHarness], + ["codex", CodexAgentHarness], ]); export function getBenchHarness(harness: Harness): BenchHarness { const implementation = harnessRegistry.get(harness); if (!implementation) { - throw new EvalsError( - `Harness "${harness}" is not implemented yet. 
Use --harness stagehand for the current unified runner.`, - ); + throw new EvalsError(`Harness "${harness}" is not implemented yet.`); } return implementation; } + +export { ClaudeAgentHarness } from "./ClaudeAgentHarness.js"; +export { CodexAgentHarness } from "./CodexAgentHarness.js"; +export { StagehandAgentV3Harness } from "./StagehandAgentV3Harness.js"; diff --git a/packages/evals/framework/benchPlanner.ts b/packages/evals/framework/benchPlanner.ts index 5f93ba39b..dab03b44a 100644 --- a/packages/evals/framework/benchPlanner.ts +++ b/packages/evals/framework/benchPlanner.ts @@ -97,15 +97,16 @@ export function resolveBenchModelEntries( effectiveCategory === "agent" || effectiveCategory === "external_agent_benchmarks"; const harness = options.harness ?? DEFAULT_BENCH_HARNESS; - const requestedAgentModes = - harness === "stagehand" ? resolveRequestedAgentModes(options) : undefined; + const requestedAgentModes = isStagehandAgentHarness(harness) + ? resolveRequestedAgentModes(options) + : undefined; if (options.modelOverride) { const baseModes = isAgentCategory && requestedAgentModes ? requestedAgentModes : [ - harness === "stagehand" + isStagehandAgentHarness(harness) ? resolveAgentModeForModel(options.modelOverride) : "hybrid", ]; @@ -345,9 +346,9 @@ function buildBenchHarnessConfig(input: { startupProfile?: StartupProfile; dataset?: string; }): BenchHarnessConfig { - if (input.harness === "stagehand") { + if (isStagehandAgentHarness(input.harness)) { return { - harness: "stagehand", + harness: input.harness, model: input.model, provider: input.provider, environment: input.environment, @@ -461,7 +462,11 @@ export function generateBenchTestcases( } function rowUsesStagehand(options: Pick): boolean { - return (options.harness ?? DEFAULT_BENCH_HARNESS) === "stagehand"; + return isStagehandAgentHarness(options.harness ?? 
DEFAULT_BENCH_HARNESS); +} + +function isStagehandAgentHarness(harness: Harness): boolean { + return harness === "stagehand_v3" || harness === "stagehand_v4"; } function resolveBenchRowToolSurface( @@ -474,6 +479,19 @@ function resolveBenchRowToolSurface( if (harness === "codex") { return resolveCodexToolSurface(requested); } + if (harness === "stagehand_v4") { + if (requested && requested !== "understudy_v4") { + throw new EvalsError( + `stagehand_v4 uses the UnderstudyV4Tools surface. Received --tool ${requested}.`, + ); + } + return requested ?? "understudy_v4"; + } + if (harness === "stagehand_v3" && requested === "understudy_v4") { + throw new EvalsError( + "Use --harness stagehand_v4 for the UnderstudyV4Tools surface.", + ); + } return requested; } diff --git a/packages/evals/framework/benchRunner.ts b/packages/evals/framework/benchRunner.ts index e719db56d..e8361f0b8 100644 --- a/packages/evals/framework/benchRunner.ts +++ b/packages/evals/framework/benchRunner.ts @@ -29,7 +29,6 @@ export async function executeBenchTask( ): Promise { const logger = new EvalLogger(Boolean(options.verbose)); const harnessName = options.harness ?? DEFAULT_BENCH_HARNESS; - const harness = getBenchHarness(harnessName); const row = buildBenchMatrixRow( task, input.modelName, @@ -38,6 +37,7 @@ export async function executeBenchTask( input.isCUA, input.agentMode, ); + const harness = getBenchHarness(harnessName); let cleanup: () => Promise = async () => {}; let unregisterCleanup: (() => void) | undefined; let harnessCtx: BenchHarnessContext | undefined; @@ -67,8 +67,13 @@ export async function executeBenchTask( harnessCtx = startedHarness.ctx; const taskModule = await loadTaskModuleFromPath(task.filePath, task.name); if (taskModule.definition) { + const taskFn = + taskModule.definition.benchFns?.[harnessCtx.harness] ?? + taskModule.definition.benchFns?.default ?? 
+ taskModule.definition.fn; const ctx = { v3: harnessCtx.v3, + stagehandV4: harnessCtx.stagehandV4, agent: harnessCtx.agent, page: harnessCtx.page, logger, @@ -78,7 +83,7 @@ export async function executeBenchTask( sessionUrl: harnessCtx.sessionUrl, }; return withBenchSessionUrls( - (await taskModule.definition.fn(ctx)) as TaskResult, + (await taskFn(ctx)) as TaskResult, harnessCtx, ); } @@ -86,6 +91,7 @@ export async function executeBenchTask( return withBenchSessionUrls( await taskModule.legacyFn({ v3: harnessCtx.v3, + stagehandV4: harnessCtx.stagehandV4, logger, debugUrl: harnessCtx.debugUrl, sessionUrl: harnessCtx.sessionUrl, @@ -117,10 +123,7 @@ export async function executeBenchTask( return withBenchSessionUrls( { _success: false, - error: - error instanceof Error - ? JSON.parse(JSON.stringify(error, null, 2)) - : String(error), + error: error instanceof Error ? error.message : String(error), logs: logger.getLogs(), }, harnessCtx, diff --git a/packages/evals/framework/benchTypes.ts b/packages/evals/framework/benchTypes.ts index 2a3af7cc6..3fce7d950 100644 --- a/packages/evals/framework/benchTypes.ts +++ b/packages/evals/framework/benchTypes.ts @@ -1,18 +1,20 @@ import type { AgentToolMode, AvailableModel } from "@browserbasehq/stagehand"; import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js"; -export type Harness = "stagehand" | "claude_code" | "codex"; +export type Harness = "stagehand_v3" | "stagehand_v4" | "claude_code" | "codex"; -export const DEFAULT_BENCH_HARNESS: Harness = "stagehand"; +export const DEFAULT_BENCH_HARNESS: Harness = "stagehand_v3"; export const SUPPORTED_BENCH_HARNESSES = [ - "stagehand", + "stagehand_v3", + "stagehand_v4", "claude_code", "codex", ] as const satisfies readonly Harness[]; export const EXECUTABLE_BENCH_HARNESSES = [ - "stagehand", + "stagehand_v3", + "stagehand_v4", "claude_code", "codex", ] as const satisfies readonly Harness[]; @@ -42,7 +44,7 @@ export type BenchTaskKind = | "suite"; export 
interface StagehandHarnessConfig { - harness: "stagehand"; + harness: "stagehand_v3" | "stagehand_v4"; model: AvailableModel; provider?: string; environment: "LOCAL" | "BROWSERBASE"; diff --git a/packages/evals/framework/context.ts b/packages/evals/framework/context.ts index daa8eabea..2b6a979c4 100644 --- a/packages/evals/framework/context.ts +++ b/packages/evals/framework/context.ts @@ -13,7 +13,6 @@ import { type V3InitResult, initV3 } from "../initV3.js"; import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js"; import { coreFixtureRoutes } from "../core/fixtures/index.js"; import { prepareCoreBrowserTarget } from "../core/targets/index.js"; -import { getCoreTool } from "../core/tools/registry.js"; import { ensureCoreFixtureServer } from "../core/fixtures/server.js"; import { EvalLogger } from "../logger.js"; import { createAssertHelpers } from "./assertions.js"; @@ -70,6 +69,7 @@ export async function buildCoreContext( const logger = options.logger ?? new EvalLogger(); const environment = options.environment ?? "LOCAL"; const toolSurface = options.toolSurface ?? "understudy_code"; + const { getCoreTool } = await import("../core/tools/registry.js"); const tool = getCoreTool(toolSurface); const startupProfile = options.startupProfile ?? diff --git a/packages/evals/framework/defineTask.ts b/packages/evals/framework/defineTask.ts index b03d3e037..2f754320b 100644 --- a/packages/evals/framework/defineTask.ts +++ b/packages/evals/framework/defineTask.ts @@ -5,6 +5,8 @@ * the file lives in during auto-discovery. 
*/ import type { + BenchTaskFn, + BenchTaskImplementations, BenchTaskContext, BenchTaskMeta, CoreTaskContext, @@ -34,8 +36,31 @@ export function defineCoreTask( */ export function defineBenchTask( meta: BenchTaskMeta, - fn: (ctx: BenchTaskContext) => Promise, + fn: BenchTaskFn, +): TaskDefinition; +export function defineBenchTask( + meta: BenchTaskMeta, + fn: BenchTaskImplementations, +): TaskDefinition; +export function defineBenchTask( + meta: BenchTaskMeta, + fn: BenchTaskFn | BenchTaskImplementations, ): TaskDefinition { + if (typeof fn !== "function") { + return { + __taskDefinition: true, + meta, + fn: + fn.default ?? + (async () => { + throw new Error( + `No default bench implementation is defined for "${meta.name ?? "unnamed task"}".`, + ); + }), + benchFns: fn, + }; + } + return { __taskDefinition: true, meta, diff --git a/packages/evals/framework/taskLoader.ts b/packages/evals/framework/taskLoader.ts index d7a218ac9..0aeaa5291 100644 --- a/packages/evals/framework/taskLoader.ts +++ b/packages/evals/framework/taskLoader.ts @@ -7,6 +7,7 @@ export interface LoadedTaskDefinition { __taskDefinition: true; meta: unknown; fn: (ctx: unknown) => Promise; + benchFns?: Record Promise) | undefined>; } export type LegacyTaskFn = (ctx: unknown) => Promise; diff --git a/packages/evals/framework/types.ts b/packages/evals/framework/types.ts index 359605b12..fea8e4b86 100644 --- a/packages/evals/framework/types.ts +++ b/packages/evals/framework/types.ts @@ -23,6 +23,8 @@ import type { ToolSurface, } from "../core/contracts/tool.js"; import type { EvalLogger } from "../logger.js"; +import type { Harness } from "./benchTypes.js"; +import type { UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js"; /** Page type inferred from V3.context.pages()[0] */ type Page = ReturnType[number]; @@ -70,6 +72,8 @@ export interface CoreTaskContext { export interface BenchTaskContext { /** Stagehand V3 instance. */ v3: V3; + /** Native Stagehand v4 SDK proxy. 
Present for the stagehand_v4 harness. */ + stagehandV4?: UnderstudyV4NativeRuntime; /** Agent instance (created when the task lives under agent/). */ agent?: AgentInstance; /** Playwright page (convenience — same as v3.context.pages()[0]). */ @@ -128,6 +132,12 @@ export interface MetricsCollector { getSummary(): Record>; } +export type BenchTaskFn = (ctx: BenchTaskContext) => Promise; + +export type BenchTaskImplementations = Partial> & { + default?: BenchTaskFn; +}; + export interface TaskDefinition { /** Marker to identify defineTask outputs during discovery. */ __taskDefinition: true; @@ -135,6 +145,8 @@ export interface TaskDefinition { meta: TaskMeta | BenchTaskMeta; /** The task function. */ fn: (ctx: CoreTaskContext | BenchTaskContext) => Promise; + /** Optional harness-native bench implementations. */ + benchFns?: BenchTaskImplementations; /** Which tier this task was defined for (set during discovery from directory). */ tier?: Tier; } diff --git a/packages/evals/lib/braintrust-report.ts b/packages/evals/lib/braintrust-report.ts index 6fbb0fb99..a86da30e6 100644 --- a/packages/evals/lib/braintrust-report.ts +++ b/packages/evals/lib/braintrust-report.ts @@ -1363,7 +1363,7 @@ export function summarizeBenchCases( function agentConfigKey(benchCase: BenchCaseRow): string { return [ - benchCase.harness ?? "stagehand", + benchCase.harness ?? "stagehand_v3", benchCase.provider ?? "", benchCase.environment ?? "", benchCase.api === undefined ? "" : benchCase.api ? "api" : "local", @@ -1375,7 +1375,7 @@ function agentConfigKey(benchCase: BenchCaseRow): string { function agentConfigLabel(benchCase: BenchCaseRow): string { const parts = [ - benchCase.harness ?? "stagehand", + benchCase.harness ?? 
"stagehand_v3", benchCase.agentMode, benchCase.provider, benchCase.environment, diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts index 7b057e322..76a7bd3fa 100644 --- a/packages/evals/tests/cli.test.ts +++ b/packages/evals/tests/cli.test.ts @@ -85,7 +85,7 @@ describe("CLI entrypoint", () => { expect(payload.envOverrides.EVAL_ENV).toBe("BROWSERBASE"); expect(payload.envOverrides.USE_API).toBe("true"); expect(payload.envOverrides.EVAL_PROVIDER).toBe("openai"); - expect(payload.runOptions.harness).toBe("stagehand"); + expect(payload.runOptions.harness).toBe("stagehand_v3"); expect(payload.runOptions.verbose).toBe(false); }); diff --git a/packages/evals/tests/framework/benchHarness.test.ts b/packages/evals/tests/framework/benchHarness.test.ts index 60989664a..4c5ddf7a4 100644 --- a/packages/evals/tests/framework/benchHarness.test.ts +++ b/packages/evals/tests/framework/benchHarness.test.ts @@ -1,15 +1,31 @@ import { describe, expect, it } from "vitest"; import { - claudeCodeHarness, - codexHarness, + ClaudeAgentHarness, + CodexAgentHarness, getBenchHarness, + StagehandAgentV3Harness, + StagehandAgentV4Harness, } from "../../framework/benchHarness.js"; describe("bench harness registry", () => { + it("registers stagehand_v3 as the v3 Stagehand agent harness", () => { + const harness = getBenchHarness("stagehand_v3"); + + expect(harness).toBe(StagehandAgentV3Harness); + expect(harness.supportsApi).toBe(true); + }); + + it("registers stagehand_v4 as the v4 Stagehand agent harness", () => { + const harness = getBenchHarness("stagehand_v4"); + + expect(harness).toBe(StagehandAgentV4Harness); + expect(harness.supportsApi).toBe(false); + }); + it("registers claude_code as a concrete executable harness", () => { const harness = getBenchHarness("claude_code"); - expect(harness).toBe(claudeCodeHarness); + expect(harness).toBe(ClaudeAgentHarness); expect(harness.supportedTaskKinds).toEqual(["agent", "suite"]); 
expect(harness.supportsApi).toBe(false); expect(harness.execute).toBeDefined(); @@ -18,7 +34,7 @@ describe("bench harness registry", () => { it("registers codex as a concrete executable harness", () => { const harness = getBenchHarness("codex"); - expect(harness).toBe(codexHarness); + expect(harness).toBe(CodexAgentHarness); expect(harness.supportedTaskKinds).toEqual(["agent", "suite"]); expect(harness.supportsApi).toBe(false); expect(harness.execute).toBeDefined(); diff --git a/packages/evals/tests/framework/benchPlanner.test.ts b/packages/evals/tests/framework/benchPlanner.test.ts index fdc885c96..5143b9711 100644 --- a/packages/evals/tests/framework/benchPlanner.test.ts +++ b/packages/evals/tests/framework/benchPlanner.test.ts @@ -34,7 +34,7 @@ describe("benchPlanner", () => { ); expect(row).toMatchObject({ - harness: "stagehand", + harness: "stagehand_v3", task: "dropdown", category: "act", taskKind: "act", @@ -43,7 +43,7 @@ describe("benchPlanner", () => { environment: "BROWSERBASE", useApi: true, config: { - harness: "stagehand", + harness: "stagehand_v3", model: "openai/gpt-4.1-mini", provider: "openai", environment: "BROWSERBASE", @@ -55,13 +55,13 @@ describe("benchPlanner", () => { it("annotates generated bench testcases with harness metadata", () => { const [testcase] = generateBenchTestcases([makeTask()], { modelOverride: "openai/gpt-4.1-mini", - harness: "stagehand", + harness: "stagehand_v3", environment: "LOCAL", }); expect(testcase.input.modelName).toBe("openai/gpt-4.1-mini"); - expect(testcase.tags).toContain("harness/stagehand"); - expect(testcase.metadata.harness).toBe("stagehand"); + expect(testcase.tags).toContain("harness/stagehand_v3"); + expect(testcase.metadata.harness).toBe("stagehand_v3"); expect(testcase.metadata.environment).toBe("LOCAL"); }); @@ -78,7 +78,7 @@ describe("benchPlanner", () => { { modelOverride: cuaModel, datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ); @@ -100,7 +100,7 @@ 
describe("benchPlanner", () => { { modelOverride: "openai/gpt-5.4-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ); @@ -123,7 +123,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentMode: "dom", }, ); @@ -153,7 +153,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], }, ), @@ -191,7 +191,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); @@ -226,7 +226,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], }, ), @@ -265,7 +265,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["cua"], }, ), @@ -296,7 +296,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentMode: "cua", }, ), @@ -323,7 +323,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["cua"], }, ), @@ -338,7 +338,7 @@ describe("benchPlanner", () => { it("does not expand non-agent model overrides across agent modes", () => { const testcases = generateBenchTestcases([makeTask()], { modelOverride: "openai/gpt-4.1-mini", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], }); @@ -475,7 +475,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); @@ -510,7 +510,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", 
datasetFilter: "onlineMind2Web", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); @@ -541,7 +541,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webtailbench", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); diff --git a/packages/evals/tests/framework/benchRunner.test.ts b/packages/evals/tests/framework/benchRunner.test.ts index 08245618d..bd4afc8c9 100644 --- a/packages/evals/tests/framework/benchRunner.test.ts +++ b/packages/evals/tests/framework/benchRunner.test.ts @@ -96,7 +96,7 @@ describe("bench runner", () => { tasks: [task], registry: makeRegistry([task]), environment: "BROWSERBASE", - harness: "stagehand", + harness: "stagehand_v3", verbose: false, }, ); diff --git a/packages/evals/tests/framework/defineTask.test.ts b/packages/evals/tests/framework/defineTask.test.ts index c531d676d..01ae3abda 100644 --- a/packages/evals/tests/framework/defineTask.test.ts +++ b/packages/evals/tests/framework/defineTask.test.ts @@ -44,6 +44,25 @@ describe("defineBenchTask", () => { expect((result.meta as any).models).toEqual(["openai/gpt-4o"]); }); + + it("preserves harness-native bench implementations", async () => { + const stagehandV3 = vi.fn(async () => ({ _success: true, version: 3 })); + const stagehandV4 = vi.fn(async () => ({ _success: true, version: 4 })); + const result = defineBenchTask( + { name: "native_versions" }, + { + stagehand_v3: stagehandV3 as any, + stagehand_v4: stagehandV4 as any, + }, + ); + + await expect(result.benchFns?.stagehand_v4?.({} as any)).resolves.toEqual({ + _success: true, + version: 4, + }); + expect(stagehandV3).toHaveBeenCalledTimes(0); + expect(stagehandV4).toHaveBeenCalledTimes(1); + }); }); describe("defineTask", () => { diff --git a/packages/evals/tests/tui/parse.test.ts b/packages/evals/tests/tui/parse.test.ts index bb42c9fc6..532057ad6 100644 --- a/packages/evals/tests/tui/parse.test.ts +++ b/packages/evals/tests/tui/parse.test.ts @@ -18,7 +18,7 @@ 
describe("resolveRunOptions", () => { it("defaults to the stagehand bench harness", () => { const resolved = resolveRunOptions({}, {}, {}); - expect(resolved.harness).toBe("stagehand"); + expect(resolved.harness).toBe("stagehand_v3"); }); it("accepts known bench harnesses", () => { diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts index 36be3e1aa..5f8a3fd83 100644 --- a/packages/evals/tests/tui/run.test.ts +++ b/packages/evals/tests/tui/run.test.ts @@ -115,7 +115,7 @@ describe("deriveCategoryFilter", () => { concurrency: 1, environment: "LOCAL", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", envOverrides: {}, dryRun: true, preview: false, @@ -149,7 +149,7 @@ describe("deriveCategoryFilter", () => { environment: "BROWSERBASE", model: "openai/gpt-4.1-mini", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", datasetFilter: "webvoyager", envOverrides: { EVAL_MAX_K: "1", @@ -169,7 +169,7 @@ describe("deriveCategoryFilter", () => { task: "agent/webvoyager", dataset: "webvoyager", model: "openai/gpt-4.1-mini", - harness: "stagehand", + harness: "stagehand_v3", agentMode: "dom", environment: "BROWSERBASE", useApi: false, @@ -195,7 +195,7 @@ describe("deriveCategoryFilter", () => { environment: "BROWSERBASE", model: "openai/gpt-4.1-mini", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], datasetFilter: "webvoyager", envOverrides: { @@ -413,7 +413,8 @@ describe("deriveCategoryFilter", () => { }); it("allows executable harnesses without env gates", () => { - expect(canExecuteBenchHarness("stagehand")).toBe(true); + expect(canExecuteBenchHarness("stagehand_v3")).toBe(true); + expect(canExecuteBenchHarness("stagehand_v4")).toBe(true); expect(canExecuteBenchHarness("claude_code")).toBe(true); expect(canExecuteBenchHarness("codex")).toBe(true); }); @@ -442,7 +443,7 @@ describe("deriveCategoryFilter", () => { environment: "BROWSERBASE", model: 
"openai/gpt-4.1-mini", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], envOverrides: {}, dryRun: false, @@ -460,7 +461,7 @@ describe("deriveCategoryFilter", () => { "Plan: 2 tasks × 1 model × 2 modes × 4 trials = 16 runs", ); expect(output).toContain( - "Env: BROWSERBASE Harness: stagehand Concurrency: 25", + "Env: BROWSERBASE Harness: stagehand_v3 Concurrency: 25", ); expect(runEvalsMock).toHaveBeenCalledOnce(); }); @@ -516,7 +517,7 @@ describe("buildCombinations (preview column-pruning)", () => { category: null, dataset: null, model, - harness: "stagehand", + harness: "stagehand_v3", agentMode, environment: "BROWSERBASE", useApi: false, diff --git a/packages/evals/tui/commands/run.ts b/packages/evals/tui/commands/run.ts index 01df1fa45..6b460ea17 100644 --- a/packages/evals/tui/commands/run.ts +++ b/packages/evals/tui/commands/run.ts @@ -23,6 +23,7 @@ import type { ResolvedRunOptions } from "./parse.js"; import { withEnvOverrides } from "./parse.js"; import { getRuntimeTasksRoot } from "../../runtimePaths.js"; import { + DEFAULT_BENCH_HARNESS, isExecutableBenchHarness, type Harness, } from "../../framework/benchTypes.js"; @@ -239,11 +240,11 @@ export async function runCommand( if ( options.useApi && - options.harness !== "stagehand" && + (options.harness ?? DEFAULT_BENCH_HARNESS) !== "stagehand_v3" && tasks.some((t) => t.tier === "bench") ) { throw new Error( - `Harness "${options.harness}" does not support --api. Use --harness stagehand for API-backed bench runs.`, + `Harness "${options.harness}" does not support --api. Use --harness stagehand_v3 for API-backed bench runs.`, ); } @@ -257,9 +258,18 @@ export async function runCommand( tasks.some((t) => t.tier === "bench") ) { throw new Error( - `Harness "${options.harness}" is dry-run only for now. Use --harness stagehand, --harness claude_code, or --harness codex for executable bench runs.`, + `Harness "${options.harness}" is dry-run only for now. 
Use --harness stagehand_v3, --harness stagehand_v4, --harness claude_code, or --harness codex for executable bench runs.`, ); } + if ( + options.harness === "stagehand_v4" && + tasks.some((t) => t.tier === "bench") + ) { + const { assertUnderstudyV4SdkAvailable } = await import( + "../../framework/UnderstudyV4Tools.js" + ); + assertUnderstudyV4SdkAvailable(); + } const matrix = await buildDryRunMatrix(options, tasks, registry); console.log(`\n ${bold("Running:")} ${cyan(buildRunTargetLabel(options))}`); From 8567cfc6f0cd4369808104cb7f33b70654f938c1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 03:38:37 -0700 Subject: [PATCH 02/12] Rename understudy eval tool surfaces --- packages/evals/ARCHITECTURE.mmd | 2 +- packages/evals/core/contracts/tool.ts | 4 +-- packages/evals/core/tools/registry.ts | 8 ++--- ...derstudy_code.ts => understudy_v3_code.ts} | 14 ++++---- .../framework/StagehandAgentV4Harness.ts | 10 +++--- packages/evals/framework/UnderstudyV4Tools.ts | 10 +++--- packages/evals/framework/benchHarness.ts | 2 +- packages/evals/framework/benchPlanner.ts | 34 +++++++++---------- packages/evals/framework/benchRunner.ts | 4 +-- packages/evals/framework/context.ts | 4 +-- packages/evals/framework/runner.ts | 2 +- packages/evals/framework/types.ts | 2 +- packages/evals/tests/cli.test.ts | 14 ++++---- .../framework/claudeCodeToolAdapter.test.ts | 2 +- .../evals/tests/framework/context.test.ts | 8 ++--- .../evals/tests/framework/core-runner.test.ts | 4 +-- packages/evals/tui/commands/core.ts | 4 +-- packages/evals/tui/commands/help.ts | 8 ++--- 18 files changed, 67 insertions(+), 69 deletions(-) rename packages/evals/core/tools/{understudy_code.ts => understudy_v3_code.ts} (96%) diff --git a/packages/evals/ARCHITECTURE.mmd b/packages/evals/ARCHITECTURE.mmd index 97e87aac7..c1c312f26 100644 --- a/packages/evals/ARCHITECTURE.mmd +++ b/packages/evals/ARCHITECTURE.mmd @@ -48,7 +48,7 @@ flowchart TB CoreContext["framework/context.ts
buildCoreContext"] FixtureServer["core/fixtures
local deterministic pages"] CoreTargets["core/targets
local Chrome
Browserbase CDP"] - CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"] + CoreTools["core/tools registry
understudy_v3_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"] CoreAssertions["assertions + metrics
adapter-backed results"] CoreDeps["core/runtime/coreDeps.ts
browserbase + ws
lazy require"] end diff --git a/packages/evals/core/contracts/tool.ts b/packages/evals/core/contracts/tool.ts index 9b179121d..0790e81d6 100644 --- a/packages/evals/core/contracts/tool.ts +++ b/packages/evals/core/contracts/tool.ts @@ -17,8 +17,8 @@ import type { } from "./results.js"; export type ToolSurface = - | "understudy_code" - | "understudy_v4" + | "understudy_v3_code" + | "understudy_v4_code" | "playwright_code" | "cdp_code" | "playwright_mcp" diff --git a/packages/evals/core/tools/registry.ts b/packages/evals/core/tools/registry.ts index 65384f137..81f29395c 100644 --- a/packages/evals/core/tools/registry.ts +++ b/packages/evals/core/tools/registry.ts @@ -4,11 +4,11 @@ import { CdpCodeTool } from "./cdp_code.js"; import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js"; import { PlaywrightCodeTool } from "./playwright_code.js"; import { PlaywrightMcpTool } from "./playwright_mcp.js"; -import { UnderstudyCodeTool } from "./understudy_code.js"; +import { UnderstudyV3CodeTool } from "./understudy_v3_code.js"; export function listCoreTools(): ToolSurface[] { return [ - "understudy_code", + "understudy_v3_code", "playwright_code", "cdp_code", "playwright_mcp", @@ -19,8 +19,8 @@ export function listCoreTools(): ToolSurface[] { export function getCoreTool(toolSurface: ToolSurface): CoreTool { switch (toolSurface) { - case "understudy_code": - return new UnderstudyCodeTool(); + case "understudy_v3_code": + return new UnderstudyV3CodeTool(); case "playwright_code": return new PlaywrightCodeTool(); case "cdp_code": diff --git a/packages/evals/core/tools/understudy_code.ts b/packages/evals/core/tools/understudy_v3_code.ts similarity index 96% rename from packages/evals/core/tools/understudy_code.ts rename to packages/evals/core/tools/understudy_v3_code.ts index 80709d729..20834007c 100644 --- a/packages/evals/core/tools/understudy_code.ts +++ b/packages/evals/core/tools/understudy_v3_code.ts @@ -222,7 +222,7 @@ class UnderstudyPageHandle implements 
CorePageHandle { return; default: throw new Error( - `understudy_code does not support click target kind "${target.kind}" yet`, + `understudy_v3_code does not support click target kind "${target.kind}" yet`, ); } } @@ -253,7 +253,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; default: throw new Error( - `understudy_code does not support hover target kind "${target.kind}" yet`, + `understudy_v3_code does not support hover target kind "${target.kind}" yet`, ); } } @@ -298,7 +298,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; default: throw new Error( - `understudy_code does not support type target kind "${target.kind}" yet`, + `understudy_v3_code does not support type target kind "${target.kind}" yet`, ); } } @@ -335,7 +335,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; default: throw new Error( - `understudy_code does not support press target kind "${target.kind}" yet`, + `understudy_v3_code does not support press target kind "${target.kind}" yet`, ); } } @@ -462,8 +462,8 @@ function connectionModeFromProfile( return "launch"; } -export class UnderstudyCodeTool implements CoreTool { - readonly id = "understudy_code"; +export class UnderstudyV3CodeTool implements CoreTool { + readonly id = "understudy_v3_code"; readonly surface = "code"; readonly family = "understudy"; readonly supportedStartupProfiles: StartupProfile[] = [ @@ -485,7 +485,7 @@ export class UnderstudyCodeTool implements CoreTool { async start(input: ToolStartInput): Promise { if (input.startupProfile === "tool_attach_local_cdp") { throw new Error( - `understudy_code does not support startup profile "${input.startupProfile}" yet`, + `understudy_v3_code does not support startup profile "${input.startupProfile}" yet`, ); } diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index dff4c2078..0032291e0 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ 
b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -51,9 +51,9 @@ export const StagehandAgentV4Harness: BenchHarness = { `Expected stagehand_v4 harness config, received "${row.config.harness}".`, ); } - if (row.config.toolSurface !== "understudy_v4") { + if (row.config.toolSurface !== "understudy_v4_code") { throw new EvalsError( - `StagehandAgentV4Harness requires --tool understudy_v4; received "${row.config.toolSurface ?? "default"}".`, + `StagehandAgentV4Harness requires --tool understudy_v4_code; received "${row.config.toolSurface ?? "default"}".`, ); } if (row.config.useApi) { @@ -88,7 +88,7 @@ export const StagehandAgentV4Harness: BenchHarness = { }, })) as { error?: unknown; logTree?: unknown }; logger.log({ - category: "understudy_v4", + category: "understudy_v4_code", message: typeof result.logTree === "string" ? `v4 bus.logTree()\n${result.logTree}` @@ -99,7 +99,7 @@ export const StagehandAgentV4Harness: BenchHarness = { }); } catch (dashboardError) { logger.warn({ - category: "understudy_v4", + category: "understudy_v4_code", message: `Unable to print v4 bus.logTree(): ${ dashboardError instanceof Error ? dashboardError.message @@ -163,7 +163,7 @@ export const StagehandAgentV4Harness: BenchHarness = { row, logger, v3: v3Result.v3, - stagehandV4: understudyV4Tools.stagehandV4, + v4: understudyV4Tools.stagehandV4, agent: v3Result.agent, page: v4Page as unknown as Page, debugUrl: v3Result.debugUrl ?? 
"", diff --git a/packages/evals/framework/UnderstudyV4Tools.ts b/packages/evals/framework/UnderstudyV4Tools.ts index 272742213..5e01eef9a 100644 --- a/packages/evals/framework/UnderstudyV4Tools.ts +++ b/packages/evals/framework/UnderstudyV4Tools.ts @@ -123,7 +123,7 @@ export async function startUnderstudyV4Tools(input: { const message = parseBridgeMessage(line); if (!message) { input.logger.log({ - category: "understudy_v4", + category: "understudy_v4_code", message: line, level: 1, }); @@ -159,7 +159,7 @@ export async function startUnderstudyV4Tools(input: { child.stderr.on("data", (chunk: Buffer) => { for (const line of chunk.toString("utf8").split(/\r?\n/).filter(Boolean)) { input.logger.warn({ - category: "understudy_v4", + category: "understudy_v4_code", message: line, level: 1, }); @@ -186,7 +186,7 @@ export async function startUnderstudyV4Tools(input: { const ready = await readyPromise; input.logger.log({ - category: "understudy_v4", + category: "understudy_v4_code", message: `Connected v4 tools at ${ready.cdpUrl}`, level: 1, }); @@ -279,7 +279,7 @@ function buildUnderstudyV4ToolSet( inputSchema: ai.jsonSchema(schema), execute: async (args) => { logger.log({ - category: "understudy_v4", + category: "understudy_v4_code", message: `Agent calling v4 tool: ${name}`, level: 1, auxiliary: { @@ -547,7 +547,7 @@ function understudyV4ClientOptions( } if (environment === "BROWSERBASE") { if (!process.env.BROWSERBASE_API_KEY) { - throw new Error("BROWSERBASE_API_KEY is required for understudy_v4."); + throw new Error("BROWSERBASE_API_KEY is required for understudy_v4_code."); } return { rebuild_extension: false, diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts index 38eae88c9..1cc580a90 100644 --- a/packages/evals/framework/benchHarness.ts +++ b/packages/evals/framework/benchHarness.ts @@ -28,7 +28,7 @@ export interface BenchHarnessContext { row: BenchMatrixRow; logger: EvalLogger; v3?: V3; - stagehandV4?: 
UnderstudyV4NativeRuntime; + v4?: UnderstudyV4NativeRuntime; agent?: AgentInstance; page?: Page; debugUrl: string; diff --git a/packages/evals/framework/benchPlanner.ts b/packages/evals/framework/benchPlanner.ts index dab03b44a..1b6a48875 100644 --- a/packages/evals/framework/benchPlanner.ts +++ b/packages/evals/framework/benchPlanner.ts @@ -97,7 +97,9 @@ export function resolveBenchModelEntries( effectiveCategory === "agent" || effectiveCategory === "external_agent_benchmarks"; const harness = options.harness ?? DEFAULT_BENCH_HARNESS; - const requestedAgentModes = isStagehandAgentHarness(harness) + const usesStagehandHarness = + harness === "stagehand_v3" || harness === "stagehand_v4"; + const requestedAgentModes = usesStagehandHarness ? resolveRequestedAgentModes(options) : undefined; @@ -106,7 +108,7 @@ export function resolveBenchModelEntries( isAgentCategory && requestedAgentModes ? requestedAgentModes : [ - isStagehandAgentHarness(harness) + usesStagehandHarness ? resolveAgentModeForModel(options.modelOverride) : "hybrid", ]; @@ -346,7 +348,7 @@ function buildBenchHarnessConfig(input: { startupProfile?: StartupProfile; dataset?: string; }): BenchHarnessConfig { - if (isStagehandAgentHarness(input.harness)) { + if (input.harness === "stagehand_v3" || input.harness === "stagehand_v4") { return { harness: input.harness, model: input.model, @@ -388,6 +390,9 @@ export function generateBenchTestcases( modelEntries, ); const allTestcases = [...suiteTestcases.testcases]; + const harness = options.harness ?? DEFAULT_BENCH_HARNESS; + const usesStagehandHarness = + harness === "stagehand_v3" || harness === "stagehand_v4"; if (options.harness === "claude_code" || options.harness === "codex") { if (suiteTestcases.remainingTasks.length > 0) { @@ -410,16 +415,16 @@ export function generateBenchTestcases( model, options, undefined, - isAgentCategory && rowUsesStagehand(options) + isAgentCategory && usesStagehandHarness ? 
entry.mode === "cua" : undefined, - isAgentCategory && rowUsesStagehand(options) + isAgentCategory && usesStagehandHarness ? (options.agentMode ?? entry.mode) : undefined, ); const agentMode = row.agentMode; const includeStagehandAgentMode = - isAgentCategory && rowUsesStagehand(options) && agentMode; + isAgentCategory && usesStagehandHarness && agentMode; allTestcases.push({ input: { name: task.name, @@ -461,14 +466,6 @@ export function generateBenchTestcases( return allTestcases; } -function rowUsesStagehand(options: Pick): boolean { - return isStagehandAgentHarness(options.harness ?? DEFAULT_BENCH_HARNESS); -} - -function isStagehandAgentHarness(harness: Harness): boolean { - return harness === "stagehand_v3" || harness === "stagehand_v4"; -} - function resolveBenchRowToolSurface( harness: Harness, requested?: ToolSurface, @@ -480,14 +477,14 @@ function resolveBenchRowToolSurface( return resolveCodexToolSurface(requested); } if (harness === "stagehand_v4") { - if (requested && requested !== "understudy_v4") { + if (requested && requested !== "understudy_v4_code") { throw new EvalsError( `stagehand_v4 uses the UnderstudyV4Tools surface. Received --tool ${requested}.`, ); } - return requested ?? "understudy_v4"; + return requested ?? "understudy_v4_code"; } - if (harness === "stagehand_v3" && requested === "understudy_v4") { + if (harness === "stagehand_v3" && requested === "understudy_v4_code") { throw new EvalsError( "Use --harness stagehand_v4 for the UnderstudyV4Tools surface.", ); @@ -565,7 +562,8 @@ function withBenchMetadata( task: DiscoveredTask, options: BenchPlanOptions, ): Testcase { - const isStagehand = rowUsesStagehand(options); + const harness = options.harness ?? DEFAULT_BENCH_HARNESS; + const isStagehand = harness === "stagehand_v3" || harness === "stagehand_v4"; const agentMode = isStagehand ? (options.agentMode ?? 
testcase.input.agentMode) : undefined; diff --git a/packages/evals/framework/benchRunner.ts b/packages/evals/framework/benchRunner.ts index e8361f0b8..51193dc20 100644 --- a/packages/evals/framework/benchRunner.ts +++ b/packages/evals/framework/benchRunner.ts @@ -73,7 +73,7 @@ export async function executeBenchTask( taskModule.definition.fn; const ctx = { v3: harnessCtx.v3, - stagehandV4: harnessCtx.stagehandV4, + v4: harnessCtx.v4, agent: harnessCtx.agent, page: harnessCtx.page, logger, @@ -91,7 +91,7 @@ export async function executeBenchTask( return withBenchSessionUrls( await taskModule.legacyFn({ v3: harnessCtx.v3, - stagehandV4: harnessCtx.stagehandV4, + v4: harnessCtx.v4, logger, debugUrl: harnessCtx.debugUrl, sessionUrl: harnessCtx.sessionUrl, diff --git a/packages/evals/framework/context.ts b/packages/evals/framework/context.ts index 2b6a979c4..3b09e4fcd 100644 --- a/packages/evals/framework/context.ts +++ b/packages/evals/framework/context.ts @@ -40,7 +40,7 @@ export function resolveDefaultCoreStartupProfile( return environment === "BROWSERBASE" ? "tool_create_browserbase" : "tool_launch_local"; - case "understudy_code": + case "understudy_v3_code": case "playwright_code": case "cdp_code": case "playwright_mcp": @@ -68,7 +68,7 @@ export async function buildCoreContext( ): Promise { const logger = options.logger ?? new EvalLogger(); const environment = options.environment ?? "LOCAL"; - const toolSurface = options.toolSurface ?? "understudy_code"; + const toolSurface = options.toolSurface ?? 
"understudy_v3_code"; const { getCoreTool } = await import("../core/tools/registry.js"); const tool = getCoreTool(toolSurface); const startupProfile = diff --git a/packages/evals/framework/runner.ts b/packages/evals/framework/runner.ts index 336db1c02..8147a8fc1 100644 --- a/packages/evals/framework/runner.ts +++ b/packages/evals/framework/runner.ts @@ -321,7 +321,7 @@ export async function runEvals( (t: DiscoveredTask) => t.tier === "core", ); const effectiveCoreToolSurface = hasCoreOnly - ? (options.coreToolSurface ?? "understudy_code") + ? (options.coreToolSurface ?? "understudy_v3_code") : undefined; const effectiveCoreStartupProfile = hasCoreOnly && effectiveCoreToolSurface diff --git a/packages/evals/framework/types.ts b/packages/evals/framework/types.ts index fea8e4b86..0441b0d98 100644 --- a/packages/evals/framework/types.ts +++ b/packages/evals/framework/types.ts @@ -73,7 +73,7 @@ export interface BenchTaskContext { /** Stagehand V3 instance. */ v3: V3; /** Native Stagehand v4 SDK proxy. Present for the stagehand_v4 harness. */ - stagehandV4?: UnderstudyV4NativeRuntime; + v4?: UnderstudyV4NativeRuntime; /** Agent instance (created when the task lives under agent/). */ agent?: AgentInstance; /** Playwright page (convenience — same as v3.context.pages()[0]). 
*/ diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts index 76a7bd3fa..357f8764f 100644 --- a/packages/evals/tests/cli.test.ts +++ b/packages/evals/tests/cli.test.ts @@ -173,7 +173,7 @@ describe.sequential("core config", () => { const { stdout, code } = await runCli(["config", "core"]); expect(code).toBe(0); expect(stdout).toContain("Core configuration"); - expect(stdout).toContain("runner default: understudy_code"); + expect(stdout).toContain("runner default: understudy_v3_code"); }); it("persists tool via `config core set tool`", async () => { @@ -183,18 +183,18 @@ describe.sequential("core config", () => { "core", "set", "tool", - "understudy_code", + "understudy_v3_code", ]); expect(setResult.code).toBe(0); - expect(setResult.stdout).toContain("Set core.tool to understudy_code"); + expect(setResult.stdout).toContain("Set core.tool to understudy_v3_code"); const saved = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8")); - expect(saved.core?.tool).toBe("understudy_code"); + expect(saved.core?.tool).toBe("understudy_v3_code"); }); it("flows persisted core.tool into run dry-run output", async () => { resetConfig(); - await runCli(["config", "core", "set", "tool", "understudy_code"]); + await runCli(["config", "core", "set", "tool", "understudy_v3_code"]); const { stdout, code } = await runCli([ "run", @@ -203,7 +203,7 @@ describe.sequential("core config", () => { ]); expect(code).toBe(0); const payload = JSON.parse(stdout); - expect(payload.runOptions.coreToolSurface).toBe("understudy_code"); + expect(payload.runOptions.coreToolSurface).toBe("understudy_v3_code"); }); it("rejects unknown tool", async () => { @@ -271,7 +271,7 @@ describe.sequential("core config", () => { it("reset clears the whole core section", async () => { resetConfig(); - await runCli(["config", "core", "set", "tool", "understudy_code"]); + await runCli(["config", "core", "set", "tool", "understudy_v3_code"]); const { code } = await runCli(["config", "core", 
"reset"]); expect(code).toBe(0); diff --git a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts index e28775652..798dd72e4 100644 --- a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts +++ b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts @@ -55,7 +55,7 @@ describe("claude code tool adapter resolution", () => { }); it("rejects unsupported Claude Code tool surfaces for now", () => { - expect(() => resolveClaudeCodeToolSurface("understudy_code")).toThrow( + expect(() => resolveClaudeCodeToolSurface("understudy_v3_code")).toThrow( /supports --tool browse_cli, playwright_code, or cdp_code/, ); }); diff --git a/packages/evals/tests/framework/context.test.ts b/packages/evals/tests/framework/context.test.ts index 742378635..4073ce561 100644 --- a/packages/evals/tests/framework/context.test.ts +++ b/packages/evals/tests/framework/context.test.ts @@ -4,7 +4,7 @@ import { prepareCoreBrowserTarget } from "../../core/targets/index.js"; describe("resolveDefaultCoreStartupProfile", () => { it("uses runner-provided local CDP for code surfaces in LOCAL", () => { - expect(resolveDefaultCoreStartupProfile("understudy_code", "LOCAL")).toBe( + expect(resolveDefaultCoreStartupProfile("understudy_v3_code", "LOCAL")).toBe( "runner_provided_local_cdp", ); expect(resolveDefaultCoreStartupProfile("playwright_code", "LOCAL")).toBe( @@ -29,7 +29,7 @@ describe("resolveDefaultCoreStartupProfile", () => { it("uses runner-provided Browserbase CDP for code surfaces in BROWSERBASE", () => { expect( - resolveDefaultCoreStartupProfile("understudy_code", "BROWSERBASE"), + resolveDefaultCoreStartupProfile("understudy_v3_code", "BROWSERBASE"), ).toBe("runner_provided_browserbase_cdp"); expect( resolveDefaultCoreStartupProfile("playwright_code", "BROWSERBASE"), @@ -55,7 +55,7 @@ describe("resolveDefaultCoreStartupProfile", () => { await expect( prepareCoreBrowserTarget({ environment: "BROWSERBASE", - 
toolSurface: "understudy_code", + toolSurface: "understudy_v3_code", startupProfile: "runner_provided_local_cdp", }), ).rejects.toThrow(/requires LOCAL environment/); @@ -65,7 +65,7 @@ describe("resolveDefaultCoreStartupProfile", () => { await expect( prepareCoreBrowserTarget({ environment: "LOCAL", - toolSurface: "understudy_code", + toolSurface: "understudy_v3_code", startupProfile: "runner_provided_browserbase_cdp", }), ).rejects.toThrow(/requires BROWSERBASE environment/); diff --git a/packages/evals/tests/framework/core-runner.test.ts b/packages/evals/tests/framework/core-runner.test.ts index 6d2879fb4..242978f9a 100644 --- a/packages/evals/tests/framework/core-runner.test.ts +++ b/packages/evals/tests/framework/core-runner.test.ts @@ -122,7 +122,7 @@ describe("core runner", () => { }, startupProfile: "runner_provided_local_cdp", adapter: { - name: "understudy_code", + name: "understudy_v3_code", family: "understudy", surface: "code", metadata: { @@ -183,7 +183,7 @@ describe("core runner", () => { concurrency: 1, trials: 1, environment: "LOCAL", - coreToolSurface: "understudy_code", + coreToolSurface: "understudy_v3_code", coreStartupProfile: "runner_provided_local_cdp", }); diff --git a/packages/evals/tui/commands/core.ts b/packages/evals/tui/commands/core.ts index 409fb8c28..592f776bc 100644 --- a/packages/evals/tui/commands/core.ts +++ b/packages/evals/tui/commands/core.ts @@ -88,7 +88,7 @@ export function printCoreConfig(entryDir: string): void { console.log(`\n ${bold("Core configuration:")}\n`); console.log( - ` ${cyan("tool")} ${core.tool ?? gray("(runner default: understudy_code)")}`, + ` ${cyan("tool")} ${core.tool ?? gray("(runner default: understudy_v3_code)")}`, ); console.log( ` ${cyan("startup")} ${core.startup ?? gray("(inferred from tool + env)")}`, @@ -147,7 +147,7 @@ async function setCoreKey( console.error( red(" Cannot set startup without a tool. 
Set core.tool first."), ); - console.log(dim(` Example: evals core config set tool understudy_code`)); + console.log(dim(` Example: evals core config set tool understudy_v3_code`)); process.exitCode = 1; return; } diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts index 95b49c766..9ef90a4ff 100644 --- a/packages/evals/tui/commands/help.ts +++ b/packages/evals/tui/commands/help.ts @@ -79,7 +79,7 @@ export function printRunHelp(): void { "", row( `${cyan("--tool")} ${dim("")}`, - `Core tool surface ${gray("(understudy_code, playwright_code, ...)")}`, + `Core tool surface ${gray("(understudy_v3_code, playwright_code, ...)")}`, ), row(`${cyan("--startup")} ${dim("")}`, "Core startup profile"), "", @@ -121,7 +121,7 @@ export function printRunHelp(): void { ` ${bold("Examples:")}`, "", ` ${dim("$")} evals run act -t 3 -c 5`, - ` ${dim("$")} evals run navigation/open --tool understudy_code`, + ` ${dim("$")} evals run navigation/open --tool understudy_v3_code`, ` ${dim("$")} evals run b:webvoyager -l 10`, ` ${dim("$")} evals run b:onlineMind2Web -l 25`, ` ${dim("$")} evals run b:webtailbench -l 10`, @@ -200,12 +200,12 @@ export function printConfigHelp(): void { ), row(cyan("setup"), `Interactive wizard ${gray("(coming soon)")}`), "", - ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`, + ` ${bold("Valid core tools:")} ${gray("understudy_v3_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`, "", ` ${bold("Examples:")}`, "", ` ${dim("$")} evals config set trials 5`, - ` ${dim("$")} evals config core set tool understudy_code`, + ` ${dim("$")} evals config core set tool understudy_v3_code`, ` ${dim("$")} evals config core set startup tool_launch_local`, ` ${dim("$")} evals config core reset`, "", From e6a8cc4c98542c6373f15bd1bc6bafe1a59f5292 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 04:03:15 -0700 
Subject: [PATCH 03/12] Fix v4 evals lint --- .../framework/StagehandAgentV4Harness.ts | 107 +++++++++++++----- packages/evals/framework/UnderstudyV4Tools.ts | 54 ++++++--- .../evals/tests/framework/context.test.ts | 6 +- packages/evals/tui/commands/core.ts | 4 +- 4 files changed, 127 insertions(+), 44 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 0032291e0..615f7734d 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -10,7 +10,10 @@ import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; import { endBrowserbaseSession } from "../browserbaseCleanup.js"; import { EvalsError } from "../errors.js"; import type { V3InitResult } from "../initV3.js"; -import { startUnderstudyV4Tools, type UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js"; +import { + startUnderstudyV4Tools, + type UnderstudyV4NativeRuntime, +} from "./UnderstudyV4Tools.js"; import type { BenchHarness, BenchHarnessStartInput, @@ -145,7 +148,10 @@ export const StagehandAgentV4Harness: BenchHarness = { await printV4BusLogTree(); return await closeV3(); }; - const v4Page = await installStagehandV4BenchFacade(v3Result.v3, understudyV4Tools.stagehandV4); + const v4Page = await installStagehandV4BenchFacade( + v3Result.v3, + understudyV4Tools.stagehandV4, + ); if (createAgent) { v3Result.agent = v3Result.v3.agent({ @@ -260,10 +266,20 @@ async function installStagehandV4BenchFacade( if (typeof event.targetId === "string") pageState.targetId = event.targetId; if (typeof event.url === "string") pageState.url = event.url; }; - stagehandV4.cdp.on("Stagehand.BrowserPageNavigated", updatePageStateFromBrowserEvent); - stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", updatePageStateFromBrowserEvent); + stagehandV4.cdp.on( + "Stagehand.BrowserPageNavigated", + updatePageStateFromBrowserEvent, + ); + stagehandV4.cdp.on( + 
"Stagehand.BrowserPageLoaded", + updatePageStateFromBrowserEvent, + ); - const page = createStagehandV4PageFacade(stagehandV4, pageState, refreshPageInfo); + const page = createStagehandV4PageFacade( + stagehandV4, + pageState, + refreshPageInfo, + ); const pages = (): Record[] => [page]; const context = v3.context as unknown as Record; @@ -273,9 +289,14 @@ async function installStagehandV4BenchFacade( return page; }; - v3.observe = (async (a?: string | Record, b?: Record) => { + v3.observe = (async ( + a?: string | Record, + b?: Record, + ) => { const instruction = typeof a === "string" ? a : undefined; - const options = (typeof a === "string" ? b : a) as Record | undefined; + const options = (typeof a === "string" ? b : a) as + | Record + | undefined; const result = await stagehandV4.cdp.Stagehand.AIObserve({ ...(instruction != null ? { instruction } : {}), ...selectorParam(options), @@ -285,7 +306,10 @@ async function installStagehandV4BenchFacade( return Array.isArray(observed) ? observed : []; }) as V3["observe"]; - v3.act = (async (input: string | Record, options?: Record) => { + v3.act = (async ( + input: string | Record, + options?: Record, + ) => { const result = await stagehandV4.cdp.Stagehand.AIAct( typeof input === "string" ? { @@ -309,7 +333,9 @@ async function installStagehandV4BenchFacade( ) => { const instruction = typeof a === "string" ? a : undefined; const schema = isZodSchema(b) ? z.toJSONSchema(b) : undefined; - const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as Record | undefined; + const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as + | Record + | undefined; const result = await stagehandV4.cdp.Stagehand.AIExtract({ ...(instruction != null ? { instruction } : {}), ...(schema != null ? 
{ schema: schema as Record } : {}), @@ -342,20 +368,26 @@ function createStagehandV4PageFacade( }; timer = setTimeout(() => { stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); - reject(new Error("Timed out waiting for Stagehand.BrowserPageLoaded.")); + reject( + new Error("Timed out waiting for Stagehand.BrowserPageLoaded."), + ); }, 30_000); stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", onLoaded); }); const [rawResult] = await Promise.all([ stagehandV4.cdp.Stagehand.BrowserPageGoto({ url, - selector: pageState.targetId != null ? { targetId: pageState.targetId } : { active: true }, + selector: + pageState.targetId != null + ? { targetId: pageState.targetId } + : { active: true }, }), loaded, ]); const result = unwrapStagehandV4Result(rawResult); if (isRecord(result)) { - if (typeof result.targetId === "string") pageState.targetId = result.targetId; + if (typeof result.targetId === "string") + pageState.targetId = result.targetId; if (typeof result.url === "string") pageState.url = result.url; } await refreshPageInfo(); @@ -374,15 +406,16 @@ function createStagehandV4PageFacade( }, async waitForLoadState() { await new Promise((resolve, reject) => { - let timer: ReturnType; const onLoaded = (): void => { clearTimeout(timer); stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); resolve(); }; - timer = setTimeout(() => { + const timer = setTimeout(() => { stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); - reject(new Error("Timed out waiting for Stagehand.BrowserPageLoaded.")); + reject( + new Error("Timed out waiting for Stagehand.BrowserPageLoaded."), + ); }, 30_000); stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", onLoaded); }); @@ -395,7 +428,9 @@ function createStagehandV4PageFacade( : String(expressionOrFn); const result = unwrapStagehandV4Result( await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ - ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + ...(pageState.targetId != null + ? 
{ targetId: pageState.targetId } + : {}), arg: isJsonValue(arg) ? arg : undefined, awaitPromise: true, expression, @@ -405,12 +440,16 @@ function createStagehandV4PageFacade( return isRecord(result) && "value" in result ? result.value : result; }, locator() { - throw new Error("stagehand_v4 evals must use v4 protocol actions instead of v3 page.locator()."); + throw new Error( + "stagehand_v4 evals must use v4 protocol actions instead of v3 page.locator().", + ); }, }; } -function normalizeV4Action(action: Record): Record { +function normalizeV4Action( + action: Record, +): Record { return { ...action, selector: normalizeV4Selector(action.selector), @@ -419,26 +458,37 @@ function normalizeV4Action(action: Record): Record | undefined): Record { +function selectorParam( + options: Record | undefined, +): Record { const selector = normalizeV4Selector(options?.selector); return selector == null ? {} : { selector }; } -function normalizeV4Selector(value: unknown): Record | undefined { +function normalizeV4Selector( + value: unknown, +): Record | undefined { if (value == null) return undefined; if (isRecord(value)) return value; if (typeof value !== "string" || value.length === 0) return undefined; - if (value.startsWith("xpath=")) return { xpath: value.slice("xpath=".length) }; + if (value.startsWith("xpath=")) + return { xpath: value.slice("xpath=".length) }; if (value.startsWith("/") || value.startsWith("(")) return { xpath: value }; return { css: value }; } -function workflowOptionsParam(options: Record | undefined): Record { +function workflowOptionsParam( + options: Record | undefined, +): Record { if (!options) return {}; const workflowOptions: Record = {}; - if (typeof options.timeout === "number") workflowOptions.timeout = options.timeout; - if (isJsonValue(options.variables)) workflowOptions.variables = options.variables; - return Object.keys(workflowOptions).length === 0 ? 
{} : { options: workflowOptions }; + if (typeof options.timeout === "number") + workflowOptions.timeout = options.timeout; + if (isJsonValue(options.variables)) + workflowOptions.variables = options.variables; + return Object.keys(workflowOptions).length === 0 + ? {} + : { options: workflowOptions }; } function unwrapStagehandV4Result(value: unknown): unknown { @@ -459,7 +509,12 @@ function isZodSchema(value: unknown): value is z.ZodType { function isJsonValue(value: unknown): boolean { if (value == null) return true; - if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") return true; + if ( + typeof value === "string" || + typeof value === "number" || + typeof value === "boolean" + ) + return true; if (Array.isArray(value)) return value.every(isJsonValue); if (!isRecord(value)) return false; return Object.values(value).every(isJsonValue); diff --git a/packages/evals/framework/UnderstudyV4Tools.ts b/packages/evals/framework/UnderstudyV4Tools.ts index 5e01eef9a..cba3e1c20 100644 --- a/packages/evals/framework/UnderstudyV4Tools.ts +++ b/packages/evals/framework/UnderstudyV4Tools.ts @@ -71,10 +71,7 @@ export interface UnderstudyV4NativeRuntime { cdp: { on(eventName: string, listener: (event: unknown) => void): void; off(eventName: string, listener: (event: unknown) => void): void; - Mod: Record< - string, - (params?: Record) => Promise - >; + Mod: Record) => Promise>; Stagehand: Record< string, (params?: Record) => Promise @@ -212,7 +209,9 @@ export async function startUnderstudyV4Tools(input: { listeners.add(listener); if (!subscribedEvents.has(eventName)) { subscribedEvents.add(eventName); - child.stdin.write(`${JSON.stringify({ type: "subscribe", name: eventName })}\n`); + child.stdin.write( + `${JSON.stringify({ type: "subscribe", name: eventName })}\n`, + ); } }, off(eventName, listener) { @@ -387,10 +386,24 @@ async function closeBridge( function parseBridgeMessage( line: string, -): BridgeReadyMessage | BridgeResultMessage | 
BridgeEventMessage | BridgeErrorMessage | null { +): + | BridgeReadyMessage + | BridgeResultMessage + | BridgeEventMessage + | BridgeErrorMessage + | null { try { - const parsed = JSON.parse(line) as BridgeReadyMessage | BridgeResultMessage | BridgeEventMessage | BridgeErrorMessage; - if (parsed.type === "ready" || parsed.type === "result" || parsed.type === "event" || parsed.type === "error") { + const parsed = JSON.parse(line) as + | BridgeReadyMessage + | BridgeResultMessage + | BridgeEventMessage + | BridgeErrorMessage; + if ( + parsed.type === "ready" || + parsed.type === "result" || + parsed.type === "event" || + parsed.type === "error" + ) { return parsed; } } catch { @@ -452,9 +465,11 @@ async function runBridgeChild(): Promise { if (message.type === "subscribe") { if (!client) throw new Error("Understudy v4 tools were not initialized."); const name = message.name; - if (typeof name !== "string") throw new Error("Event subscription requires an event name."); + if (typeof name !== "string") + throw new Error("Event subscription requires an event name."); if (!eventSubscriptions.has(name)) { - const listener = (event: unknown): void => writeBridgeMessage({ type: "event", name, event }); + const listener = (event: unknown): void => + writeBridgeMessage({ type: "event", name, event }); eventSubscriptions.set(name, listener); client.cdp.on(name, listener); } @@ -483,7 +498,9 @@ async function runBridgeChild(): Promise { if (!command) { throw new Error( `The v4 SDK does not expose ${ - message.type === "command" ? commandName : `Stagehand.${commandName}` + message.type === "command" + ? 
commandName + : `Stagehand.${commandName}` }.`, ); } @@ -543,11 +560,16 @@ function understudyV4ClientOptions( environment: "LOCAL" | "BROWSERBASE", ): Record { if (process.env.STAGEHAND_V4_CDP_URL) { - return { cdp_url: process.env.STAGEHAND_V4_CDP_URL, rebuild_extension: false }; + return { + cdp_url: process.env.STAGEHAND_V4_CDP_URL, + rebuild_extension: false, + }; } if (environment === "BROWSERBASE") { if (!process.env.BROWSERBASE_API_KEY) { - throw new Error("BROWSERBASE_API_KEY is required for understudy_v4_code."); + throw new Error( + "BROWSERBASE_API_KEY is required for understudy_v4_code.", + ); } return { rebuild_extension: false, @@ -605,7 +627,11 @@ function commandForPath( } function writeBridgeMessage( - message: BridgeReadyMessage | BridgeResultMessage | BridgeEventMessage | BridgeErrorMessage, + message: + | BridgeReadyMessage + | BridgeResultMessage + | BridgeEventMessage + | BridgeErrorMessage, ): void { process.stdout.write(`${JSON.stringify(message)}\n`); } diff --git a/packages/evals/tests/framework/context.test.ts b/packages/evals/tests/framework/context.test.ts index 4073ce561..c6683afe4 100644 --- a/packages/evals/tests/framework/context.test.ts +++ b/packages/evals/tests/framework/context.test.ts @@ -4,9 +4,9 @@ import { prepareCoreBrowserTarget } from "../../core/targets/index.js"; describe("resolveDefaultCoreStartupProfile", () => { it("uses runner-provided local CDP for code surfaces in LOCAL", () => { - expect(resolveDefaultCoreStartupProfile("understudy_v3_code", "LOCAL")).toBe( - "runner_provided_local_cdp", - ); + expect( + resolveDefaultCoreStartupProfile("understudy_v3_code", "LOCAL"), + ).toBe("runner_provided_local_cdp"); expect(resolveDefaultCoreStartupProfile("playwright_code", "LOCAL")).toBe( "runner_provided_local_cdp", ); diff --git a/packages/evals/tui/commands/core.ts b/packages/evals/tui/commands/core.ts index 592f776bc..45cf7b7f1 100644 --- a/packages/evals/tui/commands/core.ts +++ b/packages/evals/tui/commands/core.ts 
@@ -147,7 +147,9 @@ async function setCoreKey( console.error( red(" Cannot set startup without a tool. Set core.tool first."), ); - console.log(dim(` Example: evals core config set tool understudy_v3_code`)); + console.log( + dim(` Example: evals core config set tool understudy_v3_code`), + ); process.exitCode = 1; return; } From 43032bea1d37f99d666a154e92a2454bde7464a6 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 04:06:47 -0700 Subject: [PATCH 04/12] Fix v4 waitForLoadState tracking --- .../framework/StagehandAgentV4Harness.ts | 203 ++++++++++++++---- 1 file changed, 157 insertions(+), 46 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 615f7734d..94235c16f 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -22,6 +22,27 @@ import type { } from "./benchHarness.js"; type Page = ReturnType[number]; +type StagehandV4LoadState = + | "init" + | "domcontentloaded" + | "loaded" + | "networkidle2" + | "networkidle"; + +const STAGEHAND_V4_LOAD_STATE_ORDER: Record = { + init: 0, + domcontentloaded: 1, + loaded: 2, + networkidle2: 3, + networkidle: 4, +}; + +type StagehandV4PageState = { + targetId?: string; + title: string; + url: string; + loadState?: StagehandV4LoadState; +}; function isAgentTask(task: BenchHarnessStartInput["task"]): boolean { return ( @@ -238,11 +259,7 @@ async function installStagehandV4BenchFacade( v3: V3, stagehandV4: UnderstudyV4NativeRuntime, ): Promise> { - const pageState: { - targetId?: string; - title: string; - url: string; - } = { + const pageState: StagehandV4PageState = { title: "", url: "about:blank", }; @@ -262,16 +279,18 @@ async function installStagehandV4BenchFacade( await refreshPageInfo().catch(() => {}); const updatePageStateFromBrowserEvent = (event: unknown): void => { - if (!isRecord(event)) return; - if (typeof event.targetId === "string") 
pageState.targetId = event.targetId; - if (typeof event.url === "string") pageState.url = event.url; + updatePageStateFromStagehandV4Event(pageState, event); + }; + const updatePageStateFromNavigationEvent = (event: unknown): void => { + updatePageStateFromStagehandV4Event(pageState, event); + pageState.loadState = "init"; }; stagehandV4.cdp.on( "Stagehand.BrowserPageNavigated", - updatePageStateFromBrowserEvent, + updatePageStateFromNavigationEvent, ); stagehandV4.cdp.on( - "Stagehand.BrowserPageLoaded", + "Stagehand.BrowserPageLoadStateChanged", updatePageStateFromBrowserEvent, ); @@ -350,30 +369,21 @@ async function installStagehandV4BenchFacade( function createStagehandV4PageFacade( stagehandV4: UnderstudyV4NativeRuntime, - pageState: { - targetId?: string; - title: string; - url: string; - }, + pageState: StagehandV4PageState, refreshPageInfo: () => Promise, ): Record { return { - async goto(url: string) { - let timer: ReturnType; - const loaded = new Promise((resolve, reject) => { - const onLoaded = (): void => { - clearTimeout(timer); - stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); - resolve(); - }; - timer = setTimeout(() => { - stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); - reject( - new Error("Timed out waiting for Stagehand.BrowserPageLoaded."), - ); - }, 30_000); - stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", onLoaded); - }); + async goto(url: string, options?: unknown) { + pageState.loadState = "init"; + const loaded = waitForStagehandV4LoadState( + stagehandV4, + pageState, + isRecord(options) && "waitUntil" in options + ? 
options.waitUntil + : undefined, + loadStateTimeoutMs(options), + false, + ); const [rawResult] = await Promise.all([ stagehandV4.cdp.Stagehand.BrowserPageGoto({ url, @@ -404,21 +414,14 @@ function createStagehandV4PageFacade( await refreshPageInfo(); return pageState.title; }, - async waitForLoadState() { - await new Promise((resolve, reject) => { - const onLoaded = (): void => { - clearTimeout(timer); - stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); - resolve(); - }; - const timer = setTimeout(() => { - stagehandV4.cdp.off("Stagehand.BrowserPageLoaded", onLoaded); - reject( - new Error("Timed out waiting for Stagehand.BrowserPageLoaded."), - ); - }, 30_000); - stagehandV4.cdp.on("Stagehand.BrowserPageLoaded", onLoaded); - }); + async waitForLoadState(state?: unknown, options?: unknown) { + await waitForStagehandV4LoadState( + stagehandV4, + pageState, + state, + loadStateTimeoutMs(options), + true, + ); await refreshPageInfo(); }, async evaluate(expressionOrFn: unknown, arg?: unknown) { @@ -447,6 +450,114 @@ function createStagehandV4PageFacade( }; } +async function waitForStagehandV4LoadState( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, + state: unknown, + timeoutMs: number, + acceptCurrentState: boolean, +): Promise { + const expectedState = normalizeStagehandV4LoadState(state); + if ( + acceptCurrentState && + pageState.loadState != null && + STAGEHAND_V4_LOAD_STATE_ORDER[pageState.loadState] >= + STAGEHAND_V4_LOAD_STATE_ORDER[expectedState] + ) { + return; + } + + await new Promise((resolve, reject) => { + const onLoadStateChanged = (event: unknown): void => { + const eventTargetId = targetIdFromStagehandV4Event(event); + if ( + eventTargetId != null && + pageState.targetId != null && + eventTargetId !== pageState.targetId + ) { + return; + } + updatePageStateFromStagehandV4Event(pageState, event); + if ( + pageState.loadState != null && + STAGEHAND_V4_LOAD_STATE_ORDER[pageState.loadState] >= + 
STAGEHAND_V4_LOAD_STATE_ORDER[expectedState] + ) { + clearTimeout(timer); + stagehandV4.cdp.off( + "Stagehand.BrowserPageLoadStateChanged", + onLoadStateChanged, + ); + resolve(); + } + }; + const timer = setTimeout(() => { + stagehandV4.cdp.off( + "Stagehand.BrowserPageLoadStateChanged", + onLoadStateChanged, + ); + reject( + new Error( + `Timed out waiting for Stagehand.BrowserPageLoadStateChanged(${expectedState}).`, + ), + ); + }, timeoutMs); + stagehandV4.cdp.on( + "Stagehand.BrowserPageLoadStateChanged", + onLoadStateChanged, + ); + }); +} + +function updatePageStateFromStagehandV4Event( + pageState: StagehandV4PageState, + event: unknown, +): void { + if (!isRecord(event)) return; + const targetId = targetIdFromStagehandV4Event(event); + if (targetId != null) pageState.targetId = targetId; + if (typeof event.url === "string") pageState.url = event.url; + if (isRecord(event.selector) && typeof event.selector.url === "string") { + pageState.url = event.selector.url; + } + if (isStagehandV4LoadState(event.loadState)) { + pageState.loadState = event.loadState; + } +} + +function targetIdFromStagehandV4Event(event: unknown): string | undefined { + if (!isRecord(event)) return undefined; + if (typeof event.targetId === "string") return event.targetId; + if (isRecord(event.selector) && typeof event.selector.targetId === "string") { + return event.selector.targetId; + } + return undefined; +} + +function normalizeStagehandV4LoadState(state: unknown): StagehandV4LoadState { + if (state == null || state === "load" || state === "loaded") return "loaded"; + if (isStagehandV4LoadState(state)) return state; + throw new Error(`Unsupported stagehand_v4 waitForLoadState state: ${state}`); +} + +function isStagehandV4LoadState(value: unknown): value is StagehandV4LoadState { + return ( + value === "init" || + value === "domcontentloaded" || + value === "loaded" || + value === "networkidle2" || + value === "networkidle" + ); +} + +function loadStateTimeoutMs(options: unknown): 
number { + if (!isRecord(options)) return 30_000; + const timeout = options.timeoutMs ?? options.timeout; + return typeof timeout === "number" && Number.isFinite(timeout) + ? Math.max(0, timeout) + : 30_000; +} + function normalizeV4Action( action: Record, ): Record { From 804bbd8b4184244bf7390f45a097266dddcd8247 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 10:40:27 -0700 Subject: [PATCH 05/12] Support v4 protocol-backed eval assertions --- .../framework/StagehandAgentV4Harness.ts | 205 +++++++++++++++++- 1 file changed, 202 insertions(+), 3 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 94235c16f..ca78f762e 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -442,14 +442,213 @@ function createStagehandV4PageFacade( ); return isRecord(result) && "value" in result ? result.value : result; }, - locator() { - throw new Error( - "stagehand_v4 evals must use v4 protocol actions instead of v3 page.locator().", + locator(selector: unknown) { + return createStagehandV4LocatorFacade( + stagehandV4, + pageState, + [], + selector, + ); + }, + frameLocator(selector: unknown) { + return createStagehandV4FrameLocatorFacade( + stagehandV4, + pageState, + [selector], + ); + }, + }; +} + +function createStagehandV4FrameLocatorFacade( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, + frameSelectors: unknown[], +): Record { + return { + frameLocator(selector: unknown) { + return createStagehandV4FrameLocatorFacade(stagehandV4, pageState, [ + ...frameSelectors, + selector, + ]); + }, + locator(selector: unknown) { + return createStagehandV4LocatorFacade( + stagehandV4, + pageState, + frameSelectors, + selector, + ); + }, + async evaluate(expressionOrFn: unknown, arg?: unknown) { + const expression = + typeof expressionOrFn === "function" + ? 
`(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})` + : String(expressionOrFn); + return await evaluateStagehandV4LocatorScript( + stagehandV4, + pageState, + frameSelectors, + null, + "evaluate", + expression, + ); + }, + }; +} + +function createStagehandV4LocatorFacade( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, + frameSelectors: unknown[], + selector: unknown, +): Record { + const read = async (method: string) => + await evaluateStagehandV4LocatorScript( + stagehandV4, + pageState, + frameSelectors, + selector, + method, + ); + return { + first() { + return createStagehandV4LocatorFacade( + stagehandV4, + pageState, + frameSelectors, + selector, ); }, + async inputValue() { + return await read("inputValue"); + }, + async isChecked() { + return await read("isChecked"); + }, + async textContent() { + return await read("textContent"); + }, + async innerText() { + return await read("innerText"); + }, + async innerHtml() { + return await read("innerHtml"); + }, + async innerHTML() { + return await read("innerHtml"); + }, + async click() { + if (frameSelectors.length > 0) { + await read("click"); + return; + } + await stagehandV4.cdp.Stagehand.BrowserPageClick({ + selector: { + ...normalizeV4Selector(selector), + ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + }, + }); + }, + async backendNodeId() { + if (frameSelectors.length > 0) { + throw new Error( + "stagehand_v4 frame locator backendNodeId assertions require a selector returned by v4 observe.", + ); + } + const result = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageRequestElementInfo({ + selector: { + ...normalizeV4Selector(selector), + ...(pageState.targetId != null + ? 
{ targetId: pageState.targetId } + : {}), + }, + }), + ); + if (isRecord(result) && typeof result.backendNodeId === "number") { + return result.backendNodeId; + } + throw new Error("stagehand_v4 locator could not resolve backendNodeId."); + }, }; } +async function evaluateStagehandV4LocatorScript( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, + frameSelectors: unknown[], + selector: unknown, + method: string, + expression?: string, +): Promise { + const result = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ + ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + awaitPromise: true, + expression: `(() => { + const frameSelectors = ${JSON.stringify(frameSelectors.map(normalizeV4Selector))}; + const selector = ${JSON.stringify(selector == null ? null : normalizeV4Selector(selector))}; + const method = ${JSON.stringify(method)}; + const expression = ${JSON.stringify(expression ?? null)}; + + const normalizeCss = (value) => String(value).replace(/\\s*>>\\s*/g, " "); + const resolve = (doc, input) => { + if (!input) return null; + if (input.xpath) { + return doc.evaluate( + String(input.xpath).replace(/^xpath=/, ""), + doc, + null, + XPathResult.FIRST_ORDERED_NODE_TYPE, + null, + ).singleNodeValue; + } + if (input.css) return doc.querySelector(normalizeCss(input.css)); + if (input.text) { + const walker = doc.createTreeWalker(doc.body ?? 
doc.documentElement, NodeFilter.SHOW_ELEMENT); + while (walker.nextNode()) { + const candidate = walker.currentNode; + if ((candidate.innerText || candidate.textContent || "").includes(input.text)) return candidate; + } + } + return null; + }; + + let doc = document; + let win = window; + for (const frameSelector of frameSelectors) { + const frame = resolve(doc, frameSelector); + if (!frame || !frame.contentWindow || !frame.contentDocument) { + throw new Error("Unable to resolve frame locator."); + } + win = frame.contentWindow; + doc = frame.contentDocument; + } + + if (method === "evaluate") { + return win.eval(expression); + } + + const element = resolve(doc, selector); + if (!element) throw new Error("Unable to resolve locator."); + if (method === "inputValue") return element.value ?? ""; + if (method === "isChecked") return Boolean(element.checked); + if (method === "textContent") return element.textContent; + if (method === "innerText") return element.innerText ?? element.textContent ?? ""; + if (method === "innerHtml") return element.innerHTML ?? ""; + if (method === "click") { + element.click(); + return null; + } + throw new Error("Unsupported stagehand_v4 locator method: " + method); + })()`, + returnByValue: true, + }), + ); + return isRecord(result) && "value" in result ? 
result.value : result; +} + async function waitForStagehandV4LoadState( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, From b7a96fd349c54f83bb9806e5a51a4a8a38f83487 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 10:47:35 -0700 Subject: [PATCH 06/12] Use v4 element info for eval locator assertions --- .../framework/StagehandAgentV4Harness.ts | 204 ++++++------------ 1 file changed, 65 insertions(+), 139 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index ca78f762e..dcac05bcc 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -443,19 +443,12 @@ function createStagehandV4PageFacade( return isRecord(result) && "value" in result ? result.value : result; }, locator(selector: unknown) { - return createStagehandV4LocatorFacade( - stagehandV4, - pageState, - [], - selector, - ); + return createStagehandV4LocatorFacade(stagehandV4, pageState, selector); }, frameLocator(selector: unknown) { - return createStagehandV4FrameLocatorFacade( - stagehandV4, - pageState, - [selector], - ); + return createStagehandV4FrameLocatorFacade(stagehandV4, pageState, [ + selector, + ]); }, }; } @@ -473,26 +466,30 @@ function createStagehandV4FrameLocatorFacade( ]); }, locator(selector: unknown) { - return createStagehandV4LocatorFacade( - stagehandV4, - pageState, - frameSelectors, - selector, - ); + return createStagehandV4LocatorFacade(stagehandV4, pageState, selector); }, async evaluate(expressionOrFn: unknown, arg?: unknown) { const expression = typeof expressionOrFn === "function" ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? 
[] : [arg])})` : String(expressionOrFn); - return await evaluateStagehandV4LocatorScript( - stagehandV4, - pageState, - frameSelectors, - null, - "evaluate", - expression, + if (frameSelectors.length > 0) { + throw new Error( + "stagehand_v4 frameLocator.evaluate is not implemented by the v4 protocol-backed eval facade yet.", + ); + } + const result = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ + ...(pageState.targetId != null + ? { targetId: pageState.targetId } + : {}), + arg: isJsonValue(arg) ? arg : undefined, + awaitPromise: true, + expression, + returnByValue: true, + }), ); + return isRecord(result) && "value" in result ? result.value : result; }, }; } @@ -500,153 +497,82 @@ function createStagehandV4FrameLocatorFacade( function createStagehandV4LocatorFacade( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, - frameSelectors: unknown[], selector: unknown, ): Record { - const read = async (method: string) => - await evaluateStagehandV4LocatorScript( - stagehandV4, - pageState, - frameSelectors, - selector, - method, - ); + const read = async () => + await requestStagehandV4ElementInfo(stagehandV4, pageState, selector); return { first() { - return createStagehandV4LocatorFacade( - stagehandV4, - pageState, - frameSelectors, - selector, - ); + return createStagehandV4LocatorFacade(stagehandV4, pageState, selector); }, async inputValue() { - return await read("inputValue"); + return (await read()).inputValue ?? ""; }, async isChecked() { - return await read("isChecked"); + return Boolean((await read()).checked); }, async textContent() { - return await read("textContent"); + return (await read()).textContent ?? null; }, async innerText() { - return await read("innerText"); + const info = await read(); + return info.innerText ?? info.textContent ?? ""; }, async innerHtml() { - return await read("innerHtml"); + return (await read()).innerHTML ?? 
""; }, async innerHTML() { - return await read("innerHtml"); + return (await read()).innerHTML ?? ""; }, async click() { - if (frameSelectors.length > 0) { - await read("click"); - return; - } await stagehandV4.cdp.Stagehand.BrowserPageClick({ - selector: { - ...normalizeV4Selector(selector), - ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), - }, + selector: stagehandV4SelectorFor(pageState, selector), }); }, async backendNodeId() { - if (frameSelectors.length > 0) { - throw new Error( - "stagehand_v4 frame locator backendNodeId assertions require a selector returned by v4 observe.", - ); - } - const result = unwrapStagehandV4Result( - await stagehandV4.cdp.Stagehand.BrowserPageRequestElementInfo({ - selector: { - ...normalizeV4Selector(selector), - ...(pageState.targetId != null - ? { targetId: pageState.targetId } - : {}), - }, - }), - ); - if (isRecord(result) && typeof result.backendNodeId === "number") { - return result.backendNodeId; - } - throw new Error("stagehand_v4 locator could not resolve backendNodeId."); + return (await read()).backendNodeId; }, }; } -async function evaluateStagehandV4LocatorScript( +async function requestStagehandV4ElementInfo( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, - frameSelectors: unknown[], selector: unknown, - method: string, - expression?: string, -): Promise { +): Promise<{ + backendNodeId: number; + checked?: boolean | null; + innerHTML?: string | null; + innerText?: string | null; + inputValue?: string | null; + textContent?: string | null; +}> { const result = unwrapStagehandV4Result( - await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ - ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), - awaitPromise: true, - expression: `(() => { - const frameSelectors = ${JSON.stringify(frameSelectors.map(normalizeV4Selector))}; - const selector = ${JSON.stringify(selector == null ? 
null : normalizeV4Selector(selector))}; - const method = ${JSON.stringify(method)}; - const expression = ${JSON.stringify(expression ?? null)}; - - const normalizeCss = (value) => String(value).replace(/\\s*>>\\s*/g, " "); - const resolve = (doc, input) => { - if (!input) return null; - if (input.xpath) { - return doc.evaluate( - String(input.xpath).replace(/^xpath=/, ""), - doc, - null, - XPathResult.FIRST_ORDERED_NODE_TYPE, - null, - ).singleNodeValue; - } - if (input.css) return doc.querySelector(normalizeCss(input.css)); - if (input.text) { - const walker = doc.createTreeWalker(doc.body ?? doc.documentElement, NodeFilter.SHOW_ELEMENT); - while (walker.nextNode()) { - const candidate = walker.currentNode; - if ((candidate.innerText || candidate.textContent || "").includes(input.text)) return candidate; - } - } - return null; - }; - - let doc = document; - let win = window; - for (const frameSelector of frameSelectors) { - const frame = resolve(doc, frameSelector); - if (!frame || !frame.contentWindow || !frame.contentDocument) { - throw new Error("Unable to resolve frame locator."); - } - win = frame.contentWindow; - doc = frame.contentDocument; - } - - if (method === "evaluate") { - return win.eval(expression); - } - - const element = resolve(doc, selector); - if (!element) throw new Error("Unable to resolve locator."); - if (method === "inputValue") return element.value ?? ""; - if (method === "isChecked") return Boolean(element.checked); - if (method === "textContent") return element.textContent; - if (method === "innerText") return element.innerText ?? element.textContent ?? ""; - if (method === "innerHtml") return element.innerHTML ?? 
""; - if (method === "click") { - element.click(); - return null; - } - throw new Error("Unsupported stagehand_v4 locator method: " + method); - })()`, - returnByValue: true, + await stagehandV4.cdp.Stagehand.BrowserPageRequestElementInfo({ + selector: stagehandV4SelectorFor(pageState, selector), }), ); - return isRecord(result) && "value" in result ? result.value : result; + if (isRecord(result) && typeof result.backendNodeId === "number") { + return result as { + backendNodeId: number; + checked?: boolean | null; + innerHTML?: string | null; + innerText?: string | null; + inputValue?: string | null; + textContent?: string | null; + }; + } + throw new Error("stagehand_v4 locator could not resolve element info."); +} + +function stagehandV4SelectorFor( + pageState: StagehandV4PageState, + selector: unknown, +): Record { + return { + ...normalizeV4Selector(selector), + ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + }; } async function waitForStagehandV4LoadState( From 8696703d30ce2a91129397ed54808b75f2dec833 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 11:02:28 -0700 Subject: [PATCH 07/12] Return page text for no-schema v4 extract --- packages/evals/framework/StagehandAgentV4Harness.ts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index dcac05bcc..8c7cb8519 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -355,6 +355,18 @@ async function installStagehandV4BenchFacade( const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as | Record | undefined; + if (schema == null) { + const summary = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageDOMSummary({ + ...selectorParam(options), + }), + ); + const pageText = + isRecord(summary) && typeof summary.pageText === "string" + ? 
summary.pageText + : ""; + return { extraction: pageText, pageText }; + } const result = await stagehandV4.cdp.Stagehand.AIExtract({ ...(instruction != null ? { instruction } : {}), ...(schema != null ? { schema: schema as Record } : {}), From f0bf83af517aa1a9b1fcd0cd32b56dc911699534 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 12:53:15 -0700 Subject: [PATCH 08/12] Adapt v4 harness actions to flattened params --- .../framework/StagehandAgentV4Harness.ts | 45 +++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 8c7cb8519..1d3f91253 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -698,14 +698,53 @@ function loadStateTimeoutMs(options: unknown): number { function normalizeV4Action( action: Record, ): Record { + const method = + typeof action.method === "string" ? normalizeV4ActionMethod(action.method) : null; + const selector = normalizeV4Selector(action.selector); + let args: Record = {}; + if (isRecord(action.arguments)) { + args = action.arguments; + } else if (Array.isArray(action.arguments)) { + const positional = action.arguments + .filter((value): value is string => typeof value === "string"); + const first = positional[0]; + if (method === "type") { + args = { text: first ?? "" }; + } else if (method === "keys") { + args = { key: first ?? "", method: "press" }; + } else if (method === "goto") { + args = { url: first ?? "" }; + } else if (method === "wait") { + const ms = Number(first); + args = { ms: Number.isFinite(ms) ? ms : 1000 }; + } else if (method === "scroll" || method === "scrollTo") { + const numberValue = first?.endsWith("%") + ? Number.parseFloat(first) + : Number(first); + args = first?.includes("%") + ? { percent: first } + : method === "scroll" + ? { deltaY: Number.isFinite(numberValue) ? 
numberValue : 0 } + : { y: Number.isFinite(numberValue) ? numberValue : 0 }; + } else if (method === "dragAndDrop") { + args = { + from: selector, + to: normalizeV4Selector(first) ?? selector, + }; + } + } return { ...action, - selector: normalizeV4Selector(action.selector), - method: typeof action.method === "string" ? action.method : null, - arguments: Array.isArray(action.arguments) ? action.arguments : null, + selector, + method, + arguments: args, }; } +function normalizeV4ActionMethod(method: string): string { + return method === "press" ? "keys" : method; +} + function selectorParam( options: Record | undefined, ): Record { From 29622e3e00a407959ff74794055946543ae4a383 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 15:38:31 -0700 Subject: [PATCH 09/12] Format Stagehand v4 harness --- packages/evals/framework/StagehandAgentV4Harness.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 1d3f91253..2efbb200c 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -699,14 +699,17 @@ function normalizeV4Action( action: Record, ): Record { const method = - typeof action.method === "string" ? normalizeV4ActionMethod(action.method) : null; + typeof action.method === "string" + ? normalizeV4ActionMethod(action.method) + : null; const selector = normalizeV4Selector(action.selector); let args: Record = {}; if (isRecord(action.arguments)) { args = action.arguments; } else if (Array.isArray(action.arguments)) { - const positional = action.arguments - .filter((value): value is string => typeof value === "string"); + const positional = action.arguments.filter( + (value): value is string => typeof value === "string", + ); const first = positional[0]; if (method === "type") { args = { text: first ?? 
"" }; From 103cddbe47a9f2c6970d7e24e15ffaed9ee79e4c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 6 May 2026 16:47:36 -0700 Subject: [PATCH 10/12] fix stagehand v4 eval target tracking --- .../framework/StagehandAgentV4Harness.ts | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 2efbb200c..3da967b54 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -37,6 +37,14 @@ const STAGEHAND_V4_LOAD_STATE_ORDER: Record = { networkidle: 4, }; +function isInternalStagehandV4PageUrl(url: string | undefined): boolean { + return ( + url == null || + url === "about:blank" || + /^chrome(?:-[a-z]+)?:\/\//u.test(url) + ); +} + type StagehandV4PageState = { targetId?: string; title: string; @@ -400,9 +408,10 @@ function createStagehandV4PageFacade( stagehandV4.cdp.Stagehand.BrowserPageGoto({ url, selector: - pageState.targetId != null + pageState.targetId != null && + !isInternalStagehandV4PageUrl(pageState.url) ? 
{ targetId: pageState.targetId } - : { active: true }, + : { active: false }, }), loaded, ]); @@ -652,6 +661,13 @@ function updatePageStateFromStagehandV4Event( ): void { if (!isRecord(event)) return; const targetId = targetIdFromStagehandV4Event(event); + if ( + targetId != null && + pageState.targetId != null && + targetId !== pageState.targetId + ) { + return; + } if (targetId != null) pageState.targetId = targetId; if (typeof event.url === "string") pageState.url = event.url; if (isRecord(event.selector) && typeof event.selector.url === "string") { From f90cba63e1548872777a7c83423a36ece7c901b8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 7 May 2026 13:07:16 -0700 Subject: [PATCH 11/12] Improve Stagehand v4 eval harness --- packages/evals/evals.config.json | 2 +- .../framework/StagehandAgentV4Harness.ts | 574 +++++++++++++----- packages/evals/framework/UnderstudyV4Tools.ts | 6 +- 3 files changed, 442 insertions(+), 140 deletions(-) diff --git a/packages/evals/evals.config.json b/packages/evals/evals.config.json index c7cff0b53..8823f13f9 100644 --- a/packages/evals/evals.config.json +++ b/packages/evals/evals.config.json @@ -6,7 +6,7 @@ "provider": null, "model": null, "api": false, - "verbose": false + "verbose": true }, "benchmarks": { "webvoyager": { diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts index 3da967b54..eec48977e 100644 --- a/packages/evals/framework/StagehandAgentV4Harness.ts +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -50,8 +50,24 @@ type StagehandV4PageState = { title: string; url: string; loadState?: StagehandV4LoadState; + frames: StagehandV4FrameState[]; }; +type StagehandV4FrameState = { + frameId: string; + targetId?: string; + url?: string; +}; + +type StagehandV4HistoryEntry = { + method: string; + parameters: unknown; + result: unknown; + timestamp: string; +}; + +const STAGEHAND_V4_PAGE_STATE = Symbol("stagehand_v4_page_state"); + function 
isAgentTask(task: BenchHarnessStartInput["task"]): boolean { return ( task.primaryCategory === "agent" || @@ -180,6 +196,7 @@ export const StagehandAgentV4Harness: BenchHarness = { const v4Page = await installStagehandV4BenchFacade( v3Result.v3, understudyV4Tools.stagehandV4, + input.modelName, ); if (createAgent) { @@ -266,11 +283,28 @@ function buildStagehandAgentV4SystemPrompt( async function installStagehandV4BenchFacade( v3: V3, stagehandV4: UnderstudyV4NativeRuntime, + modelName: string, ): Promise> { const pageState: StagehandV4PageState = { + frames: [], title: "", url: "about:blank", }; + const history: StagehandV4HistoryEntry[] = []; + const recordHistory = ( + method: string, + parameters: unknown, + result: unknown, + ): void => { + history.push({ + method, + parameters, + result, + timestamp: new Date().toISOString(), + }); + }; + const pageCache = new Map>(); + const pageOrder: string[] = []; const refreshPageInfo = async (): Promise => { const info = unwrapStagehandV4Result( @@ -282,39 +316,95 @@ async function installStagehandV4BenchFacade( if (typeof info.targetId === "string") pageState.targetId = info.targetId; if (typeof info.title === "string") pageState.title = info.title; if (typeof info.url === "string") pageState.url = info.url; + if (info.loadState != null) + pageState.loadState = normalizeStagehandV4LoadState(info.loadState); + await refreshFrameStates(stagehandV4, pageState).catch(() => {}); }; - await refreshPageInfo().catch(() => {}); - - const updatePageStateFromBrowserEvent = (event: unknown): void => { - updatePageStateFromStagehandV4Event(pageState, event); - }; - const updatePageStateFromNavigationEvent = (event: unknown): void => { - updatePageStateFromStagehandV4Event(pageState, event); - pageState.loadState = "init"; + const refreshPages = async (): Promise[]> => { + const rawPages = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserRequestTabList({}), + ); + const pages = Array.isArray(rawPages) + ? 
rawPages.filter((page): page is Record => + isRecord(page), + ) + : []; + for (const pageInfo of pages) { + const targetId = + typeof pageInfo.targetId === "string" ? pageInfo.targetId : null; + if (targetId == null) continue; + if (!pageOrder.includes(targetId)) pageOrder.push(targetId); + let facade = pageCache.get(targetId); + if (facade == null) { + const state: StagehandV4PageState = { + frames: [], + targetId, + title: "", + url: "about:blank", + }; + facade = createStagehandV4PageFacade( + stagehandV4, + state, + async () => { + await refreshSinglePageInfo(stagehandV4, state); + }, + recordHistory, + ); + pageCache.set(targetId, facade); + } + const state = facade[STAGEHAND_V4_PAGE_STATE]; + if (!isStagehandV4PageState(state)) continue; + state.targetId = targetId; + state.title = + typeof pageInfo.title === "string" ? pageInfo.title : state.title; + state.url = typeof pageInfo.url === "string" ? pageInfo.url : state.url; + await refreshFrameStates(stagehandV4, state).catch(() => {}); + } + return pageOrder + .map((targetId) => pageCache.get(targetId)) + .filter((page): page is Record => page != null); }; - stagehandV4.cdp.on( - "Stagehand.BrowserPageNavigated", - updatePageStateFromNavigationEvent, - ); - stagehandV4.cdp.on( - "Stagehand.BrowserPageLoadStateChanged", - updatePageStateFromBrowserEvent, - ); + + await refreshPageInfo().catch(() => {}); + await refreshPages().catch(() => {}); const page = createStagehandV4PageFacade( stagehandV4, pageState, refreshPageInfo, + recordHistory, ); - const pages = (): Record[] => [page]; + if (pageState.targetId != null) { + if (!pageOrder.includes(pageState.targetId)) + pageOrder.push(pageState.targetId); + pageCache.set(pageState.targetId, page); + } + const pages = (): Record[] => { + const cached = pageOrder + .map((targetId) => pageCache.get(targetId)) + .filter((entry): entry is Record => entry != null); + return cached.length > 0 ? 
cached : [page]; + }; const context = v3.context as unknown as Record; context.pages = pages; context.awaitActivePage = async () => { + await refreshPages().catch(() => {}); + const activePage = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserRequestActivePage({}), + ); + if (isRecord(activePage) && typeof activePage.targetId === "string") { + const cached = pageCache.get(activePage.targetId); + if (cached != null) return cached; + } await refreshPageInfo(); return page; }; + Object.defineProperty(v3, "history", { + configurable: true, + get: () => Promise.resolve([...history]), + }); v3.observe = (async ( a?: string | Record, @@ -327,29 +417,48 @@ async function installStagehandV4BenchFacade( const result = await stagehandV4.cdp.Stagehand.AIObserve({ ...(instruction != null ? { instruction } : {}), ...selectorParam(options), - ...workflowOptionsParam(options), + ...workflowOptionsParam(options, modelName), }); const observed = unwrapStagehandV4Result(result); - return Array.isArray(observed) ? observed : []; + const output = Array.isArray(observed) ? observed : []; + recordHistory("observe", { instruction, options }, output); + return output; }) as V3["observe"]; v3.act = (async ( input: string | Record, options?: Record, ) => { + const workflowOptions = workflowOptionsParam(options, modelName); const result = await stagehandV4.cdp.Stagehand.AIAct( typeof input === "string" ? { instruction: input, - ...workflowOptionsParam(options), + ...selectorParam(options), + ...workflowOptions, } : { action: normalizeV4Action(input), - ...workflowOptionsParam(options), + ...selectorParam(options), + ...workflowOptions, + options: { + ...(isRecord(workflowOptions.options) + ? workflowOptions.options + : {}), + selfHeal: true, + }, }, ); const unwrapped = unwrapStagehandV4Result(result); await refreshPageInfo().catch(() => {}); + await refreshPages().catch(() => {}); + recordHistory( + "act", + typeof input === "string" + ? 
{ instruction: input, options } + : { action: input, options }, + unwrapped, + ); return unwrapped; }) as V3["act"]; @@ -363,7 +472,7 @@ async function installStagehandV4BenchFacade( const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as | Record | undefined; - if (schema == null) { + if (instruction == null && schema == null) { const summary = unwrapStagehandV4Result( await stagehandV4.cdp.Stagehand.BrowserPageDOMSummary({ ...selectorParam(options), @@ -373,15 +482,17 @@ async function installStagehandV4BenchFacade( isRecord(summary) && typeof summary.pageText === "string" ? summary.pageText : ""; - return { extraction: pageText, pageText }; + return { pageText, extraction: pageText }; } const result = await stagehandV4.cdp.Stagehand.AIExtract({ ...(instruction != null ? { instruction } : {}), ...(schema != null ? { schema: schema as Record } : {}), ...selectorParam(options), - ...workflowOptionsParam(options), + ...workflowOptionsParam(options, modelName), }); - return unwrapStagehandV4Result(result); + const extracted = unwrapStagehandV4Result(result); + recordHistory("extract", { instruction, schema, options }, extracted); + return extracted; }) as V3["extract"]; return page; @@ -391,30 +502,32 @@ function createStagehandV4PageFacade( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, refreshPageInfo: () => Promise, + recordHistory?: ( + method: string, + parameters: unknown, + result: unknown, + ) => void, ): Record { return { + [STAGEHAND_V4_PAGE_STATE]: pageState, async goto(url: string, options?: unknown) { pageState.loadState = "init"; - const loaded = waitForStagehandV4LoadState( - stagehandV4, - pageState, + const selector = + pageState.targetId != null + ? { targetId: pageState.targetId } + : { active: true }; + if (!("targetId" in selector)) { + delete pageState.targetId; + } + const waitUntil = isRecord(options) && "waitUntil" in options ? 
options.waitUntil - : undefined, - loadStateTimeoutMs(options), - false, - ); - const [rawResult] = await Promise.all([ - stagehandV4.cdp.Stagehand.BrowserPageGoto({ - url, - selector: - pageState.targetId != null && - !isInternalStagehandV4PageUrl(pageState.url) - ? { targetId: pageState.targetId } - : { active: false }, - }), - loaded, - ]); + : undefined; + const rawResult = await stagehandV4.cdp.Stagehand.BrowserPageGoto({ + url, + selector, + waitUntil: normalizeStagehandV4LoadState(waitUntil), + }); const result = unwrapStagehandV4Result(rawResult); if (isRecord(result)) { if (typeof result.targetId === "string") @@ -422,11 +535,13 @@ function createStagehandV4PageFacade( if (typeof result.url === "string") pageState.url = result.url; } await refreshPageInfo(); - return { + const response = { ok: () => true, status: () => 200, url: () => pageState.url, }; + recordHistory?.("navigate", { url, options }, result); + return response; }, url() { return pageState.url; @@ -435,13 +550,17 @@ function createStagehandV4PageFacade( await refreshPageInfo(); return pageState.title; }, + frames() { + return pageState.frames.map((frameState) => + createStagehandV4FrameFacade(stagehandV4, frameState), + ); + }, async waitForLoadState(state?: unknown, options?: unknown) { await waitForStagehandV4LoadState( stagehandV4, pageState, state, loadStateTimeoutMs(options), - true, ); await refreshPageInfo(); }, @@ -474,6 +593,36 @@ function createStagehandV4PageFacade( }; } +function createStagehandV4FrameFacade( + stagehandV4: UnderstudyV4NativeRuntime, + frameState: StagehandV4FrameState, +): Record { + return { + async evaluate(expressionOrFn: unknown, arg?: unknown) { + const expression = + typeof expressionOrFn === "function" + ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})` + : String(expressionOrFn); + const result = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ + ...(frameState.targetId != null + ? 
{ targetId: frameState.targetId } + : {}), + arg: isJsonValue(arg) ? arg : undefined, + awaitPromise: true, + expression, + frameId: frameState.frameId, + returnByValue: true, + }), + ); + return isRecord(result) && "value" in result ? result.value : result; + }, + url() { + return frameState.url ?? "about:blank"; + }, + }; +} + function createStagehandV4FrameLocatorFacade( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, @@ -494,11 +643,11 @@ function createStagehandV4FrameLocatorFacade( typeof expressionOrFn === "function" ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})` : String(expressionOrFn); - if (frameSelectors.length > 0) { - throw new Error( - "stagehand_v4 frameLocator.evaluate is not implemented by the v4 protocol-backed eval facade yet.", - ); - } + const frameId = await resolveStagehandV4FrameLocator( + stagehandV4, + pageState, + frameSelectors, + ); const result = unwrapStagehandV4Result( await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({ ...(pageState.targetId != null @@ -507,6 +656,7 @@ function createStagehandV4FrameLocatorFacade( arg: isJsonValue(arg) ? arg : undefined, awaitPromise: true, expression, + ...(frameId != null ? { frameId } : {}), returnByValue: true, }), ); @@ -519,12 +669,23 @@ function createStagehandV4LocatorFacade( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, selector: unknown, + frameSelectors: unknown[] = [], ): Record { const read = async () => - await requestStagehandV4ElementInfo(stagehandV4, pageState, selector); + await requestStagehandV4ElementInfo( + stagehandV4, + pageState, + selector, + frameSelectors, + ); return { first() { - return createStagehandV4LocatorFacade(stagehandV4, pageState, selector); + return createStagehandV4LocatorFacade( + stagehandV4, + pageState, + selector, + frameSelectors, + ); }, async inputValue() { return (await read()).inputValue ?? 
""; @@ -547,7 +708,12 @@ function createStagehandV4LocatorFacade( }, async click() { await stagehandV4.cdp.Stagehand.BrowserPageClick({ - selector: stagehandV4SelectorFor(pageState, selector), + selector: await stagehandV4SelectorFor( + stagehandV4, + pageState, + selector, + frameSelectors, + ), }); }, async backendNodeId() { @@ -560,6 +726,7 @@ async function requestStagehandV4ElementInfo( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, selector: unknown, + frameSelectors: unknown[] = [], ): Promise<{ backendNodeId: number; checked?: boolean | null; @@ -570,7 +737,12 @@ async function requestStagehandV4ElementInfo( }> { const result = unwrapStagehandV4Result( await stagehandV4.cdp.Stagehand.BrowserPageRequestElementInfo({ - selector: stagehandV4SelectorFor(pageState, selector), + selector: await stagehandV4SelectorFor( + stagehandV4, + pageState, + selector, + frameSelectors, + ), }), ); if (isRecord(result) && typeof result.backendNodeId === "number") { @@ -586,109 +758,199 @@ async function requestStagehandV4ElementInfo( throw new Error("stagehand_v4 locator could not resolve element info."); } -function stagehandV4SelectorFor( +async function refreshSinglePageInfo( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, +): Promise { + const info = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageRequestInfo({ + ...(pageState.targetId != null ? 
{ targetId: pageState.targetId } : {}), + }), + ); + if (!isRecord(info)) return; + if (typeof info.targetId === "string") pageState.targetId = info.targetId; + if (typeof info.title === "string") pageState.title = info.title; + if (typeof info.url === "string") pageState.url = info.url; + if (info.loadState != null) + pageState.loadState = normalizeStagehandV4LoadState(info.loadState); + await refreshFrameStates(stagehandV4, pageState); +} + +async function refreshFrameStates( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, +): Promise { + if (pageState.targetId == null || isInternalStagehandV4PageUrl(pageState.url)) + return; + const rawFrameTree = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageRequestFullFrameTree({ + targetId: pageState.targetId, + }), + ); + if (!isRecord(rawFrameTree) || !isRecord(rawFrameTree.frameTree)) return; + const frames: StagehandV4FrameState[] = []; + collectStagehandV4Frames(rawFrameTree.frameTree, pageState.targetId, frames); + pageState.frames = frames; +} + +function collectStagehandV4Frames( + frameTree: Record, + targetId: string, + frames: StagehandV4FrameState[], +): void { + const frame = isRecord(frameTree.frame) ? frameTree.frame : null; + if (frame != null && typeof frame.id === "string") { + frames.push({ + frameId: frame.id, + targetId, + url: typeof frame.url === "string" ? frame.url : undefined, + }); + } + const childFrames = Array.isArray(frameTree.childFrames) + ? 
frameTree.childFrames + : []; + for (const childFrame of childFrames) { + if (isRecord(childFrame)) { + collectStagehandV4Frames(childFrame, targetId, frames); + } + } +} + +async function stagehandV4SelectorFor( + stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, selector: unknown, -): Record { + frameSelectors: unknown[] = [], +): Promise> { + if (pageState.targetId == null) { + await refreshSinglePageInfo(stagehandV4, pageState).catch(() => {}); + } + const frameId = await resolveStagehandV4FrameLocator( + stagehandV4, + pageState, + frameSelectors, + ); return { ...normalizeV4Selector(selector), ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + ...(frameId != null ? { frameId } : {}), }; } -async function waitForStagehandV4LoadState( +async function resolveStagehandV4FrameLocator( stagehandV4: UnderstudyV4NativeRuntime, pageState: StagehandV4PageState, - state: unknown, - timeoutMs: number, - acceptCurrentState: boolean, -): Promise { - const expectedState = normalizeStagehandV4LoadState(state); - if ( - acceptCurrentState && - pageState.loadState != null && - STAGEHAND_V4_LOAD_STATE_ORDER[pageState.loadState] >= - STAGEHAND_V4_LOAD_STATE_ORDER[expectedState] - ) { - return; + frameSelectors: unknown[], +): Promise { + if (frameSelectors.length === 0) return undefined; + if (pageState.targetId == null) { + await refreshSinglePageInfo(stagehandV4, pageState).catch(() => {}); } - - await new Promise((resolve, reject) => { - const onLoadStateChanged = (event: unknown): void => { - const eventTargetId = targetIdFromStagehandV4Event(event); - if ( - eventTargetId != null && - pageState.targetId != null && - eventTargetId !== pageState.targetId - ) { - return; - } - updatePageStateFromStagehandV4Event(pageState, event); - if ( - pageState.loadState != null && - STAGEHAND_V4_LOAD_STATE_ORDER[pageState.loadState] >= - STAGEHAND_V4_LOAD_STATE_ORDER[expectedState] - ) { - clearTimeout(timer); - stagehandV4.cdp.off( - 
"Stagehand.BrowserPageLoadStateChanged", - onLoadStateChanged, - ); - resolve(); - } + let frameId: string | undefined; + for (const frameSelector of frameSelectors) { + const selector = { + ...normalizeV4Selector(frameSelector), + ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}), + ...(frameId != null ? { frameId } : {}), }; - const timer = setTimeout(() => { - stagehandV4.cdp.off( - "Stagehand.BrowserPageLoadStateChanged", - onLoadStateChanged, - ); - reject( - new Error( - `Timed out waiting for Stagehand.BrowserPageLoadStateChanged(${expectedState}).`, - ), + const located = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageLocate({ selector }).catch( + (error: unknown): never => { + throw new Error( + `stagehand_v4 frameLocator could not locate ${JSON.stringify(selector)}: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + }, + ), + ); + if (!isRecord(located)) { + throw new Error( + "stagehand_v4 frameLocator could not resolve iframe selector.", ); - }, timeoutMs); - stagehandV4.cdp.on( - "Stagehand.BrowserPageLoadStateChanged", - onLoadStateChanged, + } + const summary = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageDOMSummary({ + hydrate: { ax: false }, + selector: { + ...(pageState.targetId != null + ? { targetId: pageState.targetId } + : {}), + }, + }), ); - }); + frameId = childFrameIdForLocatedFrameOwner(summary, located); + } + return frameId; } -function updatePageStateFromStagehandV4Event( - pageState: StagehandV4PageState, - event: unknown, -): void { - if (!isRecord(event)) return; - const targetId = targetIdFromStagehandV4Event(event); - if ( - targetId != null && - pageState.targetId != null && - targetId !== pageState.targetId - ) { - return; +function childFrameIdForLocatedFrameOwner( + summary: unknown, + located: Record, +): string { + const frameGraph = isRecord(summary) ? 
summary.frameGraph : null; + if (!isRecord(frameGraph) || !isRecord(frameGraph.ownerChainByFrameId)) { + throw new Error( + "stagehand_v4 frameLocator could not read the frame graph.", + ); } - if (targetId != null) pageState.targetId = targetId; - if (typeof event.url === "string") pageState.url = event.url; - if (isRecord(event.selector) && typeof event.selector.url === "string") { - pageState.url = event.selector.url; + const backendNodeId = + typeof located.backendNodeId === "number" ? located.backendNodeId : null; + const ownerFrameId = + typeof located.frameId === "string" ? located.frameId : null; + if (backendNodeId == null || ownerFrameId == null) { + throw new Error( + "stagehand_v4 frameLocator resolved selector without a frame owner.", + ); } - if (isStagehandV4LoadState(event.loadState)) { - pageState.loadState = event.loadState; + for (const [candidateFrameId, chain] of Object.entries( + frameGraph.ownerChainByFrameId, + )) { + if (!Array.isArray(chain)) continue; + const owner = chain.at(-1); + if ( + isRecord(owner) && + owner.backendNodeId === backendNodeId && + owner.frameId === ownerFrameId + ) { + return candidateFrameId; + } } + throw new Error("stagehand_v4 frameLocator could not find a child frame."); } -function targetIdFromStagehandV4Event(event: unknown): string | undefined { - if (!isRecord(event)) return undefined; - if (typeof event.targetId === "string") return event.targetId; - if (isRecord(event.selector) && typeof event.selector.targetId === "string") { - return event.selector.targetId; +async function waitForStagehandV4LoadState( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, + state: unknown, + timeoutMs: number, +): Promise { + const expectedState = normalizeStagehandV4LoadState(state); + const deadline = Date.now() + timeoutMs; + while (true) { + await refreshSinglePageInfo(stagehandV4, pageState).catch(() => {}); + if ( + pageState.loadState != null && + 
STAGEHAND_V4_LOAD_STATE_ORDER[pageState.loadState] >= + STAGEHAND_V4_LOAD_STATE_ORDER[expectedState] + ) { + return; + } + const remainingMs = deadline - Date.now(); + if (remainingMs <= 0) { + throw new Error( + `Timed out waiting for stagehand_v4 page loadState=${expectedState}.`, + ); + } + await new Promise((resolve) => + setTimeout(resolve, Math.min(100, remainingMs)), + ); } - return undefined; } function normalizeStagehandV4LoadState(state: unknown): StagehandV4LoadState { if (state == null || state === "load" || state === "loaded") return "loaded"; + if (state === "networkalmostidle") return "networkidle2"; if (isStagehandV4LoadState(state)) return state; throw new Error(`Unsupported stagehand_v4 waitForLoadState state: ${state}`); } @@ -727,7 +989,9 @@ function normalizeV4Action( (value): value is string => typeof value === "string", ); const first = positional[0]; - if (method === "type") { + if (method === "fill") { + args = { value: first ?? "" }; + } else if (method === "type") { args = { text: first ?? "" }; } else if (method === "keys") { args = { key: first ?? "", method: "press" }; @@ -767,8 +1031,16 @@ function normalizeV4ActionMethod(method: string): string { function selectorParam( options: Record | undefined, ): Record { + const pageSelector = stagehandV4PageSelector(options?.page); const selector = normalizeV4Selector(options?.selector); - return selector == null ? {} : { selector }; + const mergedSelector = + pageSelector == null && selector == null + ? undefined + : { + ...(pageSelector ?? {}), + ...(selector ?? {}), + }; + return mergedSelector == null ? 
{} : { selector: mergedSelector }; } function normalizeV4Selector( @@ -780,18 +1052,44 @@ function normalizeV4Selector( if (value.startsWith("xpath=")) return { xpath: value.slice("xpath=".length) }; if (value.startsWith("/") || value.startsWith("(")) return { xpath: value }; - return { css: value }; + return { + css: value + .split(/\s*>>\s*/u) + .filter(Boolean) + .join(" "), + }; +} + +function stagehandV4PageSelector( + page: unknown, +): Record | undefined { + if (page == null) return undefined; + const state = (page as Record)[STAGEHAND_V4_PAGE_STATE]; + if (!isStagehandV4PageState(state) || state.targetId == null) + return undefined; + return { targetId: state.targetId }; +} + +function isStagehandV4PageState(value: unknown): value is StagehandV4PageState { + return ( + isRecord(value) && + Array.isArray(value.frames) && + typeof value.title === "string" && + typeof value.url === "string" + ); } function workflowOptionsParam( options: Record | undefined, + modelName: string, ): Record { - if (!options) return {}; - const workflowOptions: Record = {}; - if (typeof options.timeout === "number") + const workflowOptions: Record = { model: modelName }; + if (typeof options?.timeout === "number") workflowOptions.timeout = options.timeout; - if (isJsonValue(options.variables)) + if (options != null && isJsonValue(options.variables)) workflowOptions.variables = options.variables; + if (isRecord(options?.model)) workflowOptions.model = options.model; + if (typeof options?.model === "string") workflowOptions.model = options.model; return Object.keys(workflowOptions).length === 0 ? 
{} : { options: workflowOptions }; diff --git a/packages/evals/framework/UnderstudyV4Tools.ts b/packages/evals/framework/UnderstudyV4Tools.ts index cba3e1c20..e2eeaafbf 100644 --- a/packages/evals/framework/UnderstudyV4Tools.ts +++ b/packages/evals/framework/UnderstudyV4Tools.ts @@ -187,6 +187,11 @@ export async function startUnderstudyV4Tools(input: { message: `Connected v4 tools at ${ready.cdpUrl}`, level: 1, }); + input.logger.log({ + category: "understudy_v4_code", + message: `v4 stagehand_session_id=${ready.stagehand_session_id ?? "unknown"}`, + level: 1, + }); const callCommand = (name: string, args: Record) => callBridge(child, pending, nextId++, "command", name, args); const callTool = (name: string, args: Record) => @@ -579,7 +584,6 @@ function understudyV4ClientOptions( }; } return { - rebuild_extension: false, local_browser_launch_options: { headless: process.env.EVAL_HEADLESS !== "false", ...(process.env.CHROME_PATH From 8b9cf5b1949ce8743a79e7aa101aa6c36fa9e2fe Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 7 May 2026 15:06:32 -0700 Subject: [PATCH 12/12] Update packages/evals/evals.config.json Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> --- packages/evals/evals.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/evals/evals.config.json b/packages/evals/evals.config.json index 8823f13f9..c7cff0b53 100644 --- a/packages/evals/evals.config.json +++ b/packages/evals/evals.config.json @@ -6,7 +6,7 @@ "provider": null, "model": null, "api": false, - "verbose": true + "verbose": false }, "benchmarks": { "webvoyager": {