diff --git a/packages/evals/ARCHITECTURE.mmd b/packages/evals/ARCHITECTURE.mmd index 97e87aac7..c1c312f26 100644 --- a/packages/evals/ARCHITECTURE.mmd +++ b/packages/evals/ARCHITECTURE.mmd @@ -48,7 +48,7 @@ flowchart TB CoreContext["framework/context.ts
buildCoreContext"] FixtureServer["core/fixtures
local deterministic pages"] CoreTargets["core/targets
local Chrome
Browserbase CDP"] - CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"] + CoreTools["core/tools registry
understudy_v3_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"] CoreAssertions["assertions + metrics
adapter-backed results"] CoreDeps["core/runtime/coreDeps.ts
browserbase + ws
lazy require"] end diff --git a/packages/evals/core/contracts/tool.ts b/packages/evals/core/contracts/tool.ts index bd1d366d8..0790e81d6 100644 --- a/packages/evals/core/contracts/tool.ts +++ b/packages/evals/core/contracts/tool.ts @@ -17,7 +17,8 @@ import type { } from "./results.js"; export type ToolSurface = - | "understudy_code" + | "understudy_v3_code" + | "understudy_v4_code" | "playwright_code" | "cdp_code" | "playwright_mcp" diff --git a/packages/evals/core/tools/registry.ts b/packages/evals/core/tools/registry.ts index 65384f137..81f29395c 100644 --- a/packages/evals/core/tools/registry.ts +++ b/packages/evals/core/tools/registry.ts @@ -4,11 +4,11 @@ import { CdpCodeTool } from "./cdp_code.js"; import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js"; import { PlaywrightCodeTool } from "./playwright_code.js"; import { PlaywrightMcpTool } from "./playwright_mcp.js"; -import { UnderstudyCodeTool } from "./understudy_code.js"; +import { UnderstudyV3CodeTool } from "./understudy_v3_code.js"; export function listCoreTools(): ToolSurface[] { return [ - "understudy_code", + "understudy_v3_code", "playwright_code", "cdp_code", "playwright_mcp", @@ -19,8 +19,8 @@ export function listCoreTools(): ToolSurface[] { export function getCoreTool(toolSurface: ToolSurface): CoreTool { switch (toolSurface) { - case "understudy_code": - return new UnderstudyCodeTool(); + case "understudy_v3_code": + return new UnderstudyV3CodeTool(); case "playwright_code": return new PlaywrightCodeTool(); case "cdp_code": diff --git a/packages/evals/core/tools/understudy_code.ts b/packages/evals/core/tools/understudy_v3_code.ts similarity index 96% rename from packages/evals/core/tools/understudy_code.ts rename to packages/evals/core/tools/understudy_v3_code.ts index 80709d729..20834007c 100644 --- a/packages/evals/core/tools/understudy_code.ts +++ b/packages/evals/core/tools/understudy_v3_code.ts @@ -222,7 +222,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; 
default: throw new Error( - `understudy_code does not support click target kind "${target.kind}" yet`, + `understudy_v3_code does not support click target kind "${target.kind}" yet`, ); } } @@ -253,7 +253,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; default: throw new Error( - `understudy_code does not support hover target kind "${target.kind}" yet`, + `understudy_v3_code does not support hover target kind "${target.kind}" yet`, ); } } @@ -298,7 +298,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; default: throw new Error( - `understudy_code does not support type target kind "${target.kind}" yet`, + `understudy_v3_code does not support type target kind "${target.kind}" yet`, ); } } @@ -335,7 +335,7 @@ class UnderstudyPageHandle implements CorePageHandle { return; default: throw new Error( - `understudy_code does not support press target kind "${target.kind}" yet`, + `understudy_v3_code does not support press target kind "${target.kind}" yet`, ); } } @@ -462,8 +462,8 @@ function connectionModeFromProfile( return "launch"; } -export class UnderstudyCodeTool implements CoreTool { - readonly id = "understudy_code"; +export class UnderstudyV3CodeTool implements CoreTool { + readonly id = "understudy_v3_code"; readonly surface = "code"; readonly family = "understudy"; readonly supportedStartupProfiles: StartupProfile[] = [ @@ -485,7 +485,7 @@ export class UnderstudyCodeTool implements CoreTool { async start(input: ToolStartInput): Promise { if (input.startupProfile === "tool_attach_local_cdp") { throw new Error( - `understudy_code does not support startup profile "${input.startupProfile}" yet`, + `understudy_v3_code does not support startup profile "${input.startupProfile}" yet`, ); } diff --git a/packages/evals/framework/ClaudeAgentHarness.ts b/packages/evals/framework/ClaudeAgentHarness.ts new file mode 100644 index 000000000..f13ecb305 --- /dev/null +++ b/packages/evals/framework/ClaudeAgentHarness.ts @@ -0,0 +1,52 @@ +import 
{ EvalsError } from "../errors.js"; +import { runClaudeCodeAgent } from "./claudeCodeRunner.js"; +import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js"; +import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import type { + BenchHarness, + BenchHarnessExecuteInput, + StartedBenchHarness, +} from "./benchHarness.js"; +import type { TaskResult } from "./types.js"; + +export const ClaudeAgentHarness: BenchHarness = { + harness: "claude_code", + supportedTaskKinds: ["agent", "suite"], + supportsApi: false, + async execute({ + input, + row, + logger, + signal, + }: BenchHarnessExecuteInput): Promise { + const plan = buildExternalHarnessTaskPlan(input); + if (row.config.harness !== "claude_code") { + throw new EvalsError( + `Expected claude_code harness config, received "${row.config.harness}".`, + ); + } + const toolAdapter = await prepareClaudeCodeToolAdapter({ + toolSurface: row.config.toolSurface, + startupProfile: row.config.startupProfile, + environment: row.config.environment, + plan, + logger, + }); + try { + return await runClaudeCodeAgent({ + plan, + model: input.modelName, + logger, + toolAdapter, + signal, + }); + } finally { + await toolAdapter.cleanup(); + } + }, + async start(): Promise { + throw new EvalsError( + "Claude Code harness execution uses the external harness execute path. 
Use --dry-run to inspect its bench matrix, or run with --harness claude_code.", + ); + }, +}; diff --git a/packages/evals/framework/CodexAgentHarness.ts b/packages/evals/framework/CodexAgentHarness.ts new file mode 100644 index 000000000..fac8c2a31 --- /dev/null +++ b/packages/evals/framework/CodexAgentHarness.ts @@ -0,0 +1,52 @@ +import { EvalsError } from "../errors.js"; +import { runCodexAgent } from "./codexRunner.js"; +import { prepareCodexToolAdapter } from "./codexToolAdapter.js"; +import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; +import type { + BenchHarness, + BenchHarnessExecuteInput, + StartedBenchHarness, +} from "./benchHarness.js"; +import type { TaskResult } from "./types.js"; + +export const CodexAgentHarness: BenchHarness = { + harness: "codex", + supportedTaskKinds: ["agent", "suite"], + supportsApi: false, + async execute({ + input, + row, + logger, + signal, + }: BenchHarnessExecuteInput): Promise { + const plan = buildExternalHarnessTaskPlan(input); + if (row.config.harness !== "codex") { + throw new EvalsError( + `Expected codex harness config, received "${row.config.harness}".`, + ); + } + const toolAdapter = await prepareCodexToolAdapter({ + toolSurface: row.config.toolSurface, + startupProfile: row.config.startupProfile, + environment: row.config.environment, + plan, + logger, + }); + try { + return await runCodexAgent({ + plan, + model: input.modelName, + logger, + toolAdapter, + signal, + }); + } finally { + await toolAdapter.cleanup(); + } + }, + async start(): Promise { + throw new EvalsError( + "Codex harness execution uses the external harness execute path. 
Use --dry-run to inspect its bench matrix, or run with --harness codex.", + ); + }, +}; diff --git a/packages/evals/framework/StagehandAgentV3Harness.ts b/packages/evals/framework/StagehandAgentV3Harness.ts new file mode 100644 index 000000000..9d869fd48 --- /dev/null +++ b/packages/evals/framework/StagehandAgentV3Harness.ts @@ -0,0 +1,139 @@ +import { + AgentProvider, + getAISDKLanguageModel, + loadApiKeyFromEnv, + type AvailableModel, + type LLMClient, + type LogLine, +} from "@browserbasehq/stagehand"; +import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; +import { endBrowserbaseSession } from "../browserbaseCleanup.js"; +import { EvalsError } from "../errors.js"; +import type { V3InitResult } from "../initV3.js"; +import type { + BenchHarness, + BenchHarnessStartInput, + StartedBenchHarness, +} from "./benchHarness.js"; +import type { DiscoveredTask } from "./types.js"; + +function isAgentTask(task: DiscoveredTask): boolean { + return ( + task.primaryCategory === "agent" || + task.categories.includes("agent") || + task.categories.includes("external_agent_benchmarks") + ); +} + +function resolveProvider(modelName: AvailableModel): string | undefined { + if (modelName.includes("/")) { + return modelName.split("/")[0]; + } + + try { + return AgentProvider.getAgentProvider(modelName); + } catch { + return undefined; + } +} + +export const StagehandAgentV3Harness: BenchHarness = { + harness: "stagehand_v3", + supportedTaskKinds: [ + "act", + "extract", + "observe", + "agent", + "combination", + "suite", + ], + supportsApi: true, + async start({ + task, + input, + row, + logger, + verbose, + }: BenchHarnessStartInput): Promise { + let v3Result: V3InitResult | undefined; + const createAgent = isAgentTask(task); + if (row.config.harness !== "stagehand_v3") { + throw new EvalsError( + `Expected stagehand_v3 harness config, received "${row.config.harness}".`, + ); + } + const config = row.config; + const agentMode = config.agentMode ?? 
input.agentMode; + const isCUA = config.isCUA ?? input.isCUA; + + if (config.useApi) { + const provider = resolveProvider(input.modelName); + const logFn = (line: LogLine) => logger.log(line); + const apiKey = loadApiKeyFromEnv(provider, logFn); + if (!apiKey) { + throw new EvalsError( + `USE_API=true but no API key found for provider "${provider}".`, + ); + } + const { initV3 } = await import("../initV3.js"); + v3Result = await initV3({ + logger, + modelName: input.modelName, + modelClientOptions: { apiKey }, + createAgent, + agentMode, + isCUA, + verbose, + configOverrides: { env: config.environment }, + }); + } else { + let llmClient: LLMClient | undefined; + if (input.modelName.includes("/")) { + const firstSlashIndex = input.modelName.indexOf("/"); + llmClient = new AISdkClientWrapped({ + model: getAISDKLanguageModel( + input.modelName.substring(0, firstSlashIndex), + input.modelName.substring(firstSlashIndex + 1), + ), + }); + } + const { initV3 } = await import("../initV3.js"); + v3Result = await initV3({ + logger, + llmClient, + modelName: input.modelName, + createAgent, + agentMode, + isCUA, + verbose, + configOverrides: { env: config.environment }, + }); + } + + return { + ctx: { + harness: "stagehand_v3", + row, + logger, + v3: v3Result.v3, + agent: v3Result.agent, + page: v3Result.v3.context.pages()[0], + debugUrl: v3Result.debugUrl ?? "", + sessionUrl: v3Result.sessionUrl ?? 
"", + }, + cleanup: async () => { + if (v3Result?.v3) { + try { + await v3Result.v3.close(); + } catch (closeError) { + console.error( + `Warning: Error closing V3 instance for ${input.name}:`, + closeError, + ); + } + } + await endBrowserbaseSession(v3Result?.v3); + }, + }; + }, +}; diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts new file mode 100644 index 000000000..eec48977e --- /dev/null +++ b/packages/evals/framework/StagehandAgentV4Harness.ts @@ -0,0 +1,1129 @@ +import { + getAISDKLanguageModel, + type AgentInstance, + type LLMClient, + type LocalBrowserLaunchOptions, + type V3, +} from "@browserbasehq/stagehand"; +import { z } from "zod"; +import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js"; +import { endBrowserbaseSession } from "../browserbaseCleanup.js"; +import { EvalsError } from "../errors.js"; +import type { V3InitResult } from "../initV3.js"; +import { + startUnderstudyV4Tools, + type UnderstudyV4NativeRuntime, +} from "./UnderstudyV4Tools.js"; +import type { + BenchHarness, + BenchHarnessStartInput, + BenchHarnessContext, + StartedBenchHarness, +} from "./benchHarness.js"; + +type Page = ReturnType[number]; +type StagehandV4LoadState = + | "init" + | "domcontentloaded" + | "loaded" + | "networkidle2" + | "networkidle"; + +const STAGEHAND_V4_LOAD_STATE_ORDER: Record = { + init: 0, + domcontentloaded: 1, + loaded: 2, + networkidle2: 3, + networkidle: 4, +}; + +function isInternalStagehandV4PageUrl(url: string | undefined): boolean { + return ( + url == null || + url === "about:blank" || + /^chrome(?:-[a-z]+)?:\/\//u.test(url) + ); +} + +type StagehandV4PageState = { + targetId?: string; + title: string; + url: string; + loadState?: StagehandV4LoadState; + frames: StagehandV4FrameState[]; +}; + +type StagehandV4FrameState = { + frameId: string; + targetId?: string; + url?: string; +}; + +type StagehandV4HistoryEntry = { + method: string; + parameters: unknown; + 
result: unknown; + timestamp: string; +}; + +const STAGEHAND_V4_PAGE_STATE = Symbol("stagehand_v4_page_state"); + +function isAgentTask(task: BenchHarnessStartInput["task"]): boolean { + return ( + task.primaryCategory === "agent" || + task.categories.includes("agent") || + task.categories.includes("external_agent_benchmarks") + ); +} + +export const StagehandAgentV4Harness: BenchHarness = { + harness: "stagehand_v4", + supportedTaskKinds: [ + "act", + "extract", + "observe", + "agent", + "combination", + "suite", + ], + supportsApi: false, + async start({ + task, + input, + row, + logger, + verbose, + }: BenchHarnessStartInput): Promise { + if (row.config.harness !== "stagehand_v4") { + throw new EvalsError( + `Expected stagehand_v4 harness config, received "${row.config.harness}".`, + ); + } + if (row.config.toolSurface !== "understudy_v4_code") { + throw new EvalsError( + `StagehandAgentV4Harness requires --tool understudy_v4_code; received "${row.config.toolSurface ?? "default"}".`, + ); + } + if (row.config.useApi) { + throw new EvalsError( + "stagehand_v4 must run locally so the v3 agent loop can call the live v4 SDK protocol tools.", + ); + } + + // This is intentionally still the v3 agent loop. The v4 part is the SDK + // launcher/tool catalog/dispatch surface that replaces the v3 agent tools. 
+ const createAgent = isAgentTask(task); + const understudyV4Tools = await startUnderstudyV4Tools({ + environment: row.config.environment, + logger, + }); + let v3Result: V3InitResult | undefined; + let printedV4BusLogTree = false; + const printV4BusLogTree = async (): Promise => { + if (!verbose || printedV4BusLogTree) return; + printedV4BusLogTree = true; + try { + const result = (await understudyV4Tools.stagehandV4.cdp.Mod.evaluate({ + expression: `async () => { + const readLogTree = globalThis.__stagehandBusLogTree; + if (typeof readLogTree !== "function") { + return { error: "globalThis.__stagehandBusLogTree is not available" }; + } + return await readLogTree(params.stagehand_session_id); + }`, + params: { + stagehand_session_id: understudyV4Tools.stagehand_session_id, + }, + })) as { error?: unknown; logTree?: unknown }; + logger.log({ + category: "understudy_v4_code", + message: + typeof result.logTree === "string" + ? `v4 bus.logTree()\n${result.logTree}` + : `v4 bus.logTree() unavailable: ${String( + result.error ?? "Mod.evaluate did not return logTree.", + )}`, + level: 1, + }); + } catch (dashboardError) { + logger.warn({ + category: "understudy_v4_code", + message: `Unable to print v4 bus.logTree(): ${ + dashboardError instanceof Error + ? dashboardError.message + : String(dashboardError) + }`, + level: 1, + }); + } + }; + + try { + let llmClient: LLMClient | undefined; + if (input.modelName.includes("/")) { + const firstSlashIndex = input.modelName.indexOf("/"); + llmClient = new AISdkClientWrapped({ + model: getAISDKLanguageModel( + input.modelName.substring(0, firstSlashIndex), + input.modelName.substring(firstSlashIndex + 1), + ), + }); + } + + const localBrowserLaunchOptions = { + cdpUrl: understudyV4Tools.cdpUrl, + } satisfies Partial; + const { initV3 } = await import("../initV3.js"); + v3Result = await initV3({ + logger, + llmClient, + modelName: input.modelName, + createAgent: false, + agentMode: row.config.agentMode ?? 
input.agentMode, + isCUA: row.config.isCUA ?? input.isCUA, + verbose, + configOverrides: { + env: "LOCAL", + localBrowserLaunchOptions, + experimental: true, + }, + }); + const closeV3 = v3Result.v3.close.bind(v3Result.v3); + v3Result.v3.close = async () => { + await printV4BusLogTree(); + return await closeV3(); + }; + const v4Page = await installStagehandV4BenchFacade( + v3Result.v3, + understudyV4Tools.stagehandV4, + input.modelName, + ); + + if (createAgent) { + v3Result.agent = v3Result.v3.agent({ + model: input.modelName, + mode: "dom", + tools: understudyV4Tools.tools, + systemPrompt: buildStagehandAgentV4SystemPrompt( + understudyV4Tools.toolCatalog, + ), + }) as AgentInstance; + } + + const ctx: BenchHarnessContext = { + harness: "stagehand_v4", + row, + logger, + v3: v3Result.v3, + v4: understudyV4Tools.stagehandV4, + agent: v3Result.agent, + page: v4Page as unknown as Page, + debugUrl: v3Result.debugUrl ?? "", + sessionUrl: v3Result.sessionUrl ?? "", + }; + + return { + ctx, + cleanup: async () => { + await printV4BusLogTree(); + if (v3Result?.v3) { + try { + await v3Result.v3.close(); + } catch (closeError) { + console.error( + `Warning: Error closing V3 instance for ${input.name}:`, + closeError, + ); + } + } + await endBrowserbaseSession(v3Result?.v3); + await understudyV4Tools.cleanup(); + }, + }; + } catch (error) { + if (v3Result?.v3) await v3Result.v3.close().catch(() => {}); + await understudyV4Tools.cleanup().catch(() => {}); + throw error; + } + }, +}; + +function buildStagehandAgentV4SystemPrompt( + toolCatalog: Record[], +): string { + return [ + "You are using Stagehand v4 protocol tools through the existing Stagehand agent loop.", + "The callable tool schemas are the source of truth. They are v4 event payload schemas, not the older v3 agent wrapper schemas.", + "", + "Selector rules:", + "- Selectors are partial hints. 
You may pass only elementId, only xpath, only css, only text, only coordinates, or any useful subset.", + "- The browser hydrates selectors before use, so do not invent missing selector fields.", + "- Prefer elementId from the page summary tree when it is available. Coordinates are valid when they are the clearest available selector.", + "- Deep XPath can pierce frames and shadow roots, for example /body/div[3]/iframe[2]/body/iframe[2]/button.", + "", + "Page context:", + "- Use the derived page summary tool to get current DOM/accessibility context and element ids.", + "- Use the derived screenshot tool when visual confirmation or coordinates are needed.", + "- When you already have a selector and a concrete operation, prefer the direct browser action tool for that operation.", + "- If you use act with an action object, follow the action schema exactly.", + "", + "Available v4 tools:", + ...toolCatalog.map((definition) => { + const name = + typeof definition.name === "string" ? definition.name : "unknown"; + const description = + typeof definition.description === "string" + ? definition.description + : name; + return `- ${name}: ${description}`; + }), + ].join("\n"); +} + +async function installStagehandV4BenchFacade( + v3: V3, + stagehandV4: UnderstudyV4NativeRuntime, + modelName: string, +): Promise> { + const pageState: StagehandV4PageState = { + frames: [], + title: "", + url: "about:blank", + }; + const history: StagehandV4HistoryEntry[] = []; + const recordHistory = ( + method: string, + parameters: unknown, + result: unknown, + ): void => { + history.push({ + method, + parameters, + result, + timestamp: new Date().toISOString(), + }); + }; + const pageCache = new Map>(); + const pageOrder: string[] = []; + + const refreshPageInfo = async (): Promise => { + const info = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageRequestInfo({ + ...(pageState.targetId != null ? 
{ targetId: pageState.targetId } : {}), + }), + ); + if (!isRecord(info)) return; + if (typeof info.targetId === "string") pageState.targetId = info.targetId; + if (typeof info.title === "string") pageState.title = info.title; + if (typeof info.url === "string") pageState.url = info.url; + if (info.loadState != null) + pageState.loadState = normalizeStagehandV4LoadState(info.loadState); + await refreshFrameStates(stagehandV4, pageState).catch(() => {}); + }; + + const refreshPages = async (): Promise[]> => { + const rawPages = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserRequestTabList({}), + ); + const pages = Array.isArray(rawPages) + ? rawPages.filter((page): page is Record => + isRecord(page), + ) + : []; + for (const pageInfo of pages) { + const targetId = + typeof pageInfo.targetId === "string" ? pageInfo.targetId : null; + if (targetId == null) continue; + if (!pageOrder.includes(targetId)) pageOrder.push(targetId); + let facade = pageCache.get(targetId); + if (facade == null) { + const state: StagehandV4PageState = { + frames: [], + targetId, + title: "", + url: "about:blank", + }; + facade = createStagehandV4PageFacade( + stagehandV4, + state, + async () => { + await refreshSinglePageInfo(stagehandV4, state); + }, + recordHistory, + ); + pageCache.set(targetId, facade); + } + const state = facade[STAGEHAND_V4_PAGE_STATE]; + if (!isStagehandV4PageState(state)) continue; + state.targetId = targetId; + state.title = + typeof pageInfo.title === "string" ? pageInfo.title : state.title; + state.url = typeof pageInfo.url === "string" ? 
pageInfo.url : state.url; + await refreshFrameStates(stagehandV4, state).catch(() => {}); + } + return pageOrder + .map((targetId) => pageCache.get(targetId)) + .filter((page): page is Record => page != null); + }; + + await refreshPageInfo().catch(() => {}); + await refreshPages().catch(() => {}); + + const page = createStagehandV4PageFacade( + stagehandV4, + pageState, + refreshPageInfo, + recordHistory, + ); + if (pageState.targetId != null) { + if (!pageOrder.includes(pageState.targetId)) + pageOrder.push(pageState.targetId); + pageCache.set(pageState.targetId, page); + } + const pages = (): Record[] => { + const cached = pageOrder + .map((targetId) => pageCache.get(targetId)) + .filter((entry): entry is Record => entry != null); + return cached.length > 0 ? cached : [page]; + }; + + const context = v3.context as unknown as Record; + context.pages = pages; + context.awaitActivePage = async () => { + await refreshPages().catch(() => {}); + const activePage = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserRequestActivePage({}), + ); + if (isRecord(activePage) && typeof activePage.targetId === "string") { + const cached = pageCache.get(activePage.targetId); + if (cached != null) return cached; + } + await refreshPageInfo(); + return page; + }; + Object.defineProperty(v3, "history", { + configurable: true, + get: () => Promise.resolve([...history]), + }); + + v3.observe = (async ( + a?: string | Record, + b?: Record, + ) => { + const instruction = typeof a === "string" ? a : undefined; + const options = (typeof a === "string" ? b : a) as + | Record + | undefined; + const result = await stagehandV4.cdp.Stagehand.AIObserve({ + ...(instruction != null ? { instruction } : {}), + ...selectorParam(options), + ...workflowOptionsParam(options, modelName), + }); + const observed = unwrapStagehandV4Result(result); + const output = Array.isArray(observed) ? 
observed : []; + recordHistory("observe", { instruction, options }, output); + return output; + }) as V3["observe"]; + + v3.act = (async ( + input: string | Record, + options?: Record, + ) => { + const workflowOptions = workflowOptionsParam(options, modelName); + const result = await stagehandV4.cdp.Stagehand.AIAct( + typeof input === "string" + ? { + instruction: input, + ...selectorParam(options), + ...workflowOptions, + } + : { + action: normalizeV4Action(input), + ...selectorParam(options), + ...workflowOptions, + options: { + ...(isRecord(workflowOptions.options) + ? workflowOptions.options + : {}), + selfHeal: true, + }, + }, + ); + const unwrapped = unwrapStagehandV4Result(result); + await refreshPageInfo().catch(() => {}); + await refreshPages().catch(() => {}); + recordHistory( + "act", + typeof input === "string" + ? { instruction: input, options } + : { action: input, options }, + unwrapped, + ); + return unwrapped; + }) as V3["act"]; + + v3.extract = (async ( + a?: string | Record, + b?: z.ZodType | Record, + c?: Record, + ) => { + const instruction = typeof a === "string" ? a : undefined; + const schema = isZodSchema(b) ? z.toJSONSchema(b) : undefined; + const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as + | Record + | undefined; + if (instruction == null && schema == null) { + const summary = unwrapStagehandV4Result( + await stagehandV4.cdp.Stagehand.BrowserPageDOMSummary({ + ...selectorParam(options), + }), + ); + const pageText = + isRecord(summary) && typeof summary.pageText === "string" + ? summary.pageText + : ""; + return { pageText, extraction: pageText }; + } + const result = await stagehandV4.cdp.Stagehand.AIExtract({ + ...(instruction != null ? { instruction } : {}), + ...(schema != null ? 
{ schema: schema as Record } : {}), + ...selectorParam(options), + ...workflowOptionsParam(options, modelName), + }); + const extracted = unwrapStagehandV4Result(result); + recordHistory("extract", { instruction, schema, options }, extracted); + return extracted; + }) as V3["extract"]; + + return page; +} + +function createStagehandV4PageFacade( + stagehandV4: UnderstudyV4NativeRuntime, + pageState: StagehandV4PageState, + refreshPageInfo: () => Promise, + recordHistory?: ( + method: string, + parameters: unknown, + result: unknown, + ) => void, +): Record { + return { + [STAGEHAND_V4_PAGE_STATE]: pageState, + async goto(url: string, options?: unknown) { + pageState.loadState = "init"; + const selector = + pageState.targetId != null + ? { targetId: pageState.targetId } + : { active: true }; + if (!("targetId" in selector)) { + delete pageState.targetId; + } + const waitUntil = + isRecord(options) && "waitUntil" in options + ? options.waitUntil + : undefined; + const rawResult = await stagehandV4.cdp.Stagehand.BrowserPageGoto({ + url, + selector, + waitUntil: normalizeStagehandV4LoadState(waitUntil), + }); + const result = unwrapStagehandV4Result(rawResult); + if (isRecord(result)) { + if (typeof result.targetId === "string") + pageState.targetId = result.targetId; + if (typeof result.url === "string") pageState.url = result.url; + } + await refreshPageInfo(); + const response = { + ok: () => true, + status: () => 200, + url: () => pageState.url, + }; + recordHistory?.("navigate", { url, options }, result); + return response; + }, + url() { + return pageState.url; + }, + async title() { + await refreshPageInfo(); + return pageState.title; + }, + frames() { + return pageState.frames.map((frameState) => + createStagehandV4FrameFacade(stagehandV4, frameState), + ); + }, + async waitForLoadState(state?: unknown, options?: unknown) { + await waitForStagehandV4LoadState( + stagehandV4, + pageState, + state, + loadStateTimeoutMs(options), + ); + await refreshPageInfo(); + 
},
+    async evaluate(expressionOrFn: unknown, arg?: unknown) {
+      // Functions are stringified and wrapped in a call expression; a defined
+      // `arg` is JSON-serialized into that call as its single argument.
+      const expression =
+        typeof expressionOrFn === "function"
+          ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})`
+          : String(expressionOrFn);
+      const result = unwrapStagehandV4Result(
+        await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({
+          ...(pageState.targetId != null
+            ? { targetId: pageState.targetId }
+            : {}),
+          arg: isJsonValue(arg) ? arg : undefined,
+          awaitPromise: true,
+          expression,
+          returnByValue: true,
+        }),
+      );
+      return isRecord(result) && "value" in result ? result.value : result;
+    },
+    locator(selector: unknown) {
+      return createStagehandV4LocatorFacade(stagehandV4, pageState, selector);
+    },
+    frameLocator(selector: unknown) {
+      return createStagehandV4FrameLocatorFacade(stagehandV4, pageState, [
+        selector,
+      ]);
+    },
+  };
+}
+
+/**
+ * Builds a minimal frame facade exposing `evaluate` and `url`.
+ *
+ * @param stagehandV4 - Runtime whose `cdp.Stagehand.BrowserPageEvaluate` is called.
+ * @param frameState - Frame snapshot; its `frameId` is always sent with the
+ *   evaluate request and its `targetId` only when present.
+ * @returns An object with `evaluate(expressionOrFn, arg?)` and `url()`.
+ */
+function createStagehandV4FrameFacade(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  frameState: StagehandV4FrameState,
+): Record {
+  return {
+    async evaluate(expressionOrFn: unknown, arg?: unknown) {
+      const expression =
+        typeof expressionOrFn === "function"
+          ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})`
+          : String(expressionOrFn);
+      const result = unwrapStagehandV4Result(
+        await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({
+          ...(frameState.targetId != null
+            ? { targetId: frameState.targetId }
+            : {}),
+          arg: isJsonValue(arg) ? arg : undefined,
+          awaitPromise: true,
+          expression,
+          frameId: frameState.frameId,
+          returnByValue: true,
+        }),
+      );
+      // Unwrap `{ value }` envelopes; anything else is returned as received.
+      return isRecord(result) && "value" in result ? result.value : result;
+    },
+    url() {
+      // Falls back to "about:blank" when the snapshot carries no URL.
+      return frameState.url ?? "about:blank";
+    },
+  };
+}
+
+/**
+ * Builds a frame-locator facade for a chain of iframe selectors on one page.
+ *
+ * @param stagehandV4 - Runtime whose `cdp.Stagehand` commands are invoked.
+ * @param pageState - Page state supplying the optional `targetId`.
+ * @param frameSelectors - Iframe selectors accumulated so far, outermost
+ *   first; `frameLocator` appends to a copy of this list.
+ * @returns An object exposing `frameLocator`, `locator` and `evaluate`, each
+ *   scoped by `frameSelectors`.
+ */
+function createStagehandV4FrameLocatorFacade(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+  frameSelectors: unknown[],
+): Record {
+  return {
+    frameLocator(selector: unknown) {
+      return createStagehandV4FrameLocatorFacade(stagehandV4, pageState, [
+        ...frameSelectors,
+        selector,
+      ]);
+    },
+    locator(selector: unknown) {
+      // Forward the accumulated iframe chain. Without it the locator facade
+      // falls back to its default empty chain and resolves the element
+      // against the page's top-level document instead of the located frame.
+      return createStagehandV4LocatorFacade(
+        stagehandV4,
+        pageState,
+        selector,
+        frameSelectors,
+      );
+    },
+    async evaluate(expressionOrFn: unknown, arg?: unknown) {
+      const expression =
+        typeof expressionOrFn === "function"
+          ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})`
+          : String(expressionOrFn);
+      // Resolve the selector chain to a frame id before evaluating.
+      const frameId = await resolveStagehandV4FrameLocator(
+        stagehandV4,
+        pageState,
+        frameSelectors,
+      );
+      const result = unwrapStagehandV4Result(
+        await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({
+          ...(pageState.targetId != null
+            ? { targetId: pageState.targetId }
+            : {}),
+          arg: isJsonValue(arg) ? arg : undefined,
+          awaitPromise: true,
+          expression,
+          ...(frameId != null ? { frameId } : {}),
+          returnByValue: true,
+        }),
+      );
+      return isRecord(result) && "value" in result ? result.value : result;
+    },
+  };
+}
+
+/**
+ * Builds a locator facade whose read methods fetch element info on demand.
+ *
+ * Every read method issues a fresh element-info request; nothing is cached.
+ * `first()` returns a new facade over the same selector and frame chain.
+ *
+ * @param stagehandV4 - Runtime whose `cdp.Stagehand` commands are invoked.
+ * @param pageState - Page state supplying the optional `targetId`.
+ * @param selector - Selector passed through to the element-info and click requests.
+ * @param frameSelectors - Iframe selector chain used to scope the lookup;
+ *   defaults to an empty chain.
+ * @returns An object with the read methods, `click` and `backendNodeId`.
+ */
+function createStagehandV4LocatorFacade(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+  selector: unknown,
+  frameSelectors: unknown[] = [],
+): Record {
+  const read = async () =>
+    await requestStagehandV4ElementInfo(
+      stagehandV4,
+      pageState,
+      selector,
+      frameSelectors,
+    );
+  return {
+    first() {
+      return createStagehandV4LocatorFacade(
+        stagehandV4,
+        pageState,
+        selector,
+        frameSelectors,
+      );
+    },
+    async inputValue() {
+      return (await read()).inputValue ?? "";
+    },
+    async isChecked() {
+      return Boolean((await read()).checked);
+    },
+    async textContent() {
+      return (await read()).textContent ?? null;
+    },
+    async innerText() {
+      const info = await read();
+      // Prefer innerText, then textContent, then the empty string.
+      return info.innerText ?? info.textContent ?? "";
+    },
+    // Both spellings are exposed and return the same value.
+    async innerHtml() {
+      return (await read()).innerHTML ?? "";
+    },
+    async innerHTML() {
+      return (await read()).innerHTML ?? "";
+    },
+    async click() {
+      await stagehandV4.cdp.Stagehand.BrowserPageClick({
+        selector: await stagehandV4SelectorFor(
+          stagehandV4,
+          pageState,
+          selector,
+          frameSelectors,
+        ),
+      });
+    },
+    async backendNodeId() {
+      return (await read()).backendNodeId;
+    },
+  };
+}
+
+/**
+ * Requests element info for `selector` and validates the response shape.
+ *
+ * @param stagehandV4 - Runtime whose `cdp.Stagehand.BrowserPageRequestElementInfo` is called.
+ * @param pageState - Page state used when building the selector payload.
+ * @param selector - Selector to resolve.
+ * @param frameSelectors - Iframe selector chain used to scope the lookup;
+ *   defaults to an empty chain.
+ * @returns The unwrapped response when it is a record with a numeric
+ *   `backendNodeId`.
+ * @throws Error when the response does not have that shape.
+ */
+async function requestStagehandV4ElementInfo(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+  selector: unknown,
+  frameSelectors: unknown[] = [],
+): Promise<{
+  backendNodeId: number;
+  checked?: boolean | null;
+  innerHTML?: string | null;
+  innerText?: string | null;
+  inputValue?: string | null;
+  textContent?: string | null;
+}> {
+  const result = unwrapStagehandV4Result(
+    await stagehandV4.cdp.Stagehand.BrowserPageRequestElementInfo({
+      selector: await stagehandV4SelectorFor(
+        stagehandV4,
+        pageState,
+        selector,
+        frameSelectors,
+      ),
+    }),
+  );
+  // Only `backendNodeId` is checked at runtime; the remaining fields are
+  // trusted via the cast below.
+  if (isRecord(result) && typeof result.backendNodeId === "number") {
+    return result as {
+      backendNodeId: number;
+      checked?: boolean | null;
+      innerHTML?: string | null;
+      innerText?: string | null;
+      inputValue?: string | null;
+      textContent?: string | null;
+    };
+  }
+  throw new Error("stagehand_v4 locator could not resolve element info.");
+}
+
+/**
+ * Refreshes one page's cached `targetId`, `title`, `url` and `loadState` from
+ * `BrowserPageRequestInfo`, then refreshes its frame list.
+ *
+ * Fields are only overwritten when the response carries a value of the
+ * expected type. Errors from the frame refresh propagate to the caller.
+ *
+ * @param stagehandV4 - Runtime whose `cdp.Stagehand` commands are invoked.
+ * @param pageState - Page state updated in place.
+ */
+async function refreshSinglePageInfo(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+): Promise {
+  const info = unwrapStagehandV4Result(
+    await stagehandV4.cdp.Stagehand.BrowserPageRequestInfo({
+      ...(pageState.targetId != null ?
/**
 * Replaces `pageState.frames` with a flattened copy of the page's full frame
 * tree as reported by the runtime.
 *
 * Does nothing when the page has no `targetId`, when its URL is an internal
 * stagehand_v4 page, or when the response has no record-shaped `frameTree`.
 */
async function refreshFrameStates(
  stagehandV4: UnderstudyV4NativeRuntime,
  pageState: StagehandV4PageState,
): Promise<void> {
  if (pageState.targetId == null) return;
  if (isInternalStagehandV4PageUrl(pageState.url)) return;

  const response =
    await stagehandV4.cdp.Stagehand.BrowserPageRequestFullFrameTree({
      targetId: pageState.targetId,
    });
  const payload = unwrapStagehandV4Result(response);
  if (!isRecord(payload)) return;

  const rootTree = payload.frameTree;
  if (!isRecord(rootTree)) return;

  const collected: StagehandV4FrameState[] = [];
  collectStagehandV4Frames(rootTree, pageState.targetId, collected);
  pageState.frames = collected;
}
/**
 * Produces the selector object sent to the runtime for `selector`, adding
 * the page's `targetId` and the resolved `frameId` when they are known.
 *
 * When the page has no `targetId` yet, a page-info refresh is attempted
 * first; a failure of that refresh is ignored.
 */
async function stagehandV4SelectorFor(
  stagehandV4: UnderstudyV4NativeRuntime,
  pageState: StagehandV4PageState,
  selector: unknown,
  frameSelectors: unknown[] = [],
): Promise<Record<string, unknown>> {
  if (pageState.targetId == null) {
    try {
      await refreshSinglePageInfo(stagehandV4, pageState);
    } catch {
      // Best effort: continue without a targetId if the refresh fails.
    }
  }

  const frameId = await resolveStagehandV4FrameLocator(
    stagehandV4,
    pageState,
    frameSelectors,
  );

  // targetId / frameId are written last so they win over same-named keys
  // coming from the normalized selector.
  const merged: Record<string, unknown> = { ...normalizeV4Selector(selector) };
  if (pageState.targetId != null) merged.targetId = pageState.targetId;
  if (frameId != null) merged.frameId = frameId;
  return merged;
}
/**
 * Finds the child frame whose owner element is the located node.
 *
 * `summary.frameGraph.ownerChainByFrameId` maps each frame id to a chain of
 * owner entries; a frame matches when the last entry of its chain has the
 * same `backendNodeId` and `frameId` as `located`.
 *
 * @throws Error when the frame graph is missing, when `located` lacks a
 *   numeric `backendNodeId` or string `frameId`, or when no frame matches.
 */
function childFrameIdForLocatedFrameOwner(
  summary: unknown,
  located: Record<string, unknown>,
): string {
  const frameGraph = isRecord(summary) ? summary.frameGraph : null;
  if (!isRecord(frameGraph) || !isRecord(frameGraph.ownerChainByFrameId)) {
    throw new Error(
      "stagehand_v4 frameLocator could not read the frame graph.",
    );
  }

  const { backendNodeId, frameId: ownerFrameId } = located;
  if (typeof backendNodeId !== "number" || typeof ownerFrameId !== "string") {
    throw new Error(
      "stagehand_v4 frameLocator resolved selector without a frame owner.",
    );
  }

  const match = Object.entries(frameGraph.ownerChainByFrameId).find(
    ([, chain]) => {
      if (!Array.isArray(chain)) return false;
      // Only the last entry of the chain is compared.
      const owner: unknown = chain[chain.length - 1];
      return (
        isRecord(owner) &&
        owner.backendNodeId === backendNodeId &&
        owner.frameId === ownerFrameId
      );
    },
  );
  if (match === undefined) {
    throw new Error("stagehand_v4 frameLocator could not find a child frame.");
  }
  return match[0];
}
/**
 * Maps a caller-supplied load state onto a stagehand_v4 load state.
 *
 * `null`, `undefined`, `"load"` and `"loaded"` become `"loaded"`;
 * `"networkalmostidle"` becomes `"networkidle2"`; values that already are a
 * stagehand_v4 load state pass through unchanged.
 *
 * @throws Error for any other value.
 */
function normalizeStagehandV4LoadState(state: unknown): StagehandV4LoadState {
  switch (state) {
    case null:
    case undefined:
    case "load":
    case "loaded":
      return "loaded";
    case "networkalmostidle":
      return "networkidle2";
    default:
      if (isStagehandV4LoadState(state)) return state;
      throw new Error(
        `Unsupported stagehand_v4 waitForLoadState state: ${state}`,
      );
  }
}

/** Type guard: true only for the five stagehand_v4 load-state literals. */
function isStagehandV4LoadState(value: unknown): value is StagehandV4LoadState {
  switch (value) {
    case "init":
    case "domcontentloaded":
    case "loaded":
    case "networkidle2":
    case "networkidle":
      return true;
    default:
      return false;
  }
}
/**
 * Converts a loosely-typed selector into the record form sent to the v4
 * runtime.
 *
 * - `null` / `undefined` and non-string, non-record values give `undefined`.
 * - Records are returned unchanged.
 * - Strings are trimmed first, so surrounding whitespace can no longer cause
 *   an XPath to be classified as CSS. A string that is empty after trimming
 *   gives `undefined`.
 * - `xpath=<expr>` gives `{ xpath: <expr> }`; an empty `<expr>` gives
 *   `undefined` instead of an empty xpath.
 * - Strings starting with `/` or `(` are treated as XPath.
 * - Anything else is CSS; `>>` chain separators are collapsed into a single
 *   space. A chain with no non-empty segment gives `undefined`.
 *
 * @param value - Selector as supplied by the caller.
 * @returns The selector record, or `undefined` when nothing usable remains.
 */
function normalizeV4Selector(
  value: unknown,
): Record<string, unknown> | undefined {
  if (value == null) return undefined;
  if (typeof value !== "string") return isRecord(value) ? value : undefined;

  const trimmed = value.trim();
  if (trimmed.length === 0) return undefined;

  const xpathPrefix = "xpath=";
  if (trimmed.startsWith(xpathPrefix)) {
    const xpath = trimmed.slice(xpathPrefix.length).trim();
    return xpath.length === 0 ? undefined : { xpath };
  }
  if (trimmed.startsWith("/") || trimmed.startsWith("(")) {
    return { xpath: trimmed };
  }

  const css = trimmed
    .split(/\s*>>\s*/u)
    .filter(Boolean)
    .join(" ");
  // A value such as ">>" has no segments left; do not emit an empty selector.
  return css.length === 0 ? undefined : { css };
}
/**
 * Reports whether `value` is built only from strings, numbers, booleans,
 * `null`/`undefined`, arrays, and non-array objects, checked recursively.
 */
function isJsonValue(value: unknown): boolean {
  if (value == null) return true;
  if (Array.isArray(value)) return value.every(isJsonValue);
  if (isRecord(value)) return Object.values(value).every(isJsonValue);

  const kind = typeof value;
  return kind === "string" || kind === "number" || kind === "boolean";
}

/** Type guard for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (typeof value !== "object") return false;
  return value !== null && !Array.isArray(value);
}
ToolSet } from "ai"; +import type { EvalLogger } from "../logger.js"; +import { getRepoRootDir } from "../runtimePaths.js"; + +export type UnderstudyV4ToolDefinition = Record; + +type BridgeReadyMessage = { + type: "ready"; + cdpUrl: string; + browserbaseExtensionId?: string; + stagehand_session_id?: string; + toolCatalog: UnderstudyV4ToolDefinition[]; +}; + +type BridgeResultMessage = { + type: "result"; + id: number; + result?: unknown; + error?: string; +}; + +type BridgeEventMessage = { + type: "event"; + name: string; + event: unknown; +}; + +type BridgeErrorMessage = { + type: "error"; + error: string; +}; + +type UnderstudyV4Sdk = { + StagehandClient: new (options?: Record) => { + browserbase_extension_id?: string; + cdp_http_origin?: string; + connect(input?: unknown): Promise; + close(): Promise; + cdp: { + cdp_url?: string | null; + on(eventName: string, listener: (event: unknown) => void): unknown; + off(eventName: string, listener: (event: unknown) => void): unknown; + Stagehand: Record< + string, + (params?: Record) => Promise + >; + }; + stagehand_session_id?: string; + }; + StagehandProtocolEvents: Record; + aiBrowserToolDefinitions: () => UnderstudyV4ToolDefinition[]; +}; + +export interface UnderstudyV4Tools { + cdpUrl: string; + browserbaseExtensionId?: string; + stagehand_session_id?: string; + toolCatalog: UnderstudyV4ToolDefinition[]; + stagehandV4: UnderstudyV4NativeRuntime; + tools: ToolSet; + cleanup: () => Promise; +} + +export interface UnderstudyV4NativeRuntime { + cdp: { + on(eventName: string, listener: (event: unknown) => void): void; + off(eventName: string, listener: (event: unknown) => void): void; + Mod: Record) => Promise>; + Stagehand: Record< + string, + (params?: Record) => Promise + >; + }; +} + +type PendingCall = { + resolve: (value: unknown) => void; + reject: (error: Error) => void; +}; + +export async function startUnderstudyV4Tools(input: { + environment: "LOCAL" | "BROWSERBASE"; + logger: EvalLogger; +}): Promise { + 
const require = createRequire(import.meta.url); + const tsxCli = require.resolve("tsx/cli"); + const child = spawn( + process.execPath, + [tsxCli, fileURLToPath(import.meta.url)], + { + cwd: getRepoRootDir(), + env: { + ...process.env, + UNDERSTUDY_V4_TOOLS_CHILD: "1", + }, + stdio: ["pipe", "pipe", "pipe"], + }, + ); + + const pending = new Map(); + const eventListeners = new Map void>>(); + const subscribedEvents = new Set(); + let nextId = 1; + let readyResolve: (message: BridgeReadyMessage) => void; + let readyReject: (error: Error) => void; + const readyPromise = new Promise((resolve, reject) => { + readyResolve = resolve; + readyReject = reject; + }); + + const stdout = createInterface({ input: child.stdout }); + stdout.on("line", (line) => { + if (!line.trim()) return; + const message = parseBridgeMessage(line); + if (!message) { + input.logger.log({ + category: "understudy_v4_code", + message: line, + level: 1, + }); + return; + } + if (message.type === "ready") { + readyResolve(message); + return; + } + if (message.type === "event") { + for (const listener of eventListeners.get(message.name) ?? 
[]) { + listener(message.event); + } + return; + } + if (message.type === "error") { + const error = new Error(message.error); + readyReject(error); + for (const call of pending.values()) call.reject(error); + pending.clear(); + return; + } + const call = pending.get(message.id); + if (!call) return; + pending.delete(message.id); + if (message.error) { + call.reject(new Error(message.error)); + } else { + call.resolve(message.result); + } + }); + + child.stderr.on("data", (chunk: Buffer) => { + for (const line of chunk.toString("utf8").split(/\r?\n/).filter(Boolean)) { + input.logger.warn({ + category: "understudy_v4_code", + message: line, + level: 1, + }); + } + }); + + child.on("error", (error) => { + readyReject(error); + for (const call of pending.values()) call.reject(error); + pending.clear(); + }); + child.on("exit", (code, signal) => { + const error = new Error( + `Understudy v4 tools process exited (${signal ?? code ?? "unknown"}).`, + ); + readyReject(error); + for (const call of pending.values()) call.reject(error); + pending.clear(); + }); + + child.stdin.write( + `${JSON.stringify({ type: "init", environment: input.environment })}\n`, + ); + + const ready = await readyPromise; + input.logger.log({ + category: "understudy_v4_code", + message: `Connected v4 tools at ${ready.cdpUrl}`, + level: 1, + }); + input.logger.log({ + category: "understudy_v4_code", + message: `v4 stagehand_session_id=${ready.stagehand_session_id ?? 
"unknown"}`, + level: 1, + }); + const callCommand = (name: string, args: Record) => + callBridge(child, pending, nextId++, "command", name, args); + const callTool = (name: string, args: Record) => + callBridge(child, pending, nextId++, "tool", name, args); + const { jsonSchema, tool } = await import("ai"); + + return { + cdpUrl: ready.cdpUrl, + browserbaseExtensionId: ready.browserbaseExtensionId, + stagehand_session_id: ready.stagehand_session_id, + toolCatalog: ready.toolCatalog, + stagehandV4: { + cdp: { + on(eventName, listener) { + let listeners = eventListeners.get(eventName); + if (!listeners) { + listeners = new Set(); + eventListeners.set(eventName, listeners); + } + listeners.add(listener); + if (!subscribedEvents.has(eventName)) { + subscribedEvents.add(eventName); + child.stdin.write( + `${JSON.stringify({ type: "subscribe", name: eventName })}\n`, + ); + } + }, + off(eventName, listener) { + const listeners = eventListeners.get(eventName); + listeners?.delete(listener); + if (listeners?.size === 0) eventListeners.delete(eventName); + }, + Mod: new Proxy( + {}, + { + get(_target, property) { + if (typeof property !== "string") return undefined; + return (params?: Record) => + callCommand(`Mod.${property}`, params ?? {}); + }, + }, + ) as UnderstudyV4NativeRuntime["cdp"]["Mod"], + Stagehand: new Proxy( + {}, + { + get(_target, property) { + if (typeof property !== "string") return undefined; + return (params?: Record) => + callCommand(`Stagehand.${property}`, params ?? 
/**
 * Turns the v4 tool catalog into an AI SDK `ToolSet`.
 *
 * A catalog entry is skipped unless it has a non-empty string `name` and an
 * object-valued `inputSchema` (or, failing that, `parameters`). Each tool
 * logs the call, replaces `elementId` references in its arguments using the
 * shared selector map, and forwards the call through `callTool`. Tool
 * results are converted for the model by `modelOutputForToolResult`, which
 * receives the same selector map.
 */
function buildUnderstudyV4ToolSet(
  catalog: UnderstudyV4ToolDefinition[],
  callTool: (name: string, args: Record<string, unknown>) => Promise<unknown>,
  logger: EvalLogger,
  ai: Pick<typeof import("ai"), "jsonSchema" | "tool">,
): ToolSet {
  // One map shared by every tool in this set.
  const selectorMap: Record<string, Record<string, unknown>> = {};
  const toolSet: ToolSet = {};

  for (const definition of catalog) {
    const toolName =
      typeof definition.name === "string" ? definition.name : null;
    const candidateSchema = definition.inputSchema ?? definition.parameters;
    const isObjectSchema =
      candidateSchema != null &&
      typeof candidateSchema === "object" &&
      !Array.isArray(candidateSchema);
    if (!toolName || !isObjectSchema) continue;

    const description =
      typeof definition.description === "string"
        ? definition.description
        : toolName;

    toolSet[toolName] = ai.tool({
      description,
      inputSchema: ai.jsonSchema(candidateSchema),
      execute: async (args) => {
        logger.log({
          category: "understudy_v4_code",
          message: `Agent calling v4 tool: ${toolName}`,
          level: 1,
          auxiliary: {
            arguments: {
              value: JSON.stringify(args),
              type: "object",
            },
          },
        });
        const rawArgs = isRecord(args) ? args : {};
        const hydrated = hydrateSelectorReferences(rawArgs, selectorMap);
        return callTool(toolName, isRecord(hydrated) ? hydrated : {});
      },
      toModelOutput: (result) => modelOutputForToolResult(result, selectorMap),
    });
  }
  return toolSet;
}
/**
 * Asks the bridge child process to shut down and rejects every call that is
 * still pending.
 *
 * A `close` message is written to the child's stdin and stdin is ended. If
 * the child has not exited after 5 seconds it is sent `SIGTERM` and the wait
 * ends without waiting for the exit event.
 *
 * The child counts as already gone when either `exitCode` or `signalCode`
 * is set. A process terminated by a signal keeps `exitCode === null`, so
 * checking only `exitCode` would wait out the full timeout for an `exit`
 * event that has already fired.
 *
 * @param child - Bridge child process.
 * @param pending - In-flight calls keyed by request id; emptied on return.
 */
async function closeBridge(
  child: ChildProcess,
  pending: Map<number, PendingCall>,
): Promise<void> {
  const hasExited = (): boolean =>
    child.exitCode != null || child.signalCode != null;

  if (!hasExited()) {
    await new Promise<void>((resolve) => {
      const killTimer = setTimeout(() => {
        if (!hasExited()) child.kill("SIGTERM");
        resolve();
      }, 5000);
      // Do not let the fallback timer keep the event loop alive.
      killTimer.unref();

      child.once("exit", () => {
        clearTimeout(killTimer);
        resolve();
      });

      // stdin may be null or already closed; the timer covers that case.
      if (child.stdin?.writable) {
        child.stdin.write(`${JSON.stringify({ type: "close" })}\n`);
        child.stdin.end();
      }
    });
  }

  for (const call of pending.values()) {
    call.reject(new Error("Understudy v4 tools process closed."));
  }
  pending.clear();
}
loadStagehandV4Sdk(); + const commandByToolName = buildCommandByToolName(sdk); + let client: InstanceType | null = null; + const eventSubscriptions = new Map void>(); + + const stdin = createInterface({ input: process.stdin }); + for await (const line of stdin) { + if (!line.trim()) continue; + const message = JSON.parse(line) as { + type: "init" | "tool" | "command" | "subscribe" | "close"; + environment?: "LOCAL" | "BROWSERBASE"; + id?: number; + name?: string; + args?: Record; + }; + + if (message.type === "init") { + client = new sdk.StagehandClient( + understudyV4ClientOptions(message.environment ?? "LOCAL"), + ); + await client.connect(); + let cdpUrl = client.cdp.cdp_url ?? client.cdp_http_origin ?? ""; + if (/^https?:\/\//i.test(cdpUrl)) { + const versionResponse = await fetch(`${cdpUrl}/json/version`); + if (!versionResponse.ok) { + throw new Error( + `Unable to resolve v4 browser websocket URL from ${cdpUrl}: GET /json/version -> ${versionResponse.status}`, + ); + } + const version = (await versionResponse.json()) as { + webSocketDebuggerUrl?: unknown; + }; + if (typeof version.webSocketDebuggerUrl !== "string") { + throw new Error( + `Unable to resolve v4 browser websocket URL from ${cdpUrl}: missing webSocketDebuggerUrl`, + ); + } + cdpUrl = version.webSocketDebuggerUrl; + } + writeBridgeMessage({ + type: "ready", + cdpUrl, + browserbaseExtensionId: client.browserbase_extension_id, + stagehand_session_id: client.stagehand_session_id, + toolCatalog: sdk.aiBrowserToolDefinitions(), + }); + continue; + } + + if (message.type === "subscribe") { + if (!client) throw new Error("Understudy v4 tools were not initialized."); + const name = message.name; + if (typeof name !== "string") + throw new Error("Event subscription requires an event name."); + if (!eventSubscriptions.has(name)) { + const listener = (event: unknown): void => + writeBridgeMessage({ type: "event", name, event }); + eventSubscriptions.set(name, listener); + client.cdp.on(name, listener); + } 
+ continue; + } + + if (message.type === "tool" || message.type === "command") { + if (!client) throw new Error("Understudy v4 tools were not initialized."); + const id = message.id ?? 0; + try { + const commandName = + message.type === "tool" + ? commandByToolName.get(message.name ?? "") + : message.name; + if (!commandName) { + throw new Error( + message.type === "tool" + ? `No v4 protocol event is exposed for tool "${message.name}".` + : `No v4 protocol command was provided.`, + ); + } + const command = + message.type === "command" + ? commandForPath(client.cdp, commandName) + : client.cdp.Stagehand[commandName]; + if (!command) { + throw new Error( + `The v4 SDK does not expose ${ + message.type === "command" + ? commandName + : `Stagehand.${commandName}` + }.`, + ); + } + const result = await command(message.args ?? {}); + writeBridgeMessage({ type: "result", id, result }); + } catch (error) { + writeBridgeMessage({ + type: "result", + id, + error: error instanceof Error ? error.message : String(error), + }); + } + continue; + } + + if (message.type === "close") { + if (client) { + for (const [eventName, listener] of eventSubscriptions) { + client.cdp.off(eventName, listener); + } + } + await client?.close(); + process.exit(0); + } + } +} + +export function assertUnderstudyV4SdkAvailable(): string { + const sdkPath = + process.env.STAGEHAND_V4_SDK_PATH ?? 
/**
 * Dynamically imports the Stagehand v4 SDK from the path returned by
 * `assertUnderstudyV4SdkAvailable()`.
 *
 * The imported module is cast to `UnderstudyV4Sdk`; its shape is not
 * checked at runtime.
 */
async function loadStagehandV4Sdk(): Promise<UnderstudyV4Sdk> {
  const sdkUrl = pathToFileURL(assertUnderstudyV4SdkAvailable());
  const sdkModule = await import(sdkUrl.href);
  return sdkModule as UnderstudyV4Sdk;
}
/**
 * Builds a lookup from LLM tool name to v4 command name.
 *
 * Only function-valued entries of `sdk.StagehandProtocolEvents` are used,
 * and only when they carry a string `llm_tool_name` and a string
 * `event_type` ending in `"Event"`. The command name is the `event_type`
 * with that suffix removed.
 */
function buildCommandByToolName(sdk: UnderstudyV4Sdk): Map<string, string> {
  const eventSuffix = "Event";
  const lookup = new Map<string, string>();

  for (const candidate of Object.values(sdk.StagehandProtocolEvents)) {
    if (typeof candidate !== "function") continue;

    const { event_type: eventType, llm_tool_name: toolName } = candidate as {
      event_type?: unknown;
      llm_tool_name?: unknown;
    };
    if (typeof eventType !== "string" || typeof toolName !== "string") {
      continue;
    }
    if (!eventType.endsWith(eventSuffix)) continue;

    lookup.set(toolName, eventType.slice(0, -eventSuffix.length));
  }
  return lookup;
}
/**
 * Shrinks a tool payload before it is serialized for the model.
 *
 * Strings longer than 2000 characters are cut to 2000 and suffixed with
 * `...[truncated]`. Arrays and records are processed recursively. Under a
 * key whose lower-cased name contains `screenshot` or `image`, a string is
 * cut to 80 characters instead, and a non-string value is kept untouched.
 */
function sanitizeForModel(value: unknown): unknown {
  const clip = (text: string, limit: number): string =>
    text.length > limit ? `${text.slice(0, limit)}...[truncated]` : text;

  if (typeof value === "string") return clip(value, 2000);
  if (Array.isArray(value)) {
    return value.map((entry) => sanitizeForModel(entry));
  }
  if (!isRecord(value)) return value;

  const sanitized: Record<string, unknown> = {};
  for (const [key, entry] of Object.entries(value)) {
    const lowered = key.toLowerCase();
    const isMediaKey =
      lowered.includes("screenshot") || lowered.includes("image");
    if (isMediaKey) {
      sanitized[key] = typeof entry === "string" ? clip(entry, 80) : entry;
    } else {
      sanitized[key] = sanitizeForModel(entry);
    }
  }
  return sanitized;
}
/** Type guard for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (typeof value !== "object") return false;
  return value !== null && !Array.isArray(value);
}
DiscoveredTask, TaskResult } from "./types.js"; import type { BenchMatrixRow, BenchTaskKind, Harness } from "./benchTypes.js"; @@ -41,6 +28,7 @@ export interface BenchHarnessContext { row: BenchMatrixRow; logger: EvalLogger; v3?: V3; + v4?: UnderstudyV4NativeRuntime; agent?: AgentInstance; page?: Page; debugUrl: string; @@ -60,28 +48,8 @@ export interface BenchHarness { start(input: BenchHarnessStartInput): Promise; } -function isAgentTask(task: DiscoveredTask): boolean { - return ( - task.primaryCategory === "agent" || - task.categories.includes("agent") || - task.categories.includes("external_agent_benchmarks") - ); -} - -function resolveProvider(modelName: AvailableModel): string | undefined { - if (modelName.includes("/")) { - return modelName.split("/")[0]; - } - - try { - return AgentProvider.getAgentProvider(modelName); - } catch { - return undefined; - } -} - -export const stagehandHarness: BenchHarness = { - harness: "stagehand", +export const StagehandAgentV4Harness: BenchHarness = { + harness: "stagehand_v4", supportedTaskKinds: [ "act", "extract", @@ -90,193 +58,28 @@ export const stagehandHarness: BenchHarness = { "combination", "suite", ], - supportsApi: true, - async start({ - task, - input, - row, - logger, - verbose, - }: BenchHarnessStartInput): Promise { - let v3Result: V3InitResult | undefined; - const createAgent = isAgentTask(task); - if (row.config.harness !== "stagehand") { - throw new EvalsError( - `Harness "${row.config.harness}" is not implemented yet. Use --harness stagehand for the current unified runner.`, - ); - } - const config = row.config; - const agentMode = config.agentMode ?? input.agentMode; - const isCUA = config.isCUA ?? 
input.isCUA; - - if (config.useApi) { - const provider = resolveProvider(input.modelName); - const logFn = (line: LogLine) => logger.log(line); - const apiKey = loadApiKeyFromEnv(provider, logFn); - if (!apiKey) { - throw new EvalsError( - `USE_API=true but no API key found for provider "${provider}".`, - ); - } - const { initV3 } = await import("../initV3.js"); - v3Result = await initV3({ - logger, - modelName: input.modelName, - modelClientOptions: { apiKey }, - createAgent, - agentMode, - isCUA, - verbose, - configOverrides: { env: config.environment }, - }); - } else { - let llmClient: LLMClient | undefined; - if (input.modelName.includes("/")) { - const firstSlashIndex = input.modelName.indexOf("/"); - llmClient = new AISdkClientWrapped({ - model: getAISDKLanguageModel( - input.modelName.substring(0, firstSlashIndex), - input.modelName.substring(firstSlashIndex + 1), - ), - }); - } - const { initV3 } = await import("../initV3.js"); - v3Result = await initV3({ - logger, - llmClient, - modelName: input.modelName, - createAgent, - agentMode, - isCUA, - verbose, - configOverrides: { env: config.environment }, - }); - } - - return { - ctx: { - harness: "stagehand", - row, - logger, - v3: v3Result.v3, - agent: v3Result.agent, - page: v3Result.v3.context.pages()[0], - debugUrl: v3Result.debugUrl ?? "", - sessionUrl: v3Result.sessionUrl ?? 
"", - }, - cleanup: async () => { - if (v3Result?.v3) { - try { - await v3Result.v3.close(); - } catch (closeError) { - console.error( - `Warning: Error closing V3 instance for ${input.name}:`, - closeError, - ); - } - } - await endBrowserbaseSession(v3Result?.v3); - }, - }; - }, -}; - -export const claudeCodeHarness: BenchHarness = { - harness: "claude_code", - supportedTaskKinds: ["agent", "suite"], supportsApi: false, - async execute({ - input, - row, - logger, - signal, - }: BenchHarnessExecuteInput): Promise { - const plan = buildExternalHarnessTaskPlan(input); - if (row.config.harness !== "claude_code") { - throw new EvalsError( - `Expected claude_code harness config, received "${row.config.harness}".`, - ); - } - const toolAdapter = await prepareClaudeCodeToolAdapter({ - toolSurface: row.config.toolSurface, - startupProfile: row.config.startupProfile, - environment: row.config.environment, - plan, - logger, - }); - try { - return await runClaudeCodeAgent({ - plan, - model: input.modelName, - logger, - toolAdapter, - signal, - }); - } finally { - await toolAdapter.cleanup(); - } - }, - async start(): Promise { - throw new EvalsError( - "Claude Code harness execution uses the external harness execute path. 
Use --dry-run to inspect its bench matrix, or run with --harness claude_code.", - ); - }, -}; - -export const codexHarness: BenchHarness = { - harness: "codex", - supportedTaskKinds: ["agent", "suite"], - supportsApi: false, - async execute({ - input, - row, - logger, - signal, - }: BenchHarnessExecuteInput): Promise { - const plan = buildExternalHarnessTaskPlan(input); - if (row.config.harness !== "codex") { - throw new EvalsError( - `Expected codex harness config, received "${row.config.harness}".`, - ); - } - const toolAdapter = await prepareCodexToolAdapter({ - toolSurface: row.config.toolSurface, - startupProfile: row.config.startupProfile, - environment: row.config.environment, - plan, - logger, - }); - try { - return await runCodexAgent({ - plan, - model: input.modelName, - logger, - toolAdapter, - signal, - }); - } finally { - await toolAdapter.cleanup(); - } - }, - async start(): Promise { - throw new EvalsError( - "Codex harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness codex.", - ); + async start(input: BenchHarnessStartInput): Promise { + const module = await import("./StagehandAgentV4Harness.js"); + return module.StagehandAgentV4Harness.start(input); }, }; const harnessRegistry = new Map([ - ["stagehand", stagehandHarness], - ["claude_code", claudeCodeHarness], - ["codex", codexHarness], + ["stagehand_v3", StagehandAgentV3Harness], + ["stagehand_v4", StagehandAgentV4Harness], + ["claude_code", ClaudeAgentHarness], + ["codex", CodexAgentHarness], ]); export function getBenchHarness(harness: Harness): BenchHarness { const implementation = harnessRegistry.get(harness); if (!implementation) { - throw new EvalsError( - `Harness "${harness}" is not implemented yet. 
Use --harness stagehand for the current unified runner.`, - ); + throw new EvalsError(`Harness "${harness}" is not implemented yet.`); } return implementation; } + +export { ClaudeAgentHarness } from "./ClaudeAgentHarness.js"; +export { CodexAgentHarness } from "./CodexAgentHarness.js"; +export { StagehandAgentV3Harness } from "./StagehandAgentV3Harness.js"; diff --git a/packages/evals/framework/benchPlanner.ts b/packages/evals/framework/benchPlanner.ts index 5f93ba39b..1b6a48875 100644 --- a/packages/evals/framework/benchPlanner.ts +++ b/packages/evals/framework/benchPlanner.ts @@ -97,15 +97,18 @@ export function resolveBenchModelEntries( effectiveCategory === "agent" || effectiveCategory === "external_agent_benchmarks"; const harness = options.harness ?? DEFAULT_BENCH_HARNESS; - const requestedAgentModes = - harness === "stagehand" ? resolveRequestedAgentModes(options) : undefined; + const usesStagehandHarness = + harness === "stagehand_v3" || harness === "stagehand_v4"; + const requestedAgentModes = usesStagehandHarness + ? resolveRequestedAgentModes(options) + : undefined; if (options.modelOverride) { const baseModes = isAgentCategory && requestedAgentModes ? requestedAgentModes : [ - harness === "stagehand" + usesStagehandHarness ? resolveAgentModeForModel(options.modelOverride) : "hybrid", ]; @@ -345,9 +348,9 @@ function buildBenchHarnessConfig(input: { startupProfile?: StartupProfile; dataset?: string; }): BenchHarnessConfig { - if (input.harness === "stagehand") { + if (input.harness === "stagehand_v3" || input.harness === "stagehand_v4") { return { - harness: "stagehand", + harness: input.harness, model: input.model, provider: input.provider, environment: input.environment, @@ -387,6 +390,9 @@ export function generateBenchTestcases( modelEntries, ); const allTestcases = [...suiteTestcases.testcases]; + const harness = options.harness ?? 
DEFAULT_BENCH_HARNESS; + const usesStagehandHarness = + harness === "stagehand_v3" || harness === "stagehand_v4"; if (options.harness === "claude_code" || options.harness === "codex") { if (suiteTestcases.remainingTasks.length > 0) { @@ -409,16 +415,16 @@ export function generateBenchTestcases( model, options, undefined, - isAgentCategory && rowUsesStagehand(options) + isAgentCategory && usesStagehandHarness ? entry.mode === "cua" : undefined, - isAgentCategory && rowUsesStagehand(options) + isAgentCategory && usesStagehandHarness ? (options.agentMode ?? entry.mode) : undefined, ); const agentMode = row.agentMode; const includeStagehandAgentMode = - isAgentCategory && rowUsesStagehand(options) && agentMode; + isAgentCategory && usesStagehandHarness && agentMode; allTestcases.push({ input: { name: task.name, @@ -460,10 +466,6 @@ export function generateBenchTestcases( return allTestcases; } -function rowUsesStagehand(options: Pick): boolean { - return (options.harness ?? DEFAULT_BENCH_HARNESS) === "stagehand"; -} - function resolveBenchRowToolSurface( harness: Harness, requested?: ToolSurface, @@ -474,6 +476,19 @@ function resolveBenchRowToolSurface( if (harness === "codex") { return resolveCodexToolSurface(requested); } + if (harness === "stagehand_v4") { + if (requested && requested !== "understudy_v4_code") { + throw new EvalsError( + `stagehand_v4 uses the UnderstudyV4Tools surface. Received --tool ${requested}.`, + ); + } + return requested ?? "understudy_v4_code"; + } + if (harness === "stagehand_v3" && requested === "understudy_v4_code") { + throw new EvalsError( + "Use --harness stagehand_v4 for the UnderstudyV4Tools surface.", + ); + } return requested; } @@ -547,7 +562,8 @@ function withBenchMetadata( task: DiscoveredTask, options: BenchPlanOptions, ): Testcase { - const isStagehand = rowUsesStagehand(options); + const harness = options.harness ?? 
DEFAULT_BENCH_HARNESS; + const isStagehand = harness === "stagehand_v3" || harness === "stagehand_v4"; const agentMode = isStagehand ? (options.agentMode ?? testcase.input.agentMode) : undefined; diff --git a/packages/evals/framework/benchRunner.ts b/packages/evals/framework/benchRunner.ts index e719db56d..51193dc20 100644 --- a/packages/evals/framework/benchRunner.ts +++ b/packages/evals/framework/benchRunner.ts @@ -29,7 +29,6 @@ export async function executeBenchTask( ): Promise { const logger = new EvalLogger(Boolean(options.verbose)); const harnessName = options.harness ?? DEFAULT_BENCH_HARNESS; - const harness = getBenchHarness(harnessName); const row = buildBenchMatrixRow( task, input.modelName, @@ -38,6 +37,7 @@ export async function executeBenchTask( input.isCUA, input.agentMode, ); + const harness = getBenchHarness(harnessName); let cleanup: () => Promise = async () => {}; let unregisterCleanup: (() => void) | undefined; let harnessCtx: BenchHarnessContext | undefined; @@ -67,8 +67,13 @@ export async function executeBenchTask( harnessCtx = startedHarness.ctx; const taskModule = await loadTaskModuleFromPath(task.filePath, task.name); if (taskModule.definition) { + const taskFn = + taskModule.definition.benchFns?.[harnessCtx.harness] ?? + taskModule.definition.benchFns?.default ?? 
+ taskModule.definition.fn; const ctx = { v3: harnessCtx.v3, + v4: harnessCtx.v4, agent: harnessCtx.agent, page: harnessCtx.page, logger, @@ -78,7 +83,7 @@ export async function executeBenchTask( sessionUrl: harnessCtx.sessionUrl, }; return withBenchSessionUrls( - (await taskModule.definition.fn(ctx)) as TaskResult, + (await taskFn(ctx)) as TaskResult, harnessCtx, ); } @@ -86,6 +91,7 @@ export async function executeBenchTask( return withBenchSessionUrls( await taskModule.legacyFn({ v3: harnessCtx.v3, + v4: harnessCtx.v4, logger, debugUrl: harnessCtx.debugUrl, sessionUrl: harnessCtx.sessionUrl, @@ -117,10 +123,7 @@ export async function executeBenchTask( return withBenchSessionUrls( { _success: false, - error: - error instanceof Error - ? JSON.parse(JSON.stringify(error, null, 2)) - : String(error), + error: error instanceof Error ? error.message : String(error), logs: logger.getLogs(), }, harnessCtx, diff --git a/packages/evals/framework/benchTypes.ts b/packages/evals/framework/benchTypes.ts index 2a3af7cc6..3fce7d950 100644 --- a/packages/evals/framework/benchTypes.ts +++ b/packages/evals/framework/benchTypes.ts @@ -1,18 +1,20 @@ import type { AgentToolMode, AvailableModel } from "@browserbasehq/stagehand"; import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js"; -export type Harness = "stagehand" | "claude_code" | "codex"; +export type Harness = "stagehand_v3" | "stagehand_v4" | "claude_code" | "codex"; -export const DEFAULT_BENCH_HARNESS: Harness = "stagehand"; +export const DEFAULT_BENCH_HARNESS: Harness = "stagehand_v3"; export const SUPPORTED_BENCH_HARNESSES = [ - "stagehand", + "stagehand_v3", + "stagehand_v4", "claude_code", "codex", ] as const satisfies readonly Harness[]; export const EXECUTABLE_BENCH_HARNESSES = [ - "stagehand", + "stagehand_v3", + "stagehand_v4", "claude_code", "codex", ] as const satisfies readonly Harness[]; @@ -42,7 +44,7 @@ export type BenchTaskKind = | "suite"; export interface StagehandHarnessConfig { - 
harness: "stagehand"; + harness: "stagehand_v3" | "stagehand_v4"; model: AvailableModel; provider?: string; environment: "LOCAL" | "BROWSERBASE"; diff --git a/packages/evals/framework/context.ts b/packages/evals/framework/context.ts index daa8eabea..3b09e4fcd 100644 --- a/packages/evals/framework/context.ts +++ b/packages/evals/framework/context.ts @@ -13,7 +13,6 @@ import { type V3InitResult, initV3 } from "../initV3.js"; import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js"; import { coreFixtureRoutes } from "../core/fixtures/index.js"; import { prepareCoreBrowserTarget } from "../core/targets/index.js"; -import { getCoreTool } from "../core/tools/registry.js"; import { ensureCoreFixtureServer } from "../core/fixtures/server.js"; import { EvalLogger } from "../logger.js"; import { createAssertHelpers } from "./assertions.js"; @@ -41,7 +40,7 @@ export function resolveDefaultCoreStartupProfile( return environment === "BROWSERBASE" ? "tool_create_browserbase" : "tool_launch_local"; - case "understudy_code": + case "understudy_v3_code": case "playwright_code": case "cdp_code": case "playwright_mcp": @@ -69,7 +68,8 @@ export async function buildCoreContext( ): Promise { const logger = options.logger ?? new EvalLogger(); const environment = options.environment ?? "LOCAL"; - const toolSurface = options.toolSurface ?? "understudy_code"; + const toolSurface = options.toolSurface ?? "understudy_v3_code"; + const { getCoreTool } = await import("../core/tools/registry.js"); const tool = getCoreTool(toolSurface); const startupProfile = options.startupProfile ?? diff --git a/packages/evals/framework/defineTask.ts b/packages/evals/framework/defineTask.ts index b03d3e037..2f754320b 100644 --- a/packages/evals/framework/defineTask.ts +++ b/packages/evals/framework/defineTask.ts @@ -5,6 +5,8 @@ * the file lives in during auto-discovery. 
*/ import type { + BenchTaskFn, + BenchTaskImplementations, BenchTaskContext, BenchTaskMeta, CoreTaskContext, @@ -34,8 +36,31 @@ export function defineCoreTask( */ export function defineBenchTask( meta: BenchTaskMeta, - fn: (ctx: BenchTaskContext) => Promise, + fn: BenchTaskFn, +): TaskDefinition; +export function defineBenchTask( + meta: BenchTaskMeta, + fn: BenchTaskImplementations, +): TaskDefinition; +export function defineBenchTask( + meta: BenchTaskMeta, + fn: BenchTaskFn | BenchTaskImplementations, ): TaskDefinition { + if (typeof fn !== "function") { + return { + __taskDefinition: true, + meta, + fn: + fn.default ?? + (async () => { + throw new Error( + `No default bench implementation is defined for "${meta.name ?? "unnamed task"}".`, + ); + }), + benchFns: fn, + }; + } + return { __taskDefinition: true, meta, diff --git a/packages/evals/framework/runner.ts b/packages/evals/framework/runner.ts index 336db1c02..8147a8fc1 100644 --- a/packages/evals/framework/runner.ts +++ b/packages/evals/framework/runner.ts @@ -321,7 +321,7 @@ export async function runEvals( (t: DiscoveredTask) => t.tier === "core", ); const effectiveCoreToolSurface = hasCoreOnly - ? (options.coreToolSurface ?? "understudy_code") + ? (options.coreToolSurface ?? 
"understudy_v3_code") : undefined; const effectiveCoreStartupProfile = hasCoreOnly && effectiveCoreToolSurface diff --git a/packages/evals/framework/taskLoader.ts b/packages/evals/framework/taskLoader.ts index d7a218ac9..0aeaa5291 100644 --- a/packages/evals/framework/taskLoader.ts +++ b/packages/evals/framework/taskLoader.ts @@ -7,6 +7,7 @@ export interface LoadedTaskDefinition { __taskDefinition: true; meta: unknown; fn: (ctx: unknown) => Promise; + benchFns?: Record Promise) | undefined>; } export type LegacyTaskFn = (ctx: unknown) => Promise; diff --git a/packages/evals/framework/types.ts b/packages/evals/framework/types.ts index 359605b12..0441b0d98 100644 --- a/packages/evals/framework/types.ts +++ b/packages/evals/framework/types.ts @@ -23,6 +23,8 @@ import type { ToolSurface, } from "../core/contracts/tool.js"; import type { EvalLogger } from "../logger.js"; +import type { Harness } from "./benchTypes.js"; +import type { UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js"; /** Page type inferred from V3.context.pages()[0] */ type Page = ReturnType[number]; @@ -70,6 +72,8 @@ export interface CoreTaskContext { export interface BenchTaskContext { /** Stagehand V3 instance. */ v3: V3; + /** Native Stagehand v4 SDK proxy. Present for the stagehand_v4 harness. */ + v4?: UnderstudyV4NativeRuntime; /** Agent instance (created when the task lives under agent/). */ agent?: AgentInstance; /** Playwright page (convenience — same as v3.context.pages()[0]). */ @@ -128,6 +132,12 @@ export interface MetricsCollector { getSummary(): Record>; } +export type BenchTaskFn = (ctx: BenchTaskContext) => Promise; + +export type BenchTaskImplementations = Partial> & { + default?: BenchTaskFn; +}; + export interface TaskDefinition { /** Marker to identify defineTask outputs during discovery. */ __taskDefinition: true; @@ -135,6 +145,8 @@ export interface TaskDefinition { meta: TaskMeta | BenchTaskMeta; /** The task function. 
*/ fn: (ctx: CoreTaskContext | BenchTaskContext) => Promise; + /** Optional harness-native bench implementations. */ + benchFns?: BenchTaskImplementations; /** Which tier this task was defined for (set during discovery from directory). */ tier?: Tier; } diff --git a/packages/evals/lib/braintrust-report.ts b/packages/evals/lib/braintrust-report.ts index 6fbb0fb99..a86da30e6 100644 --- a/packages/evals/lib/braintrust-report.ts +++ b/packages/evals/lib/braintrust-report.ts @@ -1363,7 +1363,7 @@ export function summarizeBenchCases( function agentConfigKey(benchCase: BenchCaseRow): string { return [ - benchCase.harness ?? "stagehand", + benchCase.harness ?? "stagehand_v3", benchCase.provider ?? "", benchCase.environment ?? "", benchCase.api === undefined ? "" : benchCase.api ? "api" : "local", @@ -1375,7 +1375,7 @@ function agentConfigKey(benchCase: BenchCaseRow): string { function agentConfigLabel(benchCase: BenchCaseRow): string { const parts = [ - benchCase.harness ?? "stagehand", + benchCase.harness ?? 
"stagehand_v3", benchCase.agentMode, benchCase.provider, benchCase.environment, diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts index 7b057e322..357f8764f 100644 --- a/packages/evals/tests/cli.test.ts +++ b/packages/evals/tests/cli.test.ts @@ -85,7 +85,7 @@ describe("CLI entrypoint", () => { expect(payload.envOverrides.EVAL_ENV).toBe("BROWSERBASE"); expect(payload.envOverrides.USE_API).toBe("true"); expect(payload.envOverrides.EVAL_PROVIDER).toBe("openai"); - expect(payload.runOptions.harness).toBe("stagehand"); + expect(payload.runOptions.harness).toBe("stagehand_v3"); expect(payload.runOptions.verbose).toBe(false); }); @@ -173,7 +173,7 @@ describe.sequential("core config", () => { const { stdout, code } = await runCli(["config", "core"]); expect(code).toBe(0); expect(stdout).toContain("Core configuration"); - expect(stdout).toContain("runner default: understudy_code"); + expect(stdout).toContain("runner default: understudy_v3_code"); }); it("persists tool via `config core set tool`", async () => { @@ -183,18 +183,18 @@ describe.sequential("core config", () => { "core", "set", "tool", - "understudy_code", + "understudy_v3_code", ]); expect(setResult.code).toBe(0); - expect(setResult.stdout).toContain("Set core.tool to understudy_code"); + expect(setResult.stdout).toContain("Set core.tool to understudy_v3_code"); const saved = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8")); - expect(saved.core?.tool).toBe("understudy_code"); + expect(saved.core?.tool).toBe("understudy_v3_code"); }); it("flows persisted core.tool into run dry-run output", async () => { resetConfig(); - await runCli(["config", "core", "set", "tool", "understudy_code"]); + await runCli(["config", "core", "set", "tool", "understudy_v3_code"]); const { stdout, code } = await runCli([ "run", @@ -203,7 +203,7 @@ describe.sequential("core config", () => { ]); expect(code).toBe(0); const payload = JSON.parse(stdout); - 
expect(payload.runOptions.coreToolSurface).toBe("understudy_code"); + expect(payload.runOptions.coreToolSurface).toBe("understudy_v3_code"); }); it("rejects unknown tool", async () => { @@ -271,7 +271,7 @@ describe.sequential("core config", () => { it("reset clears the whole core section", async () => { resetConfig(); - await runCli(["config", "core", "set", "tool", "understudy_code"]); + await runCli(["config", "core", "set", "tool", "understudy_v3_code"]); const { code } = await runCli(["config", "core", "reset"]); expect(code).toBe(0); diff --git a/packages/evals/tests/framework/benchHarness.test.ts b/packages/evals/tests/framework/benchHarness.test.ts index 60989664a..4c5ddf7a4 100644 --- a/packages/evals/tests/framework/benchHarness.test.ts +++ b/packages/evals/tests/framework/benchHarness.test.ts @@ -1,15 +1,31 @@ import { describe, expect, it } from "vitest"; import { - claudeCodeHarness, - codexHarness, + ClaudeAgentHarness, + CodexAgentHarness, getBenchHarness, + StagehandAgentV3Harness, + StagehandAgentV4Harness, } from "../../framework/benchHarness.js"; describe("bench harness registry", () => { + it("registers stagehand_v3 as the v3 Stagehand agent harness", () => { + const harness = getBenchHarness("stagehand_v3"); + + expect(harness).toBe(StagehandAgentV3Harness); + expect(harness.supportsApi).toBe(true); + }); + + it("registers stagehand_v4 as the v4 Stagehand agent harness", () => { + const harness = getBenchHarness("stagehand_v4"); + + expect(harness).toBe(StagehandAgentV4Harness); + expect(harness.supportsApi).toBe(false); + }); + it("registers claude_code as a concrete executable harness", () => { const harness = getBenchHarness("claude_code"); - expect(harness).toBe(claudeCodeHarness); + expect(harness).toBe(ClaudeAgentHarness); expect(harness.supportedTaskKinds).toEqual(["agent", "suite"]); expect(harness.supportsApi).toBe(false); expect(harness.execute).toBeDefined(); @@ -18,7 +34,7 @@ describe("bench harness registry", () => { it("registers 
codex as a concrete executable harness", () => { const harness = getBenchHarness("codex"); - expect(harness).toBe(codexHarness); + expect(harness).toBe(CodexAgentHarness); expect(harness.supportedTaskKinds).toEqual(["agent", "suite"]); expect(harness.supportsApi).toBe(false); expect(harness.execute).toBeDefined(); diff --git a/packages/evals/tests/framework/benchPlanner.test.ts b/packages/evals/tests/framework/benchPlanner.test.ts index fdc885c96..5143b9711 100644 --- a/packages/evals/tests/framework/benchPlanner.test.ts +++ b/packages/evals/tests/framework/benchPlanner.test.ts @@ -34,7 +34,7 @@ describe("benchPlanner", () => { ); expect(row).toMatchObject({ - harness: "stagehand", + harness: "stagehand_v3", task: "dropdown", category: "act", taskKind: "act", @@ -43,7 +43,7 @@ describe("benchPlanner", () => { environment: "BROWSERBASE", useApi: true, config: { - harness: "stagehand", + harness: "stagehand_v3", model: "openai/gpt-4.1-mini", provider: "openai", environment: "BROWSERBASE", @@ -55,13 +55,13 @@ describe("benchPlanner", () => { it("annotates generated bench testcases with harness metadata", () => { const [testcase] = generateBenchTestcases([makeTask()], { modelOverride: "openai/gpt-4.1-mini", - harness: "stagehand", + harness: "stagehand_v3", environment: "LOCAL", }); expect(testcase.input.modelName).toBe("openai/gpt-4.1-mini"); - expect(testcase.tags).toContain("harness/stagehand"); - expect(testcase.metadata.harness).toBe("stagehand"); + expect(testcase.tags).toContain("harness/stagehand_v3"); + expect(testcase.metadata.harness).toBe("stagehand_v3"); expect(testcase.metadata.environment).toBe("LOCAL"); }); @@ -78,7 +78,7 @@ describe("benchPlanner", () => { { modelOverride: cuaModel, datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ); @@ -100,7 +100,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-5.4-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ); @@ -123,7 
+123,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentMode: "dom", }, ); @@ -153,7 +153,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], }, ), @@ -191,7 +191,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); @@ -226,7 +226,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], }, ), @@ -265,7 +265,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["cua"], }, ), @@ -296,7 +296,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentMode: "cua", }, ), @@ -323,7 +323,7 @@ describe("benchPlanner", () => { ], { datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["cua"], }, ), @@ -338,7 +338,7 @@ describe("benchPlanner", () => { it("does not expand non-agent model overrides across agent modes", () => { const testcases = generateBenchTestcases([makeTask()], { modelOverride: "openai/gpt-4.1-mini", - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], }); @@ -475,7 +475,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "webvoyager", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); @@ -510,7 +510,7 @@ describe("benchPlanner", () => { { modelOverride: "openai/gpt-4.1-mini", datasetFilter: "onlineMind2Web", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); @@ -541,7 +541,7 @@ describe("benchPlanner", () => { { modelOverride: 
"openai/gpt-4.1-mini", datasetFilter: "webtailbench", - harness: "stagehand", + harness: "stagehand_v3", }, ), ); diff --git a/packages/evals/tests/framework/benchRunner.test.ts b/packages/evals/tests/framework/benchRunner.test.ts index 08245618d..bd4afc8c9 100644 --- a/packages/evals/tests/framework/benchRunner.test.ts +++ b/packages/evals/tests/framework/benchRunner.test.ts @@ -96,7 +96,7 @@ describe("bench runner", () => { tasks: [task], registry: makeRegistry([task]), environment: "BROWSERBASE", - harness: "stagehand", + harness: "stagehand_v3", verbose: false, }, ); diff --git a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts index e28775652..798dd72e4 100644 --- a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts +++ b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts @@ -55,7 +55,7 @@ describe("claude code tool adapter resolution", () => { }); it("rejects unsupported Claude Code tool surfaces for now", () => { - expect(() => resolveClaudeCodeToolSurface("understudy_code")).toThrow( + expect(() => resolveClaudeCodeToolSurface("understudy_v3_code")).toThrow( /supports --tool browse_cli, playwright_code, or cdp_code/, ); }); diff --git a/packages/evals/tests/framework/context.test.ts b/packages/evals/tests/framework/context.test.ts index 742378635..c6683afe4 100644 --- a/packages/evals/tests/framework/context.test.ts +++ b/packages/evals/tests/framework/context.test.ts @@ -4,9 +4,9 @@ import { prepareCoreBrowserTarget } from "../../core/targets/index.js"; describe("resolveDefaultCoreStartupProfile", () => { it("uses runner-provided local CDP for code surfaces in LOCAL", () => { - expect(resolveDefaultCoreStartupProfile("understudy_code", "LOCAL")).toBe( - "runner_provided_local_cdp", - ); + expect( + resolveDefaultCoreStartupProfile("understudy_v3_code", "LOCAL"), + ).toBe("runner_provided_local_cdp"); expect(resolveDefaultCoreStartupProfile("playwright_code", 
"LOCAL")).toBe( "runner_provided_local_cdp", ); @@ -29,7 +29,7 @@ describe("resolveDefaultCoreStartupProfile", () => { it("uses runner-provided Browserbase CDP for code surfaces in BROWSERBASE", () => { expect( - resolveDefaultCoreStartupProfile("understudy_code", "BROWSERBASE"), + resolveDefaultCoreStartupProfile("understudy_v3_code", "BROWSERBASE"), ).toBe("runner_provided_browserbase_cdp"); expect( resolveDefaultCoreStartupProfile("playwright_code", "BROWSERBASE"), @@ -55,7 +55,7 @@ describe("resolveDefaultCoreStartupProfile", () => { await expect( prepareCoreBrowserTarget({ environment: "BROWSERBASE", - toolSurface: "understudy_code", + toolSurface: "understudy_v3_code", startupProfile: "runner_provided_local_cdp", }), ).rejects.toThrow(/requires LOCAL environment/); @@ -65,7 +65,7 @@ describe("resolveDefaultCoreStartupProfile", () => { await expect( prepareCoreBrowserTarget({ environment: "LOCAL", - toolSurface: "understudy_code", + toolSurface: "understudy_v3_code", startupProfile: "runner_provided_browserbase_cdp", }), ).rejects.toThrow(/requires BROWSERBASE environment/); diff --git a/packages/evals/tests/framework/core-runner.test.ts b/packages/evals/tests/framework/core-runner.test.ts index 6d2879fb4..242978f9a 100644 --- a/packages/evals/tests/framework/core-runner.test.ts +++ b/packages/evals/tests/framework/core-runner.test.ts @@ -122,7 +122,7 @@ describe("core runner", () => { }, startupProfile: "runner_provided_local_cdp", adapter: { - name: "understudy_code", + name: "understudy_v3_code", family: "understudy", surface: "code", metadata: { @@ -183,7 +183,7 @@ describe("core runner", () => { concurrency: 1, trials: 1, environment: "LOCAL", - coreToolSurface: "understudy_code", + coreToolSurface: "understudy_v3_code", coreStartupProfile: "runner_provided_local_cdp", }); diff --git a/packages/evals/tests/framework/defineTask.test.ts b/packages/evals/tests/framework/defineTask.test.ts index c531d676d..01ae3abda 100644 --- 
a/packages/evals/tests/framework/defineTask.test.ts +++ b/packages/evals/tests/framework/defineTask.test.ts @@ -44,6 +44,25 @@ describe("defineBenchTask", () => { expect((result.meta as any).models).toEqual(["openai/gpt-4o"]); }); + + it("preserves harness-native bench implementations", async () => { + const stagehandV3 = vi.fn(async () => ({ _success: true, version: 3 })); + const stagehandV4 = vi.fn(async () => ({ _success: true, version: 4 })); + const result = defineBenchTask( + { name: "native_versions" }, + { + stagehand_v3: stagehandV3 as any, + stagehand_v4: stagehandV4 as any, + }, + ); + + await expect(result.benchFns?.stagehand_v4?.({} as any)).resolves.toEqual({ + _success: true, + version: 4, + }); + expect(stagehandV3).toHaveBeenCalledTimes(0); + expect(stagehandV4).toHaveBeenCalledTimes(1); + }); }); describe("defineTask", () => { diff --git a/packages/evals/tests/tui/parse.test.ts b/packages/evals/tests/tui/parse.test.ts index bb42c9fc6..532057ad6 100644 --- a/packages/evals/tests/tui/parse.test.ts +++ b/packages/evals/tests/tui/parse.test.ts @@ -18,7 +18,7 @@ describe("resolveRunOptions", () => { it("defaults to the stagehand bench harness", () => { const resolved = resolveRunOptions({}, {}, {}); - expect(resolved.harness).toBe("stagehand"); + expect(resolved.harness).toBe("stagehand_v3"); }); it("accepts known bench harnesses", () => { diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts index 36be3e1aa..5f8a3fd83 100644 --- a/packages/evals/tests/tui/run.test.ts +++ b/packages/evals/tests/tui/run.test.ts @@ -115,7 +115,7 @@ describe("deriveCategoryFilter", () => { concurrency: 1, environment: "LOCAL", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", envOverrides: {}, dryRun: true, preview: false, @@ -149,7 +149,7 @@ describe("deriveCategoryFilter", () => { environment: "BROWSERBASE", model: "openai/gpt-4.1-mini", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", datasetFilter: 
"webvoyager", envOverrides: { EVAL_MAX_K: "1", @@ -169,7 +169,7 @@ describe("deriveCategoryFilter", () => { task: "agent/webvoyager", dataset: "webvoyager", model: "openai/gpt-4.1-mini", - harness: "stagehand", + harness: "stagehand_v3", agentMode: "dom", environment: "BROWSERBASE", useApi: false, @@ -195,7 +195,7 @@ describe("deriveCategoryFilter", () => { environment: "BROWSERBASE", model: "openai/gpt-4.1-mini", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], datasetFilter: "webvoyager", envOverrides: { @@ -413,7 +413,8 @@ describe("deriveCategoryFilter", () => { }); it("allows executable harnesses without env gates", () => { - expect(canExecuteBenchHarness("stagehand")).toBe(true); + expect(canExecuteBenchHarness("stagehand_v3")).toBe(true); + expect(canExecuteBenchHarness("stagehand_v4")).toBe(true); expect(canExecuteBenchHarness("claude_code")).toBe(true); expect(canExecuteBenchHarness("codex")).toBe(true); }); @@ -442,7 +443,7 @@ describe("deriveCategoryFilter", () => { environment: "BROWSERBASE", model: "openai/gpt-4.1-mini", useApi: false, - harness: "stagehand", + harness: "stagehand_v3", agentModes: ["dom", "hybrid"], envOverrides: {}, dryRun: false, @@ -460,7 +461,7 @@ describe("deriveCategoryFilter", () => { "Plan: 2 tasks × 1 model × 2 modes × 4 trials = 16 runs", ); expect(output).toContain( - "Env: BROWSERBASE Harness: stagehand Concurrency: 25", + "Env: BROWSERBASE Harness: stagehand_v3 Concurrency: 25", ); expect(runEvalsMock).toHaveBeenCalledOnce(); }); @@ -516,7 +517,7 @@ describe("buildCombinations (preview column-pruning)", () => { category: null, dataset: null, model, - harness: "stagehand", + harness: "stagehand_v3", agentMode, environment: "BROWSERBASE", useApi: false, diff --git a/packages/evals/tui/commands/core.ts b/packages/evals/tui/commands/core.ts index 409fb8c28..45cf7b7f1 100644 --- a/packages/evals/tui/commands/core.ts +++ b/packages/evals/tui/commands/core.ts @@ -88,7 +88,7 @@ 
export function printCoreConfig(entryDir: string): void { console.log(`\n ${bold("Core configuration:")}\n`); console.log( - ` ${cyan("tool")} ${core.tool ?? gray("(runner default: understudy_code)")}`, + ` ${cyan("tool")} ${core.tool ?? gray("(runner default: understudy_v3_code)")}`, ); console.log( ` ${cyan("startup")} ${core.startup ?? gray("(inferred from tool + env)")}`, @@ -147,7 +147,9 @@ async function setCoreKey( console.error( red(" Cannot set startup without a tool. Set core.tool first."), ); - console.log(dim(` Example: evals core config set tool understudy_code`)); + console.log( + dim(` Example: evals core config set tool understudy_v3_code`), + ); process.exitCode = 1; return; } diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts index 95b49c766..9ef90a4ff 100644 --- a/packages/evals/tui/commands/help.ts +++ b/packages/evals/tui/commands/help.ts @@ -79,7 +79,7 @@ export function printRunHelp(): void { "", row( `${cyan("--tool")} ${dim("")}`, - `Core tool surface ${gray("(understudy_code, playwright_code, ...)")}`, + `Core tool surface ${gray("(understudy_v3_code, playwright_code, ...)")}`, ), row(`${cyan("--startup")} ${dim("")}`, "Core startup profile"), "", @@ -121,7 +121,7 @@ export function printRunHelp(): void { ` ${bold("Examples:")}`, "", ` ${dim("$")} evals run act -t 3 -c 5`, - ` ${dim("$")} evals run navigation/open --tool understudy_code`, + ` ${dim("$")} evals run navigation/open --tool understudy_v3_code`, ` ${dim("$")} evals run b:webvoyager -l 10`, ` ${dim("$")} evals run b:onlineMind2Web -l 25`, ` ${dim("$")} evals run b:webtailbench -l 10`, @@ -200,12 +200,12 @@ export function printConfigHelp(): void { ), row(cyan("setup"), `Interactive wizard ${gray("(coming soon)")}`), "", - ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`, + ` ${bold("Valid core tools:")} ${gray("understudy_v3_code, playwright_code, cdp_code, 
playwright_mcp, chrome_devtools_mcp, browse_cli")}`, "", ` ${bold("Examples:")}`, "", ` ${dim("$")} evals config set trials 5`, - ` ${dim("$")} evals config core set tool understudy_code`, + ` ${dim("$")} evals config core set tool understudy_v3_code`, ` ${dim("$")} evals config core set startup tool_launch_local`, ` ${dim("$")} evals config core reset`, "", diff --git a/packages/evals/tui/commands/run.ts b/packages/evals/tui/commands/run.ts index 01df1fa45..6b460ea17 100644 --- a/packages/evals/tui/commands/run.ts +++ b/packages/evals/tui/commands/run.ts @@ -23,6 +23,7 @@ import type { ResolvedRunOptions } from "./parse.js"; import { withEnvOverrides } from "./parse.js"; import { getRuntimeTasksRoot } from "../../runtimePaths.js"; import { + DEFAULT_BENCH_HARNESS, isExecutableBenchHarness, type Harness, } from "../../framework/benchTypes.js"; @@ -239,11 +240,11 @@ export async function runCommand( if ( options.useApi && - options.harness !== "stagehand" && + (options.harness ?? DEFAULT_BENCH_HARNESS) !== "stagehand_v3" && tasks.some((t) => t.tier === "bench") ) { throw new Error( - `Harness "${options.harness}" does not support --api. Use --harness stagehand for API-backed bench runs.`, + `Harness "${options.harness}" does not support --api. Use --harness stagehand_v3 for API-backed bench runs.`, ); } @@ -257,9 +258,18 @@ export async function runCommand( tasks.some((t) => t.tier === "bench") ) { throw new Error( - `Harness "${options.harness}" is dry-run only for now. Use --harness stagehand, --harness claude_code, or --harness codex for executable bench runs.`, + `Harness "${options.harness}" is dry-run only for now. 
Use --harness stagehand_v3, --harness stagehand_v4, --harness claude_code, or --harness codex for executable bench runs.`, ); } + if ( + options.harness === "stagehand_v4" && + tasks.some((t) => t.tier === "bench") + ) { + const { assertUnderstudyV4SdkAvailable } = await import( + "../../framework/UnderstudyV4Tools.js" + ); + assertUnderstudyV4SdkAvailable(); + } const matrix = await buildDryRunMatrix(options, tasks, registry); console.log(`\n ${bold("Running:")} ${cyan(buildRunTargetLabel(options))}`);