diff --git a/packages/evals/ARCHITECTURE.mmd b/packages/evals/ARCHITECTURE.mmd
index 97e87aac7..c1c312f26 100644
--- a/packages/evals/ARCHITECTURE.mmd
+++ b/packages/evals/ARCHITECTURE.mmd
@@ -48,7 +48,7 @@ flowchart TB
CoreContext["framework/context.ts
buildCoreContext"]
FixtureServer["core/fixtures
local deterministic pages"]
CoreTargets["core/targets
local Chrome
Browserbase CDP"]
- CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"]
+ CoreTools["core/tools registry
understudy_v3_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"]
CoreAssertions["assertions + metrics
adapter-backed results"]
CoreDeps["core/runtime/coreDeps.ts
browserbase + ws
lazy require"]
end
diff --git a/packages/evals/core/contracts/tool.ts b/packages/evals/core/contracts/tool.ts
index bd1d366d8..0790e81d6 100644
--- a/packages/evals/core/contracts/tool.ts
+++ b/packages/evals/core/contracts/tool.ts
@@ -17,7 +17,8 @@ import type {
} from "./results.js";
export type ToolSurface =
- | "understudy_code"
+ | "understudy_v3_code"
+ | "understudy_v4_code"
| "playwright_code"
| "cdp_code"
| "playwright_mcp"
diff --git a/packages/evals/core/tools/registry.ts b/packages/evals/core/tools/registry.ts
index 65384f137..81f29395c 100644
--- a/packages/evals/core/tools/registry.ts
+++ b/packages/evals/core/tools/registry.ts
@@ -4,11 +4,11 @@ import { CdpCodeTool } from "./cdp_code.js";
import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js";
import { PlaywrightCodeTool } from "./playwright_code.js";
import { PlaywrightMcpTool } from "./playwright_mcp.js";
-import { UnderstudyCodeTool } from "./understudy_code.js";
+import { UnderstudyV3CodeTool } from "./understudy_v3_code.js";
export function listCoreTools(): ToolSurface[] {
return [
- "understudy_code",
+ "understudy_v3_code",
"playwright_code",
"cdp_code",
"playwright_mcp",
@@ -19,8 +19,8 @@ export function listCoreTools(): ToolSurface[] {
export function getCoreTool(toolSurface: ToolSurface): CoreTool {
switch (toolSurface) {
- case "understudy_code":
- return new UnderstudyCodeTool();
+ case "understudy_v3_code":
+ return new UnderstudyV3CodeTool();
case "playwright_code":
return new PlaywrightCodeTool();
case "cdp_code":
diff --git a/packages/evals/core/tools/understudy_code.ts b/packages/evals/core/tools/understudy_v3_code.ts
similarity index 96%
rename from packages/evals/core/tools/understudy_code.ts
rename to packages/evals/core/tools/understudy_v3_code.ts
index 80709d729..20834007c 100644
--- a/packages/evals/core/tools/understudy_code.ts
+++ b/packages/evals/core/tools/understudy_v3_code.ts
@@ -222,7 +222,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
- `understudy_code does not support click target kind "${target.kind}" yet`,
+ `understudy_v3_code does not support click target kind "${target.kind}" yet`,
);
}
}
@@ -253,7 +253,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
- `understudy_code does not support hover target kind "${target.kind}" yet`,
+ `understudy_v3_code does not support hover target kind "${target.kind}" yet`,
);
}
}
@@ -298,7 +298,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
- `understudy_code does not support type target kind "${target.kind}" yet`,
+ `understudy_v3_code does not support type target kind "${target.kind}" yet`,
);
}
}
@@ -335,7 +335,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
- `understudy_code does not support press target kind "${target.kind}" yet`,
+ `understudy_v3_code does not support press target kind "${target.kind}" yet`,
);
}
}
@@ -462,8 +462,8 @@ function connectionModeFromProfile(
return "launch";
}
-export class UnderstudyCodeTool implements CoreTool {
- readonly id = "understudy_code";
+export class UnderstudyV3CodeTool implements CoreTool {
+ readonly id = "understudy_v3_code";
readonly surface = "code";
readonly family = "understudy";
readonly supportedStartupProfiles: StartupProfile[] = [
@@ -485,7 +485,7 @@ export class UnderstudyCodeTool implements CoreTool {
async start(input: ToolStartInput): Promise {
if (input.startupProfile === "tool_attach_local_cdp") {
throw new Error(
- `understudy_code does not support startup profile "${input.startupProfile}" yet`,
+ `understudy_v3_code does not support startup profile "${input.startupProfile}" yet`,
);
}
diff --git a/packages/evals/framework/ClaudeAgentHarness.ts b/packages/evals/framework/ClaudeAgentHarness.ts
new file mode 100644
index 000000000..f13ecb305
--- /dev/null
+++ b/packages/evals/framework/ClaudeAgentHarness.ts
@@ -0,0 +1,52 @@
+import { EvalsError } from "../errors.js";
+import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
+import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
+import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
+import type {
+ BenchHarness,
+ BenchHarnessExecuteInput,
+ StartedBenchHarness,
+} from "./benchHarness.js";
+import type { TaskResult } from "./types.js";
+
+export const ClaudeAgentHarness: BenchHarness = { // Bench harness entry for "claude_code"; all work happens in execute().
+ harness: "claude_code",
+ supportedTaskKinds: ["agent", "suite"],
+ supportsApi: false,
+ async execute({ // Plans the task, prepares the tool adapter, runs the agent, and always cleans the adapter up.
+ input,
+ row,
+ logger,
+ signal,
+ }: BenchHarnessExecuteInput): Promise {
+ const plan = buildExternalHarnessTaskPlan(input);
+ if (row.config.harness !== "claude_code") { // Guard: the row must be configured for this harness.
+ throw new EvalsError(
+ `Expected claude_code harness config, received "${row.config.harness}".`,
+ );
+ }
+ const toolAdapter = await prepareClaudeCodeToolAdapter({ // Built from the row's tool surface, startup profile and environment.
+ toolSurface: row.config.toolSurface,
+ startupProfile: row.config.startupProfile,
+ environment: row.config.environment,
+ plan,
+ logger,
+ });
+ try {
+ return await runClaudeCodeAgent({
+ plan,
+ model: input.modelName,
+ logger,
+ toolAdapter,
+ signal,
+ });
+ } finally { // Runs whether runClaudeCodeAgent resolves or throws.
+ await toolAdapter.cleanup();
+ }
+ },
+ async start(): Promise { // Not supported for this harness: always throws an EvalsError.
+ throw new EvalsError(
+ "Claude Code harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness claude_code.",
+ );
+ },
+};
diff --git a/packages/evals/framework/CodexAgentHarness.ts b/packages/evals/framework/CodexAgentHarness.ts
new file mode 100644
index 000000000..fac8c2a31
--- /dev/null
+++ b/packages/evals/framework/CodexAgentHarness.ts
@@ -0,0 +1,52 @@
+import { EvalsError } from "../errors.js";
+import { runCodexAgent } from "./codexRunner.js";
+import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
+import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
+import type {
+ BenchHarness,
+ BenchHarnessExecuteInput,
+ StartedBenchHarness,
+} from "./benchHarness.js";
+import type { TaskResult } from "./types.js";
+
+export const CodexAgentHarness: BenchHarness = { // Bench harness entry for "codex"; all work happens in execute().
+ harness: "codex",
+ supportedTaskKinds: ["agent", "suite"],
+ supportsApi: false,
+ async execute({ // Plans the task, prepares the tool adapter, runs the agent, and always cleans the adapter up.
+ input,
+ row,
+ logger,
+ signal,
+ }: BenchHarnessExecuteInput): Promise {
+ const plan = buildExternalHarnessTaskPlan(input);
+ if (row.config.harness !== "codex") { // Guard: the row must be configured for this harness.
+ throw new EvalsError(
+ `Expected codex harness config, received "${row.config.harness}".`,
+ );
+ }
+ const toolAdapter = await prepareCodexToolAdapter({ // Built from the row's tool surface, startup profile and environment.
+ toolSurface: row.config.toolSurface,
+ startupProfile: row.config.startupProfile,
+ environment: row.config.environment,
+ plan,
+ logger,
+ });
+ try {
+ return await runCodexAgent({
+ plan,
+ model: input.modelName,
+ logger,
+ toolAdapter,
+ signal,
+ });
+ } finally { // Runs whether runCodexAgent resolves or throws.
+ await toolAdapter.cleanup();
+ }
+ },
+ async start(): Promise { // Not supported for this harness: always throws an EvalsError.
+ throw new EvalsError(
+ "Codex harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness codex.",
+ );
+ },
+};
diff --git a/packages/evals/framework/StagehandAgentV3Harness.ts b/packages/evals/framework/StagehandAgentV3Harness.ts
new file mode 100644
index 000000000..9d869fd48
--- /dev/null
+++ b/packages/evals/framework/StagehandAgentV3Harness.ts
@@ -0,0 +1,139 @@
+import {
+ AgentProvider,
+ getAISDKLanguageModel,
+ loadApiKeyFromEnv,
+ type AvailableModel,
+ type LLMClient,
+ type LogLine,
+} from "@browserbasehq/stagehand";
+import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
+import { endBrowserbaseSession } from "../browserbaseCleanup.js";
+import { EvalsError } from "../errors.js";
+import type { V3InitResult } from "../initV3.js";
+import type {
+ BenchHarness,
+ BenchHarnessStartInput,
+ StartedBenchHarness,
+} from "./benchHarness.js";
+import type { DiscoveredTask } from "./types.js";
+
+function isAgentTask(task: DiscoveredTask): boolean { // True when the task is agent-flavoured by primary category or by category list.
+ return (
+ task.primaryCategory === "agent" ||
+ task.categories.includes("agent") ||
+ task.categories.includes("external_agent_benchmarks")
+ );
+}
+
+function resolveProvider(modelName: AvailableModel): string | undefined { // Best-effort provider lookup for a model name.
+ if (modelName.includes("/")) {
+ return modelName.split("/")[0]; // "provider/model" form: the provider is the text before the first slash.
+ }
+
+ try {
+ return AgentProvider.getAgentProvider(modelName);
+ } catch { // A failed lookup is not fatal here; the caller receives undefined.
+ return undefined;
+ }
+}
+
+export const StagehandAgentV3Harness: BenchHarness = { // Bench harness entry for "stagehand_v3": start() initializes V3 and returns a context plus cleanup.
+ harness: "stagehand_v3",
+ supportedTaskKinds: [
+ "act",
+ "extract",
+ "observe",
+ "agent",
+ "combination",
+ "suite",
+ ],
+ supportsApi: true,
+ async start({
+ task,
+ input,
+ row,
+ logger,
+ verbose,
+ }: BenchHarnessStartInput): Promise {
+ let v3Result: V3InitResult | undefined;
+ const createAgent = isAgentTask(task); // Only agent-category tasks ask initV3 for an agent.
+ if (row.config.harness !== "stagehand_v3") { // Guard: the row must be configured for this harness.
+ throw new EvalsError(
+ `Expected stagehand_v3 harness config, received "${row.config.harness}".`,
+ );
+ }
+ const config = row.config;
+ const agentMode = config.agentMode ?? input.agentMode; // Row config wins; the task input is the fallback.
+ const isCUA = config.isCUA ?? input.isCUA;
+
+ if (config.useApi) { // API mode: an API key for the model's provider is mandatory.
+ const provider = resolveProvider(input.modelName);
+ const logFn = (line: LogLine) => logger.log(line);
+ const apiKey = loadApiKeyFromEnv(provider, logFn);
+ if (!apiKey) {
+ throw new EvalsError(
+ `USE_API=true but no API key found for provider "${provider}".`,
+ );
+ }
+ const { initV3 } = await import("../initV3.js"); // Loaded lazily via dynamic import.
+ v3Result = await initV3({
+ logger,
+ modelName: input.modelName,
+ modelClientOptions: { apiKey },
+ createAgent,
+ agentMode,
+ isCUA,
+ verbose,
+ configOverrides: { env: config.environment },
+ });
+ } else { // Non-API mode: "provider/model" names get an AI SDK client; other names pass llmClient as undefined.
+ let llmClient: LLMClient | undefined;
+ if (input.modelName.includes("/")) {
+ const firstSlashIndex = input.modelName.indexOf("/"); // Split on the first slash only; the model part may contain more.
+ llmClient = new AISdkClientWrapped({
+ model: getAISDKLanguageModel(
+ input.modelName.substring(0, firstSlashIndex),
+ input.modelName.substring(firstSlashIndex + 1),
+ ),
+ });
+ }
+ const { initV3 } = await import("../initV3.js"); // Loaded lazily via dynamic import.
+ v3Result = await initV3({
+ logger,
+ llmClient,
+ modelName: input.modelName,
+ createAgent,
+ agentMode,
+ isCUA,
+ verbose,
+ configOverrides: { env: config.environment },
+ });
+ }
+
+ return {
+ ctx: {
+ harness: "stagehand_v3",
+ row,
+ logger,
+ v3: v3Result.v3,
+ agent: v3Result.agent,
+ page: v3Result.v3.context.pages()[0], // First page of the V3 context.
+ debugUrl: v3Result.debugUrl ?? "",
+ sessionUrl: v3Result.sessionUrl ?? "",
+ },
+ cleanup: async () => { // Closes V3 (close errors are logged, not rethrown), then ends the Browserbase session.
+ if (v3Result?.v3) {
+ try {
+ await v3Result.v3.close();
+ } catch (closeError) {
+ console.error(
+ `Warning: Error closing V3 instance for ${input.name}:`,
+ closeError,
+ );
+ }
+ }
+ await endBrowserbaseSession(v3Result?.v3);
+ },
+ };
+ },
+};
diff --git a/packages/evals/framework/StagehandAgentV4Harness.ts b/packages/evals/framework/StagehandAgentV4Harness.ts
new file mode 100644
index 000000000..eec48977e
--- /dev/null
+++ b/packages/evals/framework/StagehandAgentV4Harness.ts
@@ -0,0 +1,1129 @@
+import {
+ getAISDKLanguageModel,
+ type AgentInstance,
+ type LLMClient,
+ type LocalBrowserLaunchOptions,
+ type V3,
+} from "@browserbasehq/stagehand";
+import { z } from "zod";
+import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
+import { endBrowserbaseSession } from "../browserbaseCleanup.js";
+import { EvalsError } from "../errors.js";
+import type { V3InitResult } from "../initV3.js";
+import {
+ startUnderstudyV4Tools,
+ type UnderstudyV4NativeRuntime,
+} from "./UnderstudyV4Tools.js";
+import type {
+ BenchHarness,
+ BenchHarnessStartInput,
+ BenchHarnessContext,
+ StartedBenchHarness,
+} from "./benchHarness.js";
+
+type Page = ReturnType[number]; // Type the v4 page facade is cast to when it is placed on the harness context.
+type StagehandV4LoadState = // Load-state names used by the v4 facade.
+ | "init"
+ | "domcontentloaded"
+ | "loaded"
+ | "networkidle2"
+ | "networkidle";
+
+const STAGEHAND_V4_LOAD_STATE_ORDER: Record = { // Numeric rank per load state, ascending from "init" to "networkidle".
+ init: 0,
+ domcontentloaded: 1,
+ loaded: 2,
+ networkidle2: 3,
+ networkidle: 4,
+};
+
+function isInternalStagehandV4PageUrl(url: string | undefined): boolean { // True for a missing URL, about:blank, or chrome:// and chrome-<word>:// URLs.
+ return (
+ url == null ||
+ url === "about:blank" ||
+ /^chrome(?:-[a-z]+)?:\/\//u.test(url)
+ );
+}
+
+type StagehandV4PageState = { // Mutable snapshot of one page, kept by the page facades in this file.
+ targetId?: string; // Absent until the runtime reports a target id for the page.
+ title: string;
+ url: string;
+ loadState?: StagehandV4LoadState;
+ frames: StagehandV4FrameState[]; // Replaced wholesale each time the frame tree is refreshed.
+};
+
+type StagehandV4FrameState = { // One frame collected from a page's frame tree.
+ frameId: string;
+ targetId?: string;
+ url?: string;
+};
+
+type StagehandV4HistoryEntry = { // One recorded facade call (method, its parameters, and its result).
+ method: string;
+ parameters: unknown;
+ result: unknown;
+ timestamp: string; // ISO-8601 string produced with Date#toISOString.
+};
+
+const STAGEHAND_V4_PAGE_STATE = Symbol("stagehand_v4_page_state"); // Property key under which a page facade exposes its StagehandV4PageState.
+
+function isAgentTask(task: BenchHarnessStartInput["task"]): boolean { // True when the task is agent-flavoured by primary category or by category list.
+ return (
+ task.primaryCategory === "agent" ||
+ task.categories.includes("agent") ||
+ task.categories.includes("external_agent_benchmarks")
+ );
+}
+
+export const StagehandAgentV4Harness: BenchHarness = { // Bench harness entry for "stagehand_v4": the v3 agent loop driven through the v4 tool runtime.
+ harness: "stagehand_v4",
+ supportedTaskKinds: [
+ "act",
+ "extract",
+ "observe",
+ "agent",
+ "combination",
+ "suite",
+ ],
+ supportsApi: false,
+ async start({ // Validates the row, starts the v4 tools, attaches V3 to their CDP URL, and returns ctx + cleanup.
+ task,
+ input,
+ row,
+ logger,
+ verbose,
+ }: BenchHarnessStartInput): Promise {
+ if (row.config.harness !== "stagehand_v4") { // Guard: the row must be configured for this harness.
+ throw new EvalsError(
+ `Expected stagehand_v4 harness config, received "${row.config.harness}".`,
+ );
+ }
+ if (row.config.toolSurface !== "understudy_v4_code") { // Guard: only the v4 tool surface is supported here.
+ throw new EvalsError(
+ `StagehandAgentV4Harness requires --tool understudy_v4_code; received "${row.config.toolSurface ?? "default"}".`,
+ );
+ }
+ if (row.config.useApi) { // Guard: API mode is rejected (supportsApi is false).
+ throw new EvalsError(
+ "stagehand_v4 must run locally so the v3 agent loop can call the live v4 SDK protocol tools.",
+ );
+ }
+
+ // This is intentionally still the v3 agent loop. The v4 part is the SDK
+ // launcher/tool catalog/dispatch surface that replaces the v3 agent tools.
+ const createAgent = isAgentTask(task);
+ const understudyV4Tools = await startUnderstudyV4Tools({ // Started first so initV3 can attach to its cdpUrl below.
+ environment: row.config.environment,
+ logger,
+ });
+ let v3Result: V3InitResult | undefined;
+ let printedV4BusLogTree = false;
+ const printV4BusLogTree = async (): Promise => { // Logs the v4 bus log tree; failures are downgraded to a warning.
+ if (!verbose || printedV4BusLogTree) return; // Verbose runs only, and at most once per start().
+ printedV4BusLogTree = true;
+ try {
+ const result = (await understudyV4Tools.stagehandV4.cdp.Mod.evaluate({
+ expression: `async () => {
+ const readLogTree = globalThis.__stagehandBusLogTree;
+ if (typeof readLogTree !== "function") {
+ return { error: "globalThis.__stagehandBusLogTree is not available" };
+ }
+ return await readLogTree(params.stagehand_session_id);
+ }`,
+ params: {
+ stagehand_session_id: understudyV4Tools.stagehand_session_id,
+ },
+ })) as { error?: unknown; logTree?: unknown };
+ logger.log({
+ category: "understudy_v4_code",
+ message:
+ typeof result.logTree === "string"
+ ? `v4 bus.logTree()\n${result.logTree}`
+ : `v4 bus.logTree() unavailable: ${String(
+ result.error ?? "Mod.evaluate did not return logTree.",
+ )}`,
+ level: 1,
+ });
+ } catch (dashboardError) {
+ logger.warn({
+ category: "understudy_v4_code",
+ message: `Unable to print v4 bus.logTree(): ${
+ dashboardError instanceof Error
+ ? dashboardError.message
+ : String(dashboardError)
+ }`,
+ level: 1,
+ });
+ }
+ };
+
+ try { // Anything failing from here on tears down V3 and the v4 tools before rethrowing (see catch).
+ let llmClient: LLMClient | undefined;
+ if (input.modelName.includes("/")) {
+ const firstSlashIndex = input.modelName.indexOf("/"); // Split on the first slash only; the model part may contain more.
+ llmClient = new AISdkClientWrapped({
+ model: getAISDKLanguageModel(
+ input.modelName.substring(0, firstSlashIndex),
+ input.modelName.substring(firstSlashIndex + 1),
+ ),
+ });
+ }
+
+ const localBrowserLaunchOptions = {
+ cdpUrl: understudyV4Tools.cdpUrl,
+ } satisfies Partial;
+ const { initV3 } = await import("../initV3.js"); // Loaded lazily via dynamic import.
+ v3Result = await initV3({
+ logger,
+ llmClient,
+ modelName: input.modelName,
+ createAgent: false, // The agent is created below with the v4 tools instead.
+ agentMode: row.config.agentMode ?? input.agentMode,
+ isCUA: row.config.isCUA ?? input.isCUA,
+ verbose,
+ configOverrides: {
+ env: "LOCAL",
+ localBrowserLaunchOptions,
+ experimental: true,
+ },
+ });
+ const closeV3 = v3Result.v3.close.bind(v3Result.v3);
+ v3Result.v3.close = async () => { // Wrap close() so the bus log tree is printed before V3 shuts down.
+ await printV4BusLogTree();
+ return await closeV3();
+ };
+ const v4Page = await installStagehandV4BenchFacade(
+ v3Result.v3,
+ understudyV4Tools.stagehandV4,
+ input.modelName,
+ );
+
+ if (createAgent) {
+ v3Result.agent = v3Result.v3.agent({
+ model: input.modelName,
+ mode: "dom",
+ tools: understudyV4Tools.tools,
+ systemPrompt: buildStagehandAgentV4SystemPrompt(
+ understudyV4Tools.toolCatalog,
+ ),
+ }) as AgentInstance;
+ }
+
+ const ctx: BenchHarnessContext = {
+ harness: "stagehand_v4",
+ row,
+ logger,
+ v3: v3Result.v3,
+ v4: understudyV4Tools.stagehandV4,
+ agent: v3Result.agent,
+ page: v4Page as unknown as Page,
+ debugUrl: v3Result.debugUrl ?? "",
+ sessionUrl: v3Result.sessionUrl ?? "",
+ };
+
+ return {
+ ctx,
+ cleanup: async () => {
+ await printV4BusLogTree();
+ // try/finally: the v4 tools are released even if ending the Browserbase session throws.
+ try {
+ await v3Result?.v3?.close().catch((closeError: unknown) => {
+ console.error(
+ `Warning: Error closing V3 instance for ${input.name}:`,
+ closeError,
+ );
+ });
+ await endBrowserbaseSession(v3Result?.v3);
+ } finally {
+ await understudyV4Tools.cleanup();
+ }
+ },
+ };
+ } catch (error) { // Best-effort teardown; teardown errors are ignored so the original error is the one rethrown.
+ if (v3Result?.v3) await v3Result.v3.close().catch(() => {});
+ await understudyV4Tools.cleanup().catch(() => {});
+ throw error;
+ }
+ },
+};
+
+function buildStagehandAgentV4SystemPrompt( // Builds the agent system prompt: fixed guidance lines followed by one bullet per tool definition.
+ toolCatalog: Record[],
+): string {
+ return [
+ "You are using Stagehand v4 protocol tools through the existing Stagehand agent loop.",
+ "The callable tool schemas are the source of truth. They are v4 event payload schemas, not the older v3 agent wrapper schemas.",
+ "",
+ "Selector rules:",
+ "- Selectors are partial hints. You may pass only elementId, only xpath, only css, only text, only coordinates, or any useful subset.",
+ "- The browser hydrates selectors before use, so do not invent missing selector fields.",
+ "- Prefer elementId from the page summary tree when it is available. Coordinates are valid when they are the clearest available selector.",
+ "- Deep XPath can pierce frames and shadow roots, for example /body/div[3]/iframe[2]/body/iframe[2]/button.",
+ "",
+ "Page context:",
+ "- Use the derived page summary tool to get current DOM/accessibility context and element ids.",
+ "- Use the derived screenshot tool when visual confirmation or coordinates are needed.",
+ "- When you already have a selector and a concrete operation, prefer the direct browser action tool for that operation.",
+ "- If you use act with an action object, follow the action schema exactly.",
+ "",
+ "Available v4 tools:",
+ ...toolCatalog.map((definition) => {
+ const name = // Non-string names are rendered as "unknown".
+ typeof definition.name === "string" ? definition.name : "unknown";
+ const description = // A missing description falls back to the tool name.
+ typeof definition.description === "string"
+ ? definition.description
+ : name;
+ return `- ${name}: ${description}`;
+ }),
+ ].join("\n"); // Lines are joined with newlines into a single prompt string.
+}
+
+async function installStagehandV4BenchFacade( // Patches v3 (context.pages/awaitActivePage, history, observe/act/extract) to call the v4 runtime; returns the primary page facade.
+ v3: V3,
+ stagehandV4: UnderstudyV4NativeRuntime,
+ modelName: string,
+): Promise> {
+ const pageState: StagehandV4PageState = { // State backing the primary page facade returned by this function.
+ frames: [],
+ title: "",
+ url: "about:blank",
+ };
+ const history: StagehandV4HistoryEntry[] = [];
+ const recordHistory = ( // Appends one timestamped entry to the local history array.
+ method: string,
+ parameters: unknown,
+ result: unknown,
+ ): void => {
+ history.push({
+ method,
+ parameters,
+ result,
+ timestamp: new Date().toISOString(),
+ });
+ };
+ const pageCache = new Map>(); // Keyed by targetId; values are page facades.
+ const pageOrder: string[] = []; // targetIds in first-seen order; entries are never removed here.
+
+ const refreshPageInfo = async (): Promise => { // Re-reads info for the primary page and copies recognised fields into pageState.
+ const info = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserPageRequestInfo({
+ ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}),
+ }),
+ );
+ if (!isRecord(info)) return;
+ if (typeof info.targetId === "string") pageState.targetId = info.targetId;
+ if (typeof info.title === "string") pageState.title = info.title;
+ if (typeof info.url === "string") pageState.url = info.url;
+ if (info.loadState != null)
+ pageState.loadState = normalizeStagehandV4LoadState(info.loadState);
+ await refreshFrameStates(stagehandV4, pageState).catch(() => {}); // Frame refresh is best-effort; errors are ignored.
+ };
+
+ const refreshPages = async (): Promise[]> => { // Syncs pageCache/pageOrder with the runtime's tab list and returns the cached facades in order.
+ const rawPages = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserRequestTabList({}),
+ );
+ const pages = Array.isArray(rawPages) // A non-array result is treated as an empty tab list.
+ ? rawPages.filter((page): page is Record =>
+ isRecord(page),
+ )
+ : [];
+ for (const pageInfo of pages) {
+ const targetId =
+ typeof pageInfo.targetId === "string" ? pageInfo.targetId : null;
+ if (targetId == null) continue; // Tabs without a string targetId are skipped.
+ if (!pageOrder.includes(targetId)) pageOrder.push(targetId);
+ let facade = pageCache.get(targetId);
+ if (facade == null) { // First sighting of this tab: create a facade with its own state object.
+ const state: StagehandV4PageState = {
+ frames: [],
+ targetId,
+ title: "",
+ url: "about:blank",
+ };
+ facade = createStagehandV4PageFacade(
+ stagehandV4,
+ state,
+ async () => {
+ await refreshSinglePageInfo(stagehandV4, state);
+ },
+ recordHistory,
+ );
+ pageCache.set(targetId, facade);
+ }
+ const state = facade[STAGEHAND_V4_PAGE_STATE];
+ if (!isStagehandV4PageState(state)) continue;
+ state.targetId = targetId;
+ state.title = // Keep the previous title/url when the tab entry does not carry a string value.
+ typeof pageInfo.title === "string" ? pageInfo.title : state.title;
+ state.url = typeof pageInfo.url === "string" ? pageInfo.url : state.url;
+ await refreshFrameStates(stagehandV4, state).catch(() => {}); // Best-effort; errors are ignored.
+ }
+ return pageOrder
+ .map((targetId) => pageCache.get(targetId))
+ .filter((page): page is Record => page != null);
+ };
+
+ await refreshPageInfo().catch(() => {}); // Initial population is best-effort.
+ await refreshPages().catch(() => {});
+
+ const page = createStagehandV4PageFacade(
+ stagehandV4,
+ pageState,
+ refreshPageInfo,
+ recordHistory,
+ );
+ if (pageState.targetId != null) { // Register the primary facade, replacing any facade refreshPages cached for the same target.
+ if (!pageOrder.includes(pageState.targetId))
+ pageOrder.push(pageState.targetId);
+ pageCache.set(pageState.targetId, page);
+ }
+ const pages = (): Record[] => { // Synchronous view of the cache; falls back to the primary page when the cache is empty.
+ const cached = pageOrder
+ .map((targetId) => pageCache.get(targetId))
+ .filter((entry): entry is Record => entry != null);
+ return cached.length > 0 ? cached : [page];
+ };
+
+ const context = v3.context as unknown as Record;
+ context.pages = pages; // Overrides the v3 context's pages() with the facade list.
+ context.awaitActivePage = async () => { // Returns the cached facade of the active tab, else the refreshed primary page.
+ await refreshPages().catch(() => {});
+ const activePage = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserRequestActivePage({}),
+ );
+ if (isRecord(activePage) && typeof activePage.targetId === "string") {
+ const cached = pageCache.get(activePage.targetId);
+ if (cached != null) return cached;
+ }
+ await refreshPageInfo();
+ return page;
+ };
+ Object.defineProperty(v3, "history", { // v3.history now resolves to a shallow copy of the local history array.
+ configurable: true,
+ get: () => Promise.resolve([...history]),
+ });
+
+ v3.observe = (async ( // Accepts (instruction, options) or (options); returns an array (empty when the result is not an array).
+ a?: string | Record,
+ b?: Record,
+ ) => {
+ const instruction = typeof a === "string" ? a : undefined;
+ const options = (typeof a === "string" ? b : a) as
+ | Record
+ | undefined;
+ const result = await stagehandV4.cdp.Stagehand.AIObserve({
+ ...(instruction != null ? { instruction } : {}),
+ ...selectorParam(options),
+ ...workflowOptionsParam(options, modelName),
+ });
+ const observed = unwrapStagehandV4Result(result);
+ const output = Array.isArray(observed) ? observed : [];
+ recordHistory("observe", { instruction, options }, output);
+ return output;
+ }) as V3["observe"];
+
+ v3.act = (async ( // A string input is sent as an instruction; an object input is sent as an action with selfHeal forced on.
+ input: string | Record,
+ options?: Record,
+ ) => {
+ const workflowOptions = workflowOptionsParam(options, modelName);
+ const result = await stagehandV4.cdp.Stagehand.AIAct(
+ typeof input === "string"
+ ? {
+ instruction: input,
+ ...selectorParam(options),
+ ...workflowOptions,
+ }
+ : {
+ action: normalizeV4Action(input),
+ ...selectorParam(options),
+ ...workflowOptions,
+ options: { // Listed after the spread, so this replaces workflowOptions.options while keeping its fields.
+ ...(isRecord(workflowOptions.options)
+ ? workflowOptions.options
+ : {}),
+ selfHeal: true,
+ },
+ },
+ );
+ const unwrapped = unwrapStagehandV4Result(result);
+ await refreshPageInfo().catch(() => {}); // Refresh cached page state after the action; errors are ignored.
+ await refreshPages().catch(() => {});
+ recordHistory(
+ "act",
+ typeof input === "string"
+ ? { instruction: input, options }
+ : { action: input, options },
+ unwrapped,
+ );
+ return unwrapped;
+ }) as V3["act"];
+
+ v3.extract = (async ( // Accepts (instruction?, schema?, options?) in the argument positions decoded below.
+ a?: string | Record,
+ b?: z.ZodType | Record,
+ c?: Record,
+ ) => {
+ const instruction = typeof a === "string" ? a : undefined;
+ const schema = isZodSchema(b) ? z.toJSONSchema(b) : undefined;
+ const options = (typeof a === "string" ? (isZodSchema(b) ? c : b) : a) as
+ | Record
+ | undefined;
+ if (instruction == null && schema == null) { // No instruction and no schema: return page text from the DOM summary (no history entry is recorded).
+ const summary = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserPageDOMSummary({
+ ...selectorParam(options),
+ }),
+ );
+ const pageText =
+ isRecord(summary) && typeof summary.pageText === "string"
+ ? summary.pageText
+ : "";
+ return { pageText, extraction: pageText };
+ }
+ const result = await stagehandV4.cdp.Stagehand.AIExtract({
+ ...(instruction != null ? { instruction } : {}),
+ ...(schema != null ? { schema: schema as Record } : {}),
+ ...selectorParam(options),
+ ...workflowOptionsParam(options, modelName),
+ });
+ const extracted = unwrapStagehandV4Result(result);
+ recordHistory("extract", { instruction, schema, options }, extracted);
+ return extracted;
+ }) as V3["extract"];
+
+ return page;
+}
+
+function createStagehandV4PageFacade( // Builds a page-like object whose methods call the v4 runtime and read/write pageState.
+ stagehandV4: UnderstudyV4NativeRuntime,
+ pageState: StagehandV4PageState,
+ refreshPageInfo: () => Promise,
+ recordHistory?: ( // Optional; when given, goto() records a "navigate" entry.
+ method: string,
+ parameters: unknown,
+ result: unknown,
+ ) => void,
+): Record {
+ return {
+ [STAGEHAND_V4_PAGE_STATE]: pageState, // Exposes the backing state under the module's symbol key.
+ async goto(url: string, options?: unknown) {
+ pageState.loadState = "init";
+ const selector = // Target this page by id when known, otherwise the active page.
+ pageState.targetId != null
+ ? { targetId: pageState.targetId }
+ : { active: true };
+ if (!("targetId" in selector)) {
+ delete pageState.targetId;
+ }
+ const waitUntil =
+ isRecord(options) && "waitUntil" in options
+ ? options.waitUntil
+ : undefined;
+ const rawResult = await stagehandV4.cdp.Stagehand.BrowserPageGoto({
+ url,
+ selector,
+ waitUntil: normalizeStagehandV4LoadState(waitUntil),
+ });
+ const result = unwrapStagehandV4Result(rawResult);
+ if (isRecord(result)) {
+ if (typeof result.targetId === "string")
+ pageState.targetId = result.targetId;
+ if (typeof result.url === "string") pageState.url = result.url;
+ }
+ await refreshPageInfo();
+ const response = { // Synthetic response: ok() is always true and status() is always 200.
+ ok: () => true,
+ status: () => 200,
+ url: () => pageState.url,
+ };
+ recordHistory?.("navigate", { url, options }, result);
+ return response;
+ },
+ url() { // Returns the cached URL without contacting the runtime.
+ return pageState.url;
+ },
+ async title() { // Refreshes page info first, then returns the cached title.
+ await refreshPageInfo();
+ return pageState.title;
+ },
+ frames() { // Creates fresh frame facades from the cached frame list on every call.
+ return pageState.frames.map((frameState) =>
+ createStagehandV4FrameFacade(stagehandV4, frameState),
+ );
+ },
+ async waitForLoadState(state?: unknown, options?: unknown) {
+ await waitForStagehandV4LoadState(
+ stagehandV4,
+ pageState,
+ state,
+ loadStateTimeoutMs(options),
+ );
+ await refreshPageInfo();
+ },
+ async evaluate(expressionOrFn: unknown, arg?: unknown) { // Functions are stringified and invoked with the JSON-serialised arg; other values are stringified as-is.
+ const expression =
+ typeof expressionOrFn === "function"
+ ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})`
+ : String(expressionOrFn);
+ const result = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({
+ ...(pageState.targetId != null
+ ? { targetId: pageState.targetId }
+ : {}),
+ arg: isJsonValue(arg) ? arg : undefined,
+ awaitPromise: true,
+ expression,
+ returnByValue: true,
+ }),
+ );
+ return isRecord(result) && "value" in result ? result.value : result; // Unwrap a { value } envelope when present.
+ },
+ locator(selector: unknown) {
+ return createStagehandV4LocatorFacade(stagehandV4, pageState, selector);
+ },
+ frameLocator(selector: unknown) { // Starts a frame-selector chain with this single selector.
+ return createStagehandV4FrameLocatorFacade(stagehandV4, pageState, [
+ selector,
+ ]);
+ },
+ };
+}
+
+function createStagehandV4FrameFacade( // Builds a frame-like object exposing evaluate() and url() for one collected frame.
+ stagehandV4: UnderstudyV4NativeRuntime,
+ frameState: StagehandV4FrameState,
+): Record {
+ return {
+ async evaluate(expressionOrFn: unknown, arg?: unknown) { // Same expression building as the page facade, but the request carries this frame's frameId.
+ const expression =
+ typeof expressionOrFn === "function"
+ ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})`
+ : String(expressionOrFn);
+ const result = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({
+ ...(frameState.targetId != null
+ ? { targetId: frameState.targetId }
+ : {}),
+ arg: isJsonValue(arg) ? arg : undefined,
+ awaitPromise: true,
+ expression,
+ frameId: frameState.frameId,
+ returnByValue: true,
+ }),
+ );
+ return isRecord(result) && "value" in result ? result.value : result; // Unwrap a { value } envelope when present.
+ },
+ url() { // Frames without a recorded URL report "about:blank".
+ return frameState.url ?? "about:blank";
+ },
+ };
+}
+
+function createStagehandV4FrameLocatorFacade( // Frame-scoped facade: carries a chain of frame selectors for nested lookups and evaluate().
+ stagehandV4: UnderstudyV4NativeRuntime,
+ pageState: StagehandV4PageState,
+ frameSelectors: unknown[],
+): Record {
+ return {
+ frameLocator(selector: unknown) { // Nested frame: extend the selector chain by one entry.
+ return createStagehandV4FrameLocatorFacade(stagehandV4, pageState, [
+ ...frameSelectors,
+ selector,
+ ]);
+ },
+ locator(selector: unknown) { // Fix: forward the frame chain so the element is resolved inside this frame, not on the top-level page.
+ return createStagehandV4LocatorFacade(stagehandV4, pageState, selector, frameSelectors);
+ },
+ async evaluate(expressionOrFn: unknown, arg?: unknown) { // Resolves the frame chain to a frameId, then evaluates there.
+ const expression =
+ typeof expressionOrFn === "function"
+ ? `(${expressionOrFn.toString()})(...${JSON.stringify(arg === undefined ? [] : [arg])})`
+ : String(expressionOrFn);
+ const frameId = await resolveStagehandV4FrameLocator(
+ stagehandV4,
+ pageState,
+ frameSelectors,
+ );
+ const result = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserPageEvaluate({
+ ...(pageState.targetId != null
+ ? { targetId: pageState.targetId }
+ : {}),
+ arg: isJsonValue(arg) ? arg : undefined,
+ awaitPromise: true,
+ expression,
+ ...(frameId != null ? { frameId } : {}), // frameId is omitted from the request when the chain resolves to null/undefined.
+ returnByValue: true,
+ }),
+ );
+ return isRecord(result) && "value" in result ? result.value : result; // Unwrap a { value } envelope when present.
+ },
+ };
+}
+
+function createStagehandV4LocatorFacade( // Builds a locator-like object; every read re-queries element info from the runtime.
+ stagehandV4: UnderstudyV4NativeRuntime,
+ pageState: StagehandV4PageState,
+ selector: unknown,
+ frameSelectors: unknown[] = [], // Frame-selector chain passed through to selector resolution; empty by default.
+): Record {
+ const read = async () => // Fetches fresh element info; rejects if the element cannot be resolved.
+ await requestStagehandV4ElementInfo(
+ stagehandV4,
+ pageState,
+ selector,
+ frameSelectors,
+ );
+ return {
+ first() { // Returns a new facade built from the same selector and frame chain.
+ return createStagehandV4LocatorFacade(
+ stagehandV4,
+ pageState,
+ selector,
+ frameSelectors,
+ );
+ },
+ async inputValue() {
+ return (await read()).inputValue ?? "";
+ },
+ async isChecked() {
+ return Boolean((await read()).checked);
+ },
+ async textContent() { // Unlike the other readers, a missing value yields null rather than "".
+ return (await read()).textContent ?? null;
+ },
+ async innerText() { // Falls back to textContent, then to "".
+ const info = await read();
+ return info.innerText ?? info.textContent ?? "";
+ },
+ async innerHtml() { // Same result as innerHTML(); both spellings are exposed.
+ return (await read()).innerHTML ?? "";
+ },
+ async innerHTML() {
+ return (await read()).innerHTML ?? "";
+ },
+ async click() {
+ await stagehandV4.cdp.Stagehand.BrowserPageClick({
+ selector: await stagehandV4SelectorFor(
+ stagehandV4,
+ pageState,
+ selector,
+ frameSelectors,
+ ),
+ });
+ },
+ async backendNodeId() {
+ return (await read()).backendNodeId;
+ },
+ };
+}
+
+async function requestStagehandV4ElementInfo( // Requests element info for a selector; throws unless the result has a numeric backendNodeId.
+ stagehandV4: UnderstudyV4NativeRuntime,
+ pageState: StagehandV4PageState,
+ selector: unknown,
+ frameSelectors: unknown[] = [],
+): Promise<{
+ backendNodeId: number;
+ checked?: boolean | null;
+ innerHTML?: string | null;
+ innerText?: string | null;
+ inputValue?: string | null;
+ textContent?: string | null;
+}> {
+ const result = unwrapStagehandV4Result(
+ await stagehandV4.cdp.Stagehand.BrowserPageRequestElementInfo({
+ selector: await stagehandV4SelectorFor(
+ stagehandV4,
+ pageState,
+ selector,
+ frameSelectors,
+ ),
+ }),
+ );
+ if (isRecord(result) && typeof result.backendNodeId === "number") { // Only backendNodeId is checked at runtime; the other fields are trusted via the cast.
+ return result as {
+ backendNodeId: number;
+ checked?: boolean | null;
+ innerHTML?: string | null;
+ innerText?: string | null;
+ inputValue?: string | null;
+ textContent?: string | null;
+ };
+ }
+ throw new Error("stagehand_v4 locator could not resolve element info.");
+}
+
+/**
+ * Re-reads page info via `BrowserPageRequestInfo` and copies it into `pageState`.
+ *
+ * `targetId`, `title` and `url` are copied only when they are strings;
+ * `loadState` is normalized when present. Frame states are refreshed afterwards.
+ * Returns without touching `pageState` when the unwrapped result is not a record.
+ *
+ * @param stagehandV4 Runtime used to send Stagehand protocol commands.
+ * @param pageState Mutable page state updated in place.
+ */
+async function refreshSinglePageInfo(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+): Promise<void> {
+  // The target id is only sent once it is known.
+  const params =
+    pageState.targetId != null ? { targetId: pageState.targetId } : {};
+  const response = await stagehandV4.cdp.Stagehand.BrowserPageRequestInfo(
+    params,
+  );
+  const info = unwrapStagehandV4Result(response);
+  if (!isRecord(info)) return;
+  const { targetId, title, url, loadState } = info;
+  if (typeof targetId === "string") pageState.targetId = targetId;
+  if (typeof title === "string") pageState.title = title;
+  if (typeof url === "string") pageState.url = url;
+  if (loadState != null) {
+    pageState.loadState = normalizeStagehandV4LoadState(loadState);
+  }
+  await refreshFrameStates(stagehandV4, pageState);
+}
+
+/**
+ * Replaces `pageState.frames` with the frames reported by
+ * `BrowserPageRequestFullFrameTree`.
+ *
+ * Does nothing when the page has no target id, when its URL is classified as
+ * internal by `isInternalStagehandV4PageUrl`, or when the response has no
+ * record-shaped `frameTree`.
+ *
+ * @param stagehandV4 Runtime used to send Stagehand protocol commands.
+ * @param pageState Mutable page state updated in place.
+ */
+async function refreshFrameStates(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+): Promise<void> {
+  if (pageState.targetId == null) return;
+  if (isInternalStagehandV4PageUrl(pageState.url)) return;
+  const response =
+    await stagehandV4.cdp.Stagehand.BrowserPageRequestFullFrameTree({
+      targetId: pageState.targetId,
+    });
+  const tree = unwrapStagehandV4Result(response);
+  if (!(isRecord(tree) && isRecord(tree.frameTree))) return;
+  const collected: StagehandV4FrameState[] = [];
+  collectStagehandV4Frames(tree.frameTree, pageState.targetId, collected);
+  pageState.frames = collected;
+}
+
+/**
+ * Flattens a frame tree into `frames` in pre-order (parent before children,
+ * children in their listed order).
+ *
+ * Nodes without a record `frame` carrying a string `id` contribute no entry,
+ * but their children are still visited. Non-record children are skipped.
+ *
+ * @param frameTree Root node with optional `frame` and `childFrames` members.
+ * @param targetId Target id stamped onto every collected frame.
+ * @param frames Output array; entries are appended in place.
+ */
+function collectStagehandV4Frames(
+  frameTree: Record<string, unknown>,
+  targetId: string,
+  frames: StagehandV4FrameState[],
+): void {
+  const stack: Record<string, unknown>[] = [frameTree];
+  for (let node = stack.pop(); node !== undefined; node = stack.pop()) {
+    const frame = node.frame;
+    if (isRecord(frame) && typeof frame.id === "string") {
+      frames.push({
+        frameId: frame.id,
+        targetId,
+        url: typeof frame.url === "string" ? frame.url : undefined,
+      });
+    }
+    const children: unknown[] = Array.isArray(node.childFrames)
+      ? node.childFrames
+      : [];
+    // Push in reverse so the first child is popped (and emitted) first.
+    for (let index = children.length - 1; index >= 0; index -= 1) {
+      const child = children[index];
+      if (isRecord(child)) stack.push(child);
+    }
+  }
+}
+
+/**
+ * Builds the protocol selector for an element: the normalized selector plus
+ * the page `targetId` and the resolved `frameId` when they are known.
+ *
+ * @param stagehandV4 Runtime used to send Stagehand protocol commands.
+ * @param pageState Page state supplying (and lazily refreshing) the target id.
+ * @param selector Raw selector for the element.
+ * @param frameSelectors Frame selectors resolved to a frame id first.
+ * @returns The merged selector record.
+ * @throws Error propagated from frame resolution when a frame cannot be located.
+ */
+async function stagehandV4SelectorFor(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+  selector: unknown,
+  frameSelectors: unknown[] = [],
+): Promise<Record<string, unknown>> {
+  if (pageState.targetId == null) {
+    try {
+      await refreshSinglePageInfo(stagehandV4, pageState);
+    } catch {
+      // Best effort: the selector is still built without a target id.
+    }
+  }
+  const frameId = await resolveStagehandV4FrameLocator(
+    stagehandV4,
+    pageState,
+    frameSelectors,
+  );
+  const resolved: Record<string, unknown> = {
+    ...normalizeV4Selector(selector),
+  };
+  if (pageState.targetId != null) resolved.targetId = pageState.targetId;
+  if (frameId != null) resolved.frameId = frameId;
+  return resolved;
+}
+
+/**
+ * Resolves a chain of frame selectors to the id of the innermost frame.
+ *
+ * Each selector is located inside the frame found for the previous one. The
+ * located frame owner is then matched against the frame graph of a fresh
+ * `BrowserPageDOMSummary` to obtain the child frame id.
+ *
+ * @param stagehandV4 Runtime used to send Stagehand protocol commands.
+ * @param pageState Page state supplying (and lazily refreshing) the target id.
+ * @param frameSelectors Frame selectors, applied in order.
+ * @returns The innermost frame id, or `undefined` when no selectors are given.
+ * @throws Error when a selector cannot be located or matched to a child frame.
+ */
+async function resolveStagehandV4FrameLocator(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+  frameSelectors: unknown[],
+): Promise<string | undefined> {
+  if (frameSelectors.length === 0) return undefined;
+  if (pageState.targetId == null) {
+    try {
+      await refreshSinglePageInfo(stagehandV4, pageState);
+    } catch {
+      // Best effort: continue without a target id.
+    }
+  }
+  let frameId: string | undefined;
+  for (const frameSelector of frameSelectors) {
+    const selector = {
+      ...normalizeV4Selector(frameSelector),
+      ...(pageState.targetId != null ? { targetId: pageState.targetId } : {}),
+      ...(frameId != null ? { frameId } : {}),
+    };
+    let locateResponse: unknown;
+    try {
+      locateResponse = await stagehandV4.cdp.Stagehand.BrowserPageLocate({
+        selector,
+      });
+    } catch (error: unknown) {
+      // Re-throw with the selector included so the failing frame is identifiable.
+      const reason = error instanceof Error ? error.message : String(error);
+      throw new Error(
+        `stagehand_v4 frameLocator could not locate ${JSON.stringify(selector)}: ${reason}`,
+      );
+    }
+    const located = unwrapStagehandV4Result(locateResponse);
+    if (!isRecord(located)) {
+      throw new Error(
+        "stagehand_v4 frameLocator could not resolve iframe selector.",
+      );
+    }
+    const summaryResponse =
+      await stagehandV4.cdp.Stagehand.BrowserPageDOMSummary({
+        hydrate: { ax: false },
+        selector: {
+          ...(pageState.targetId != null
+            ? { targetId: pageState.targetId }
+            : {}),
+        },
+      });
+    frameId = childFrameIdForLocatedFrameOwner(
+      unwrapStagehandV4Result(summaryResponse),
+      located,
+    );
+  }
+  return frameId;
+}
+
+/**
+ * Finds the child frame whose owner element is the located node.
+ *
+ * A frame matches when the last entry of its owner chain has the same
+ * `backendNodeId` and `frameId` as `located`.
+ *
+ * @param summary Unwrapped DOM summary expected to contain `frameGraph.ownerChainByFrameId`.
+ * @param located Unwrapped locate result for the frame owner element.
+ * @returns The id of the first matching frame.
+ * @throws Error when the frame graph is unreadable, when `located` lacks a
+ *   numeric `backendNodeId` or string `frameId`, or when no frame matches.
+ */
+function childFrameIdForLocatedFrameOwner(
+  summary: unknown,
+  located: Record<string, unknown>,
+): string {
+  const frameGraph = isRecord(summary) ? summary.frameGraph : null;
+  if (!isRecord(frameGraph) || !isRecord(frameGraph.ownerChainByFrameId)) {
+    throw new Error(
+      "stagehand_v4 frameLocator could not read the frame graph.",
+    );
+  }
+  const { backendNodeId, frameId: ownerFrameId } = located;
+  if (typeof backendNodeId !== "number" || typeof ownerFrameId !== "string") {
+    throw new Error(
+      "stagehand_v4 frameLocator resolved selector without a frame owner.",
+    );
+  }
+  const match = Object.entries(frameGraph.ownerChainByFrameId).find(
+    ([, chain]) => {
+      if (!Array.isArray(chain)) return false;
+      // The last chain entry is the direct owner element of the frame.
+      const owner: unknown = chain[chain.length - 1];
+      return (
+        isRecord(owner) &&
+        owner.backendNodeId === backendNodeId &&
+        owner.frameId === ownerFrameId
+      );
+    },
+  );
+  if (match === undefined) {
+    throw new Error("stagehand_v4 frameLocator could not find a child frame.");
+  }
+  return match[0];
+}
+
+/**
+ * Polls page info until the page reaches at least the requested load state.
+ *
+ * States are compared through `STAGEHAND_V4_LOAD_STATE_ORDER`. The page is
+ * always checked at least once, even with a zero timeout. Refresh failures are
+ * ignored and simply lead to another poll.
+ *
+ * @param stagehandV4 Runtime used to send Stagehand protocol commands.
+ * @param pageState Page state refreshed on every poll.
+ * @param state Requested load state, normalized before comparison.
+ * @param timeoutMs Maximum time to wait, in milliseconds.
+ * @throws Error when `state` is unsupported or the deadline passes first.
+ */
+async function waitForStagehandV4LoadState(
+  stagehandV4: UnderstudyV4NativeRuntime,
+  pageState: StagehandV4PageState,
+  state: unknown,
+  timeoutMs: number,
+): Promise<void> {
+  const expectedState = normalizeStagehandV4LoadState(state);
+  const expectedRank = STAGEHAND_V4_LOAD_STATE_ORDER[expectedState];
+  const deadline = Date.now() + timeoutMs;
+  for (;;) {
+    try {
+      await refreshSinglePageInfo(stagehandV4, pageState);
+    } catch {
+      // Best effort: keep polling with the last known load state.
+    }
+    const current = pageState.loadState;
+    if (
+      current != null &&
+      STAGEHAND_V4_LOAD_STATE_ORDER[current] >= expectedRank
+    ) {
+      return;
+    }
+    const remainingMs = deadline - Date.now();
+    if (remainingMs <= 0) {
+      throw new Error(
+        `Timed out waiting for stagehand_v4 page loadState=${expectedState}.`,
+      );
+    }
+    // Poll every 100 ms, or sooner when the deadline is closer than that.
+    await new Promise<void>((resolve) => {
+      setTimeout(resolve, Math.min(100, remainingMs));
+    });
+  }
+}
+
+/**
+ * Maps a caller-supplied load state onto a `StagehandV4LoadState`.
+ *
+ * `null`, `undefined`, `"load"` and `"loaded"` become `"loaded"`;
+ * `"networkalmostidle"` becomes `"networkidle2"`; any other valid state is
+ * returned unchanged.
+ *
+ * @throws Error when the value is none of the above.
+ */
+function normalizeStagehandV4LoadState(state: unknown): StagehandV4LoadState {
+  switch (state) {
+    case null:
+    case undefined:
+    case "load":
+    case "loaded":
+      return "loaded";
+    case "networkalmostidle":
+      return "networkidle2";
+    default:
+      break;
+  }
+  if (!isStagehandV4LoadState(state)) {
+    throw new Error(`Unsupported stagehand_v4 waitForLoadState state: ${state}`);
+  }
+  return state;
+}
+
+/** Type guard: true only for the five literal load-state strings. */
+function isStagehandV4LoadState(value: unknown): value is StagehandV4LoadState {
+  switch (value) {
+    case "init":
+    case "domcontentloaded":
+    case "loaded":
+    case "networkidle2":
+    case "networkidle":
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Reads a timeout from an options object.
+ *
+ * Prefers `timeoutMs` over `timeout`, clamps negatives to 0, and falls back
+ * to 30 000 ms when `options` is not a record or the value is not a finite number.
+ */
+function loadStateTimeoutMs(options: unknown): number {
+  const fallbackMs = 30_000;
+  if (!isRecord(options)) return fallbackMs;
+  const requested = options.timeoutMs ?? options.timeout;
+  if (typeof requested !== "number" || !Number.isFinite(requested)) {
+    return fallbackMs;
+  }
+  return Math.max(0, requested);
+}
+
+/**
+ * Converts an action into the shape used by the v4 protocol.
+ *
+ * The method name and selector are normalized. A record `arguments` value is
+ * passed through as-is; an array is converted to named arguments based on the
+ * method, using only its first string element. Any other combination yields
+ * empty arguments.
+ *
+ * @param action Action with optional `method`, `selector` and `arguments`.
+ * @returns A copy of `action` with `selector`, `method` and `arguments` replaced.
+ */
+function normalizeV4Action(
+  action: Record<string, unknown>,
+): Record<string, unknown> {
+  const method =
+    typeof action.method === "string"
+      ? normalizeV4ActionMethod(action.method)
+      : null;
+  const selector = normalizeV4Selector(action.selector);
+  let args: Record<string, unknown> = {};
+  if (isRecord(action.arguments)) {
+    args = action.arguments;
+  } else if (Array.isArray(action.arguments)) {
+    // Non-string entries are ignored; only the first string is used.
+    const first = action.arguments.find(
+      (value): value is string => typeof value === "string",
+    );
+    switch (method) {
+      case "fill":
+        args = { value: first ?? "" };
+        break;
+      case "type":
+        args = { text: first ?? "" };
+        break;
+      case "keys":
+        args = { key: first ?? "", method: "press" };
+        break;
+      case "goto":
+        args = { url: first ?? "" };
+        break;
+      case "wait": {
+        const ms = Number(first);
+        args = { ms: Number.isFinite(ms) ? ms : 1000 };
+        break;
+      }
+      case "scroll":
+      case "scrollTo": {
+        if (first?.includes("%")) {
+          // Percent values are forwarded verbatim as a string.
+          args = { percent: first };
+        } else {
+          const parsed = Number(first);
+          const amount = Number.isFinite(parsed) ? parsed : 0;
+          args = method === "scroll" ? { deltaY: amount } : { y: amount };
+        }
+        break;
+      }
+      case "dragAndDrop":
+        // Without a usable destination the element is dropped onto itself.
+        args = {
+          from: selector,
+          to: normalizeV4Selector(first) ?? selector,
+        };
+        break;
+      default:
+        break;
+    }
+  }
+  return {
+    ...action,
+    selector,
+    method,
+    arguments: args,
+  };
+}
+
+/** Renames the `press` method to `keys`; every other name is returned unchanged. */
+function normalizeV4ActionMethod(method: string): string {
+  if (method === "press") return "keys";
+  return method;
+}
+
+/**
+ * Builds the `selector` parameter from `options.page` and `options.selector`.
+ *
+ * Fields of the explicit selector override the page-derived ones.
+ *
+ * @returns `{ selector }` when either source yields a selector, otherwise `{}`.
+ */
+function selectorParam(
+  options: Record<string, unknown> | undefined,
+): Record<string, unknown> {
+  const fromPage = stagehandV4PageSelector(options?.page);
+  const explicit = normalizeV4Selector(options?.selector);
+  if (fromPage == null && explicit == null) return {};
+  return { selector: { ...fromPage, ...explicit } };
+}
+
+/**
+ * Converts a raw selector into a protocol selector record.
+ *
+ * - Records are returned as-is.
+ * - Strings starting with `xpath=`, `/` or `(` become `{ xpath }` (the
+ *   `xpath=` prefix is removed).
+ * - Other non-empty strings become `{ css }`, with `>>` separators replaced
+ *   by a single space.
+ *
+ * @returns The selector record, or `undefined` for anything else (including
+ *   `null`, `undefined` and the empty string).
+ */
+function normalizeV4Selector(
+  value: unknown,
+): Record<string, unknown> | undefined {
+  if (isRecord(value)) return value;
+  if (typeof value !== "string" || value.length === 0) return undefined;
+  const xpathPrefix = "xpath=";
+  if (value.startsWith(xpathPrefix)) {
+    return { xpath: value.slice(xpathPrefix.length) };
+  }
+  if (value.startsWith("/") || value.startsWith("(")) return { xpath: value };
+  const css = value
+    .split(/\s*>>\s*/u)
+    .filter((part) => part.length > 0)
+    .join(" ");
+  return { css };
+}
+
+/**
+ * Extracts `{ targetId }` from a page object.
+ *
+ * The page state is read from the property keyed by `STAGEHAND_V4_PAGE_STATE`.
+ *
+ * @returns The selector, or `undefined` when `page` is nullish, carries no
+ *   valid page state, or the state has no target id.
+ */
+function stagehandV4PageSelector(
+  page: unknown,
+): Record<string, unknown> | undefined {
+  if (page == null) return undefined;
+  const state = (page as Record<PropertyKey, unknown>)[STAGEHAND_V4_PAGE_STATE];
+  if (isStagehandV4PageState(state) && state.targetId != null) {
+    return { targetId: state.targetId };
+  }
+  return undefined;
+}
+
+/**
+ * Type guard for page state: a record with an array `frames` and string
+ * `title` and `url`. Other fields are not inspected.
+ */
+function isStagehandV4PageState(value: unknown): value is StagehandV4PageState {
+  if (!isRecord(value)) return false;
+  const { frames, title, url } = value;
+  return (
+    Array.isArray(frames) && typeof title === "string" && typeof url === "string"
+  );
+}
+
+/**
+ * Builds the `options` parameter for a workflow call.
+ *
+ * `model` defaults to `modelName` and is overridden by a string or record
+ * `options.model`. A numeric `timeout` and JSON-compatible `variables` are
+ * copied through when present.
+ *
+ * @param options Caller-supplied workflow options, if any.
+ * @param modelName Model used when `options.model` does not override it.
+ * @returns `{ options }`; never empty, because `model` is always set.
+ */
+function workflowOptionsParam(
+  options: Record<string, unknown> | undefined,
+  modelName: string,
+): Record<string, unknown> {
+  const workflowOptions: Record<string, unknown> = { model: modelName };
+  if (typeof options?.timeout === "number") {
+    workflowOptions.timeout = options.timeout;
+  }
+  if (options != null && isJsonValue(options.variables)) {
+    workflowOptions.variables = options.variables;
+  }
+  const modelOverride = options?.model;
+  if (isRecord(modelOverride) || typeof modelOverride === "string") {
+    workflowOptions.model = modelOverride;
+  }
+  return { options: workflowOptions };
+}
+
+/**
+ * Extracts the payload from a protocol response.
+ *
+ * Lookup order: the `result` of the first record entry in `event_results`
+ * that has a `result` key, then the top-level `result`, then the value itself.
+ * Non-record values are returned unchanged.
+ */
+function unwrapStagehandV4Result(value: unknown): unknown {
+  if (!isRecord(value)) return value;
+  const eventResults = value.event_results;
+  if (isRecord(eventResults)) {
+    const withResult = Object.values(eventResults).find(
+      (entry): entry is Record<string, unknown> =>
+        isRecord(entry) && "result" in entry,
+    );
+    if (withResult !== undefined) return withResult.result;
+  }
+  return "result" in value ? value.result : value;
+}
+
+/** Duck-typed check: any record with a `safeParse` function is treated as a Zod schema. */
+function isZodSchema(value: unknown): value is z.ZodType {
+  if (!isRecord(value)) return false;
+  return typeof value.safeParse === "function";
+}
+
+/**
+ * Recursively checks that a value consists only of strings, numbers,
+ * booleans, nullish values, arrays and plain records.
+ *
+ * Both `null` and `undefined` are accepted, and numbers are not checked for
+ * finiteness.
+ */
+function isJsonValue(value: unknown): boolean {
+  if (value == null) return true;
+  switch (typeof value) {
+    case "string":
+    case "number":
+    case "boolean":
+      return true;
+    default:
+      break;
+  }
+  if (Array.isArray(value)) return value.every((entry) => isJsonValue(entry));
+  return (
+    isRecord(value) &&
+    Object.values(value).every((entry) => isJsonValue(entry))
+  );
+}
+
+/** Type guard: true for any non-null object that is not an array. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
diff --git a/packages/evals/framework/UnderstudyV4Tools.ts b/packages/evals/framework/UnderstudyV4Tools.ts
new file mode 100644
index 000000000..e2eeaafbf
--- /dev/null
+++ b/packages/evals/framework/UnderstudyV4Tools.ts
@@ -0,0 +1,742 @@
+import path from "node:path";
+import fs from "node:fs";
+import { createRequire } from "node:module";
+import { spawn, type ChildProcess } from "node:child_process";
+import { createInterface } from "node:readline";
+import { fileURLToPath, pathToFileURL } from "node:url";
+import type { ToolSet } from "ai";
+import type { EvalLogger } from "../logger.js";
+import { getRepoRootDir } from "../runtimePaths.js";
+
+/**
+ * One entry of the tool catalog returned by the SDK's `aiBrowserToolDefinitions()`.
+ * NOTE(review): the type arguments of `Record` appear to have been stripped from this text — confirm against the original source.
+ */
+export type UnderstudyV4ToolDefinition = Record;
+
+/** Child -> parent: written once the child has handled `init` and connected its client. */
+type BridgeReadyMessage = {
+ type: "ready";
+ cdpUrl: string;
+ browserbaseExtensionId?: string;
+ stagehand_session_id?: string;
+ toolCatalog: UnderstudyV4ToolDefinition[];
+};
+
+/** Child -> parent: outcome of one `tool` or `command` request, matched to the request by `id`. */
+type BridgeResultMessage = {
+ type: "result";
+ id: number;
+ result?: unknown;
+ // Set to the error message when the request failed in the child.
+ error?: string;
+};
+
+/** Child -> parent: an event the parent subscribed to by name. */
+type BridgeEventMessage = {
+ type: "event";
+ name: string;
+ event: unknown;
+};
+
+/** Child -> parent: fatal failure of the bridge loop, written just before the child exits with code 1. */
+type BridgeErrorMessage = {
+ type: "error";
+ error: string;
+};
+
+/**
+ * The parts of the dynamically imported v4 SDK module that this file uses.
+ * NOTE(review): several generic type arguments (`Record`, `Promise`) appear to have been stripped from this text — confirm against the original source.
+ */
+type UnderstudyV4Sdk = {
+ StagehandClient: new (options?: Record) => {
+ browserbase_extension_id?: string;
+ cdp_http_origin?: string;
+ connect(input?: unknown): Promise;
+ close(): Promise;
+ cdp: {
+ cdp_url?: string | null;
+ on(eventName: string, listener: (event: unknown) => void): unknown;
+ off(eventName: string, listener: (event: unknown) => void): unknown;
+ Stagehand: Record<
+ string,
+ (params?: Record) => Promise
+ >;
+ };
+ stagehand_session_id?: string;
+ };
+ // Scanned for values exposing string `event_type` and `llm_tool_name` members.
+ StagehandProtocolEvents: Record;
+ aiBrowserToolDefinitions: () => UnderstudyV4ToolDefinition[];
+};
+
+/**
+ * Handle returned by `startUnderstudyV4Tools`.
+ * NOTE(review): the return type of `cleanup` appears to have lost its type argument (`Promise`) — confirm against the original source.
+ */
+export interface UnderstudyV4Tools {
+ cdpUrl: string;
+ browserbaseExtensionId?: string;
+ stagehand_session_id?: string;
+ toolCatalog: UnderstudyV4ToolDefinition[];
+ // Proxy-backed runtime that forwards protocol commands to the child process.
+ stagehandV4: UnderstudyV4NativeRuntime;
+ // AI SDK tool set built from `toolCatalog`.
+ tools: ToolSet;
+ // Shuts down the child process.
+ cleanup: () => Promise;
+}
+
+/**
+ * Parent-side view of the v4 client: event subscription plus `Mod.*` and `Stagehand.*` commands.
+ * NOTE(review): the `Mod` and `Stagehand` member types appear to have lost generic type arguments — confirm against the original source.
+ */
+export interface UnderstudyV4NativeRuntime {
+ cdp: {
+ on(eventName: string, listener: (event: unknown) => void): void;
+ off(eventName: string, listener: (event: unknown) => void): void;
+ Mod: Record) => Promise>;
+ Stagehand: Record<
+ string,
+ (params?: Record) => Promise
+ >;
+ };
+}
+
+/** Settlement callbacks of one in-flight bridge request; stored in a map keyed by request id. */
+type PendingCall = {
+ resolve: (value: unknown) => void;
+ reject: (error: Error) => void;
+};
+
+/**
+ * Spawns this module as a child process (through the `tsx` CLI) and exposes
+ * the v4 SDK running in that child over newline-delimited JSON on stdio.
+ *
+ * Resolves once the child writes a `ready` message. Rejects when the child
+ * writes an `error` message, fails to spawn, or exits before becoming ready.
+ *
+ * @param input.environment Forwarded to the child in the `init` message.
+ * @param input.logger Receives unparsed child stdout lines, child stderr lines and connection details.
+ * @returns The values reported by the child, a proxy-backed runtime, an AI SDK tool set and a `cleanup` function.
+ *
+ * NOTE(review): generic type arguments appear to have been stripped from this text (for example `new Map void>>()`); confirm against the original source before compiling.
+ */
+export async function startUnderstudyV4Tools(input: {
+ environment: "LOCAL" | "BROWSERBASE";
+ logger: EvalLogger;
+}): Promise {
+ const require = createRequire(import.meta.url);
+ const tsxCli = require.resolve("tsx/cli");
+ // Re-run this same file in the child; the env marker enables the child entrypoint at the bottom of the module.
+ const child = spawn(
+ process.execPath,
+ [tsxCli, fileURLToPath(import.meta.url)],
+ {
+ cwd: getRepoRootDir(),
+ env: {
+ ...process.env,
+ UNDERSTUDY_V4_TOOLS_CHILD: "1",
+ },
+ stdio: ["pipe", "pipe", "pipe"],
+ },
+ );
+
+ // In-flight requests keyed by id, local event listeners keyed by event name, and the event names already subscribed in the child.
+ const pending = new Map();
+ const eventListeners = new Map void>>();
+ const subscribedEvents = new Set();
+ let nextId = 1;
+ let readyResolve: (message: BridgeReadyMessage) => void;
+ let readyReject: (error: Error) => void;
+ const readyPromise = new Promise((resolve, reject) => {
+ readyResolve = resolve;
+ readyReject = reject;
+ });
+
+ const stdout = createInterface({ input: child.stdout });
+ stdout.on("line", (line) => {
+ if (!line.trim()) return;
+ const message = parseBridgeMessage(line);
+ // Lines that are not bridge messages are treated as plain child output and logged.
+ if (!message) {
+ input.logger.log({
+ category: "understudy_v4_code",
+ message: line,
+ level: 1,
+ });
+ return;
+ }
+ if (message.type === "ready") {
+ readyResolve(message);
+ return;
+ }
+ if (message.type === "event") {
+ for (const listener of eventListeners.get(message.name) ?? []) {
+ listener(message.event);
+ }
+ return;
+ }
+ // An `error` message fails startup and every in-flight request.
+ if (message.type === "error") {
+ const error = new Error(message.error);
+ readyReject(error);
+ for (const call of pending.values()) call.reject(error);
+ pending.clear();
+ return;
+ }
+ // Remaining case: a `result` message; results with an unknown id are ignored.
+ const call = pending.get(message.id);
+ if (!call) return;
+ pending.delete(message.id);
+ if (message.error) {
+ call.reject(new Error(message.error));
+ } else {
+ call.resolve(message.result);
+ }
+ });
+
+ // Child stderr is forwarded to the logger as warnings, one entry per non-empty line of each chunk.
+ child.stderr.on("data", (chunk: Buffer) => {
+ for (const line of chunk.toString("utf8").split(/\r?\n/).filter(Boolean)) {
+ input.logger.warn({
+ category: "understudy_v4_code",
+ message: line,
+ level: 1,
+ });
+ }
+ });
+
+ child.on("error", (error) => {
+ readyReject(error);
+ for (const call of pending.values()) call.reject(error);
+ pending.clear();
+ });
+ child.on("exit", (code, signal) => {
+ const error = new Error(
+ `Understudy v4 tools process exited (${signal ?? code ?? "unknown"}).`,
+ );
+ readyReject(error);
+ for (const call of pending.values()) call.reject(error);
+ pending.clear();
+ });
+
+ child.stdin.write(
+ `${JSON.stringify({ type: "init", environment: input.environment })}\n`,
+ );
+
+ const ready = await readyPromise;
+ input.logger.log({
+ category: "understudy_v4_code",
+ message: `Connected v4 tools at ${ready.cdpUrl}`,
+ level: 1,
+ });
+ input.logger.log({
+ category: "understudy_v4_code",
+ message: `v4 stagehand_session_id=${ready.stagehand_session_id ?? "unknown"}`,
+ level: 1,
+ });
+ // Both helpers draw request ids from the same counter.
+ const callCommand = (name: string, args: Record) =>
+ callBridge(child, pending, nextId++, "command", name, args);
+ const callTool = (name: string, args: Record) =>
+ callBridge(child, pending, nextId++, "tool", name, args);
+ const { jsonSchema, tool } = await import("ai");
+
+ return {
+ cdpUrl: ready.cdpUrl,
+ browserbaseExtensionId: ready.browserbaseExtensionId,
+ stagehand_session_id: ready.stagehand_session_id,
+ toolCatalog: ready.toolCatalog,
+ stagehandV4: {
+ cdp: {
+ on(eventName, listener) {
+ let listeners = eventListeners.get(eventName);
+ if (!listeners) {
+ listeners = new Set();
+ eventListeners.set(eventName, listeners);
+ }
+ listeners.add(listener);
+ // A `subscribe` message is sent to the child only the first time an event name is used.
+ if (!subscribedEvents.has(eventName)) {
+ subscribedEvents.add(eventName);
+ child.stdin.write(
+ `${JSON.stringify({ type: "subscribe", name: eventName })}\n`,
+ );
+ }
+ },
+ // Removes the listener locally only; no message is sent to the child.
+ off(eventName, listener) {
+ const listeners = eventListeners.get(eventName);
+ listeners?.delete(listener);
+ if (listeners?.size === 0) eventListeners.delete(eventName);
+ },
+ // Any string property becomes a function that sends `Mod.<property>` as a command.
+ Mod: new Proxy(
+ {},
+ {
+ get(_target, property) {
+ if (typeof property !== "string") return undefined;
+ return (params?: Record) =>
+ callCommand(`Mod.${property}`, params ?? {});
+ },
+ },
+ ) as UnderstudyV4NativeRuntime["cdp"]["Mod"],
+ // Same as `Mod`, for the `Stagehand.` domain.
+ Stagehand: new Proxy(
+ {},
+ {
+ get(_target, property) {
+ if (typeof property !== "string") return undefined;
+ return (params?: Record) =>
+ callCommand(`Stagehand.${property}`, params ?? {});
+ },
+ },
+ ) as UnderstudyV4NativeRuntime["cdp"]["Stagehand"],
+ },
+ },
+ tools: buildUnderstudyV4ToolSet(ready.toolCatalog, callTool, input.logger, {
+ jsonSchema,
+ tool,
+ }),
+ cleanup: async () => {
+ await closeBridge(child, pending);
+ },
+ };
+}
+
+/**
+ * Turns the child's tool catalog into an AI SDK `ToolSet`.
+ *
+ * Catalog entries without a string `name` or without an object schema
+ * (`inputSchema`, falling back to `parameters`) are skipped. All tools share
+ * one selector map: page-summary results add to it, and `elementId`
+ * references in later tool arguments are expanded from it.
+ *
+ * @param catalog Tool definitions reported by the child.
+ * @param callTool Sends one tool call to the child and resolves with its result.
+ * @param logger Receives one log entry per tool invocation.
+ * @param ai The `jsonSchema` and `tool` helpers from the `ai` package.
+ *
+ * NOTE(review): generic type arguments appear to have been stripped from this text (`Pick`, `Record`, `Promise`) — confirm against the original source.
+ */
+function buildUnderstudyV4ToolSet(
+ catalog: UnderstudyV4ToolDefinition[],
+ callTool: (name: string, args: Record) => Promise,
+ logger: EvalLogger,
+ ai: Pick,
+): ToolSet {
+ const tools: ToolSet = {};
+ // Shared across every tool built here.
+ const selectorMap: Record> = {};
+ for (const definition of catalog) {
+ const name = typeof definition.name === "string" ? definition.name : null;
+ const rawSchema = definition.inputSchema ?? definition.parameters;
+ const schema =
+ rawSchema != null &&
+ typeof rawSchema === "object" &&
+ !Array.isArray(rawSchema)
+ ? rawSchema
+ : null;
+ if (!name) continue;
+ if (!schema) continue;
+ tools[name] = ai.tool({
+ // The tool name doubles as the description when none is provided.
+ description:
+ typeof definition.description === "string"
+ ? definition.description
+ : name,
+ inputSchema: ai.jsonSchema(schema),
+ execute: async (args) => {
+ logger.log({
+ category: "understudy_v4_code",
+ message: `Agent calling v4 tool: ${name}`,
+ level: 1,
+ auxiliary: {
+ arguments: {
+ value: JSON.stringify(args),
+ type: "object",
+ },
+ },
+ });
+ // Non-record arguments are replaced by an empty object before hydration.
+ const hydratedArgs = hydrateSelectorReferences(
+ isRecord(args) ? args : {},
+ selectorMap,
+ );
+ return callTool(name, isRecord(hydratedArgs) ? hydratedArgs : {});
+ },
+ toModelOutput: (result) => modelOutputForToolResult(result, selectorMap),
+ });
+ }
+ return tools;
+}
+
+/**
+ * Converts a raw tool result into the content returned to the model.
+ *
+ * Precedence:
+ * 1. a `screenshot` string becomes a PNG media part (any data-URL prefix removed);
+ * 2. a `formattedTree`, `observationTree` or `pageText` string becomes a
+ *    "Page Summary" text part, and `elementSelectorMap` is merged into `selectorMap`;
+ * 3. anything else becomes the sanitized payload serialized as JSON text.
+ *
+ * @param result Raw result returned by the bridge for a tool call.
+ * @param selectorMap Element-id to selector map, updated in place for page summaries.
+ */
+function modelOutputForToolResult(
+  result: unknown,
+  selectorMap: Record<string, Record<string, unknown>>,
+) {
+  const payload = firstPayload(result);
+  const asText = (text: string) => ({
+    type: "content" as const,
+    value: [{ type: "text" as const, text }],
+  });
+
+  const screenshot = stringField(payload, "screenshot");
+  if (screenshot) {
+    return {
+      type: "content" as const,
+      value: [
+        {
+          type: "media" as const,
+          mediaType: "image/png",
+          data: screenshot.replace(/^data:image\/\w+;base64,/, ""),
+        },
+      ],
+    };
+  }
+
+  const pageSummary =
+    stringField(payload, "formattedTree") ??
+    stringField(payload, "observationTree") ??
+    stringField(payload, "pageText");
+  if (pageSummary) {
+    // Remember the selectors so later `elementId` references can be expanded.
+    updateSelectorMap(selectorMap, payload.elementSelectorMap);
+    return asText(
+      [
+        "Page Summary:",
+        pageSummary,
+        "",
+        'Use an element square-bracket id as selector.elementId without brackets, for example {"selector":{"elementId":"0-3"}}.',
+      ].join("\n"),
+    );
+  }
+
+  return asText(JSON.stringify(sanitizeForModel(payload)));
+}
+
+/**
+ * Sends one request to the bridge child and returns a promise for its result.
+ *
+ * The promise is settled by whoever owns `pending` when the matching `result`
+ * message arrives. It rejects immediately when the child has already exited or
+ * its stdin can no longer be written, and when the write itself fails, so a
+ * call made after shutdown cannot hang forever.
+ *
+ * @param child Bridge child process.
+ * @param pending In-flight requests keyed by id; the new request is registered here.
+ * @param id Request id echoed back in the result message.
+ * @param type Whether `name` is a tool name or a protocol command path.
+ * @param name Tool name or command path.
+ * @param args Arguments serialized into the request.
+ */
+function callBridge(
+  child: ChildProcess,
+  pending: Map<number, PendingCall>,
+  id: number,
+  type: "tool" | "command",
+  name: string,
+  args: Record<string, unknown>,
+): Promise<unknown> {
+  return new Promise((resolve, reject) => {
+    const stdin = child.stdin;
+    if (
+      stdin == null ||
+      stdin.destroyed ||
+      !stdin.writable ||
+      child.exitCode != null ||
+      child.signalCode != null
+    ) {
+      reject(
+        new Error(
+          `Understudy v4 tools process is not running; cannot send ${type} "${name}".`,
+        ),
+      );
+      return;
+    }
+    // Serialize before registering so a serialization failure leaves no stale entry.
+    const line = `${JSON.stringify({ type, id, name, args })}\n`;
+    pending.set(id, { resolve, reject });
+    stdin.write(line, (error) => {
+      if (error == null) return;
+      // Skip when the exit/error handlers already settled this call.
+      if (pending.delete(id)) reject(error);
+    });
+  });
+}
+
+/**
+ * Asks the bridge child to shut down and waits for it to exit.
+ *
+ * Sends a `close` message and ends stdin, then waits for the `exit` event. If
+ * the child is still running after 5 seconds it is sent SIGTERM and the wait
+ * ends. Any requests still pending afterwards are rejected.
+ *
+ * Returns immediately when the child has already terminated, whether by exit
+ * code or by signal (a signal-killed child has a null `exitCode`).
+ *
+ * @param child Bridge child process.
+ * @param pending In-flight requests to reject once the child is closed.
+ */
+async function closeBridge(
+  child: ChildProcess,
+  pending: Map<number, PendingCall>,
+): Promise<void> {
+  const hasTerminated = (): boolean =>
+    child.exitCode != null || child.signalCode != null;
+  if (hasTerminated()) return;
+  await new Promise<void>((resolve) => {
+    const killTimer = setTimeout(() => {
+      if (!hasTerminated()) child.kill("SIGTERM");
+      resolve();
+    }, 5000);
+    // The fallback timer must not keep the parent process alive.
+    killTimer.unref();
+    child.once("exit", () => {
+      clearTimeout(killTimer);
+      resolve();
+    });
+    const stdin = child.stdin;
+    if (stdin != null && stdin.writable) {
+      stdin.write(`${JSON.stringify({ type: "close" })}\n`);
+      stdin.end();
+    }
+  });
+  for (const call of pending.values()) {
+    call.reject(new Error("Understudy v4 tools process closed."));
+  }
+  pending.clear();
+}
+
+/**
+ * Parses one stdout line from the child as a bridge message.
+ *
+ * Only the `type` field is checked; the rest of the message is trusted.
+ *
+ * @returns The parsed message, or `null` when the line is not JSON or its
+ *   `type` is not `ready`, `result`, `event` or `error`.
+ */
+function parseBridgeMessage(
+  line: string,
+):
+  | BridgeReadyMessage
+  | BridgeResultMessage
+  | BridgeEventMessage
+  | BridgeErrorMessage
+  | null {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(line);
+  } catch {
+    return null;
+  }
+  if (parsed == null) return null;
+  switch ((parsed as { type?: unknown }).type) {
+    case "ready":
+    case "result":
+    case "event":
+    case "error":
+      return parsed as
+        | BridgeReadyMessage
+        | BridgeResultMessage
+        | BridgeEventMessage
+        | BridgeErrorMessage;
+    default:
+      return null;
+  }
+}
+
+/**
+ * Child-process side of the bridge: loads the v4 SDK, then handles one JSON
+ * message per stdin line (`init`, `subscribe`, `tool`, `command`, `close`).
+ *
+ * Failures inside a `tool`/`command` request are reported back as a `result`
+ * message with `error` set. Any other thrown error rejects the returned promise.
+ *
+ * NOTE(review): generic type arguments appear to have been stripped from this text (`Promise`, `InstanceType`, `new Map void>()`, `Record`) — confirm against the original source.
+ * NOTE(review): if stdin ends without a `close` message the loop finishes without calling `client.close()` — confirm this is intended.
+ */
+async function runBridgeChild(): Promise {
+ const sdk = await loadStagehandV4Sdk();
+ const commandByToolName = buildCommandByToolName(sdk);
+ let client: InstanceType | null = null;
+ // Event listeners registered on the client, keyed by event name, so they can be detached on close.
+ const eventSubscriptions = new Map void>();
+
+ const stdin = createInterface({ input: process.stdin });
+ for await (const line of stdin) {
+ if (!line.trim()) continue;
+ const message = JSON.parse(line) as {
+ type: "init" | "tool" | "command" | "subscribe" | "close";
+ environment?: "LOCAL" | "BROWSERBASE";
+ id?: number;
+ name?: string;
+ args?: Record;
+ };
+
+ if (message.type === "init") {
+ client = new sdk.StagehandClient(
+ understudyV4ClientOptions(message.environment ?? "LOCAL"),
+ );
+ await client.connect();
+ let cdpUrl = client.cdp.cdp_url ?? client.cdp_http_origin ?? "";
+ // An http(s) origin is exchanged for the websocket URL listed by its /json/version endpoint.
+ if (/^https?:\/\//i.test(cdpUrl)) {
+ const versionResponse = await fetch(`${cdpUrl}/json/version`);
+ if (!versionResponse.ok) {
+ throw new Error(
+ `Unable to resolve v4 browser websocket URL from ${cdpUrl}: GET /json/version -> ${versionResponse.status}`,
+ );
+ }
+ const version = (await versionResponse.json()) as {
+ webSocketDebuggerUrl?: unknown;
+ };
+ if (typeof version.webSocketDebuggerUrl !== "string") {
+ throw new Error(
+ `Unable to resolve v4 browser websocket URL from ${cdpUrl}: missing webSocketDebuggerUrl`,
+ );
+ }
+ cdpUrl = version.webSocketDebuggerUrl;
+ }
+ writeBridgeMessage({
+ type: "ready",
+ cdpUrl,
+ browserbaseExtensionId: client.browserbase_extension_id,
+ stagehand_session_id: client.stagehand_session_id,
+ toolCatalog: sdk.aiBrowserToolDefinitions(),
+ });
+ continue;
+ }
+
+ if (message.type === "subscribe") {
+ if (!client) throw new Error("Understudy v4 tools were not initialized.");
+ const name = message.name;
+ if (typeof name !== "string")
+ throw new Error("Event subscription requires an event name.");
+ // Subscribing to the same event name again is a no-op.
+ if (!eventSubscriptions.has(name)) {
+ const listener = (event: unknown): void =>
+ writeBridgeMessage({ type: "event", name, event });
+ eventSubscriptions.set(name, listener);
+ client.cdp.on(name, listener);
+ }
+ continue;
+ }
+
+ if (message.type === "tool" || message.type === "command") {
+ if (!client) throw new Error("Understudy v4 tools were not initialized.");
+ // A request without an id is answered with id 0.
+ const id = message.id ?? 0;
+ try {
+ // Tool names are translated to protocol command names; command paths are used as given.
+ const commandName =
+ message.type === "tool"
+ ? commandByToolName.get(message.name ?? "")
+ : message.name;
+ if (!commandName) {
+ throw new Error(
+ message.type === "tool"
+ ? `No v4 protocol event is exposed for tool "${message.name}".`
+ : `No v4 protocol command was provided.`,
+ );
+ }
+ const command =
+ message.type === "command"
+ ? commandForPath(client.cdp, commandName)
+ : client.cdp.Stagehand[commandName];
+ if (!command) {
+ throw new Error(
+ `The v4 SDK does not expose ${
+ message.type === "command"
+ ? commandName
+ : `Stagehand.${commandName}`
+ }.`,
+ );
+ }
+ const result = await command(message.args ?? {});
+ writeBridgeMessage({ type: "result", id, result });
+ } catch (error) {
+ writeBridgeMessage({
+ type: "result",
+ id,
+ error: error instanceof Error ? error.message : String(error),
+ });
+ }
+ continue;
+ }
+
+ if (message.type === "close") {
+ // Detach event listeners, close the client, then exit with code 0.
+ if (client) {
+ for (const [eventName, listener] of eventSubscriptions) {
+ client.cdp.off(eventName, listener);
+ }
+ }
+ await client?.close();
+ process.exit(0);
+ }
+ }
+}
+
+/**
+ * Resolves the v4 SDK entrypoint and verifies that it exists on disk.
+ *
+ * Uses `STAGEHAND_V4_SDK_PATH` when set; otherwise
+ * `<repo root>/../stagehand-driver/sdks/js/index.ts`.
+ *
+ * @returns The SDK entrypoint path.
+ * @throws Error with setup instructions when the path does not exist.
+ */
+export function assertUnderstudyV4SdkAvailable(): string {
+  const defaultSdkPath = (): string =>
+    path.join(
+      getRepoRootDir(),
+      "..",
+      "stagehand-driver",
+      "sdks",
+      "js",
+      "index.ts",
+    );
+  const sdkPath = process.env.STAGEHAND_V4_SDK_PATH ?? defaultSdkPath();
+  if (fs.existsSync(sdkPath)) return sdkPath;
+  const message = [
+    "stagehand_v4 evals require a local Stagehand v4 SDK checkout.",
+    `Expected v4 SDK entrypoint at: ${sdkPath}`,
+    "Set STAGEHAND_V4_SDK_PATH to the v4 SDK entrypoint if your checkout lives somewhere else.",
+  ].join("\n");
+  throw new Error(message);
+}
+
+/**
+ * Dynamically imports the v4 SDK from the verified entrypoint path.
+ *
+ * The module shape is asserted, not validated.
+ *
+ * @throws Error when the SDK entrypoint does not exist.
+ */
+async function loadStagehandV4Sdk(): Promise<UnderstudyV4Sdk> {
+  const sdkUrl = pathToFileURL(assertUnderstudyV4SdkAvailable());
+  const sdkModule: unknown = await import(sdkUrl.href);
+  return sdkModule as UnderstudyV4Sdk;
+}
+
+/**
+ * Chooses the `StagehandClient` constructor options from the environment.
+ *
+ * Precedence:
+ * 1. `STAGEHAND_V4_CDP_URL` set: connect to that CDP URL, whatever `environment` is;
+ * 2. `environment === "BROWSERBASE"`: create a Browserbase session using `BROWSERBASE_API_KEY`;
+ * 3. otherwise: launch a local browser, headless unless `EVAL_HEADLESS` is
+ *    `"false"`, using `CHROME_PATH` as the executable when set.
+ *
+ * @throws Error when Browserbase is requested without `BROWSERBASE_API_KEY`.
+ */
+function understudyV4ClientOptions(
+  environment: "LOCAL" | "BROWSERBASE",
+): Record<string, unknown> {
+  const {
+    STAGEHAND_V4_CDP_URL: cdpUrl,
+    BROWSERBASE_API_KEY: browserbaseApiKey,
+    EVAL_HEADLESS: evalHeadless,
+    CHROME_PATH: chromePath,
+  } = process.env;
+
+  if (cdpUrl) {
+    return { cdp_url: cdpUrl, rebuild_extension: false };
+  }
+
+  if (environment === "BROWSERBASE") {
+    if (!browserbaseApiKey) {
+      throw new Error(
+        "BROWSERBASE_API_KEY is required for understudy_v4_code.",
+      );
+    }
+    return {
+      rebuild_extension: false,
+      browserbase_session_create_params: {
+        browserbase_api_key: browserbaseApiKey,
+      },
+    };
+  }
+
+  const launchOptions: Record<string, unknown> = {
+    headless: evalHeadless !== "false",
+  };
+  if (chromePath) launchOptions.executable_path = chromePath;
+  return { local_browser_launch_options: launchOptions };
+}
+
+/**
+ * Maps LLM tool names to protocol command names.
+ *
+ * Scans `sdk.StagehandProtocolEvents` for function values exposing string
+ * `event_type` and `llm_tool_name` members, where `event_type` ends in
+ * `Event`. The command name is `event_type` without that suffix.
+ *
+ * @returns A map from `llm_tool_name` to command name; later entries overwrite
+ *   earlier ones with the same tool name.
+ */
+function buildCommandByToolName(sdk: UnderstudyV4Sdk): Map<string, string> {
+  const eventSuffix = "Event";
+  const commandByToolName = new Map<string, string>();
+  for (const candidate of Object.values(sdk.StagehandProtocolEvents)) {
+    if (typeof candidate !== "function") continue;
+    const { event_type: eventType, llm_tool_name: toolName } = candidate as {
+      event_type?: unknown;
+      llm_tool_name?: unknown;
+    };
+    if (
+      typeof eventType === "string" &&
+      typeof toolName === "string" &&
+      eventType.endsWith(eventSuffix)
+    ) {
+      commandByToolName.set(
+        toolName,
+        eventType.slice(0, eventType.length - eventSuffix.length),
+      );
+    }
+  }
+  return commandByToolName;
+}
+
+/**
+ * Looks up a `Domain.method` command function on the client's `cdp` object.
+ *
+ * Only the first two dot-separated segments of `path` are used. The function
+ * is returned as found, without being bound to its domain object.
+ *
+ * @param cdp The client's `cdp` object.
+ * @param path Command path such as `Stagehand.BrowserPageClick`.
+ * @returns The command function, or `undefined` when the path is malformed,
+ *   the domain is not a record, or the member is not a function.
+ */
+function commandForPath(
+  cdp: InstanceType<UnderstudyV4Sdk["StagehandClient"]>["cdp"],
+  path: string,
+): ((params?: Record<string, unknown>) => Promise<unknown>) | undefined {
+  const [domain, method] = path.split(".");
+  if (!domain || !method) return undefined;
+  const commands = (cdp as unknown as Record<string, unknown>)[domain];
+  if (!isRecord(commands)) return undefined;
+  const command = commands[method];
+  if (typeof command !== "function") return undefined;
+  return command as (params?: Record<string, unknown>) => Promise<unknown>;
+}
+
+/** Writes one bridge message to stdout as a single JSON line. */
+function writeBridgeMessage(
+  message:
+    | BridgeReadyMessage
+    | BridgeResultMessage
+    | BridgeEventMessage
+    | BridgeErrorMessage,
+): void {
+  const serialized = JSON.stringify(message);
+  process.stdout.write(`${serialized}\n`);
+}
+
+/**
+ * Extracts the payload record from a raw tool result.
+ *
+ * Reads `event_results` (or `event.event_results`) and takes its first entry:
+ * that entry's `result` when it is a record, otherwise the entry itself.
+ * Falls back to `value` when there is no usable first entry.
+ *
+ * @returns The payload record; `{}` when `value` is not a record.
+ */
+function firstPayload(value: unknown): Record<string, unknown> {
+  if (!isRecord(value)) return {};
+  let eventResults = value.event_results;
+  if (eventResults == null && isRecord(value.event)) {
+    eventResults = value.event.event_results;
+  }
+  if (!isRecord(eventResults)) return value;
+  const [first] = Object.values(eventResults);
+  if (!isRecord(first)) return value;
+  return isRecord(first.result) ? first.result : first;
+}
+
+/**
+ * Reads a non-empty string field from a record.
+ *
+ * @returns The string, or `null` when the field is missing, empty or not a string.
+ */
+function stringField(
+  record: Record<string, unknown>,
+  key: string,
+): string | null {
+  const candidate = record[key];
+  if (typeof candidate !== "string" || candidate.length === 0) return null;
+  return candidate;
+}
+
+/**
+ * Produces a size-limited copy of a value for inclusion in model output.
+ *
+ * - Strings longer than 2000 characters are cut and suffixed with `...[truncated]`.
+ * - Arrays and records are processed recursively.
+ * - Under keys containing `screenshot` or `image` (case-insensitive), strings
+ *   longer than 80 characters are cut the same way; non-string values under
+ *   such keys are copied through without recursion.
+ * - Everything else is returned unchanged.
+ */
+function sanitizeForModel(value: unknown): unknown {
+  if (typeof value === "string") {
+    return value.length > 2000
+      ? `${value.slice(0, 2000)}...[truncated]`
+      : value;
+  }
+  if (Array.isArray(value)) {
+    return value.map((entry) => sanitizeForModel(entry));
+  }
+  if (!isRecord(value)) return value;
+  const result: Record<string, unknown> = {};
+  for (const [key, entry] of Object.entries(value)) {
+    const lowerKey = key.toLowerCase();
+    const isImageLike =
+      lowerKey.includes("screenshot") || lowerKey.includes("image");
+    if (!isImageLike) {
+      result[key] = sanitizeForModel(entry);
+      continue;
+    }
+    // Keep only a short prefix of inline image data.
+    result[key] =
+      typeof entry === "string" && entry.length > 80
+        ? `${entry.slice(0, 80)}...[truncated]`
+        : entry;
+  }
+  return result;
+}
+
+/**
+ * Merges an element-id to selector mapping into `selectorMap` in place.
+ *
+ * Entries whose selector is not a record are skipped; existing ids are
+ * overwritten. Does nothing when `value` is not a record.
+ */
+function updateSelectorMap(
+  selectorMap: Record<string, Record<string, unknown>>,
+  value: unknown,
+): void {
+  if (!isRecord(value)) return;
+  for (const elementId of Object.keys(value)) {
+    const selector = value[elementId];
+    if (isRecord(selector)) selectorMap[elementId] = selector;
+  }
+}
+
+/**
+ * Recursively replaces `elementId` references with the selectors stored in
+ * `selectorMap`.
+ *
+ * For every record, the `elementId` key is removed. When it was a string with
+ * a mapped selector, that selector's fields are merged in, and the record's
+ * own remaining fields win on conflict. An `elementId` without a mapping is
+ * dropped without replacement.
+ *
+ * @param value Tool arguments (or any nested part of them).
+ * @param selectorMap Element-id to selector map.
+ * @returns A hydrated copy; non-record, non-array values are returned unchanged.
+ */
+function hydrateSelectorReferences(
+  value: unknown,
+  selectorMap: Record<string, Record<string, unknown>>,
+): unknown {
+  if (Array.isArray(value)) {
+    return value.map((entry) => hydrateSelectorReferences(entry, selectorMap));
+  }
+  if (!isRecord(value)) return value;
+  const hydratedEntries: [string, unknown][] = [];
+  for (const [key, entry] of Object.entries(value)) {
+    if (key === "elementId") continue;
+    hydratedEntries.push([key, hydrateSelectorReferences(entry, selectorMap)]);
+  }
+  const hydratedRecord = Object.fromEntries(hydratedEntries);
+  const mappedSelector =
+    typeof value.elementId === "string" ? selectorMap[value.elementId] : null;
+  if (mappedSelector == null) return hydratedRecord;
+  return { ...mappedSelector, ...hydratedRecord };
+}
+
+/** Type guard: true for any non-null object that is not an array. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+
+// Child entrypoint: run the bridge loop only when the marker env var is set
+// and this file is the script being executed.
+if (
+  process.env.UNDERSTUDY_V4_TOOLS_CHILD === "1" &&
+  process.argv[1] &&
+  path.resolve(process.argv[1]) === fileURLToPath(import.meta.url)
+) {
+  // Report the failure to the parent as an `error` message, then exit with code 1.
+  const reportFatalAndExit = (error: unknown): void => {
+    const text = error instanceof Error ? error.message : String(error);
+    writeBridgeMessage({ type: "error", error: text });
+    process.exit(1);
+  };
+  void runBridgeChild().catch(reportFatalAndExit);
+}
diff --git a/packages/evals/framework/benchHarness.ts b/packages/evals/framework/benchHarness.ts
index c2277ea36..1cc580a90 100644
--- a/packages/evals/framework/benchHarness.ts
+++ b/packages/evals/framework/benchHarness.ts
@@ -1,24 +1,11 @@
-import {
- AgentProvider,
- getAISDKLanguageModel,
- loadApiKeyFromEnv,
- type AgentInstance,
- type AvailableModel,
- type LLMClient,
- type LogLine,
- type V3,
-} from "@browserbasehq/stagehand";
-import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
-import { endBrowserbaseSession } from "../browserbaseCleanup.js";
+import type { AgentInstance, V3 } from "@browserbasehq/stagehand";
import { EvalsError } from "../errors.js";
import type { EvalLogger } from "../logger.js";
-import type { V3InitResult } from "../initV3.js";
import type { EvalInput } from "../types/evals.js";
-import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
-import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
-import { runCodexAgent } from "./codexRunner.js";
-import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
-import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
+import { ClaudeAgentHarness } from "./ClaudeAgentHarness.js";
+import { CodexAgentHarness } from "./CodexAgentHarness.js";
+import { StagehandAgentV3Harness } from "./StagehandAgentV3Harness.js";
+import type { UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js";
import type { DiscoveredTask, TaskResult } from "./types.js";
import type { BenchMatrixRow, BenchTaskKind, Harness } from "./benchTypes.js";
@@ -41,6 +28,7 @@ export interface BenchHarnessContext {
row: BenchMatrixRow;
logger: EvalLogger;
v3?: V3;
+ v4?: UnderstudyV4NativeRuntime;
agent?: AgentInstance;
page?: Page;
debugUrl: string;
@@ -60,28 +48,8 @@ export interface BenchHarness {
start(input: BenchHarnessStartInput): Promise;
}
-function isAgentTask(task: DiscoveredTask): boolean {
- return (
- task.primaryCategory === "agent" ||
- task.categories.includes("agent") ||
- task.categories.includes("external_agent_benchmarks")
- );
-}
-
-function resolveProvider(modelName: AvailableModel): string | undefined {
- if (modelName.includes("/")) {
- return modelName.split("/")[0];
- }
-
- try {
- return AgentProvider.getAgentProvider(modelName);
- } catch {
- return undefined;
- }
-}
-
-export const stagehandHarness: BenchHarness = {
- harness: "stagehand",
+export const StagehandAgentV4Harness: BenchHarness = {
+ harness: "stagehand_v4",
supportedTaskKinds: [
"act",
"extract",
@@ -90,193 +58,28 @@ export const stagehandHarness: BenchHarness = {
"combination",
"suite",
],
- supportsApi: true,
- async start({
- task,
- input,
- row,
- logger,
- verbose,
- }: BenchHarnessStartInput): Promise {
- let v3Result: V3InitResult | undefined;
- const createAgent = isAgentTask(task);
- if (row.config.harness !== "stagehand") {
- throw new EvalsError(
- `Harness "${row.config.harness}" is not implemented yet. Use --harness stagehand for the current unified runner.`,
- );
- }
- const config = row.config;
- const agentMode = config.agentMode ?? input.agentMode;
- const isCUA = config.isCUA ?? input.isCUA;
-
- if (config.useApi) {
- const provider = resolveProvider(input.modelName);
- const logFn = (line: LogLine) => logger.log(line);
- const apiKey = loadApiKeyFromEnv(provider, logFn);
- if (!apiKey) {
- throw new EvalsError(
- `USE_API=true but no API key found for provider "${provider}".`,
- );
- }
- const { initV3 } = await import("../initV3.js");
- v3Result = await initV3({
- logger,
- modelName: input.modelName,
- modelClientOptions: { apiKey },
- createAgent,
- agentMode,
- isCUA,
- verbose,
- configOverrides: { env: config.environment },
- });
- } else {
- let llmClient: LLMClient | undefined;
- if (input.modelName.includes("/")) {
- const firstSlashIndex = input.modelName.indexOf("/");
- llmClient = new AISdkClientWrapped({
- model: getAISDKLanguageModel(
- input.modelName.substring(0, firstSlashIndex),
- input.modelName.substring(firstSlashIndex + 1),
- ),
- });
- }
- const { initV3 } = await import("../initV3.js");
- v3Result = await initV3({
- logger,
- llmClient,
- modelName: input.modelName,
- createAgent,
- agentMode,
- isCUA,
- verbose,
- configOverrides: { env: config.environment },
- });
- }
-
- return {
- ctx: {
- harness: "stagehand",
- row,
- logger,
- v3: v3Result.v3,
- agent: v3Result.agent,
- page: v3Result.v3.context.pages()[0],
- debugUrl: v3Result.debugUrl ?? "",
- sessionUrl: v3Result.sessionUrl ?? "",
- },
- cleanup: async () => {
- if (v3Result?.v3) {
- try {
- await v3Result.v3.close();
- } catch (closeError) {
- console.error(
- `Warning: Error closing V3 instance for ${input.name}:`,
- closeError,
- );
- }
- }
- await endBrowserbaseSession(v3Result?.v3);
- },
- };
- },
-};
-
-export const claudeCodeHarness: BenchHarness = {
- harness: "claude_code",
- supportedTaskKinds: ["agent", "suite"],
supportsApi: false,
- async execute({
- input,
- row,
- logger,
- signal,
- }: BenchHarnessExecuteInput): Promise {
- const plan = buildExternalHarnessTaskPlan(input);
- if (row.config.harness !== "claude_code") {
- throw new EvalsError(
- `Expected claude_code harness config, received "${row.config.harness}".`,
- );
- }
- const toolAdapter = await prepareClaudeCodeToolAdapter({
- toolSurface: row.config.toolSurface,
- startupProfile: row.config.startupProfile,
- environment: row.config.environment,
- plan,
- logger,
- });
- try {
- return await runClaudeCodeAgent({
- plan,
- model: input.modelName,
- logger,
- toolAdapter,
- signal,
- });
- } finally {
- await toolAdapter.cleanup();
- }
- },
- async start(): Promise {
- throw new EvalsError(
- "Claude Code harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness claude_code.",
- );
- },
-};
-
-export const codexHarness: BenchHarness = {
- harness: "codex",
- supportedTaskKinds: ["agent", "suite"],
- supportsApi: false,
- async execute({
- input,
- row,
- logger,
- signal,
- }: BenchHarnessExecuteInput): Promise {
- const plan = buildExternalHarnessTaskPlan(input);
- if (row.config.harness !== "codex") {
- throw new EvalsError(
- `Expected codex harness config, received "${row.config.harness}".`,
- );
- }
- const toolAdapter = await prepareCodexToolAdapter({
- toolSurface: row.config.toolSurface,
- startupProfile: row.config.startupProfile,
- environment: row.config.environment,
- plan,
- logger,
- });
- try {
- return await runCodexAgent({
- plan,
- model: input.modelName,
- logger,
- toolAdapter,
- signal,
- });
- } finally {
- await toolAdapter.cleanup();
- }
- },
- async start(): Promise {
- throw new EvalsError(
- "Codex harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness codex.",
- );
+ // Resolve the real v4 harness through a dynamic import at call time and
+ // forward the start request to it unchanged.
+ async start(input: BenchHarnessStartInput): Promise {
+   const { StagehandAgentV4Harness: implementation } = await import(
+     "./StagehandAgentV4Harness.js"
+   );
+   return implementation.start(input);
+ },
};
+// Lookup table from harness name to its harness object; read by getBenchHarness.
const harnessRegistry = new Map([
- ["stagehand", stagehandHarness],
- ["claude_code", claudeCodeHarness],
- ["codex", codexHarness],
+ ["stagehand_v3", StagehandAgentV3Harness],
+ ["stagehand_v4", StagehandAgentV4Harness],
+ ["claude_code", ClaudeAgentHarness],
+ ["codex", CodexAgentHarness],
]);
+/**
+ * Returns the harness implementation registered under `harness`.
+ *
+ * @param harness - Name of the harness to look up in `harnessRegistry`.
+ * @returns The registered `BenchHarness`.
+ * @throws EvalsError when no implementation is registered for that name.
+ */
export function getBenchHarness(harness: Harness): BenchHarness {
const implementation = harnessRegistry.get(harness);
if (!implementation) {
- throw new EvalsError(
- `Harness "${harness}" is not implemented yet. Use --harness stagehand for the current unified runner.`,
- );
+ throw new EvalsError(`Harness "${harness}" is not implemented yet.`);
}
return implementation;
}
+
+// Re-export the harness objects defined in their own modules so they can also
+// be imported from this module.
+export { ClaudeAgentHarness } from "./ClaudeAgentHarness.js";
+export { CodexAgentHarness } from "./CodexAgentHarness.js";
+export { StagehandAgentV3Harness } from "./StagehandAgentV3Harness.js";
diff --git a/packages/evals/framework/benchPlanner.ts b/packages/evals/framework/benchPlanner.ts
index 5f93ba39b..1b6a48875 100644
--- a/packages/evals/framework/benchPlanner.ts
+++ b/packages/evals/framework/benchPlanner.ts
@@ -97,15 +97,18 @@ export function resolveBenchModelEntries(
effectiveCategory === "agent" ||
effectiveCategory === "external_agent_benchmarks";
const harness = options.harness ?? DEFAULT_BENCH_HARNESS;
- const requestedAgentModes =
- harness === "stagehand" ? resolveRequestedAgentModes(options) : undefined;
+ const usesStagehandHarness =
+ harness === "stagehand_v3" || harness === "stagehand_v4";
+ const requestedAgentModes = usesStagehandHarness
+ ? resolveRequestedAgentModes(options)
+ : undefined;
if (options.modelOverride) {
const baseModes =
isAgentCategory && requestedAgentModes
? requestedAgentModes
: [
- harness === "stagehand"
+ usesStagehandHarness
? resolveAgentModeForModel(options.modelOverride)
: "hybrid",
];
@@ -345,9 +348,9 @@ function buildBenchHarnessConfig(input: {
startupProfile?: StartupProfile;
dataset?: string;
}): BenchHarnessConfig {
- if (input.harness === "stagehand") {
+ if (input.harness === "stagehand_v3" || input.harness === "stagehand_v4") {
return {
- harness: "stagehand",
+ harness: input.harness,
model: input.model,
provider: input.provider,
environment: input.environment,
@@ -387,6 +390,9 @@ export function generateBenchTestcases(
modelEntries,
);
const allTestcases = [...suiteTestcases.testcases];
+ const harness = options.harness ?? DEFAULT_BENCH_HARNESS;
+ const usesStagehandHarness =
+ harness === "stagehand_v3" || harness === "stagehand_v4";
if (options.harness === "claude_code" || options.harness === "codex") {
if (suiteTestcases.remainingTasks.length > 0) {
@@ -409,16 +415,16 @@ export function generateBenchTestcases(
model,
options,
undefined,
- isAgentCategory && rowUsesStagehand(options)
+ isAgentCategory && usesStagehandHarness
? entry.mode === "cua"
: undefined,
- isAgentCategory && rowUsesStagehand(options)
+ isAgentCategory && usesStagehandHarness
? (options.agentMode ?? entry.mode)
: undefined,
);
const agentMode = row.agentMode;
const includeStagehandAgentMode =
- isAgentCategory && rowUsesStagehand(options) && agentMode;
+ isAgentCategory && usesStagehandHarness && agentMode;
allTestcases.push({
input: {
name: task.name,
@@ -460,10 +466,6 @@ export function generateBenchTestcases(
return allTestcases;
}
-function rowUsesStagehand(options: Pick): boolean {
- return (options.harness ?? DEFAULT_BENCH_HARNESS) === "stagehand";
-}
-
function resolveBenchRowToolSurface(
harness: Harness,
requested?: ToolSurface,
@@ -474,6 +476,19 @@ function resolveBenchRowToolSurface(
if (harness === "codex") {
return resolveCodexToolSurface(requested);
}
+ if (harness === "stagehand_v4") {
+ if (requested && requested !== "understudy_v4_code") {
+ throw new EvalsError(
+ `stagehand_v4 uses the UnderstudyV4Tools surface. Received --tool ${requested}.`,
+ );
+ }
+ return requested ?? "understudy_v4_code";
+ }
+ if (harness === "stagehand_v3" && requested === "understudy_v4_code") {
+ throw new EvalsError(
+ "Use --harness stagehand_v4 for the UnderstudyV4Tools surface.",
+ );
+ }
return requested;
}
@@ -547,7 +562,8 @@ function withBenchMetadata(
task: DiscoveredTask,
options: BenchPlanOptions,
): Testcase {
- const isStagehand = rowUsesStagehand(options);
+ const harness = options.harness ?? DEFAULT_BENCH_HARNESS;
+ const isStagehand = harness === "stagehand_v3" || harness === "stagehand_v4";
const agentMode = isStagehand
? (options.agentMode ?? testcase.input.agentMode)
: undefined;
diff --git a/packages/evals/framework/benchRunner.ts b/packages/evals/framework/benchRunner.ts
index e719db56d..51193dc20 100644
--- a/packages/evals/framework/benchRunner.ts
+++ b/packages/evals/framework/benchRunner.ts
@@ -29,7 +29,6 @@ export async function executeBenchTask(
): Promise {
const logger = new EvalLogger(Boolean(options.verbose));
const harnessName = options.harness ?? DEFAULT_BENCH_HARNESS;
- const harness = getBenchHarness(harnessName);
const row = buildBenchMatrixRow(
task,
input.modelName,
@@ -38,6 +37,7 @@ export async function executeBenchTask(
input.isCUA,
input.agentMode,
);
+ const harness = getBenchHarness(harnessName);
let cleanup: () => Promise = async () => {};
let unregisterCleanup: (() => void) | undefined;
let harnessCtx: BenchHarnessContext | undefined;
@@ -67,8 +67,13 @@ export async function executeBenchTask(
harnessCtx = startedHarness.ctx;
const taskModule = await loadTaskModuleFromPath(task.filePath, task.name);
if (taskModule.definition) {
+ const taskFn =
+ taskModule.definition.benchFns?.[harnessCtx.harness] ??
+ taskModule.definition.benchFns?.default ??
+ taskModule.definition.fn;
const ctx = {
v3: harnessCtx.v3,
+ v4: harnessCtx.v4,
agent: harnessCtx.agent,
page: harnessCtx.page,
logger,
@@ -78,7 +83,7 @@ export async function executeBenchTask(
sessionUrl: harnessCtx.sessionUrl,
};
return withBenchSessionUrls(
- (await taskModule.definition.fn(ctx)) as TaskResult,
+ (await taskFn(ctx)) as TaskResult,
harnessCtx,
);
}
@@ -86,6 +91,7 @@ export async function executeBenchTask(
return withBenchSessionUrls(
await taskModule.legacyFn({
v3: harnessCtx.v3,
+ v4: harnessCtx.v4,
logger,
debugUrl: harnessCtx.debugUrl,
sessionUrl: harnessCtx.sessionUrl,
@@ -117,10 +123,7 @@ export async function executeBenchTask(
return withBenchSessionUrls(
{
_success: false,
- error:
- error instanceof Error
- ? JSON.parse(JSON.stringify(error, null, 2))
- : String(error),
+ error: error instanceof Error ? error.message : String(error),
logs: logger.getLogs(),
},
harnessCtx,
diff --git a/packages/evals/framework/benchTypes.ts b/packages/evals/framework/benchTypes.ts
index 2a3af7cc6..3fce7d950 100644
--- a/packages/evals/framework/benchTypes.ts
+++ b/packages/evals/framework/benchTypes.ts
@@ -1,18 +1,20 @@
import type { AgentToolMode, AvailableModel } from "@browserbasehq/stagehand";
import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js";
-export type Harness = "stagehand" | "claude_code" | "codex";
+export type Harness = "stagehand_v3" | "stagehand_v4" | "claude_code" | "codex";
-export const DEFAULT_BENCH_HARNESS: Harness = "stagehand";
+export const DEFAULT_BENCH_HARNESS: Harness = "stagehand_v3";
export const SUPPORTED_BENCH_HARNESSES = [
- "stagehand",
+ "stagehand_v3",
+ "stagehand_v4",
"claude_code",
"codex",
] as const satisfies readonly Harness[];
export const EXECUTABLE_BENCH_HARNESSES = [
- "stagehand",
+ "stagehand_v3",
+ "stagehand_v4",
"claude_code",
"codex",
] as const satisfies readonly Harness[];
@@ -42,7 +44,7 @@ export type BenchTaskKind =
| "suite";
export interface StagehandHarnessConfig {
- harness: "stagehand";
+ harness: "stagehand_v3" | "stagehand_v4";
model: AvailableModel;
provider?: string;
environment: "LOCAL" | "BROWSERBASE";
diff --git a/packages/evals/framework/context.ts b/packages/evals/framework/context.ts
index daa8eabea..3b09e4fcd 100644
--- a/packages/evals/framework/context.ts
+++ b/packages/evals/framework/context.ts
@@ -13,7 +13,6 @@ import { type V3InitResult, initV3 } from "../initV3.js";
import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js";
import { coreFixtureRoutes } from "../core/fixtures/index.js";
import { prepareCoreBrowserTarget } from "../core/targets/index.js";
-import { getCoreTool } from "../core/tools/registry.js";
import { ensureCoreFixtureServer } from "../core/fixtures/server.js";
import { EvalLogger } from "../logger.js";
import { createAssertHelpers } from "./assertions.js";
@@ -41,7 +40,7 @@ export function resolveDefaultCoreStartupProfile(
return environment === "BROWSERBASE"
? "tool_create_browserbase"
: "tool_launch_local";
- case "understudy_code":
+ case "understudy_v3_code":
case "playwright_code":
case "cdp_code":
case "playwright_mcp":
@@ -69,7 +68,8 @@ export async function buildCoreContext(
): Promise {
const logger = options.logger ?? new EvalLogger();
const environment = options.environment ?? "LOCAL";
- const toolSurface = options.toolSurface ?? "understudy_code";
+ const toolSurface = options.toolSurface ?? "understudy_v3_code";
+ const { getCoreTool } = await import("../core/tools/registry.js");
const tool = getCoreTool(toolSurface);
const startupProfile =
options.startupProfile ??
diff --git a/packages/evals/framework/defineTask.ts b/packages/evals/framework/defineTask.ts
index b03d3e037..2f754320b 100644
--- a/packages/evals/framework/defineTask.ts
+++ b/packages/evals/framework/defineTask.ts
@@ -5,6 +5,8 @@
* the file lives in during auto-discovery.
*/
import type {
+ BenchTaskFn,
+ BenchTaskImplementations,
BenchTaskContext,
BenchTaskMeta,
CoreTaskContext,
@@ -34,8 +36,31 @@ export function defineCoreTask(
*/
export function defineBenchTask(
meta: BenchTaskMeta,
- fn: (ctx: BenchTaskContext) => Promise,
+ fn: BenchTaskFn,
+): TaskDefinition;
+export function defineBenchTask(
+ meta: BenchTaskMeta,
+ fn: BenchTaskImplementations,
+): TaskDefinition;
+export function defineBenchTask(
+ meta: BenchTaskMeta,
+ fn: BenchTaskFn | BenchTaskImplementations,
): TaskDefinition {
+ if (typeof fn !== "function") {
+ return {
+ __taskDefinition: true,
+ meta,
+ fn:
+ fn.default ??
+ (async () => {
+ throw new Error(
+ `No default bench implementation is defined for "${meta.name ?? "unnamed task"}".`,
+ );
+ }),
+ benchFns: fn,
+ };
+ }
+
return {
__taskDefinition: true,
meta,
diff --git a/packages/evals/framework/runner.ts b/packages/evals/framework/runner.ts
index 336db1c02..8147a8fc1 100644
--- a/packages/evals/framework/runner.ts
+++ b/packages/evals/framework/runner.ts
@@ -321,7 +321,7 @@ export async function runEvals(
(t: DiscoveredTask) => t.tier === "core",
);
const effectiveCoreToolSurface = hasCoreOnly
- ? (options.coreToolSurface ?? "understudy_code")
+ ? (options.coreToolSurface ?? "understudy_v3_code")
: undefined;
const effectiveCoreStartupProfile =
hasCoreOnly && effectiveCoreToolSurface
diff --git a/packages/evals/framework/taskLoader.ts b/packages/evals/framework/taskLoader.ts
index d7a218ac9..0aeaa5291 100644
--- a/packages/evals/framework/taskLoader.ts
+++ b/packages/evals/framework/taskLoader.ts
@@ -7,6 +7,7 @@ export interface LoadedTaskDefinition {
__taskDefinition: true;
meta: unknown;
fn: (ctx: unknown) => Promise;
+ benchFns?: Record Promise) | undefined>;
}
export type LegacyTaskFn = (ctx: unknown) => Promise;
diff --git a/packages/evals/framework/types.ts b/packages/evals/framework/types.ts
index 359605b12..0441b0d98 100644
--- a/packages/evals/framework/types.ts
+++ b/packages/evals/framework/types.ts
@@ -23,6 +23,8 @@ import type {
ToolSurface,
} from "../core/contracts/tool.js";
import type { EvalLogger } from "../logger.js";
+import type { Harness } from "./benchTypes.js";
+import type { UnderstudyV4NativeRuntime } from "./UnderstudyV4Tools.js";
/** Page type inferred from V3.context.pages()[0] */
type Page = ReturnType[number];
@@ -70,6 +72,8 @@ export interface CoreTaskContext {
export interface BenchTaskContext {
/** Stagehand V3 instance. */
v3: V3;
+ /** Native Stagehand v4 SDK proxy. Present for the stagehand_v4 harness. */
+ v4?: UnderstudyV4NativeRuntime;
/** Agent instance (created when the task lives under agent/). */
agent?: AgentInstance;
/** Playwright page (convenience — same as v3.context.pages()[0]). */
@@ -128,6 +132,12 @@ export interface MetricsCollector {
getSummary(): Record>;
}
+export type BenchTaskFn = (ctx: BenchTaskContext) => Promise;
+
+export type BenchTaskImplementations = Partial> & {
+ default?: BenchTaskFn;
+};
+
export interface TaskDefinition {
/** Marker to identify defineTask outputs during discovery. */
__taskDefinition: true;
@@ -135,6 +145,8 @@ export interface TaskDefinition {
meta: TaskMeta | BenchTaskMeta;
/** The task function. */
fn: (ctx: CoreTaskContext | BenchTaskContext) => Promise;
+ /** Optional harness-native bench implementations. */
+ benchFns?: BenchTaskImplementations;
/** Which tier this task was defined for (set during discovery from directory). */
tier?: Tier;
}
diff --git a/packages/evals/lib/braintrust-report.ts b/packages/evals/lib/braintrust-report.ts
index 6fbb0fb99..a86da30e6 100644
--- a/packages/evals/lib/braintrust-report.ts
+++ b/packages/evals/lib/braintrust-report.ts
@@ -1363,7 +1363,7 @@ export function summarizeBenchCases(
function agentConfigKey(benchCase: BenchCaseRow): string {
return [
- benchCase.harness ?? "stagehand",
+ benchCase.harness ?? "stagehand_v3",
benchCase.provider ?? "",
benchCase.environment ?? "",
benchCase.api === undefined ? "" : benchCase.api ? "api" : "local",
@@ -1375,7 +1375,7 @@ function agentConfigKey(benchCase: BenchCaseRow): string {
function agentConfigLabel(benchCase: BenchCaseRow): string {
const parts = [
- benchCase.harness ?? "stagehand",
+ benchCase.harness ?? "stagehand_v3",
benchCase.agentMode,
benchCase.provider,
benchCase.environment,
diff --git a/packages/evals/tests/cli.test.ts b/packages/evals/tests/cli.test.ts
index 7b057e322..357f8764f 100644
--- a/packages/evals/tests/cli.test.ts
+++ b/packages/evals/tests/cli.test.ts
@@ -85,7 +85,7 @@ describe("CLI entrypoint", () => {
expect(payload.envOverrides.EVAL_ENV).toBe("BROWSERBASE");
expect(payload.envOverrides.USE_API).toBe("true");
expect(payload.envOverrides.EVAL_PROVIDER).toBe("openai");
- expect(payload.runOptions.harness).toBe("stagehand");
+ expect(payload.runOptions.harness).toBe("stagehand_v3");
expect(payload.runOptions.verbose).toBe(false);
});
@@ -173,7 +173,7 @@ describe.sequential("core config", () => {
const { stdout, code } = await runCli(["config", "core"]);
expect(code).toBe(0);
expect(stdout).toContain("Core configuration");
- expect(stdout).toContain("runner default: understudy_code");
+ expect(stdout).toContain("runner default: understudy_v3_code");
});
it("persists tool via `config core set tool`", async () => {
@@ -183,18 +183,18 @@ describe.sequential("core config", () => {
"core",
"set",
"tool",
- "understudy_code",
+ "understudy_v3_code",
]);
expect(setResult.code).toBe(0);
- expect(setResult.stdout).toContain("Set core.tool to understudy_code");
+ expect(setResult.stdout).toContain("Set core.tool to understudy_v3_code");
const saved = JSON.parse(fs.readFileSync(SOURCE_CONFIG, "utf-8"));
- expect(saved.core?.tool).toBe("understudy_code");
+ expect(saved.core?.tool).toBe("understudy_v3_code");
});
it("flows persisted core.tool into run dry-run output", async () => {
resetConfig();
- await runCli(["config", "core", "set", "tool", "understudy_code"]);
+ await runCli(["config", "core", "set", "tool", "understudy_v3_code"]);
const { stdout, code } = await runCli([
"run",
@@ -203,7 +203,7 @@ describe.sequential("core config", () => {
]);
expect(code).toBe(0);
const payload = JSON.parse(stdout);
- expect(payload.runOptions.coreToolSurface).toBe("understudy_code");
+ expect(payload.runOptions.coreToolSurface).toBe("understudy_v3_code");
});
it("rejects unknown tool", async () => {
@@ -271,7 +271,7 @@ describe.sequential("core config", () => {
it("reset clears the whole core section", async () => {
resetConfig();
- await runCli(["config", "core", "set", "tool", "understudy_code"]);
+ await runCli(["config", "core", "set", "tool", "understudy_v3_code"]);
const { code } = await runCli(["config", "core", "reset"]);
expect(code).toBe(0);
diff --git a/packages/evals/tests/framework/benchHarness.test.ts b/packages/evals/tests/framework/benchHarness.test.ts
index 60989664a..4c5ddf7a4 100644
--- a/packages/evals/tests/framework/benchHarness.test.ts
+++ b/packages/evals/tests/framework/benchHarness.test.ts
@@ -1,15 +1,31 @@
import { describe, expect, it } from "vitest";
import {
- claudeCodeHarness,
- codexHarness,
+ ClaudeAgentHarness,
+ CodexAgentHarness,
getBenchHarness,
+ StagehandAgentV3Harness,
+ StagehandAgentV4Harness,
} from "../../framework/benchHarness.js";
describe("bench harness registry", () => {
+ // The stagehand_v3 key must resolve to the exact v3 harness object, and that
+ // harness reports API support.
+ it("registers stagehand_v3 as the v3 Stagehand agent harness", () => {
+   const resolved = getBenchHarness("stagehand_v3");
+   const { supportsApi } = resolved;
+
+   expect(resolved).toBe(StagehandAgentV3Harness);
+   expect(supportsApi).toBe(true);
+ });
+
+ // The stagehand_v4 key must resolve to the exact v4 harness object, and that
+ // harness reports no API support.
+ it("registers stagehand_v4 as the v4 Stagehand agent harness", () => {
+   const resolved = getBenchHarness("stagehand_v4");
+   const { supportsApi } = resolved;
+
+   expect(resolved).toBe(StagehandAgentV4Harness);
+   expect(supportsApi).toBe(false);
+ });
+
it("registers claude_code as a concrete executable harness", () => {
const harness = getBenchHarness("claude_code");
- expect(harness).toBe(claudeCodeHarness);
+ expect(harness).toBe(ClaudeAgentHarness);
expect(harness.supportedTaskKinds).toEqual(["agent", "suite"]);
expect(harness.supportsApi).toBe(false);
expect(harness.execute).toBeDefined();
@@ -18,7 +34,7 @@ describe("bench harness registry", () => {
it("registers codex as a concrete executable harness", () => {
const harness = getBenchHarness("codex");
- expect(harness).toBe(codexHarness);
+ expect(harness).toBe(CodexAgentHarness);
expect(harness.supportedTaskKinds).toEqual(["agent", "suite"]);
expect(harness.supportsApi).toBe(false);
expect(harness.execute).toBeDefined();
diff --git a/packages/evals/tests/framework/benchPlanner.test.ts b/packages/evals/tests/framework/benchPlanner.test.ts
index fdc885c96..5143b9711 100644
--- a/packages/evals/tests/framework/benchPlanner.test.ts
+++ b/packages/evals/tests/framework/benchPlanner.test.ts
@@ -34,7 +34,7 @@ describe("benchPlanner", () => {
);
expect(row).toMatchObject({
- harness: "stagehand",
+ harness: "stagehand_v3",
task: "dropdown",
category: "act",
taskKind: "act",
@@ -43,7 +43,7 @@ describe("benchPlanner", () => {
environment: "BROWSERBASE",
useApi: true,
config: {
- harness: "stagehand",
+ harness: "stagehand_v3",
model: "openai/gpt-4.1-mini",
provider: "openai",
environment: "BROWSERBASE",
@@ -55,13 +55,13 @@ describe("benchPlanner", () => {
it("annotates generated bench testcases with harness metadata", () => {
const [testcase] = generateBenchTestcases([makeTask()], {
modelOverride: "openai/gpt-4.1-mini",
- harness: "stagehand",
+ harness: "stagehand_v3",
environment: "LOCAL",
});
expect(testcase.input.modelName).toBe("openai/gpt-4.1-mini");
- expect(testcase.tags).toContain("harness/stagehand");
- expect(testcase.metadata.harness).toBe("stagehand");
+ expect(testcase.tags).toContain("harness/stagehand_v3");
+ expect(testcase.metadata.harness).toBe("stagehand_v3");
expect(testcase.metadata.environment).toBe("LOCAL");
});
@@ -78,7 +78,7 @@ describe("benchPlanner", () => {
{
modelOverride: cuaModel,
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
},
);
@@ -100,7 +100,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-5.4-mini",
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
},
);
@@ -123,7 +123,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-4.1-mini",
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentMode: "dom",
},
);
@@ -153,7 +153,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-4.1-mini",
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["dom", "hybrid"],
},
),
@@ -191,7 +191,7 @@ describe("benchPlanner", () => {
],
{
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
},
),
);
@@ -226,7 +226,7 @@ describe("benchPlanner", () => {
],
{
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["dom", "hybrid"],
},
),
@@ -265,7 +265,7 @@ describe("benchPlanner", () => {
],
{
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["cua"],
},
),
@@ -296,7 +296,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-4.1-mini",
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentMode: "cua",
},
),
@@ -323,7 +323,7 @@ describe("benchPlanner", () => {
],
{
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["cua"],
},
),
@@ -338,7 +338,7 @@ describe("benchPlanner", () => {
it("does not expand non-agent model overrides across agent modes", () => {
const testcases = generateBenchTestcases([makeTask()], {
modelOverride: "openai/gpt-4.1-mini",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["dom", "hybrid"],
});
@@ -475,7 +475,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-4.1-mini",
datasetFilter: "webvoyager",
- harness: "stagehand",
+ harness: "stagehand_v3",
},
),
);
@@ -510,7 +510,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-4.1-mini",
datasetFilter: "onlineMind2Web",
- harness: "stagehand",
+ harness: "stagehand_v3",
},
),
);
@@ -541,7 +541,7 @@ describe("benchPlanner", () => {
{
modelOverride: "openai/gpt-4.1-mini",
datasetFilter: "webtailbench",
- harness: "stagehand",
+ harness: "stagehand_v3",
},
),
);
diff --git a/packages/evals/tests/framework/benchRunner.test.ts b/packages/evals/tests/framework/benchRunner.test.ts
index 08245618d..bd4afc8c9 100644
--- a/packages/evals/tests/framework/benchRunner.test.ts
+++ b/packages/evals/tests/framework/benchRunner.test.ts
@@ -96,7 +96,7 @@ describe("bench runner", () => {
tasks: [task],
registry: makeRegistry([task]),
environment: "BROWSERBASE",
- harness: "stagehand",
+ harness: "stagehand_v3",
verbose: false,
},
);
diff --git a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts
index e28775652..798dd72e4 100644
--- a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts
+++ b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts
@@ -55,7 +55,7 @@ describe("claude code tool adapter resolution", () => {
});
it("rejects unsupported Claude Code tool surfaces for now", () => {
- expect(() => resolveClaudeCodeToolSurface("understudy_code")).toThrow(
+ expect(() => resolveClaudeCodeToolSurface("understudy_v3_code")).toThrow(
/supports --tool browse_cli, playwright_code, or cdp_code/,
);
});
diff --git a/packages/evals/tests/framework/context.test.ts b/packages/evals/tests/framework/context.test.ts
index 742378635..c6683afe4 100644
--- a/packages/evals/tests/framework/context.test.ts
+++ b/packages/evals/tests/framework/context.test.ts
@@ -4,9 +4,9 @@ import { prepareCoreBrowserTarget } from "../../core/targets/index.js";
describe("resolveDefaultCoreStartupProfile", () => {
it("uses runner-provided local CDP for code surfaces in LOCAL", () => {
- expect(resolveDefaultCoreStartupProfile("understudy_code", "LOCAL")).toBe(
- "runner_provided_local_cdp",
- );
+ expect(
+ resolveDefaultCoreStartupProfile("understudy_v3_code", "LOCAL"),
+ ).toBe("runner_provided_local_cdp");
expect(resolveDefaultCoreStartupProfile("playwright_code", "LOCAL")).toBe(
"runner_provided_local_cdp",
);
@@ -29,7 +29,7 @@ describe("resolveDefaultCoreStartupProfile", () => {
it("uses runner-provided Browserbase CDP for code surfaces in BROWSERBASE", () => {
expect(
- resolveDefaultCoreStartupProfile("understudy_code", "BROWSERBASE"),
+ resolveDefaultCoreStartupProfile("understudy_v3_code", "BROWSERBASE"),
).toBe("runner_provided_browserbase_cdp");
expect(
resolveDefaultCoreStartupProfile("playwright_code", "BROWSERBASE"),
@@ -55,7 +55,7 @@ describe("resolveDefaultCoreStartupProfile", () => {
await expect(
prepareCoreBrowserTarget({
environment: "BROWSERBASE",
- toolSurface: "understudy_code",
+ toolSurface: "understudy_v3_code",
startupProfile: "runner_provided_local_cdp",
}),
).rejects.toThrow(/requires LOCAL environment/);
@@ -65,7 +65,7 @@ describe("resolveDefaultCoreStartupProfile", () => {
await expect(
prepareCoreBrowserTarget({
environment: "LOCAL",
- toolSurface: "understudy_code",
+ toolSurface: "understudy_v3_code",
startupProfile: "runner_provided_browserbase_cdp",
}),
).rejects.toThrow(/requires BROWSERBASE environment/);
diff --git a/packages/evals/tests/framework/core-runner.test.ts b/packages/evals/tests/framework/core-runner.test.ts
index 6d2879fb4..242978f9a 100644
--- a/packages/evals/tests/framework/core-runner.test.ts
+++ b/packages/evals/tests/framework/core-runner.test.ts
@@ -122,7 +122,7 @@ describe("core runner", () => {
},
startupProfile: "runner_provided_local_cdp",
adapter: {
- name: "understudy_code",
+ name: "understudy_v3_code",
family: "understudy",
surface: "code",
metadata: {
@@ -183,7 +183,7 @@ describe("core runner", () => {
concurrency: 1,
trials: 1,
environment: "LOCAL",
- coreToolSurface: "understudy_code",
+ coreToolSurface: "understudy_v3_code",
coreStartupProfile: "runner_provided_local_cdp",
});
diff --git a/packages/evals/tests/framework/defineTask.test.ts b/packages/evals/tests/framework/defineTask.test.ts
index c531d676d..01ae3abda 100644
--- a/packages/evals/tests/framework/defineTask.test.ts
+++ b/packages/evals/tests/framework/defineTask.test.ts
@@ -44,6 +44,25 @@ describe("defineBenchTask", () => {
expect((result.meta as any).models).toEqual(["openai/gpt-4o"]);
});
+
+ // Passing an implementations object keeps each harness-specific function
+ // reachable via benchFns; calling the v4 entry runs only the v4 function.
+ it("preserves harness-native bench implementations", async () => {
+   const v3Impl = vi.fn(async () => ({ _success: true, version: 3 }));
+   const v4Impl = vi.fn(async () => ({ _success: true, version: 4 }));
+   const definition = defineBenchTask(
+     { name: "native_versions" },
+     { stagehand_v3: v3Impl as any, stagehand_v4: v4Impl as any },
+   );
+
+   const v4Outcome = definition.benchFns?.stagehand_v4?.({} as any);
+
+   await expect(v4Outcome).resolves.toEqual({ _success: true, version: 4 });
+   expect(v3Impl).toHaveBeenCalledTimes(0);
+   expect(v4Impl).toHaveBeenCalledTimes(1);
+ });
});
describe("defineTask", () => {
diff --git a/packages/evals/tests/tui/parse.test.ts b/packages/evals/tests/tui/parse.test.ts
index bb42c9fc6..532057ad6 100644
--- a/packages/evals/tests/tui/parse.test.ts
+++ b/packages/evals/tests/tui/parse.test.ts
@@ -18,7 +18,7 @@ describe("resolveRunOptions", () => {
it("defaults to the stagehand bench harness", () => {
const resolved = resolveRunOptions({}, {}, {});
- expect(resolved.harness).toBe("stagehand");
+ expect(resolved.harness).toBe("stagehand_v3");
});
it("accepts known bench harnesses", () => {
diff --git a/packages/evals/tests/tui/run.test.ts b/packages/evals/tests/tui/run.test.ts
index 36be3e1aa..5f8a3fd83 100644
--- a/packages/evals/tests/tui/run.test.ts
+++ b/packages/evals/tests/tui/run.test.ts
@@ -115,7 +115,7 @@ describe("deriveCategoryFilter", () => {
concurrency: 1,
environment: "LOCAL",
useApi: false,
- harness: "stagehand",
+ harness: "stagehand_v3",
envOverrides: {},
dryRun: true,
preview: false,
@@ -149,7 +149,7 @@ describe("deriveCategoryFilter", () => {
environment: "BROWSERBASE",
model: "openai/gpt-4.1-mini",
useApi: false,
- harness: "stagehand",
+ harness: "stagehand_v3",
datasetFilter: "webvoyager",
envOverrides: {
EVAL_MAX_K: "1",
@@ -169,7 +169,7 @@ describe("deriveCategoryFilter", () => {
task: "agent/webvoyager",
dataset: "webvoyager",
model: "openai/gpt-4.1-mini",
- harness: "stagehand",
+ harness: "stagehand_v3",
agentMode: "dom",
environment: "BROWSERBASE",
useApi: false,
@@ -195,7 +195,7 @@ describe("deriveCategoryFilter", () => {
environment: "BROWSERBASE",
model: "openai/gpt-4.1-mini",
useApi: false,
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["dom", "hybrid"],
datasetFilter: "webvoyager",
envOverrides: {
@@ -413,7 +413,8 @@ describe("deriveCategoryFilter", () => {
});
it("allows executable harnesses without env gates", () => {
- expect(canExecuteBenchHarness("stagehand")).toBe(true);
+ expect(canExecuteBenchHarness("stagehand_v3")).toBe(true);
+ expect(canExecuteBenchHarness("stagehand_v4")).toBe(true);
expect(canExecuteBenchHarness("claude_code")).toBe(true);
expect(canExecuteBenchHarness("codex")).toBe(true);
});
@@ -442,7 +443,7 @@ describe("deriveCategoryFilter", () => {
environment: "BROWSERBASE",
model: "openai/gpt-4.1-mini",
useApi: false,
- harness: "stagehand",
+ harness: "stagehand_v3",
agentModes: ["dom", "hybrid"],
envOverrides: {},
dryRun: false,
@@ -460,7 +461,7 @@ describe("deriveCategoryFilter", () => {
"Plan: 2 tasks × 1 model × 2 modes × 4 trials = 16 runs",
);
expect(output).toContain(
- "Env: BROWSERBASE Harness: stagehand Concurrency: 25",
+ "Env: BROWSERBASE Harness: stagehand_v3 Concurrency: 25",
);
expect(runEvalsMock).toHaveBeenCalledOnce();
});
@@ -516,7 +517,7 @@ describe("buildCombinations (preview column-pruning)", () => {
category: null,
dataset: null,
model,
- harness: "stagehand",
+ harness: "stagehand_v3",
agentMode,
environment: "BROWSERBASE",
useApi: false,
diff --git a/packages/evals/tui/commands/core.ts b/packages/evals/tui/commands/core.ts
index 409fb8c28..45cf7b7f1 100644
--- a/packages/evals/tui/commands/core.ts
+++ b/packages/evals/tui/commands/core.ts
@@ -88,7 +88,7 @@ export function printCoreConfig(entryDir: string): void {
console.log(`\n ${bold("Core configuration:")}\n`);
console.log(
- ` ${cyan("tool")} ${core.tool ?? gray("(runner default: understudy_code)")}`,
+ ` ${cyan("tool")} ${core.tool ?? gray("(runner default: understudy_v3_code)")}`,
);
console.log(
` ${cyan("startup")} ${core.startup ?? gray("(inferred from tool + env)")}`,
@@ -147,7 +147,9 @@ async function setCoreKey(
console.error(
red(" Cannot set startup without a tool. Set core.tool first."),
);
- console.log(dim(` Example: evals core config set tool understudy_code`));
+ console.log(
+ dim(` Example: evals core config set tool understudy_v3_code`),
+ );
process.exitCode = 1;
return;
}
diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index 95b49c766..9ef90a4ff 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -79,7 +79,7 @@ export function printRunHelp(): void {
"",
row(
`${cyan("--tool")} ${dim("")}`,
- `Core tool surface ${gray("(understudy_code, playwright_code, ...)")}`,
+ `Core tool surface ${gray("(understudy_v3_code, playwright_code, ...)")}`,
),
row(`${cyan("--startup")} ${dim("")}`, "Core startup profile"),
"",
@@ -121,7 +121,7 @@ export function printRunHelp(): void {
` ${bold("Examples:")}`,
"",
` ${dim("$")} evals run act -t 3 -c 5`,
- ` ${dim("$")} evals run navigation/open --tool understudy_code`,
+ ` ${dim("$")} evals run navigation/open --tool understudy_v3_code`,
` ${dim("$")} evals run b:webvoyager -l 10`,
` ${dim("$")} evals run b:onlineMind2Web -l 25`,
` ${dim("$")} evals run b:webtailbench -l 10`,
@@ -200,12 +200,12 @@ export function printConfigHelp(): void {
),
row(cyan("setup"), `Interactive wizard ${gray("(coming soon)")}`),
"",
- ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`,
+ ` ${bold("Valid core tools:")} ${gray("understudy_v3_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`,
"",
` ${bold("Examples:")}`,
"",
` ${dim("$")} evals config set trials 5`,
- ` ${dim("$")} evals config core set tool understudy_code`,
+ ` ${dim("$")} evals config core set tool understudy_v3_code`,
` ${dim("$")} evals config core set startup tool_launch_local`,
` ${dim("$")} evals config core reset`,
"",
diff --git a/packages/evals/tui/commands/run.ts b/packages/evals/tui/commands/run.ts
index 01df1fa45..6b460ea17 100644
--- a/packages/evals/tui/commands/run.ts
+++ b/packages/evals/tui/commands/run.ts
@@ -23,6 +23,7 @@ import type { ResolvedRunOptions } from "./parse.js";
import { withEnvOverrides } from "./parse.js";
import { getRuntimeTasksRoot } from "../../runtimePaths.js";
import {
+ DEFAULT_BENCH_HARNESS,
isExecutableBenchHarness,
type Harness,
} from "../../framework/benchTypes.js";
@@ -239,11 +240,11 @@ export async function runCommand(
if (
options.useApi &&
- options.harness !== "stagehand" &&
+ (options.harness ?? DEFAULT_BENCH_HARNESS) !== "stagehand_v3" &&
tasks.some((t) => t.tier === "bench")
) {
throw new Error(
- `Harness "${options.harness}" does not support --api. Use --harness stagehand for API-backed bench runs.`,
+ `Harness "${options.harness}" does not support --api. Use --harness stagehand_v3 for API-backed bench runs.`,
);
}
@@ -257,9 +258,18 @@ export async function runCommand(
tasks.some((t) => t.tier === "bench")
) {
throw new Error(
- `Harness "${options.harness}" is dry-run only for now. Use --harness stagehand, --harness claude_code, or --harness codex for executable bench runs.`,
+ `Harness "${options.harness}" is dry-run only for now. Use --harness stagehand_v3, --harness stagehand_v4, --harness claude_code, or --harness codex for executable bench runs.`,
);
}
+ if (
+ options.harness === "stagehand_v4" &&
+ tasks.some((t) => t.tier === "bench")
+ ) {
+ const { assertUnderstudyV4SdkAvailable } = await import(
+ "../../framework/UnderstudyV4Tools.js"
+ );
+ assertUnderstudyV4SdkAvailable();
+ }
const matrix = await buildDryRunMatrix(options, tasks, registry);
console.log(`\n ${bold("Running:")} ${cyan(buildRunTargetLabel(options))}`);