diff --git a/packages/evals/ARCHITECTURE.mmd b/packages/evals/ARCHITECTURE.mmd
index 97e87aac7..31b8e579c 100644
--- a/packages/evals/ARCHITECTURE.mmd
+++ b/packages/evals/ARCHITECTURE.mmd
@@ -48,7 +48,7 @@ flowchart TB
CoreContext["framework/context.ts
buildCoreContext"]
FixtureServer["core/fixtures
local deterministic pages"]
CoreTargets["core/targets
local Chrome
Browserbase CDP"]
- CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"]
+ CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
modcdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"]
CoreAssertions["assertions + metrics
adapter-backed results"]
CoreDeps["core/runtime/coreDeps.ts
browserbase + ws
lazy require"]
end
diff --git a/packages/evals/core/contracts/tool.ts b/packages/evals/core/contracts/tool.ts
index bd1d366d8..9ce0a53ff 100644
--- a/packages/evals/core/contracts/tool.ts
+++ b/packages/evals/core/contracts/tool.ts
@@ -20,6 +20,7 @@ export type ToolSurface =
| "understudy_code"
| "playwright_code"
| "cdp_code"
+ | "modcdp_code"
| "playwright_mcp"
| "chrome_devtools_mcp"
| "browse_cli";
diff --git a/packages/evals/core/tools/cdp_code.ts b/packages/evals/core/tools/cdp_code.ts
index ecc1ee544..e6b6063cb 100644
--- a/packages/evals/core/tools/cdp_code.ts
+++ b/packages/evals/core/tools/cdp_code.ts
@@ -19,7 +19,7 @@ import { loadWsModule } from "../runtime/coreDeps.js";
const DEFAULT_TIMEOUT_MS = 15_000;
const POLL_INTERVAL_MS = 100;
-const SUPPORTED_CAPABILITIES: CoreCapability[] = [
+export const CDP_CODE_SUPPORTED_CAPABILITIES: CoreCapability[] = [
"session",
"navigation",
"evaluation",
@@ -40,6 +40,16 @@ export type CdpEventMessage = {
sessionId?: string;
};
+/**
+ * Connection surface consumed by the CDP page and session code.
+ * CdpConnection implements it; other transports can be supplied as long as
+ * they provide the same three members.
+ */
+export interface CdpConnectionLike {
+ // Registers an event listener and returns a zero-argument function
+ // (presumably the unsubscribe handle — confirm against implementations).
+ onEvent(listener: (event: CdpEventMessage) => void): () => void;
+ // Sends a CDP method with optional params, optionally scoped to a sessionId.
+ send(
+ method: string,
+ params?: Record,
+ sessionId?: string,
+ ): Promise;
+ // Closes the underlying connection.
+ close(): Promise;
+}
+
type SelectorInspection = {
count: number;
visible: boolean;
@@ -156,7 +166,7 @@ async function resolveWebSocketEndpoint(input: {
return payload.webSocketDebuggerUrl;
}
-export class CdpConnection {
+export class CdpConnection implements CdpConnectionLike {
private readonly pending = new Map<
number,
{
@@ -353,7 +363,7 @@ class CdpLocatorHandle implements CoreLocatorHandle {
class CdpPageHandle implements CorePageHandle {
constructor(
- private readonly connection: CdpConnection,
+ private readonly connection: CdpConnectionLike,
private readonly state: CdpPageState,
) {}
@@ -1015,12 +1025,12 @@ class CdpPageHandle implements CorePageHandle {
}
}
-class CdpSession implements CoreSession {
+export class CdpSession implements CoreSession {
private readonly pages = new Map();
private activePageId: string | null = null;
private closed = false;
- private constructor(private readonly connection: CdpConnection) {}
+ private constructor(private readonly connection: CdpConnectionLike) {}
static async connect(input: {
providedEndpoint: {
@@ -1030,6 +1040,12 @@ class CdpSession implements CoreSession {
};
}): Promise {
const connection = await CdpConnection.connect(input.providedEndpoint);
+ return CdpSession.fromConnection(connection);
+ }
+
+ static async fromConnection(
+ connection: CdpConnectionLike,
+ ): Promise {
const session = new CdpSession(connection);
await session.bootstrap();
return session;
@@ -1183,7 +1199,7 @@ class CdpSession implements CoreSession {
}
}
-function connectionModeFromProfile(
+export function connectionModeFromProfile(
startupProfile: StartupProfile,
endpointKind?: "ws" | "http",
): ConnectionMode {
@@ -1210,7 +1226,7 @@ export class CdpCodeTool implements CoreTool {
"tool_attach_browserbase",
];
readonly supportedCapabilities: CoreCapability[] = [
- ...SUPPORTED_CAPABILITIES,
+ ...CDP_CODE_SUPPORTED_CAPABILITIES,
];
readonly supportedTargetKinds: TargetKind[] = [
"selector",
diff --git a/packages/evals/core/tools/modcdp_code.ts b/packages/evals/core/tools/modcdp_code.ts
new file mode 100644
index 000000000..0f780b09c
--- /dev/null
+++ b/packages/evals/core/tools/modcdp_code.ts
@@ -0,0 +1,210 @@
+import fs from "node:fs";
+import path from "node:path";
+import { pathToFileURL } from "node:url";
+import type {
+ CoreCapability,
+ CoreTool,
+ StartupProfile,
+ ToolStartInput,
+ ToolStartResult,
+} from "../contracts/tool.js";
+import type { TargetKind } from "../contracts/targets.js";
+import { getRepoRootDir } from "../../runtimePaths.js";
+import {
+ CDP_CODE_SUPPORTED_CAPABILITIES,
+ CdpSession,
+ connectionModeFromProfile,
+ type CdpConnectionLike,
+ type CdpEventMessage,
+} from "./cdp_code.js";
+
+/**
+ * Structural description of the ModCDP client instance this module constructs
+ * from the dynamically imported ModCDPClient class. Only the members used by
+ * the evals code are spelled out.
+ */
+export type ModCDPClientLike = {
+ connect(): Promise;
+ close(): Promise;
+ send(method: string, params?: unknown): Promise;
+ // Event subscription pair; ModCdpConnection.onEvent subscribes with the "*" event name.
+ on(
+ eventName: string | symbol,
+ listener: (...args: unknown[]) => void,
+ ): unknown;
+ off(
+ eventName: string | symbol,
+ listener: (...args: unknown[]) => void,
+ ): unknown;
+ // Lower-level sender that additionally accepts a (nullable) sessionId.
+ _cdp: {
+ send(
+ method: string,
+ params?: Record,
+ sessionId?: string | null,
+ ): Promise;
+ };
+ // Open index signature so other client properties (e.g. Mod, Custom) can be read as unknown.
+ [key: string]: unknown;
+};
+
+// Constructor signature of the ModCDP client class: takes an optional options bag.
+type ModCDPClientConstructor = new (
+ options?: Record,
+) => ModCDPClientLike;
+
+// Shape the dynamically imported ModCDPClient.js module is cast to.
+type ModCDPClientModule = {
+ ModCDPClient: ModCDPClientConstructor;
+};
+
+/**
+ * Default Stagehand v4 JS SDK entrypoint: `stagehand-driver/sdks/js/index.ts`
+ * inside the parent of the directory returned by getRepoRootDir().
+ */
+const DEFAULT_STAGEHAND_V4_SDK_PATH = path.join(
+  getRepoRootDir(),
+  ...["..", "stagehand-driver", "sdks", "js", "index.ts"],
+);
+
+/**
+ * Default built ModCDP client entrypoint, located two directories above the
+ * SDK entrypoint's folder, under `modcdp/dist/client/js`.
+ */
+const DEFAULT_MODCDP_CLIENT_PATH = path.join(
+  path.dirname(DEFAULT_STAGEHAND_V4_SDK_PATH),
+  ...["..", "..", "modcdp", "dist", "client", "js", "ModCDPClient.js"],
+);
+
+/**
+ * CdpConnectionLike implementation backed by a ModCDP client, so the shared
+ * CDP session code can run over ModCDP instead of a raw CdpConnection.
+ * Instances are created through ModCdpConnection.connect().
+ */
+export class ModCdpConnection implements CdpConnectionLike {
+ // The connected ModCDP client; exposed so callers can reach client-specific members.
+ readonly client: ModCDPClientLike;
+
+ private constructor(client: ModCDPClientLike) {
+ this.client = client;
+ }
+
+ /**
+ * Locates the built ModCDP client on disk, imports it, constructs a client
+ * pointed at input.url and awaits its connect().
+ *
+ * Client path resolution order:
+ * 1. MODCDP_CLIENT_PATH, if set;
+ * 2. a path derived from STAGEHAND_V4_SDK_PATH, if that is set;
+ * 3. DEFAULT_MODCDP_CLIENT_PATH.
+ *
+ * Throws an Error with setup instructions when the resolved file does not exist.
+ * Only input.url is read here; input.kind is not used.
+ * NOTE(review): if client.connect() rejects, the client is not closed here —
+ * confirm whether ModCDPClient holds resources before connect() resolves.
+ */
+ static async connect(input: {
+ kind: "ws" | "http";
+ url: string;
+ }): Promise {
+ const stagehandV4SdkPath =
+ process.env.STAGEHAND_V4_SDK_PATH ?? DEFAULT_STAGEHAND_V4_SDK_PATH;
+ // Two levels above the SDK entrypoint's directory.
+ const stagehandV4RootPath = path.join(
+ path.dirname(stagehandV4SdkPath),
+ "..",
+ "..",
+ );
+ const clientPath =
+ process.env.MODCDP_CLIENT_PATH ??
+ (process.env.STAGEHAND_V4_SDK_PATH
+ ? path.join(
+ stagehandV4RootPath,
+ "modcdp",
+ "dist",
+ "client",
+ "js",
+ "ModCDPClient.js",
+ )
+ : DEFAULT_MODCDP_CLIENT_PATH);
+ if (!fs.existsSync(clientPath)) {
+ // NOTE(review): the build hint below is derived from the SDK path even when
+ // MODCDP_CLIENT_PATH overrides the client location.
+ throw new Error(
+ [
+ "modcdp_code requires a built ModCDP JS client.",
+ `Expected ModCDP client at: ${clientPath}`,
+ "Set MODCDP_CLIENT_PATH to the ModCDPClient.js entrypoint if your checkout lives somewhere else.",
+ `Or build it with: pnpm --dir ${stagehandV4RootPath} --filter modcdp run build`,
+ ].join("\n"),
+ );
+ }
+
+ // Import by file URL so an absolute filesystem path works with dynamic import().
+ const { ModCDPClient } = (await import(
+ pathToFileURL(clientPath).href
+ )) as ModCDPClientModule;
+ const client = new ModCDPClient({
+ cdp_url: input.url,
+ routes: { "*.*": "service_worker" },
+ server: {
+ loopback_cdp_url: input.url,
+ routes: { "*.*": "loopback_cdp" },
+ },
+ });
+ await client.connect();
+ return new ModCdpConnection(client);
+ }
+
+ /**
+ * Subscribes to the client's "*" event and forwards each emission to listener
+ * as a CdpEventMessage. Emissions whose first argument is not a string are
+ * ignored; params is forwarded only when it is a non-array object and
+ * sessionId only when it is a string. Returns a function that removes the
+ * subscription.
+ */
+ onEvent(listener: (event: CdpEventMessage) => void): () => void {
+ const wrapped = (
+ method: unknown,
+ params: unknown,
+ sessionId: unknown,
+ ): void => {
+ if (typeof method !== "string") return;
+ listener({
+ method,
+ params:
+ params && typeof params === "object" && !Array.isArray(params)
+ ? (params as Record)
+ : undefined,
+ sessionId: typeof sessionId === "string" ? sessionId : undefined,
+ });
+ };
+ this.client.on("*", wrapped);
+ return () => {
+ this.client.off("*", wrapped);
+ };
+ }
+
+ /** Forwards the command to client._cdp.send, passing null when no sessionId is given. */
+ async send(
+ method: string,
+ params?: Record,
+ sessionId?: string,
+ ): Promise {
+ return this.client._cdp.send(method, params, sessionId ?? null);
+ }
+
+ /** Closes the underlying ModCDP client. */
+ async close(): Promise {
+ await this.client.close();
+ }
+}
+
+/**
+ * Core tool that runs the shared CDP session implementation over a ModCDP
+ * connection. It advertises the same capabilities as the cdp_code tool and
+ * requires an endpoint to attach to.
+ */
+export class ModCdpCodeTool implements CoreTool {
+  readonly id = "modcdp_code";
+  readonly surface = "code";
+  readonly family = "cdp";
+  readonly supportedStartupProfiles: StartupProfile[] = [
+    "runner_provided_local_cdp",
+    "runner_provided_browserbase_cdp",
+    "tool_attach_local_cdp",
+    "tool_attach_browserbase",
+  ];
+  readonly supportedCapabilities: CoreCapability[] = [
+    ...CDP_CODE_SUPPORTED_CAPABILITIES,
+  ];
+  readonly supportedTargetKinds: TargetKind[] = [
+    "selector",
+    "coords",
+    "focused",
+  ];
+
+  /**
+   * Connects to the provided endpoint through ModCDP and bootstraps a session.
+   *
+   * @param input Start parameters; `providedEndpoint` is required.
+   * @returns The session, a cleanup callback that closes it, and run metadata.
+   * @throws Error when no `providedEndpoint` is supplied, or whatever the
+   *   connection / session bootstrap throws.
+   */
+  async start(input: ToolStartInput): Promise<ToolStartResult> {
+    if (!input.providedEndpoint) {
+      throw new Error(
+        `modcdp_code startup profile "${input.startupProfile}" requires a providedEndpoint`,
+      );
+    }
+
+    const connection = await ModCdpConnection.connect(input.providedEndpoint);
+    const session = await ModCdpCodeTool.openSession(connection);
+
+    return {
+      session,
+      cleanup: async () => {
+        await session.close();
+      },
+      metadata: {
+        environment:
+          input.environment === "BROWSERBASE" ? "browserbase" : "local",
+        browserOwnership: input.startupProfile.startsWith("runner_provided")
+          ? "runner"
+          : "tool",
+        connectionMode: connectionModeFromProfile(
+          input.startupProfile,
+          input.providedEndpoint.kind,
+        ),
+        startupProfile: input.startupProfile,
+      },
+    };
+  }
+
+  /**
+   * Bootstraps a CdpSession on the connection. If bootstrap fails, no session
+   * is handed back to the caller, so the connection is closed here
+   * (best-effort) before the original error is rethrown.
+   */
+  private static async openSession(
+    connection: ModCdpConnection,
+  ): Promise<CdpSession> {
+    try {
+      return await CdpSession.fromConnection(connection);
+    } catch (error) {
+      try {
+        await connection.close();
+      } catch {
+        // best-effort only; surface the bootstrap failure, not the close failure
+      }
+      throw error;
+    }
+  }
+}
diff --git a/packages/evals/core/tools/registry.ts b/packages/evals/core/tools/registry.ts
index 65384f137..243df19c7 100644
--- a/packages/evals/core/tools/registry.ts
+++ b/packages/evals/core/tools/registry.ts
@@ -2,6 +2,7 @@ import type { CoreTool, ToolSurface } from "../contracts/tool.js";
import { BrowseCliTool } from "./browse_cli.js";
import { CdpCodeTool } from "./cdp_code.js";
import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js";
+import { ModCdpCodeTool } from "./modcdp_code.js";
import { PlaywrightCodeTool } from "./playwright_code.js";
import { PlaywrightMcpTool } from "./playwright_mcp.js";
import { UnderstudyCodeTool } from "./understudy_code.js";
@@ -11,6 +12,7 @@ export function listCoreTools(): ToolSurface[] {
"understudy_code",
"playwright_code",
"cdp_code",
+ "modcdp_code",
"playwright_mcp",
"chrome_devtools_mcp",
"browse_cli",
@@ -25,6 +27,8 @@ export function getCoreTool(toolSurface: ToolSurface): CoreTool {
return new PlaywrightCodeTool();
case "cdp_code":
return new CdpCodeTool();
+ case "modcdp_code":
+ return new ModCdpCodeTool();
case "playwright_mcp":
return new PlaywrightMcpTool();
case "chrome_devtools_mcp":
diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts
index 6ec620233..333ccabfe 100644
--- a/packages/evals/framework/claudeCodeRunner.ts
+++ b/packages/evals/framework/claudeCodeRunner.ts
@@ -61,6 +61,17 @@ export function buildClaudeCodePrompt(
.join("\n");
}
+/**
+ * Builds the text appended to the Claude Code preset system prompt.
+ *
+ * @param toolInstructions Optional tool-specific instructions; when present
+ *   and non-empty they follow the evaluation notice after a blank line.
+ * @returns The evaluation notice, optionally followed by the tool instructions.
+ */
+export function buildClaudeCodeSystemPromptAppend(
+  toolInstructions?: string,
+): string {
+  const evaluationNotice =
+    "You are being evaluated. Do not edit repository files. Complete the browser task and emit the requested EVAL_RESULT line.";
+  // Missing or empty instructions contribute nothing to the prompt.
+  if (!toolInstructions) {
+    return evaluationNotice;
+  }
+  return `${evaluationNotice}\n\n${toolInstructions}`;
+}
+
export function parseClaudeCodeResult(raw: string): ParsedClaudeCodeResult {
const marker = "EVAL_RESULT:";
const markerIndex = raw.lastIndexOf(marker);
@@ -178,8 +189,9 @@ export async function runClaudeCodeAgent({
systemPrompt: {
type: "preset",
preset: "claude_code",
- append:
- "You are being evaluated. Do not edit repository files. Complete the browser task and emit the requested EVAL_RESULT line.",
+ append: buildClaudeCodeSystemPromptAppend(
+ toolAdapter?.promptInstructions,
+ ),
},
},
})) {
diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts
index f67604da0..6b4ebc2d2 100644
--- a/packages/evals/framework/claudeCodeToolAdapter.ts
+++ b/packages/evals/framework/claudeCodeToolAdapter.ts
@@ -9,7 +9,15 @@ import type { EvalLogger } from "../logger.js";
import { getRepoRootDir } from "../runtimePaths.js";
import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js";
import { prepareCoreBrowserTarget } from "../core/targets/index.js";
-import { CdpConnection, type CdpEventMessage } from "../core/tools/cdp_code.js";
+import {
+ CdpConnection,
+ type CdpConnectionLike,
+ type CdpEventMessage,
+} from "../core/tools/cdp_code.js";
+import {
+ ModCdpConnection,
+ type ModCDPClientLike,
+} from "../core/tools/modcdp_code.js";
import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
export interface ClaudeCodeToolAdapterInput {
@@ -135,6 +143,12 @@ type CdpRuntime = {
wait(ms: number): Promise;
};
+// Runtime object exposed to modcdp_code run snippets: everything in CdpRuntime
+// plus the ModCDP client and its Mod / Custom properties (typed as unknown).
+type ModCdpRuntime = CdpRuntime & {
+ readonly client: ModCDPClientLike;
+ readonly Mod: unknown;
+ readonly Custom: unknown;
+};
+
export interface BrowseCliToolMetadata {
toolCommand: "browse";
browseCliEntrypoint: string;
@@ -186,9 +200,15 @@ export async function prepareClaudeCodeToolAdapter(
toolSurface,
startupProfile,
});
+ case "modcdp_code":
+ return prepareModCdpCodeAdapter({
+ ...input,
+ toolSurface,
+ startupProfile,
+ });
default:
throw new EvalsError(
- `Claude Code harness supports --tool browse_cli, playwright_code, or cdp_code for execution right now; received "${toolSurface}".`,
+ `Claude Code harness supports --tool browse_cli, playwright_code, cdp_code, or modcdp_code for execution right now; received "${toolSurface}".`,
);
}
}
@@ -200,12 +220,13 @@ export function resolveClaudeCodeToolSurface(
if (
requested === "browse_cli" ||
requested === "playwright_code" ||
- requested === "cdp_code"
+ requested === "cdp_code" ||
+ requested === "modcdp_code"
) {
return requested;
}
throw new EvalsError(
- `Claude Code harness supports --tool browse_cli, playwright_code, or cdp_code for execution right now; received "${requested}".`,
+ `Claude Code harness supports --tool browse_cli, playwright_code, cdp_code, or modcdp_code for execution right now; received "${requested}".`,
);
}
@@ -221,7 +242,11 @@ export function resolveClaudeCodeStartupProfile(
? "tool_create_browserbase"
: "tool_launch_local";
}
- if (toolSurface === "playwright_code" || toolSurface === "cdp_code") {
+ if (
+ toolSurface === "playwright_code" ||
+ toolSurface === "cdp_code" ||
+ toolSurface === "modcdp_code"
+ ) {
return environment === "BROWSERBASE"
? "runner_provided_browserbase_cdp"
: "runner_provided_local_cdp";
@@ -579,6 +604,121 @@ async function prepareCdpCodeAdapter(
}
}
+/**
+ * Prepares the Claude Code tool adapter for the modcdp_code surface.
+ *
+ * Steps: validate the startup profile, create a temporary working directory,
+ * prepare the browser target, connect to its endpoint through ModCDP, attach
+ * the active page and build the MCP server exposing the run tool.
+ *
+ * The returned adapter allows only Bash and the run tool, and its cleanup
+ * closes the connection, runs the target cleanup and removes the temp dir.
+ * If any step fails, the same teardown runs before the error is rethrown.
+ *
+ * Throws EvalsError for an unsupported startup profile or a missing endpoint URL.
+ */
+async function prepareModCdpCodeAdapter(
+ input: ClaudeCodeToolAdapterInput & {
+ toolSurface: "modcdp_code";
+ startupProfile: StartupProfile;
+ },
+): Promise {
+ // Only runner-provided CDP endpoints are accepted for this harness.
+ if (
+ input.startupProfile !== "runner_provided_local_cdp" &&
+ input.startupProfile !== "runner_provided_browserbase_cdp"
+ ) {
+ throw new EvalsError(
+ `modcdp_code startup profile "${input.startupProfile}" is not valid for Claude Code. Use runner_provided_local_cdp or runner_provided_browserbase_cdp.`,
+ );
+ }
+
+ const cwd = await fsp.mkdtemp(
+ path.join(os.tmpdir(), "stagehand-evals-claude-modcdp-"),
+ );
+ const env = { ...process.env } as Record;
+ // Both start as "nothing to release" so the catch path can run unconditionally.
+ let connection: ModCdpConnection | undefined;
+ let targetCleanup: () => Promise = async () => {};
+
+ try {
+ const target = await prepareCoreBrowserTarget({
+ environment: input.environment,
+ toolSurface: "modcdp_code",
+ startupProfile: input.startupProfile,
+ });
+ // Record the cleanup before any further step can throw.
+ targetCleanup = target.cleanup;
+ if (!target.providedEndpoint?.url) {
+ throw new EvalsError(
+ `modcdp_code requires a runner-provided CDP endpoint for startup profile "${input.startupProfile}".`,
+ );
+ }
+
+ connection = await ModCdpConnection.connect(target.providedEndpoint);
+ const activePage = await attachActiveCdpPage(connection);
+ const mcpServers = await buildModCdpRunMcpServers({
+ connection,
+ activePage,
+ plan: input.plan,
+ logger: input.logger,
+ });
+
+ input.logger.log({
+ category: "claude_code",
+ message: `Initialized modcdp_code browser runtime for Claude Code run tool.`,
+ level: 1,
+ auxiliary: {
+ startupProfile: {
+ value: input.startupProfile,
+ type: "string",
+ },
+ environment: {
+ value: input.environment,
+ type: "string",
+ },
+ targetId: {
+ value: activePage.targetId,
+ type: "string",
+ },
+ sessionId: {
+ value: activePage.sessionId,
+ type: "string",
+ },
+ // Target metadata is attached only when the target reports any.
+ ...(target.metadata && {
+ targetMetadata: {
+ value: JSON.stringify(target.metadata),
+ type: "object",
+ },
+ }),
+ },
+ });
+
+ return {
+ toolSurface: "modcdp_code",
+ startupProfile: input.startupProfile,
+ cwd,
+ env,
+ allowedTools: ["Bash", RUN_TOOL_NAME],
+ settingSources: [],
+ mcpServers,
+ // Permit only the run tool and Bash; every other tool is denied with guidance.
+ canUseTool: async (toolName, commandInput) => {
+ if (toolName === RUN_TOOL_NAME || toolName === "Bash") {
+ return { behavior: "allow", updatedInput: commandInput };
+ }
+ return {
+ behavior: "deny",
+ message: `Use Bash for inspection and ${RUN_TOOL_NAME} for ModCDP browser automation.`,
+ };
+ },
+ promptInstructions: buildModCdpCodePromptInstructions(input.plan),
+ // NOTE(review): if targetCleanup() rejects, the temp dir removal below it is skipped.
+ cleanup: async () => {
+ try {
+ await connection?.close();
+ } catch {
+ // best-effort only
+ } finally {
+ await targetCleanup();
+ await fsp.rm(cwd, { recursive: true, force: true });
+ }
+ },
+ };
+ } catch (error) {
+ // Setup failed part-way: release whatever was acquired, then rethrow.
+ try {
+ await connection?.close();
+ } catch {
+ // best-effort only
+ }
+ // NOTE(review): a rejection from targetCleanup() here would skip the temp dir
+ // removal and replace the original error — confirm this is acceptable.
+ await targetCleanup();
+ await fsp.rm(cwd, { recursive: true, force: true });
+ throw error;
+ }
+}
+
async function buildPlaywrightRunMcpServers(input: {
browser: Browser;
context: BrowserContext;
@@ -701,7 +841,7 @@ async function executePlaywrightSnippet(input: {
}
async function buildCdpRunMcpServers(input: {
- connection: CdpConnection;
+ connection: CdpConnectionLike;
activePage: ActiveCdpPage;
plan: ExternalHarnessTaskPlan;
logger: EvalLogger;
@@ -749,7 +889,7 @@ async function buildCdpRunMcpServers(input: {
async function executeCdpRunTool(input: {
code: string;
- connection: CdpConnection;
+ connection: CdpConnectionLike;
activePage: ActiveCdpPage;
plan: ExternalHarnessTaskPlan;
logger: EvalLogger;
@@ -784,7 +924,7 @@ async function executeCdpRunTool(input: {
async function executeCdpSnippet(input: {
code: string;
- connection: CdpConnection;
+ connection: CdpConnectionLike;
activePage: ActiveCdpPage;
plan: ExternalHarnessTaskPlan;
logger: EvalLogger;
@@ -813,8 +953,123 @@ async function executeCdpSnippet(input: {
);
}
+/**
+ * Builds the MCP server map for the modcdp_code surface. It contains a single
+ * SDK MCP server, keyed by RUN_TOOL_SERVER, exposing one "run" tool that takes
+ * a `code` string and delegates execution to executeModCdpRunTool.
+ */
+async function buildModCdpRunMcpServers(input: {
+ connection: ModCdpConnection;
+ activePage: ActiveCdpPage;
+ plan: ExternalHarnessTaskPlan;
+ logger: EvalLogger;
+}): Promise> {
+ // The SDK is imported dynamically and cast to the two factories used below.
+ const sdk = (await import("@anthropic-ai/claude-agent-sdk")) as unknown as {
+ createSdkMcpServer: SdkMcpServerFactory;
+ tool: SdkToolFactory;
+ };
+
+ const runTool = sdk.tool(
+ "run",
+ [
+ "Execute JavaScript against the initialized ModCDP browser client.",
+ "The snippet runs inside an async function with modcdp, z, startUrl, task, and console in scope.",
+ "Use await directly. Return a JSON-serializable value when useful.",
+ ].join(" "),
+ {
+ code: z
+ .string()
+ .describe(
+ "JavaScript function body to execute. modcdp/z/startUrl/task are already in scope.",
+ ),
+ },
+ async ({ code }) => {
+ return executeModCdpRunTool({
+ code,
+ connection: input.connection,
+ activePage: input.activePage,
+ plan: input.plan,
+ logger: input.logger,
+ });
+ },
+ { alwaysLoad: true },
+ );
+
+ return {
+ [RUN_TOOL_SERVER]: sdk.createSdkMcpServer({
+ name: RUN_TOOL_SERVER,
+ version: "1.0.0",
+ tools: [runTool],
+ alwaysLoad: true,
+ }),
+ };
+}
+
+/**
+ * Runs one run-tool snippet under a timeout and converts the outcome into a
+ * tool result. Success yields the stringified return value as text content;
+ * any thrown error is caught and returned as an isError result carrying the
+ * error message, so this function does not reject for snippet failures.
+ */
+async function executeModCdpRunTool(input: {
+ code: string;
+ connection: ModCdpConnection;
+ activePage: ActiveCdpPage;
+ plan: ExternalHarnessTaskPlan;
+ logger: EvalLogger;
+}): Promise {
+ try {
+ // Timeout comes from EVAL_CLAUDE_CODE_RUN_TOOL_TIMEOUT_MS; 60_000 is passed as
+ // the second argument (presumably the fallback in ms — confirm in readPositiveIntEnv).
+ const result = await withTimeout(
+ executeModCdpSnippet(input),
+ readPositiveIntEnv("EVAL_CLAUDE_CODE_RUN_TOOL_TIMEOUT_MS", 60_000),
+ );
+ const text = stringifyToolResult(result);
+ // The log line is clipped; the full text is still returned to the caller.
+ input.logger.log({
+ category: "claude_code",
+ message: `run tool completed: ${clip(text, 500)}`,
+ level: 1,
+ });
+ return {
+ content: [{ type: "text", text }],
+ };
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ input.logger.warn({
+ category: "claude_code",
+ message: `run tool failed: ${message}`,
+ level: 1,
+ });
+ return {
+ isError: true,
+ content: [{ type: "text", text: message }],
+ };
+ }
+}
+
+/**
+ * Compiles input.code as the body of an async function and invokes it with
+ * five arguments in scope: modcdp (the ModCDP runtime), z, startUrl, task
+ * (dataset, id, startUrl, instruction from the plan) and console (the run-tool
+ * console). Resolves with whatever the snippet returns.
+ *
+ * SECURITY NOTE: the snippet is executed in this process via the AsyncFunction
+ * constructor, with no sandboxing visible here. It must only receive code
+ * from the evaluated agent in a controlled eval environment.
+ */
+async function executeModCdpSnippet(input: {
+ code: string;
+ connection: ModCdpConnection;
+ activePage: ActiveCdpPage;
+ plan: ExternalHarnessTaskPlan;
+ logger: EvalLogger;
+}): Promise {
+ // AsyncFunction is not a global; obtain it from an async function's prototype.
+ const AsyncFunction = Object.getPrototypeOf(async function () {})
+ .constructor as new (
+ ...args: string[]
+ ) => (...values: unknown[]) => Promise;
+ // Parameter names first, function body last.
+ const fn = new AsyncFunction(
+ "modcdp",
+ "z",
+ "startUrl",
+ "task",
+ "console",
+ input.code,
+ );
+ return fn(
+ buildModCdpRuntime(input.connection, input.activePage, input.logger),
+ z,
+ input.plan.startUrl,
+ {
+ dataset: input.plan.dataset,
+ id: input.plan.taskId,
+ startUrl: input.plan.startUrl,
+ instruction: input.plan.instruction,
+ },
+ buildRunToolConsole(input.logger),
+ );
+}
+
function buildCdpRuntime(
- connection: CdpConnection,
+ connection: CdpConnectionLike,
activePage: ActiveCdpPage,
logger: EvalLogger,
): CdpRuntime {
@@ -900,8 +1155,22 @@ function buildCdpRuntime(
};
}
+/**
+ * Builds the `modcdp` object passed to run-tool snippets: the regular CDP
+ * runtime for this connection and page, extended with the ModCDP client and
+ * its `Mod` and `Custom` properties.
+ */
+function buildModCdpRuntime(
+  connection: ModCdpConnection,
+  activePage: ActiveCdpPage,
+  logger: EvalLogger,
+): ModCdpRuntime {
+  const { client } = connection;
+  const baseRuntime = buildCdpRuntime(connection, activePage, logger);
+  // ModCDP-specific members are listed after the spread so they take precedence.
+  return { ...baseRuntime, client, Mod: client.Mod, Custom: client.Custom };
+}
+
function onCdpEvent(
- connection: CdpConnection,
+ connection: CdpConnectionLike,
sessionId: string,
method: string,
listener: (event: CdpEventMessage) => unknown | Promise,
@@ -936,7 +1205,7 @@ function onCdpEvent(
}
async function attachActiveCdpPage(
- connection: CdpConnection,
+ connection: CdpConnectionLike,
): Promise {
const targets = await connection.send<{
targetInfos: Array<{
@@ -982,7 +1251,7 @@ async function attachActiveCdpPage(
}
export function waitForCdpEvent(
- connection: CdpConnection,
+ connection: CdpConnectionLike,
sessionId: string,
method: string,
timeoutMs: number,
@@ -1063,6 +1332,88 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string {
].join("\n");
}
+/**
+ * Returns the modcdp_code-specific prompt instructions as one newline-joined
+ * string: a description of the run tool and the modcdp helpers, the four
+ * ModCDP primitives, a fenced JavaScript example block, and closing rules.
+ * The text is static apart from the interpolated RUN_TOOL_NAME.
+ */
+function buildModCdpCodePromptInstructions(
+ plan: ExternalHarnessTaskPlan,
+): string {
+ // plan is not used in the text below; `void` marks the parameter as intentionally unused.
+ void plan;
+ return [
+ "Browser tool surface: modcdp_code.",
+ `Use the ${RUN_TOOL_NAME} tool for browser automation. It exposes an initialized modcdp object, startUrl, and task object.`,
+ "modcdp.client is the typed ModCDPClient. modcdp.Mod is the generated Mod.* command surface and modcdp.Custom is the generated Custom.* command surface.",
+ "z from Zod is available for ModCDP command/event schemas.",
+ "Use modcdp.send(method, params) for page-scoped CDP commands and modcdp.browser(method, params) for browser-level CDP commands.",
+ "Helpers available: modcdp.on(method, listener), modcdp.once(method), modcdp.waitForEvent(method, timeoutMs), modcdp.wait(ms), modcdp.targetId, modcdp.sessionId.",
+ "ModCDP primitive 1: Mod.evaluate evaluates an expression through the extended CDP protocol, with chrome.* and a cdp bridge available.",
+ "ModCDP primitive 2: Mod.addCustomCommand extends CDP with a Custom.* method that can be called like any other protocol command.",
+ "ModCDP primitive 3: Mod.addCustomEvent extends CDP with a Custom.* event that can be emitted and listened for like any other protocol event.",
+ "ModCDP primitive 4: Mod.addMiddleware modifies CDP requests, responses, or events by exact name, wildcard prefix, or *.",
+ "ModCDP string-form examples:",
+ // The example is assembled as its own newline-joined fenced block.
+ [
+ "```js",
+ "// Use it like a normal CDP connection: send normal CDP and register for normal CDP events.",
+ 'console.log(await modcdp.browser("Browser.getVersion"));',
+ 'modcdp.on("Target.targetInfoChanged", console.log);',
+ "",
+ "// Evaluate with chrome.* and a cdp bridge available.",
+ 'const tab = await modcdp.send("Mod.evaluate", {',
+ ' expression: "(await chrome.tabs.query({ active: true }))[0]",',
+ "});",
+ "",
+ "// Extend CDP with a Custom.* command, then call it with send().",
+ 'await modcdp.send("Mod.addCustomCommand", {',
+ ' name: "Custom.tabIdFromTargetId",',
+ " paramsSchema: { targetId: modcdp.client.types.zod.Target.TargetID },",
+ " resultSchema: { tabId: z.number().nullable() },",
+ " expression: `async ({ targetId }) => ({",
+ " tabId: (await chrome.debugger.getTargets()).find(t => t.id === targetId)?.tabId ?? null",
+ " })`,",
+ "});",
+ 'const { targetInfos } = await modcdp.browser("Target.getTargets");',
+ 'const pageTarget = targetInfos.find((targetInfo) => targetInfo.type === "page");',
+ 'console.log(await modcdp.send("Custom.tabIdFromTargetId", { targetId: pageTarget.targetId }));',
+ "",
+ "// Extend CDP with a Custom.* event, then listen for it with on().",
+ 'await modcdp.send("Mod.addCustomEvent", {',
+ ' name: "Page.foregroundPageChanged",',
+ " eventSchema: {",
+ " targetId: modcdp.client.types.zod.Target.TargetID.nullable(),",
+ " tabId: z.number(),",
+ " },",
+ "});",
+ 'await modcdp.send("Mod.evaluate", {',
+ " expression: `chrome.tabs.onActivated.addListener(async ({ tabId }) =>",
+ ' cdp.emit("Page.foregroundPageChanged", {',
+ " tabId,",
+ " targetId: (await chrome.debugger.getTargets()).find(t => t.tabId === tabId)?.id ?? null",
+ " })",
+ " )`,",
+ "});",
+ 'modcdp.on("Page.foregroundPageChanged", console.log);',
+ "",
+ "// Modify existing CDP command results on the wire.",
+ 'await modcdp.send("Mod.addMiddleware", {',
+ ' name: "Target.getTargets",',
+ ' phase: "response",',
+ " expression: `async (payload, next) => {",
+ " for (const targetInfo of payload.targetInfos) {",
+ ' const { tabId } = await cdp.send("Custom.tabIdFromTargetId", {',
+ " targetId: targetInfo.targetId,",
+ " });",
+ " targetInfo.tabId = tabId;",
+ " }",
+ " return next(payload);",
+ " }`,",
+ "});",
+ 'console.log(await modcdp.browser("Target.getTargets"));',
+ "```",
+ ].join("\n"),
+ 'The first browser action should usually be: const loaded = modcdp.waitForEvent("Page.loadEventFired"); await modcdp.send("Page.navigate", { url: startUrl }); await loaded.',
+ "Use Bash for inspection and lightweight scripting. Do not create a separate browser process.",
+ "Do not edit repository files.",
+ "Return useful JSON-serializable values from run snippets so you can inspect progress.",
+ ].join("\n");
+}
+
async function runBrowseSetup(
wrapperPath: string,
environment: "LOCAL" | "BROWSERBASE",
diff --git a/packages/evals/framework/context.ts b/packages/evals/framework/context.ts
index daa8eabea..d85d90340 100644
--- a/packages/evals/framework/context.ts
+++ b/packages/evals/framework/context.ts
@@ -44,6 +44,7 @@ export function resolveDefaultCoreStartupProfile(
case "understudy_code":
case "playwright_code":
case "cdp_code":
+ case "modcdp_code":
case "playwright_mcp":
case "chrome_devtools_mcp":
return environment === "BROWSERBASE"
diff --git a/packages/evals/tests/core/tool-registry.test.ts b/packages/evals/tests/core/tool-registry.test.ts
index 9a524fa8a..c1676151f 100644
--- a/packages/evals/tests/core/tool-registry.test.ts
+++ b/packages/evals/tests/core/tool-registry.test.ts
@@ -5,6 +5,7 @@ describe("core tool registry", () => {
it("lists extended tool surfaces", () => {
expect(listCoreTools()).toEqual(
expect.arrayContaining([
+ "modcdp_code",
"playwright_mcp",
"chrome_devtools_mcp",
"browse_cli",
@@ -13,6 +14,7 @@ describe("core tool registry", () => {
});
it("constructs MCP and CLI tools", () => {
+ expect(getCoreTool("modcdp_code").id).toBe("modcdp_code");
expect(getCoreTool("playwright_mcp").id).toBe("playwright_mcp");
expect(getCoreTool("chrome_devtools_mcp").id).toBe("chrome_devtools_mcp");
expect(getCoreTool("browse_cli").id).toBe("browse_cli");
diff --git a/packages/evals/tests/framework/claudeCodeRunner.test.ts b/packages/evals/tests/framework/claudeCodeRunner.test.ts
index 4d8c3cde5..7492c2912 100644
--- a/packages/evals/tests/framework/claudeCodeRunner.test.ts
+++ b/packages/evals/tests/framework/claudeCodeRunner.test.ts
@@ -212,5 +212,13 @@ describe("claude code runner helpers", () => {
"Bash",
"mcp__stagehand_browser__run",
]);
+ expect(capturedOptions?.systemPrompt).toMatchObject({
+ type: "preset",
+ preset: "claude_code",
+ });
+ expect(
+ (capturedOptions?.systemPrompt as { append?: string } | undefined)
+ ?.append,
+ ).toContain("Use run.");
});
});
diff --git a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts
index e28775652..428f3ce2b 100644
--- a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts
+++ b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts
@@ -52,11 +52,18 @@ describe("claude code tool adapter resolution", () => {
expect(resolveClaudeCodeStartupProfile("cdp_code", "BROWSERBASE")).toBe(
"runner_provided_browserbase_cdp",
);
+ expect(resolveClaudeCodeToolSurface("modcdp_code")).toBe("modcdp_code");
+ expect(resolveClaudeCodeStartupProfile("modcdp_code", "LOCAL")).toBe(
+ "runner_provided_local_cdp",
+ );
+ expect(resolveClaudeCodeStartupProfile("modcdp_code", "BROWSERBASE")).toBe(
+ "runner_provided_browserbase_cdp",
+ );
});
it("rejects unsupported Claude Code tool surfaces for now", () => {
expect(() => resolveClaudeCodeToolSurface("understudy_code")).toThrow(
- /supports --tool browse_cli, playwright_code, or cdp_code/,
+ /supports --tool browse_cli, playwright_code, cdp_code, or modcdp_code/,
);
});
diff --git a/packages/evals/tests/framework/context.test.ts b/packages/evals/tests/framework/context.test.ts
index 742378635..18b32ed76 100644
--- a/packages/evals/tests/framework/context.test.ts
+++ b/packages/evals/tests/framework/context.test.ts
@@ -13,6 +13,9 @@ describe("resolveDefaultCoreStartupProfile", () => {
expect(resolveDefaultCoreStartupProfile("cdp_code", "LOCAL")).toBe(
"runner_provided_local_cdp",
);
+ expect(resolveDefaultCoreStartupProfile("modcdp_code", "LOCAL")).toBe(
+ "runner_provided_local_cdp",
+ );
expect(resolveDefaultCoreStartupProfile("playwright_mcp", "LOCAL")).toBe(
"runner_provided_local_cdp",
);
@@ -37,6 +40,9 @@ describe("resolveDefaultCoreStartupProfile", () => {
expect(resolveDefaultCoreStartupProfile("cdp_code", "BROWSERBASE")).toBe(
"runner_provided_browserbase_cdp",
);
+ expect(resolveDefaultCoreStartupProfile("modcdp_code", "BROWSERBASE")).toBe(
+ "runner_provided_browserbase_cdp",
+ );
expect(
resolveDefaultCoreStartupProfile("playwright_mcp", "BROWSERBASE"),
).toBe("runner_provided_browserbase_cdp");
diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts
index 95b49c766..1085459c7 100644
--- a/packages/evals/tui/commands/help.ts
+++ b/packages/evals/tui/commands/help.ts
@@ -79,7 +79,7 @@ export function printRunHelp(): void {
"",
row(
`${cyan("--tool")} ${dim("")}`,
- `Core tool surface ${gray("(understudy_code, playwright_code, ...)")}`,
+ `Core tool surface ${gray("(understudy_code, playwright_code, cdp_code, modcdp_code, ...)")}`,
),
row(`${cyan("--startup")} ${dim("")}`, "Core startup profile"),
"",
@@ -200,7 +200,7 @@ export function printConfigHelp(): void {
),
row(cyan("setup"), `Interactive wizard ${gray("(coming soon)")}`),
"",
- ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`,
+ ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, modcdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`,
"",
` ${bold("Examples:")}`,
"",