diff --git a/packages/evals/ARCHITECTURE.mmd b/packages/evals/ARCHITECTURE.mmd index 97e87aac7..31b8e579c 100644 --- a/packages/evals/ARCHITECTURE.mmd +++ b/packages/evals/ARCHITECTURE.mmd @@ -48,7 +48,7 @@ flowchart TB CoreContext["framework/context.ts
buildCoreContext"] FixtureServer["core/fixtures
local deterministic pages"] CoreTargets["core/targets
local Chrome
Browserbase CDP"] - CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"] + CoreTools["core/tools registry
understudy_code
playwright_code
cdp_code
modcdp_code
playwright_mcp
chrome_devtools_mcp
browse_cli"] CoreAssertions["assertions + metrics
adapter-backed results"] CoreDeps["core/runtime/coreDeps.ts
browserbase + ws
lazy require"] end diff --git a/packages/evals/core/contracts/tool.ts b/packages/evals/core/contracts/tool.ts index bd1d366d8..9ce0a53ff 100644 --- a/packages/evals/core/contracts/tool.ts +++ b/packages/evals/core/contracts/tool.ts @@ -20,6 +20,7 @@ export type ToolSurface = | "understudy_code" | "playwright_code" | "cdp_code" + | "modcdp_code" | "playwright_mcp" | "chrome_devtools_mcp" | "browse_cli"; diff --git a/packages/evals/core/tools/cdp_code.ts b/packages/evals/core/tools/cdp_code.ts index ecc1ee544..e6b6063cb 100644 --- a/packages/evals/core/tools/cdp_code.ts +++ b/packages/evals/core/tools/cdp_code.ts @@ -19,7 +19,7 @@ import { loadWsModule } from "../runtime/coreDeps.js"; const DEFAULT_TIMEOUT_MS = 15_000; const POLL_INTERVAL_MS = 100; -const SUPPORTED_CAPABILITIES: CoreCapability[] = [ +export const CDP_CODE_SUPPORTED_CAPABILITIES: CoreCapability[] = [ "session", "navigation", "evaluation", @@ -40,6 +40,16 @@ export type CdpEventMessage = { sessionId?: string; }; +export interface CdpConnectionLike { + onEvent(listener: (event: CdpEventMessage) => void): () => void; + send( + method: string, + params?: Record, + sessionId?: string, + ): Promise; + close(): Promise; +} + type SelectorInspection = { count: number; visible: boolean; @@ -156,7 +166,7 @@ async function resolveWebSocketEndpoint(input: { return payload.webSocketDebuggerUrl; } -export class CdpConnection { +export class CdpConnection implements CdpConnectionLike { private readonly pending = new Map< number, { @@ -353,7 +363,7 @@ class CdpLocatorHandle implements CoreLocatorHandle { class CdpPageHandle implements CorePageHandle { constructor( - private readonly connection: CdpConnection, + private readonly connection: CdpConnectionLike, private readonly state: CdpPageState, ) {} @@ -1015,12 +1025,12 @@ class CdpPageHandle implements CorePageHandle { } } -class CdpSession implements CoreSession { +export class CdpSession implements CoreSession { private readonly pages = new Map(); private activePageId: string | null = null; private closed = false; - private constructor(private readonly connection: CdpConnection) {} + private constructor(private readonly connection: CdpConnectionLike) {} static async connect(input: { providedEndpoint: { @@ -1030,6 +1040,12 @@ class CdpSession implements CoreSession { }; }): Promise { const connection = await CdpConnection.connect(input.providedEndpoint); + return CdpSession.fromConnection(connection); + } + + static async fromConnection( + connection: CdpConnectionLike, + ): Promise { const session = new CdpSession(connection); await session.bootstrap(); return session; @@ -1183,7 +1199,7 @@ class CdpSession implements CoreSession { } } -function connectionModeFromProfile( +export function connectionModeFromProfile( startupProfile: StartupProfile, endpointKind?: "ws" | "http", ): ConnectionMode { @@ -1210,7 +1226,7 @@ export class CdpCodeTool implements CoreTool { "tool_attach_browserbase", ]; readonly supportedCapabilities: CoreCapability[] = [ - ...SUPPORTED_CAPABILITIES, + ...CDP_CODE_SUPPORTED_CAPABILITIES, ]; readonly supportedTargetKinds: TargetKind[] = [ "selector", diff --git a/packages/evals/core/tools/modcdp_code.ts b/packages/evals/core/tools/modcdp_code.ts new file mode 100644 index 000000000..0f780b09c --- /dev/null +++ b/packages/evals/core/tools/modcdp_code.ts @@ -0,0 +1,210 @@ +import fs from "node:fs"; +import path from "node:path"; +import { pathToFileURL } from "node:url"; +import type { + CoreCapability, + CoreTool, + StartupProfile, + ToolStartInput, + ToolStartResult, +} from "../contracts/tool.js"; +import type { TargetKind } from "../contracts/targets.js"; +import { getRepoRootDir } from "../../runtimePaths.js"; +import { + CDP_CODE_SUPPORTED_CAPABILITIES, + CdpSession, + connectionModeFromProfile, + type CdpConnectionLike, + type CdpEventMessage, +} from "./cdp_code.js"; + +export type ModCDPClientLike = { + connect(): Promise; + close(): Promise; + send(method: string, params?: unknown): Promise; + on( + eventName: string | symbol, + listener: (...args: unknown[]) => void, + ): unknown; + off( + eventName: string | symbol, + listener: (...args: unknown[]) => void, + ): unknown; + _cdp: { + send( + method: string, + params?: Record, + sessionId?: string | null, + ): Promise; + }; + [key: string]: unknown; +}; + +type ModCDPClientConstructor = new ( + options?: Record, +) => ModCDPClientLike; + +type ModCDPClientModule = { + ModCDPClient: ModCDPClientConstructor; +}; + +const DEFAULT_STAGEHAND_V4_SDK_PATH = path.join( + getRepoRootDir(), + "..", + "stagehand-driver", + "sdks", + "js", + "index.ts", +); + +const DEFAULT_MODCDP_CLIENT_PATH = path.join( + path.dirname(DEFAULT_STAGEHAND_V4_SDK_PATH), + "..", + "..", + "modcdp", + "dist", + "client", + "js", + "ModCDPClient.js", +); + +export class ModCdpConnection implements CdpConnectionLike { + readonly client: ModCDPClientLike; + + private constructor(client: ModCDPClientLike) { + this.client = client; + } + + static async connect(input: { + kind: "ws" | "http"; + url: string; + }): Promise { + const stagehandV4SdkPath = + process.env.STAGEHAND_V4_SDK_PATH ?? DEFAULT_STAGEHAND_V4_SDK_PATH; + const stagehandV4RootPath = path.join( + path.dirname(stagehandV4SdkPath), + "..", + "..", + ); + const clientPath = + process.env.MODCDP_CLIENT_PATH ?? + (process.env.STAGEHAND_V4_SDK_PATH + ? path.join( + stagehandV4RootPath, + "modcdp", + "dist", + "client", + "js", + "ModCDPClient.js", + ) + : DEFAULT_MODCDP_CLIENT_PATH); + if (!fs.existsSync(clientPath)) { + throw new Error( + [ + "modcdp_code requires a built ModCDP JS client.", + `Expected ModCDP client at: ${clientPath}`, + "Set MODCDP_CLIENT_PATH to the ModCDPClient.js entrypoint if your checkout lives somewhere else.", + `Or build it with: pnpm --dir ${stagehandV4RootPath} --filter modcdp run build`, + ].join("\n"), + ); + } + + const { ModCDPClient } = (await import( + pathToFileURL(clientPath).href + )) as ModCDPClientModule; + const client = new ModCDPClient({ + cdp_url: input.url, + routes: { "*.*": "service_worker" }, + server: { + loopback_cdp_url: input.url, + routes: { "*.*": "loopback_cdp" }, + }, + }); + await client.connect(); + return new ModCdpConnection(client); + } + + onEvent(listener: (event: CdpEventMessage) => void): () => void { + const wrapped = ( + method: unknown, + params: unknown, + sessionId: unknown, + ): void => { + if (typeof method !== "string") return; + listener({ + method, + params: + params && typeof params === "object" && !Array.isArray(params) + ? (params as Record) + : undefined, + sessionId: typeof sessionId === "string" ? sessionId : undefined, + }); + }; + this.client.on("*", wrapped); + return () => { + this.client.off("*", wrapped); + }; + } + + async send( + method: string, + params?: Record, + sessionId?: string, + ): Promise { + return this.client._cdp.send(method, params, sessionId ?? null); + } + + async close(): Promise { + await this.client.close(); + } +} + +export class ModCdpCodeTool implements CoreTool { + readonly id = "modcdp_code"; + readonly surface = "code"; + readonly family = "cdp"; + readonly supportedStartupProfiles: StartupProfile[] = [ + "runner_provided_local_cdp", + "runner_provided_browserbase_cdp", + "tool_attach_local_cdp", + "tool_attach_browserbase", + ]; + readonly supportedCapabilities: CoreCapability[] = [ + ...CDP_CODE_SUPPORTED_CAPABILITIES, + ]; + readonly supportedTargetKinds: TargetKind[] = [ + "selector", + "coords", + "focused", + ]; + + async start(input: ToolStartInput): Promise { + if (!input.providedEndpoint) { + throw new Error( + `modcdp_code startup profile "${input.startupProfile}" requires a providedEndpoint`, + ); + } + + const connection = await ModCdpConnection.connect(input.providedEndpoint); + const session = await CdpSession.fromConnection(connection); + + return { + session, + cleanup: async () => { + await session.close(); + }, + metadata: { + environment: + input.environment === "BROWSERBASE" ? "browserbase" : "local", + browserOwnership: input.startupProfile.startsWith("runner_provided") + ? "runner" + : "tool", + connectionMode: connectionModeFromProfile( + input.startupProfile, + input.providedEndpoint.kind, + ), + startupProfile: input.startupProfile, + }, + }; + } +} diff --git a/packages/evals/core/tools/registry.ts b/packages/evals/core/tools/registry.ts index 65384f137..243df19c7 100644 --- a/packages/evals/core/tools/registry.ts +++ b/packages/evals/core/tools/registry.ts @@ -2,6 +2,7 @@ import type { CoreTool, ToolSurface } from "../contracts/tool.js"; import { BrowseCliTool } from "./browse_cli.js"; import { CdpCodeTool } from "./cdp_code.js"; import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js"; +import { ModCdpCodeTool } from "./modcdp_code.js"; import { PlaywrightCodeTool } from "./playwright_code.js"; import { PlaywrightMcpTool } from "./playwright_mcp.js"; import { UnderstudyCodeTool } from "./understudy_code.js"; @@ -11,6 +12,7 @@ export function listCoreTools(): ToolSurface[] { "understudy_code", "playwright_code", "cdp_code", + "modcdp_code", "playwright_mcp", "chrome_devtools_mcp", "browse_cli", @@ -25,6 +27,8 @@ export function getCoreTool(toolSurface: ToolSurface): CoreTool { return new PlaywrightCodeTool(); case "cdp_code": return new CdpCodeTool(); + case "modcdp_code": + return new ModCdpCodeTool(); case "playwright_mcp": return new PlaywrightMcpTool(); case "chrome_devtools_mcp": diff --git a/packages/evals/framework/claudeCodeRunner.ts b/packages/evals/framework/claudeCodeRunner.ts index 6ec620233..333ccabfe 100644 --- a/packages/evals/framework/claudeCodeRunner.ts +++ b/packages/evals/framework/claudeCodeRunner.ts @@ -61,6 +61,17 @@ export function buildClaudeCodePrompt( .join("\n"); } +export function buildClaudeCodeSystemPromptAppend( + toolInstructions?: string, +): string { + return [ + "You are being evaluated. Do not edit repository files. Complete the browser task and emit the requested EVAL_RESULT line.", + toolInstructions, + ] + .filter(Boolean) + .join("\n\n"); +} + export function parseClaudeCodeResult(raw: string): ParsedClaudeCodeResult { const marker = "EVAL_RESULT:"; const markerIndex = raw.lastIndexOf(marker); @@ -178,8 +189,9 @@ export async function runClaudeCodeAgent({ systemPrompt: { type: "preset", preset: "claude_code", - append: - "You are being evaluated. Do not edit repository files. Complete the browser task and emit the requested EVAL_RESULT line.", + append: buildClaudeCodeSystemPromptAppend( + toolAdapter?.promptInstructions, + ), }, }, })) { diff --git a/packages/evals/framework/claudeCodeToolAdapter.ts b/packages/evals/framework/claudeCodeToolAdapter.ts index f67604da0..6b4ebc2d2 100644 --- a/packages/evals/framework/claudeCodeToolAdapter.ts +++ b/packages/evals/framework/claudeCodeToolAdapter.ts @@ -9,7 +9,15 @@ import type { EvalLogger } from "../logger.js"; import { getRepoRootDir } from "../runtimePaths.js"; import type { StartupProfile, ToolSurface } from "../core/contracts/tool.js"; import { prepareCoreBrowserTarget } from "../core/targets/index.js"; -import { CdpConnection, type CdpEventMessage } from "../core/tools/cdp_code.js"; +import { + CdpConnection, + type CdpConnectionLike, + type CdpEventMessage, +} from "../core/tools/cdp_code.js"; +import { + ModCdpConnection, + type ModCDPClientLike, +} from "../core/tools/modcdp_code.js"; import type { ExternalHarnessTaskPlan } from "./externalHarnessPlan.js"; export interface ClaudeCodeToolAdapterInput { @@ -135,6 +143,12 @@ type CdpRuntime = { wait(ms: number): Promise; }; +type ModCdpRuntime = CdpRuntime & { + readonly client: ModCDPClientLike; + readonly Mod: unknown; + readonly Custom: unknown; +}; + export interface BrowseCliToolMetadata { toolCommand: "browse"; browseCliEntrypoint: string; @@ -186,9 +200,15 @@ export async function prepareClaudeCodeToolAdapter( toolSurface, startupProfile, }); + case "modcdp_code": + return prepareModCdpCodeAdapter({ + ...input, + toolSurface, + startupProfile, + }); default: throw new EvalsError( - `Claude Code harness supports --tool browse_cli, playwright_code, or cdp_code for execution right now; received "${toolSurface}".`, + `Claude Code harness supports --tool browse_cli, playwright_code, cdp_code, or modcdp_code for execution right now; received "${toolSurface}".`, ); } } @@ -200,12 +220,13 @@ export function resolveClaudeCodeToolSurface( if ( requested === "browse_cli" || requested === "playwright_code" || - requested === "cdp_code" + requested === "cdp_code" || + requested === "modcdp_code" ) { return requested; } throw new EvalsError( - `Claude Code harness supports --tool browse_cli, playwright_code, or cdp_code for execution right now; received "${requested}".`, + `Claude Code harness supports --tool browse_cli, playwright_code, cdp_code, or modcdp_code for execution right now; received "${requested}".`, ); } @@ -221,7 +242,11 @@ export function resolveClaudeCodeStartupProfile( ? "tool_create_browserbase" : "tool_launch_local"; } - if (toolSurface === "playwright_code" || toolSurface === "cdp_code") { + if ( + toolSurface === "playwright_code" || + toolSurface === "cdp_code" || + toolSurface === "modcdp_code" + ) { return environment === "BROWSERBASE" ? "runner_provided_browserbase_cdp" : "runner_provided_local_cdp"; @@ -579,6 +604,121 @@ async function prepareCdpCodeAdapter( } } +async function prepareModCdpCodeAdapter( + input: ClaudeCodeToolAdapterInput & { + toolSurface: "modcdp_code"; + startupProfile: StartupProfile; + }, +): Promise { + if ( + input.startupProfile !== "runner_provided_local_cdp" && + input.startupProfile !== "runner_provided_browserbase_cdp" + ) { + throw new EvalsError( + `modcdp_code startup profile "${input.startupProfile}" is not valid for Claude Code. Use runner_provided_local_cdp or runner_provided_browserbase_cdp.`, + ); + } + + const cwd = await fsp.mkdtemp( + path.join(os.tmpdir(), "stagehand-evals-claude-modcdp-"), + ); + const env = { ...process.env } as Record; + let connection: ModCdpConnection | undefined; + let targetCleanup: () => Promise = async () => {}; + + try { + const target = await prepareCoreBrowserTarget({ + environment: input.environment, + toolSurface: "modcdp_code", + startupProfile: input.startupProfile, + }); + targetCleanup = target.cleanup; + if (!target.providedEndpoint?.url) { + throw new EvalsError( + `modcdp_code requires a runner-provided CDP endpoint for startup profile "${input.startupProfile}".`, + ); + } + + connection = await ModCdpConnection.connect(target.providedEndpoint); + const activePage = await attachActiveCdpPage(connection); + const mcpServers = await buildModCdpRunMcpServers({ + connection, + activePage, + plan: input.plan, + logger: input.logger, + }); + + input.logger.log({ + category: "claude_code", + message: `Initialized modcdp_code browser runtime for Claude Code run tool.`, + level: 1, + auxiliary: { + startupProfile: { + value: input.startupProfile, + type: "string", + }, + environment: { + value: input.environment, + type: "string", + }, + targetId: { + value: activePage.targetId, + type: "string", + }, + sessionId: { + value: activePage.sessionId, + type: "string", + }, + ...(target.metadata && { + targetMetadata: { + value: JSON.stringify(target.metadata), + type: "object", + }, + }), + }, + }); + + return { + toolSurface: "modcdp_code", + startupProfile: input.startupProfile, + cwd, + env, + allowedTools: ["Bash", RUN_TOOL_NAME], + settingSources: [], + mcpServers, + canUseTool: async (toolName, commandInput) => { + if (toolName === RUN_TOOL_NAME || toolName === "Bash") { + return { behavior: "allow", updatedInput: commandInput }; + } + return { + behavior: "deny", + message: `Use Bash for inspection and ${RUN_TOOL_NAME} for ModCDP browser automation.`, + }; + }, + promptInstructions: buildModCdpCodePromptInstructions(input.plan), + cleanup: async () => { + try { + await connection?.close(); + } catch { + // best-effort only + } finally { + await targetCleanup(); + await fsp.rm(cwd, { recursive: true, force: true }); + } + }, + }; + } catch (error) { + try { + await connection?.close(); + } catch { + // best-effort only + } + await targetCleanup(); + await fsp.rm(cwd, { recursive: true, force: true }); + throw error; + } +} + async function buildPlaywrightRunMcpServers(input: { browser: Browser; context: BrowserContext; @@ -701,7 +841,7 @@ async function executePlaywrightSnippet(input: { } async function buildCdpRunMcpServers(input: { - connection: CdpConnection; + connection: CdpConnectionLike; activePage: ActiveCdpPage; plan: ExternalHarnessTaskPlan; logger: EvalLogger; @@ -749,7 +889,7 @@ async function buildCdpRunMcpServers(input: { async function executeCdpRunTool(input: { code: string; - connection: CdpConnection; + connection: CdpConnectionLike; activePage: ActiveCdpPage; plan: ExternalHarnessTaskPlan; logger: EvalLogger; @@ -784,7 +924,7 @@ async function executeCdpRunTool(input: { async function executeCdpSnippet(input: { code: string; - connection: CdpConnection; + connection: CdpConnectionLike; activePage: ActiveCdpPage; plan: ExternalHarnessTaskPlan; logger: EvalLogger; @@ -813,8 +953,123 @@ async function executeCdpSnippet(input: { ); } +async function buildModCdpRunMcpServers(input: { + connection: ModCdpConnection; + activePage: ActiveCdpPage; + plan: ExternalHarnessTaskPlan; + logger: EvalLogger; +}): Promise> { + const sdk = (await import("@anthropic-ai/claude-agent-sdk")) as unknown as { + createSdkMcpServer: SdkMcpServerFactory; + tool: SdkToolFactory; + }; + + const runTool = sdk.tool( + "run", + [ + "Execute JavaScript against the initialized ModCDP browser client.", + "The snippet runs inside an async function with modcdp, z, startUrl, task, and console in scope.", + "Use await directly. Return a JSON-serializable value when useful.", + ].join(" "), + { + code: z + .string() + .describe( + "JavaScript function body to execute. modcdp/z/startUrl/task are already in scope.", + ), + }, + async ({ code }) => { + return executeModCdpRunTool({ + code, + connection: input.connection, + activePage: input.activePage, + plan: input.plan, + logger: input.logger, + }); + }, + { alwaysLoad: true }, + ); + + return { + [RUN_TOOL_SERVER]: sdk.createSdkMcpServer({ + name: RUN_TOOL_SERVER, + version: "1.0.0", + tools: [runTool], + alwaysLoad: true, + }), + }; +} + +async function executeModCdpRunTool(input: { + code: string; + connection: ModCdpConnection; + activePage: ActiveCdpPage; + plan: ExternalHarnessTaskPlan; + logger: EvalLogger; +}): Promise { + try { + const result = await withTimeout( + executeModCdpSnippet(input), + readPositiveIntEnv("EVAL_CLAUDE_CODE_RUN_TOOL_TIMEOUT_MS", 60_000), + ); + const text = stringifyToolResult(result); + input.logger.log({ + category: "claude_code", + message: `run tool completed: ${clip(text, 500)}`, + level: 1, + }); + return { + content: [{ type: "text", text }], + }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + input.logger.warn({ + category: "claude_code", + message: `run tool failed: ${message}`, + level: 1, + }); + return { + isError: true, + content: [{ type: "text", text: message }], + }; + } +} + +async function executeModCdpSnippet(input: { + code: string; + connection: ModCdpConnection; + activePage: ActiveCdpPage; + plan: ExternalHarnessTaskPlan; + logger: EvalLogger; +}): Promise { + const AsyncFunction = Object.getPrototypeOf(async function () {}) + .constructor as new ( + ...args: string[] + ) => (...values: unknown[]) => Promise; + const fn = new AsyncFunction( + "modcdp", + "z", + "startUrl", + "task", + "console", + input.code, + ); + return fn( + buildModCdpRuntime(input.connection, input.activePage, input.logger), + z, + input.plan.startUrl, + { + dataset: input.plan.dataset, + id: input.plan.taskId, + startUrl: input.plan.startUrl, + instruction: input.plan.instruction, + }, + buildRunToolConsole(input.logger), + ); +} + function buildCdpRuntime( - connection: CdpConnection, + connection: CdpConnectionLike, activePage: ActiveCdpPage, logger: EvalLogger, ): CdpRuntime { @@ -900,8 +1155,22 @@ function buildCdpRuntime( }; } +function buildModCdpRuntime( + connection: ModCdpConnection, + activePage: ActiveCdpPage, + logger: EvalLogger, +): ModCdpRuntime { + const client = connection.client; + return { + ...buildCdpRuntime(connection, activePage, logger), + client, + Mod: client.Mod, + Custom: client.Custom, + }; +} + function onCdpEvent( - connection: CdpConnection, + connection: CdpConnectionLike, sessionId: string, method: string, listener: (event: CdpEventMessage) => unknown | Promise, @@ -936,7 +1205,7 @@ function onCdpEvent( } async function attachActiveCdpPage( - connection: CdpConnection, + connection: CdpConnectionLike, ): Promise { const targets = await connection.send<{ targetInfos: Array<{ @@ -982,7 +1251,7 @@ async function attachActiveCdpPage( } export function waitForCdpEvent( - connection: CdpConnection, + connection: CdpConnectionLike, sessionId: string, method: string, timeoutMs: number, @@ -1063,6 +1332,88 @@ function buildCdpCodePromptInstructions(plan: ExternalHarnessTaskPlan): string { ].join("\n"); } +function buildModCdpCodePromptInstructions( + plan: ExternalHarnessTaskPlan, +): string { + void plan; + return [ + "Browser tool surface: modcdp_code.", + `Use the ${RUN_TOOL_NAME} tool for browser automation. It exposes an initialized modcdp object, startUrl, and task object.`, + "modcdp.client is the typed ModCDPClient. modcdp.Mod is the generated Mod.* command surface and modcdp.Custom is the generated Custom.* command surface.", + "z from Zod is available for ModCDP command/event schemas.", + "Use modcdp.send(method, params) for page-scoped CDP commands and modcdp.browser(method, params) for browser-level CDP commands.", + "Helpers available: modcdp.on(method, listener), modcdp.once(method), modcdp.waitForEvent(method, timeoutMs), modcdp.wait(ms), modcdp.targetId, modcdp.sessionId.", + "ModCDP primitive 1: Mod.evaluate evaluates an expression through the extended CDP protocol, with chrome.* and a cdp bridge available.", + "ModCDP primitive 2: Mod.addCustomCommand extends CDP with a Custom.* method that can be called like any other protocol command.", + "ModCDP primitive 3: Mod.addCustomEvent extends CDP with a Custom.* event that can be emitted and listened for like any other protocol event.", + "ModCDP primitive 4: Mod.addMiddleware modifies CDP requests, responses, or events by exact name, wildcard prefix, or *.", + "ModCDP string-form examples:", + [ + "```js", + "// Use it like a normal CDP connection: send normal CDP and register for normal CDP events.", + 'console.log(await modcdp.browser("Browser.getVersion"));', + 'modcdp.on("Target.targetInfoChanged", console.log);', + "", + "// Evaluate with chrome.* and a cdp bridge available.", + 'const tab = await modcdp.send("Mod.evaluate", {', + ' expression: "(await chrome.tabs.query({ active: true }))[0]",', + "});", + "", + "// Extend CDP with a Custom.* command, then call it with send().", + 'await modcdp.send("Mod.addCustomCommand", {', + ' name: "Custom.tabIdFromTargetId",', + " paramsSchema: { targetId: modcdp.client.types.zod.Target.TargetID },", + " resultSchema: { tabId: z.number().nullable() },", + " expression: `async ({ targetId }) => ({", + " tabId: (await chrome.debugger.getTargets()).find(t => t.id === targetId)?.tabId ?? null", + " })`,", + "});", + 'const { targetInfos } = await modcdp.browser("Target.getTargets");', + 'const pageTarget = targetInfos.find((targetInfo) => targetInfo.type === "page");', + 'console.log(await modcdp.send("Custom.tabIdFromTargetId", { targetId: pageTarget.targetId }));', + "", + "// Extend CDP with a Custom.* event, then listen for it with on().", + 'await modcdp.send("Mod.addCustomEvent", {', + ' name: "Page.foregroundPageChanged",', + " eventSchema: {", + " targetId: modcdp.client.types.zod.Target.TargetID.nullable(),", + " tabId: z.number(),", + " },", + "});", + 'await modcdp.send("Mod.evaluate", {', + " expression: `chrome.tabs.onActivated.addListener(async ({ tabId }) =>", + ' cdp.emit("Page.foregroundPageChanged", {', + " tabId,", + " targetId: (await chrome.debugger.getTargets()).find(t => t.tabId === tabId)?.id ?? null", + " })", + " )`,", + "});", + 'modcdp.on("Page.foregroundPageChanged", console.log);', + "", + "// Modify existing CDP command results on the wire.", + 'await modcdp.send("Mod.addMiddleware", {', + ' name: "Target.getTargets",', + ' phase: "response",', + " expression: `async (payload, next) => {", + " for (const targetInfo of payload.targetInfos) {", + ' const { tabId } = await cdp.send("Custom.tabIdFromTargetId", {', + " targetId: targetInfo.targetId,", + " });", + " targetInfo.tabId = tabId;", + " }", + " return next(payload);", + " }`,", + "});", + 'console.log(await modcdp.browser("Target.getTargets"));', + "```", + ].join("\n"), + 'The first browser action should usually be: const loaded = modcdp.waitForEvent("Page.loadEventFired"); await modcdp.send("Page.navigate", { url: startUrl }); await loaded.', + "Use Bash for inspection and lightweight scripting. Do not create a separate browser process.", + "Do not edit repository files.", + "Return useful JSON-serializable values from run snippets so you can inspect progress.", + ].join("\n"); +} + async function runBrowseSetup( wrapperPath: string, environment: "LOCAL" | "BROWSERBASE", diff --git a/packages/evals/framework/context.ts b/packages/evals/framework/context.ts index daa8eabea..d85d90340 100644 --- a/packages/evals/framework/context.ts +++ b/packages/evals/framework/context.ts @@ -44,6 +44,7 @@ export function resolveDefaultCoreStartupProfile( case "understudy_code": case "playwright_code": case "cdp_code": + case "modcdp_code": case "playwright_mcp": case "chrome_devtools_mcp": return environment === "BROWSERBASE" diff --git a/packages/evals/tests/core/tool-registry.test.ts b/packages/evals/tests/core/tool-registry.test.ts index 9a524fa8a..c1676151f 100644 --- a/packages/evals/tests/core/tool-registry.test.ts +++ b/packages/evals/tests/core/tool-registry.test.ts @@ -5,6 +5,7 @@ describe("core tool registry", () => { it("lists extended tool surfaces", () => { expect(listCoreTools()).toEqual( expect.arrayContaining([ + "modcdp_code", "playwright_mcp", "chrome_devtools_mcp", "browse_cli", @@ -13,6 +14,7 @@ describe("core tool registry", () => { }); it("constructs MCP and CLI tools", () => { + expect(getCoreTool("modcdp_code").id).toBe("modcdp_code"); expect(getCoreTool("playwright_mcp").id).toBe("playwright_mcp"); expect(getCoreTool("chrome_devtools_mcp").id).toBe("chrome_devtools_mcp"); expect(getCoreTool("browse_cli").id).toBe("browse_cli"); diff --git a/packages/evals/tests/framework/claudeCodeRunner.test.ts b/packages/evals/tests/framework/claudeCodeRunner.test.ts index 4d8c3cde5..7492c2912 100644 --- a/packages/evals/tests/framework/claudeCodeRunner.test.ts +++ b/packages/evals/tests/framework/claudeCodeRunner.test.ts @@ -212,5 +212,13 @@ describe("claude code runner helpers", () => { "Bash", "mcp__stagehand_browser__run", ]); + expect(capturedOptions?.systemPrompt).toMatchObject({ + type: "preset", + preset: "claude_code", + }); + expect( + (capturedOptions?.systemPrompt as { append?: string } | undefined) + ?.append, + ).toContain("Use run."); }); }); diff --git a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts index e28775652..428f3ce2b 100644 --- a/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts +++ b/packages/evals/tests/framework/claudeCodeToolAdapter.test.ts @@ -52,11 +52,18 @@ describe("claude code tool adapter resolution", () => { expect(resolveClaudeCodeStartupProfile("cdp_code", "BROWSERBASE")).toBe( "runner_provided_browserbase_cdp", ); + expect(resolveClaudeCodeToolSurface("modcdp_code")).toBe("modcdp_code"); + expect(resolveClaudeCodeStartupProfile("modcdp_code", "LOCAL")).toBe( + "runner_provided_local_cdp", + ); + expect(resolveClaudeCodeStartupProfile("modcdp_code", "BROWSERBASE")).toBe( + "runner_provided_browserbase_cdp", + ); }); it("rejects unsupported Claude Code tool surfaces for now", () => { expect(() => resolveClaudeCodeToolSurface("understudy_code")).toThrow( - /supports --tool browse_cli, playwright_code, or cdp_code/, + /supports --tool browse_cli, playwright_code, cdp_code, or modcdp_code/, ); }); diff --git a/packages/evals/tests/framework/context.test.ts b/packages/evals/tests/framework/context.test.ts index 742378635..18b32ed76 100644 --- a/packages/evals/tests/framework/context.test.ts +++ b/packages/evals/tests/framework/context.test.ts @@ -13,6 +13,9 @@ describe("resolveDefaultCoreStartupProfile", () => { expect(resolveDefaultCoreStartupProfile("cdp_code", "LOCAL")).toBe( "runner_provided_local_cdp", ); + expect(resolveDefaultCoreStartupProfile("modcdp_code", "LOCAL")).toBe( + "runner_provided_local_cdp", + ); expect(resolveDefaultCoreStartupProfile("playwright_mcp", "LOCAL")).toBe( "runner_provided_local_cdp", ); @@ -37,6 +40,9 @@ describe("resolveDefaultCoreStartupProfile", () => { expect(resolveDefaultCoreStartupProfile("cdp_code", "BROWSERBASE")).toBe( "runner_provided_browserbase_cdp", ); + expect(resolveDefaultCoreStartupProfile("modcdp_code", "BROWSERBASE")).toBe( + "runner_provided_browserbase_cdp", + ); expect( resolveDefaultCoreStartupProfile("playwright_mcp", "BROWSERBASE"), ).toBe("runner_provided_browserbase_cdp"); diff --git a/packages/evals/tui/commands/help.ts b/packages/evals/tui/commands/help.ts index 95b49c766..1085459c7 100644 --- a/packages/evals/tui/commands/help.ts +++ b/packages/evals/tui/commands/help.ts @@ -79,7 +79,7 @@ export function printRunHelp(): void { "", row( `${cyan("--tool")} ${dim("")}`, - `Core tool surface ${gray("(understudy_code, playwright_code, ...)")}`, + `Core tool surface ${gray("(understudy_code, playwright_code, cdp_code, modcdp_code, ...)")}`, ), row(`${cyan("--startup")} ${dim("")}`, "Core startup profile"), "", @@ -200,7 +200,7 @@ export function printConfigHelp(): void { ), row(cyan("setup"), `Interactive wizard ${gray("(coming soon)")}`), "", - ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`, + ` ${bold("Valid core tools:")} ${gray("understudy_code, playwright_code, cdp_code, modcdp_code, playwright_mcp, chrome_devtools_mcp, browse_cli")}`, "", ` ${bold("Examples:")}`, "",