Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/evals/ARCHITECTURE.mmd
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ flowchart TB
CoreContext["framework/context.ts<br/>buildCoreContext"]
FixtureServer["core/fixtures<br/>local deterministic pages"]
CoreTargets["core/targets<br/>local Chrome<br/>Browserbase CDP"]
CoreTools["core/tools registry<br/>understudy_code<br/>playwright_code<br/>cdp_code<br/>playwright_mcp<br/>chrome_devtools_mcp<br/>browse_cli"]
CoreTools["core/tools registry<br/>understudy_code<br/>playwright_code<br/>cdp_code<br/>modcdp_code<br/>playwright_mcp<br/>chrome_devtools_mcp<br/>browse_cli"]
CoreAssertions["assertions + metrics<br/>adapter-backed results"]
CoreDeps["core/runtime/coreDeps.ts<br/>browserbase + ws<br/>lazy require"]
end
Expand Down
1 change: 1 addition & 0 deletions packages/evals/core/contracts/tool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export type ToolSurface =
| "understudy_code"
| "playwright_code"
| "cdp_code"
| "modcdp_code"
| "playwright_mcp"
| "chrome_devtools_mcp"
| "browse_cli";
Expand Down
30 changes: 23 additions & 7 deletions packages/evals/core/tools/cdp_code.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import { loadWsModule } from "../runtime/coreDeps.js";
const DEFAULT_TIMEOUT_MS = 15_000;
const POLL_INTERVAL_MS = 100;

const SUPPORTED_CAPABILITIES: CoreCapability[] = [
export const CDP_CODE_SUPPORTED_CAPABILITIES: CoreCapability[] = [
"session",
"navigation",
"evaluation",
Expand All @@ -40,6 +40,16 @@ export type CdpEventMessage = {
sessionId?: string;
};

/**
 * Minimal connection contract that the CDP page/session handles depend on.
 *
 * Any transport that can deliver protocol events, issue protocol commands and
 * be closed can be plugged in by implementing these three members.
 */
export interface CdpConnectionLike {
/**
 * Registers a listener that receives protocol events as `CdpEventMessage`s.
 * Returns a function; presumably calling it removes the listener again
 * (confirm against the concrete implementation in use).
 */
onEvent(listener: (event: CdpEventMessage) => void): () => void;
/**
 * Issues a protocol command.
 *
 * @param method - Protocol method name.
 * @param params - Optional command parameters.
 * @param sessionId - Optional session to address the command to.
 * @returns A promise for the command result, typed by the caller as `T`.
 */
send<T = unknown>(
method: string,
params?: Record<string, unknown>,
sessionId?: string,
): Promise<T>;
/** Closes the connection; resolves once the implementation has finished. */
close(): Promise<void>;
}

type SelectorInspection = {
count: number;
visible: boolean;
Expand Down Expand Up @@ -156,7 +166,7 @@ async function resolveWebSocketEndpoint(input: {
return payload.webSocketDebuggerUrl;
}

export class CdpConnection {
export class CdpConnection implements CdpConnectionLike {
private readonly pending = new Map<
number,
{
Expand Down Expand Up @@ -353,7 +363,7 @@ class CdpLocatorHandle implements CoreLocatorHandle {

class CdpPageHandle implements CorePageHandle {
constructor(
private readonly connection: CdpConnection,
private readonly connection: CdpConnectionLike,
private readonly state: CdpPageState,
) {}

Expand Down Expand Up @@ -1015,12 +1025,12 @@ class CdpPageHandle implements CorePageHandle {
}
}

class CdpSession implements CoreSession {
export class CdpSession implements CoreSession {
private readonly pages = new Map<string, CdpPageState>();
private activePageId: string | null = null;
private closed = false;

private constructor(private readonly connection: CdpConnection) {}
private constructor(private readonly connection: CdpConnectionLike) {}

static async connect(input: {
providedEndpoint: {
Expand All @@ -1030,6 +1040,12 @@ class CdpSession implements CoreSession {
};
}): Promise<CdpSession> {
const connection = await CdpConnection.connect(input.providedEndpoint);
return CdpSession.fromConnection(connection);
}

static async fromConnection(
connection: CdpConnectionLike,
): Promise<CdpSession> {
const session = new CdpSession(connection);
await session.bootstrap();
return session;
Expand Down Expand Up @@ -1183,7 +1199,7 @@ class CdpSession implements CoreSession {
}
}

function connectionModeFromProfile(
export function connectionModeFromProfile(
startupProfile: StartupProfile,
endpointKind?: "ws" | "http",
): ConnectionMode {
Expand All @@ -1210,7 +1226,7 @@ export class CdpCodeTool implements CoreTool {
"tool_attach_browserbase",
];
readonly supportedCapabilities: CoreCapability[] = [
...SUPPORTED_CAPABILITIES,
...CDP_CODE_SUPPORTED_CAPABILITIES,
];
readonly supportedTargetKinds: TargetKind[] = [
"selector",
Expand Down
210 changes: 210 additions & 0 deletions packages/evals/core/tools/modcdp_code.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
import fs from "node:fs";
import path from "node:path";
import { pathToFileURL } from "node:url";
import type {
CoreCapability,
CoreTool,
StartupProfile,
ToolStartInput,
ToolStartResult,
} from "../contracts/tool.js";
import type { TargetKind } from "../contracts/targets.js";
import { getRepoRootDir } from "../../runtimePaths.js";
import {
CDP_CODE_SUPPORTED_CAPABILITIES,
CdpSession,
connectionModeFromProfile,
type CdpConnectionLike,
type CdpEventMessage,
} from "./cdp_code.js";

/**
 * Structural description of the ModCDP client members this module relies on.
 *
 * The real client class is loaded dynamically at runtime, so only the shape
 * is declared here instead of importing a concrete type.
 */
export type ModCDPClientLike = {
/** Opens the client connection; the resolved value is not inspected here. */
connect(): Promise<unknown>;
/** Closes the client connection. */
close(): Promise<void>;
/** Client-level command send (declared for completeness; not called in this file). */
send<T = unknown>(method: string, params?: unknown): Promise<T>;
/** Subscribes `listener` to `eventName`. */
on(
eventName: string | symbol,
listener: (...args: unknown[]) => void,
): unknown;
/** Unsubscribes a listener previously passed to `on`. */
off(
eventName: string | symbol,
listener: (...args: unknown[]) => void,
): unknown;
/**
 * Lower-level handle whose `send` additionally accepts a session id
 * (`null`/omitted when no session is targeted).
 */
_cdp: {
send<T = unknown>(
method: string,
params?: Record<string, unknown>,
sessionId?: string | null,
): Promise<T>;
};
/** Any further members of the client are allowed but left untyped. */
[key: string]: unknown;
};

/**
 * Constructor signature of the dynamically loaded client class: it takes an
 * optional, loosely typed options object and yields a `ModCDPClientLike`.
 */
type ModCDPClientConstructor = new (
options?: Record<string, unknown>,
) => ModCDPClientLike;

/**
 * Shape expected from the dynamically imported client module: a named
 * `ModCDPClient` export that can be constructed.
 */
type ModCDPClientModule = {
ModCDPClient: ModCDPClientConstructor;
};

/**
 * Default location of the JS SDK entrypoint, resolved relative to a
 * `stagehand-driver` directory that sits next to the repository root.
 */
const DEFAULT_STAGEHAND_V4_SDK_PATH = ((): string => {
  // Step out of the repo root into the sibling checkout first...
  const siblingDriverDir = path.join(getRepoRootDir(), "..", "stagehand-driver");
  // ...then descend to the SDK entrypoint inside it.
  return path.join(siblingDriverDir, "sdks", "js", "index.ts");
})();

/**
 * Default location of the built ModCDP JS client, derived from the default
 * SDK entrypoint: two directories above the SDK file's folder, then
 * `modcdp/dist/client/js/ModCDPClient.js`.
 */
const DEFAULT_MODCDP_CLIENT_PATH = ((): string => {
  const sdkDir = path.dirname(DEFAULT_STAGEHAND_V4_SDK_PATH);
  const driverRootDir = path.join(sdkDir, "..", "..");
  return path.join(
    driverRootDir,
    "modcdp",
    "dist",
    "client",
    "js",
    "ModCDPClient.js",
  );
})();

/**
 * Adapter that exposes a dynamically loaded ModCDP client through the
 * `CdpConnectionLike` contract, so it can be used wherever that contract is
 * accepted.
 */
export class ModCdpConnection implements CdpConnectionLike {
  /** The wrapped ModCDP client instance. */
  readonly client: ModCDPClientLike;

  private constructor(client: ModCDPClientLike) {
    this.client = client;
  }

  /**
   * Locates the built ModCDP client on disk, imports it, constructs a client
   * for `input.url` and connects it.
   *
   * Path resolution order for the client entrypoint:
   * 1. `MODCDP_CLIENT_PATH`, when set to a non-blank value;
   * 2. a path derived from `STAGEHAND_V4_SDK_PATH`, when that is set to a
   *    non-blank value;
   * 3. `DEFAULT_MODCDP_CLIENT_PATH`.
   *
   * @param input - Provided endpoint. Only `url` is read; `kind` is accepted
   *   so the endpoint object can be passed through unchanged.
   * @returns A connection wrapping the connected client.
   * @throws Error when the client entrypoint does not exist on disk; the
   *   message names the probed path and how to build or override it.
   */
  static async connect(input: {
    kind: "ws" | "http";
    url: string;
  }): Promise<ModCdpConnection> {
    // A blank override (e.g. `VAR=` in a dotenv file) is treated as unset.
    // Previously `??` and a truthiness check disagreed about an empty
    // STAGEHAND_V4_SDK_PATH, so the build hint in the error below could point
    // at a different checkout than the client path that was actually probed.
    const readPathOverride = (name: string): string | undefined => {
      const value = process.env[name];
      return value === undefined || value.trim() === "" ? undefined : value;
    };

    const sdkPathOverride = readPathOverride("STAGEHAND_V4_SDK_PATH");
    const stagehandV4SdkPath = sdkPathOverride ?? DEFAULT_STAGEHAND_V4_SDK_PATH;
    const stagehandV4RootPath = path.join(
      path.dirname(stagehandV4SdkPath),
      "..",
      "..",
    );
    const clientPath =
      readPathOverride("MODCDP_CLIENT_PATH") ??
      (sdkPathOverride !== undefined
        ? path.join(
            stagehandV4RootPath,
            "modcdp",
            "dist",
            "client",
            "js",
            "ModCDPClient.js",
          )
        : DEFAULT_MODCDP_CLIENT_PATH);

    // Fail early with actionable guidance instead of an opaque import error.
    if (!fs.existsSync(clientPath)) {
      throw new Error(
        [
          "modcdp_code requires a built ModCDP JS client.",
          `Expected ModCDP client at: ${clientPath}`,
          "Set MODCDP_CLIENT_PATH to the ModCDPClient.js entrypoint if your checkout lives somewhere else.",
          `Or build it with: pnpm --dir ${stagehandV4RootPath} --filter modcdp run build`,
        ].join("\n"),
      );
    }

    // Dynamic import needs a file:// URL rather than a bare filesystem path.
    const { ModCDPClient } = (await import(
      pathToFileURL(clientPath).href
    )) as ModCDPClientModule;
    // The same endpoint URL is supplied as both `cdp_url` and
    // `server.loopback_cdp_url`.
    const client = new ModCDPClient({
      cdp_url: input.url,
      routes: { "*.*": "service_worker" },
      server: {
        loopback_cdp_url: input.url,
        routes: { "*.*": "loopback_cdp" },
      },
    });
    await client.connect();
    return new ModCdpConnection(client);
  }

  /**
   * Subscribes to the client's `"*"` event and forwards each emission to
   * `listener` as a `CdpEventMessage`.
   *
   * Emissions whose first argument is not a string are dropped. `params` is
   * forwarded only when it is a non-array object, and `sessionId` only when
   * it is a string; otherwise the respective field is `undefined`.
   *
   * @returns A function that unsubscribes the listener.
   */
  onEvent(listener: (event: CdpEventMessage) => void): () => void {
    const wrapped = (
      method: unknown,
      params: unknown,
      sessionId: unknown,
    ): void => {
      if (typeof method !== "string") return;
      listener({
        method,
        params:
          params && typeof params === "object" && !Array.isArray(params)
            ? (params as Record<string, unknown>)
            : undefined,
        sessionId: typeof sessionId === "string" ? sessionId : undefined,
      });
    };
    this.client.on("*", wrapped);
    return () => {
      this.client.off("*", wrapped);
    };
  }

  /**
   * Sends a protocol command through the client's `_cdp` handle.
   *
   * @param method - Protocol method name.
   * @param params - Optional command parameters.
   * @param sessionId - Optional session id; passed on as `null` when omitted.
   */
  async send<T = unknown>(
    method: string,
    params?: Record<string, unknown>,
    sessionId?: string,
  ): Promise<T> {
    return this.client._cdp.send<T>(method, params, sessionId ?? null);
  }

  /** Closes the wrapped client. */
  async close(): Promise<void> {
    await this.client.close();
  }
}

/**
 * Core tool that drives the browser through a ModCDP connection while reusing
 * `CdpSession` for the session logic. It advertises the same capabilities as
 * the plain CDP code tool via `CDP_CODE_SUPPORTED_CAPABILITIES`.
 */
export class ModCdpCodeTool implements CoreTool {
  readonly id = "modcdp_code";
  readonly surface = "code";
  readonly family = "cdp";
  readonly supportedStartupProfiles: StartupProfile[] = [
    "runner_provided_local_cdp",
    "runner_provided_browserbase_cdp",
    "tool_attach_local_cdp",
    "tool_attach_browserbase",
  ];
  readonly supportedCapabilities: CoreCapability[] = [
    ...CDP_CODE_SUPPORTED_CAPABILITIES,
  ];
  readonly supportedTargetKinds: TargetKind[] = [
    "selector",
    "coords",
    "focused",
  ];

  /**
   * Connects to the provided endpoint through ModCDP and bootstraps a session
   * on top of that connection.
   *
   * @param input - Start parameters; `providedEndpoint` is mandatory.
   * @returns The session, a cleanup callback that closes it, and metadata
   *   describing environment, browser ownership and connection mode.
   * @throws Error when `input.providedEndpoint` is missing, or whatever the
   *   connection / session bootstrap rejects with.
   */
  async start(input: ToolStartInput): Promise<ToolStartResult> {
    if (!input.providedEndpoint) {
      throw new Error(
        `modcdp_code startup profile "${input.startupProfile}" requires a providedEndpoint`,
      );
    }

    const connection = await ModCdpConnection.connect(input.providedEndpoint);

    let session: CdpSession;
    try {
      session = await CdpSession.fromConnection(connection);
    } catch (error: unknown) {
      // The connection is already open at this point and no cleanup callback
      // has been handed to the caller yet, so release it here. Closing is
      // best-effort: a close failure must not mask the bootstrap error.
      await connection.close().catch(() => undefined);
      throw error;
    }

    return {
      session,
      cleanup: async () => {
        await session.close();
      },
      metadata: {
        environment:
          input.environment === "BROWSERBASE" ? "browserbase" : "local",
        // Profiles prefixed with "runner_provided" mean the runner owns the
        // browser; every other profile is attributed to the tool.
        browserOwnership: input.startupProfile.startsWith("runner_provided")
          ? "runner"
          : "tool",
        connectionMode: connectionModeFromProfile(
          input.startupProfile,
          input.providedEndpoint.kind,
        ),
        startupProfile: input.startupProfile,
      },
    };
  }
}
4 changes: 4 additions & 0 deletions packages/evals/core/tools/registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import type { CoreTool, ToolSurface } from "../contracts/tool.js";
import { BrowseCliTool } from "./browse_cli.js";
import { CdpCodeTool } from "./cdp_code.js";
import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js";
import { ModCdpCodeTool } from "./modcdp_code.js";
import { PlaywrightCodeTool } from "./playwright_code.js";
import { PlaywrightMcpTool } from "./playwright_mcp.js";
import { UnderstudyCodeTool } from "./understudy_code.js";
Expand All @@ -11,6 +12,7 @@ export function listCoreTools(): ToolSurface[] {
"understudy_code",
"playwright_code",
"cdp_code",
"modcdp_code",
"playwright_mcp",
"chrome_devtools_mcp",
"browse_cli",
Expand All @@ -25,6 +27,8 @@ export function getCoreTool(toolSurface: ToolSurface): CoreTool {
return new PlaywrightCodeTool();
case "cdp_code":
return new CdpCodeTool();
case "modcdp_code":
return new ModCdpCodeTool();
case "playwright_mcp":
return new PlaywrightMcpTool();
case "chrome_devtools_mcp":
Expand Down
16 changes: 14 additions & 2 deletions packages/evals/framework/claudeCodeRunner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,17 @@ export function buildClaudeCodePrompt(
.join("\n");
}

/**
 * Builds the text appended to the preset system prompt for an evaluation run.
 *
 * @param toolInstructions - Optional tool-specific instructions. When this is
 *   omitted or an empty string, only the base evaluation notice is returned.
 * @returns The base notice, followed by a blank line and the tool
 *   instructions when those are present.
 */
export function buildClaudeCodeSystemPromptAppend(
  toolInstructions?: string,
): string {
  const evaluationNotice =
    "You are being evaluated. Do not edit repository files. Complete the browser task and emit the requested EVAL_RESULT line.";

  // Nothing tool-specific to add: the notice stands on its own.
  if (!toolInstructions) {
    return evaluationNotice;
  }

  return `${evaluationNotice}\n\n${toolInstructions}`;
}

export function parseClaudeCodeResult(raw: string): ParsedClaudeCodeResult {
const marker = "EVAL_RESULT:";
const markerIndex = raw.lastIndexOf(marker);
Expand Down Expand Up @@ -178,8 +189,9 @@ export async function runClaudeCodeAgent({
systemPrompt: {
type: "preset",
preset: "claude_code",
append:
"You are being evaluated. Do not edit repository files. Complete the browser task and emit the requested EVAL_RESULT line.",
append: buildClaudeCodeSystemPromptAppend(
toolAdapter?.promptInstructions,
),
},
},
})) {
Expand Down
Loading
Loading