Skip to content
2 changes: 1 addition & 1 deletion packages/evals/ARCHITECTURE.mmd
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ flowchart TB
CoreContext["framework/context.ts<br/>buildCoreContext"]
FixtureServer["core/fixtures<br/>local deterministic pages"]
CoreTargets["core/targets<br/>local Chrome<br/>Browserbase CDP"]
CoreTools["core/tools registry<br/>understudy_code<br/>playwright_code<br/>cdp_code<br/>playwright_mcp<br/>chrome_devtools_mcp<br/>browse_cli"]
CoreTools["core/tools registry<br/>understudy_v3_code<br/>playwright_code<br/>cdp_code<br/>playwright_mcp<br/>chrome_devtools_mcp<br/>browse_cli"]
CoreAssertions["assertions + metrics<br/>adapter-backed results"]
CoreDeps["core/runtime/coreDeps.ts<br/>browserbase + ws<br/>lazy require"]
end
Expand Down
3 changes: 2 additions & 1 deletion packages/evals/core/contracts/tool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ import type {
} from "./results.js";

export type ToolSurface =
| "understudy_code"
| "understudy_v3_code"
| "understudy_v4_code"
| "playwright_code"
| "cdp_code"
| "playwright_mcp"
Expand Down
8 changes: 4 additions & 4 deletions packages/evals/core/tools/registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ import { CdpCodeTool } from "./cdp_code.js";
import { ChromeDevtoolsMcpTool } from "./chrome_devtools_mcp.js";
import { PlaywrightCodeTool } from "./playwright_code.js";
import { PlaywrightMcpTool } from "./playwright_mcp.js";
import { UnderstudyCodeTool } from "./understudy_code.js";
import { UnderstudyV3CodeTool } from "./understudy_v3_code.js";

export function listCoreTools(): ToolSurface[] {
return [
"understudy_code",
"understudy_v3_code",
"playwright_code",
"cdp_code",
"playwright_mcp",
Expand All @@ -19,8 +19,8 @@ export function listCoreTools(): ToolSurface[] {

export function getCoreTool(toolSurface: ToolSurface): CoreTool {
switch (toolSurface) {
case "understudy_code":
return new UnderstudyCodeTool();
case "understudy_v3_code":
return new UnderstudyV3CodeTool();
case "playwright_code":
return new PlaywrightCodeTool();
case "cdp_code":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
`understudy_code does not support click target kind "${target.kind}" yet`,
`understudy_v3_code does not support click target kind "${target.kind}" yet`,
);
}
}
Expand Down Expand Up @@ -253,7 +253,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
`understudy_code does not support hover target kind "${target.kind}" yet`,
`understudy_v3_code does not support hover target kind "${target.kind}" yet`,
);
}
}
Expand Down Expand Up @@ -298,7 +298,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
`understudy_code does not support type target kind "${target.kind}" yet`,
`understudy_v3_code does not support type target kind "${target.kind}" yet`,
);
}
}
Expand Down Expand Up @@ -335,7 +335,7 @@ class UnderstudyPageHandle implements CorePageHandle {
return;
default:
throw new Error(
`understudy_code does not support press target kind "${target.kind}" yet`,
`understudy_v3_code does not support press target kind "${target.kind}" yet`,
);
}
}
Expand Down Expand Up @@ -462,8 +462,8 @@ function connectionModeFromProfile(
return "launch";
}

export class UnderstudyCodeTool implements CoreTool {
readonly id = "understudy_code";
export class UnderstudyV3CodeTool implements CoreTool {
readonly id = "understudy_v3_code";
readonly surface = "code";
readonly family = "understudy";
readonly supportedStartupProfiles: StartupProfile[] = [
Expand All @@ -485,7 +485,7 @@ export class UnderstudyCodeTool implements CoreTool {
async start(input: ToolStartInput): Promise<ToolStartResult> {
if (input.startupProfile === "tool_attach_local_cdp") {
throw new Error(
`understudy_code does not support startup profile "${input.startupProfile}" yet`,
`understudy_v3_code does not support startup profile "${input.startupProfile}" yet`,
);
}

Expand Down
52 changes: 52 additions & 0 deletions packages/evals/framework/ClaudeAgentHarness.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import { EvalsError } from "../errors.js";
import { runClaudeCodeAgent } from "./claudeCodeRunner.js";
import { prepareClaudeCodeToolAdapter } from "./claudeCodeToolAdapter.js";
import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
import type {
BenchHarness,
BenchHarnessExecuteInput,
StartedBenchHarness,
} from "./benchHarness.js";
import type { TaskResult } from "./types.js";

/**
 * Bench harness that hands a task to the external Claude Code agent.
 *
 * All real work happens in `execute`; `start` always rejects with an
 * `EvalsError` pointing callers at the execute path.
 */
export const ClaudeAgentHarness: BenchHarness = {
  harness: "claude_code",
  supportedTaskKinds: ["agent", "suite"],
  supportsApi: false,

  /**
   * Builds the external-harness task plan, prepares the Claude Code tool
   * adapter from the row's config, and runs the agent with it.
   *
   * @returns The task result produced by `runClaudeCodeAgent`.
   * @throws EvalsError when the row's config is not a `claude_code` config.
   */
  async execute(request: BenchHarnessExecuteInput): Promise<TaskResult> {
    const { input, row, logger, signal } = request;

    // The plan is built before the config is validated, so a failure while
    // building the plan surfaces ahead of the harness-mismatch error.
    const plan = buildExternalHarnessTaskPlan(input);

    const config = row.config;
    if (config.harness !== "claude_code") {
      throw new EvalsError(
        `Expected claude_code harness config, received "${config.harness}".`,
      );
    }

    const { toolSurface, startupProfile, environment } = config;
    const toolAdapter = await prepareClaudeCodeToolAdapter({
      toolSurface,
      startupProfile,
      environment,
      plan,
      logger,
    });

    try {
      const result = await runClaudeCodeAgent({
        plan,
        model: input.modelName,
        logger,
        toolAdapter,
        signal,
      });
      return result;
    } finally {
      // Runs whether the agent resolved or threw, once the adapter exists.
      await toolAdapter.cleanup();
    }
  },

  /**
   * Not supported for this harness.
   *
   * @throws EvalsError always.
   */
  async start(): Promise<StartedBenchHarness> {
    const message =
      "Claude Code harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness claude_code.";
    throw new EvalsError(message);
  },
};
52 changes: 52 additions & 0 deletions packages/evals/framework/CodexAgentHarness.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import { EvalsError } from "../errors.js";
import { runCodexAgent } from "./codexRunner.js";
import { prepareCodexToolAdapter } from "./codexToolAdapter.js";
import { buildExternalHarnessTaskPlan } from "./externalHarnessPlan.js";
import type {
BenchHarness,
BenchHarnessExecuteInput,
StartedBenchHarness,
} from "./benchHarness.js";
import type { TaskResult } from "./types.js";

/**
 * Bench harness that hands a task to the external Codex agent.
 *
 * All real work happens in `execute`; `start` always rejects with an
 * `EvalsError` pointing callers at the execute path.
 */
export const CodexAgentHarness: BenchHarness = {
  harness: "codex",
  supportedTaskKinds: ["agent", "suite"],
  supportsApi: false,

  /**
   * Builds the external-harness task plan, prepares the Codex tool adapter
   * from the row's config, and runs the agent with it.
   *
   * @returns The task result produced by `runCodexAgent`.
   * @throws EvalsError when the row's config is not a `codex` config.
   */
  async execute(request: BenchHarnessExecuteInput): Promise<TaskResult> {
    const { input, row, logger, signal } = request;

    // The plan is built before the config is validated, so a failure while
    // building the plan surfaces ahead of the harness-mismatch error.
    const plan = buildExternalHarnessTaskPlan(input);

    const config = row.config;
    if (config.harness !== "codex") {
      throw new EvalsError(
        `Expected codex harness config, received "${config.harness}".`,
      );
    }

    const { toolSurface, startupProfile, environment } = config;
    const toolAdapter = await prepareCodexToolAdapter({
      toolSurface,
      startupProfile,
      environment,
      plan,
      logger,
    });

    try {
      const result = await runCodexAgent({
        plan,
        model: input.modelName,
        logger,
        toolAdapter,
        signal,
      });
      return result;
    } finally {
      // Runs whether the agent resolved or threw, once the adapter exists.
      await toolAdapter.cleanup();
    }
  },

  /**
   * Not supported for this harness.
   *
   * @throws EvalsError always.
   */
  async start(): Promise<StartedBenchHarness> {
    const message =
      "Codex harness execution uses the external harness execute path. Use --dry-run to inspect its bench matrix, or run with --harness codex.";
    throw new EvalsError(message);
  },
};
139 changes: 139 additions & 0 deletions packages/evals/framework/StagehandAgentV3Harness.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import {
AgentProvider,
getAISDKLanguageModel,
loadApiKeyFromEnv,
type AvailableModel,
type LLMClient,
type LogLine,
} from "@browserbasehq/stagehand";
import { AISdkClientWrapped } from "../lib/AISdkClientWrapped.js";
import { endBrowserbaseSession } from "../browserbaseCleanup.js";
import { EvalsError } from "../errors.js";
import type { V3InitResult } from "../initV3.js";
import type {
BenchHarness,
BenchHarnessStartInput,
StartedBenchHarness,
} from "./benchHarness.js";
import type { DiscoveredTask } from "./types.js";

/**
 * Reports whether a discovered task should be treated as an agent task.
 *
 * A task qualifies when its primary category is "agent", or when its
 * category list contains "agent" or "external_agent_benchmarks".
 */
function isAgentTask(task: DiscoveredTask): boolean {
  // Checked first so the category list is not consulted when the primary
  // category already decides the answer.
  if (task.primaryCategory === "agent") {
    return true;
  }

  const agentCategories = ["agent", "external_agent_benchmarks"] as const;
  return agentCategories.some((category) =>
    task.categories.includes(category),
  );
}

/**
 * Derives a provider name from a model name.
 *
 * When the name contains a "/", the text before the first "/" is returned.
 * Otherwise the lookup is delegated to `AgentProvider.getAgentProvider`, and
 * `undefined` is returned if that lookup throws.
 */
function resolveProvider(modelName: AvailableModel): string | undefined {
  const slashIndex = modelName.indexOf("/");
  if (slashIndex !== -1) {
    return modelName.slice(0, slashIndex);
  }

  try {
    return AgentProvider.getAgentProvider(modelName);
  } catch {
    // A failed lookup is reported as "no provider" rather than as an error.
    return undefined;
  }
}

/**
 * Bench harness that runs tasks against a Stagehand V3 instance created via
 * `initV3`.
 *
 * `start` initialises V3 (with an agent when the task is an agent task) and
 * returns the execution context together with a `cleanup` callback.
 */
export const StagehandAgentV3Harness: BenchHarness = {
  harness: "stagehand_v3",
  supportedTaskKinds: [
    "act",
    "extract",
    "observe",
    "agent",
    "combination",
    "suite",
  ],
  supportsApi: true,

  /**
   * Initialises a V3 instance for the given task and row.
   *
   * With `config.useApi` set, an API key is loaded from the environment for
   * the model's provider and passed as `modelClientOptions`. Otherwise an
   * `AISdkClientWrapped` client is built for "provider/model" style names and
   * passed as `llmClient` (left `undefined` for names without a "/").
   *
   * @returns The harness context plus a `cleanup` callback that closes V3 and
   *   ends the Browserbase session.
   * @throws EvalsError when the row's config is not a `stagehand_v3` config,
   *   or when `useApi` is set and no API key is found.
   */
  async start(request: BenchHarnessStartInput): Promise<StartedBenchHarness> {
    const { task, input, row, logger, verbose } = request;
    const createAgent = isAgentTask(task);

    const config = row.config;
    if (config.harness !== "stagehand_v3") {
      throw new EvalsError(
        `Expected stagehand_v3 harness config, received "${config.harness}".`,
      );
    }

    // Row-level settings take precedence; the task input is the fallback.
    const agentMode = config.agentMode ?? input.agentMode;
    const isCUA = config.isCUA ?? input.isCUA;
    const modelName = input.modelName;

    // Only this fragment of the initV3 options differs between the two modes.
    let modelWiring:
      | { modelClientOptions: { apiKey: string } }
      | { llmClient: LLMClient | undefined };

    if (config.useApi) {
      const provider = resolveProvider(modelName);
      const forwardLog = (line: LogLine) => logger.log(line);
      const apiKey = loadApiKeyFromEnv(provider, forwardLog);
      if (!apiKey) {
        throw new EvalsError(
          `USE_API=true but no API key found for provider "${provider}".`,
        );
      }
      modelWiring = { modelClientOptions: { apiKey } };
    } else {
      // "provider/model" names get an AI SDK client, split at the first "/".
      const slashIndex = modelName.indexOf("/");
      const llmClient: LLMClient | undefined =
        slashIndex === -1
          ? undefined
          : new AISdkClientWrapped({
              model: getAISDKLanguageModel(
                modelName.substring(0, slashIndex),
                modelName.substring(slashIndex + 1),
              ),
            });
      modelWiring = { llmClient };
    }

    // Imported only after the mode-specific options resolved successfully.
    const { initV3 } = await import("../initV3.js");
    const v3Result: V3InitResult = await initV3({
      logger,
      modelName,
      ...modelWiring,
      createAgent,
      agentMode,
      isCUA,
      verbose,
      configOverrides: { env: config.environment },
    });

    const v3 = v3Result.v3;

    return {
      ctx: {
        harness: "stagehand_v3",
        row,
        logger,
        v3,
        agent: v3Result.agent,
        page: v3.context.pages()[0],
        debugUrl: v3Result.debugUrl ?? "",
        sessionUrl: v3Result.sessionUrl ?? "",
      },
      cleanup: async () => {
        if (v3) {
          try {
            await v3.close();
          } catch (closeError) {
            // A failed close is logged, not rethrown, so the Browserbase
            // session below is still ended.
            console.error(
              `Warning: Error closing V3 instance for ${input.name}:`,
              closeError,
            );
          }
        }
        await endBrowserbaseSession(v3);
      },
    };
  },
};
Loading
Loading