Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions packages/core/lib/v3/agent/AnthropicCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ import {
extractLlmCuaResponseSummary,
} from "../flowlogger/FlowLogger.js";
import { v7 as uuidv7 } from "uuid";
import {
DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT,
formatCustomToolResult,
} from "./utils/customToolResult.js";

export type ResponseInputItem = AnthropicMessage | AnthropicToolResult;

Expand Down Expand Up @@ -702,7 +706,7 @@ export class AnthropicCUAClient extends AgentClient {
});
} else {
// Handle custom tools
let toolResult = "Tool executed successfully";
let toolResult = DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT;
if (this.tools && item.name in this.tools) {
try {
const tool = this.tools[item.name];
Expand All @@ -717,7 +721,7 @@ export class AnthropicCUAClient extends AgentClient {
toolCallId: item.id,
messages: [],
});
toolResult = JSON.stringify(result);
toolResult = formatCustomToolResult(result);

logger({
category: "agent",
Expand Down
81 changes: 63 additions & 18 deletions packages/core/lib/v3/agent/GoogleCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,42 @@ export class GoogleCUAClient extends AgentClient {

if (result.actions.length > 0) {
let hasError = false;
let attemptedPostActionScreenshot = false;
let postActionScreenshotBase64: string | undefined;

/**
 * Lazily captures the post-action screenshot and caches the outcome.
 *
 * Only the first call attempts a capture; every later call returns whatever
 * that first attempt stored (the base64 payload, or `undefined` if the
 * capture failed).
 *
 * @returns The screenshot with any leading `data:image/png;base64,` prefix
 * removed, or `undefined` when the capture threw.
 */
const getPostActionScreenshotBase64 = async (): Promise<
string | undefined
> => {
// A capture was already attempted: reuse its result, even if it failed.
if (attemptedPostActionScreenshot) {
return postActionScreenshotBase64;
}

// Set the flag before capturing so a failed attempt is not retried.
attemptedPostActionScreenshot = true;

try {
logger({
category: "agent",
message: `Taking screenshot after executing ${result.actions.length} actions${hasError ? " (with errors)" : ""}`,
level: 2,
});

const screenshot = await this.captureScreenshot();
// Strip the data-URL prefix (if present) so only the raw base64 remains.
postActionScreenshotBase64 = screenshot.replace(
/^data:image\/png;base64,/,
"",
);

return postActionScreenshotBase64;
} catch (error) {
// Failure is logged and reported as undefined rather than rethrown.
logger({
category: "agent",
message: `Error capturing screenshot: ${error}`,
level: 0,
});

return undefined;
}
};

// Execute all actions
for (let i = 0; i < result.actions.length; i++) {
Expand Down Expand Up @@ -559,6 +595,31 @@ export class GoogleCUAClient extends AgentClient {
}
}

if (functionResponses.length > 0) {
const base64Data = await getPostActionScreenshotBase64();
if (base64Data) {
for (const functionResponsePart of functionResponses) {
if (!functionResponsePart.functionResponse) {
continue;
}

functionResponsePart.functionResponse.response = {
url: this.currentUrl || "",
...functionResponsePart.functionResponse.response,
};
functionResponsePart.functionResponse.parts = [
...(functionResponsePart.functionResponse.parts || []),
{
inlineData: {
mimeType: "image/png",
data: base64Data,
},
},
];
}
}
}

// Create function responses for computer use actions (non-custom tools)
// We need exactly one response per function call, regardless of how many actions were generated
if (result.functionCalls.length > 0 || hasError) {
Expand All @@ -568,19 +629,9 @@ export class GoogleCUAClient extends AgentClient {
);

if (computerUseFunctionCalls.length > 0) {
try {
logger({
category: "agent",
message: `Taking screenshot after executing ${result.actions.length} actions${hasError ? " (with errors)" : ""}`,
level: 2,
});

const screenshot = await this.captureScreenshot();
const base64Data = screenshot.replace(
/^data:image\/png;base64,/,
"",
);
const base64Data = await getPostActionScreenshotBase64();

if (base64Data) {
// Create one function response for each computer use function call
// Following Python SDK pattern: FunctionResponse with parts containing inline_data
for (const functionCall of computerUseFunctionCalls) {
Expand Down Expand Up @@ -615,12 +666,6 @@ export class GoogleCUAClient extends AgentClient {
};
functionResponses.push(functionResponsePart);
}
} catch (error) {
logger({
category: "agent",
message: `Error capturing screenshot: ${error}`,
level: 0,
});
}
}
}
Expand Down
8 changes: 6 additions & 2 deletions packages/core/lib/v3/agent/OpenAICUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ import {
extractLlmCuaResponseSummary,
} from "../flowlogger/FlowLogger.js";
import { v7 as uuidv7 } from "uuid";
import {
DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT,
formatCustomToolResult,
} from "./utils/customToolResult.js";

/**
* Client for OpenAI's Computer Use Assistant API
Expand Down Expand Up @@ -803,7 +807,7 @@ export class OpenAICUAClient extends AgentClient {
}

// Execute the tool if available
let toolResult = "Tool executed successfully";
let toolResult = DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT;
if (this.tools && item.name in this.tools) {
try {
const tool = this.tools[item.name];
Expand All @@ -819,7 +823,7 @@ export class OpenAICUAClient extends AgentClient {
toolCallId: item.call_id,
messages: [],
});
toolResult = JSON.stringify(result);
toolResult = formatCustomToolResult(result);

logger({
category: "agent",
Expand Down
5 changes: 5 additions & 0 deletions packages/core/lib/v3/agent/utils/customToolResult.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/**
 * Text reported for a custom tool whose return value has no JSON
 * representation.
 */
export const DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT = "Tool executed successfully";

/**
 * Converts the value returned by a custom tool into a string.
 *
 * @param toolResult - Whatever the tool's `execute` call produced.
 * @returns The JSON encoding of `toolResult`, or
 * {@link DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT} when `JSON.stringify` yields
 * `undefined` (e.g. for `undefined`, functions, or symbols).
 * @throws Propagates any error raised by `JSON.stringify`.
 */
export function formatCustomToolResult(toolResult: unknown): string {
  // JSON.stringify is typed as returning string, but at runtime it yields
  // undefined for values that have no JSON form.
  const serialized: string | undefined = JSON.stringify(toolResult);

  if (serialized === undefined) {
    return DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT;
  }

  return serialized;
}
6 changes: 4 additions & 2 deletions packages/core/lib/v3/agent/utils/googleCustomToolHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { ToolSet } from "ai";
import { LogLine } from "../../types/public/logs.js";
import { toJsonSchema } from "../../zodCompat.js";
import type { StagehandZodSchema } from "../../zodCompat.js";
import { formatCustomToolResult } from "./customToolResult.js";

/**
* Result of executing a custom tool for Google CUA
Expand Down Expand Up @@ -36,10 +37,11 @@ export async function executeGoogleCustomTool(
toolCallId: `tool_${Date.now()}`,
messages: [],
});
const formattedToolResult = formatCustomToolResult(toolResult);

logger({
category: "agent",
message: `Tool ${toolName} completed successfully. Result: ${JSON.stringify(toolResult)}`,
message: `Tool ${toolName} completed successfully. Result: ${formattedToolResult}`,
level: 1,
});

Expand All @@ -48,7 +50,7 @@ export async function executeGoogleCustomTool(
functionResponse: {
name: toolName,
response: {
result: JSON.stringify(toolResult),
result: formattedToolResult,
},
},
};
Expand Down
1 change: 1 addition & 0 deletions packages/core/lib/v3/handlers/v3CuaAgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ export class V3CuaAgentHandler {
this.ensureNotClosed();
const page = await this.v3.context.awaitActivePage();
const screenshotBuffer = await page.screenshot({ fullPage: false });
this.agentClient.setCurrentUrl(page.url());
return screenshotBuffer.toString("base64"); // base64 png
});

Expand Down
40 changes: 39 additions & 1 deletion packages/core/tests/unit/agent-captcha-hooks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ class FakeCuaClient {
public captureScreenshot = vi.fn(async () => null);
public setViewport = vi.fn();
public setCurrentUrl = vi.fn();
public setScreenshotProvider = vi.fn();
public screenshotProvider?: () => Promise<string>;
public setScreenshotProvider = vi.fn((provider: () => Promise<string>) => {
this.screenshotProvider = provider;
});
public setSafetyConfirmationHandler = vi.fn();

setActionHandler(
Expand Down Expand Up @@ -138,6 +141,41 @@ describe("agent captcha hooks", () => {
fakeCuaClient = new FakeCuaClient();
});

// Verifies that invoking the screenshot provider registered on the CUA client
// also reports the active page's URL through setCurrentUrl.
it("keeps the CUA client URL in sync when the screenshot provider captures the active page", async () => {
// The handler is constructed only for its side effects; the instance is discarded.
// NOTE(review): presumably the constructor wires `fakeCuaClient` in via a module
// mock defined outside this hunk — confirm.
new V3CuaAgentHandler(
{
context: {
awaitActivePage: async () => page,
},
bus: { emit: vi.fn() },
isCaptchaAutoSolveEnabled: false,
isAdvancedStealth: false,
configuredViewport: { width: 1288, height: 711 },
isAgentReplayActive: () => false,
updateMetrics: vi.fn(),
} as never,
logger,
{
modelName: "anthropic/claude-haiku-4-5-20251001",
clientOptions: { waitBetweenActions: 1 },
} as never,
);

// Wait until setCurrentUrl has been called once with the page URL, then
// reset the mock so the next assertion only sees calls made by the provider.
await vi.waitFor(() => {
expect(fakeCuaClient.setCurrentUrl).toHaveBeenCalledWith(
"https://example.com",
);
});
fakeCuaClient.setCurrentUrl.mockClear();

// Call the provider captured by FakeCuaClient.setScreenshotProvider.
await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe(
Buffer.from("fake-image").toString("base64"),
);
// The provider call itself must have pushed the URL to the client again.
expect(fakeCuaClient.setCurrentUrl).toHaveBeenCalledWith(
"https://example.com",
);
});

it("blocks regular agent prepareStep until the solver finishes and injects one solved message", async () => {
const handler = new V3AgentHandler(
{
Expand Down
65 changes: 65 additions & 0 deletions packages/core/tests/unit/anthropic-cua-client.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { describe, expect, it, vi } from "vitest";
import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js";

/**
 * Builds an AnthropicCUAClient for the tests in this file, using a
 * placeholder API key.
 */
function createClient() {
  const clientOptions = { apiKey: "test-key" };
  const client = new AnthropicCUAClient(
    "anthropic",
    "claude-sonnet-4-5-20250929",
    undefined,
    clientOptions,
  );

  return client;
}

describe("AnthropicCUAClient", () => {
  // A custom tool that resolves to undefined should still yield the default
  // success text in the tool_result returned by takeAction.
  it("returns a success result when a custom tool completes with undefined", async () => {
    const toolExecute = vi.fn(async () => undefined);

    // Structural view of the client members this test reaches into.
    type ToolRegistry = Record<string, { execute: typeof toolExecute }>;
    type TakeAction = (
      output: unknown[],
      logger: (msg: unknown) => void,
    ) => Promise<unknown[]>;

    const client = createClient();
    const internals = client as unknown as {
      tools: ToolRegistry;
      takeAction: TakeAction;
    };

    internals.tools = {
      fillUsername: { execute: toolExecute },
    };

    const toolUseItem = {
      id: "tool-1",
      name: "fillUsername",
      input: {},
    };
    const result = await internals.takeAction([toolUseItem], vi.fn());

    const expectedToolResult = {
      type: "tool_result",
      tool_use_id: "tool-1",
      content: [
        {
          type: "text",
          text: "Tool executed successfully",
        },
      ],
    };

    expect(toolExecute).toHaveBeenCalledTimes(1);
    expect(result).toEqual([expectedToolResult]);
  });
});
Loading
Loading