diff --git a/packages/core/lib/v3/agent/AnthropicCUAClient.ts b/packages/core/lib/v3/agent/AnthropicCUAClient.ts index 4bbe2d47b..04cc191f4 100644 --- a/packages/core/lib/v3/agent/AnthropicCUAClient.ts +++ b/packages/core/lib/v3/agent/AnthropicCUAClient.ts @@ -27,6 +27,10 @@ import { extractLlmCuaResponseSummary, } from "../flowlogger/FlowLogger.js"; import { v7 as uuidv7 } from "uuid"; +import { + DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT, + formatCustomToolResult, +} from "./utils/customToolResult.js"; export type ResponseInputItem = AnthropicMessage | AnthropicToolResult; @@ -702,7 +706,7 @@ export class AnthropicCUAClient extends AgentClient { }); } else { // Handle custom tools - let toolResult = "Tool executed successfully"; + let toolResult = DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT; if (this.tools && item.name in this.tools) { try { const tool = this.tools[item.name]; @@ -717,7 +721,7 @@ export class AnthropicCUAClient extends AgentClient { toolCallId: item.id, messages: [], }); - toolResult = JSON.stringify(result); + toolResult = formatCustomToolResult(result); logger({ category: "agent", diff --git a/packages/core/lib/v3/agent/GoogleCUAClient.ts b/packages/core/lib/v3/agent/GoogleCUAClient.ts index 281aa765f..0e4ec5647 100644 --- a/packages/core/lib/v3/agent/GoogleCUAClient.ts +++ b/packages/core/lib/v3/agent/GoogleCUAClient.ts @@ -486,6 +486,42 @@ export class GoogleCUAClient extends AgentClient { if (result.actions.length > 0) { let hasError = false; + let attemptedPostActionScreenshot = false; + let postActionScreenshotBase64: string | undefined; + + const getPostActionScreenshotBase64 = async (): Promise< + string | undefined + > => { + if (attemptedPostActionScreenshot) { + return postActionScreenshotBase64; + } + + attemptedPostActionScreenshot = true; + + try { + logger({ + category: "agent", + message: `Taking screenshot after executing ${result.actions.length} actions${hasError ? " (with errors)" : ""}`, + level: 2, + }); + + const screenshot = await this.captureScreenshot(); + postActionScreenshotBase64 = screenshot.replace( + /^data:image\/png;base64,/, + "", + ); + + return postActionScreenshotBase64; + } catch (error) { + logger({ + category: "agent", + message: `Error capturing screenshot: ${error}`, + level: 0, + }); + + return undefined; + } + }; // Execute all actions for (let i = 0; i < result.actions.length; i++) { @@ -559,6 +595,31 @@ export class GoogleCUAClient extends AgentClient { } } + if (functionResponses.length > 0) { + const base64Data = await getPostActionScreenshotBase64(); + if (base64Data) { + for (const functionResponsePart of functionResponses) { + if (!functionResponsePart.functionResponse) { + continue; + } + + functionResponsePart.functionResponse.response = { + url: this.currentUrl || "", + ...functionResponsePart.functionResponse.response, + }; + functionResponsePart.functionResponse.parts = [ + ...(functionResponsePart.functionResponse.parts || []), + { + inlineData: { + mimeType: "image/png", + data: base64Data, + }, + }, + ]; + } + } + } + // Create function responses for computer use actions (non-custom tools) // We need exactly one response per function call, regardless of how many actions were generated if (result.functionCalls.length > 0 || hasError) { @@ -568,19 +629,9 @@ export class GoogleCUAClient extends AgentClient { ); if (computerUseFunctionCalls.length > 0) { - try { - logger({ - category: "agent", - message: `Taking screenshot after executing ${result.actions.length} actions${hasError ? " (with errors)" : ""}`, - level: 2, - }); - - const screenshot = await this.captureScreenshot(); - const base64Data = screenshot.replace( - /^data:image\/png;base64,/, - "", - ); + const base64Data = await getPostActionScreenshotBase64(); + if (base64Data) { // Create one function response for each computer use function call // Following Python SDK pattern: FunctionResponse with parts containing inline_data for (const functionCall of computerUseFunctionCalls) { @@ -615,12 +666,6 @@ export class GoogleCUAClient extends AgentClient { }; functionResponses.push(functionResponsePart); } - } catch (error) { - logger({ - category: "agent", - message: `Error capturing screenshot: ${error}`, - level: 0, - }); } } } diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts index 184346381..5ef90a134 100644 --- a/packages/core/lib/v3/agent/OpenAICUAClient.ts +++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts @@ -30,6 +30,10 @@ import { extractLlmCuaResponseSummary, } from "../flowlogger/FlowLogger.js"; import { v7 as uuidv7 } from "uuid"; +import { + DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT, + formatCustomToolResult, +} from "./utils/customToolResult.js"; /** * Client for OpenAI's Computer Use Assistant API @@ -803,7 +807,7 @@ export class OpenAICUAClient extends AgentClient { } // Execute the tool if available - let toolResult = "Tool executed successfully"; + let toolResult = DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT; if (this.tools && item.name in this.tools) { try { const tool = this.tools[item.name]; @@ -819,7 +823,7 @@ export class OpenAICUAClient extends AgentClient { toolCallId: item.call_id, messages: [], }); - toolResult = JSON.stringify(result); + toolResult = formatCustomToolResult(result); logger({ category: "agent", diff --git a/packages/core/lib/v3/agent/utils/customToolResult.ts b/packages/core/lib/v3/agent/utils/customToolResult.ts new file mode 100644 index 000000000..ef1708624 --- /dev/null +++ b/packages/core/lib/v3/agent/utils/customToolResult.ts @@ -0,0 +1,5 @@ +export const DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT = "Tool executed successfully"; + +export function formatCustomToolResult(toolResult: unknown): string { + return JSON.stringify(toolResult) ?? DEFAULT_CUSTOM_TOOL_SUCCESS_RESULT; +} diff --git a/packages/core/lib/v3/agent/utils/googleCustomToolHandler.ts b/packages/core/lib/v3/agent/utils/googleCustomToolHandler.ts index 986aa187e..69075ce03 100644 --- a/packages/core/lib/v3/agent/utils/googleCustomToolHandler.ts +++ b/packages/core/lib/v3/agent/utils/googleCustomToolHandler.ts @@ -3,6 +3,7 @@ import { ToolSet } from "ai"; import { LogLine } from "../../types/public/logs.js"; import { toJsonSchema } from "../../zodCompat.js"; import type { StagehandZodSchema } from "../../zodCompat.js"; +import { formatCustomToolResult } from "./customToolResult.js"; /** * Result of executing a custom tool for Google CUA @@ -36,10 +37,11 @@ export async function executeGoogleCustomTool( toolCallId: `tool_${Date.now()}`, messages: [], }); + const formattedToolResult = formatCustomToolResult(toolResult); logger({ category: "agent", - message: `Tool ${toolName} completed successfully. Result: ${JSON.stringify(toolResult)}`, + message: `Tool ${toolName} completed successfully. Result: ${formattedToolResult}`, level: 1, }); @@ -48,7 +50,7 @@ export async function executeGoogleCustomTool( functionResponse: { name: toolName, response: { - result: JSON.stringify(toolResult), + result: formattedToolResult, }, }, }; diff --git a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts index 6cefa4b4d..feefd55dc 100644 --- a/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts +++ b/packages/core/lib/v3/handlers/v3CuaAgentHandler.ts @@ -76,6 +76,7 @@ export class V3CuaAgentHandler { this.ensureNotClosed(); const page = await this.v3.context.awaitActivePage(); const screenshotBuffer = await page.screenshot({ fullPage: false }); + this.agentClient.setCurrentUrl(page.url()); return screenshotBuffer.toString("base64"); // base64 png }); diff --git a/packages/core/tests/unit/agent-captcha-hooks.test.ts b/packages/core/tests/unit/agent-captcha-hooks.test.ts index b3d584c25..fabd50ace 100644 --- a/packages/core/tests/unit/agent-captcha-hooks.test.ts +++ b/packages/core/tests/unit/agent-captcha-hooks.test.ts @@ -72,7 +72,10 @@ class FakeCuaClient { public captureScreenshot = vi.fn(async () => null); public setViewport = vi.fn(); public setCurrentUrl = vi.fn(); - public setScreenshotProvider = vi.fn(); + public screenshotProvider?: () => Promise; + public setScreenshotProvider = vi.fn((provider: () => Promise) => { + this.screenshotProvider = provider; + }); public setSafetyConfirmationHandler = vi.fn(); setActionHandler( @@ -138,6 +141,41 @@ describe("agent captcha hooks", () => { fakeCuaClient = new FakeCuaClient(); }); + it("keeps the CUA client URL in sync when the screenshot provider captures the active page", async () => { + new V3CuaAgentHandler( + { + context: { + awaitActivePage: async () => page, + }, + bus: { emit: vi.fn() }, + isCaptchaAutoSolveEnabled: false, + isAdvancedStealth: false, + configuredViewport: { width: 1288, height: 711 }, + isAgentReplayActive: () => false, + updateMetrics: vi.fn(), + } as never, + logger, + { + modelName: "anthropic/claude-haiku-4-5-20251001", + clientOptions: { waitBetweenActions: 1 }, + } as never, + ); + + await vi.waitFor(() => { + expect(fakeCuaClient.setCurrentUrl).toHaveBeenCalledWith( + "https://example.com", + ); + }); + fakeCuaClient.setCurrentUrl.mockClear(); + + await expect(fakeCuaClient.screenshotProvider?.()).resolves.toBe( + Buffer.from("fake-image").toString("base64"), + ); + expect(fakeCuaClient.setCurrentUrl).toHaveBeenCalledWith( + "https://example.com", + ); + }); + it("blocks regular agent prepareStep until the solver finishes and injects one solved message", async () => { const handler = new V3AgentHandler( { diff --git a/packages/core/tests/unit/anthropic-cua-client.test.ts b/packages/core/tests/unit/anthropic-cua-client.test.ts new file mode 100644 index 000000000..d7a98e1a7 --- /dev/null +++ b/packages/core/tests/unit/anthropic-cua-client.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, it, vi } from "vitest"; +import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js"; + +function createClient() { + return new AnthropicCUAClient( + "anthropic", + "claude-sonnet-4-5-20250929", + undefined, + { apiKey: "test-key" }, + ); +} + +describe("AnthropicCUAClient", () => { + it("returns a success result when a custom tool completes with undefined", async () => { + const client = createClient(); + const toolExecute = vi.fn(async () => undefined); + + ( + client as unknown as { + tools: Record< + string, + { + execute: typeof toolExecute; + } + >; + } + ).tools = { + fillUsername: { + execute: toolExecute, + }, + }; + + const result = await ( + client as unknown as { + takeAction: ( + output: unknown[], + logger: (msg: unknown) => void, + ) => Promise; + } + ).takeAction( + [ + { + id: "tool-1", + name: "fillUsername", + input: {}, + }, + ], + vi.fn(), + ); + + expect(toolExecute).toHaveBeenCalledTimes(1); + expect(result).toEqual([ + { + type: "tool_result", + tool_use_id: "tool-1", + content: [ + { + type: "text", + text: "Tool executed successfully", + }, + ], + }, + ]); + }); +}); diff --git a/packages/core/tests/unit/google-cua-client.test.ts b/packages/core/tests/unit/google-cua-client.test.ts new file mode 100644 index 000000000..7806f07d4 --- /dev/null +++ b/packages/core/tests/unit/google-cua-client.test.ts @@ -0,0 +1,275 @@ +import { describe, expect, it, vi } from "vitest"; +import { GoogleCUAClient } from "../../lib/v3/agent/GoogleCUAClient.js"; +import type { Content } from "@google/genai"; + +function createClient() { + return new GoogleCUAClient( + "google", + "google/gemini-2.5-computer-use-preview-10-2025", + "test instructions", + { apiKey: "test" }, + ); +} + +describe("GoogleCUAClient", () => { + it("returns a fresh screenshot after executing a custom tool", async () => { + const client = createClient(); + const screenshotProvider = vi.fn(async () => "fresh-screenshot-base64"); + client.setScreenshotProvider(screenshotProvider); + client.setCurrentUrl("http://127.0.0.1:6789/"); + + const toolExecute = vi.fn(async () => ({ filled: true })); + ( + client as unknown as { + tools: Record< + string, + { + execute: typeof toolExecute; + } + >; + } + ).tools = { + fillUsername: { + execute: toolExecute, + }, + }; + + const generateContent = vi.fn(async () => ({ + candidates: [ + { + content: { + role: "model", + parts: [ + { + functionCall: { + name: "fillUsername", + args: {}, + }, + }, + ], + }, + finishReason: "STOP", + }, + ], + usageMetadata: { + promptTokenCount: 1, + candidatesTokenCount: 1, + }, + })); + + ( + client as unknown as { + client: { + models: { + generateContent: typeof generateContent; + }; + }; + } + ).client = { + models: { + generateContent, + }, + }; + + await client.executeStep(vi.fn()); + + expect(toolExecute).toHaveBeenCalledTimes(1); + expect(screenshotProvider).toHaveBeenCalledTimes(1); + + const history = (client as unknown as { history: Content[] }).history; + const userResponse = history[history.length - 1]; + const functionResponse = userResponse.parts?.[0]?.functionResponse; + + expect(functionResponse).toMatchObject({ + name: "fillUsername", + response: { + result: JSON.stringify({ filled: true }), + url: "http://127.0.0.1:6789/", + }, + parts: [ + { + inlineData: { + mimeType: "image/png", + data: "fresh-screenshot-base64", + }, + }, + ], + }); + }); + + it("returns a success result and fresh screenshot when a custom tool completes with undefined", async () => { + const client = createClient(); + const screenshotProvider = vi.fn(async () => "fresh-screenshot-base64"); + client.setScreenshotProvider(screenshotProvider); + client.setCurrentUrl("http://127.0.0.1:6789/"); + + const toolExecute = vi.fn(async () => undefined); + ( + client as unknown as { + tools: Record< + string, + { + execute: typeof toolExecute; + } + >; + } + ).tools = { + fillUsername: { + execute: toolExecute, + }, + }; + + const generateContent = vi.fn(async () => ({ + candidates: [ + { + content: { + role: "model", + parts: [ + { + functionCall: { + name: "fillUsername", + args: {}, + }, + }, + ], + }, + finishReason: "STOP", + }, + ], + usageMetadata: { + promptTokenCount: 1, + candidatesTokenCount: 1, + }, + })); + + ( + client as unknown as { + client: { + models: { + generateContent: typeof generateContent; + }; + }; + } + ).client = { + models: { + generateContent, + }, + }; + + await client.executeStep(vi.fn()); + + expect(toolExecute).toHaveBeenCalledTimes(1); + expect(screenshotProvider).toHaveBeenCalledTimes(1); + + const history = (client as unknown as { history: Content[] }).history; + const userResponse = history[history.length - 1]; + const functionResponse = userResponse.parts?.[0]?.functionResponse; + + expect(functionResponse).toMatchObject({ + name: "fillUsername", + response: { + result: "Tool executed successfully", + url: "http://127.0.0.1:6789/", + }, + parts: [ + { + inlineData: { + mimeType: "image/png", + data: "fresh-screenshot-base64", + }, + }, + ], + }); + }); + + it("reuses one fresh screenshot for custom and computer-use responses in the same step", async () => { + const client = createClient(); + const screenshotProvider = vi.fn(async () => "shared-screenshot-base64"); + const actionHandler = vi.fn(async () => {}); + client.setScreenshotProvider(screenshotProvider); + client.setActionHandler(actionHandler); + client.setCurrentUrl("http://127.0.0.1:6789/"); + + const toolExecute = vi.fn(async () => ({ filled: true })); + ( + client as unknown as { + tools: Record< + string, + { + execute: typeof toolExecute; + } + >; + } + ).tools = { + fillUsername: { + execute: toolExecute, + }, + }; + + const generateContent = vi.fn(async () => ({ + candidates: [ + { + content: { + role: "model", + parts: [ + { + functionCall: { + name: "fillUsername", + args: {}, + }, + }, + { + functionCall: { + name: "click_at", + args: { x: 500, y: 500, button: "left" }, + }, + }, + ], + }, + finishReason: "STOP", + }, + ], + usageMetadata: { + promptTokenCount: 1, + candidatesTokenCount: 1, + }, + })); + + ( + client as unknown as { + client: { + models: { + generateContent: typeof generateContent; + }; + }; + } + ).client = { + models: { + generateContent, + }, + }; + + await client.executeStep(vi.fn()); + + expect(toolExecute).toHaveBeenCalledTimes(1); + expect(actionHandler).toHaveBeenCalledTimes(1); + expect(screenshotProvider).toHaveBeenCalledTimes(1); + + const history = (client as unknown as { history: Content[] }).history; + const userResponse = history[history.length - 1]; + const functionResponses = userResponse.parts?.map( + (part) => part.functionResponse, + ); + + expect(functionResponses).toHaveLength(2); + expect(functionResponses?.map((response) => response?.name)).toEqual([ + "fillUsername", + "click_at", + ]); + expect( + functionResponses?.map( + (response) => response?.parts?.[0]?.inlineData?.data, + ), + ).toEqual(["shared-screenshot-base64", "shared-screenshot-base64"]); + }); +}); diff --git a/packages/core/tests/unit/openai-cua-client.test.ts b/packages/core/tests/unit/openai-cua-client.test.ts index 9a3381c80..aba9c1b7e 100644 --- a/packages/core/tests/unit/openai-cua-client.test.ts +++ b/packages/core/tests/unit/openai-cua-client.test.ts @@ -87,6 +87,54 @@ describe("OpenAICUAClient", () => { ]); }); + it("returns a success result when a custom tool completes with undefined", async () => { + const client = createClient(); + const toolExecute = vi.fn(async () => undefined); + + ( + client as unknown as { + tools: Record< + string, + { + execute: typeof toolExecute; + } + >; + } + ).tools = { + fillUsername: { + execute: toolExecute, + }, + }; + + const result = await ( + client as unknown as { + takeAction: ( + output: unknown[], + logger: (msg: unknown) => void, + ) => Promise; + } + ).takeAction( + [ + { + type: "function_call", + name: "fillUsername", + call_id: "call-1", + arguments: "{}", + }, + ], + vi.fn(), + ); + + expect(toolExecute).toHaveBeenCalledTimes(1); + expect(result).toEqual([ + { + type: "function_call_output", + call_id: "call-1", + output: "Tool executed successfully", + }, + ]); + }); + it("does NOT auto-continue follow-up questions without a captcha context", async () => { const client = createClient(); // No captcha context note — no tool should be exposed