From ad7d51f0f25a52579e451484c9b941c593cae72a Mon Sep 17 00:00:00 2001 From: bcode Date: Sat, 9 May 2026 03:12:11 +0000 Subject: [PATCH] feat(browser_execute): auto-attach Page.captureScreenshot results as image attachments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every successful Page.captureScreenshot made during a browser_execute call is now collected from the CDP transport and surfaced as a FilePart on the tool result. The opencode runner appends those attachments to the next assistant turn as image parts, so the model sees the screenshot natively as vision input. No more decode-write-read dance from inside the snippet. Same channel that read.ts and webfetch.ts already use when they surface images; we're adding browser_execute as a third producer. Mechanism (Level 1, zero upstream diff): - cdp/session.ts: new generic onCallResult(fn) listener API, symmetric with existing onEvent. Fires once per successful _call, just before its promise resolves. Keeps the Session agnostic of any one method's semantics. - browser-execute.ts (Level 1): subscribes for the duration of each execute() call, filters to Page.captureScreenshot, accumulates results into a per-call collector returned alongside output/result. When BCODE_SCREENSHOT_DIR is set, the same tap also writes each screenshot to disk (eval-judge consumption — second consumer of the same hook). - tool/browser-execute.ts (Level 2): maps the collector to attachments[] on the ExecuteResult. BROWSER.md and interaction-skills/screenshots.md updated to tell the agent about the auto-attach behavior. Two new smoke tests (gated on BCODE_SMOKE_CHROME) verify screenshots round-trip + the env-var disk dump. 
--- packages/bcode-browser/skills/BROWSER.md | 7 +- .../skills/interaction-skills/screenshots.md | 9 ++- packages/bcode-browser/src/browser-execute.ts | 69 ++++++++++++++++++- packages/bcode-browser/src/cdp/session.ts | 28 +++++++- .../test/browser-execute.test.ts | 57 +++++++++++++++ packages/opencode/src/tool/browser-execute.ts | 15 ++++ 6 files changed, 179 insertions(+), 6 deletions(-) diff --git a/packages/bcode-browser/skills/BROWSER.md b/packages/bcode-browser/skills/BROWSER.md index 423e4b048..f5194a557 100644 --- a/packages/bcode-browser/skills/BROWSER.md +++ b/packages/bcode-browser/skills/BROWSER.md @@ -133,8 +133,11 @@ await session.Input.dispatchMouseEvent({ type: "mouseReleased", x, y, button: "l await session.Input.insertText({ text: "hello" }) // Screenshot. -const { data } = await session.Page.captureScreenshot({ format: "png" }) -// data is base64; write with the `write` tool or process in JS. +await session.Page.captureScreenshot({ format: "png" }) +// You see the image inline on the next turn — `browser_execute` automatically +// attaches every `Page.captureScreenshot` result. No need to decode, save, or +// `read` the bytes back. The base64 is still in `data` (via the return value) +// for the rare case you want to process it programmatically. ``` For the full menu of UI mechanics — dropdowns, dialogs, iframes, shadow DOM, uploads, scrolling, screenshots-with-highlights — list `{{SKILLS_DIR}}/interaction-skills/` to see all available topics, then read the relevant one. diff --git a/packages/bcode-browser/skills/interaction-skills/screenshots.md b/packages/bcode-browser/skills/interaction-skills/screenshots.md index 94c5a6962..6d1146992 100644 --- a/packages/bcode-browser/skills/interaction-skills/screenshots.md +++ b/packages/bcode-browser/skills/interaction-skills/screenshots.md @@ -2,12 +2,19 @@ `session.Page.captureScreenshot` is your default discovery and verification tool. 
+**Auto-attached.** Every successful `Page.captureScreenshot` made during a `browser_execute` call is automatically surfaced to you as an inline image attachment on the next turn — same channel the `read` tool uses for image files. You don't need to decode the base64, save it, or `read` it back to see the image. + +The `data` field on the return value still carries the base64 string for the rare case where you want to process the image programmatically (OCR, diff against a previous shot, dimension extraction). + ## Core calls ```js // Viewport only (default) — fastest, matches what the user sees +await session.Page.captureScreenshot({ format: 'png' }) +// You'll see the image inline on the next turn. No write/read step needed. + +// If you do want the bytes (e.g. to write to disk yourself): const { data } = await session.Page.captureScreenshot({ format: 'png' }) -// Cross-platform temp dir: /tmp on Linux, /var/folders/… on macOS, %TEMP% on Windows const { tmpdir } = await import('node:os') await Bun.write(`${tmpdir()}/shot.png`, Buffer.from(data, 'base64')) diff --git a/packages/bcode-browser/src/browser-execute.ts b/packages/bcode-browser/src/browser-execute.ts index 17bebdbc2..64a6802d4 100644 --- a/packages/bcode-browser/src/browser-execute.ts +++ b/packages/bcode-browser/src/browser-execute.ts @@ -41,6 +41,7 @@ // Level-2 hook in packages/opencode is a thin adapter. import fs from "fs/promises" +import path from "path" import { Effect, Schema } from "effect" import { SessionStore } from "./session-store" import { Skills } from "./skills" @@ -78,14 +79,44 @@ export interface ExecuteContext { readonly onChunk?: (output: string) => Effect.Effect } +// One screenshot collected during an execute() call. Drained into the +// Level-2 wrapper's `attachments[]` so the agent sees the image inline on the +// next assistant turn — no decode/write/read dance from inside the snippet. 
+export interface CollectedScreenshot { + readonly mime: "image/png" | "image/jpeg" | "image/webp" + readonly base64: string +} + export interface ExecuteResult { readonly output: string // The snippet's `return` value, JSON-serialized when possible. `undefined` // serializes as `null` (JSON has no undefined). Non-serializable values // fall back to `String(v)`. readonly result: string + // Every successful `Page.captureScreenshot` made by the snippet, in the + // order the CDP responses came back. Empty when the snippet didn't take + // any screenshots. + readonly screenshots: readonly CollectedScreenshot[] +} + +const SCREENSHOT_FORMAT_TO_MIME: Record = { + png: "image/png", + jpeg: "image/jpeg", + webp: "image/webp", +} + +const SCREENSHOT_FORMAT_TO_EXT: Record = { + png: "png", + jpeg: "jpg", + webp: "webp", } +const screenshotMime = (format: unknown): CollectedScreenshot["mime"] => + SCREENSHOT_FORMAT_TO_MIME[typeof format === "string" ? format : "png"] ?? "image/png" + +const screenshotExt = (format: unknown): string => + SCREENSHOT_FORMAT_TO_EXT[typeof format === "string" ? format : "png"] ?? "png" + // AsyncFunction is not a global — pull it off an async arrow's constructor. const AsyncFunction = (async () => {}).constructor as new ( ...args: string[] @@ -145,12 +176,46 @@ export const make = Effect.fn("BrowserExecute.make")(function* (dataDir: string) debug: tee, }) + // Screenshot tap. Subscribes to the Session's call-result stream for + // the duration of this execute() call; every successful + // `Page.captureScreenshot` is collected (drained into `attachments[]` + // by the Level-2 wrapper so the agent sees the image inline) and, + // when `BCODE_SCREENSHOT_DIR` is set, also written to disk for + // eval-judge consumption. Two consumers of one tap. 
+ // + // Concurrency note: parallel execute() calls against the same Session + // (rare but possible — different sessionIDs share no Session, but a + // single sessionID with two in-flight tool calls would) each subscribe + // independently and would each see all screenshots produced during + // their lifetime. Acceptable for v1; opencode tool calls within one + // assistant message are serialized anyway. + const screenshots: CollectedScreenshot[] = [] + const dumpDir = process.env.BCODE_SCREENSHOT_DIR + const startedAt = Date.now() + let seq = 0 + const unsubscribe = session.onCallResult((method, params, result) => { + if (method !== "Page.captureScreenshot") return + const r = result as { data?: unknown } + if (typeof r?.data !== "string") return + const p = (params ?? {}) as { format?: unknown } + const mime = screenshotMime(p.format) + const ext = screenshotExt(p.format) + const idx = seq++ + screenshots.push({ mime, base64: r.data }) + if (dumpDir) { + const filename = `${ctx.sessionID}-${startedAt}-${String(idx).padStart(3, "0")}.${ext}` + fs.mkdir(dumpDir, { recursive: true }) + .then(() => fs.writeFile(path.join(dumpDir, filename), Buffer.from(r.data as string, "base64"))) + .catch(() => { /* eval-side dump is best-effort */ }) + } + }) + const ran = yield* Effect.tryPromise({ try: () => wrapped(session, snippetConsole), catch: (err) => new Error(`browser_execute snippet threw: ${err instanceof Error ? err.stack ?? 
err.message : String(err)}`), - }) + }).pipe(Effect.ensuring(Effect.sync(() => unsubscribe()))) - return { output, result: serialize(ran) } satisfies ExecuteResult + return { output, result: serialize(ran), screenshots } satisfies ExecuteResult }).pipe( Effect.scoped, Effect.timeoutOrElse({ diff --git a/packages/bcode-browser/src/cdp/session.ts b/packages/bcode-browser/src/cdp/session.ts index 66327a148..6fb400f5a 100644 --- a/packages/bcode-browser/src/cdp/session.ts +++ b/packages/bcode-browser/src/cdp/session.ts @@ -47,6 +47,7 @@ export class Session implements Transport { private pending = new Map(); private activeSessionId: string | undefined; private eventListeners: Array<(method: string, params: unknown, sessionId?: string) => void> = []; + private callResultListeners: Array<(method: string, params: unknown, result: unknown) => void> = []; // Generated bindings — one per CDP domain. // Initialized lazily after construction so `_call` is available. @@ -170,6 +171,23 @@ export class Session implements Transport { }; } + /** + * Subscribe to all successful CDP method results. Returns an unsubscribe fn. + * Fires just before the `_call` promise resolves; listener errors are swallowed. + * + * Used by `browser-execute` to collect `Page.captureScreenshot` outputs + * from inside an execute() call (drained into `attachments[]` so the agent + * sees the image inline; optionally also written to `BCODE_SCREENSHOT_DIR` + * for eval-judge consumption). Generic by design — keeps `Session` + * agnostic of any one method's semantics. + */ + onCallResult(fn: (method: string, params: unknown, result: unknown) => void): () => void { + this.callResultListeners.push(fn); + return () => { + this.callResultListeners = this.callResultListeners.filter(x => x !== fn); + }; + } + /** Wait for the next event matching `method` (and optional predicate). 
*/ waitFor(method: string, predicate?: (params: T) => boolean, timeoutMs = 30_000): Promise { return new Promise((resolve, reject) => { @@ -198,7 +216,15 @@ export class Session implements Transport { msg.sessionId = this.activeSessionId; } return new Promise((resolve, reject) => { - this.pending.set(id, { resolve, reject }); + this.pending.set(id, { + resolve: (v) => { + for (const fn of this.callResultListeners) { + try { fn(method, params, v); } catch { /* ignore */ } + } + resolve(v); + }, + reject, + }); this.ws!.send(JSON.stringify(msg)); }); } diff --git a/packages/bcode-browser/test/browser-execute.test.ts b/packages/bcode-browser/test/browser-execute.test.ts index 1c2f2967b..20a4e5670 100644 --- a/packages/bcode-browser/test/browser-execute.test.ts +++ b/packages/bcode-browser/test/browser-execute.test.ts @@ -113,6 +113,63 @@ test.skipIf(!enabled)("workspace import inside a snippet", async () => { expect(JSON.parse(result.result)).toBe("bcode-be") }) +test.skipIf(!enabled)("Page.captureScreenshot is collected into result.screenshots", async () => { + const result = await Effect.runPromise( + Effect.scoped( + Effect.gen(function* () { + const impl = yield* BrowserExecute.make(dataDir) + return yield* impl.execute( + { + code: `await session.Page.enable(); + await session.Page.navigate({ url: "data:text/html,shothi" }); + await session.waitFor("Page.loadEventFired", undefined, 5000); + const a = await session.Page.captureScreenshot({ format: "png" }); + const b = await session.Page.captureScreenshot({ format: "jpeg", quality: 50 }); + return { aLen: a.data.length, bLen: b.data.length };`, + }, + { sessionID, workspaceDir }, + ) + }), + ), + ) + expect(result.screenshots).toHaveLength(2) + expect(result.screenshots[0]!.mime).toBe("image/png") + expect(result.screenshots[1]!.mime).toBe("image/jpeg") + // base64 must round-trip back to non-empty bytes for both shots. 
+ expect(Buffer.from(result.screenshots[0]!.base64, "base64").length).toBeGreaterThan(0) + expect(Buffer.from(result.screenshots[1]!.base64, "base64").length).toBeGreaterThan(0) +}) + +test.skipIf(!enabled)("BCODE_SCREENSHOT_DIR dumps screenshots to disk", async () => { + const dump = await fs.mkdtemp(path.join(os.tmpdir(), "bcode-shotdump-")) + const prev = process.env.BCODE_SCREENSHOT_DIR + process.env.BCODE_SCREENSHOT_DIR = dump + try { + await Effect.runPromise( + Effect.scoped( + Effect.gen(function* () { + const impl = yield* BrowserExecute.make(dataDir) + return yield* impl.execute( + { + code: `await session.Page.captureScreenshot({ format: "png" });`, + }, + { sessionID, workspaceDir }, + ) + }), + ), + ) + // Disk dump is fire-and-forget; give it a tick to land. + await new Promise((r) => setTimeout(r, 150)) + const files = await fs.readdir(dump) + expect(files.length).toBeGreaterThan(0) + expect(files.every((f) => f.endsWith(".png"))).toBe(true) + } finally { + if (prev === undefined) delete process.env.BCODE_SCREENSHOT_DIR + else process.env.BCODE_SCREENSHOT_DIR = prev + await fs.rm(dump, { recursive: true, force: true }) + } +}) + test.skipIf(!enabled)("syntax error in snippet surfaces a clean failure", async () => { await expect( Effect.runPromise( diff --git a/packages/opencode/src/tool/browser-execute.ts b/packages/opencode/src/tool/browser-execute.ts index f0632f732..e7dd420c5 100644 --- a/packages/opencode/src/tool/browser-execute.ts +++ b/packages/opencode/src/tool/browser-execute.ts @@ -55,15 +55,30 @@ export const BrowserExecuteTool = Tool.define( metadata: { output: preview(output) }, }), }) + // Drain every `Page.captureScreenshot` made during this snippet + // into `attachments[]`. Opencode appends FilePart attachments to + // the next assistant turn as image parts, so the model receives + // the screenshot natively as vision input — no decode/write/read + // dance from inside the snippet. 
Same channel `read` and + // `webfetch` use when they surface images. + const attachments = result.screenshots.map((s) => ({ + type: "file" as const, + mime: s.mime, + url: `data:${s.mime};base64,${s.base64}`, + })) return { title: "browser_execute", output: [ result.output.trimEnd(), result.result === "null" ? "" : `=> ${result.result}`, + attachments.length > 0 + ? `(${attachments.length} screenshot${attachments.length === 1 ? "" : "s"} attached)` + : "", ] .filter(Boolean) .join("\n\n"), metadata: { result: result.result, output: preview(result.output) }, + attachments, } }).pipe(Effect.orDie), }