Skip to content

Commit 4ab814d

Browse files
committed
feat(mcp): return screen images as native MCP image content blocks
When get_screen is called with includeImage: true, return two content blocks instead of embedding the data URI in the JSON text: a text block containing the screen metadata, and a type: "image" block containing the raw base64 PNG data (with the data-URI prefix stripped). Multimodal LLMs can now see the screen directly. A __contentBlocks escape hatch in server.js lets a tool handler return multiple content blocks in a single tool response.
1 parent 54e0f13 commit 4ab814d

2 files changed

Lines changed: 18 additions & 1 deletion

File tree

mcp-server/src/server.js

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,9 @@ export function createServer(state, renderer) {
7272
};
7373
}
7474

75+
if (result && result.__contentBlocks) {
76+
return { content: result.__contentBlocks };
77+
}
7578
return {
7679
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
7780
};

mcp-server/src/tools/screen-tools.js

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -226,8 +226,22 @@ export async function handleScreenTool(name, args, state, renderer) {
226226
if (!args.includeImage) {
227227
delete result.imageData;
228228
result.hasImage = !!screen.imageData;
229+
return result;
229230
}
230-
return result;
231+
232+
// Return image as a native MCP image content block so multimodal LLMs can see it
233+
const imageData = result.imageData;
234+
delete result.imageData;
235+
result.hasImage = !!imageData;
236+
237+
const content = [
238+
{ type: "text", text: JSON.stringify(result, null, 2) },
239+
];
240+
if (imageData) {
241+
const base64 = imageData.replace(/^data:image\/\w+;base64,/, "");
242+
content.push({ type: "image", data: base64, mimeType: "image/png" });
243+
}
244+
return { __contentBlocks: content };
231245
}
232246

233247
case "update_screen_image": {

0 commit comments

Comments
 (0)