feat(mcp): return screen images as native MCP image content blocks

trmquang93 · trmquang93 · commit 4ab814de8ff5 · 2026-03-27T23:35:41.000+07:00
When get_screen is called with includeImage: true, return two content
blocks instead of embedding the data URI in JSON text: a text block
with screen metadata and a type:"image" block with raw base64 PNG.
Multimodal LLMs can now see the screen directly. The tool handler
returns its blocks under a __contentBlocks escape hatch, which server.js
passes through as the content array of a multi-content tool response.
diff --git a/mcp-server/src/server.js b/mcp-server/src/server.js
@@ -72,6 +72,9 @@ export function createServer(state, renderer) {
         };
       }
 
+      if (result && result.__contentBlocks) {
+        return { content: result.__contentBlocks };
+      }
       return {
         content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
       };
diff --git a/mcp-server/src/tools/screen-tools.js b/mcp-server/src/tools/screen-tools.js
@@ -226,8 +226,22 @@ export async function handleScreenTool(name, args, state, renderer) {
       if (!args.includeImage) {
         delete result.imageData;
         result.hasImage = !!screen.imageData;
+        return result;
       }
-      return result;
+
+      // Return image as a native MCP image content block so multimodal LLMs can see it
+      const imageData = result.imageData;
+      delete result.imageData;
+      result.hasImage = !!imageData;
+
+      const content = [
+        { type: "text", text: JSON.stringify(result, null, 2) },
+      ];
+      if (imageData) {
+        const base64 = imageData.replace(/^data:image\/\w+;base64,/, "");
+        content.push({ type: "image", data: base64, mimeType: "image/png" });
+      }
+      return { __contentBlocks: content };
     }
 
     case "update_screen_image": {

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,9 @@ export function createServer(state, renderer) {`
`72`	`72`	`};`
`73`	`73`	`}`
`74`	`74`
	`75`	`+ if (result && result.__contentBlocks) {`
	`76`	`+ return { content: result.__contentBlocks };`
	`77`	`+ }`
`75`	`78`	`return {`
`76`	`79`	`content: [{ type: "text", text: JSON.stringify(result, null, 2) }],`
`77`	`80`	`};`