Skip to content

Commit 5e73edd

Browse files
Whanod and claude authored
fix: wire image data through to Claude for screenshot/photo support (#168)
The bot was downloading and base64-encoding images but only passing a text prompt to Claude, never the actual image data. This sends images as multimodal content blocks via the SDK AsyncIterable path so Claude can actually see uploaded screenshots and photos.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 02d9f5e commit 5e73edd

4 files changed

Lines changed: 56 additions & 3 deletions

File tree

src/bot/features/image_handler.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,6 @@ async def process_image(
5757
else:
5858
prompt = self._create_generic_prompt(caption)
5959

60-
# Convert to base64 for Claude (if supported in future)
6160
base64_image = base64.b64encode(image_bytes).decode("utf-8")
6261

6362
return ProcessedImage(

src/bot/orchestrator.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,13 @@
4242

4343
logger = structlog.get_logger()
4444

45+
_MEDIA_TYPE_MAP = {
46+
"png": "image/png",
47+
"jpeg": "image/jpeg",
48+
"gif": "image/gif",
49+
"webp": "image/webp",
50+
}
51+
4552
# Patterns that look like secrets/credentials in CLI arguments
4653
_SECRET_PATTERNS: List[re.Pattern[str]] = [
4754
# API keys / tokens (sk-ant-..., sk-..., ghp_..., gho_..., github_pat_..., xoxb-...)
@@ -1353,13 +1360,22 @@ async def agentic_photo(
13531360
processed_image = await image_handler.process_image(
13541361
photo, update.message.caption
13551362
)
1363+
fmt = processed_image.metadata.get("format", "png")
1364+
images = [
1365+
{
1366+
"data": processed_image.base64_data,
1367+
"media_type": _MEDIA_TYPE_MAP.get(fmt, "image/png"),
1368+
}
1369+
]
1370+
13561371
await self._handle_agentic_media_message(
13571372
update=update,
13581373
context=context,
13591374
prompt=processed_image.prompt,
13601375
progress_msg=progress_msg,
13611376
user_id=user_id,
13621377
chat=chat,
1378+
images=images,
13631379
)
13641380

13651381
except Exception as e:
@@ -1420,6 +1436,7 @@ async def _handle_agentic_media_message(
14201436
progress_msg: Any,
14211437
user_id: int,
14221438
chat: Any,
1439+
images: Optional[List[Dict[str, str]]] = None,
14231440
) -> None:
14241441
"""Run a media-derived prompt through Claude and send responses."""
14251442
claude_integration = context.bot_data.get("claude_integration")
@@ -1456,6 +1473,7 @@ async def _handle_agentic_media_message(
14561473
session_id=session_id,
14571474
on_stream=on_stream,
14581475
force_new=force_new,
1476+
images=images,
14591477
)
14601478
finally:
14611479
heartbeat.cancel()

src/claude/facade.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ async def run_command(
3939
on_stream: Optional[Callable[[StreamUpdate], None]] = None,
4040
force_new: bool = False,
4141
interrupt_event: Optional["asyncio.Event"] = None,
42+
images: Optional[List[Dict[str, str]]] = None,
4243
) -> ClaudeResponse:
4344
"""Run Claude Code command with full integration."""
4445
logger.info(
@@ -88,6 +89,7 @@ async def run_command(
8889
continue_session=should_continue,
8990
stream_callback=on_stream,
9091
interrupt_event=interrupt_event,
92+
images=images,
9193
)
9294
except Exception as resume_error:
9395
# If resume failed (e.g., session expired/missing on Claude's side),
@@ -113,6 +115,7 @@ async def run_command(
113115
continue_session=False,
114116
stream_callback=on_stream,
115117
interrupt_event=interrupt_event,
118+
images=images,
116119
)
117120
else:
118121
raise
@@ -157,6 +160,7 @@ async def _execute(
157160
continue_session: bool = False,
158161
stream_callback: Optional[Callable] = None,
159162
interrupt_event: Optional[asyncio.Event] = None,
163+
images: Optional[List[Dict[str, str]]] = None,
160164
) -> ClaudeResponse:
161165
"""Execute command via SDK."""
162166
return await self.sdk_manager.execute_command(
@@ -166,6 +170,7 @@ async def _execute(
166170
continue_session=continue_session,
167171
stream_callback=stream_callback,
168172
interrupt_event=interrupt_event,
173+
images=images,
169174
)
170175

171176
async def _find_resumable_session(

src/claude/sdk_integration.py

Lines changed: 33 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import os
55
from dataclasses import dataclass, field
66
from pathlib import Path
7-
from typing import Any, Callable, Dict, List, Optional
7+
from typing import Any, AsyncIterator, Callable, Dict, List, Optional
88

99
import structlog
1010
from claude_agent_sdk import (
@@ -276,6 +276,7 @@ async def execute_command(
276276
continue_session: bool = False,
277277
stream_callback: Optional[Callable[[StreamUpdate], None]] = None,
278278
interrupt_event: Optional[asyncio.Event] = None,
279+
images: Optional[List[Dict[str, str]]] = None,
279280
) -> ClaudeResponse:
280281
"""Execute Claude Code command via SDK."""
281282
start_time = asyncio.get_event_loop().time()
@@ -369,7 +370,37 @@ async def _run_client() -> None:
369370
client = ClaudeSDKClient(options)
370371
try:
371372
await client.connect()
372-
await client.query(prompt)
373+
374+
if images:
375+
content_blocks: List[Dict[str, Any]] = []
376+
for img in images:
377+
media_type = img.get("media_type", "image/png")
378+
content_blocks.append(
379+
{
380+
"type": "image",
381+
"source": {
382+
"type": "base64",
383+
"media_type": media_type,
384+
"data": img["data"],
385+
},
386+
}
387+
)
388+
content_blocks.append({"type": "text", "text": prompt})
389+
390+
multimodal_msg = {
391+
"type": "user",
392+
"message": {
393+
"role": "user",
394+
"content": content_blocks,
395+
},
396+
}
397+
398+
async def _multimodal_prompt() -> AsyncIterator[Dict[str, Any]]:
399+
yield multimodal_msg
400+
401+
await client.query(_multimodal_prompt())
402+
else:
403+
await client.query(prompt)
373404

374405
async for raw_data in client._query.receive_messages():
375406
try:

0 commit comments

Comments
 (0)