Skip to content

Commit 7c1bf89

Browse files
committed
Use Anthropic for vision tool.
1 parent f3ba4fe commit 7c1bf89

4 files changed

Lines changed: 137 additions & 47 deletions

File tree

Lines changed: 81 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,106 @@
"""Vision tool for analyzing images using Anthropic's vision-capable Messages API."""
from typing import IO, Optional
import os
from pathlib import Path
import base64
import tempfile
import requests
import anthropic


__all__ = ["analyze_image"]


# Prompt sent to the model alongside the image; override via VISION_PROMPT.
PROMPT = os.getenv('VISION_PROMPT', "What's in this image?")
# Anthropic model used for analysis; override via VISION_MODEL.
MODEL = os.getenv('VISION_MODEL', "claude-3-5-sonnet-20241022")
# Maximum tokens in the model's reply. os.getenv returns a *string* whenever
# the env var is set, and the Anthropic client requires an integer — coerce.
MAX_TOKENS = int(os.getenv('VISION_MAX_TOKENS', 1024))

11-
def analyze_image(image_path_url: str) -> str:
12-
"""
13-
Analyze an image using OpenAI's Vision API.
15+
MEDIA_TYPES = {
16+
"jpg": "image/jpeg",
17+
"jpeg": "image/jpeg",
18+
"png": "image/png",
19+
"gif": "image/gif",
20+
"webp": "image/webp",
21+
}
1422

15-
Args:
16-
image_path_url: Local path or URL to the image
23+
# image sizes that will not be resized
24+
# TODO is there any value in resizing pre-upload?
25+
# 1:1 1092x1092 px
26+
# 3:4 951x1268 px
27+
# 2:3 896x1344 px
28+
# 9:16 819x1456 px
29+
# 1:2 784x1568 px
1730

18-
Returns:
19-
str: Description of the image contents
20-
"""
21-
client = OpenAI()
2231

23-
if not image_path_url:
24-
return "Image Path or URL is required."
32+
def _get_media_type(image_filename: str) -> Optional[str]:
33+
"""Get the media type from an image filename."""
34+
for ext, media_type in MEDIA_TYPES.items():
35+
if image_filename.endswith(ext):
36+
return media_type
37+
return None
38+
2539

26-
if "http" in image_path_url:
27-
return _analyze_web_image(client, image_path_url)
28-
return _analyze_local_image(client, image_path_url)
40+
def _encode_image(image_handle: IO) -> str:
41+
"""Encode a file handle to base64."""
42+
return base64.b64encode(image_handle.read()).decode("utf-8")
2943

def _make_anthropic_request(image_handle: IO, media_type: Optional[str]) -> "anthropic.types.Message":
    """Send the image in *image_handle* to the Anthropic Messages API.

    Args:
        image_handle: Binary file handle positioned at the start of the image.
        media_type: MIME type of the image (e.g. "image/png"); may be None
            when the caller could not determine it.

    Returns:
        The Anthropic Message response; callers read the text answer from
        ``response.content[0].text``. (The previous ``-> dict`` annotation
        was wrong — ``messages.create`` returns a Message object.)
    """
    client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    data = _encode_image(image_handle)
    return client.messages.create(
        model=MODEL,
        # Defensive coercion: MAX_TOKENS may arrive as a string if it was
        # read from an environment variable. int() is a no-op on ints.
        max_tokens=int(MAX_TOKENS),
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": data,
                        },
                    },
                    {
                        "type": "text",
                        "text": PROMPT,
                    },
                ],
            }
        ],
    )

def _analyze_web_image(image_url: str) -> str:
    """Download the image at *image_url* and analyze it via the Anthropic API.

    Args:
        image_url: Publicly reachable URL of the image.

    Returns:
        str: The model's text description of the image.

    Raises:
        requests.HTTPError: If the download fails (non-2xx status).
    """
    # Bound the download so an unresponsive host cannot hang the tool.
    download = requests.get(image_url, timeout=30)
    # Fail loudly instead of base64-encoding an HTML error page as an "image".
    download.raise_for_status()

    media_type = _get_media_type(image_url)
    if media_type is None:
        # URL has no recognized extension (e.g. signed URLs with query
        # strings) — fall back to the server-reported Content-Type.
        media_type = download.headers.get("Content-Type", "").split(";")[0] or None

    with tempfile.NamedTemporaryFile() as temp_file:
        temp_file.write(download.content)
        temp_file.flush()
        temp_file.seek(0)
        response = _make_anthropic_request(temp_file, media_type)
    return response.content[0].text
def _analyze_local_image(image_path: str) -> str:
    """Analyze an image stored on the local filesystem."""
    media_type = _get_media_type(image_path)
    with open(image_path, "rb") as handle:
        result = _make_anthropic_request(handle, media_type)
    return result.content[0].text
def analyze_image(image_path_or_url: str) -> str:
    """
    Analyze an image using Anthropic's vision-capable Messages API.

    (Docstring previously said "OpenAI's Vision API" — this module calls
    Anthropic throughout.)

    Args:
        image_path_or_url: Local path or URL to the image.

    Returns:
        str: Description of the image contents, or an error message when
        no path/URL was provided.
    """
    if not image_path_or_url:
        return "Image Path or URL is required."

    # Anchor the scheme check at the start of the string: a *local* path
    # that merely contains "http" (e.g. "/tmp/http_cache/img.png") must
    # not be treated as a URL.
    if image_path_or_url.startswith(("http://", "https://")):
        return _analyze_web_image(image_path_or_url)
    return _analyze_local_image(image_path_or_url)

agentstack/_tools/vision/config.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22
"name": "vision",
33
"category": "image-analysis",
44
"env": {
5-
"OPENAI_API_KEY": null
5+
"ANTHROPIC_API_KEY": null,
6+
"VISION_PROMPT": null,
7+
"VISION_MODEL": null,
8+
"VISION_MAX_TOKENS": null
69
},
710
"dependencies": [
8-
"openai>=1.0.0",
11+
"anthropic>=0.45.2",
912
"requests>=2.31.0"
1013
],
1114
"tools": ["analyze_image"]

tests/fixtures/test_image.jpg

35.5 KB
Loading

tests/tools/test_tool_vision.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import os
2+
from pathlib import Path
3+
import unittest
4+
from agentstack._tools import ToolConfig
5+
6+
7+
TEST_IMAGE_PATH: Path = Path(__file__).parent.parent / 'fixtures/test_image.jpg'
8+
9+
10+
class VisionToolTest(unittest.TestCase):
    """Tests for the vision tool; live tests are skipped without an API key."""

    def setUp(self):
        # Install the tool's declared dependencies so the deferred imports
        # inside each test can resolve.
        tool = ToolConfig.from_tool_name('vision')
        for dependency in tool.dependencies:
            os.system(f"pip install {dependency}")

    def test_get_media_type(self):
        from agentstack._tools.vision import _get_media_type

        self.assertEqual(_get_media_type("image.jpg"), "image/jpeg")
        self.assertEqual(_get_media_type("image.jpeg"), "image/jpeg")
        self.assertEqual(_get_media_type("http://google.com/image.png"), "image/png")
        self.assertEqual(_get_media_type("/foo/bar/image.gif"), "image/gif")
        self.assertEqual(_get_media_type("image.webp"), "image/webp")
        self.assertIsNone(_get_media_type("document.pdf"))

    def test_encode_image(self):
        from agentstack._tools.vision import _encode_image

        with open(TEST_IMAGE_PATH, "rb") as image_file:
            encoded_image = _encode_image(image_file)
        # (Removed a leftover debug print of the encoded payload.)
        self.assertIsInstance(encoded_image, str)

    def test_analyze_image_web_live(self):
        from agentstack._tools.vision import analyze_image

        if not os.environ.get('ANTHROPIC_API_KEY'):
            self.skipTest("ANTHROPIC_API_KEY not set")

        image_url = "https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png"
        result = analyze_image(image_url)
        self.assertIsInstance(result, str)

    def test_analyze_image_local_live(self):
        from agentstack._tools.vision import analyze_image

        if not os.environ.get('ANTHROPIC_API_KEY'):
            self.skipTest("ANTHROPIC_API_KEY not set")

        result = analyze_image(str(TEST_IMAGE_PATH))
        self.assertIsInstance(result, str)

0 commit comments

Comments
 (0)