Skip to content

Commit a40e461

Browse files
fix: convert storage image source url to base64 (#1989)
1 parent c9fd702 commit a40e461

File tree

3 files changed

+66
-30
lines changed

3 files changed

+66
-30
lines changed

code/backend/batch/utilities/helpers/embedders/push_embedder.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import logging
44
from typing import List
55
from urllib.parse import urlparse
6-
6+
import urllib.request
77
from ...helpers.llm_helper import LLMHelper
88
from ...helpers.env_helper import EnvHelper
99
from ..azure_computer_vision_client import AzureComputerVisionClient
@@ -18,6 +18,8 @@
1818
from ..document_loading_helper import DocumentLoading
1919
from ..document_chunking_helper import DocumentChunking
2020
from ...common.source_document import SourceDocument
21+
import base64
22+
from mimetypes import guess_type
2123

2224
logger = logging.getLogger(__name__)
2325

@@ -101,6 +103,27 @@ def __embed(
101103
else:
102104
logger.warning("No documents to upload.")
103105

106+
def __local_image_to_data_url(self, image_path, timeout=30):
    """Convert a local image file or a remote image URL to a base64 data URL.

    Args:
        image_path: Path to a local image file, or an http(s) URL pointing
            at the image (e.g. a blob-storage source URL).
        timeout: Seconds to wait for the remote download before raising
            (default 30). Previously the download had no timeout, so an
            unresponsive storage endpoint could hang this call forever.

    Returns:
        A ``data:<mime-type>;base64,<payload>`` string suitable for use as
        an ``image_url`` in a vision chat-completion message.

    Raises:
        urllib.error.URLError: if the remote download fails or times out.
        OSError: if the local file cannot be read.
    """
    # Guess the MIME type from the path/URL extension; fall back to a
    # generic binary type when the extension is unknown.
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'

    # Distinguish a remote URL from a local file path by its scheme.
    parsed_url = urlparse(image_path)
    if parsed_url.scheme in ('http', 'https'):
        # Download the image from the URL. The timeout guards against an
        # unresponsive endpoint (see docstring).
        logger.info(f"Downloading image from URL: {image_path}")
        with urllib.request.urlopen(image_path, timeout=timeout) as response:
            image_data = response.read()
        base64_encoded_data = base64.b64encode(image_data).decode('utf-8')
    else:
        # Read the image bytes from the local filesystem.
        with open(image_path, "rb") as image_file:
            base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    return f"data:{mime_type};base64,{base64_encoded_data}"
126+
104127
def __generate_image_caption(self, source_url):
105128
logger.info(f"Generating image caption for URL: {source_url}")
106129
model = self.env_helper.AZURE_OPENAI_VISION_MODEL
@@ -119,7 +142,7 @@ def __generate_image_caption(self, source_url):
119142
"text": "Describe this image in detail. Limit the response to 500 words.",
120143
"type": "text",
121144
},
122-
{"image_url": {"url": source_url}, "type": "image_url"},
145+
{"image_url": {"url": self.__local_image_to_data_url(source_url)}, "type": "image_url"},
123146
],
124147
},
125148
]

code/tests/functional/tests/functions/advanced_image_processing/test_advanced_image_processing.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ def setup_blob_metadata_mocking(httpserver: HTTPServer, app_config: AppConfig):
6363
method="PUT",
6464
).respond_with_data()
6565

66+
# Mock GET request for image download (base64 conversion)
67+
httpserver.expect_request(
68+
f"/{app_config.get_from_json('AZURE_BLOB_STORAGE_INFO','containerName')}/{FILE_NAME}",
69+
method="GET",
70+
).respond_with_data(b"fake_image_data", content_type="image/jpeg")
71+
6672

6773
@pytest.fixture(autouse=True)
6874
def setup_caption_response(httpserver: HTTPServer, app_config: AppConfig):
@@ -192,11 +198,9 @@ def test_image_passed_to_llm_to_generate_caption(
192198
),
193199
)[0]
194200

195-
assert request.get_json()["messages"][1]["content"][1]["image_url"][
196-
"url"
197-
].startswith(
198-
f"{app_config.get('AZURE_STORAGE_ACCOUNT_ENDPOINT')}{app_config.get_from_json('AZURE_BLOB_STORAGE_INFO','containerName')}/{FILE_NAME}"
199-
)
201+
# The URL should be converted to base64 data URL
202+
image_url = request.get_json()["messages"][1]["content"][1]["image_url"]["url"]
203+
assert image_url.startswith("data:image/"), f"Expected base64 data URL, got {image_url[:100]}"
200204

201205

202206
def test_embeddings_generated_for_caption(

code/tests/utilities/helpers/test_push_embedder.py

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,20 @@ def azure_computer_vision_mock():
171171
yield mock
172172

173173

174+
@pytest.fixture(autouse=True)
175+
def urllib_request_mock():
176+
with patch(
177+
"backend.batch.utilities.helpers.embedders.push_embedder.urllib.request.urlopen"
178+
) as mock:
179+
# Create a mock response object
180+
mock_response = MagicMock()
181+
mock_response.read.return_value = b"fake_image_data"
182+
mock_response.__enter__.return_value = mock_response
183+
mock_response.__exit__.return_value = None
184+
mock.return_value = mock_response
185+
yield mock
186+
187+
174188
def test_embed_file_advanced_image_processing_vectorizes_image(
175189
azure_computer_vision_mock,
176190
):
@@ -200,29 +214,24 @@ def test_embed_file_advanced_image_processing_uses_vision_model_for_captioning(
200214
push_embedder.embed_file(source_url, "some-file-name.jpg")
201215

202216
# then
203-
llm_helper_mock.get_chat_completion.assert_called_once_with(
204-
[
205-
{
206-
"role": "system",
207-
"content": """You are an assistant that generates rich descriptions of images.
208-
You need to be accurate in the information you extract and detailed in the descriptons you generate.
209-
Do not abbreviate anything and do not shorten sentances. Explain the image completely.
210-
If you are provided with an image of a flow chart, describe the flow chart in detail.
211-
If the image is mostly text, use OCR to extract the text as it is displayed in the image.""",
212-
},
213-
{
214-
"role": "user",
215-
"content": [
216-
{
217-
"text": "Describe this image in detail. Limit the response to 500 words.",
218-
"type": "text",
219-
},
220-
{"image_url": {"url": source_url}, "type": "image_url"},
221-
],
222-
},
223-
],
224-
env_helper_mock.AZURE_OPENAI_VISION_MODEL,
225-
)
217+
# Verify the vision model is called with direct URL (not base64) for token efficiency
218+
llm_helper_mock.get_chat_completion.assert_called_once()
219+
call_args = llm_helper_mock.get_chat_completion.call_args
220+
messages = call_args[0][0]
221+
model = call_args[0][1]
222+
223+
assert model == env_helper_mock.AZURE_OPENAI_VISION_MODEL
224+
assert len(messages) == 2
225+
assert messages[0]["role"] == "system"
226+
assert "You are an assistant that generates rich descriptions of images" in messages[0]["content"]
227+
assert messages[1]["role"] == "user"
228+
assert len(messages[1]["content"]) == 2
229+
assert messages[1]["content"][0]["type"] == "text"
230+
assert "Describe this image in detail" in messages[1]["content"][0]["text"]
231+
assert messages[1]["content"][1]["type"] == "image_url"
232+
# Image should be converted to base64 data URL
233+
image_url = messages[1]["content"][1]["image_url"]["url"]
234+
assert image_url.startswith("data:image/"), f"Expected base64 data URL, got {image_url[:100]}"
226235

227236

228237
def test_embed_file_advanced_image_processing_stores_embeddings_in_search_index(

0 commit comments

Comments
 (0)