From 2561ae2ec928291d5bc8e074923d035b71b9da4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Wiedermann-M=C3=B6ller?= <dev@jowimo.com>
Date: Fri, 27 Mar 2026 14:48:51 +0100
Subject: [PATCH] fix: update DocIntel default and surface OCR failures

---
 .../markitdown/src/markitdown/_markitdown.py  |  6 +++
 .../converters/_doc_intel_converter.py        |  5 +-
 .../markitdown/tests/test_docintel_html.py    | 54 +++++++++++++++++++
 packages/markitdown/tests/test_module_misc.py | 29 ++++++++++
 4 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..98029694a 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -619,6 +619,12 @@ def _convert(
                         [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                     )
                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
+                    # An empty result obtained after an earlier converter
+                    # failure is discarded and the loop moves on, so that
+                    # the recorded failure is not masked by empty output.
+                    if not res.text_content.strip() and failed_attempts:
+                        res = None
+                        continue
                     return res
 
         # If we got this far without success, report any exceptions
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index fd843f231..9224a8596 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -50,6 +50,7 @@ class DefaultAzureCredential:
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
 # This constant is a temporary fix until the bug is resolved.
 CONTENT_FORMAT = "markdown"
+DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION = "2024-11-30"
 
 
 class DocumentIntelligenceFileType(str, Enum):
@@ -134,7 +135,7 @@ def __init__(
         self,
         *,
         endpoint: str,
-        api_version: str = "2024-07-31-preview",
+        api_version: str = DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION,
         credential: AzureKeyCredential | TokenCredential | None = None,
         file_types: List[DocumentIntelligenceFileType] = [
             DocumentIntelligenceFileType.DOCX,
@@ -152,7 +153,7 @@ def __init__(
 
         Args:
             endpoint (str): The endpoint for the Document Intelligence service.
-            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
+            api_version (str): The API version to use. Defaults to "2024-11-30".
             credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
             file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
         """
diff --git a/packages/markitdown/tests/test_docintel_html.py b/packages/markitdown/tests/test_docintel_html.py
index d0b4caa3e..b2256578b 100644
--- a/packages/markitdown/tests/test_docintel_html.py
+++ b/packages/markitdown/tests/test_docintel_html.py
@@ -1,5 +1,6 @@
 import io
 from markitdown.converters._doc_intel_converter import (
+    DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION,
     DocumentIntelligenceConverter,
     DocumentIntelligenceFileType,
 )
@@ -24,3 +25,56 @@ def test_docintel_accepts_html_mimetype():
     assert conv.accepts(io.BytesIO(b""), stream_info)
     stream_info = StreamInfo(mimetype="application/xhtml+xml", extension=None)
     assert conv.accepts(io.BytesIO(b""), stream_info)
+
+
+def test_docintel_default_api_version(monkeypatch):
+    """Omitting api_version hands the module default to the client."""
+    seen = {}
+
+    class RecordingClient:
+        def __init__(self, *, endpoint, api_version, credential):
+            seen.update(
+                endpoint=endpoint, api_version=api_version, credential=credential
+            )
+
+    monkeypatch.setattr(
+        "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient",
+        RecordingClient,
+    )
+    monkeypatch.setattr(
+        "markitdown.converters._doc_intel_converter._dependency_exc_info", None
+    )
+
+    token = object()
+    conv = DocumentIntelligenceConverter(
+        endpoint="https://example.cognitiveservices.azure.com/", credential=token
+    )
+
+    assert seen["credential"] is token
+    assert seen["api_version"] == DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION
+    assert conv.api_version == DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION
+
+
+def test_docintel_explicit_api_version(monkeypatch):
+    """A caller-supplied api_version is kept and passed to the client."""
+    seen = {}
+
+    class RecordingClient:
+        def __init__(self, *, endpoint, api_version, credential):
+            seen["api_version"] = api_version
+
+    monkeypatch.setattr(
+        "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient",
+        RecordingClient,
+    )
+    monkeypatch.setattr(
+        "markitdown.converters._doc_intel_converter._dependency_exc_info", None
+    )
+
+    conv = DocumentIntelligenceConverter(
+        api_version="2024-07-31-preview",
+        credential=object(),
+        endpoint="https://example.cognitiveservices.azure.com/",
+    )
+
+    assert conv.api_version == seen["api_version"] == "2024-07-31-preview"
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..99db585ff 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -14,6 +14,8 @@
     FileConversionException,
     StreamInfo,
 )
+from markitdown._base_converter import DocumentConverter
+from markitdown.converters import ImageConverter
 
 # This file contains module tests that are not directly tested by the FileTestVectors.
 # This includes things like helper functions and runtime conversion options
@@ -382,6 +384,33 @@ def test_exceptions() -> None:
     assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"
 
 
+def test_empty_image_fallback_does_not_mask_prior_failure() -> None:
+    """The failing converter's error must surface despite ImageConverter."""
+
+    class FailingPngConverter(DocumentConverter):
+        def accepts(self, file_stream, stream_info, **kwargs):
+            ext = stream_info.extension
+            return ext is not None and ext.lower() == ".png"
+
+        def convert(self, file_stream, stream_info, **kwargs):
+            raise RuntimeError("docintel analyze failed")
+
+    # Built-ins are disabled, so only the two converters below take part.
+    md = MarkItDown(enable_builtins=False)
+    md.register_converter(ImageConverter())
+    md.register_converter(FailingPngConverter())
+
+    info = StreamInfo(extension=".png", filename="sample.png", mimetype="image/png")
+    with pytest.raises(FileConversionException) as raised:
+        md.convert_stream(io.BytesIO(b"not-a-real-png"), stream_info=info)
+
+    # Exactly one failed attempt is expected, from the always-failing converter.
+    attempts = raised.value.attempts
+    assert len(attempts) == 1
+    assert type(attempts[0].converter).__name__ == "FailingPngConverter"
+    assert "docintel analyze failed" in str(raised.value)
+
+
 @pytest.mark.skipif(
     skip_exiftool,
     reason="do not run if exiftool is not installed",