From 2561ae2ec928291d5bc8e074923d035b71b9da4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Wiedermann-M=C3=B6ller?= Date: Fri, 27 Mar 2026 14:48:51 +0100 Subject: [PATCH] fix: update DocIntel default and surface OCR failures --- .../markitdown/src/markitdown/_markitdown.py | 6 +++ .../converters/_doc_intel_converter.py | 5 +- .../markitdown/tests/test_docintel_html.py | 54 +++++++++++++++++++ packages/markitdown/tests/test_module_misc.py | 29 ++++++++++ 4 files changed, 92 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..98029694a 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -619,6 +619,12 @@ def _convert( [line.rstrip() for line in re.split(r"\r?\n", res.text_content)] ) res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) + if ( + res.text_content.strip() == "" + and len(failed_attempts) > 0 + ): + res = None + continue return res # If we got this far without success, report any exceptions diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index fd843f231..9224a8596 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -50,6 +50,7 @@ class DefaultAzureCredential: # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # This constant is a temporary fix until the bug is resolved. CONTENT_FORMAT = "markdown" +DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION = "2024-11-30" class DocumentIntelligenceFileType(str, Enum): @@ -134,7 +135,7 @@ def __init__( self, *, endpoint: str, - api_version: str = "2024-07-31-preview", + api_version: str = DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION, credential: AzureKeyCredential | TokenCredential | None = None, file_types: List[DocumentIntelligenceFileType] = [ DocumentIntelligenceFileType.DOCX, @@ -152,7 +153,7 @@ def __init__( Args: endpoint (str): The endpoint for the Document Intelligence service. - api_version (str): The API version to use. Defaults to "2024-07-31-preview". + api_version (str): The API version to use. Defaults to "2024-11-30". credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication. file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types. """ diff --git a/packages/markitdown/tests/test_docintel_html.py b/packages/markitdown/tests/test_docintel_html.py index d0b4caa3e..b2256578b 100644 --- a/packages/markitdown/tests/test_docintel_html.py +++ b/packages/markitdown/tests/test_docintel_html.py @@ -1,5 +1,6 @@ import io from markitdown.converters._doc_intel_converter import ( + DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION, DocumentIntelligenceConverter, DocumentIntelligenceFileType, ) @@ -24,3 +25,56 @@ def test_docintel_accepts_html_mimetype(): assert conv.accepts(io.BytesIO(b""), stream_info) stream_info = StreamInfo(mimetype="application/xhtml+xml", extension=None) assert conv.accepts(io.BytesIO(b""), stream_info) + + +def test_docintel_default_api_version(monkeypatch): + captured = {} + + class FakeClient: + def __init__(self, *, endpoint, api_version, credential): + captured["endpoint"] = endpoint + captured["api_version"] = api_version + captured["credential"] = credential + + monkeypatch.setattr( + "markitdown.converters._doc_intel_converter._dependency_exc_info", None + ) + monkeypatch.setattr( + "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient", + FakeClient, + ) + + credential = object() + converter = DocumentIntelligenceConverter( + endpoint="https://example.cognitiveservices.azure.com/", + credential=credential, + ) + + assert converter.api_version == DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION + assert captured["api_version"] == DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION + assert captured["credential"] is credential + + +def test_docintel_explicit_api_version(monkeypatch): + captured = {} + + class FakeClient: + def __init__(self, *, endpoint, api_version, credential): + captured["api_version"] = api_version + + monkeypatch.setattr( + "markitdown.converters._doc_intel_converter._dependency_exc_info", None + ) + monkeypatch.setattr( + "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient", + FakeClient, + ) + + converter = DocumentIntelligenceConverter( + endpoint="https://example.cognitiveservices.azure.com/", + credential=object(), + api_version="2024-07-31-preview", + ) + + assert converter.api_version == "2024-07-31-preview" + assert captured["api_version"] == "2024-07-31-preview" diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..99db585ff 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -14,6 +14,8 @@ FileConversionException, StreamInfo, ) +from markitdown._base_converter import DocumentConverter +from markitdown.converters import ImageConverter # This file contains module tests that are not directly tested by the FileTestVectors. # This includes things like helper functions and runtime conversion options @@ -382,6 +384,33 @@ def test_exceptions() -> None: assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter" +def test_empty_image_fallback_does_not_mask_prior_failure() -> None: + class FailingPngConverter(DocumentConverter): + def accepts(self, file_stream, stream_info, **kwargs): + return (stream_info.extension or "").lower() == ".png" + + def convert(self, file_stream, stream_info, **kwargs): + raise RuntimeError("docintel analyze failed") + + markitdown = MarkItDown(enable_builtins=False) + markitdown.register_converter(ImageConverter()) + markitdown.register_converter(FailingPngConverter()) + + with pytest.raises(FileConversionException) as exc_info: + markitdown.convert_stream( + io.BytesIO(b"not-a-real-png"), + stream_info=StreamInfo( + extension=".png", + filename="sample.png", + mimetype="image/png", + ), + ) + + assert len(exc_info.value.attempts) == 1 + assert type(exc_info.value.attempts[0].converter).__name__ == "FailingPngConverter" + assert "docintel analyze failed" in str(exc_info.value) + + @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed",