Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,12 @@ def _convert(
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
if (
res.text_content.strip() == ""
and len(failed_attempts) > 0
):
res = None
continue
return res

# If we got this far without success, report any exceptions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class DefaultAzureCredential:
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
# Document Intelligence API version used when the caller does not pass an
# explicit `api_version` to the converter.
DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION = "2024-11-30"


class DocumentIntelligenceFileType(str, Enum):
Expand Down Expand Up @@ -134,7 +135,7 @@ def __init__(
self,
*,
endpoint: str,
api_version: str = "2024-07-31-preview",
api_version: str = DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION,
credential: AzureKeyCredential | TokenCredential | None = None,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOCX,
Expand All @@ -152,7 +153,7 @@ def __init__(

Args:
endpoint (str): The endpoint for the Document Intelligence service.
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
api_version (str): The API version to use. Defaults to "2024-11-30".
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
"""
Expand Down
54 changes: 54 additions & 0 deletions packages/markitdown/tests/test_docintel_html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import io
from markitdown.converters._doc_intel_converter import (
DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION,
DocumentIntelligenceConverter,
DocumentIntelligenceFileType,
)
Expand All @@ -24,3 +25,56 @@ def test_docintel_accepts_html_mimetype():
assert conv.accepts(io.BytesIO(b""), stream_info)
stream_info = StreamInfo(mimetype="application/xhtml+xml", extension=None)
assert conv.accepts(io.BytesIO(b""), stream_info)


def test_docintel_default_api_version(monkeypatch):
    """Omitting ``api_version`` must fall back to the module-level default.

    The client class is replaced by a recording stub so the test can inspect
    the keyword arguments the converter hands to it.
    """
    client_kwargs = {}

    class FakeClient:
        # Records the keyword arguments it is constructed with.
        def __init__(self, *, endpoint, api_version, credential):
            client_kwargs.update(
                endpoint=endpoint,
                api_version=api_version,
                credential=credential,
            )

    # NOTE(review): presumably clearing this lets the converter be built even
    # when the optional Azure dependencies failed to import - confirm.
    monkeypatch.setattr(
        "markitdown.converters._doc_intel_converter._dependency_exc_info", None
    )
    monkeypatch.setattr(
        "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient",
        FakeClient,
    )

    # A unique sentinel lets the identity check below prove pass-through.
    sentinel_credential = object()
    converter = DocumentIntelligenceConverter(
        endpoint="https://example.cognitiveservices.azure.com/",
        credential=sentinel_credential,
    )

    expected_version = DEFAULT_DOCUMENT_INTELLIGENCE_API_VERSION
    assert converter.api_version == expected_version
    assert client_kwargs["api_version"] == expected_version
    assert client_kwargs["credential"] is sentinel_credential


def test_docintel_explicit_api_version(monkeypatch):
    """An explicitly supplied ``api_version`` must be kept and forwarded.

    The client class is replaced by a stub that records the version it is
    constructed with.
    """
    client_kwargs = {}

    class FakeClient:
        # Only the API version matters here; other arguments are ignored.
        def __init__(self, *, endpoint, api_version, credential):
            client_kwargs.update(api_version=api_version)

    # NOTE(review): presumably clearing this lets the converter be built even
    # when the optional Azure dependencies failed to import - confirm.
    monkeypatch.setattr(
        "markitdown.converters._doc_intel_converter._dependency_exc_info", None
    )
    monkeypatch.setattr(
        "markitdown.converters._doc_intel_converter.DocumentIntelligenceClient",
        FakeClient,
    )

    converter = DocumentIntelligenceConverter(
        endpoint="https://example.cognitiveservices.azure.com/",
        credential=object(),
        api_version="2024-07-31-preview",
    )

    # Check both the converter attribute and what reached the client.
    assert converter.api_version == "2024-07-31-preview"
    assert client_kwargs["api_version"] == "2024-07-31-preview"
29 changes: 29 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
FileConversionException,
StreamInfo,
)
from markitdown._base_converter import DocumentConverter
from markitdown.converters import ImageConverter

# This file contains tests for module behavior that is not directly exercised by the FileTestVectors.
# This includes things like helper functions and runtime conversion options.
Expand Down Expand Up @@ -382,6 +384,33 @@ def test_exceptions() -> None:
assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"


def test_empty_image_fallback_does_not_mask_prior_failure() -> None:
    """A converter failure must still surface as a FileConversionException.

    Two converters are registered for a ``.png`` stream: the stock
    ``ImageConverter`` and a stub that always raises. The conversion is
    expected to fail and to report exactly the stub's failed attempt.
    NOTE(review): presumably ``ImageConverter`` yields empty text for these
    bogus bytes, which is the case being guarded - confirm.
    """

    class FailingPngConverter(DocumentConverter):
        # Claims any stream with a ".png" extension (case-insensitive).
        def accepts(self, file_stream, stream_info, **kwargs):
            extension = stream_info.extension or ""
            return extension.lower() == ".png"

        # Always fails, standing in for a broken upstream converter.
        def convert(self, file_stream, stream_info, **kwargs):
            raise RuntimeError("docintel analyze failed")

    md = MarkItDown(enable_builtins=False)
    for converter in (ImageConverter(), FailingPngConverter()):
        md.register_converter(converter)

    png_info = StreamInfo(
        extension=".png",
        filename="sample.png",
        mimetype="image/png",
    )
    payload = io.BytesIO(b"not-a-real-png")

    with pytest.raises(FileConversionException) as exc_info:
        md.convert_stream(payload, stream_info=png_info)

    error = exc_info.value
    attempts = error.attempts
    assert len(attempts) == 1
    assert type(attempts[0].converter).__name__ == "FailingPngConverter"
    assert "docintel analyze failed" in str(error)


@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
Expand Down