From 2d11ae969dfb242b398648598f8444d87c524b36 Mon Sep 17 00:00:00 2001 From: Hiren Date: Sun, 29 Mar 2026 16:03:43 -0400 Subject: [PATCH] fix: make FORMULAS feature conditional in DocumentIntelligenceConverter Add enable_formulas parameter (default True) to allow disabling formula extraction when not needed. Unconditionally enabling FORMULAS degrades recognition accuracy for documents without mathematical formulas. Fixes #1536 --- .../markitdown/converters/_doc_intel_converter.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index fd843f231..69ba5a459 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -146,6 +146,7 @@ def __init__( DocumentIntelligenceFileType.BMP, DocumentIntelligenceFileType.TIFF, ], + enable_formulas: bool = True, ): """ Initialize the DocumentIntelligenceConverter. @@ -154,7 +155,9 @@ def __init__( endpoint (str): The endpoint for the Document Intelligence service. api_version (str): The API version to use. Defaults to "2024-07-31-preview". credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication. - file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types. + file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported types. + enable_formulas (bool): Whether to enable formula extraction. Defaults to True for backward compatibility. + Set to False to improve accuracy on documents without mathematical formulas. """ super().__init__() @@ -180,6 +183,7 @@ def __init__( self.endpoint = endpoint self.api_version = api_version + self._enable_formulas = enable_formulas self.doc_intel_client = DocumentIntelligenceClient( endpoint=self.endpoint, api_version=self.api_version, @@ -228,11 +232,13 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]: if mimetype.startswith(prefix): return [] - return [ - DocumentAnalysisFeature.FORMULAS, # enable formula extraction + features = [ DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction ] + if self._enable_formulas: + features.append(DocumentAnalysisFeature.FORMULAS) + return features def convert( self,