From b86f13980a35fd800fa21d9a91c88fffbbd6addd Mon Sep 17 00:00:00 2001 From: Ashok Date: Wed, 17 Sep 2025 01:31:02 +0530 Subject: [PATCH 1/2] Fix: handle missing w:styleId in DOCX to prevent KeyError --- .../src/markitdown/converters/_docx_converter.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 9cb2cbd5b..193a8a7ed 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -84,7 +84,15 @@ def convert( style_map = kwargs.get("style_map", None) pre_process_stream = pre_process_docx(file_stream) - return self._html_converter.convert_string( - mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, - **kwargs, - ) + + # Patch: handle missing styleId safely + try: + html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value + except KeyError as e: + if str(e) == "'w:styleId'": + # Ignore missing style IDs and convert anyway + html = mammoth.convert_to_html(pre_process_stream, style_map=style_map, ignore_empty_styles=True).value + else: + raise + + return self._html_converter.convert_string(html, **kwargs) From 7ae316294ba3381626a1cc716effcea7c0f86483 Mon Sep 17 00:00:00 2001 From: Ash Date: Sat, 17 Jan 2026 17:42:32 +0530 Subject: [PATCH 2/2] Fix DOCX conversion crash with missing styleId safely - Avoids retrying consumed streams by buffering DOCX input - Removes dependency-specific parameters - Adds logging and safe fallback for malformed DOCX files --- .../markitdown/converters/_docx_converter.py | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 68d8d58f0..60c787809 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -1,7 +1,6 @@ import sys import io -from warnings import warn - +import logging from typing import BinaryIO, Any from ._html_converter import HtmlConverter @@ -10,14 +9,12 @@ from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later +logger = logging.getLogger(__name__) + _dependency_exc_info = None try: import mammoth - except ImportError: - # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() @@ -30,7 +27,8 @@ class DocxConverter(HtmlConverter): """ - Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. + Converts DOCX files to Markdown. + Gracefully handles malformed DOCX files with missing style information. """ def __init__(self): @@ -41,27 +39,22 @@ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() - if extension in ACCEPTED_FILE_EXTENSIONS: - return True - - for prefix in ACCEPTED_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return True - - return False + return ( + extension in ACCEPTED_FILE_EXTENSIONS + or any(mimetype.startswith(p) for p in ACCEPTED_MIME_TYPE_PREFIXES) + ) def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> DocumentConverterResult: - # Check: the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( @@ -69,23 +62,30 @@ def convert( extension=".docx", feature="docx", ) - ) from _dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] + ) from _dependency_exc_info[1].with_traceback( # type: ignore _dependency_exc_info[2] ) - style_map = kwargs.get("style_map", None) - pre_process_stream = pre_process_docx(file_stream) + style_map = kwargs.get("style_map") + + # Preprocess and fully buffer the DOCX to avoid stream reuse issues + processed = pre_process_docx(file_stream) + buffer = io.BytesIO(processed.read()) + buffer.seek(0) - # Patch: handle missing styleId safely try: - html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value - except KeyError as e: - if str(e) == "'w:styleId'": - # Ignore missing style IDs and convert anyway - html = mammoth.convert_to_html(pre_process_stream, style_map=style_map, ignore_empty_styles=True).value - else: - raise + result = mammoth.convert_to_html(buffer, style_map=style_map) + html = result.value + except KeyError as exc: + # Known issue: malformed DOCX with missing w:styleId + logger.warning( + "DOCX conversion encountered missing style metadata (%s). " + "Falling back to default style handling.", + exc, + ) + + buffer.seek(0) + result = mammoth.convert_to_html(buffer) + html = result.value return self._html_converter.convert_string(html, **kwargs)