From b86f13980a35fd800fa21d9a91c88fffbbd6addd Mon Sep 17 00:00:00 2001
From: Ashok <ashh010101@gmail.com>
Date: Wed, 17 Sep 2025 01:31:02 +0530
Subject: [PATCH 1/2] Fix: handle missing w:styleId in DOCX to prevent KeyError

---
 .../src/markitdown/converters/_docx_converter.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 9cb2cbd5b..193a8a7ed 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -84,7 +84,15 @@ def convert(
 
         style_map = kwargs.get("style_map", None)
         pre_process_stream = pre_process_docx(file_stream)
-        return self._html_converter.convert_string(
-            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
-            **kwargs,
-        )
+
+        # Patch: handle missing styleId safely
+        try:
+            html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
+        except KeyError as e:
+            if str(e) == "'w:styleId'":
+                # Ignore missing style IDs and convert anyway
+                html = mammoth.convert_to_html(pre_process_stream, style_map=style_map, ignore_empty_styles=True).value
+            else:
+                raise
+
+        return self._html_converter.convert_string(html, **kwargs)

From 7ae316294ba3381626a1cc716effcea7c0f86483 Mon Sep 17 00:00:00 2001
From: Ash <ashhhh010101@gmail.com>
Date: Sat, 17 Jan 2026 17:42:32 +0530
Subject: [PATCH 2/2] Fix DOCX conversion crash when w:styleId is missing

- Avoids retrying consumed streams by buffering DOCX input
- Drops the `ignore_empty_styles` and `style_map` arguments from the fallback mammoth call
- Adds logging and safe fallback for malformed DOCX files
---
 .../markitdown/converters/_docx_converter.py  | 62 +++++++++----------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 68d8d58f0..60c787809 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,7 +1,6 @@
 import sys
 import io
-from warnings import warn
-
+import logging
 from typing import BinaryIO, Any
 
 from ._html_converter import HtmlConverter
@@ -10,14 +9,12 @@
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
-# Try loading optional (but in this case, required) dependencies
-# Save reporting of any exceptions for later
+logger = logging.getLogger(__name__)
+
 _dependency_exc_info = None
 try:
     import mammoth
-
 except ImportError:
-    # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
 
 
@@ -30,7 +27,8 @@
 
 class DocxConverter(HtmlConverter):
     """
-    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
+    Converts DOCX files to Markdown.
+    Gracefully handles malformed DOCX files with missing style information.
     """
 
     def __init__(self):
@@ -41,27 +39,22 @@ def accepts(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        **kwargs: Any,
     ) -> bool:
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
 
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
+        return (
+            extension in ACCEPTED_FILE_EXTENSIONS
+            or any(mimetype.startswith(p) for p in ACCEPTED_MIME_TYPE_PREFIXES)
+        )
 
     def convert(
         self,
         file_stream: BinaryIO,
         stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
+        **kwargs: Any,
     ) -> DocumentConverterResult:
-        # Check: the dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 MISSING_DEPENDENCY_MESSAGE.format(
@@ -69,23 +62,30 @@ def convert(
                     extension=".docx",
                     feature="docx",
                 )
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore
                 _dependency_exc_info[2]
             )
 
-        style_map = kwargs.get("style_map", None)
-        pre_process_stream = pre_process_docx(file_stream)
+        style_map = kwargs.get("style_map")
+
+        # Preprocess and fully buffer the DOCX to avoid stream reuse issues
+        processed = pre_process_docx(file_stream)
+        buffer = io.BytesIO(processed.read())
+        buffer.seek(0)
 
-        # Patch: handle missing styleId safely
         try:
-            html = mammoth.convert_to_html(pre_process_stream, style_map=style_map).value
-        except KeyError as e:
-            if str(e) == "'w:styleId'":
-                # Ignore missing style IDs and convert anyway
-                html = mammoth.convert_to_html(pre_process_stream, style_map=style_map, ignore_empty_styles=True).value
-            else:
-                raise
+            result = mammoth.convert_to_html(buffer, style_map=style_map)
+            html = result.value
+        except KeyError as exc:
+            # Known issue: malformed DOCX with missing w:styleId
+            logger.warning(
+                "DOCX conversion encountered missing style metadata (%s). "
+                "Falling back to default style handling.",
+                exc,
+            )
+
+            buffer.seek(0)
+            result = mammoth.convert_to_html(buffer)
+            html = result.value
 
         return self._html_converter.convert_string(html, **kwargs)