microsoft · jigangz · Mar 28, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -55,10 +55,25 @@ def convert(
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
-        if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
-        else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+        try:
+            if body_elm:
+                webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            else:
+                webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+        except RecursionError:
+            # Large or deeply-nested HTML can exceed Python's recursion limit
+            # during markdownify's recursive DOM traversal.  Fall back to
+            # BeautifulSoup's iterative get_text() so the caller still gets
+            # usable plain-text content instead of raw HTML.
+            import warnings
+
+            warnings.warn(
+                "HTML document is too deeply nested for markdown conversion "
+                "(RecursionError). Falling back to plain-text extraction.",
+                stacklevel=2,
+            )
+            target = body_elm if body_elm else soup
+            webpage_text = target.get_text("\n", strip=True)
 
         assert isinstance(webpage_text, str)
 

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
@@ -288,6 +288,41 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_deeply_nested_html_fallback() -> None:
+    """Large, deeply nested HTML should fall back to plain-text extraction
+    instead of silently returning unconverted HTML (issue #1636)."""
+    import warnings
+
+    markitdown = MarkItDown()
+
+    # Build HTML with nesting deep enough to trigger RecursionError in markdownify
+    depth = 500
+    html = "<html><body>"
+    for _ in range(depth):
+        html += '<div style="margin-left:10px">'
+    html += "<p>Deep content with <b>bold text</b></p>"
+    for _ in range(depth):
+        html += "</div>"
+    html += "</body></html>"
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        result = markitdown.convert_stream(
+            io.BytesIO(html.encode("utf-8")),
+            file_extension=".html",
+        )
+
+        # Should have emitted a warning about the fallback
+        recursion_warnings = [x for x in w if "deeply nested" in str(x.message)]
+        assert len(recursion_warnings) > 0
+
+    # The output should contain the text content, not raw HTML
+    assert "Deep content" in result.text_content
+    assert "bold text" in result.text_content
+    assert "<div" not in result.text_content
+    assert "<p>" not in result.text_content
+
+
 def test_doc_rlink() -> None:
     # Test for: CVE-2025-11849
     markitdown = MarkItDown()