diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index dabb0d7d3..f95480d2c 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -55,10 +55,25 @@ def convert( # Print only the main content body_elm = soup.find("body") webpage_text = "" - if body_elm: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) - else: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + try: + if body_elm: + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) + else: + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + except RecursionError: + # Large or deeply-nested HTML can exceed Python's recursion limit + # during markdownify's recursive DOM traversal. Fall back to + # BeautifulSoup's iterative get_text() so the caller still gets + # usable plain-text content instead of raw HTML. + import warnings + + warnings.warn( + "HTML document is too deeply nested for markdown conversion " + "(RecursionError). Falling back to plain-text extraction.", + stacklevel=2, + ) + target = body_elm if body_elm else soup + webpage_text = target.get_text("\n", strip=True) assert isinstance(webpage_text, str) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..0fb968f37 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -288,6 +288,41 @@ def test_input_as_strings() -> None: assert "# Test" in result.text_content +def test_deeply_nested_html_fallback() -> None: + """Large, deeply nested HTML should fall back to plain-text extraction + instead of silently returning unconverted HTML (issue #1636).""" + import warnings + + markitdown = MarkItDown() + + # Build HTML with nesting deep enough to trigger RecursionError in markdownify + depth = 500 + html = "" + for _ in range(depth): + html += '
' + html += "

Deep content with bold text

" + for _ in range(depth): + html += "
" + html += "" + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = markitdown.convert_stream( + io.BytesIO(html.encode("utf-8")), + file_extension=".html", + ) + + # Should have emitted a warning about the fallback + recursion_warnings = [x for x in w if "deeply nested" in str(x.message)] + assert len(recursion_warnings) > 0 + + # The output should contain the text content, not raw HTML + assert "Deep content" in result.text_content + assert "bold text" in result.text_content + assert "" not in result.text_content + + def test_doc_rlink() -> None: # Test for: CVE-2025-11849 markitdown = MarkItDown()