From fb9e80b9027625946db33182cd5b453c8276da06 Mon Sep 17 00:00:00 2001 From: jigangz Date: Fri, 27 Mar 2026 22:56:37 -0700 Subject: [PATCH] fix: handle deeply nested HTML that triggers RecursionError (#1636) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large HTML files with deep DOM nesting (e.g., SEC EDGAR filings) cause markdownify's recursive DOM traversal to exceed Python's default recursion limit (1000). Previously this RecursionError was caught by the top-level _convert() dispatcher, which then fell through to PlainTextConverter — silently returning the raw HTML as 'markdown' with no warning. This fix catches RecursionError in HtmlConverter.convert() and falls back to BeautifulSoup's iterative get_text() method, which handles arbitrary nesting depths. A warning is emitted so callers know the output is plain text rather than full markdown. Root cause chain: 1. HtmlConverter.convert() calls markdownify.convert_soup() (recursive) 2. Deeply nested HTML (>~400 levels) triggers RecursionError 3. _convert() catches all Exceptions, stores in failed_attempts 4. PlainTextConverter.accepts() matches text/html via 'text/' prefix 5. PlainTextConverter.convert() returns raw HTML bytes as text 6. 
Caller receives 'markdown' that is actually unconverted HTML --- .../markitdown/converters/_html_converter.py | 23 +++++++++--- packages/markitdown/tests/test_module_misc.py | 35 +++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index dabb0d7d3..f95480d2c 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -55,10 +55,25 @@ def convert( # Print only the main content body_elm = soup.find("body") webpage_text = "" - if body_elm: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) - else: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + try: + if body_elm: + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) + else: + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + except RecursionError: + # Large or deeply-nested HTML can exceed Python's recursion limit + # during markdownify's recursive DOM traversal. Fall back to + # BeautifulSoup's iterative get_text() so the caller still gets + # usable plain-text content instead of raw HTML. + import warnings + + warnings.warn( + "HTML document is too deeply nested for markdown conversion " + "(RecursionError). 
Falling back to plain-text extraction.", + stacklevel=2, + ) + target = body_elm if body_elm else soup + webpage_text = target.get_text("\n", strip=True) assert isinstance(webpage_text, str) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..0fb968f37 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -288,6 +288,41 @@ def test_input_as_strings() -> None: assert "# Test" in result.text_content +def test_deeply_nested_html_fallback() -> None: + """Large, deeply nested HTML should fall back to plain-text extraction + instead of silently returning unconverted HTML (issue #1636).""" + import warnings + + markitdown = MarkItDown() + + # Build HTML with nesting deep enough to trigger RecursionError in markdownify + depth = 500 + html = "<html><body>" + for _ in range(depth): + html += '<div>' + html += "<p>Deep content with <b>bold text</b></p>" + for _ in range(depth): + html += "</div>" + html += "</body></html>" + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = markitdown.convert_stream( + io.BytesIO(html.encode("utf-8")), + file_extension=".html", + ) + + # Should have emitted a warning about the fallback + recursion_warnings = [x for x in w if "deeply nested" in str(x.message)] + assert len(recursion_warnings) > 0 + + # The output should contain the text content, not raw HTML + assert "Deep content" in result.text_content + assert "bold text" in result.text_content + assert "<div>" not in result.text_content + + def test_doc_rlink() -> None: # Test for: CVE-2025-11849 markitdown = MarkItDown()