Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions packages/markitdown/src/markitdown/converters/_html_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,25 @@ def convert(
# Print only the main content
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
try:
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
except RecursionError:
# Large or deeply-nested HTML can exceed Python's recursion limit
# during markdownify's recursive DOM traversal. Fall back to
# BeautifulSoup's iterative get_text() so the caller still gets
# usable plain-text content instead of raw HTML.
import warnings

warnings.warn(
"HTML document is too deeply nested for markdown conversion "
"(RecursionError). Falling back to plain-text extraction.",
stacklevel=2,
)
target = body_elm if body_elm else soup
webpage_text = target.get_text("\n", strip=True)

assert isinstance(webpage_text, str)

Expand Down
35 changes: 35 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,41 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_deeply_nested_html_fallback() -> None:
    """Check the plain-text fallback for HTML that is nested very deeply.

    Converts a document made of 500 nested ``<div>`` wrappers around a single
    paragraph and verifies that (a) a warning mentioning "deeply nested" is
    emitted and (b) the result carries the paragraph's text without any raw
    ``<div`` / ``<p>`` markup (issue #1636).
    """
    import warnings

    # Nesting level meant to be deep enough to trigger RecursionError in
    # markdownify's traversal.
    nesting = 500
    document = (
        "<html><body>"
        + '<div style="margin-left:10px">' * nesting
        + "<p>Deep content with <b>bold text</b></p>"
        + "</div>" * nesting
        + "</body></html>"
    )

    converter = MarkItDown()

    # Record every warning raised during conversion so the fallback notice
    # can be inspected afterwards.
    with warnings.catch_warnings(record=True) as captured:
        warnings.simplefilter("always")
        result = converter.convert_stream(
            io.BytesIO(document.encode("utf-8")),
            file_extension=".html",
        )

    # The fallback path must announce itself with a warning.
    assert any("deeply nested" in str(entry.message) for entry in captured)

    # The text must survive, and no raw HTML tags may leak into the output.
    text = result.text_content
    assert "Deep content" in text
    assert "bold text" in text
    assert "<div" not in text
    assert "<p>" not in text


def test_doc_rlink() -> None:
# Test for: CVE-2025-11849
markitdown = MarkItDown()
Expand Down