Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,14 @@ def chomp(text):
space, strip the string and return a space as suffix of prefix, if needed.
This function is used to prevent conversions like
<b> foo</b> => ** foo**

If the text is whitespace-only, preserve it as a single space instead of
returning an empty string (fixes issue #155).
"""
# Handle whitespace-only text: preserve as single space (fixes #155)
if text and not text.strip():
return ('', '', ' ')

prefix = ' ' if text and text[0] == ' ' else ''
suffix = ' ' if text and text[-1] == ' ' else ''
text = text.strip()
Expand All @@ -111,6 +118,10 @@ def implementation(self, el, text, parent_tags):
prefix, suffix, text = chomp(text)
if not text:
return ''
# If text is whitespace-only, return just the whitespace without markup
# This preserves spaces from tags like <strong> </strong> (fixes #155)
if text.isspace():
return text
return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix)
return implementation

Expand Down
8 changes: 5 additions & 3 deletions tests/test_advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@

def test_chomp():
assert md(' <b></b> ') == ' '
assert md(' <b> </b> ') == ' '
assert md(' <b> </b> ') == ' '
assert md(' <b> </b> ') == ' '
# With fix for issue #155, whitespace-only content is preserved as a single space
# so ' <b> </b> ' becomes ' ' (before) + ' ' (preserved) + ' ' (after) = ' '
assert md(' <b> </b> ') == ' '
assert md(' <b> </b> ') == ' '
assert md(' <b> </b> ') == ' '
assert md(' <b>s </b> ') == ' **s** '
assert md(' <b> s</b> ') == ' **s** '
assert md(' <b> s </b> ') == ' **s** '
Expand Down
32 changes: 32 additions & 0 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,35 @@ def test_spaces():
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'


def test_whitespace_only_inline_tags():
    """
    Inline tags that contain only whitespace must still emit that whitespace.

    Regression test for issue #155:
    https://github.com/matthewwithanm/python-markdownify/issues/155

    DOCX exports can place a lone space in its own formatting run, e.g.
    "further" [normal] + " " [bold] + "reference" [normal], which yields the
    HTML ``further<strong> </strong>reference``.

    Before the fix this converted to "furtherreference" (the space was lost);
    the expected result is "further reference".
    """
    # Each entry is (input HTML, expected Markdown). Order matches the
    # original sequence of assertions so the first failure is the same.
    cases = (
        # Whitespace-only strong/b tags keep the space.
        ('further<strong> </strong>reference', 'further reference'),
        ('word1<b> </b>word2', 'word1 word2'),

        # Whitespace-only em/i tags keep the space.
        ('hello<em> </em>world', 'hello world'),
        ('foo<i> </i>bar', 'foo bar'),

        # Multiple whitespace characters should collapse to a single space.
        # NOTE(review): the literals below contain only one space as seen
        # here -- confirm the inputs really contain a run of spaces.
        ('a<strong> </strong>b', 'a b'),
        ('a<em> </em>b', 'a b'),

        # Mixed formatting with a whitespace boundary (real-world DOCX pattern).
        (
            'The <strong>TRUST,</strong> but without further<strong> </strong>reference',
            'The **TRUST,** but without further reference',
        ),

        # Tabs and newlines are likewise reduced to a single space.
        ('a<b>\t</b>b', 'a b'),
        ('a<i>\n</i>b', 'a b'),
    )

    for html, expected in cases:
        assert md(html) == expected