@@ -63,9 +63,16 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        file_bytes = file_stream.read()
+
         if stream_info.charset:
-            text_content = file_stream.read().decode(stream_info.charset)
+            try:
+                text_content = file_bytes.decode(stream_info.charset)
+            except UnicodeDecodeError:
+                # Charset detection from partial file content may be inaccurate.
+                # Fall back to charset_normalizer for the full file content.
+                text_content = str(from_bytes(file_bytes).best())
         else:
-            text_content = str(from_bytes(file_stream.read()).best())
+            text_content = str(from_bytes(file_bytes).best())
 
         return DocumentConverterResult(markdown=text_content)
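
In isolation, the decode-then-fall-back pattern this hunk introduces can be sketched as a small standalone helper. This is an illustrative sketch, not the converter's actual code: the decode_text name is hypothetical, and it assumes charset_normalizer is installed (from_bytes(...).best() is the same call the converter itself uses).

from charset_normalizer import from_bytes

def decode_text(file_bytes: bytes, charset: str | None) -> str:
    # Hypothetical helper mirroring the patched converter's behaviour.
    if charset:
        try:
            # Honor the hinted charset when it actually works.
            return file_bytes.decode(charset)
        except UnicodeDecodeError:
            # The hint may have been sniffed from only part of the file;
            # re-detect the encoding from the full byte content instead.
            return str(from_bytes(file_bytes).best())
    # No hint at all: detect from the full content directly.
    return str(from_bytes(file_bytes).best())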
packages/markitdown/tests/test_module_misc.py (46 additions, 0 deletions)
@@ -456,6 +456,51 @@ def test_markitdown_llm_parameters() -> None:
     assert messages[0]["content"][0]["text"] == test_prompt
 
 
+def test_plaintext_charset_fallback() -> None:
+    """
+    Test for GitHub issue #1505: PlainTextConverter throws UnicodeDecodeError
+    when charset detection from partial file content is inaccurate.
+
+    When the first 4096 bytes are ASCII-only but later bytes contain UTF-8
+    characters (e.g., accented or CJK characters), the charset may be incorrectly
+    detected as 'ascii'. The converter should fall back to charset_normalizer
+    when decoding fails.
+    """
+    markitdown = MarkItDown()
+
+    test_cases = [
+        (
+            "Spanish",
+            "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.",
+            ["señor", "¿Cómo está?", "Año"],
+        ),
+        ("Korean", "안녕하세요! 한글 테스트입니다. 가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]),
+        ("Japanese", "こんにちは!日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]),
+        ("Chinese", "你好!中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]),
+    ]
+
+    for lang, utf8_text, expected_substrings in test_cases:
+        # Create a test file where:
+        # - First 4100 bytes are ASCII (exceeds the 4096 byte sample for charset detection)
+        # - Followed by UTF-8 encoded non-ASCII characters
+        ascii_part = "A" * 4100
+        test_content = ascii_part + utf8_text
+
+        # Use BytesIO to simulate a file stream
+        file_stream = io.BytesIO(test_content.encode("utf-8"))
+
+        # Convert using stream with incorrect charset hint (simulating the bug)
+        result = markitdown.convert_stream(
+            file_stream, stream_info=StreamInfo(charset="ascii", extension=".txt")
+        )
+
+        # Verify that the conversion succeeded and contains the UTF-8 characters
+        for expected in expected_substrings:
+            assert (
+                expected in result.text_content
+            ), f"{lang}: Expected '{expected}' not found in result"
+
+
 @pytest.mark.skipif(
     skip_llm,
     reason="do not run llm tests without a key",
@@ -495,6 +540,7 @@ def test_markitdown_llm() -> None:
         test_exceptions,
         test_doc_rlink,
         test_markitdown_exiftool,
+        test_plaintext_charset_fallback,
         test_markitdown_llm_parameters,
         test_markitdown_llm,
     ]:
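
For reviewers who want to reproduce the failure mode outside the test suite, the sketch below mirrors the fixture used in test_plaintext_charset_fallback: 4100 ASCII bytes followed by UTF-8 text, converted with a deliberately wrong charset hint. It assumes markitdown (with this patch) and charset_normalizer are installed and that StreamInfo is importable from the top-level markitdown package, as in the test module; the encoding detected on the truncated sample may vary by charset_normalizer version.

import io
from charset_normalizer import from_bytes
from markitdown import MarkItDown, StreamInfo

# 4100 ASCII bytes followed by UTF-8 text, mirroring the test fixture.
payload = ("A" * 4100 + "Hola, señor! ¿Cómo está?").encode("utf-8")

# Detection on a truncated sample sees only ASCII, so the guessed charset
# cannot decode the full payload on its own.
guess = from_bytes(payload[:4096]).best()
print(guess.encoding if guess else None)  # typically 'ascii'

# With the patched converter, a wrong charset hint no longer raises
# UnicodeDecodeError; the converter falls back to charset_normalizer.
result = MarkItDown().convert_stream(
    io.BytesIO(payload), stream_info=StreamInfo(charset="ascii", extension=".txt")
)
print("señor" in result.text_content)  # True after the fix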