From f371936648c708941a004b1bc9d4203dc8a00ec2 Mon Sep 17 00:00:00 2001 From: s3ich4n Date: Thu, 22 Jan 2026 05:10:19 +0900 Subject: [PATCH 1/3] fix: Handle UnicodeDecodeError in PlainTextConverter (#1505) When charset detection samples only the first 4096 bytes and detects 'ascii', but the file contains UTF-8 characters beyond that point, decoding fails with UnicodeDecodeError. Added fallback to charset_normalizer when UnicodeDecodeError occurs, allowing proper handling of files with non-ASCII characters (Spanish, Korean, Japanese, Chinese, etc.) that appear after the 4096-byte sample. --- .../converters/_plain_text_converter.py | 11 ++++- packages/markitdown/tests/test_module_misc.py | 42 ++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 6f1306fe8..aa51823ae 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -63,9 +63,16 @@ def convert( stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: + file_bytes = file_stream.read() + if stream_info.charset: - text_content = file_stream.read().decode(stream_info.charset) + try: + text_content = file_bytes.decode(stream_info.charset) + except UnicodeDecodeError: + # Charset detection from partial file content may be inaccurate. + # Fall back to charset_normalizer for the full file content. 
+ text_content = str(from_bytes(file_bytes).best()) else: - text_content = str(from_bytes(file_stream.read()).best()) + text_content = str(from_bytes(file_bytes).best()) return DocumentConverterResult(markdown=text_content) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..ce72bf13e 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -95,7 +95,6 @@ "2003", # chart value ] - # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): """Validate presence or absence of specific strings.""" @@ -456,6 +455,47 @@ def test_markitdown_llm_parameters() -> None: assert messages[0]["content"][0]["text"] == test_prompt +def test_plaintext_charset_fallback() -> None: + """ + Test for GitHub issue #1505: PlainTextConverter throws UnicodeDecodeError + when charset detection from partial file content is inaccurate. + + When the first 4096 bytes are ASCII-only but later bytes contain UTF-8 + characters (e.g., accented or CJK characters), the charset may be incorrectly + detected as 'ascii'. The converter should fall back to charset_normalizer + when decoding fails. + """ + markitdown = MarkItDown() + + test_cases = [ + ("Spanish", "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.", ["señor", "¿Cómo está?", "Año"]), + ("Korean", "안녕하세요! 한글 테스트입니다. 
가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]), + ("Japanese", "こんにちは!日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]), + ("Chinese", "你好!中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]), + ] + + for lang, utf8_text, expected_substrings in test_cases: + # Create a test file where: + # - First 4100 bytes are ASCII (exceeds the 4096 byte sample for charset detection) + # - Followed by UTF-8 encoded non-ASCII characters + ascii_part = "A" * 4100 + test_content = ascii_part + utf8_text + + # Use BytesIO to simulate a file stream + file_stream = io.BytesIO(test_content.encode("utf-8")) + + # Convert using stream with incorrect charset hint (simulating the bug) + result = markitdown.convert_stream( + file_stream, stream_info=StreamInfo(charset="ascii", extension=".txt") + ) + + # Verify that the conversion succeeded and contains the UTF-8 characters + for expected in expected_substrings: + assert expected in result.text_content, ( + f"{lang}: Expected '{expected}' not found in result" + ) + + @pytest.mark.skipif( skip_llm, reason="do not run llm tests without a key", From fb6596d6c2f89e8c36a6fba146781d53c9482773 Mon Sep 17 00:00:00 2001 From: s3ich4n Date: Thu, 22 Jan 2026 05:19:07 +0900 Subject: [PATCH 2/3] style: fix linting (#1505) --- packages/markitdown/tests/test_module_misc.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index ce72bf13e..9290d1cd4 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -95,6 +95,7 @@ "2003", # chart value ] + # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): """Validate presence or absence of specific strings.""" @@ -468,7 +469,11 @@ def test_plaintext_charset_fallback() -> None: markitdown = MarkItDown() test_cases = [ - ("Spanish", "Hola, señor! ¿Cómo está? 
Año nuevo, vida nueva.", ["señor", "¿Cómo está?", "Año"]), + ( + "Spanish", + "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.", + ["señor", "¿Cómo está?", "Año"], + ), ("Korean", "안녕하세요! 한글 테스트입니다. 가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]), ("Japanese", "こんにちは!日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]), ("Chinese", "你好!中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]), @@ -491,9 +496,9 @@ def test_plaintext_charset_fallback() -> None: # Verify that the conversion succeeded and contains the UTF-8 characters for expected in expected_substrings: - assert expected in result.text_content, ( - f"{lang}: Expected '{expected}' not found in result" - ) + assert ( + expected in result.text_content + ), f"{lang}: Expected '{expected}' not found in result" @pytest.mark.skipif( From a3adc910330883753beb37e0eb31937c34e4697c Mon Sep 17 00:00:00 2001 From: s3ich4n Date: Thu, 22 Jan 2026 05:52:10 +0900 Subject: [PATCH 3/3] chore: Include charset fallback test in `__main__` runner (#1505) --- packages/markitdown/tests/test_module_misc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 9290d1cd4..db2cbff0a 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -540,6 +540,7 @@ def test_markitdown_llm() -> None: test_exceptions, test_doc_rlink, test_markitdown_exiftool, + test_plaintext_charset_fallback, test_markitdown_llm_parameters, test_markitdown_llm, ]: