diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
index 6f1306fe..aa51823a 100644
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -63,9 +63,16 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        file_bytes = file_stream.read()
+
         if stream_info.charset:
-            text_content = file_stream.read().decode(stream_info.charset)
+            try:
+                text_content = file_bytes.decode(stream_info.charset)
+            except UnicodeDecodeError:
+                # Charset detection from partial file content may be inaccurate.
+                # Fall back to charset_normalizer for the full file content.
+                text_content = str(from_bytes(file_bytes).best())
         else:
-            text_content = str(from_bytes(file_stream.read()).best())
+            text_content = str(from_bytes(file_bytes).best())
 
         return DocumentConverterResult(markdown=text_content)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23..db2cbff0 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -456,6 +456,51 @@ def test_markitdown_llm_parameters() -> None:
     assert messages[0]["content"][0]["text"] == test_prompt
 
 
+def test_plaintext_charset_fallback() -> None:
+    """
+    Test for GitHub issue #1505: PlainTextConverter throws UnicodeDecodeError
+    when charset detection from partial file content is inaccurate.
+
+    When the first 4096 bytes are ASCII-only but later bytes contain UTF-8
+    characters (e.g., accented or CJK characters), the charset may be incorrectly
+    detected as 'ascii'. The converter should fall back to charset_normalizer
+    when decoding fails.
+    """
+    markitdown = MarkItDown()
+
+    test_cases = [
+        (
+            "Spanish",
+            "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.",
+            ["señor", "¿Cómo está?", "Año"],
+        ),
+        ("Korean", "안녕하세요! 한글 테스트입니다. 가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]),
+        ("Japanese", "こんにちは!日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]),
+        ("Chinese", "你好!中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]),
+    ]
+
+    for lang, utf8_text, expected_substrings in test_cases:
+        # Create a test file where:
+        # - First 4100 bytes are ASCII (exceeds the 4096 byte sample for charset detection)
+        # - Followed by UTF-8 encoded non-ASCII characters
+        ascii_part = "A" * 4100
+        test_content = ascii_part + utf8_text
+
+        # Use BytesIO to simulate a file stream
+        file_stream = io.BytesIO(test_content.encode("utf-8"))
+
+        # Convert using stream with incorrect charset hint (simulating the bug)
+        result = markitdown.convert_stream(
+            file_stream, stream_info=StreamInfo(charset="ascii", extension=".txt")
+        )
+
+        # Verify that the conversion succeeded and contains the UTF-8 characters
+        for expected in expected_substrings:
+            assert (
+                expected in result.text_content
+            ), f"{lang}: Expected '{expected}' not found in result"
+
+
 @pytest.mark.skipif(
     skip_llm,
     reason="do not run llm tests without a key",
@@ -495,6 +540,7 @@ def test_markitdown_llm() -> None:
         test_exceptions,
         test_doc_rlink,
         test_markitdown_exiftool,
+        test_plaintext_charset_fallback,
         test_markitdown_llm_parameters,
         test_markitdown_llm,
     ]:
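
A minimal reproduction sketch of the behavior this patch addresses, assuming the `MarkItDown`, `StreamInfo`, and `convert_stream` API exactly as exercised by the test above; it is not part of the patch itself. The input mirrors the test layout: more than 4096 ASCII bytes followed by UTF-8 text, so a charset sniff over the first 4096 bytes sees only ASCII.

```python
# Reproduction sketch for issue #1505 (assumes the markitdown public API
# used in the test above; not part of the patch).
import io

from markitdown import MarkItDown, StreamInfo

# More than 4096 ASCII bytes, then UTF-8 text, so a charset sniff over the
# first 4096 bytes reports only ASCII.
payload = ("A" * 4100 + "Hola, señor! ¿Cómo está?").encode("utf-8")

result = MarkItDown().convert_stream(
    io.BytesIO(payload),
    stream_info=StreamInfo(charset="ascii", extension=".txt"),
)

# Before the patch, decoding with the "ascii" hint raised UnicodeDecodeError
# inside PlainTextConverter; with the patch, decoding falls back to
# charset_normalizer over the full byte content and the non-ASCII text survives.
assert "señor" in result.text_content
```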