@@ -63,9 +63,16 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        file_bytes = file_stream.read()
+
         if stream_info.charset:
-            text_content = file_stream.read().decode(stream_info.charset)
+            try:
+                text_content = file_bytes.decode(stream_info.charset)
+            except UnicodeDecodeError:
+                # Charset detection from partial file content may be inaccurate.
+                # Fall back to charset_normalizer for the full file content.
+                text_content = str(from_bytes(file_bytes).best())
         else:
-            text_content = str(from_bytes(file_stream.read()).best())
+            text_content = str(from_bytes(file_bytes).best())
 
         return DocumentConverterResult(markdown=text_content)
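
In isolation, the decode-then-fall-back pattern this hunk introduces can be sketched as a small standalone helper. This is an illustrative sketch, not the converter's actual code: the decode_text name is hypothetical, and it assumes charset_normalizer is installed (from_bytes(...).best() is the same call the converter itself uses).

from charset_normalizer import from_bytes

def decode_text(file_bytes: bytes, charset: str | None) -> str:
    # Hypothetical helper mirroring the patched converter's behaviour.
    if charset:
        try:
            # Honor the hinted charset when it actually works.
            return file_bytes.decode(charset)
        except UnicodeDecodeError:
            # The hint may have been sniffed from only part of the file;
            # re-detect the encoding from the full byte content instead.
            return str(from_bytes(file_bytes).best())
    # No hint at all: detect from the full content directly.
    return str(from_bytes(file_bytes).best())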
packages/markitdown/tests/test_module_misc.py (46 additions, 0 deletions)
@@ -456,6 +456,51 @@ def test_markitdown_llm_parameters() -> None:
     assert messages[0]["content"][0]["text"] == test_prompt
 
 
+def test_plaintext_charset_fallback() -> None:
+    """
+    Test for GitHub issue #1505: PlainTextConverter throws UnicodeDecodeError
+    when charset detection from partial file content is inaccurate.
+
+    When the first 4096 bytes are ASCII-only but later bytes contain UTF-8
+    characters (e.g., accented or CJK characters), the charset may be incorrectly
+    detected as 'ascii'. The converter should fall back to charset_normalizer
+    when decoding fails.
+    """
+    markitdown = MarkItDown()
+
+    test_cases = [
+        (
+            "Spanish",
+            "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.",
+            ["señor", "¿Cómo está?", "Año"],
+        ),
+        ("Korean", "안녕하세요! 한글 테스트입니다. 가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]),
+        ("Japanese", "こんにちは!日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]),
+        ("Chinese", "你好!中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]),
+    ]
+
+    for lang, utf8_text, expected_substrings in test_cases:
+        # Create a test file where:
+        # - First 4100 bytes are ASCII (exceeds the 4096 byte sample for charset detection)
+        # - Followed by UTF-8 encoded non-ASCII characters
+        ascii_part = "A" * 4100
+        test_content = ascii_part + utf8_text
+
+        # Use BytesIO to simulate a file stream
+        file_stream = io.BytesIO(test_content.encode("utf-8"))
+
+        # Convert using stream with incorrect charset hint (simulating the bug)
+        result = markitdown.convert_stream(
+            file_stream, stream_info=StreamInfo(charset="ascii", extension=".txt")
+        )
+
+        # Verify that the conversion succeeded and contains the UTF-8 characters
+        for expected in expected_substrings:
+            assert (
+                expected in result.text_content
+            ), f"{lang}: Expected '{expected}' not found in result"
+
+
 @pytest.mark.skipif(
     skip_llm,
     reason="do not run llm tests without a key",
@@ -495,6 +540,7 @@ def test_markitdown_llm() -> None:
         test_exceptions,
         test_doc_rlink,
         test_markitdown_exiftool,
+        test_plaintext_charset_fallback,
         test_markitdown_llm_parameters,
         test_markitdown_llm,
     ]:
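
For reviewers who want to reproduce the failure mode outside the test suite, the sketch below mirrors the fixture used in test_plaintext_charset_fallback: 4100 ASCII bytes followed by UTF-8 text, converted with a deliberately wrong charset hint. It assumes markitdown (with this patch) and charset_normalizer are installed and that StreamInfo is importable from the top-level markitdown package, as in the test module; the encoding detected on the truncated sample may vary by charset_normalizer version.

import io
from charset_normalizer import from_bytes
from markitdown import MarkItDown, StreamInfo

# 4100 ASCII bytes followed by UTF-8 text, mirroring the test fixture.
payload = ("A" * 4100 + "Hola, señor! ¿Cómo está?").encode("utf-8")

# Detection on a truncated sample sees only ASCII, so the guessed charset
# cannot decode the full payload on its own.
guess = from_bytes(payload[:4096]).best()
print(guess.encoding if guess else None)  # typically 'ascii'

# With the patched converter, a wrong charset hint no longer raises
# UnicodeDecodeError; the converter falls back to charset_normalizer.
result = MarkItDown().convert_stream(
    io.BytesIO(payload), stream_info=StreamInfo(charset="ascii", extension=".txt")
)
print("señor" in result.text_content)  # True after the fix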