From f371936648c708941a004b1bc9d4203dc8a00ec2 Mon Sep 17 00:00:00 2001
From: s3ich4n <s3ich4n@gmail.com>
Date: Thu, 22 Jan 2026 05:10:19 +0900
Subject: [PATCH 1/3] fix: Handle UnicodeDecodeError in PlainTextConverter
 (#1505)

When charset detection samples only the first 4096 bytes and detects 'ascii',
but the file contains UTF-8 characters beyond that point,
decoding fails with UnicodeDecodeError.

Added fallback to charset_normalizer when UnicodeDecodeError occurs,
allowing proper handling of files with non-ASCII characters
(Spanish, Korean, Japanese, Chinese, etc.)
that appear after the 4096-byte sample.
---
 .../converters/_plain_text_converter.py       | 11 ++++-
 packages/markitdown/tests/test_module_misc.py | 42 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
index 6f1306fe8..aa51823ae 100644
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -63,9 +63,16 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        file_bytes = file_stream.read()
+
         if stream_info.charset:
-            text_content = file_stream.read().decode(stream_info.charset)
+            try:
+                text_content = file_bytes.decode(stream_info.charset)
+            except UnicodeDecodeError:
+                # Charset detection from partial file content may be inaccurate.
+                # Fall back to charset_normalizer for the full file content.
+                text_content = str(from_bytes(file_bytes).best())
         else:
-            text_content = str(from_bytes(file_stream.read()).best())
+            text_content = str(from_bytes(file_bytes).best())
 
         return DocumentConverterResult(markdown=text_content)
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..ce72bf13e 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -95,7 +95,6 @@
     "2003",  # chart value
 ]
 
-
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
     """Validate presence or absence of specific strings."""
@@ -456,6 +455,47 @@ def test_markitdown_llm_parameters() -> None:
     assert messages[0]["content"][0]["text"] == test_prompt
 
 
+def test_plaintext_charset_fallback() -> None:
+    """
+    Test for GitHub issue #1505: PlainTextConverter throws UnicodeDecodeError
+    when charset detection from partial file content is inaccurate.
+
+    When the first 4096 bytes are ASCII-only but later bytes contain UTF-8
+    characters (e.g., accented or CJK characters), the charset may be incorrectly
+    detected as 'ascii'. The converter should fall back to charset_normalizer
+    when decoding fails.
+    """
+    markitdown = MarkItDown()
+
+    test_cases = [
+        ("Spanish", "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.", ["señor", "¿Cómo está?", "Año"]),
+        ("Korean", "안녕하세요! 한글 테스트입니다. 가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]),
+        ("Japanese", "こんにちは！日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]),
+        ("Chinese", "你好！中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]),
+    ]
+
+    for lang, utf8_text, expected_substrings in test_cases:
+        # Create a test file where:
+        # - First 4100 bytes are ASCII (exceeds the 4096 byte sample for charset detection)
+        # - Followed by UTF-8 encoded non-ASCII characters
+        ascii_part = "A" * 4100
+        test_content = ascii_part + utf8_text
+
+        # Use BytesIO to simulate a file stream
+        file_stream = io.BytesIO(test_content.encode("utf-8"))
+
+        # Convert using stream with incorrect charset hint (simulating the bug)
+        result = markitdown.convert_stream(
+            file_stream, stream_info=StreamInfo(charset="ascii", extension=".txt")
+        )
+
+        # Verify that the conversion succeeded and contains the UTF-8 characters
+        for expected in expected_substrings:
+            assert expected in result.text_content, (
+                f"{lang}: Expected '{expected}' not found in result"
+            )
+
+
 @pytest.mark.skipif(
     skip_llm,
     reason="do not run llm tests without a key",

From fb6596d6c2f89e8c36a6fba146781d53c9482773 Mon Sep 17 00:00:00 2001
From: s3ich4n <s3ich4n@gmail.com>
Date: Thu, 22 Jan 2026 05:19:07 +0900
Subject: [PATCH 2/3] style: fix linting (#1505)

---
 packages/markitdown/tests/test_module_misc.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index ce72bf13e..9290d1cd4 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -95,6 +95,7 @@
     "2003",  # chart value
 ]
 
+
 # --- Helper Functions ---
 def validate_strings(result, expected_strings, exclude_strings=None):
     """Validate presence or absence of specific strings."""
@@ -468,7 +469,11 @@ def test_plaintext_charset_fallback() -> None:
     markitdown = MarkItDown()
 
     test_cases = [
-        ("Spanish", "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.", ["señor", "¿Cómo está?", "Año"]),
+        (
+            "Spanish",
+            "Hola, señor! ¿Cómo está? Año nuevo, vida nueva.",
+            ["señor", "¿Cómo está?", "Año"],
+        ),
         ("Korean", "안녕하세요! 한글 테스트입니다. 가나다라마바사", ["안녕하세요", "한글", "가나다라마바사"]),
         ("Japanese", "こんにちは！日本語テストです。あいうえお", ["こんにちは", "日本語", "あいうえお"]),
         ("Chinese", "你好！中文测试。这是一个测试文件。", ["你好", "中文测试", "测试文件"]),
@@ -491,9 +496,9 @@ def test_plaintext_charset_fallback() -> None:
 
         # Verify that the conversion succeeded and contains the UTF-8 characters
         for expected in expected_substrings:
-            assert expected in result.text_content, (
-                f"{lang}: Expected '{expected}' not found in result"
-            )
+            assert (
+                expected in result.text_content
+            ), f"{lang}: Expected '{expected}' not found in result"
 
 
 @pytest.mark.skipif(

From a3adc910330883753beb37e0eb31937c34e4697c Mon Sep 17 00:00:00 2001
From: s3ich4n <s3ich4n@gmail.com>
Date: Thu, 22 Jan 2026 05:52:10 +0900
Subject: [PATCH 3/3] chore: Include charset fallback test in `__main__` runner
 (#1505)

---
 packages/markitdown/tests/test_module_misc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 9290d1cd4..db2cbff0a 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -540,6 +540,7 @@ def test_markitdown_llm() -> None:
         test_exceptions,
         test_doc_rlink,
         test_markitdown_exiftool,
+        test_plaintext_charset_fallback,
         test_markitdown_llm_parameters,
         test_markitdown_llm,
     ]: