From de5dd1ae8d9f59e3ba1a88764d39564440019f16 Mon Sep 17 00:00:00 2001
From: mizuresort
Date: Wed, 14 Jan 2026 16:22:24 +0900
Subject: [PATCH] Fix ZIP filename encoding for Chinese and Japanese files

---
 .../markitdown/converters/_zip_converter.py   | 51 +++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py
index f87e6c89..46597077 100644
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -12,6 +12,40 @@
 if TYPE_CHECKING:
     from .._markitdown import MarkItDown
 
+
+def normalize_zip_filename(name: str) -> str:
+    """Repair a ZIP member name that was mis-decoded as CP437.
+
+    ``zipfile`` decodes member names as CP437 unless the entry's UTF-8 flag
+    is set, so names stored in UTF-8 or a legacy CJK code page without that
+    flag come out garbled. Re-encode the name to its original bytes and
+    return the first candidate decoding that succeeds. This is a best-effort
+    guess: the candidate encodings overlap, so list order decides ties.
+
+    Args:
+        name: Member name as reported by ``zipfile``.
+
+    Returns:
+        The re-decoded name, or ``name`` unchanged when it cannot be encoded
+        as CP437 or when no candidate encoding accepts the bytes.
+    """
+    try:
+        raw = name.encode("cp437")
+    except UnicodeEncodeError:
+        # str.encode() raises UnicodeEncodeError, not UnicodeDecodeError.
+        # A name with characters outside CP437 was not decoded from CP437,
+        # so there is nothing to repair.
+        return name
+
+    for enc in ("utf-8", "cp932", "gbk", "big5"):
+        try:
+            return raw.decode(enc)
+        except UnicodeDecodeError:
+            continue
+
+    return name
+
+
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "application/zip",
 ]
@@ -94,9 +128,22 @@ def convert(
         md_content = f"Content from the zip file `{file_path}`:\n\n"
 
         with zipfile.ZipFile(file_stream, "r") as zipObj:
-            for name in zipObj.namelist():
+            for info in zipObj.infolist():
+                # zipfile already decodes names as UTF-8 when general-purpose
+                # flag bit 11 (0x800) is set; only unflagged names, which it
+                # decodes as CP437, can need repair.
+                if info.flag_bits & 0x800:
+                    name = info.filename
+                else:
+                    name = normalize_zip_filename(info.filename)
+
+                # Skip macOS metadata: the "__MACOSX/" tree and AppleDouble
+                # "._*" resource-fork files at any depth.
+                if name.startswith(("__MACOSX/", "._")) or "/._" in name:
+                    continue
+
                 try:
-                    z_file_stream = io.BytesIO(zipObj.read(name))
+                    z_file_stream = io.BytesIO(zipObj.read(info))
                     z_file_stream_info = StreamInfo(
                         extension=os.path.splitext(name)[1],
                         filename=os.path.basename(name),