From d189eeea6f6d736d9b82238072205b4b183194fd Mon Sep 17 00:00:00 2001 From: yuchengpersonal Date: Thu, 19 Mar 2026 11:50:12 +0000 Subject: [PATCH] feat: Add support for legacy .doc file format - Add DocConverter class to convert Word 97-2003 (.doc) files - Support both textract (Python) and antiword (system) backends - Register converter in MarkItDown class - Add [doc] optional dependency in pyproject.toml - Update README.md to document .doc support Fixes #23 Co-Authored-By: yuchengpersonal --- README.md | 2 +- packages/markitdown/pyproject.toml | 2 + .../markitdown/src/markitdown/_markitdown.py | 2 + .../src/markitdown/converters/__init__.py | 2 + .../markitdown/converters/_doc_converter.py | 131 ++++++++++++++++++ 5 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 packages/markitdown/src/markitdown/converters/_doc_converter.py diff --git a/README.md b/README.md index 6da3ee1d9..58b1fbd99 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ MarkItDown currently supports the conversion from: - PDF - PowerPoint -- Word +- Word (.docx, .doc) - Excel - Images (EXIF metadata and OCR) - Audio (EXIF metadata and speech transcription) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index ac3c8d947..349df453f 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -43,6 +43,7 @@ all = [ "pdfminer.six>=20251230", "pdfplumber>=0.11.9", "olefile", + "textract", "pydub", "SpeechRecognition", "youtube-transcript-api~=1.0.0", @@ -51,6 +52,7 @@ all = [ ] pptx = ["python-pptx"] docx = ["mammoth~=1.11.0", "lxml"] +doc = ["textract"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"] diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..614022660 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ 
b/packages/markitdown/src/markitdown/_markitdown.py @@ -29,6 +29,7 @@ BingSerpConverter, PdfConverter, DocxConverter, + DocConverter, XlsxConverter, XlsConverter, PptxConverter, @@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) self.register_converter(DocxConverter()) + self.register_converter(DocConverter()) self.register_converter(XlsxConverter()) self.register_converter(XlsConverter()) self.register_converter(PptxConverter()) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..56475bdb6 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -11,6 +11,7 @@ from ._bing_serp_converter import BingSerpConverter from ._pdf_converter import PdfConverter from ._docx_converter import DocxConverter +from ._doc_converter import DocConverter from ._xlsx_converter import XlsxConverter, XlsConverter from ._pptx_converter import PptxConverter from ._image_converter import ImageConverter @@ -34,6 +35,7 @@ "BingSerpConverter", "PdfConverter", "DocxConverter", + "DocConverter", "XlsxConverter", "XlsConverter", "PptxConverter", diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py new file mode 100644 index 000000000..1a82c408f --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py @@ -0,0 +1,131 @@ +""" +Converts legacy .doc (Word 97-2003) files to Markdown. +Uses antiword or textract as the underlying conversion engine. 
+"""
+
+import os
+import sys
+import subprocess
+import shutil
+import tempfile
+from typing import BinaryIO, Any
+
+from ._plain_text_converter import PlainTextConverter
+from .._base_converter import DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import (
+    FileConversionException,
+    MissingDependencyException,
+    MISSING_DEPENDENCY_MESSAGE,
+)
+
+# Optional backend: keep the import failure so convert() can chain it.
+_dependency_exc_info = None
+try:
+    import textract
+except ImportError:
+    textract = None
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+ACCEPTED_MIME_TYPES = [
+    "application/msword",
+    "application/x-msword",
+]
+
+
+class DocConverter(PlainTextConverter):
+    """
+    Converts legacy .doc (Word 97-2003) files to Markdown.
+
+    Tries the 'textract' Python package first and falls back to the
+    'antiword' system command; at least one of the two must be available.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Backend availability is probed once, at construction time.
+        self._has_antiword = shutil.which("antiword") is not None
+        self._has_textract = textract is not None
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True for a .doc extension or a legacy Word MIME type."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+        return extension in ACCEPTED_FILE_EXTENSIONS or mimetype in ACCEPTED_MIME_TYPES
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """
+        Extract the text of a .doc stream with the first backend that works.
+
+        Raises:
+            MissingDependencyException: neither textract nor antiword is installed.
+            FileConversionException: every available backend failed on this file.
+        """
+        if not self._has_antiword and not self._has_textract:
+            # Chain the original ImportError (if any) to aid debugging.
+            cause = _dependency_exc_info[1] if _dependency_exc_info else None
+            raise MissingDependencyException(
+                "The DocConverter requires either 'antiword' system command "
+                "or 'textract' Python package to convert .doc files. "
+                "Install with: pip install textract "
+                "Or install antiword: apt-get install antiword (Debian/Ubuntu) "
+                "or brew install antiword (macOS)"
+            ) from cause
+
+        file_stream.seek(0)
+        content = file_stream.read()
+
+        # Both backends need a real path, so spool the stream to disk once.
+        # delete=False lets the file be reopened by name after it is closed.
+        tmp_file = tempfile.NamedTemporaryFile(suffix=".doc", delete=False)
+        tmp_path = tmp_file.name
+        failures = []
+        try:
+            with tmp_file:
+                tmp_file.write(content)
+
+            if self._has_textract:
+                try:
+                    text = textract.process(tmp_path).decode("utf-8")
+                    return DocumentConverterResult(markdown=text)
+                except Exception as exc:
+                    # textract raises its own error types (it may shell out to
+                    # external tools); record the failure and try antiword.
+                    failures.append(f"textract: {exc}")
+
+            if self._has_antiword:
+                try:
+                    # errors="replace": a stray byte must not abort a good run.
+                    result = subprocess.run(
+                        ["antiword", tmp_path],
+                        capture_output=True,
+                        text=True,
+                        errors="replace",
+                        check=True,
+                    )
+                    return DocumentConverterResult(markdown=result.stdout)
+                except subprocess.CalledProcessError as exc:
+                    failures.append(
+                        f"antiword failed to convert .doc file: {exc.stderr}"
+                    )
+                except OSError as exc:
+                    failures.append(f"antiword could not be executed: {exc}")
+        finally:
+            os.unlink(tmp_path)
+
+        # Every available backend was tried and failed: this is a conversion
+        # error, not a missing dependency.
+        raise FileConversionException(
+            "Failed to convert .doc file. " + " | ".join(failures)
+        )