microsoft · yuchengpersonal · Mar 19, 2026
diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ MarkItDown currently supports the conversion from:
 
 - PDF
 - PowerPoint
-- Word
+- Word (.docx, .doc)
 - Excel
 - Images (EXIF metadata and OCR)
 - Audio (EXIF metadata and speech transcription)

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -43,6 +43,7 @@ all = [
   "pdfminer.six>=20251230",
   "pdfplumber>=0.11.9",
   "olefile",
+  "textract",
   "pydub",
   "SpeechRecognition",
   "youtube-transcript-api~=1.0.0",
@@ -51,6 +52,7 @@ all = [
 ]
 pptx = ["python-pptx"]
 docx = ["mammoth~=1.11.0", "lxml"]
+doc = ["textract"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
 pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
             self.register_converter(DocxConverter())
+            self.register_converter(DocConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())

diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
@@ -34,6 +35,7 @@
     "BingSerpConverter",
     "PdfConverter",
     "DocxConverter",
+    "DocConverter",
     "XlsxConverter",
     "XlsConverter",
     "PptxConverter",

diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,146 @@
+"""
+Converts legacy .doc (Word 97-2003) files to Markdown.
+Uses antiword or textract as the underlying conversion engine.
+"""
+
+import os
+import sys
+import subprocess
+import shutil
+import tempfile
+from typing import BinaryIO, Any
+
+from ._plain_text_converter import PlainTextConverter
+from .._base_converter import DocumentConverterResult
+from .._stream_info import StreamInfo
+from .._exceptions import (
+    FileConversionException,
+    MissingDependencyException,
+    MISSING_DEPENDENCY_MESSAGE,
+)
+
+# Try loading optional dependencies. The import failure is kept so it can be
+# chained onto the MissingDependencyException raised at conversion time.
+_dependency_exc_info = None
+try:
+    import textract
+except ImportError:
+    textract = None
+    _dependency_exc_info = sys.exc_info()
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+ACCEPTED_MIME_TYPES = [
+    "application/msword",
+    "application/x-msword",
+]
+
+
+class DocConverter(PlainTextConverter):
+    """
+    Converts legacy .doc (Word 97-2003) files to Markdown.
+    Requires either 'antiword' system command or 'textract' Python package.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Backend availability is probed once, when the converter is created.
+        self._has_antiword = shutil.which("antiword") is not None
+        self._has_textract = textract is not None
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True when the extension or MIME type identifies a .doc file."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        if mimetype in ACCEPTED_MIME_TYPES:
+            return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """
+        Extract the text of a .doc stream and return it as the Markdown body.
+
+        textract is tried first; antiword is used when textract is missing or
+        fails.
+
+        Raises:
+            MissingDependencyException: neither textract nor antiword is available.
+            FileConversionException: every available backend failed on this file.
+        """
+        # Check for available conversion methods
+        if not self._has_antiword and not self._has_textract:
+            cause = _dependency_exc_info[1] if _dependency_exc_info else None
+            raise MissingDependencyException(
+                "The DocConverter requires either 'antiword' system command "
+                "or 'textract' Python package to convert .doc files. "
+                "Install with: pip install textract "
+                "Or install antiword: apt-get install antiword (Debian/Ubuntu) "
+                "or brew install antiword (macOS)"
+            ) from cause
+
+        # Read file content
+        file_stream.seek(0)
+        content = file_stream.read()
+
+        # Both backends need a path on disk, so the data is written once and
+        # shared. delete=False lets the file be closed before a backend reopens
+        # it; the finally block removes it even if writing or conversion fails.
+        tmp_file = tempfile.NamedTemporaryFile(suffix=".doc", delete=False)
+        try:
+            with tmp_file:
+                tmp_file.write(content)
+            text = self._extract_text(tmp_file.name)
+        finally:
+            os.unlink(tmp_file.name)
+
+        return DocumentConverterResult(markdown=text)
+
+    def _extract_text(self, path: str) -> str:
+        """Run the available backends in order and return the first result."""
+        textract_error = None
+
+        # Try textract first. Its .doc parser is documented as using antiword
+        # under the hood, so a failure here can mean that binary is missing.
+        if self._has_textract:
+            try:
+                return textract.process(path).decode("utf-8")
+            except Exception as exc:
+                # Keep the error instead of discarding it: it is reported below
+                # when antiword cannot take over.
+                textract_error = exc
+
+        # Try antiword as fallback
+        if self._has_antiword:
+            try:
+                result = subprocess.run(
+                    ["antiword", path],
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+            except subprocess.CalledProcessError as exc:
+                # A non-zero exit means this file could not be converted, not
+                # that a dependency is missing.
+                raise FileConversionException(
+                    f"antiword failed to convert .doc file: {exc.stderr}"
+                ) from exc
+            return result.stdout
+
+        # Only reached when textract was the sole backend and it failed.
+        raise FileConversionException(
+            f"textract failed to convert .doc file: {textract_error}"
+        ) from textract_error