Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ MarkItDown currently supports the conversion from:

- PDF
- PowerPoint
- Word
- Word (.docx, .doc)
- Excel
- Images (EXIF metadata and OCR)
- Audio (EXIF metadata and speech transcription)
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ all = [
"pdfminer.six>=20251230",
"pdfplumber>=0.11.9",
"olefile",
"textract",
"pydub",
"SpeechRecognition",
"youtube-transcript-api~=1.0.0",
Expand All @@ -51,6 +52,7 @@ all = [
]
pptx = ["python-pptx"]
docx = ["mammoth~=1.11.0", "lxml"]
doc = ["textract"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BingSerpConverter,
PdfConverter,
DocxConverter,
DocConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
Expand Down Expand Up @@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(DocConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
Expand Down
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._doc_converter import DocConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
Expand All @@ -34,6 +35,7 @@
"BingSerpConverter",
"PdfConverter",
"DocxConverter",
"DocConverter",
"XlsxConverter",
"XlsConverter",
"PptxConverter",
Expand Down
131 changes: 131 additions & 0 deletions packages/markitdown/src/markitdown/converters/_doc_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""
Converts legacy .doc (Word 97-2003) files to Markdown.
Uses antiword or textract as the underlying conversion engine.
"""

import sys
import subprocess
import shutil
from typing import BinaryIO, Any

from ._plain_text_converter import PlainTextConverter
from .._base_converter import DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

# textract is optional: bind the module when it can be imported, otherwise
# bind the name to None so its absence can be detected later with a simple
# `textract is not None` check.
_dependency_exc_info = None
try:
    import textract
except ImportError:
    textract = None

# MIME types and file extensions that identify a legacy Word (.doc) document.
ACCEPTED_MIME_TYPES = ["application/msword", "application/x-msword"]
ACCEPTED_FILE_EXTENSIONS = [".doc"]


class DocConverter(PlainTextConverter):
    """
    Converts legacy .doc (Word 97-2003) files to Markdown.

    The document is extracted as plain text by one of two backends, tried in
    this order:

    1. the 'textract' Python package, when it is importable;
    2. the 'antiword' system command, when it is found on PATH.

    Backend availability is probed once, when the converter is constructed.
    """

    def __init__(self) -> None:
        super().__init__()
        # Probe both backends up front so convert() can fail fast with a
        # helpful message when neither is installed.
        self._has_antiword = shutil.which("antiword") is not None
        self._has_textract = textract is not None

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """
        Return True when the stream looks like a legacy Word document.

        The decision is based only on ``stream_info``: a case-insensitive
        match of its extension against ACCEPTED_FILE_EXTENSIONS or of its
        MIME type against ACCEPTED_MIME_TYPES. The stream is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        return (
            extension in ACCEPTED_FILE_EXTENSIONS
            or mimetype in ACCEPTED_MIME_TYPES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a .doc stream to a DocumentConverterResult.

        The stream is rewound and read in full, spooled to a temporary
        ``.doc`` file (both backends need a file path), and handed to the
        available backends. The temporary file is always removed, whether
        conversion succeeds or fails.

        Raises:
            MissingDependencyException: if neither backend is installed, or
                if every installed backend failed to convert the document.
                The underlying backend error is attached as ``__cause__``.
        """
        # Check for available conversion methods
        if not self._has_antiword and not self._has_textract:
            raise MissingDependencyException(
                "The DocConverter requires either 'antiword' system command "
                "or 'textract' Python package to convert .doc files. "
                "Install with: pip install textract "
                "Or install antiword: apt-get install antiword (Debian/Ubuntu) "
                "or brew install antiword (macOS)"
            )

        import os
        import tempfile

        # Read file content
        file_stream.seek(0)
        content = file_stream.read()

        # Both backends take a path, so write the bytes to disk exactly once.
        # delete=False lets the file be reopened by another process after it
        # is closed; the finally clause below owns the cleanup, including the
        # case where the write itself fails.
        tmp_file = tempfile.NamedTemporaryFile(suffix=".doc", delete=False)
        tmp_path = tmp_file.name
        try:
            with tmp_file:
                tmp_file.write(content)
            return self._convert_path(tmp_path)
        finally:
            os.unlink(tmp_path)

    def _convert_path(self, path: str) -> DocumentConverterResult:
        """Run the available backends, in priority order, on the file at *path*."""
        textract_error = None

        # Try textract first.
        # NOTE(review): textract's own .doc handling may itself rely on the
        # antiword binary - confirm before assuming it works without it.
        if self._has_textract:
            try:
                text = textract.process(path).decode("utf-8")
            except Exception as exc:
                # Deliberate best-effort: remember the failure and fall back
                # to antiword instead of aborting the conversion.
                textract_error = exc
            else:
                return DocumentConverterResult(markdown=text)

        # Try antiword as fallback
        if self._has_antiword:
            try:
                result = subprocess.run(
                    ["antiword", path],
                    capture_output=True,
                    text=True,
                    check=True,
                )
            except subprocess.CalledProcessError as e:
                raise MissingDependencyException(
                    f"antiword failed to convert .doc file: {e.stderr}"
                ) from e
            return DocumentConverterResult(markdown=result.stdout)

        # textract was installed but failed and there is no fallback: report
        # the real cause rather than claiming that nothing is installed.
        if textract_error is not None:
            raise MissingDependencyException(
                f"textract failed to convert .doc file: {textract_error}. "
                "Install antiword to enable the fallback converter: "
                "apt-get install antiword (Debian/Ubuntu) "
                "or brew install antiword (macOS)"
            ) from textract_error

        # Should not reach here, but just in case
        raise MissingDependencyException(
            "No available method to convert .doc files. "
            "Please install textract: pip install textract"
        )