Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BingSerpConverter,
PdfConverter,
DocxConverter,
DocConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
Expand Down Expand Up @@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(DocConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
Expand Down
1 change: 1 addition & 0 deletions packages/markitdown/src/markitdown/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._doc_converter import DocConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
Expand Down
107 changes: 107 additions & 0 deletions packages/markitdown/src/markitdown/converters/_doc_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import sys
import subprocess
import tempfile
import os
from typing import BinaryIO, Any

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._stream_info import StreamInfo

# MIME type prefixes (compared lower-cased, by prefix) that mark a stream as
# a legacy Word document.
ACCEPTED_MIME_TYPE_PREFIXES = ["application/msword", "application/vnd.ms-word"]

# File extensions (lower-cased, including the leading dot) handled here.
ACCEPTED_FILE_EXTENSIONS = [
    ".doc",
]

# External command-line text extractors, tried in this order.
_TOOLS = [
    "antiword",
    "catdoc",
]


def _get_available_tool() -> str | None:
"""Return the first available command-line tool for .doc conversion."""
for tool in _TOOLS:
try:
subprocess.run(
[tool, "--help"],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=5,
)
return tool
except (FileNotFoundError, subprocess.TimeoutExpired):
continue
return None


class DocConverter(DocumentConverter):
    """
    Converts legacy .doc (Word 97-2003) files to Markdown plain text.

    The conversion is delegated to an external command-line tool; the text
    the tool prints on stdout is returned as-is (stripped) as the Markdown.

    Requires one of the following system tools to be installed:
    - antiword (https://www.winfield.demon.nl/)
    - catdoc (https://www.wagner.pp.ru/~vitus/software/catdoc/)

    On macOS: ``brew install antiword`` or ``brew install catdoc``
    On Ubuntu/Debian: ``apt install antiword`` or ``apt install catdoc``
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream looks like a legacy .doc file.

        A stream is accepted if its extension is listed in
        ``ACCEPTED_FILE_EXTENSIONS`` or its MIME type starts with one of
        ``ACCEPTED_MIME_TYPE_PREFIXES``. Both comparisons are
        case-insensitive; ``file_stream`` itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # str.startswith accepts a tuple, so one call covers every prefix.
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Extract the text of a .doc stream with antiword or catdoc.

        Args:
            file_stream: Binary stream positioned at the start of the
                document; it is read to the end.
            stream_info: Metadata for the stream; only ``extension`` is used,
                as the suffix of the temporary file handed to the tool.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A ``DocumentConverterResult`` whose markdown is the tool's
            stripped stdout (possibly an empty string).

        Raises:
            MissingDependencyException: Neither antiword nor catdoc is
                available.
            ValueError: The tool exited with a non-zero status and produced
                no output.
            subprocess.TimeoutExpired: The tool ran longer than 30 seconds.
        """
        tool = _get_available_tool()
        if tool is None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".doc",
                    feature="doc",
                )
                + "\n\nInstall antiword or catdoc:\n"
                " macOS: brew install antiword\n"
                " Ubuntu/Debian: apt install antiword"
            )

        # antiword/catdoc take a file path, so the stream is spooled to a
        # temporary file. delete=False lets the file be closed before the
        # tool opens it; removal is done explicitly in the finally below.
        suffix = stream_info.extension or ".doc"
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp_path = tmp.name
        try:
            # The write happens inside the cleanup scope so that a failing
            # read()/write() does not leave the temporary file behind.
            with tmp:
                tmp.write(file_stream.read())

            result = subprocess.run(
                [tool, tmp_path],
                capture_output=True,
                text=True,
                # Bytes that are invalid in the locale encoding are replaced
                # rather than aborting the conversion with a decode error.
                errors="replace",
                timeout=30,
            )
            text = result.stdout.strip()
            # Output accompanied by a non-zero exit status is still returned
            # (best effort); only a failure with no output is an error.
            if not text and result.returncode != 0:
                raise ValueError(
                    f"{tool} failed (exit {result.returncode}): {result.stderr.strip()}"
                )
        finally:
            os.unlink(tmp_path)

        return DocumentConverterResult(markdown=text)