diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..614022660 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -29,6 +29,7 @@
     BingSerpConverter,
     PdfConverter,
     DocxConverter,
+    DocConverter,
     XlsxConverter,
     XlsConverter,
     PptxConverter,
@@ -192,6 +193,7 @@ def enable_builtins(self, **kwargs) -> None:
             self.register_converter(YouTubeConverter())
             self.register_converter(BingSerpConverter())
             self.register_converter(DocxConverter())
+            self.register_converter(DocConverter())
             self.register_converter(XlsxConverter())
             self.register_converter(XlsConverter())
             self.register_converter(PptxConverter())
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index e4437a582..9e63e73d1 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -11,6 +11,7 @@
 from ._bing_serp_converter import BingSerpConverter
 from ._pdf_converter import PdfConverter
 from ._docx_converter import DocxConverter
+from ._doc_converter import DocConverter
 from ._xlsx_converter import XlsxConverter, XlsConverter
 from ._pptx_converter import PptxConverter
 from ._image_converter import ImageConverter
diff --git a/packages/markitdown/src/markitdown/converters/_doc_converter.py b/packages/markitdown/src/markitdown/converters/_doc_converter.py
new file mode 100644
index 000000000..67ca4d5cd
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converters/_doc_converter.py
@@ -0,0 +1,137 @@
+import sys
+import subprocess
+import tempfile
+import os
+import shutil
+from typing import BinaryIO, Any
+
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._stream_info import StreamInfo
+
+# MIME type prefixes that identify legacy Word documents.
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/msword",
+    "application/vnd.ms-word",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".doc"]
+
+# Command-line extractors, tried in this order.
+_TOOLS = ["antiword", "catdoc"]
+
+
+def _get_available_tool() -> str | None:
+    """Return the name of the first .doc extraction tool found on PATH.
+
+    Tools are tried in the order given by ``_TOOLS``. ``shutil.which`` is used
+    so that no child process is spawned merely to probe for a tool, and a
+    matching file that is not executable is skipped rather than attempted.
+
+    Returns:
+        The tool name (for example ``"antiword"``), or ``None`` when none of
+        the tools is available.
+    """
+    for tool in _TOOLS:
+        if shutil.which(tool) is not None:
+            return tool
+    return None
+
+
+class DocConverter(DocumentConverter):
+    """
+    Converts legacy .doc (Word 97-2003) files to Markdown plain text.
+
+    Requires one of the following system tools to be installed:
+      - antiword (https://www.winfield.demon.nl/)
+      - catdoc (https://www.wagner.pp.ru/~vitus/software/catdoc/)
+
+    On macOS: ``brew install antiword`` or ``brew install catdoc``
+    On Ubuntu/Debian: ``apt install antiword`` or ``apt install catdoc``
+    """
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        """Return True when the stream metadata indicates a legacy .doc file.
+
+        Only ``stream_info`` is inspected: the extension must be ``.doc`` or
+        the MIME type must start with one of ``ACCEPTED_MIME_TYPE_PREFIXES``
+        (both compared case-insensitively). ``file_stream`` is not read.
+        """
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        """Extract the text of a .doc stream using antiword or catdoc.
+
+        The stream is copied to a temporary file because both tools need a
+        filesystem path; that file is removed again whether or not the
+        extraction succeeds.
+
+        Raises:
+            MissingDependencyException: Neither antiword nor catdoc is on PATH.
+            ValueError: The tool printed nothing and exited with a non-zero
+                status.
+            subprocess.TimeoutExpired: The tool ran for more than 30 seconds.
+        """
+        tool = _get_available_tool()
+        if tool is None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".doc",
+                    feature="doc",
+                )
+                + "\n\nInstall antiword or catdoc:\n"
+                " macOS: brew install antiword\n"
+                " Ubuntu/Debian: apt install antiword"
+            )
+
+        # antiword/catdoc only accept a filesystem path, so spool the stream to
+        # a temporary file first.
+        suffix = stream_info.extension or ".doc"
+        tmp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+                # Record the path before copying so the finally clause can
+                # remove the file even if reading the stream fails.
+                tmp_path = tmp.name
+                shutil.copyfileobj(file_stream, tmp)
+
+            result = subprocess.run(
+                [tool, tmp_path],
+                capture_output=True,
+                text=True,
+                # Undecodable bytes in the tool output are replaced instead of
+                # aborting the conversion with UnicodeDecodeError.
+                errors="replace",
+                timeout=30,
+            )
+            text = result.stdout.strip()
+            if not text and result.returncode != 0:
+                raise ValueError(
+                    f"{tool} failed (exit {result.returncode}): {result.stderr.strip()}"
+                )
+        finally:
+            if tmp_path is not None:
+                os.unlink(tmp_path)
+
+        return DocumentConverterResult(markdown=text)