diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..93c1fb893 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -110,6 +110,12 @@ def main():
         help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
     )
 
+    parser.add_argument(
+        "--password",
+        type=str,
+        help="Password for opening password-protected files (e.g., encrypted PDFs).",
+    )
+
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
 
@@ -186,15 +192,20 @@ def main():
     else:
         markitdown = MarkItDown(enable_plugins=args.use_plugins)
 
+    # Forward --password only when given; converters never see password=None.
+    convert_kwargs: dict = {"keep_data_uris": args.keep_data_uris}
+    if args.password is not None:
+        convert_kwargs["password"] = args.password
+
     if args.filename is None:
         result = markitdown.convert_stream(
             sys.stdin.buffer,
             stream_info=stream_info,
-            keep_data_uris=args.keep_data_uris,
+            **convert_kwargs,
         )
     else:
         result = markitdown.convert(
-            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+            args.filename, stream_info=stream_info, **convert_kwargs
         )
 
     _handle_output(args, result)
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
index ffbcbd990..458dd6af3 100644
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -5,7 +5,11 @@
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
-from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+from .._exceptions import (
+    MissingDependencyException,
+    FileConversionException,
+    MISSING_DEPENDENCY_MESSAGE,
+)
 
 # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
 PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
@@ -62,6 +66,7 @@ def _merge_partial_numbering_lines(text: str) -> str:
 try:
     import pdfminer
     import pdfminer.high_level
+    from pdfminer.pdfdocument import PDFPasswordIncorrect
     import pdfplumber
 except ImportError:
     _dependency_exc_info = sys.exc_info()
@@ -539,6 +544,29 @@ def convert(
         # Read file stream into BytesIO for compatibility with pdfplumber
         pdf_bytes = io.BytesIO(file_stream.read())
 
+        # Extract password from kwargs (for password-protected PDFs)
+        password = kwargs.get("password")
+
+        # pdfplumber and pdfminer both accept a `password` keyword. Omit it
+        # when no password was supplied so each library keeps its own default.
+        password_kwargs: dict[str, Any] = (
+            {} if password is None else {"password": password}
+        )
+
+        # Error text for pdfminer's PDFPasswordIncorrect, which is also raised
+        # when no password was supplied (pdfminer then tries an empty one).
+        if password is None:
+            password_error = (
+                "The PDF is password-protected and no password was provided. "
+                "Use the 'password' parameter to supply the password."
+            )
+        else:
+            password_error = (
+                "The PDF is password-protected and the provided password is "
+                "incorrect. Use the 'password' parameter to supply the correct "
+                "password."
+            )
+
         try:
             # Single pass: check every page for form-style content.
             # Pages with tables/forms get rich extraction; plain-text
@@ -549,7 +577,7 @@ def convert(
             form_page_count = 0
             plain_page_indices: list[int] = []
 
-            with pdfplumber.open(pdf_bytes) as pdf:
+            with pdfplumber.open(pdf_bytes, **password_kwargs) as pdf:
                 for page_idx, page in enumerate(pdf.pages):
                     page_content = _extract_form_content_from_words(page)
 
@@ -569,19 +597,30 @@ def convert(
             # the whole document (better text spacing for prose).
             if form_page_count == 0:
                 pdf_bytes.seek(0)
-                markdown = pdfminer.high_level.extract_text(pdf_bytes)
+                markdown = pdfminer.high_level.extract_text(
+                    pdf_bytes, **password_kwargs
+                )
             else:
                 markdown = "\n\n".join(markdown_chunks).strip()
 
+        except PDFPasswordIncorrect as exc:
+            raise FileConversionException(password_error) from exc
         except Exception:
             # Fallback if pdfplumber fails
             pdf_bytes.seek(0)
-            markdown = pdfminer.high_level.extract_text(pdf_bytes)
+            try:
+                markdown = pdfminer.high_level.extract_text(
+                    pdf_bytes, **password_kwargs
+                )
+            except PDFPasswordIncorrect as exc:
+                raise FileConversionException(password_error) from exc
 
         # Fallback if still empty
         if not markdown:
             pdf_bytes.seek(0)
-            markdown = pdfminer.high_level.extract_text(pdf_bytes)
+            markdown = pdfminer.high_level.extract_text(
+                pdf_bytes, **password_kwargs
+            )
 
         # Post-process to merge MasterFormat-style partial numbering with following text
         markdown = _merge_partial_numbering_lines(markdown)
diff --git a/packages/markitdown/tests/test_files/test_password.pdf b/packages/markitdown/tests/test_files/test_password.pdf
new file mode 100644
index 000000000..5d3bf2e91
Binary files /dev/null and b/packages/markitdown/tests/test_files/test_password.pdf differ
diff --git a/packages/markitdown/tests/test_pdf_password.py b/packages/markitdown/tests/test_pdf_password.py
new file mode 100644
index 000000000..88102385d
--- /dev/null
+++ b/packages/markitdown/tests/test_pdf_password.py
@@ -0,0 +1,60 @@
+"""Tests for password-protected PDF support."""
+
+import os
+import subprocess
+import sys
+
+import pytest
+
+from markitdown import MarkItDown
+from markitdown._exceptions import FileConversionException
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+PASSWORD_PDF = os.path.join(TEST_FILES_DIR, "test_password.pdf")
+CORRECT_PASSWORD = "testpassword"
+
+
+class TestPdfPasswordSupport:
+    """Tests for password-protected PDF conversion."""
+
+    def test_convert_with_correct_password(self):
+        """A password-protected PDF should convert when the correct password is given."""
+        md = MarkItDown()
+        result = md.convert(PASSWORD_PDF, password=CORRECT_PASSWORD)
+        assert "password-protected test document" in result.markdown
+
+    def test_convert_without_password_raises(self):
+        """A password-protected PDF without a password should raise FileConversionException."""
+        md = MarkItDown()
+        with pytest.raises(FileConversionException):
+            md.convert(PASSWORD_PDF)
+
+    def test_convert_with_wrong_password_raises(self):
+        """A password-protected PDF with the wrong password should raise FileConversionException."""
+        md = MarkItDown()
+        with pytest.raises(FileConversionException):
+            md.convert(PASSWORD_PDF, password="wrongpassword")
+
+    def test_unprotected_pdf_unaffected(self):
+        """An unprotected PDF should convert normally when no password is given."""
+        # Sort so the chosen fixture does not depend on directory listing order.
+        test_pdfs = sorted(
+            f
+            for f in os.listdir(TEST_FILES_DIR)
+            if f.endswith(".pdf") and f != "test_password.pdf"
+        )
+        if not test_pdfs:
+            pytest.skip("no unprotected PDF fixture available")
+        md = MarkItDown()
+        result = md.convert(os.path.join(TEST_FILES_DIR, test_pdfs[0]))
+        assert result.markdown is not None
+
+    def test_cli_password_flag(self):
+        """The --password CLI flag should be recognized."""
+        result = subprocess.run(
+            [sys.executable, "-m", "markitdown", "--help"],
+            capture_output=True,
+            text=True,
+        )
+        assert result.returncode == 0
+        assert "--password" in result.stdout