From 258a24031c090e3354c3de6a8d62ec61580d3fac Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Thu, 19 Mar 2026 22:01:40 -0700 Subject: [PATCH] Add password support for PDF conversion Pass an optional password parameter through to pdfminer and pdfplumber when converting encrypted PDFs. Raise a clear FileConversionException when the password is missing or incorrect. Add a --password CLI flag. Fixes microsoft/markitdown#1585 --- .../markitdown/src/markitdown/__main__.py | 14 ++++- .../markitdown/converters/_pdf_converter.py | 50 +++++++++++++-- .../tests/test_files/test_password.pdf | Bin 0 -> 1318 bytes .../markitdown/tests/test_pdf_password.py | 59 ++++++++++++++++++ 4 files changed, 116 insertions(+), 7 deletions(-) create mode 100644 packages/markitdown/tests/test_files/test_password.pdf create mode 100644 packages/markitdown/tests/test_pdf_password.py diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..93c1fb893 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -110,6 +110,12 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "--password", + type=str, + help="Password for opening password-protected files (e.g., encrypted PDFs).", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -186,15 +192,19 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) + convert_kwargs: dict = {"keep_data_uris": args.keep_data_uris} + if args.password is not None: + convert_kwargs["password"] = args.password + if args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, - keep_data_uris=args.keep_data_uris, + **convert_kwargs, ) else: result = markitdown.convert( - args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + args.filename, stream_info=stream_info, **convert_kwargs ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index ffbcbd990..458dd6af3 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -5,7 +5,11 @@ from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo -from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE +from .._exceptions import ( + MissingDependencyException, + FileConversionException, + MISSING_DEPENDENCY_MESSAGE, +) # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10") PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$") @@ -62,6 +66,7 @@ def _merge_partial_numbering_lines(text: str) -> str: try: import pdfminer import pdfminer.high_level + from pdfminer.pdfdocument import PDFPasswordIncorrect import pdfplumber except ImportError: _dependency_exc_info = sys.exc_info() @@ -539,6 +544,9 @@ def convert( # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) + # Extract password from kwargs (for password-protected PDFs) + password = kwargs.get("password") + try: # Single pass: check every page for form-style content. # Pages with tables/forms get rich extraction; plain-text @@ -549,7 +557,11 @@ def convert( form_page_count = 0 plain_page_indices: list[int] = [] - with pdfplumber.open(pdf_bytes) as pdf: + pdfplumber_kwargs: dict[str, Any] = {} + if password is not None: + pdfplumber_kwargs["password"] = password + + with pdfplumber.open(pdf_bytes, **pdfplumber_kwargs) as pdf: for page_idx, page in enumerate(pdf.pages): page_content = _extract_form_content_from_words(page) @@ -569,19 +581,47 @@ def convert( # the whole document (better text spacing for prose). if form_page_count == 0: pdf_bytes.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_bytes) + pdfminer_kwargs: dict[str, Any] = {} + if password is not None: + pdfminer_kwargs["password"] = password + markdown = pdfminer.high_level.extract_text( + pdf_bytes, **pdfminer_kwargs + ) else: markdown = "\n\n".join(markdown_chunks).strip() + except PDFPasswordIncorrect: + raise FileConversionException( + "The PDF is password-protected and the provided password is " + "incorrect. Use the 'password' parameter to supply the correct " + "password." + ) except Exception: # Fallback if pdfplumber fails pdf_bytes.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_bytes) + pdfminer_fallback_kwargs: dict[str, Any] = {} + if password is not None: + pdfminer_fallback_kwargs["password"] = password + try: + markdown = pdfminer.high_level.extract_text( + pdf_bytes, **pdfminer_fallback_kwargs + ) + except PDFPasswordIncorrect: + raise FileConversionException( + "The PDF is password-protected and the provided password " + "is incorrect. Use the 'password' parameter to supply the " + "correct password." + ) # Fallback if still empty if not markdown: pdf_bytes.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_bytes) + pdfminer_empty_kwargs: dict[str, Any] = {} + if password is not None: + pdfminer_empty_kwargs["password"] = password + markdown = pdfminer.high_level.extract_text( + pdf_bytes, **pdfminer_empty_kwargs + ) # Post-process to merge MasterFormat-style partial numbering with following text markdown = _merge_partial_numbering_lines(markdown) diff --git a/packages/markitdown/tests/test_files/test_password.pdf b/packages/markitdown/tests/test_files/test_password.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5d3bf2e9132817015e126ef2d7a855a9fbf918fb GIT binary patch literal 1318 zcmb_cL5Lhh6a`U`N(cc@n}aFPF$Zz0tE#KI3nl~E-3(cD&1Pm|Fh;Ptt9tA(+r9SG zt{e3rh#;s3!Hb?k&_i?~pr9gQ!GMs1l7r|WQ3x9~iYP%r1G(haWM+4=d$k9q>c9T~ z{onuns@nWa^D#QfQEmO&rSA~M0WOo1s9r~8zADX9YAal~iOy8Q1&bS~(LlB^-iM9( zc1|r?Oy+gh_7RybmkNg|B9G^$kDtLIIJWS!W1)Oo=(KjaS9T$*=fIx3H5TA0Ms-b> zg|mh0V=-_dE!!`bsuZH+9bmHE|6wVJ3C|3(pW^EutnEBkEvk3%D$_tUO->JvirR^=gwa~^Xgw=GI8;t*OwR1 zef#9uM^79N|2%T}Waq)o)qRJ)Is4Ixm#>`uU0mF~Y8LkL`}h6&*_r2_egU8RPu$A? zS-bJRz4Y%V6E}~pf3au&$BFCuoz;)`UU+z?`Eq~ltImB_-+gUuN51^wFOS@Fe8-;d z8*@j#|84)LKVSO*-+QI;!vniMKeqPb${#m=y3k&I>+hSpPQSgr^SoDVSmIDS->Hr7 zgMMN!c$4A?@Il|b`f1xb=o^6%!K7nX94501AkWlrogS{UcWLV>M$}WfZ=0T;9J0M- z>+)11a`yHjGke(iT4|L76QWi?+v#08Zx? zKcV0NZo#C2%VLDNO6NUW4epJ0{*r~yAg!`=*q`(kBC|zSVt={#{R|9MT?$Dho2B1rpV2 I2M;&VZIj-3a{vGU literal 0 HcmV?d00001 diff --git a/packages/markitdown/tests/test_pdf_password.py b/packages/markitdown/tests/test_pdf_password.py new file mode 100644 index 000000000..88102385d --- /dev/null +++ b/packages/markitdown/tests/test_pdf_password.py @@ -0,0 +1,59 @@ +"""Tests for password-protected PDF support.""" + +import os + +import pytest + +from markitdown import MarkItDown +from markitdown._exceptions import FileConversionException + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") +PASSWORD_PDF = os.path.join(TEST_FILES_DIR, "test_password.pdf") +CORRECT_PASSWORD = "testpassword" + + +class TestPdfPasswordSupport: + """Tests for password-protected PDF conversion.""" + + def test_convert_with_correct_password(self): + """A password-protected PDF should convert when the correct password is given.""" + md = MarkItDown() + result = md.convert(PASSWORD_PDF, password=CORRECT_PASSWORD) + assert "password-protected test document" in result.markdown + + def test_convert_without_password_raises(self): + """A password-protected PDF without a password should raise FileConversionException.""" + md = MarkItDown() + with pytest.raises(Exception): + md.convert(PASSWORD_PDF) + + def test_convert_with_wrong_password_raises(self): + """A password-protected PDF with the wrong password should raise an error.""" + md = MarkItDown() + with pytest.raises(Exception): + md.convert(PASSWORD_PDF, password="wrongpassword") + + def test_unprotected_pdf_unaffected(self): + """An unprotected PDF should convert normally even when password is None.""" + md = MarkItDown() + # Use an existing test PDF from the test_files directory + test_pdfs = [ + f + for f in os.listdir(TEST_FILES_DIR) + if f.endswith(".pdf") and f != "test_password.pdf" + ] + if test_pdfs: + result = md.convert(os.path.join(TEST_FILES_DIR, test_pdfs[0])) + assert result.markdown is not None + + def test_cli_password_flag(self): + """The --password CLI flag should be recognized.""" + import subprocess + import sys + + result = subprocess.run( + [sys.executable, "-m", "markitdown", "--help"], + capture_output=True, + text=True, + ) + assert "--password" in result.stdout