Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ def main():
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)

parser.add_argument(
"--password",
type=str,
help="Password for opening password-protected files (e.g., encrypted PDFs).",
)

parser.add_argument("filename", nargs="?")
args = parser.parse_args()

Expand Down Expand Up @@ -186,15 +192,19 @@ def main():
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)

convert_kwargs: dict = {"keep_data_uris": args.keep_data_uris}
if args.password is not None:
convert_kwargs["password"] = args.password

if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
**convert_kwargs,
)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
args.filename, stream_info=stream_info, **convert_kwargs
)

_handle_output(args, result)
Expand Down
50 changes: 45 additions & 5 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._exceptions import (
MissingDependencyException,
FileConversionException,
MISSING_DEPENDENCY_MESSAGE,
)

# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
Expand Down Expand Up @@ -62,6 +66,7 @@ def _merge_partial_numbering_lines(text: str) -> str:
try:
import pdfminer
import pdfminer.high_level
from pdfminer.pdfdocument import PDFPasswordIncorrect
import pdfplumber
except ImportError:
_dependency_exc_info = sys.exc_info()
Expand Down Expand Up @@ -539,6 +544,9 @@ def convert(
# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())

# Extract password from kwargs (for password-protected PDFs)
password = kwargs.get("password")

try:
# Single pass: check every page for form-style content.
# Pages with tables/forms get rich extraction; plain-text
Expand All @@ -549,7 +557,11 @@ def convert(
form_page_count = 0
plain_page_indices: list[int] = []

with pdfplumber.open(pdf_bytes) as pdf:
pdfplumber_kwargs: dict[str, Any] = {}
if password is not None:
pdfplumber_kwargs["password"] = password

with pdfplumber.open(pdf_bytes, **pdfplumber_kwargs) as pdf:
for page_idx, page in enumerate(pdf.pages):
page_content = _extract_form_content_from_words(page)

Expand All @@ -569,19 +581,47 @@ def convert(
# the whole document (better text spacing for prose).
if form_page_count == 0:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
pdfminer_kwargs: dict[str, Any] = {}
if password is not None:
pdfminer_kwargs["password"] = password
markdown = pdfminer.high_level.extract_text(
pdf_bytes, **pdfminer_kwargs
)
else:
markdown = "\n\n".join(markdown_chunks).strip()

except PDFPasswordIncorrect:
raise FileConversionException(
"The PDF is password-protected and the provided password is "
"incorrect. Use the 'password' parameter to supply the correct "
"password."
)
except Exception:
# Fallback if pdfplumber fails
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
pdfminer_fallback_kwargs: dict[str, Any] = {}
if password is not None:
pdfminer_fallback_kwargs["password"] = password
try:
markdown = pdfminer.high_level.extract_text(
pdf_bytes, **pdfminer_fallback_kwargs
)
except PDFPasswordIncorrect:
raise FileConversionException(
"The PDF is password-protected and the provided password "
"is incorrect. Use the 'password' parameter to supply the "
"correct password."
)

# Fallback if still empty
if not markdown:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
pdfminer_empty_kwargs: dict[str, Any] = {}
if password is not None:
pdfminer_empty_kwargs["password"] = password
markdown = pdfminer.high_level.extract_text(
pdf_bytes, **pdfminer_empty_kwargs
)

# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)
Expand Down
Binary file not shown.
59 changes: 59 additions & 0 deletions packages/markitdown/tests/test_pdf_password.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Tests for password-protected PDF support."""

import os

import pytest

from markitdown import MarkItDown
from markitdown._exceptions import FileConversionException

# Directory holding the fixture files, located next to this test module.
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
# Path of the PDF fixture that the password tests in this module convert.
PASSWORD_PDF = os.path.join(TEST_FILES_DIR, "test_password.pdf")
# Password supplied when converting PASSWORD_PDF in the success-path test;
# presumably the password the fixture was encrypted with — confirm against
# how the fixture was generated.
CORRECT_PASSWORD = "testpassword"


class TestPdfPasswordSupport:
    """Tests for password-protected PDF conversion.

    Covers the library entry point (``MarkItDown.convert`` with the
    ``password`` keyword) and the ``--password`` command-line flag.
    """

    def test_convert_with_correct_password(self):
        """A password-protected PDF should convert when the correct password is given."""
        md = MarkItDown()
        result = md.convert(PASSWORD_PDF, password=CORRECT_PASSWORD)
        assert "password-protected test document" in result.markdown

    def test_convert_without_password_raises(self):
        """A password-protected PDF without a password should raise FileConversionException."""
        md = MarkItDown()
        # Pin the documented exception type: a bare ``Exception`` would also
        # be satisfied by unrelated failures such as a missing fixture file.
        with pytest.raises(FileConversionException):
            md.convert(PASSWORD_PDF)

    def test_convert_with_wrong_password_raises(self):
        """A password-protected PDF with the wrong password should raise FileConversionException."""
        md = MarkItDown()
        with pytest.raises(FileConversionException):
            md.convert(PASSWORD_PDF, password="wrongpassword")

    def test_unprotected_pdf_unaffected(self):
        """An unprotected PDF should convert normally when no password is supplied."""
        md = MarkItDown()
        protected_name = os.path.basename(PASSWORD_PDF)
        # Sort so the chosen fixture does not depend on directory listing
        # order, which os.listdir() leaves unspecified.
        test_pdfs = sorted(
            name
            for name in os.listdir(TEST_FILES_DIR)
            if name.endswith(".pdf") and name != protected_name
        )
        if not test_pdfs:
            # Report the missing fixture instead of passing without
            # exercising the converter at all.
            pytest.skip("no unprotected PDF fixture found in test_files")

        result = md.convert(os.path.join(TEST_FILES_DIR, test_pdfs[0]))
        assert result.markdown is not None

    def test_cli_password_flag(self):
        """The --password CLI flag should be recognized."""
        import subprocess
        import sys

        result = subprocess.run(
            [sys.executable, "-m", "markitdown", "--help"],
            capture_output=True,
            text=True,
        )
        # Surface stderr on failure so a crashing CLI is distinguishable from
        # a help text that merely lacks the flag.
        assert result.returncode == 0, result.stderr
        assert "--password" in result.stdout