|
8 | 8 |
|
9 | 9 | import markdown |
10 | 10 | import pdfkit |
| 11 | +import pypdf |
11 | 12 |
|
12 | 13 | from dol import Pipe |
13 | 14 |
|
@@ -351,3 +352,140 @@ def _image_to_pdf_bytes(src_item): |
351 | 352 | # Resolve the egress processing function and apply it. |
352 | 353 | egress_func = _resolve_bytes_egress(egress) |
353 | 354 | return egress_func(pdf_bytes) |
| 355 | + |
| 356 | + |
| 357 | +# --------------------------------------------------------------------------------- |
| 358 | +# PDF metadata extraction |
| 359 | + |
| 360 | +from typing import Union |
| 361 | + |
| 362 | + |
def _resolve_pdf_src_to_reader(
    pdf_src: Union[str, bytes, pypdf.PdfReader],
) -> pypdf.PdfReader:
    """
    Convert various PDF source types to a PdfReader object.

    Args:
        pdf_src: Can be a file path (``str`` or any ``os.PathLike`` such as
            ``pathlib.Path``), PDF bytes, or a PdfReader object. A PdfReader
            is returned unchanged (same object).

    Returns:
        pypdf.PdfReader: A PdfReader object

    Raises:
        ValueError: If pdf_src type is not supported or file doesn't exist

    Examples:
        >>> import tempfile
        >>> from pypdf import PdfWriter
        >>> # Create a temp PDF
        >>> writer = PdfWriter()
        >>> _ = writer.add_blank_page(width=200, height=200)
        >>> with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
        ...     _ = writer.write(tmp)
        ...     tmp_path = tmp.name
        >>> # Test with filepath
        >>> reader = _resolve_pdf_src_to_reader(tmp_path)
        >>> isinstance(reader, pypdf.PdfReader)
        True
        >>> # Test with a pathlib.Path
        >>> from pathlib import Path
        >>> reader = _resolve_pdf_src_to_reader(Path(tmp_path))
        >>> isinstance(reader, pypdf.PdfReader)
        True
        >>> # Test with bytes
        >>> with open(tmp_path, 'rb') as f:
        ...     pdf_bytes = f.read()
        >>> reader = _resolve_pdf_src_to_reader(pdf_bytes)
        >>> isinstance(reader, pypdf.PdfReader)
        True
        >>> # Test with PdfReader
        >>> reader_in = pypdf.PdfReader(tmp_path)
        >>> reader = _resolve_pdf_src_to_reader(reader_in)
        >>> reader is reader_in
        True
        >>> import os
        >>> os.remove(tmp_path)
    """
    # Already a reader: hand it back untouched so callers keep identity.
    if isinstance(pdf_src, pypdf.PdfReader):
        return pdf_src

    # Raw bytes are checked before path-likes so that in-memory PDF content is
    # never mistaken for a (bytes) filesystem path.
    if isinstance(pdf_src, bytes):
        from pdfdol.base import bytes_to_pdf_reader_obj

        return bytes_to_pdf_reader_obj(pdf_src)

    # Normalize pathlib.Path (and other os.PathLike objects) to their
    # filesystem representation so they follow the same route as str paths.
    if isinstance(pdf_src, os.PathLike):
        pdf_src = os.fspath(pdf_src)

    if isinstance(pdf_src, str):
        if not os.path.exists(pdf_src):
            raise ValueError(f"File not found: {pdf_src}")
        return pypdf.PdfReader(pdf_src)

    raise ValueError(
        f"pdf_src must be a file path (str), bytes, or PdfReader object, not {type(pdf_src)}"
    )
| 419 | + |
| 420 | + |
def pdf_to_metadata(pdf_src: Union[str, bytes, pypdf.PdfReader]) -> dict:
    """
    Extract metadata from a PDF source.

    This is a best-effort helper: any exception raised while resolving the
    source or reading its metadata is caught, logged at DEBUG level (with
    traceback) on this module's logger, and an empty dict is returned.

    Args:
        pdf_src: Can be a file path (str), PDF bytes, or a PdfReader object

    Returns:
        dict: Dictionary containing metadata fields (title, author, subject, etc.)
              with leading slashes stripped from the keys.
              Returns empty dict if no metadata or an error occurs.

    Examples:
        >>> from pathlib import Path
        >>> from pdfdol.tests.utils_for_testing import get_test_pdf_folder
        >>> test_folder = Path(get_test_pdf_folder())
        >>> pdf_path = test_folder / "sample_with_title.pdf"
        >>> metadata = pdf_to_metadata(str(pdf_path))
        >>> metadata.get('Title')
        'Sample PDF with Title'
        >>> metadata.get('Author')
        'Test Author'
    """
    import logging

    try:
        reader = _resolve_pdf_src_to_reader(pdf_src)
        # Read the property once and reuse the result for both the emptiness
        # check and the conversion.
        doc_info = reader.metadata
        if not doc_info:
            return {}
        # Convert pypdf DocumentInformation to a regular dict
        # and normalize the keys (remove leading slash)
        return {key.lstrip('/'): value for key, value in doc_info.items()}
    except Exception:
        # Keep the documented "empty dict on error" contract, but leave a
        # trace so failures can be told apart from "no metadata". Only the
        # source's type is logged because pdf_src may be a large bytes object.
        logging.getLogger(__name__).debug(
            "Could not extract PDF metadata from %s source",
            type(pdf_src).__name__,
            exc_info=True,
        )
        return {}
| 454 | + |
| 455 | + |
def pdf_to_title(pdf_src: Union[str, bytes, pypdf.PdfReader]) -> str | None:
    """
    Extract the document title from a PDF source.

    The title is looked up in the dict returned by ``pdf_to_metadata`` under
    the key ``'title'`` first, then ``'Title'``. Surrounding whitespace is
    stripped from the result.

    Args:
        pdf_src: Can be a file path (str), PDF bytes, or a PdfReader object

    Returns:
        str | None: The stripped title from the metadata, or None if no title
            is found, the title is empty or whitespace-only, or
            ``pdf_to_metadata`` returned an empty dict (which it also does
            on error).

    Examples:
        >>> from pathlib import Path
        >>> from pdfdol.tests.utils_for_testing import get_test_pdf_folder
        >>> test_folder = Path(get_test_pdf_folder())
        >>> # Test with file path
        >>> pdf_path = test_folder / "sample_with_title.pdf"
        >>> pdf_to_title(str(pdf_path))
        'Sample PDF with Title'
        >>> # Test with bytes
        >>> pdf_bytes = pdf_path.read_bytes()
        >>> pdf_to_title(pdf_bytes)
        'Sample PDF with Title'
        >>> # Test with PdfReader
        >>> reader = pypdf.PdfReader(str(pdf_path))
        >>> pdf_to_title(reader)
        'Sample PDF with Title'
        >>> # Test with no title
        >>> pdf_path_no_title = test_folder / "sample_pdf_1.pdf"
        >>> pdf_to_title(str(pdf_path_no_title)) is None
        True
    """
    metadata = pdf_to_metadata(pdf_src)
    title = metadata.get('title') or metadata.get('Title')
    if not title:
        return None
    # A whitespace-only title is truthy but strips down to an empty value;
    # report it as "no title" rather than returning an empty string.
    return title.strip() or None
0 commit comments