Skip to content

Commit b74d04c

Browse files
committed
feat: add pdf_to_title and pdf_to_metadata functions for enhanced PDF metadata extraction
1 parent a0d8a5a commit b74d04c

5 files changed

Lines changed: 226 additions & 3 deletions

File tree

pdfdol/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
>>> folder_path = get_test_pdf_folder()
66
>>> s = PdfFilesReader(folder_path)
77
>>> sorted(s)
8-
['sample_pdf_1', 'sample_pdf_2']
8+
['sample_pdf_1', 'sample_pdf_2', 'sample_with_title']
99
>>> assert s['sample_pdf_2'] == [
1010
... 'Page 1\nThis is a sample text for testing Python PDF tools.'
1111
... ]
@@ -24,4 +24,5 @@
2424
from pdfdol.util import concat_pdfs # concatenate pdfs
2525
from pdfdol.tools import (
2626
get_pdf, # Convert the given source to a PDF (bytes) and process it using the specified egress.
27+
pdf_to_title, # Extract the title from a PDF file's metadata.
2728
)
1 KB
Binary file not shown.

pdfdol/tests/test_base.py

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Test the base.py module"""
22

3+
from pathlib import Path
4+
from pypdf import PdfReader
35
from pdfdol.base import PdfFilesReader
46
from pdfdol.tests.utils_for_testing import get_test_pdf_folder
57

@@ -8,7 +10,90 @@ def test_pdf_files_reader():
810
test_pdf_folder = get_test_pdf_folder()
911
s = PdfFilesReader(str(test_pdf_folder))
1012

11-
assert sorted(s) == ["sample_pdf_1", "sample_pdf_2"]
13+
assert sorted(s) == ["sample_pdf_1", "sample_pdf_2", "sample_with_title"]
1214
assert s["sample_pdf_2"] == [
1315
"Page 1\nThis is a sample text for testing Python PDF tools."
1416
]
17+
18+
19+
def test_pdf_to_title_with_filepath():
    """pdf_to_title returns the expected title when given a path string."""
    from pdfdol.tools import pdf_to_title

    sample = Path(get_test_pdf_folder()) / "sample_with_title.pdf"

    assert pdf_to_title(str(sample)) == "Sample PDF with Title"
28+
29+
30+
def test_pdf_to_title_with_bytes():
    """pdf_to_title returns the expected title when given the PDF's raw bytes."""
    from pdfdol.tools import pdf_to_title

    sample = Path(get_test_pdf_folder()) / "sample_with_title.pdf"
    raw = sample.read_bytes()

    assert pdf_to_title(raw) == "Sample PDF with Title"
40+
41+
42+
def test_pdf_to_title_with_pdf_reader():
    """pdf_to_title returns the expected title when given a PdfReader instance."""
    from pdfdol.tools import pdf_to_title

    sample = Path(get_test_pdf_folder()) / "sample_with_title.pdf"
    opened = PdfReader(str(sample))

    assert pdf_to_title(opened) == "Sample PDF with Title"
52+
53+
54+
def test_pdf_to_title_no_title():
    """pdf_to_title returns None for a sample PDF that carries no title."""
    from pdfdol.tools import pdf_to_title

    untitled = Path(get_test_pdf_folder()) / "sample_pdf_1.pdf"

    assert pdf_to_title(str(untitled)) is None
63+
64+
65+
def test_pdf_to_title_nonexistent_file():
    """pdf_to_title returns None (rather than raising) for a missing file."""
    from pdfdol.tools import pdf_to_title

    missing = "/nonexistent/path/to/file.pdf"

    assert pdf_to_title(missing) is None
71+
72+
73+
def test_pdf_to_metadata():
    """Exercise pdf_to_metadata with a path, bytes, a PdfReader, and a PDF without metadata."""
    from pdfdol.tools import pdf_to_metadata

    folder = Path(get_test_pdf_folder())
    titled = folder / "sample_with_title.pdf"

    # Path string input: several fields are checked.
    from_path = pdf_to_metadata(str(titled))
    expected_fields = {
        'Title': "Sample PDF with Title",
        'Author': "Test Author",
        'Subject': "Testing PDF metadata extraction",
    }
    for field, expected in expected_fields.items():
        assert from_path[field] == expected

    # Raw bytes input.
    from_bytes = pdf_to_metadata(titled.read_bytes())
    assert from_bytes['Title'] == "Sample PDF with Title"

    # Already-open PdfReader input.
    from_reader = pdf_to_metadata(PdfReader(str(titled)))
    assert from_reader['Title'] == "Sample PDF with Title"

    # A PDF without metadata must still produce a dict, never None.
    untitled = folder / "sample_pdf_1.pdf"
    assert isinstance(pdf_to_metadata(str(untitled)), dict)

pdfdol/tools.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import markdown
1010
import pdfkit
11+
import pypdf
1112

1213
from dol import Pipe
1314

@@ -351,3 +352,140 @@ def _image_to_pdf_bytes(src_item):
351352
# Resolve the egress processing function and apply it.
352353
egress_func = _resolve_bytes_egress(egress)
353354
return egress_func(pdf_bytes)
355+
356+
357+
# ---------------------------------------------------------------------------------
358+
# PDF metadata extraction
359+
360+
from typing import Union
361+
362+
363+
def _resolve_pdf_src_to_reader(
    pdf_src: Union[str, bytes, pypdf.PdfReader],
) -> pypdf.PdfReader:
    """
    Convert various PDF source types to a PdfReader object.

    Args:
        pdf_src: A file path (``str`` or any ``os.PathLike`` such as
            ``pathlib.Path``), the PDF content as ``bytes``/``bytearray``,
            or an existing ``PdfReader`` (returned unchanged).

    Returns:
        pypdf.PdfReader: A PdfReader object

    Raises:
        ValueError: If pdf_src type is not supported or file doesn't exist

    Examples:
        >>> import tempfile
        >>> from pathlib import Path
        >>> from pypdf import PdfWriter
        >>> # Create a temp PDF
        >>> writer = PdfWriter()
        >>> _ = writer.add_blank_page(width=200, height=200)
        >>> with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
        ...     _ = writer.write(tmp)
        ...     tmp_path = tmp.name
        >>> # Test with filepath
        >>> reader = _resolve_pdf_src_to_reader(tmp_path)
        >>> isinstance(reader, pypdf.PdfReader)
        True
        >>> # Test with a pathlib.Path
        >>> reader = _resolve_pdf_src_to_reader(Path(tmp_path))
        >>> isinstance(reader, pypdf.PdfReader)
        True
        >>> # Test with bytes
        >>> with open(tmp_path, 'rb') as f:
        ...     pdf_bytes = f.read()
        >>> reader = _resolve_pdf_src_to_reader(pdf_bytes)
        >>> isinstance(reader, pypdf.PdfReader)
        True
        >>> # Test with PdfReader
        >>> reader_in = pypdf.PdfReader(tmp_path)
        >>> reader = _resolve_pdf_src_to_reader(reader_in)
        >>> reader is reader_in
        True
        >>> import os
        >>> os.remove(tmp_path)
    """
    # Imported locally so this function does not depend on a module-level
    # ``import os`` being present elsewhere in the file.
    import os

    if isinstance(pdf_src, pypdf.PdfReader):
        return pdf_src

    if isinstance(pdf_src, (bytes, bytearray)):
        from pdfdol.base import bytes_to_pdf_reader_obj

        # Normalize bytearray to immutable bytes before handing it on.
        return bytes_to_pdf_reader_obj(bytes(pdf_src))

    if isinstance(pdf_src, (str, os.PathLike)):
        # fsdecode turns str / PathLike (including bytes-valued paths) into str.
        path = os.fsdecode(pdf_src)
        if not os.path.exists(path):
            raise ValueError(f"File not found: {pdf_src}")
        return pypdf.PdfReader(path)

    raise ValueError(
        f"pdf_src must be a file path (str), bytes, or PdfReader object, not {type(pdf_src)}"
    )
419+
420+
421+
def pdf_to_metadata(pdf_src: Union[str, bytes, pypdf.PdfReader]) -> dict:
    """
    Extract metadata from a PDF source.

    This is a best-effort helper: any failure while opening or reading the
    PDF is logged at DEBUG level and an empty dict is returned instead of
    raising.

    Args:
        pdf_src: Can be a file path (str), PDF bytes, or a PdfReader object

    Returns:
        dict: Metadata fields keyed by their PDF names with the leading
            slash removed (e.g. ``'Title'``, ``'Author'``, ``'Subject'``).
            Returns empty dict if no metadata or an error occurs.

    Examples:
        >>> from pathlib import Path
        >>> from pdfdol.tests.utils_for_testing import get_test_pdf_folder
        >>> test_folder = Path(get_test_pdf_folder())
        >>> pdf_path = test_folder / "sample_with_title.pdf"
        >>> metadata = pdf_to_metadata(str(pdf_path))
        >>> metadata.get('Title')
        'Sample PDF with Title'
        >>> metadata.get('Author')
        'Test Author'
    """
    import logging

    try:
        reader = _resolve_pdf_src_to_reader(pdf_src)
        # Read the property once instead of once for the check and once for
        # the iteration.
        info = reader.metadata
        if not info:
            return {}
        # Convert pypdf's metadata mapping to a regular dict and normalize
        # the keys (remove leading slash).
        # NOTE(review): values are fetched with info[key] rather than
        # .items() so that pypdf can dereference indirect objects --
        # presumably its dictionary __getitem__ does so; confirm against the
        # pypdf version in use.
        return {key.lstrip('/'): info[key] for key in info}
    except Exception:
        # Deliberate best-effort: never raise, but leave a trace for debugging.
        # Only the source's type is logged because pdf_src may be a large
        # bytes payload.
        logging.getLogger(__name__).debug(
            "Could not extract PDF metadata from source of type %s",
            type(pdf_src).__name__,
            exc_info=True,
        )
        return {}
454+
455+
456+
def pdf_to_title(pdf_src: Union[str, bytes, pypdf.PdfReader]) -> Union[str, None]:
    """
    Extract the document title from a PDF source.

    Args:
        pdf_src: Can be a file path (str), PDF bytes, or a PdfReader object

    Returns:
        The title from the metadata with surrounding whitespace removed, or
        None if there is no title, the title is blank or not a string, or
        the metadata could not be read.

    Examples:
        >>> from pathlib import Path
        >>> from pdfdol.tests.utils_for_testing import get_test_pdf_folder
        >>> test_folder = Path(get_test_pdf_folder())
        >>> # Test with file path
        >>> pdf_path = test_folder / "sample_with_title.pdf"
        >>> pdf_to_title(str(pdf_path))
        'Sample PDF with Title'
        >>> # Test with bytes
        >>> pdf_bytes = pdf_path.read_bytes()
        >>> pdf_to_title(pdf_bytes)
        'Sample PDF with Title'
        >>> # Test with PdfReader
        >>> reader = pypdf.PdfReader(str(pdf_path))
        >>> pdf_to_title(reader)
        'Sample PDF with Title'
        >>> # Test with no title
        >>> pdf_path_no_title = test_folder / "sample_pdf_1.pdf"
        >>> pdf_to_title(str(pdf_path_no_title)) is None
        True
    """
    # The return annotation uses Union (as the parameters do) rather than
    # ``str | None``, which is evaluated at definition time and fails on
    # Python < 3.10.
    metadata = pdf_to_metadata(pdf_src)
    title = metadata.get('title') or metadata.get('Title')
    # Guard against non-text values so .strip() cannot raise and the declared
    # return type holds.
    if not isinstance(title, str):
        return None
    # A whitespace-only title is treated as "no title".
    return title.strip() or None

pdfdol/util.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,6 @@ def html_to_pdf(*args, **kwargs):
190190
"You need to have either weasyprint or pdfkit installed to use html_to_pdf"
191191
)
192192

193-
194193
# ---------------------------------------------------------------------------------
195194
# Pdf concatenation
196195
# TODO: Add some functionality to prefix/suffix pdf pages (useful when concatenating)

0 commit comments

Comments
 (0)