diff --git a/README.md b/README.md index 04f9222..e95fced 100644 --- a/README.md +++ b/README.md @@ -155,17 +155,18 @@ For more information take a look at our [Getting Started with Parxy tutorial](./ ## Supported services -| Service or Library | Support status | Extra | Local file | Remote file | -|--------------------|----------------|-------|------------|-------------| -| [**PyMuPDF**](https://pymupdf.readthedocs.io/en/latest/) | Live | - | ✅ | ✅ | -| [**PdfAct**](https://github.com/data-house/pdfact) | Live | - | ✅ | ✅ | -| [**Unstructured** library](https://docs.unstructured.io/open-source/introduction/overview) | Preview | `unstructured_local` | ✅ | ✅ | -| [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | Preview | `landingai` | ✅ | ✅ | -| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ | -| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ | -| [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | Planned | | | | -| [**Chunkr**](https://www.chunkr.ai/) | Planned | | | | -| [**Docling**](https://docling-project.github.io/docling/) | Planned | | | | +| Service or Library | Extra | Local file | Remote file | Status | +|--------------------|-------|------------|-------------|----------------| +| [**PyMuPDF**](https://pymupdf.readthedocs.io/en/latest/) | - | ✅ | ✅ | | +| [**PdfAct**](https://github.com/data-house/pdfact) | - | ✅ | ✅ | | +| [**Unstructured** library](https://docs.unstructured.io/open-source/introduction/overview) | `unstructured_local` | ✅ | ✅ | Preview | +| [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | `landingai` | ✅ | ✅ | Preview | +| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | `llama` | ✅ | ✅ | Preview | +| 
[**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | `llmwhisperer` | ✅ | ✅ | Preview | +| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | `pypdfium2` | ✅ | ✅ | Preview | +| [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | | | | Planned | +| [**Chunkr**](https://www.chunkr.ai/) | | | | Planned | +| [**Docling**](https://docling-project.github.io/docling/) | | | | Planned | ...and more can be added via the [live extension](#live-extension)! diff --git a/docs/supported_services.md b/docs/supported_services.md index 6bc6944..ff9ef3d 100644 --- a/docs/supported_services.md +++ b/docs/supported_services.md @@ -16,7 +16,7 @@ Parxy supports the following document processing services and libraries. The **E | [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | Preview | `landingai` | ✅ | ✅ | | [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ | | [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ | - +| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | Preview | `pypdfium2` | ✅ | ✅ | Status meanings: **Live** = stable; **Preview** = functional but the API may change. 
def _create_pypdfium_driver(self) -> PyPDFium2Driver:
    """Create a PyPDFium2 Driver instance.

    Returns
    -------
    PyPDFium2Driver
        A new driver that shares this factory's logger. No other
        configuration is passed to the driver.
    """
    return PyPDFium2Driver(logger=self._logger)
class PyPDFium2Driver(Driver):
    """PDF parser using PyPDFium2 - Chrome's PDF engine.

    PyPDFium2 wraps PDFium, the PDF rendering engine used in Chrome.
    The driver extracts plain text per page, the document outline
    (bookmarks) and the document information dictionary. It produces no
    block-level layout: every ``Page`` is returned with ``blocks=None``.

    Thread-safety: PDFium is not thread-safe and in practice crashes even
    when calls are serialized via a lock or routed through a dedicated
    single-thread executor. Batch processing with this driver must be run
    with a single worker; ``Parxy.batch_iter`` enforces this automatically.
    """

    supported_levels = ['page', 'block']

    def _initialize_driver(self):
        """Check that the optional ``pypdfium2`` dependency is importable.

        Returns
        -------
        PyPDFium2Driver
            The driver itself.

        Raises
        ------
        ImportError
            If ``pypdfium2`` is not installed.
        """
        try:
            import pypdfium2  # noqa: F401
        except ImportError as e:
            raise ImportError(
                'pypdfium2 is required. Install with: pip install parxy[pypdfium2]'
            ) from e
        return self

    def _handle(
        self, file: str | io.BytesIO | bytes, level: str = 'page', **kwargs
    ) -> Document:
        """Parse PDF to Document object.

        Parameters
        ----------
        file : str | io.BytesIO | bytes
            Path, URL or stream of the file to parse.
        level : str, optional
            Desired extraction level. Default is "page". The value is not
            consulted: only page-level text is produced, and ``'block'`` is
            listed in ``supported_levels`` solely because it is the default
            level for Parxy.
        **kwargs : dict
            Additional keyword arguments, forwarded to the tracing span.

        Returns
        -------
        Document
            A parsed Document in unified format.
        """
        import pypdfium2 as pdfium

        filename, stream = self.handle_file_input(file)

        with self._trace_parse(filename, stream, **kwargs) as span:
            pdf = pdfium.PdfDocument(stream)
            try:
                pages = self._extract_pages(pdf)
                outline = self._extract_outline(pdf)
                span.set_attribute('output.pages', len(pages))
                metadata = pdf.get_metadata_dict()
            finally:
                # Release the native document handle even if extraction fails.
                pdf.close()

            return Document(
                filename=filename,
                pages=pages,
                outline=outline or None,
                metadata=Metadata(
                    title=metadata.get('Title'),
                    author=metadata.get('Author'),
                    subject=metadata.get('Subject'),
                    keywords=metadata.get('Keywords'),
                    creator=metadata.get('Creator'),
                    producer=metadata.get('Producer'),
                    created_at=_parse_pdf_date(metadata.get('CreationDate')),
                    updated_at=_parse_pdf_date(metadata.get('ModDate')),
                ),
            )

    @staticmethod
    def _extract_pages(pdf) -> list[Page]:
        """Return one ``Page`` per PDF page, holding its stripped text."""
        pages = []
        for page_num, page in enumerate(pdf, start=1):
            try:
                textpage = page.get_textpage()
                try:
                    text = textpage.get_text_range()
                finally:
                    textpage.close()
            finally:
                page.close()
            # Pages without text are kept (as '') to maintain page numbering.
            pages.append(
                Page(
                    number=page_num,
                    text=(text or '').strip(),
                    blocks=None,
                )
            )
        return pages

    @staticmethod
    def _extract_outline(pdf) -> list[TocEntry]:
        """Return the bookmarks of ``pdf`` as a flat list of ``TocEntry``."""
        import pypdfium2.raw as pdfium_c

        outline = []
        for bm in pdf.get_toc(max_depth=15):
            dest = bm.get_dest()
            page_num = None
            bbox = None
            if dest:
                index = dest.get_index()
                page_num = index + 1 if index is not None else None
                view_mode, view_pos = dest.get_view()
                # Compare against PDFium's named view-mode constants rather
                # than literal numbers, so each branch matches the mode it
                # is meant to handle.
                # XYZ:  [left, top, zoom] — destination point
                # FITR: [left, bottom, right, top] — destination rect
                if view_mode == pdfium_c.PDFDEST_VIEW_XYZ and len(view_pos) >= 2:
                    bbox = BoundingBox(
                        x0=view_pos[0],
                        y0=view_pos[1],
                        x1=view_pos[0],
                        y1=view_pos[1],
                    )
                elif view_mode == pdfium_c.PDFDEST_VIEW_FITR and len(view_pos) >= 4:
                    bbox = BoundingBox(
                        x0=view_pos[0],
                        y0=view_pos[1],
                        x1=view_pos[2],
                        y1=view_pos[3],
                    )
            outline.append(
                TocEntry(
                    title=bm.get_title(),
                    page=page_num,
                    level=bm.level,
                    bbox=bbox,
                )
            )
        return outline
updated_at=_parse_pdf_date(metadata.get('ModDate')), + ), + ) + + +def _parse_pdf_date(pdf_date: str) -> str | None: + """ + Parse PDF date string to ISO format. + PDF date format: D:YYYYMMDDHHmmSSOHH'mm' + """ + if not pdf_date: + return None + try: + # Remove prefix if present + if pdf_date.startswith('D:'): + pdf_date = pdf_date[2:] + # Only take up to seconds + dt = datetime.strptime(pdf_date[:14], '%Y%m%d%H%M%S') + return dt.isoformat() + except Exception: + return None diff --git a/src/parxy_core/facade/parxy.py b/src/parxy_core/facade/parxy.py index ddfe290..bc52b6f 100644 --- a/src/parxy_core/facade/parxy.py +++ b/src/parxy_core/facade/parxy.py @@ -214,6 +214,19 @@ def batch_iter( # Determine number of workers max_workers = workers if workers else (os.cpu_count() or 2) + # PDFium is not thread-safe and crashes under any form of concurrency, + # even when calls are serialized via a lock or a dedicated executor. + # Force serial execution when pypdfium is among the requested drivers, + # or any task explicitly asks for it. + task_driver_names = { + d + for t in tasks + if isinstance(t, BatchTask) and t.drivers + for d in t.drivers + } + if 'pypdfium' in set(default_drivers) | task_driver_names: + max_workers = 1 + # Normalize tasks into single-driver BatchTask objects. 
# When a BatchTask specifies multiple drivers it is split into # one BatchTask per driver so each unit of work targets exactly diff --git a/src/parxy_core/models/__init__.py b/src/parxy_core/models/__init__.py index 0853107..8d91e2e 100644 --- a/src/parxy_core/models/__init__.py +++ b/src/parxy_core/models/__init__.py @@ -12,6 +12,7 @@ TextBlock as TextBlock, Page as Page, Metadata as Metadata, + TocEntry as TocEntry, Document as Document, BatchTask as BatchTask, BatchResult as BatchResult, diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py index b965c56..9765c6c 100644 --- a/src/parxy_core/models/models.py +++ b/src/parxy_core/models/models.py @@ -114,12 +114,19 @@ class Metadata(BaseModel): updated_at: Optional[str] = None +class TocEntry(BaseModel): + title: str + page: Optional[int] = None + level: Optional[int] = None + bbox: Optional[BoundingBox] = None + + class Document(BaseModel): filename: Optional[str] = None language: Optional[str] = None metadata: Optional[Metadata] = None pages: List[Page] - outline: Optional[List[str]] = None + outline: Optional[List[str | TocEntry]] = None source_data: Optional[dict[str, Any]] = None parsing_metadata: Optional[dict[str, Any]] = None diff --git a/tests/drivers/test_pypdfium.py b/tests/drivers/test_pypdfium.py new file mode 100644 index 0000000..c81f625 --- /dev/null +++ b/tests/drivers/test_pypdfium.py @@ -0,0 +1,177 @@ +import os +import pytest +from unittest.mock import Mock, patch, MagicMock + +from parxy_core.models import Page, TocEntry, BoundingBox + +from parxy_core.drivers import PyPDFium2Driver +from parxy_core.exceptions import FileNotFoundException + + +class TestPyPDFiumfDriver: + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def test_pypdfium_driver_can_be_created(self): + driver = PyPDFium2Driver() + + 
class TestPyPDFiumfDriver:
    """Tests for ``PyPDFium2Driver`` against the PDF fixtures in ``tests/fixtures``."""

    def __fixture_path(self, file: str) -> str:
        """Return the absolute path of ``file`` inside the shared fixtures folder."""
        current_dir = os.path.dirname(os.path.abspath(__file__))
        fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures')
        return os.path.join(fixtures_dir, file)

    def test_pypdfium_driver_can_be_created(self):
        """The driver can be built without arguments and advertises its levels."""
        driver = PyPDFium2Driver()

        assert driver.supported_levels == ['page', 'block']

    def test_pypdfium_driver_unrecognized_level_handled(self):
        """An extraction level outside ``supported_levels`` raises ``ValueError``."""
        driver = PyPDFium2Driver()

        path = self.__fixture_path('empty-doc.pdf')

        with pytest.raises(ValueError) as excinfo:
            driver.parse(path, level='custom')

        assert 'not supported' in str(excinfo.value)
        assert '[custom]' in str(excinfo.value)

    def test_pypdfium_driver_handle_not_existing_file(self):
        """Parsing a missing file raises ``FileNotFoundException``."""
        driver = PyPDFium2Driver()

        path = self.__fixture_path('non-existing-file.pdf')

        with pytest.raises(FileNotFoundException) as excinfo:
            driver.parse(path)

    def test_pypdfium_driver_read_empty_document_page_level(self):
        """Metadata and the single page of ``empty-doc.pdf`` are extracted."""
        driver = PyPDFium2Driver()

        path = self.__fixture_path('empty-doc.pdf')
        document = driver.parse(path)

        assert document is not None
        assert document.language is None
        assert document.outline is None
        assert document.metadata is not None
        assert document.metadata.title
        assert document.metadata.title == 'Test document'
        assert document.metadata.author == 'Data House Author'
        assert document.metadata.subject == ''
        assert document.metadata.keywords == ''
        assert document.metadata.creator == 'Microsoft® Word for Microsoft 365'
        assert document.metadata.producer == 'Microsoft® Word for Microsoft 365'
        assert document.metadata.created_at == '2023-11-13T18:43:06'
        assert document.metadata.updated_at == document.metadata.created_at
        assert len(document.pages) == 1
        assert isinstance(document.pages[0], Page)
        assert document.pages[0].blocks is None
        # The only text on the page is the page number.
        assert document.pages[0].text == '1'

    def test_pypdfium_driver_read_document(self):
        """Metadata and the page text of ``test-doc.pdf`` are extracted."""
        driver = PyPDFium2Driver()

        path = self.__fixture_path('test-doc.pdf')
        document = driver.parse(path, level='page')

        assert document is not None
        assert document.language is None
        assert document.metadata is not None
        assert document.metadata.title
        assert document.metadata.title == 'Test document'
        assert document.metadata.author == 'Data House Author'
        assert document.metadata.subject == ''
        assert document.metadata.keywords == ''
        assert document.metadata.creator == 'Microsoft® Word for Microsoft 365'
        assert document.metadata.producer == 'Microsoft® Word for Microsoft 365'
        assert document.metadata.created_at == '2023-05-09T11:34:41'
        assert document.metadata.updated_at == document.metadata.created_at

        assert len(document.pages) == 1
        assert isinstance(document.pages[0], Page)
        assert document.pages[0].blocks is None
        # Line breaks in the extracted text are CRLF.
        assert (
            document.pages[0].text
            == 'This is the header\r\n1\r\nThis is a test PDF to be used as input in unit \r\ntests\r\nThis is a heading 1\r\nThis is a paragraph below heading 1'
        )

    @patch('parxy_core.drivers.abstract_driver.tracer')
    def test_pypdfium_driver_tracing_span_created(self, mock_tracer):
        """A successful parse opens a 'document-processing' span and counts the document."""
        # Setup mocks for the span context manager
        mock_span = MagicMock()
        mock_span.__enter__ = Mock(return_value=mock_span)
        mock_span.__exit__ = Mock(return_value=False)
        mock_tracer.span = Mock(return_value=mock_span)
        mock_tracer.count = Mock()
        mock_tracer.info = Mock()

        driver = PyPDFium2Driver()
        path = self.__fixture_path('empty-doc.pdf')
        document = driver.parse(path, level='page')

        # Verify tracer.span was called to create span
        mock_tracer.span.assert_called()

        # Find the 'document-processing' span call (from abstract_driver.parse)
        span_calls = mock_tracer.span.call_args_list
        doc_processing_call = [
            c for c in span_calls if c[0][0] == 'document-processing'
        ][0]

        # Verify span attributes
        assert doc_processing_call[1]['driver'] == 'PyPDFium2Driver'
        assert doc_processing_call[1]['level'] == 'page'

        # Verify counter was incremented via tracer.count
        mock_tracer.count.assert_called_once()
        count_call = mock_tracer.count.call_args
        assert count_call[0][0] == 'documents.processed'
        assert count_call[1]['driver'] == 'PyPDFium2Driver'

    @patch('parxy_core.drivers.abstract_driver.tracer')
    def test_pypdfium_driver_tracing_exception_recorded(self, mock_tracer):
        """A failed parse is reported through ``tracer.error``."""
        # Setup mocks
        mock_span = MagicMock()
        mock_span.__enter__ = Mock(return_value=mock_span)
        mock_span.__exit__ = Mock(return_value=False)
        mock_tracer.span = Mock(return_value=mock_span)
        mock_tracer.count = Mock()
        mock_tracer.error = Mock()

        driver = PyPDFium2Driver()
        path = self.__fixture_path('non-existing-file.pdf')

        # Attempt to parse non-existing file
        with pytest.raises(FileNotFoundException):
            driver.parse(path, level='page')

        # Verify error was logged via tracer.error
        mock_tracer.error.assert_called_once()
        error_call = mock_tracer.error.call_args
        assert error_call[0][0] == 'Parsing failed'

        # NOTE(review): this asserts tracer.count WAS called exactly once on
        # the failure path. Presumably that call records the failure rather
        # than a processed document — confirm against abstract_driver and
        # assert on the counter name.
        mock_tracer.count.assert_called_once()

    def test_pypdfium_driver_reads_outline(self):
        """The single bookmark of ``test-doc.pdf`` becomes a ``TocEntry`` with a bbox."""
        driver = PyPDFium2Driver()

        path = self.__fixture_path('test-doc.pdf')
        document = driver.parse(path, level='page')

        assert document.outline is not None
        assert len(document.outline) == 1
        entry = document.outline[0]
        assert isinstance(entry, TocEntry)
        assert entry.title == 'This is a heading 1'
        assert entry.page == 1
        assert entry.level == 0
        assert isinstance(entry.bbox, BoundingBox)
        assert entry.bbox.x0 == 69.0
        assert entry.bbox.y0 == 629.0

    def test_pypdfium_driver_records_elapsed_time(self):
        """``parse`` stores a positive ``driver_elapsed_time`` in ``parsing_metadata``."""
        driver = PyPDFium2Driver()

        path = self.__fixture_path('test-doc.pdf')
        document = driver.parse(path, level='page')

        # Verify elapsed time is recorded in parsing_metadata
        assert document.parsing_metadata is not None
        assert 'driver_elapsed_time' in document.parsing_metadata
        assert isinstance(document.parsing_metadata['driver_elapsed_time'], float)
        assert document.parsing_metadata['driver_elapsed_time'] > 0
[package.metadata.requires-dev] dev = [