diff --git a/README.md b/README.md index e95fced..734b04a 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,7 @@ For more information take a look at our [Getting Started with Parxy tutorial](./ | [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | `llama` | ✅ | ✅ | Preview | | [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | `llmwhisperer` | ✅ | ✅ | Preview | | [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | `pypdfium2` | ✅ | ✅ | Preview | +| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | `pdfplumber` | ✅ | ✅ | Preview | | [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | | | | Planned | | [**Chunkr**](https://www.chunkr.ai/) | | | | Planned | | [**Docling**](https://docling-project.github.io/docling/) | | | | Planned | diff --git a/docs/supported_services.md b/docs/supported_services.md index ff9ef3d..19ce7a8 100644 --- a/docs/supported_services.md +++ b/docs/supported_services.md @@ -17,6 +17,7 @@ Parxy supports the following document processing services and libraries. The **E | [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ | | [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ | | [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | Preview | `pypdfium2` | ✅ | ✅ | +| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ | Status meanings: **Live** = stable; **Preview** = functional but the API may change. diff --git a/pyproject.toml b/pyproject.toml index 4dbdc17..3d2fc48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,9 @@ tui = [ pypdfium2 = [ "pypdfium2>=5.7.1", ] +pdfplumber = [ + "pdfplumber>=0.11.0", +] all = [ "llama-cloud>=2.0.0", "llmwhisperer-client>=2.4.2", @@ -56,6 +59,7 @@ all = [ "landingai-ade>=0.15.1", "textual>=0.89.0", "pypdfium2>=5.7.1", + "pdfplumber>=0.11.0", ] diff --git a/src/parxy_core/drivers/__init__.py b/src/parxy_core/drivers/__init__.py index 754640d..0f17d34 100644 --- a/src/parxy_core/drivers/__init__.py +++ b/src/parxy_core/drivers/__init__.py @@ -11,3 +11,4 @@ from parxy_core.drivers.pypdfium2 import ( PyPDFium2Driver as PyPDFium2Driver, ) +from parxy_core.drivers.pdfplumber import PDFPlumberDriver as PDFPlumberDriver diff --git a/src/parxy_core/drivers/factory.py b/src/parxy_core/drivers/factory.py index 9be612f..9717099 100644 --- a/src/parxy_core/drivers/factory.py +++ b/src/parxy_core/drivers/factory.py @@ -10,6 +10,7 @@ from parxy_core.drivers.llmwhisperer import LlmWhispererDriver from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver from parxy_core.drivers.pypdfium2 import PyPDFium2Driver +from parxy_core.drivers.pdfplumber import PDFPlumberDriver from parxy_core.models import ( PdfActConfig, LandingAIConfig, @@ -213,6 +214,9 @@ def _create_unstructured_local_driver(self) -> UnstructuredLocalDriver: def _create_pypdfium_driver(self) -> PyPDFium2Driver: return PyPDFium2Driver(logger=self._logger) + def _create_pdfplumber_driver(self) -> PDFPlumberDriver: + return PDFPlumberDriver(logger=self._logger) + def _create_landingai_driver(self) -> LandingAIADEDriver: """Create a LandingAI ADE Driver instance. @@ -288,6 +292,7 @@ def get_supported_drivers(self) -> List[str]: 'llmwhisperer', 'unstructured_local', 'pypdfium', + 'pdfplumber', ] return supported_drivers diff --git a/src/parxy_core/drivers/pdfplumber.py b/src/parxy_core/drivers/pdfplumber.py new file mode 100644 index 0000000..a1721eb --- /dev/null +++ b/src/parxy_core/drivers/pdfplumber.py @@ -0,0 +1,124 @@ +"""pdfplumber driver for parxy.""" + +import io +from typing import Any + +from parxy_core.drivers import Driver +from parxy_core.models import Document, Page + + +class PDFPlumberDriver(Driver): + """PDF parser using pdfplumber.""" + + supported_levels = ['page', 'block'] + + def _initialize_driver(self): + """Initialize pdfplumber driver by checking if the library is available.""" + try: + import pdfplumber # noqa: F401 + except ImportError as e: + raise ImportError( + 'pdfplumber is required. Install with: pip install parxy[pdfplumber]' + ) from e + return self + + def _handle( + self, file: str | io.BytesIO | bytes, level: str = 'page', **kwargs + ) -> Document: + """Parse PDF to Document object with table extraction. + + Parameters + ---------- + file : str | io.BytesIO | bytes + Path, URL or stream of the file to parse. + level : str, optional + Desired extraction level. Default is "page". + **kwargs : dict + Additional keyword arguments. + + Returns + ------- + Document + A parsed Document in unified format. + """ + import pdfplumber + + if level == 'block': + level = 'page' # Only page is really supported, added block as it is the default for Parxy + + filename, stream = self.handle_file_input(file) + + with self._trace_parse(filename, stream, **kwargs) as span: + with pdfplumber.open(io.BytesIO(stream)) as pdf: + if not pdf.pages: + return Document(filename=filename, pages=[]) + + pages = [] + for page_num, page in enumerate(pdf.pages, start=1): + page_content = self._extract_page(page) + pages.append( + Page( + number=page_num, + text=page_content.strip() if page_content.strip() else '', + blocks=None, + ) + ) + + span.set_attribute('output.pages', len(pages)) + + return Document( + filename=filename, + pages=pages, + ) + + def _extract_page(self, page: Any) -> str: + """Extract content from a single page.""" + content_parts = [] + + # Extract tables + tables = page.extract_tables() + if tables: + for table in tables: + table_md = self._table_to_markdown(table) + if table_md: + content_parts.append(table_md) + + # Extract text + text = page.extract_text() + if text and text.strip(): + content_parts.append(text.strip()) + + return '\n\n'.join(content_parts) + + def _table_to_markdown(self, table: list[list[str | None]]) -> str: + """Convert table to GitHub Flavored Markdown.""" + if not table or len(table) < 2: + return '' + + # Filter empty rows + table = [row for row in table if any(cell for cell in row if cell)] + if not table: + return '' + + max_cols = max(len(row) for row in table) + if max_cols == 0: + return '' + + # Normalize rows + normalized: list[list[str]] = [] + for row in table: + padded = row + [None] * (max_cols - len(row)) + normalized.append( + [str(cell).strip() if cell is not None else '' for cell in padded] + ) + + lines = [] + # Header + lines.append('| ' + ' | '.join(normalized[0]) + ' |') + # Separator + lines.append('| ' + ' | '.join(['---'] * max_cols) + ' |') + # Data rows + for row in normalized[1:]: # type: ignore[assignment] + lines.append('| ' + ' | '.join(row) + ' |') # type: ignore[arg-type] + + return '\n'.join(lines) diff --git a/tests/drivers/test_pdfplumber.py b/tests/drivers/test_pdfplumber.py new file mode 100644 index 0000000..a6d82d5 --- /dev/null +++ b/tests/drivers/test_pdfplumber.py @@ -0,0 +1,133 @@ +import os +import pytest +from unittest.mock import Mock, patch, MagicMock + +from parxy_core.models import Page + +from parxy_core.drivers import PDFPlumberDriver +from parxy_core.exceptions import FileNotFoundException + + +class TestPDFPlumberDriver: + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def test_pdfplumber_driver_can_be_created(self): + driver = PDFPlumberDriver() + + assert driver.supported_levels == ['page', 'block'] + + def test_pdfplumber_driver_unrecognized_level_handled(self): + driver = PDFPlumberDriver() + + path = self.__fixture_path('empty-doc.pdf') + + with pytest.raises(ValueError) as excinfo: + driver.parse(path, level='custom') + + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + + def test_pdfplumber_driver_handle_not_existing_file(self): + driver = PDFPlumberDriver() + + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level='page') + + def test_pdfplumber_driver_read_empty_document_page_level(self): + driver = PDFPlumberDriver() + + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.outline is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].blocks is None + assert document.pages[0].text == '1' + assert document.pages[0].number == 1 + + def test_pdfplumber_driver_read_document(self): + driver = PDFPlumberDriver() + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.metadata is None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].blocks is None + assert document.pages[0].number == 1 + assert ( + document.pages[0].text + == 'This is the header\nThis is a test PDF to be used as input in unit\ntests\nThis is a heading 1\nThis is a paragraph below heading 1\n1' + ) + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_pdfplumber_driver_tracing_span_created(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + driver = PDFPlumberDriver() + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + mock_tracer.span.assert_called() + + span_calls = mock_tracer.span.call_args_list + doc_processing_call = [ + c for c in span_calls if c[0][0] == 'document-processing' + ][0] + + assert doc_processing_call[1]['driver'] == 'PDFPlumberDriver' + assert doc_processing_call[1]['level'] == 'page' + + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.processed' + assert count_call[1]['driver'] == 'PDFPlumberDriver' + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_pdfplumber_driver_tracing_exception_recorded(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = PDFPlumberDriver() + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level='page') + + mock_tracer.error.assert_called_once() + error_call = mock_tracer.error.call_args + assert error_call[0][0] == 'Parsing failed' + + mock_tracer.count.assert_called_once() + + def test_pdfplumber_driver_records_elapsed_time(self): + driver = PDFPlumberDriver() + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document.parsing_metadata is not None + assert 'driver_elapsed_time' in document.parsing_metadata + assert isinstance(document.parsing_metadata['driver_elapsed_time'], float) + assert document.parsing_metadata['driver_elapsed_time'] > 0 diff --git a/tests/test_factory.py b/tests/test_factory.py index 8ac9f61..d243ae0 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -10,6 +10,7 @@ from parxy_core.drivers import UnstructuredLocalDriver from parxy_core.drivers.landingai import LandingAIADEDriver from parxy_core.drivers import PyPDFium2Driver +from parxy_core.drivers import PDFPlumberDriver from parxy_core.models import Document from parxy_core.models import ParxyConfig @@ -122,3 +123,8 @@ def test_pypdfium_driver_instantiated(self): DriverFactory.reset() driver = DriverFactory.build().driver('pypdfium') assert isinstance(driver, PyPDFium2Driver) + + def test_pdfplumber_driver_instantiated(self): + DriverFactory.reset() + driver = DriverFactory.build().driver('pdfplumber') + assert isinstance(driver, PDFPlumberDriver) diff --git a/uv.lock b/uv.lock index 26a5c39..2881f7f 100644 --- a/uv.lock +++ b/uv.lock @@ -1921,6 +1921,7 @@ all = [ { name = "landingai-ade" }, { name = "llama-cloud" }, { name = "llmwhisperer-client" }, + { name = "pdfplumber" }, { name = "pypdfium2" }, { name = "textual" }, { name = "unstructured", extra = ["pdf"] }, @@ -1934,6 +1935,9 @@ llama = [ llmwhisperer = [ { name = "llmwhisperer-client" }, ] +pdfplumber = [ + { name = "pdfplumber" }, +] pypdfium2 = [ { name = "pypdfium2" }, ] @@ -1963,6 +1967,8 @@ requires-dist = [ { name = "opentelemetry-exporter-otlp", specifier = ">=1.37.0" }, { name = "opentelemetry-proto", specifier = ">=1.37.0" }, { name = "opentelemetry-sdk", specifier = ">=1.37.0" }, + { name = "pdfplumber", marker = "extra == 'all'", specifier = ">=0.11.0" }, + { name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.0" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "pydantic-settings", specifier = ">=2.10.1" }, { name = "pymupdf", specifier = ">=1.26.3" }, @@ -1978,7 +1984,7 @@ requires-dist = [ { name = "unstructured", extras = ["pdf"], marker = "extra == 'unstructured-local'", specifier = ">=0.18.13" }, { name = "validators", specifier = ">=0.35.0" }, ] -provides-extras = ["llama", "llmwhisperer", "unstructured-local", "landingai", "tui", "pypdfium2", "all"] +provides-extras = ["llama", "llmwhisperer", "unstructured-local", "landingai", "tui", "pypdfium2", "pdfplumber", "all"] [package.metadata.requires-dev] dev = [ @@ -2000,15 +2006,29 @@ wheels = [ [[package]] name = "pdfminer-six" -version = "20260107" +version = "20251230" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "charset-normalizer" }, { name = "cryptography" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/34/a4/5cec1112009f0439a5ca6afa8ace321f0ab2f48da3255b7a1c8953014670/pdfminer_six-20260107.tar.gz", hash = "sha256:96bfd431e3577a55a0efd25676968ca4ce8fd5b53f14565f85716ff363889602", size = 8512094, upload-time = "2026-01-07T13:29:12.937Z" } +sdist = { url = "https://files.pythonhosted.org/packages/46/9a/d79d8fa6d47a0338846bb558b39b9963b8eb2dfedec61867c138c1b17eeb/pdfminer_six-20251230.tar.gz", hash = "sha256:e8f68a14c57e00c2d7276d26519ea64be1b48f91db1cdc776faa80528ca06c1e", size = 8511285, upload-time = "2025-12-30T15:49:13.104Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/d7/b288ea32deb752a09aab73c75e1e7572ab2a2b56c3124a5d1eb24c62ceb3/pdfminer_six-20251230-py3-none-any.whl", hash = "sha256:9ff2e3466a7dfc6de6fd779478850b6b7c2d9e9405aa2a5869376a822771f485", size = 6591909, upload-time = "2025-12-30T15:49:10.76Z" }, +] + +[[package]] +name = "pdfplumber" +version = "0.11.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pdfminer-six" }, + { name = "pillow" }, + { name = "pypdfium2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/37/9ca3519e92a8434eb93be570b131476cc0a4e840bb39c62ddb7813a39d53/pdfplumber-0.11.9.tar.gz", hash = "sha256:481224b678b2bbdbf376e2c39bf914144eef7c3d301b4a28eebf0f7f6109d6dc", size = 102768, upload-time = "2026-01-05T08:10:29.072Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/20/8b/28c4eaec9d6b036a52cb44720408f26b1a143ca9bce76cc19e8f5de00ab4/pdfminer_six-20260107-py3-none-any.whl", hash = "sha256:366585ba97e80dffa8f00cebe303d2f381884d8637af4ce422f1df3ef38111a9", size = 6592252, upload-time = "2026-01-07T13:29:10.742Z" }, + { url = "https://files.pythonhosted.org/packages/8b/c8/cdbc975f5b634e249cfa6597e37c50f3078412474f21c015e508bfbfe3c3/pdfplumber-0.11.9-py3-none-any.whl", hash = "sha256:33ec5580959ba524e9100138746e090879504c42955df1b8a997604dd326c443", size = 60045, upload-time = "2026-01-05T08:10:27.512Z" }, ] [[package]]