Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ For more information take a look at our [Getting Started with Parxy tutorial](./
| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | `llama` | ✅ | ✅ | Preview |
| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | `llmwhisperer` | ✅ | ✅ | Preview |
| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | `pypdfium2` | ✅ | ✅ | Preview |
| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | `pdfplumber` | ✅ | ✅ | Preview |
| [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | | | | Planned |
| [**Chunkr**](https://www.chunkr.ai/) | | | | Planned |
| [**Docling**](https://docling-project.github.io/docling/) | | | | Planned |
Expand Down
1 change: 1 addition & 0 deletions docs/supported_services.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Parxy supports the following document processing services and libraries. The **E
| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ |
| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ |
| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | Preview | `pypdfium2` | ✅ | ✅ |
| [**pdfplumber**](https://github.com/jsvine/pdfplumber) | Preview | `pdfplumber` | ✅ | ✅ |

Status meanings: **Live** = stable; **Preview** = functional but the API may change.

Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,17 @@ tui = [
pypdfium2 = [
"pypdfium2>=5.7.1",
]
pdfplumber = [
"pdfplumber>=0.11.0",
]
all = [
"llama-cloud>=2.0.0",
"llmwhisperer-client>=2.4.2",
"unstructured[pdf]>=0.18.13",
"landingai-ade>=0.15.1",
"textual>=0.89.0",
"pypdfium2>=5.7.1",
"pdfplumber>=0.11.0",
]


Expand Down
1 change: 1 addition & 0 deletions src/parxy_core/drivers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
from parxy_core.drivers.pypdfium2 import (
PyPDFium2Driver as PyPDFium2Driver,
)
from parxy_core.drivers.pdfplumber import PDFPlumberDriver as PDFPlumberDriver
5 changes: 5 additions & 0 deletions src/parxy_core/drivers/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from parxy_core.drivers.llmwhisperer import LlmWhispererDriver
from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver
from parxy_core.drivers.pypdfium2 import PyPDFium2Driver
from parxy_core.drivers.pdfplumber import PDFPlumberDriver
from parxy_core.models import (
PdfActConfig,
LandingAIConfig,
Expand Down Expand Up @@ -213,6 +214,9 @@ def _create_unstructured_local_driver(self) -> UnstructuredLocalDriver:
def _create_pypdfium_driver(self) -> PyPDFium2Driver:
    """Build a PyPDFium2 driver that shares this factory's logger."""
    driver = PyPDFium2Driver(logger=self._logger)
    return driver

def _create_pdfplumber_driver(self) -> PDFPlumberDriver:
    """Build a pdfplumber driver that shares this factory's logger."""
    driver = PDFPlumberDriver(logger=self._logger)
    return driver

def _create_landingai_driver(self) -> LandingAIADEDriver:
"""Create a LandingAI ADE Driver instance.

Expand Down Expand Up @@ -288,6 +292,7 @@ def get_supported_drivers(self) -> List[str]:
'llmwhisperer',
'unstructured_local',
'pypdfium',
'pdfplumber',
]

return supported_drivers
Expand Down
124 changes: 124 additions & 0 deletions src/parxy_core/drivers/pdfplumber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""pdfplumber driver for parxy."""

import io
from typing import Any

from parxy_core.drivers import Driver
from parxy_core.models import Document, Page


class PDFPlumberDriver(Driver):
    """PDF parser backed by pdfplumber.

    Every page is returned as a single text value: tables found by
    pdfplumber are rendered first as GitHub Flavored Markdown, followed by
    the plain text of the page. No block-level structure is produced.
    """

    # 'block' is listed only because it is the default level for Parxy; the
    # output is always page-level (``Page.blocks`` is ``None``).
    supported_levels = ['page', 'block']

    def _initialize_driver(self):
        """Check that the pdfplumber library can be imported.

        Returns
        -------
        PDFPlumberDriver
            This driver instance.

        Raises
        ------
        ImportError
            If pdfplumber is not installed.
        """
        try:
            import pdfplumber  # noqa: F401
        except ImportError as e:
            raise ImportError(
                'pdfplumber is required. Install with: pip install parxy[pdfplumber]'
            ) from e
        return self

    def _handle(
        self, file: str | io.BytesIO | bytes, level: str = 'page', **kwargs
    ) -> Document:
        """Parse a PDF into a Document, one Page per PDF page.

        Parameters
        ----------
        file : str | io.BytesIO | bytes
            The file to parse, resolved through ``handle_file_input``.
        level : str, optional
            Desired extraction level. Default is "page". The value does not
            change the output: 'block' is accepted but pages are returned
            without blocks.
        **kwargs : dict
            Additional keyword arguments, forwarded to the tracing helper.

        Returns
        -------
        Document
            A parsed Document in unified format. A PDF without pages yields
            a Document with an empty ``pages`` list.
        """
        import pdfplumber

        filename, stream = self.handle_file_input(file)

        with self._trace_parse(filename, stream, **kwargs) as span:
            with pdfplumber.open(io.BytesIO(stream)) as pdf:
                pages = [
                    Page(
                        number=page_num,
                        text=self._extract_page(page).strip(),
                        blocks=None,
                    )
                    for page_num, page in enumerate(pdf.pages or [], start=1)
                ]

            # Recorded for empty documents too, so the span always carries
            # the page count.
            span.set_attribute('output.pages', len(pages))

            return Document(
                filename=filename,
                pages=pages,
            )

    def _extract_page(self, page: Any) -> str:
        """Extract the content of a single pdfplumber page.

        Parameters
        ----------
        page : Any
            A pdfplumber page object exposing ``extract_tables()`` and
            ``extract_text()``.

        Returns
        -------
        str
            Markdown tables followed by the page text, separated by blank
            lines. Empty string when the page yields neither.
        """
        content_parts = []

        # Tables first, each rendered as a Markdown table.
        for table in page.extract_tables() or []:
            table_md = self._table_to_markdown(table)
            if table_md:
                content_parts.append(table_md)

        # NOTE(review): extract_text() is called on the whole page, so the
        # text of table cells presumably appears again here - confirm
        # whether de-duplication is wanted.
        text = page.extract_text()
        if text and text.strip():
            content_parts.append(text.strip())

        return '\n\n'.join(content_parts)

    @staticmethod
    def _format_cell(cell: Any) -> str:
        """Render one table cell as text that is safe inside a Markdown row."""
        if cell is None:
            return ''
        # A line break inside a cell would end the Markdown row early, so all
        # whitespace runs (including newlines) collapse to a single space.
        flattened = ' '.join(str(cell).split())
        # An unescaped pipe would be read as a column delimiter.
        return flattened.replace('|', '\\|')

    def _table_to_markdown(self, table: list[list[str | None]]) -> str:
        """Convert an extracted table to GitHub Flavored Markdown.

        The first non-empty row is used as the header. Short rows are padded
        with empty cells so every row has the same number of columns.

        Parameters
        ----------
        table : list[list[str | None]]
            Rows of cell values; ``None`` marks a missing cell.

        Returns
        -------
        str
            The Markdown table, or an empty string when fewer than two
            non-empty rows are available.
        """
        if not table:
            return ''

        # Drop rows without any content before counting, so a header followed
        # only by empty rows is not emitted as a header-only table.
        rows = [row for row in table if any(row)]
        if len(rows) < 2:
            return ''

        max_cols = max(len(row) for row in rows)

        normalized: list[list[str]] = [
            [self._format_cell(cell) for cell in row] + [''] * (max_cols - len(row))
            for row in rows
        ]

        header, *data_rows = normalized
        lines = [
            '| ' + ' | '.join(header) + ' |',
            '| ' + ' | '.join(['---'] * max_cols) + ' |',
        ]
        lines.extend('| ' + ' | '.join(row) + ' |' for row in data_rows)

        return '\n'.join(lines)
133 changes: 133 additions & 0 deletions tests/drivers/test_pdfplumber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import os
import pytest
from unittest.mock import Mock, patch, MagicMock

from parxy_core.models import Page

from parxy_core.drivers import PDFPlumberDriver
from parxy_core.exceptions import FileNotFoundException


class TestPDFPlumberDriver:
    """Tests for the pdfplumber driver: parsing, error handling and tracing."""

    def __fixture_path(self, file: str) -> str:
        """Return the path of ``file`` inside the ``fixtures`` folder next to this package."""
        tests_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        return os.path.join(tests_root, 'fixtures', file)

    def test_pdfplumber_driver_can_be_created(self):
        """A new driver advertises the page and block levels."""
        assert PDFPlumberDriver().supported_levels == ['page', 'block']

    def test_pdfplumber_driver_unrecognized_level_handled(self):
        """An unknown level is rejected with a ValueError naming the level."""
        driver = PDFPlumberDriver()
        pdf_path = self.__fixture_path('empty-doc.pdf')

        with pytest.raises(ValueError) as excinfo:
            driver.parse(pdf_path, level='custom')

        message = str(excinfo.value)
        assert 'not supported' in message
        assert '[custom]' in message

    def test_pdfplumber_driver_handle_not_existing_file(self):
        """A missing file raises FileNotFoundException."""
        driver = PDFPlumberDriver()
        missing_path = self.__fixture_path('non-existing-file.pdf')

        with pytest.raises(FileNotFoundException):
            driver.parse(missing_path, level='page')

    def test_pdfplumber_driver_read_empty_document_page_level(self):
        """The empty fixture yields one page whose text is '1'."""
        driver = PDFPlumberDriver()
        document = driver.parse(self.__fixture_path('empty-doc.pdf'), level='page')

        assert document is not None
        assert document.language is None
        assert document.outline is None
        assert document.metadata is None
        assert len(document.pages) == 1

        first_page = document.pages[0]
        assert isinstance(first_page, Page)
        assert first_page.blocks is None
        assert first_page.text == '1'
        assert first_page.number == 1

    def test_pdfplumber_driver_read_document(self):
        """The test fixture is parsed into a single page with the expected text."""
        expected_text = 'This is the header\nThis is a test PDF to be used as input in unit\ntests\nThis is a heading 1\nThis is a paragraph below heading 1\n1'

        driver = PDFPlumberDriver()
        document = driver.parse(self.__fixture_path('test-doc.pdf'), level='page')

        assert document is not None
        assert document.language is None
        assert document.metadata is None
        assert len(document.pages) == 1

        first_page = document.pages[0]
        assert isinstance(first_page, Page)
        assert first_page.blocks is None
        assert first_page.number == 1
        assert first_page.text == expected_text

    @patch('parxy_core.drivers.abstract_driver.tracer')
    def test_pdfplumber_driver_tracing_span_created(self, mock_tracer):
        """Parsing opens a 'document-processing' span and counts the document."""
        # The span mock acts as its own context manager.
        span = MagicMock()
        span.__enter__ = Mock(return_value=span)
        span.__exit__ = Mock(return_value=False)
        mock_tracer.span = Mock(return_value=span)
        mock_tracer.count = Mock()
        mock_tracer.info = Mock()

        driver = PDFPlumberDriver()
        driver.parse(self.__fixture_path('empty-doc.pdf'), level='page')

        mock_tracer.span.assert_called()

        processing_calls = [
            recorded
            for recorded in mock_tracer.span.call_args_list
            if recorded.args[0] == 'document-processing'
        ]
        processing_kwargs = processing_calls[0].kwargs

        assert processing_kwargs['driver'] == 'PDFPlumberDriver'
        assert processing_kwargs['level'] == 'page'

        mock_tracer.count.assert_called_once()
        counted = mock_tracer.count.call_args
        assert counted.args[0] == 'documents.processed'
        assert counted.kwargs['driver'] == 'PDFPlumberDriver'

    @patch('parxy_core.drivers.abstract_driver.tracer')
    def test_pdfplumber_driver_tracing_exception_recorded(self, mock_tracer):
        """A failed parse is reported once through the tracer's error and count."""
        # The span mock acts as its own context manager.
        span = MagicMock()
        span.__enter__ = Mock(return_value=span)
        span.__exit__ = Mock(return_value=False)
        mock_tracer.span = Mock(return_value=span)
        mock_tracer.count = Mock()
        mock_tracer.error = Mock()

        driver = PDFPlumberDriver()
        missing_path = self.__fixture_path('non-existing-file.pdf')

        with pytest.raises(FileNotFoundException):
            driver.parse(missing_path, level='page')

        mock_tracer.error.assert_called_once()
        assert mock_tracer.error.call_args.args[0] == 'Parsing failed'

        mock_tracer.count.assert_called_once()

    def test_pdfplumber_driver_records_elapsed_time(self):
        """The parsed document carries a positive float 'driver_elapsed_time'."""
        driver = PDFPlumberDriver()
        document = driver.parse(self.__fixture_path('test-doc.pdf'), level='page')

        parsing_metadata = document.parsing_metadata
        assert parsing_metadata is not None
        assert 'driver_elapsed_time' in parsing_metadata

        elapsed = parsing_metadata['driver_elapsed_time']
        assert isinstance(elapsed, float)
        assert elapsed > 0
6 changes: 6 additions & 0 deletions tests/test_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from parxy_core.drivers import UnstructuredLocalDriver
from parxy_core.drivers.landingai import LandingAIADEDriver
from parxy_core.drivers import PyPDFium2Driver
from parxy_core.drivers import PDFPlumberDriver
from parxy_core.models import Document
from parxy_core.models import ParxyConfig

Expand Down Expand Up @@ -122,3 +123,8 @@ def test_pypdfium_driver_instantiated(self):
DriverFactory.reset()
driver = DriverFactory.build().driver('pypdfium')
assert isinstance(driver, PyPDFium2Driver)

def test_pdfplumber_driver_instantiated(self):
    """The factory resolves the 'pdfplumber' name to a PDFPlumberDriver."""
    DriverFactory.reset()
    factory = DriverFactory.build()
    assert isinstance(factory.driver('pdfplumber'), PDFPlumberDriver)
Loading