From 49a2d263511033de136682bdfdd9906a7e641a51 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Mon, 25 May 2026 15:03:23 +0200 Subject: [PATCH 1/2] Add support for Reducto --- README.md | 2 + docs/supported_services.md | 1 + pyproject.toml | 4 + src/parxy_core/drivers/__init__.py | 1 + src/parxy_core/drivers/factory.py | 6 + src/parxy_core/drivers/reducto.py | 372 +++++++++++++++++++ src/parxy_core/facade/parxy.py | 1 + src/parxy_core/models/__init__.py | 1 + src/parxy_core/models/config.py | 28 +- tests/drivers/test_reducto.py | 571 +++++++++++++++++++++++++++++ uv.lock | 25 +- 11 files changed, 1010 insertions(+), 2 deletions(-) create mode 100644 src/parxy_core/drivers/reducto.py create mode 100644 tests/drivers/test_reducto.py diff --git a/README.md b/README.md index 6702f2d..3df9364 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,8 @@ For more information take a look at our [Getting Started with Parxy tutorial](./ | [**pdfplumber**](https://github.com/jsvine/pdfplumber) | `pdfplumber` | ✅ | ✅ | Preview | | [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | `pdfminer` | ✅ | ✅ | Preview | | [**Docling**](https://docling-project.github.io/docling/) | `docling` | ✅ | ✅ | Preview | +| [**LiteParse**](https://github.com/run-llama/liteparse) | `liteparse` | ✅ | ✅ | Preview | +| [**Reducto**](https://reducto.ai/) | `reducto` | ✅ | ✅ | Preview | | [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | | | | Planned | | [**Chunkr**](https://www.chunkr.ai/) | | | | Planned | diff --git a/docs/supported_services.md b/docs/supported_services.md index 85b9ad9..4f6e903 100644 --- a/docs/supported_services.md +++ b/docs/supported_services.md @@ -21,6 +21,7 @@ Parxy supports the following document processing services and libraries. The **E | [**PDFMiner**](https://github.com/pdfminer/pdfminer.six) | Preview | `pdfminer` | ✅ | ✅ | | [**Docling**](https://docling-project.github.io/docling/) | Preview | `docling` | ✅ | ✅ | | [**LiteParse**](https://github.com/run-llama/liteparse) | Preview | `liteparse` | ✅ | ✅ | +| [**Reducto**](https://reducto.ai/) | Preview | `reducto` | ✅ | ✅ | Status meanings: **Live** = stable; **Preview** = functional but the API may change. diff --git a/pyproject.toml b/pyproject.toml index 582affe..cf365f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,9 @@ pdfminer = [ docling = [ "docling-slim[service-client]>=2.93.0", ] +reducto = [ + "reductoai>=0.22.0", +] all = [ "llama-cloud>=2.0.0", "llmwhisperer-client>=2.4.2", @@ -68,6 +71,7 @@ all = [ "pdfplumber>=0.11.0", "pdfminer.six>=20251230", "docling-slim[service-client]>=2.93.0", + "reductoai>=0.22.0", ] diff --git a/src/parxy_core/drivers/__init__.py b/src/parxy_core/drivers/__init__.py index eb215d6..0793dbc 100644 --- a/src/parxy_core/drivers/__init__.py +++ b/src/parxy_core/drivers/__init__.py @@ -15,3 +15,4 @@ from parxy_core.drivers.pdfminer import PDFMinerDriver as PDFMinerDriver from parxy_core.drivers.docling import DoclingDriver as DoclingDriver from parxy_core.drivers.liteparse import LiteParseDriver as LiteParseDriver +from parxy_core.drivers.reducto import ReductoDriver as ReductoDriver diff --git a/src/parxy_core/drivers/factory.py b/src/parxy_core/drivers/factory.py index 2fb2373..0472dfe 100644 --- a/src/parxy_core/drivers/factory.py +++ b/src/parxy_core/drivers/factory.py @@ -14,6 +14,7 @@ from parxy_core.drivers.pdfminer import PDFMinerDriver from parxy_core.drivers.docling import DoclingDriver from parxy_core.drivers.liteparse import LiteParseDriver +from parxy_core.drivers.reducto import ReductoDriver from parxy_core.models import ( PdfActConfig, LandingAIConfig, @@ -21,6 +22,7 @@ LlmWhispererConfig, UnstructuredLocalConfig, LiteParseConfig, + ReductoConfig, ParxyConfig, DoclingConfig, ) @@ -231,6 +233,9 @@ def _create_docling_driver(self) -> DoclingDriver: def _create_liteparse_driver(self) -> LiteParseDriver: return LiteParseDriver(config=LiteParseConfig(), logger=self._logger) + def _create_reducto_driver(self) -> ReductoDriver: + return ReductoDriver(config=ReductoConfig(), logger=self._logger) + def _create_landingai_driver(self) -> LandingAIADEDriver: """Create a LandingAI ADE Driver instance. @@ -310,6 +315,7 @@ def get_supported_drivers(self) -> List[str]: 'pdfminer', 'docling', 'liteparse', + 'reducto', ] return supported_drivers diff --git a/src/parxy_core/drivers/reducto.py b/src/parxy_core/drivers/reducto.py new file mode 100644 index 0000000..9902f93 --- /dev/null +++ b/src/parxy_core/drivers/reducto.py @@ -0,0 +1,372 @@ +import io +from pathlib import Path +from typing import TYPE_CHECKING, Optional + +from parxy_core.models.config import ReductoConfig +from parxy_core.tracing.utils import trace_with_output + +if TYPE_CHECKING: + from reducto import Reducto + from reducto.types.shared.parse_response import ( + ResultFullResult, + ResultFullResultChunkBlock, + ) +else: + Reducto = None + ResultFullResult = object + ResultFullResultChunkBlock = object + +from parxy_core.drivers import Driver +from parxy_core.models import ( + Document, + Page, + BoundingBox, + TextBlock, + TableBlock, + ImageBlock, + HierarchyLevel, +) +from parxy_core.exceptions import ( + ParsingException, + AuthenticationException, + FileNotFoundException, +) + +# Mapping from Reducto block types to WAI-ARIA document structure roles. +REDUCTO_TO_ROLE: dict[str, str] = { + 'Header': 'doc-pageheader', + 'Footer': 'doc-pagefooter', + 'Title': 'doc-title', + 'Section Header': 'heading', + 'Page Number': 'doc-pagefooter', + 'List Item': 'list', + 'Figure': 'figure', + 'Table': 'table', + 'Key Value': 'generic', + 'Text': 'paragraph', + 'Comment': 'generic', + 'Signature': 'generic', +} + +# Options that can be overridden per-call via kwargs +_PER_CALL_OPTIONS = frozenset( + { + 'extraction_mode', + 'table_output_format', + 'page_range', + 'summarize_figures', + } +) + + +class ReductoDriver(Driver): + """Reducto document processing via the Reducto Parse API. + + Attributes + ---------- + supported_levels : list of str + The supported extraction levels: `page`, `block`. + """ + + supported_levels = ['page', 'block'] + + _config: ReductoConfig + + def _initialize_driver(self): + try: + from reducto import Reducto as ReductoClient + + self._ReductoClient = ReductoClient + except ImportError as e: + raise ImportError( + 'Reducto dependencies not installed. ' + "Install with 'pip install reductoai'" + ) from e + + def _create_client(self) -> 'Reducto': + kwargs: dict = {} + if self._config and self._config.api_key: + kwargs['api_key'] = self._config.api_key.get_secret_value() + if self._config and self._config.environment: + kwargs['environment'] = self._config.environment + if self._config and self._config.base_url: + kwargs['base_url'] = self._config.base_url + if self._config and self._config.timeout: + kwargs['timeout'] = self._config.timeout + return self._ReductoClient(**kwargs) + + def _get_opt(self, overrides: dict, key: str, default=None): + """Return the value for ``key`` from overrides, config, or default.""" + if key in overrides: + return overrides[key] + if self._config and hasattr(self._config, key): + val = getattr(self._config, key) + if val is not None: + return val + return default + + def _handle( + self, + file: str | io.BytesIO | bytes, + level: str = 'block', + **kwargs, + ) -> Document: + """Parse a document using the Reducto Parse API. + + Parameters + ------- + file : str | io.BytesIO | bytes + Path, URL or stream of the file to parse. + level : str, optional + Desired extraction level. Must be one of `supported_levels`. Default is `"block"`. + **kwargs + Per-call configuration overrides. Supported options: + + - extraction_mode: Text extraction mode ('ocr' or 'hybrid') + - table_output_format: Table format ('html', 'json', 'md', 'csv', 'dynamic') + - page_range: Page range to process (e.g. {'start': 1, 'end': 5}) + - summarize_figures: If True, summarize figures using a vision model + + Returns + ------- + Document + A parsed `Document` in unified format. + + Raises + ------ + ImportError + If reductoai is not installed + AuthenticationException + If authentication with Reducto fails + FileNotFoundException + If the input file cannot be accessed + ParsingException + If any other parsing error occurs + """ + overrides = {k: v for k, v in kwargs.items() if k in _PER_CALL_OPTIONS} + client = self._create_client() + + try: + from reducto._exceptions import AuthenticationError, PermissionDeniedError + except ImportError: + AuthenticationError = Exception # type: ignore[assignment,misc] + PermissionDeniedError = Exception # type: ignore[assignment,misc] + + try: + filename, stream = self.handle_file_input(file) + upload_filename = Path(filename).name if filename else 'document.pdf' + + upload = client.upload(file=(upload_filename, stream)) + upload_file_id = upload.file_id + input_url = f'reducto://{upload_file_id}' + + with self._trace_parse(filename, stream, **kwargs) as span: + parse_kwargs: dict = {'input': input_url} + + settings: dict = {} + extraction_mode = self._get_opt(overrides, 'extraction_mode', None) + if extraction_mode: + settings['extraction_mode'] = extraction_mode + page_range = self._get_opt(overrides, 'page_range', None) + if page_range: + settings['page_range'] = page_range + if settings: + parse_kwargs['settings'] = settings + + formatting: dict = {} + table_output_format = self._get_opt( + overrides, 'table_output_format', None + ) + if table_output_format: + formatting['table_output_format'] = table_output_format + if formatting: + parse_kwargs['formatting'] = formatting + + enhance: dict = {} + summarize_figures = self._get_opt(overrides, 'summarize_figures', None) + if summarize_figures is not None: + enhance['summarize_figures'] = summarize_figures + if enhance: + parse_kwargs['enhance'] = enhance + + response = client.parse.run(**parse_kwargs) + + from reducto.lib.helpers import handle_url_response + + full_response = handle_url_response(response) + + span.set_attribute('output.document', full_response.model_dump_json()) + + except FileNotFoundError as fex: + raise FileNotFoundException(fex, self.__class__) from fex + except (AuthenticationError, PermissionDeniedError) as ex: + raise AuthenticationException( + message=str(ex), + service=self.__class__.__name__, + details={ + 'status_code': getattr(ex, 'status_code', None), + 'error_response': getattr(ex, 'body', None), + }, + ) from ex + except Exception as ex: + raise ParsingException(str(ex), self.__class__) from ex + + converted_document = reducto_to_parxy( + result=full_response.result, + filename=filename, + level=level, + ) + + if converted_document.parsing_metadata is None: + converted_document.parsing_metadata = {} + + converted_document.parsing_metadata['job_id'] = full_response.job_id + converted_document.parsing_metadata['upload_file_id'] = upload_file_id + converted_document.parsing_metadata['duration'] = full_response.duration + converted_document.parsing_metadata['num_pages'] = full_response.usage.num_pages + if full_response.usage.credits is not None: + converted_document.parsing_metadata['cost_estimation'] = ( + full_response.usage.credits + ) + converted_document.parsing_metadata['cost_estimation_unit'] = 'credits' + if full_response.pdf_url: + converted_document.parsing_metadata['pdf_url'] = full_response.pdf_url + + return converted_document + + +@trace_with_output('converting') +def reducto_to_parxy( + result: 'ResultFullResult', + filename: str, + level: str, +) -> Document: + """Convert a Reducto ``ResultFullResult`` to a ``Document`` object. + + Parameters + ---------- + result : ResultFullResult + The Reducto parse result. + filename : str + Original filename. + level : str + Desired extraction level. + + Returns + ------- + Document + The converted ``Document`` in unified format. + """ + blocks_by_page: dict[int, list] = {} + + for chunk in result.chunks: + for block in chunk.blocks: + page_num = block.bbox.page + if page_num not in blocks_by_page: + blocks_by_page[page_num] = [] + blocks_by_page[page_num].append(block) + + include_blocks = HierarchyLevel[level.upper()] >= HierarchyLevel.BLOCK + + pages = [] + for page_num in sorted(blocks_by_page.keys()): + raw_blocks = blocks_by_page[page_num] + page_text = '\n'.join(b.content for b in raw_blocks if b.content) + + page_blocks = None + if include_blocks: + page_blocks = [] + for raw_block in raw_blocks: + if raw_block.type == 'Table': + page_blocks.append(_convert_table_block(raw_block, page_num)) + elif raw_block.type == 'Figure': + page_blocks.append(_convert_image_block(raw_block, page_num)) + else: + page_blocks.append(_convert_text_block(raw_block, page_num)) + + pages.append( + Page( + number=page_num, + text=page_text, + blocks=page_blocks, + ) + ) + + return Document( + filename=filename, + pages=pages, + ) + + +def _convert_bbox(bbox) -> Optional[BoundingBox]: + if bbox is None: + return None + return BoundingBox( + x0=bbox.left, + y0=bbox.top, + x1=bbox.left + bbox.width, + y1=bbox.top + bbox.height, + ) + + +def _convert_text_block( + block: 'ResultFullResultChunkBlock', page_number: int +) -> TextBlock: + bbox = _convert_bbox(block.bbox) + role = REDUCTO_TO_ROLE.get(block.type, 'generic') + + source_data: dict = {} + if hasattr(block, 'model_dump'): + source_data = block.model_dump(exclude={'content', 'type', 'bbox'}) + + return TextBlock( + type='text', + role=role, + category=block.type, + text=block.content, + bbox=bbox, + page=page_number, + source_data=source_data, + ) + + +def _convert_table_block( + block: 'ResultFullResultChunkBlock', page_number: int +) -> TableBlock: + bbox = _convert_bbox(block.bbox) + + source_data: dict = {} + if hasattr(block, 'model_dump'): + source_data = block.model_dump(exclude={'content', 'type', 'bbox'}) + + return TableBlock( + type='table', + role='table', + category=block.type, + text=block.content, + bbox=bbox, + page=page_number, + source_data=source_data, + ) + + +def _convert_image_block( + block: 'ResultFullResultChunkBlock', page_number: int +) -> ImageBlock: + bbox = _convert_bbox(block.bbox) + image_url = getattr(block, 'image_url', None) + + source_data: dict = {} + if hasattr(block, 'model_dump'): + source_data = block.model_dump(exclude={'content', 'type', 'bbox'}) + + return ImageBlock( + type='image', + role='figure', + category=block.type, + name=image_url, + alt_text=block.content or None, + bbox=bbox, + page=page_number, + source_data=source_data, + ) diff --git a/src/parxy_core/facade/parxy.py b/src/parxy_core/facade/parxy.py index bc52b6f..5c1e533 100644 --- a/src/parxy_core/facade/parxy.py +++ b/src/parxy_core/facade/parxy.py @@ -39,6 +39,7 @@ class Parxy: LLAMAPARSE = 'llamaparse' LLMWHISPERER = 'llmwhisperer' UNSTRUCTURED_LIBRARY = 'unstructured_local' + REDUCTO = 'reducto' # Private class variable to hold the DriverFactory instance _factory: Optional[DriverFactory] = None diff --git a/src/parxy_core/models/__init__.py b/src/parxy_core/models/__init__.py index a2acee0..4a8a892 100644 --- a/src/parxy_core/models/__init__.py +++ b/src/parxy_core/models/__init__.py @@ -29,4 +29,5 @@ UnstructuredLocalConfig as UnstructuredLocalConfig, DoclingConfig as DoclingConfig, LiteParseConfig as LiteParseConfig, + ReductoConfig as ReductoConfig, ) diff --git a/src/parxy_core/models/config.py b/src/parxy_core/models/config.py index 8484edc..64e9554 100644 --- a/src/parxy_core/models/config.py +++ b/src/parxy_core/models/config.py @@ -4,7 +4,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict -from pydantic import Field, SecretStr, BaseModel +from pydantic import Field, SecretStr class BaseConfig(BaseSettings): @@ -267,6 +267,32 @@ class DoclingConfig(BaseConfig): ) +class ReductoConfig(BaseConfig): + """Configuration values for Reducto service. All env variables must start with `parxy_reducto_`""" + + api_key: Optional[SecretStr] = Field(exclude=True, default=None) + """The authentication key.""" + + environment: Optional[Literal['production', 'eu', 'au']] = None + """The Reducto environment. Options: 'production', 'eu', 'au'. Default None (uses production).""" + + base_url: Optional[str] = None + """Custom base URL. When set, takes precedence over environment.""" + + timeout: Optional[float] = None + """HTTP request timeout in seconds. Default None (uses SDK default).""" + + extraction_mode: Optional[Literal['ocr', 'hybrid']] = None + """Text extraction mode. 'hybrid' combines OCR with embedded text (default). 'ocr' uses OCR only.""" + + table_output_format: Optional[Literal['html', 'json', 'md', 'jsonbbox', 'dynamic', 'csv']] = None + """Table output format. Default None (uses API default of 'dynamic').""" + + model_config = SettingsConfigDict( + env_prefix='parxy_reducto_', env_file='.env', extra='ignore' + ) + + class LiteParseConfig(BaseConfig): """Configuration values for the LiteParse HTTP service. All env variables must start with `parxy_liteparse_`""" diff --git a/tests/drivers/test_reducto.py b/tests/drivers/test_reducto.py new file mode 100644 index 0000000..b31fb05 --- /dev/null +++ b/tests/drivers/test_reducto.py @@ -0,0 +1,571 @@ +import os +import pytest +from unittest.mock import Mock, patch, MagicMock + +from parxy_core.exceptions import AuthenticationException, FileNotFoundException +from parxy_core.models import Page, TextBlock, TableBlock, ImageBlock + +from parxy_core.drivers import ReductoDriver +from parxy_core.models import ReductoConfig + + +@pytest.mark.skipif( + os.getenv('GITHUB_ACTIONS') == 'true' or not os.getenv('PARXY_REDUCTO_API_KEY'), + reason='External service required. Set PARXY_REDUCTO_API_KEY to run these tests.', +) +class TestReductoDriver: + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def test_reducto_driver_can_be_created(self): + driver = ReductoDriver(ReductoConfig()) + + assert driver.supported_levels == ['page', 'block'] + + def test_reducto_driver_handle_invalid_key(self): + driver = ReductoDriver(ReductoConfig(api_key='invalid')) + + path = self.__fixture_path('empty-doc.pdf') + + with pytest.raises(AuthenticationException): + driver.parse(path) + + def test_reducto_driver_handle_not_existing_file(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path) + + def test_reducto_driver_unrecognized_level_handled(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('empty-doc.pdf') + + with pytest.raises(ValueError) as excinfo: + driver.parse(path, level='custom') + + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + + def test_reducto_driver_read_empty_document_block_level(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path) + + assert document is not None + assert document.language is None + assert document.outline is None + assert document.metadata is None + assert len(document.pages) >= 1 + assert isinstance(document.pages[0], Page) + assert isinstance(document.pages[0].text, str) + + def test_reducto_driver_read_empty_document_page_level(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('empty-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert document.language is None + assert document.outline is None + assert document.metadata is None + assert len(document.pages) >= 1 + assert isinstance(document.pages[0], Page) + assert isinstance(document.pages[0].text, str) + assert document.pages[0].blocks is None + + def test_reducto_driver_read_document(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='page') + + assert document is not None + assert len(document.pages) == 1 + assert isinstance(document.pages[0], Page) + assert document.pages[0].number == 1 + assert isinstance(document.pages[0].text, str) + assert len(document.pages[0].text) > 0 + + def test_reducto_driver_read_document_as_blocks(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + assert document is not None + assert len(document.pages) == 1 + assert isinstance(document.pages[0].blocks, list) + assert len(document.pages[0].blocks) > 0 + + def test_reducto_driver_read_document_with_tables(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('pdf-headings-images-tables.pdf') + document = driver.parse(path, level='block') + + assert document is not None + assert len(document.pages) > 0 + all_blocks = [b for page in document.pages for b in (page.blocks or [])] + assert any(isinstance(b, TableBlock) for b in all_blocks) + + def test_reducto_driver_page_numbers_are_populated(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + for page in document.pages: + assert isinstance(page.number, int) + assert page.number >= 1 + + def test_reducto_driver_parsing_metadata_populated(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + assert document.parsing_metadata is not None + assert 'job_id' in document.parsing_metadata + assert isinstance(document.parsing_metadata['job_id'], str) + assert 'upload_file_id' in document.parsing_metadata + assert isinstance(document.parsing_metadata['upload_file_id'], str) + assert 'duration' in document.parsing_metadata + assert isinstance(document.parsing_metadata['duration'], float) + assert 'num_pages' in document.parsing_metadata + assert document.parsing_metadata['num_pages'] >= 1 + + def test_reducto_driver_cost_estimation_populated(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + assert document.parsing_metadata is not None + assert 'cost_estimation' in document.parsing_metadata + assert isinstance(document.parsing_metadata['cost_estimation'], (int, float)) + assert document.parsing_metadata['cost_estimation'] >= 0 + assert 'cost_estimation_unit' in document.parsing_metadata + assert document.parsing_metadata['cost_estimation_unit'] == 'credits' + + def test_reducto_driver_records_elapsed_time(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + assert document.parsing_metadata is not None + assert 'driver_elapsed_time' in document.parsing_metadata + assert isinstance(document.parsing_metadata['driver_elapsed_time'], float) + assert document.parsing_metadata['driver_elapsed_time'] > 0 + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_reducto_driver_tracing_span_created(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.info = Mock() + + driver = ReductoDriver(ReductoConfig()) + path = self.__fixture_path('empty-doc.pdf') + driver.parse(path, level='block') + + mock_tracer.span.assert_called() + + span_calls = mock_tracer.span.call_args_list + doc_processing_call = [ + c for c in span_calls if c[0][0] == 'document-processing' + ][0] + + assert doc_processing_call[1]['driver'] == 'ReductoDriver' + assert doc_processing_call[1]['level'] == 'block' + + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.processed' + assert count_call[1]['driver'] == 'ReductoDriver' + + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_reducto_driver_tracing_exception_recorded(self, mock_tracer): + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = ReductoDriver(ReductoConfig()) + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level='block') + + mock_tracer.error.assert_called_once() + error_call = mock_tracer.error.call_args + assert error_call[0][0] == 'Parsing failed' + + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.failures' + assert count_call[1]['driver'] == 'ReductoDriver' + + +class TestReductoDriverUnit: + """Unit tests that mock the Reducto client — no API key or network required.""" + + def __fixture_path(self, file: str) -> str: + current_dir = os.path.dirname(os.path.abspath(__file__)) + fixtures_dir = os.path.join(os.path.dirname(current_dir), 'fixtures') + return os.path.join(fixtures_dir, file) + + def _build_full_response(self, blocks=None, job_id='test-job-id', credits=3.0): + """Build a realistic FullParseResponse using real SDK Pydantic models.""" + from reducto.lib.helpers import FullParseResponse + from reducto.types.parse_usage import ParseUsage + from reducto.types.shared.parse_response import ( + ResultFullResult, + ResultFullResultChunk, + ResultFullResultChunkBlock, + ) + from reducto.types.bounding_box import BoundingBox as ReductoBox + + if blocks is None: + bbox = ReductoBox(left=0.1, top=0.1, width=0.8, height=0.05, page=1) + blocks = [ + ResultFullResultChunkBlock( + type='Text', + content='This is a test paragraph.', + bbox=bbox, + ) + ] + + chunk = ResultFullResultChunk( + blocks=blocks, + content=' '.join(b.content for b in blocks), + embed=' '.join(b.content for b in blocks), + ) + result = ResultFullResult(chunks=[chunk], type='full') + return FullParseResponse( + duration=1.5, + job_id=job_id, + result=result, + usage=ParseUsage(num_pages=1, credits=credits), + ) + + def _make_mock_client(self, full_response): + """Return a mock Reducto client wired up with the given full_response.""" + mock_client = MagicMock() + mock_upload = MagicMock() + mock_upload.file_id = 'uploaded-file-id-abc' + mock_client.upload.return_value = mock_upload + mock_client.parse.run.return_value = MagicMock() + return mock_client + + def test_reducto_driver_can_be_created(self): + driver = ReductoDriver(ReductoConfig()) + + assert driver.supported_levels == ['page', 'block'] + + def test_reducto_driver_unrecognized_level_handled(self): + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('empty-doc.pdf') + + with pytest.raises(ValueError) as excinfo: + driver.parse(path, level='custom') + + assert 'not supported' in str(excinfo.value) + assert '[custom]' in str(excinfo.value) + + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_handle_not_existing_file(self, mock_create_client): + mock_create_client.return_value = MagicMock() + driver = ReductoDriver(ReductoConfig()) + + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path) + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_parsing_metadata_populated( + self, mock_create_client, mock_handle_url_response + ): + full_response = self._build_full_response(job_id='my-job-99', credits=6.0) + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + assert document.parsing_metadata is not None + assert document.parsing_metadata['job_id'] == 'my-job-99' + assert document.parsing_metadata['upload_file_id'] == 'uploaded-file-id-abc' + assert document.parsing_metadata['duration'] == 1.5 + assert document.parsing_metadata['num_pages'] == 1 + assert document.parsing_metadata['cost_estimation'] == 6.0 + assert document.parsing_metadata['cost_estimation_unit'] == 'credits' + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_upload_file_id_stored_in_metadata( + self, mock_create_client, mock_handle_url_response + ): + full_response = self._build_full_response() + mock_client = self._make_mock_client(full_response) + mock_client.upload.return_value.file_id = 'specific-file-id-xyz' + mock_create_client.return_value = mock_client + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + path = self.__fixture_path('test-doc.pdf') + document = driver.parse(path, level='block') + + assert document.parsing_metadata['upload_file_id'] == 'specific-file-id-xyz' + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_text_blocks_converted( + self, mock_create_client, mock_handle_url_response + ): + from reducto.types.shared.parse_response import ( + ResultFullResultChunkBlock, + ) + from reducto.types.bounding_box import BoundingBox as ReductoBox + + bbox = ReductoBox(left=0.0, top=0.0, width=1.0, height=0.1, page=1) + blocks = [ + ResultFullResultChunkBlock(type='Text', content='A paragraph.', bbox=bbox), + ResultFullResultChunkBlock( + type='Section Header', content='A heading.', bbox=bbox + ), + ] + full_response = self._build_full_response(blocks=blocks) + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + document = driver.parse(self.__fixture_path('test-doc.pdf'), level='block') + + assert len(document.pages) == 1 + page_blocks = document.pages[0].blocks + assert len(page_blocks) == 2 + assert all(isinstance(b, TextBlock) for b in page_blocks) + assert page_blocks[0].role == 'paragraph' + assert page_blocks[0].category == 'Text' + assert page_blocks[1].role == 'heading' + assert page_blocks[1].category == 'Section Header' + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_table_blocks_converted( + self, mock_create_client, mock_handle_url_response + ): + from reducto.types.shared.parse_response import ( + ResultFullResultChunkBlock, + ) + from reducto.types.bounding_box import BoundingBox as ReductoBox + + bbox = ReductoBox(left=0.0, top=0.1, width=1.0, height=0.3, page=1) + blocks = [ + ResultFullResultChunkBlock( + type='Table', content='| col1 | col2 |\n| a | b |', bbox=bbox + ), + ] + full_response = self._build_full_response(blocks=blocks) + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + document = driver.parse(self.__fixture_path('test-doc.pdf'), level='block') + + page_blocks = document.pages[0].blocks + assert len(page_blocks) == 1 + assert isinstance(page_blocks[0], TableBlock) + assert page_blocks[0].role == 'table' + assert page_blocks[0].text == '| col1 | col2 |\n| a | b |' + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_image_blocks_converted( + self, mock_create_client, mock_handle_url_response + ): + from reducto.types.shared.parse_response import ( + ResultFullResultChunkBlock, + ) + from reducto.types.bounding_box import BoundingBox as ReductoBox + + bbox = ReductoBox(left=0.2, top=0.2, width=0.6, height=0.4, page=1) + blocks = [ + ResultFullResultChunkBlock( + type='Figure', content='A chart showing revenue.', bbox=bbox + ), + ] + full_response = self._build_full_response(blocks=blocks) + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + document = driver.parse(self.__fixture_path('test-doc.pdf'), level='block') + + page_blocks = document.pages[0].blocks + assert len(page_blocks) == 1 + assert isinstance(page_blocks[0], ImageBlock) + assert page_blocks[0].role == 'figure' + assert page_blocks[0].alt_text == 'A chart showing revenue.' + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_bounding_box_converted( + self, mock_create_client, mock_handle_url_response + ): + from reducto.types.shared.parse_response import ( + ResultFullResultChunkBlock, + ) + from reducto.types.bounding_box import BoundingBox as ReductoBox + + bbox = ReductoBox(left=0.1, top=0.2, width=0.5, height=0.3, page=1) + blocks = [ + ResultFullResultChunkBlock(type='Text', content='Block with bbox.', bbox=bbox) + ] + full_response = self._build_full_response(blocks=blocks) + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + document = driver.parse(self.__fixture_path('test-doc.pdf'), level='block') + + block = document.pages[0].blocks[0] + assert block.bbox is not None + assert block.bbox.x0 == pytest.approx(0.1) + assert block.bbox.y0 == pytest.approx(0.2) + assert block.bbox.x1 == pytest.approx(0.6) + assert block.bbox.y1 == pytest.approx(0.5) + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_page_level_has_no_blocks( + self, mock_create_client, mock_handle_url_response + ): + full_response = self._build_full_response() + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + document = driver.parse(self.__fixture_path('test-doc.pdf'), level='page') + + assert len(document.pages) == 1 + assert document.pages[0].blocks is None + assert isinstance(document.pages[0].text, str) + + @patch('reducto.lib.helpers.handle_url_response') + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + def test_reducto_driver_blocks_grouped_by_page( + self, mock_create_client, mock_handle_url_response + ): + from reducto.types.shared.parse_response import ( + ResultFullResult, + ResultFullResultChunk, + ResultFullResultChunkBlock, + ) + from reducto.types.bounding_box import BoundingBox as ReductoBox + from reducto.lib.helpers import FullParseResponse + from reducto.types.parse_usage import ParseUsage + + bbox_p1 = ReductoBox(left=0.0, top=0.0, width=1.0, height=0.1, page=1) + bbox_p2 = ReductoBox(left=0.0, top=0.0, width=1.0, height=0.1, page=2) + blocks = [ + ResultFullResultChunkBlock(type='Text', content='Page 1 text.', bbox=bbox_p1), + ResultFullResultChunkBlock(type='Text', content='Page 2 text.', bbox=bbox_p2), + ] + chunk = ResultFullResultChunk(blocks=blocks, content='Page 1 text. Page 2 text.', embed='') + result = ResultFullResult(chunks=[chunk], type='full') + full_response = FullParseResponse( + duration=2.0, + job_id='multi-page-job', + result=result, + usage=ParseUsage(num_pages=2, credits=6.0), + ) + + mock_create_client.return_value = self._make_mock_client(full_response) + mock_handle_url_response.return_value = full_response + + driver = ReductoDriver(ReductoConfig()) + document = driver.parse(self.__fixture_path('test-doc.pdf'), level='block') + + assert len(document.pages) == 2 + assert document.pages[0].number == 1 + assert len(document.pages[0].blocks) == 1 + assert document.pages[0].blocks[0].text == 'Page 1 text.' + assert document.pages[1].number == 2 + assert len(document.pages[1].blocks) == 1 + assert document.pages[1].blocks[0].text == 'Page 2 text.' + + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_reducto_driver_handles_authentication_error( + self, mock_tracer, mock_create_client + ): + from reducto._exceptions import AuthenticationError as ReductoAuthError + from httpx import Response, Request + + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + mock_client = MagicMock() + mock_create_client.return_value = mock_client + mock_client.upload.side_effect = ReductoAuthError( + 'Invalid API key', + response=Response( + status_code=401, + request=Request('POST', 'https://platform.reducto.ai/upload'), + ), + body={'error': 'Invalid API key'}, + ) + + driver = ReductoDriver(ReductoConfig(api_key='invalid')) + path = self.__fixture_path('test-doc.pdf') + + with pytest.raises(AuthenticationException) as excinfo: + driver.parse(path) + + assert excinfo.value.service == 'ReductoDriver' + + @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') + @patch('parxy_core.drivers.abstract_driver.tracer') + def test_reducto_driver_tracing_exception_recorded(self, mock_tracer, mock_create_client): + mock_create_client.return_value = MagicMock() + mock_span = MagicMock() + mock_span.__enter__ = Mock(return_value=mock_span) + mock_span.__exit__ = Mock(return_value=False) + mock_tracer.span = Mock(return_value=mock_span) + mock_tracer.count = Mock() + mock_tracer.error = Mock() + + driver = ReductoDriver(ReductoConfig()) + path = self.__fixture_path('non-existing-file.pdf') + + with pytest.raises(FileNotFoundException): + driver.parse(path, level='block') + + mock_tracer.error.assert_called_once() + assert mock_tracer.error.call_args[0][0] == 'Parsing failed' + + mock_tracer.count.assert_called_once() + count_call = mock_tracer.count.call_args + assert count_call[0][0] == 'documents.failures' + assert count_call[1]['driver'] == 'ReductoDriver' diff --git a/uv.lock b/uv.lock index ea58672..89777e5 100644 --- a/uv.lock +++ b/uv.lock @@ -2037,6 +2037,7 @@ all = [ { name = "pdfminer-six" }, { name = "pdfplumber" }, { name = "pypdfium2" }, + { name = "reductoai" }, { name = "textual" }, { name = "unstructured", extra = ["pdf"] }, ] @@ -2061,6 +2062,9 @@ pdfplumber = [ pypdfium2 = [ { name = "pypdfium2" }, ] +reducto = [ + { name = "reductoai" }, +] tui = [ { name = "textual" }, ] @@ -2100,6 +2104,8 @@ requires-dist = [ { name = "pypdfium2", marker = "extra == 'all'", specifier = ">=5.7.1" }, { name = "pypdfium2", marker = "extra == 'pypdfium2'", specifier = ">=5.7.1" }, { name = "python-dotenv", specifier = ">=1.0.1" }, + { name = "reductoai", marker = "extra == 'all'", specifier = ">=0.22.0" }, + { name = "reductoai", marker = "extra == 'reducto'", specifier = ">=0.22.0" }, { name = "requests", specifier = ">=2.32.4" }, { name = "rich", specifier = ">=13.7.0" }, { name = "textual", marker = "extra == 'all'", specifier = ">=0.89.0" }, @@ -2109,7 +2115,7 @@ requires-dist = [ { name = "unstructured", extras = ["pdf"], marker = "extra == 'unstructured-local'", specifier = ">=0.18.13" }, { name = "validators", specifier = ">=0.35.0" }, ] -provides-extras = ["llama", "llmwhisperer", "unstructured-local", "landingai", "tui", "pypdfium2", "pdfplumber", "pdfminer", "docling", "all"] +provides-extras = ["llama", "llmwhisperer", "unstructured-local", "landingai", "tui", "pypdfium2", "pdfplumber", "pdfminer", "docling", "reducto", "all"] [package.metadata.requires-dev] dev = [ @@ -2800,6 +2806,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/a6/51fc1b0e61e3326e1c68a61cfd0c6b3c34c843681c4b1eefbf0596f59162/rapidfuzz-3.14.5-cp314-cp314t-win_arm64.whl", hash = "sha256:3e91dcd2549b8f8d843f98ba03a17e01f3d8b72ce942adbbb6761bc58ffce813", size = 855409, upload-time = "2026-04-07T11:16:15.787Z" }, ] +[[package]] +name = "reductoai" +version = "0.22.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2d/56/fc608656c68c261a2c61056d5fa8f59440a6a023b8071a5295a10654b4b9/reductoai-0.22.0.tar.gz", hash = "sha256:77965a930627f4f440fe7d7dbe7318c643da589e4c1fefbb2d4106b15af77a71", size = 313933, upload-time = "2026-03-29T01:25:49.705Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/bb/fa663e679011a91f41012cb66aa18e14278ea617868a769369895efd70d0/reductoai-0.22.0-py3-none-any.whl", hash = "sha256:a0c8d9c0372c49d618da83304f62ed0da0aae57fe37abd4072dbe87e206aa2e6", size = 154407, upload-time = "2026-03-29T01:25:48.356Z" }, +] + [[package]] name = "referencing" version = "0.37.0" From 4f34f02199a515f53320aca203457ada1c0224f0 Mon Sep 17 00:00:00 2001 From: avvertix <5672748+avvertix@users.noreply.github.com> Date: Mon, 25 May 2026 13:28:42 +0000 Subject: [PATCH 2/2] Fix styling --- src/parxy_core/models/config.py | 4 +++- tests/drivers/test_reducto.py | 20 +++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/parxy_core/models/config.py b/src/parxy_core/models/config.py index 64e9554..05dc4a2 100644 --- a/src/parxy_core/models/config.py +++ b/src/parxy_core/models/config.py @@ -285,7 +285,9 @@ class ReductoConfig(BaseConfig): extraction_mode: Optional[Literal['ocr', 'hybrid']] = None """Text extraction mode. 'hybrid' combines OCR with embedded text (default). 'ocr' uses OCR only.""" - table_output_format: Optional[Literal['html', 'json', 'md', 'jsonbbox', 'dynamic', 'csv']] = None + table_output_format: Optional[ + Literal['html', 'json', 'md', 'jsonbbox', 'dynamic', 'csv'] + ] = None """Table output format. Default None (uses API default of 'dynamic').""" model_config = SettingsConfigDict( diff --git a/tests/drivers/test_reducto.py b/tests/drivers/test_reducto.py index b31fb05..7657f9c 100644 --- a/tests/drivers/test_reducto.py +++ b/tests/drivers/test_reducto.py @@ -436,7 +436,9 @@ def test_reducto_driver_bounding_box_converted( bbox = ReductoBox(left=0.1, top=0.2, width=0.5, height=0.3, page=1) blocks = [ - ResultFullResultChunkBlock(type='Text', content='Block with bbox.', bbox=bbox) + ResultFullResultChunkBlock( + type='Text', content='Block with bbox.', bbox=bbox + ) ] full_response = self._build_full_response(blocks=blocks) mock_create_client.return_value = self._make_mock_client(full_response) @@ -485,10 +487,16 @@ def test_reducto_driver_blocks_grouped_by_page( bbox_p1 = ReductoBox(left=0.0, top=0.0, width=1.0, height=0.1, page=1) bbox_p2 = ReductoBox(left=0.0, top=0.0, width=1.0, height=0.1, page=2) blocks = [ - ResultFullResultChunkBlock(type='Text', content='Page 1 text.', bbox=bbox_p1), - ResultFullResultChunkBlock(type='Text', content='Page 2 text.', bbox=bbox_p2), + ResultFullResultChunkBlock( + type='Text', content='Page 1 text.', bbox=bbox_p1 + ), + ResultFullResultChunkBlock( + type='Text', content='Page 2 text.', bbox=bbox_p2 + ), ] - chunk = ResultFullResultChunk(blocks=blocks, content='Page 1 text. Page 2 text.', embed='') + chunk = ResultFullResultChunk( + blocks=blocks, content='Page 1 text. Page 2 text.', embed='' + ) result = ResultFullResult(chunks=[chunk], type='full') full_response = FullParseResponse( duration=2.0, @@ -547,7 +555,9 @@ def test_reducto_driver_handles_authentication_error( @patch('parxy_core.drivers.reducto.ReductoDriver._create_client') @patch('parxy_core.drivers.abstract_driver.tracer') - def test_reducto_driver_tracing_exception_recorded(self, mock_tracer, mock_create_client): + def test_reducto_driver_tracing_exception_recorded( + self, mock_tracer, mock_create_client + ): mock_create_client.return_value = MagicMock() mock_span = MagicMock() mock_span.__enter__ = Mock(return_value=mock_span)