OneOffTech · avvertix · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/README.md b/README.md
@@ -155,17 +155,18 @@ For more information take a look at our [Getting Started with Parxy tutorial](./
 
 ## Supported services
 
-| Service or Library | Support status | Extra | Local file | Remote file | 
-|--------------------|----------------|-------|------------|-------------|
-| [**PyMuPDF**](https://pymupdf.readthedocs.io/en/latest/) | Live | - | ✅ | ✅ |
-| [**PdfAct**](https://github.com/data-house/pdfact) | Live | - | ✅ | ✅ |
-| [**Unstructured** library](https://docs.unstructured.io/open-source/introduction/overview) | Preview | `unstructured_local` | ✅ | ✅ |
-| [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | Preview | `landingai` | ✅ | ✅ |
-| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ |
-| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ |
-| [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) | Planned |  |  |  |
-| [**Chunkr**](https://www.chunkr.ai/) | Planned |  |  |  |
-| [**Docling**](https://docling-project.github.io/docling/) | Planned |  |  |  |
+| Service or Library | Extra | Local file | Remote file | Status |
+|--------------------|-------|------------|-------------|----------------|
+| [**PyMuPDF**](https://pymupdf.readthedocs.io/en/latest/) |  - | ✅ | ✅ | |
+| [**PdfAct**](https://github.com/data-house/pdfact) |  - | ✅ | ✅ | |
+| [**Unstructured** library](https://docs.unstructured.io/open-source/introduction/overview) | `unstructured_local` | ✅ | ✅ | Preview |
+| [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | `landingai` | ✅ | ✅ | Preview |
+| [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | `llama` | ✅ | ✅ | Preview |
+| [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | `llmwhisperer` | ✅ | ✅ | Preview |
+| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | `pypdfium2` | ✅ | ✅ | Preview |
+| [**Unstructured.io** cloud service](https://docs.unstructured.io/open-source/introduction/overview) |   |  |  | Planned |
+| [**Chunkr**](https://www.chunkr.ai/) |   |  |  | Planned |
+| [**Docling**](https://docling-project.github.io/docling/) |   |  |  | Planned |
 
 
 ...and more can be added via the [live extension](#live-extension)!

diff --git a/docs/supported_services.md b/docs/supported_services.md
@@ -16,7 +16,7 @@ Parxy supports the following document processing services and libraries. The **E
 | [**Landing AI Agentic Document Extraction**](https://landing.ai/agentic-document-extraction) | Preview | `landingai` | ✅ | ✅ |
 | [**LlamaParse**](https://docs.cloud.llamaindex.ai/llamaparse/overview) | Preview | `llama` | ✅ | ✅ |
 | [**LLMWhisperer**](https://docs.unstract.com/llmwhisperer/index.html) | Preview | `llmwhisperer` | ✅ | ✅ |
-
+| [**Pypdfium2**](https://github.com/pypdfium2-team/pypdfium2) | Preview | `pypdfium2` | ✅ | ✅ |
 
 Status meanings: **Live** = stable; **Preview** = functional but the API may change.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -46,12 +46,16 @@ landingai = [
 tui = [
     "textual>=0.89.0",
 ]
+pypdfium2 = [
+    "pypdfium2>=5.7.1",
+]
 all = [
     "llama-cloud>=2.0.0",
     "llmwhisperer-client>=2.4.2",
     "unstructured[pdf]>=0.18.13",
     "landingai-ade>=0.15.1",
     "textual>=0.89.0",
+    "pypdfium2>=5.7.1",
 ]
 
 

diff --git a/src/parxy_core/drivers/__init__.py b/src/parxy_core/drivers/__init__.py
@@ -8,3 +8,6 @@
 from parxy_core.drivers.unstructured_local import (
     UnstructuredLocalDriver as UnstructuredLocalDriver,
 )
+from parxy_core.drivers.pypdfium2 import (
+    PyPDFium2Driver as PyPDFium2Driver,
+)
diff --git a/src/parxy_core/drivers/factory.py b/src/parxy_core/drivers/factory.py
@@ -9,6 +9,7 @@
 from parxy_core.drivers.llamaparse import LlamaParseDriver
 from parxy_core.drivers.llmwhisperer import LlmWhispererDriver
 from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver
+from parxy_core.drivers.pypdfium2 import PyPDFium2Driver
 from parxy_core.models import (
     PdfActConfig,
     LandingAIConfig,
@@ -209,6 +210,9 @@ def _create_unstructured_local_driver(self) -> UnstructuredLocalDriver:
             logger=self._logger,
         )
 
+    def _create_pypdfium_driver(self) -> PyPDFium2Driver:
+        return PyPDFium2Driver(logger=self._logger)  # built with the factory logger only; no config object is passed
+
     def _create_landingai_driver(self) -> LandingAIADEDriver:
         """Create a LandingAI ADE Driver instance.
 
@@ -283,6 +287,7 @@ def get_supported_drivers(self) -> List[str]:
             'llamaparse',
             'llmwhisperer',
             'unstructured_local',
+            'pypdfium',
         ]
 
         return supported_drivers

diff --git a/src/parxy_core/drivers/pypdfium2.py b/src/parxy_core/drivers/pypdfium2.py
@@ -0,0 +1,158 @@
+"""PyPDFium2 driver for parxy."""
+
+import io
+
+from datetime import datetime
+
+from parxy_core.drivers import Driver
+from parxy_core.models import Document, Page, Metadata, TocEntry, BoundingBox
+
+
+class PyPDFium2Driver(Driver):
+    """PDF parser using PyPDFium2 - Chrome's PDF engine.
+
+    PyPDFium2 wraps PDFium, the PDF rendering engine used in Chrome.
+    Fast and reliable for text extraction.
+
+    Thread-safety: PDFium is not thread-safe and in practice crashes even
+    when calls are serialized via a lock or routed through a dedicated
+    single-thread executor. Batch processing with this driver must be run
+    with a single worker; ``Parxy.batch_iter`` enforces this automatically.
+    """
+
+    supported_levels = ['page', 'block']
+
+    def _initialize_driver(self):
+        """Initialize PyPDFium2 driver by checking if the library is available."""
+        try:
+            import pypdfium2  # noqa: F401
+        except ImportError as e:
+            raise ImportError(
+                'pypdfium2 is required. Install with: pip install parxy[pypdfium2]'
+            ) from e
+        return self
+
+    def _handle(
+        self, file: str | io.BytesIO | bytes, level: str = 'page', **kwargs
+    ) -> Document:
+        """Parse PDF to Document object.
+
+        Parameters
+        ----------
+        file : str | io.BytesIO | bytes
+            Path, URL or stream of the file to parse.
+        level : str, optional
+            Desired extraction level. Default is "page". Text is always
+            extracted per page; "block" is accepted only because it is
+            the Parxy default and yields the same output.
+        **kwargs : dict
+            Additional keyword arguments.
+
+        Returns
+        -------
+        Document
+            A parsed Document in unified format.
+        """
+        import pypdfium2 as pdfium
+        import pypdfium2.raw as pdfium_c
+
+        filename, stream = self.handle_file_input(file)
+
+        with self._trace_parse(filename, stream, **kwargs) as span:
+            pdf = pdfium.PdfDocument(stream)
+            try:  # release the native PDFium handles even if parsing fails
+                pages = []
+                for page_num, page in enumerate(pdf, start=1):
+                    try:
+                        textpage = page.get_textpage()
+                        try:
+                            text = textpage.get_text_range()
+                        finally:
+                            textpage.close()
+                    finally:
+                        page.close()
+                    # Empty pages are kept to maintain page numbering
+                    pages.append(
+                        Page(
+                            number=page_num,
+                            text=(text or '').strip(),
+                            blocks=None,
+                        )
+                    )
+
+                outline = []
+                for bm in pdf.get_toc(max_depth=15):
+                    dest = bm.get_dest()
+                    target_page = None
+                    bbox = None
+                    if dest:
+                        index = dest.get_index()
+                        target_page = index + 1 if index is not None else None
+                        view_mode, view_pos = dest.get_view()
+                        # XYZ: [left, top, zoom] — destination point
+                        # FITR: [left, bottom, right, top] — destination rect
+                        # FITR is 5 in fpdf_doc.h (4 is FITV), so use named constants
+                        is_point = view_mode == pdfium_c.PDFDEST_VIEW_XYZ
+                        is_rect = view_mode == pdfium_c.PDFDEST_VIEW_FITR
+                        if is_point and len(view_pos) >= 2:
+                            bbox = BoundingBox(
+                                x0=view_pos[0],
+                                y0=view_pos[1],
+                                x1=view_pos[0],
+                                y1=view_pos[1],
+                            )
+                        elif is_rect and len(view_pos) >= 4:
+                            bbox = BoundingBox(
+                                x0=view_pos[0],
+                                y0=view_pos[1],
+                                x1=view_pos[2],
+                                y1=view_pos[3],
+                            )
+                    outline.append(
+                        TocEntry(
+                            title=bm.get_title(),
+                            page=target_page,
+                            level=bm.level,
+                            bbox=bbox,
+                        )
+                    )
+
+                span.set_attribute('output.pages', len(pages))
+
+                metadata = pdf.get_metadata_dict()
+            finally:
+                pdf.close()
+
+        return Document(
+            filename=filename,
+            pages=pages,
+            outline=outline or None,
+            metadata=Metadata(
+                title=metadata.get('Title'),
+                author=metadata.get('Author'),
+                subject=metadata.get('Subject'),
+                keywords=metadata.get('Keywords'),
+                creator=metadata.get('Creator'),
+                producer=metadata.get('Producer'),
+                created_at=_parse_pdf_date(metadata.get('CreationDate')),
+                updated_at=_parse_pdf_date(metadata.get('ModDate')),
+            ),
+        )
+
+
+def _parse_pdf_date(pdf_date: str | None) -> str | None:
+    """Convert a PDF date (``D:YYYYMMDDHHmmSSOHH'mm'``) to a naive ISO 8601 string.
+
+    Only the year is mandatory; omitted trailing fields default to month/day
+    ``01`` and time ``00``. The UTC offset is dropped; bad input yields None.
+    """
+    stamp = pdf_date.removeprefix('D:')[:14] if isinstance(pdf_date, str) else ''
+    digits = stamp[: len(stamp) - len(stamp.lstrip('0123456789'))]
+    # strptime would silently mis-split truncated fields ('1230' -> 12:03:00)
+    if len(digits) < 4 or len(digits) % 2:
+        return None
+    try:
+        padded = digits + '00000101000000'[len(digits) :]
+        return datetime.strptime(padded, '%Y%m%d%H%M%S').isoformat()
+    except ValueError:
+        return None
diff --git a/src/parxy_core/facade/parxy.py b/src/parxy_core/facade/parxy.py
@@ -214,6 +214,19 @@ def batch_iter(
         # Determine number of workers
         max_workers = workers if workers else (os.cpu_count() or 2)
 
+        # PDFium is not thread-safe and crashes under any form of concurrency,
+        # even when calls are serialized via a lock or a dedicated executor.
+        # Force serial execution when pypdfium is among the requested drivers,
+        # or any task explicitly asks for it.
+        task_driver_names = {
+            d
+            for t in tasks
+            if isinstance(t, BatchTask) and t.drivers
+            for d in t.drivers
+        }
+        if 'pypdfium' in set(default_drivers) | task_driver_names:
+            max_workers = 1
+
         # Normalize tasks into single-driver BatchTask objects.
         # When a BatchTask specifies multiple drivers it is split into
         # one BatchTask per driver so each unit of work targets exactly

diff --git a/src/parxy_core/models/__init__.py b/src/parxy_core/models/__init__.py
@@ -12,6 +12,7 @@
     TextBlock as TextBlock,
     Page as Page,
     Metadata as Metadata,
+    TocEntry as TocEntry,
     Document as Document,
     BatchTask as BatchTask,
     BatchResult as BatchResult,

diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
@@ -114,12 +114,19 @@ class Metadata(BaseModel):
     updated_at: Optional[str] = None
 
 
+class TocEntry(BaseModel):  # one bookmark of a document outline (table of contents)
+    title: str  # bookmark title
+    page: Optional[int] = None  # target page number; the pypdfium2 driver stores it 1-based
+    level: Optional[int] = None  # nesting depth as reported by the driver (presumably 0 = top level — TODO confirm)
+    bbox: Optional[BoundingBox] = None  # target position on the page, when the destination provides one
+
+
 class Document(BaseModel):
     filename: Optional[str] = None
     language: Optional[str] = None
     metadata: Optional[Metadata] = None
     pages: List[Page]
-    outline: Optional[List[str]] = None
+    outline: Optional[List[str | TocEntry]] = None
     source_data: Optional[dict[str, Any]] = None
     parsing_metadata: Optional[dict[str, Any]] = None