Added ability to select extractor

pvhynes · pvhynes · commit 7fa30c65fa01 · 2026-01-13T17:13:59.000Z
diff --git a/README.md b/README.md
@@ -46,6 +46,25 @@ To convert to CSV, XML or HTML simply change `c.xlsx` to be `c.csv`, `c.xml` or
 
 To specify Excel (single sheet) or Excel (multiple sheets) use `c.xlsx_single` or `c.xlsx_multiple`.
 
+## Extractor
+
+You can specify which extraction engine to use when creating a `Client`. The available extractors are `standard` (default), `ai-1`, and `ai-2`.
+
+For the AI extractors (`ai-1` and `ai-2`), you can also pass an `extract` option to control what content is extracted: `tables` (default) or `tables-paragraphs`. The `standard` extractor does not accept `extract`; passing it raises a `ValueError`.
+
+```py
+import pdftables_api
+
+# Standard extractor (default)
+c = pdftables_api.Client('my-api-key')
+
+# AI extractors for complex documents
+c = pdftables_api.Client('my-api-key', extractor='ai-1', extract='tables')
+c = pdftables_api.Client('my-api-key', extractor='ai-2', extract='tables-paragraphs')
+```
+
+See the [PDFTables API documentation](https://pdftables.com/pdf-to-excel-api) for details.
+
 ## Test
 
 Tests run with pytest: `make test`
diff --git a/pdftables_api/__init__.py b/pdftables_api/__init__.py
@@ -13,6 +13,11 @@
 # limitations under the License.
 
 from .pdftables_api import (
+    EXTRACT_TABLES,
+    EXTRACT_TABLES_PARAGRAPHS,
+    EXTRACTOR_AI_1,
+    EXTRACTOR_AI_2,
+    EXTRACTOR_STANDARD,
     FORMAT_CSV,
     FORMAT_XLSX,
     FORMAT_XLSX_MULTIPLE,
@@ -30,4 +35,9 @@
     "FORMAT_XML",
     "APIException",
     "Client",
+    "EXTRACTOR_STANDARD",
+    "EXTRACTOR_AI_1",
+    "EXTRACTOR_AI_2",
+    "EXTRACT_TABLES",
+    "EXTRACT_TABLES_PARAGRAPHS",
 ]
diff --git a/pdftables_api/pdftables_api.py b/pdftables_api/pdftables_api.py
@@ -42,13 +42,71 @@
 }
 _STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML}
 
+# Extraction engines accepted by the `extractor` argument of Client.
+EXTRACTOR_STANDARD = "standard"
+EXTRACTOR_AI_1 = "ai-1"
+EXTRACTOR_AI_2 = "ai-2"
+
+# Content selections accepted by the `extract` argument of Client.
+EXTRACT_TABLES = "tables"
+EXTRACT_TABLES_PARAGRAPHS = "tables-paragraphs"
+
+# Both AI extractors accept the same `extract` values. A tuple (rather than
+# a set) keeps their order stable when they are listed in error messages.
+_AI_EXTRACT_VALUES = (EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS)
+
+# Maps each extractor to the `extract` values it accepts.
+_VALID_EXTRACTOR_VALUES = {
+    EXTRACTOR_STANDARD: (),  # the standard extractor takes no `extract` value
+    EXTRACTOR_AI_1: _AI_EXTRACT_VALUES,
+    EXTRACTOR_AI_2: _AI_EXTRACT_VALUES,
+}
+
+# All extractor names, in the order they are declared above.
+_VALID_EXTRACTORS = tuple(_VALID_EXTRACTOR_VALUES)
+
 
 class Client:
-    def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT):
+    def __init__(
+        self,
+        api_key,
+        api_url=_API_URL,
+        timeout=_DEFAULT_TIMEOUT,
+        extractor=EXTRACTOR_STANDARD,  # one of the EXTRACTOR_* constants
+        extract=None,  # an EXTRACT_* constant (AI extractors only), or None
+    ):
         self.api_key = api_key
         self.api_url = api_url
         self.timeout = timeout
 
+        # Validate first so only a supported extractor/extract pair is stored
+        self._validate_extractor(extractor, extract)
+        self.extractor = extractor
+        self.extract = extract
+
+    @staticmethod
+    def _validate_extractor(extractor, extract):
+        """Raise ValueError for an unknown extractor or an unsupported extract value."""
+        if extractor not in _VALID_EXTRACTORS:
+            known = ", ".join(_VALID_EXTRACTORS)
+            raise ValueError(
+                f'Invalid extractor "{extractor}". Valid options are: {known}'
+            )
+
+        allowed = _VALID_EXTRACTOR_VALUES[extractor]
+        # A missing (None) extract value is acceptable for every extractor.
+        if extract is None or extract in allowed:
+            return
+
+        if not allowed:
+            raise ValueError(
+                f'Extractor "{extractor}" does not support extract parameter'
+            )
+        listed = ", ".join(str(value) for value in allowed)
+        raise ValueError(
+            f'Invalid extract value "{extract}" for extractor "{extractor}". Valid values are: {listed}'
+        )
+
     def xlsx(self, pdf_path, xlsx_path=None):
         """
         Convenience method to convert PDF to XLSX multiple sheets.
@@ -147,7 +205,17 @@ def request(self, pdf_fo, out_format=None, query_params=None, **requests_params)
         url = self.api_url
         files = {"f": ("file.pdf", pdf_fo)}
         params = query_params if query_params else {}
-        params.update({"key": self.api_key, "format": out_format})
+        params.update(
+            {
+                "key": self.api_key,
+                "format": out_format,
+                "extractor": self.extractor,
+            }
+        )
+
+        # Add extract value if specified
+        if self.extract:
+            params.update({"extract": self.extract})
 
         response = requests.post(
             url, files=files, stream=True, params=params, **requests_params
diff --git a/test/test_pdftables_api.py b/test/test_pdftables_api.py
@@ -20,7 +20,15 @@
 import pytest
 import requests_mock
 
-from pdftables_api import APIException, Client
+from pdftables_api import (
+    EXTRACT_TABLES,
+    EXTRACT_TABLES_PARAGRAPHS,
+    EXTRACTOR_AI_1,
+    EXTRACTOR_AI_2,
+    EXTRACTOR_STANDARD,
+    APIException,
+    Client,
+)
 
 
 class TestEnsureExtFormat(TestCase):
@@ -181,6 +189,157 @@ def test_response_unknown_file_format(self):
                 c.dump(png_fo)
 
 
+class TestExtractorParameters(TestCase):
+    def test_default_extractor(self):
+        """A default Client sends extractor=standard and no extract parameter."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=standard",
+                text="xlsx output",
+                complete_qs=True,
+            )
+            c = Client("fake_key")
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name)
+
+    def test_ai1_extractor_with_no_extract(self):
+        """ai-1 without an extract value sends no extract parameter."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1",
+                text="xlsx output",
+                complete_qs=True,
+            )
+            c = Client("fake_key", extractor=EXTRACTOR_AI_1)
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name)
+
+    def test_ai1_extractor_with_tables(self):
+        """ai-1 with extract='tables' sends extract=tables."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1&extract=tables",
+                text="xlsx output",
+                complete_qs=True,
+            )
+            c = Client("fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES)
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name)
+
+    def test_ai1_extractor_with_tables_paragraphs(self):
+        """ai-1 with extract='tables-paragraphs' sends extract=tables-paragraphs."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-1&extract=tables-paragraphs",
+                text="csv output",
+                complete_qs=True,
+            )
+            c = Client(
+                "fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES_PARAGRAPHS
+            )
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name, out_format="csv")
+
+    def test_ai2_extractor_with_no_extract(self):
+        """ai-2 without an extract value sends no extract parameter."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2",
+                text="xlsx output",
+                complete_qs=True,
+            )
+            c = Client("fake_key", extractor=EXTRACTOR_AI_2)
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name)
+
+    def test_ai2_extractor_with_tables(self):
+        """ai-2 with extract='tables' sends extract=tables."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2&extract=tables",
+                text="xlsx output",
+                complete_qs=True,
+            )
+            c = Client("fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES)
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name)
+
+    def test_ai2_extractor_with_tables_paragraphs(self):
+        """ai-2 with extract='tables-paragraphs' sends extract=tables-paragraphs."""
+        with requests_mock.mock() as m:
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-2&extract=tables-paragraphs",
+                text="csv output",
+                complete_qs=True,
+            )
+            c = Client(
+                "fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS
+            )
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name, out_format="csv")
+
+    def test_standard_extractor_no_extract_param_in_url(self):
+        """Explicit standard extractor with extract=None sends no extract parameter."""
+        with requests_mock.mock() as m:
+            # complete_qs=True makes any unexpected `extract` parameter fail the match
+            m.post(
+                "https://pdftables.com/api?key=fake_key&format=csv&extractor=standard",
+                text="csv output",
+                complete_qs=True,
+            )
+            c = Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=None)
+            with NamedTemporaryFile(suffix="test.pdf") as tf:
+                tf.write(b"Hello world")
+                tf.file.close()
+                c.convert(tf.name, out_format="csv")
+
+    def test_invalid_extractor_raises_error(self):
+        """An unknown extractor name raises ValueError listing the valid names."""
+        with pytest.raises(
+            ValueError,
+            match=r'^Invalid extractor "invalid"\. Valid options are: standard, ai-1, ai-2$',
+        ):
+            Client("fake_key", extractor="invalid")
+
+    def test_invalid_extract_for_standard_raises_error(self):
+        """Any extract value combined with the standard extractor raises ValueError."""
+        with pytest.raises(
+            ValueError,
+            match='^Extractor "standard" does not support extract parameter$',
+        ):
+            Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=EXTRACT_TABLES)
+
+    def test_invalid_extract_for_ai_raises_error(self):
+        """An unknown extract value for ai-1 raises ValueError listing valid values."""
+        with pytest.raises(
+            ValueError,
+            match=r'^Invalid extract value "invalid" for extractor "ai-1"\. Valid values are: tables, tables-paragraphs$',
+        ):
+            Client("fake_key", extractor=EXTRACTOR_AI_1, extract="invalid")
+
+    def test_invalid_extract_for_ai2_raises_error(self):
+        """An unknown extract value for ai-2 raises ValueError listing valid values."""
+        with pytest.raises(
+            ValueError,
+            match=r'^Invalid extract value "invalid" for extractor "ai-2"\. Valid values are: tables, tables-paragraphs$',
+        ):
+            Client("fake_key", extractor=EXTRACTOR_AI_2, extract="invalid")
+
+
 def consume(s):
     r = b""
     for chunk in s: