Skip to content

Commit 1ec7f13

Browse files
committed
Added ability to select extractor
1 parent bafe51b commit 1ec7f13

File tree

4 files changed

+231
-3
lines changed

4 files changed

+231
-3
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,25 @@ To convert to CSV, XML or HTML simply change `c.xlsx` to be `c.csv`, `c.xml` or
4646

4747
To specify Excel (single sheet) or Excel (multiple sheets) use `c.xlsx_single` or `c.xlsx_multiple`.
4848

49+
## Extractor
50+
51+
You can specify which extraction engine to use when creating a Client. The available extractors are `standard` (default), `ai-1`, and `ai-2`.
52+
53+
For AI extractors (`ai-1` and `ai-2`), you can also specify an `extract` option to control what content is extracted: `tables` (default) or `tables-paragraphs`. The `standard` extractor does not accept an `extract` option; passing one raises a `ValueError`.
54+
55+
```py
56+
import pdftables_api
57+
58+
# Standard extractor (default)
59+
c = pdftables_api.Client('my-api-key')
60+
61+
# AI extractors for complex documents
62+
c = pdftables_api.Client('my-api-key', extractor='ai-1', extract='tables')
63+
c = pdftables_api.Client('my-api-key', extractor='ai-2', extract='tables-paragraphs')
64+
```
65+
66+
See [PDFTables API documentation](https://pdftables.com/pdf-to-excel-api) for details.
67+
4968
## Test
5069

5170
Tests run with pytest: `make test`

pdftables_api/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@
2020
FORMAT_XML,
2121
APIException,
2222
Client,
23+
EXTRACTOR_STANDARD,
24+
EXTRACTOR_AI_1,
25+
EXTRACTOR_AI_2,
26+
EXTRACT_TABLES,
27+
EXTRACT_TABLES_PARAGRAPHS,
2328
)
2429

2530
__all__ = [
@@ -30,4 +35,9 @@
3035
"FORMAT_XML",
3136
"APIException",
3237
"Client",
38+
"EXTRACTOR_STANDARD",
39+
"EXTRACTOR_AI_1",
40+
"EXTRACTOR_AI_2",
41+
"EXTRACT_TABLES",
42+
"EXTRACT_TABLES_PARAGRAPHS",
3343
]

pdftables_api/pdftables_api.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,71 @@
4242
}
4343
_STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML}
4444

45+
# Extraction engines that can be selected when creating a Client.
EXTRACTOR_STANDARD = "standard"
EXTRACTOR_AI_1 = "ai-1"
EXTRACTOR_AI_2 = "ai-2"

# Content selections that can be passed as the `extract` option.
EXTRACT_TABLES = "tables"
EXTRACT_TABLES_PARAGRAPHS = "tables-paragraphs"

# Valid extract options for each extractor type.
# Every value is a tuple: the order is stable for error messages, and a
# membership test compares with `==` instead of hashing the candidate, so an
# unhashable value (e.g. a list) is simply "not in" rather than a TypeError.
_VALID_EXTRACTOR_VALUES = {
    EXTRACTOR_STANDARD: (),  # Standard extractor has no options
    EXTRACTOR_AI_1: (
        EXTRACT_TABLES,
        EXTRACT_TABLES_PARAGRAPHS,
    ),
    EXTRACTOR_AI_2: (
        EXTRACT_TABLES,
        EXTRACT_TABLES_PARAGRAPHS,
    ),
}

# Valid extractor types, in declaration order (used verbatim in error messages).
_VALID_EXTRACTORS = tuple(_VALID_EXTRACTOR_VALUES)
67+
4568

4669
class Client:
47-
def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT):
70+
def __init__(
    self,
    api_key,
    api_url=_API_URL,
    timeout=_DEFAULT_TIMEOUT,
    extractor=EXTRACTOR_STANDARD,
    extract=None,
):
    """
    Create a client bound to an API key and an extractor configuration.

    `extractor` selects the extraction engine and `extract` optionally
    selects what content is extracted. The pair is checked by
    `_validate_extractor`, which raises ValueError for an unsupported
    combination.
    """
    self.api_key, self.api_url, self.timeout = api_key, api_url, timeout

    # Fail at construction time rather than on the first request.
    self._validate_extractor(extractor, extract)
    self.extractor, self.extract = extractor, extract
86+
87+
@staticmethod
def _validate_extractor(extractor, extract):
    """
    Validate extractor and extract parameters.

    Args:
        extractor: Name of the extraction engine; must be one of
            `_VALID_EXTRACTORS`.
        extract: Optional extract value. `None` means "not specified" and
            is always accepted; any other value must be listed for the
            chosen extractor in `_VALID_EXTRACTOR_VALUES`.

    Raises:
        ValueError: If the extractor is unknown, if an extract value is
            given for an extractor that takes none, or if the extract
            value is not valid for the extractor.
    """
    if extractor not in _VALID_EXTRACTORS:
        valid_extractors = ", ".join(_VALID_EXTRACTORS)
        raise ValueError(
            f'Invalid extractor "{extractor}". Valid options are: {valid_extractors}'
        )

    if extract is None:
        return

    valid_extract_values = _VALID_EXTRACTOR_VALUES[extractor]

    # Decide the "takes no options" case before any membership test, so the
    # result never depends on whether `extract` is hashable or on which
    # container type holds the (empty) set of options.
    if not valid_extract_values:
        raise ValueError(
            f'Extractor "{extractor}" does not support extract parameter'
        )

    if extract not in valid_extract_values:
        valid_extract_values_str = ", ".join(
            str(opt) for opt in valid_extract_values
        )
        raise ValueError(
            f'Invalid extract value "{extract}" for extractor "{extractor}". Valid values are: {valid_extract_values_str}'
        )
109+
52110
def xlsx(self, pdf_path, xlsx_path=None):
53111
"""
54112
Convenience method to convert PDF to XLSX multiple sheets.
@@ -147,7 +205,17 @@ def request(self, pdf_fo, out_format=None, query_params=None, **requests_params)
147205
url = self.api_url
148206
files = {"f": ("file.pdf", pdf_fo)}
149207
params = query_params if query_params else {}
150-
params.update({"key": self.api_key, "format": out_format})
208+
params.update(
209+
{
210+
"key": self.api_key,
211+
"format": out_format,
212+
"extractor": self.extractor,
213+
}
214+
)
215+
216+
# Add extract values if specified
217+
if self.extract:
218+
params.update({"extract": self.extract})
151219

152220
response = requests.post(
153221
url, files=files, stream=True, params=params, **requests_params

test/test_pdftables_api.py

Lines changed: 132 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,15 @@
2020
import pytest
2121
import requests_mock
2222

23-
from pdftables_api import APIException, Client
23+
from pdftables_api import (
24+
Client,
25+
APIException,
26+
EXTRACTOR_STANDARD,
27+
EXTRACTOR_AI_1,
28+
EXTRACTOR_AI_2,
29+
EXTRACT_TABLES,
30+
EXTRACT_TABLES_PARAGRAPHS,
31+
)
2432

2533

2634
class TestEnsureExtFormat(TestCase):
@@ -181,6 +189,129 @@ def test_response_unknown_file_format(self):
181189
c.dump(png_fo)
182190

183191

192+
class TestExtractorParameters(TestCase):
    """Tests for the `extractor` / `extract` options of Client."""

    @staticmethod
    def _convert_dummy_pdf(client, **convert_kwargs):
        """Run `client.convert` on a throwaway PDF-named temporary file."""
        with NamedTemporaryFile(suffix="test.pdf") as tf:
            tf.write(b"Hello world")
            tf.file.close()
            client.convert(tf.name, **convert_kwargs)

    def test_default_extractor(self):
        """Test that default extractor is 'standard' with no extract parameter."""
        with requests_mock.mock() as m:
            m.post(
                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=standard",
                text="xlsx output",
            )

            self._convert_dummy_pdf(Client("fake_key"))

            # The mocked URL only requires its own query parameters to be
            # present, so the absence of 'extract' is checked explicitly.
            assert "extract" not in m.last_request.qs

    def test_ai_extractor_with_tables(self):
        """Test AI extractor with 'tables' extract parameter."""
        with requests_mock.mock() as m:
            m.post(
                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1&extract=tables",
                text="xlsx output",
            )

            c = Client("fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES)
            self._convert_dummy_pdf(c)

    def test_ai_extractor_with_tables_paragraphs(self):
        """Test AI extractor with 'tables-paragraphs' extract parameter."""
        with requests_mock.mock() as m:
            m.post(
                "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-1&extract=tables-paragraphs",
                text="csv output",
            )

            c = Client(
                "fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES_PARAGRAPHS
            )
            self._convert_dummy_pdf(c, out_format="csv")

    def test_ai2_extractor_with_tables(self):
        """Test AI-2 extractor with 'tables' extract parameter."""
        with requests_mock.mock() as m:
            m.post(
                "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2&extract=tables",
                text="xlsx output",
            )

            c = Client("fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES)
            self._convert_dummy_pdf(c)

    def test_ai2_extractor_with_tables_paragraphs(self):
        """Test AI-2 extractor with 'tables-paragraphs' extract parameter."""
        with requests_mock.mock() as m:
            m.post(
                "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-2&extract=tables-paragraphs",
                text="csv output",
            )

            c = Client(
                "fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS
            )
            self._convert_dummy_pdf(c, out_format="csv")

    def test_standard_extractor_no_extract_param_in_url(self):
        """Test that standard extractor doesn't include extract parameter in URL."""
        with requests_mock.mock() as m:
            # Note: no 'extract' parameter in the URL for standard extractor
            m.post(
                "https://pdftables.com/api?key=fake_key&format=csv&extractor=standard",
                text="csv output",
            )

            c = Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=None)
            self._convert_dummy_pdf(c, out_format="csv")

            # Extra query parameters on the request would still match the
            # mocked URL, so inspect the recorded request directly.
            assert "extract" not in m.last_request.qs

    def test_invalid_extractor_raises_error(self):
        """Test that invalid extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extractor "invalid"\. Valid options are: standard, ai-1, ai-2$',
        ):
            Client("fake_key", extractor="invalid")

    def test_invalid_extract_for_standard_raises_error(self):
        """Test that providing extract parameter for standard extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Extractor "standard" does not support extract parameter$',
        ):
            Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=EXTRACT_TABLES)

    def test_invalid_extract_for_ai_raises_error(self):
        """Test that invalid extract value for AI extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extract value "invalid" for extractor "ai-1"\. Valid values are: tables, tables-paragraphs$',
        ):
            Client("fake_key", extractor=EXTRACTOR_AI_1, extract="invalid")

    def test_invalid_extract_for_ai2_raises_error(self):
        """Test that invalid extract value for AI-2 extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extract value "invalid" for extractor "ai-2"\. Valid values are: tables, tables-paragraphs$',
        ):
            Client("fake_key", extractor=EXTRACTOR_AI_2, extract="invalid")
313+
314+
184315
def consume(s):
185316
r = b""
186317
for chunk in s:

0 commit comments

Comments
 (0)