Skip to content

Commit 7fa30c6

Browse files
committed
Added ability to select extractor
1 parent bafe51b commit 7fa30c6

File tree

4 files changed

+259
-3
lines changed

4 files changed

+259
-3
lines changed

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,25 @@ To convert to CSV, XML or HTML simply change `c.xlsx` to be `c.csv`, `c.xml` or
4646

4747
To specify Excel (single sheet) or Excel (multiple sheets) use `c.xlsx_single` or `c.xlsx_multiple`.
4848

49+
## Extractor
50+
51+
You can specify which extraction engine to use when creating a `Client`. The available extractors are `standard` (default), `ai-1`, and `ai-2`.
52+
53+
For AI extractors (`ai-1` and `ai-2`), you can also specify an `extract` option to control what content is extracted: `tables` (default) or `tables-paragraphs`.
54+
55+
```py
56+
import pdftables_api
57+
58+
# Standard extractor (default)
59+
c = pdftables_api.Client('my-api-key')
60+
61+
# AI extractors for complex documents
62+
c = pdftables_api.Client('my-api-key', extractor='ai-1', extract='tables')
63+
c = pdftables_api.Client('my-api-key', extractor='ai-2', extract='tables-paragraphs')
64+
```
65+
66+
See the [PDFTables API documentation](https://pdftables.com/pdf-to-excel-api) for details.
67+
4968
## Test
5069

5170
Tests run with pytest: `make test`

pdftables_api/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
# limitations under the License.
1414

1515
from .pdftables_api import (
16+
EXTRACT_TABLES,
17+
EXTRACT_TABLES_PARAGRAPHS,
18+
EXTRACTOR_AI_1,
19+
EXTRACTOR_AI_2,
20+
EXTRACTOR_STANDARD,
1621
FORMAT_CSV,
1722
FORMAT_XLSX,
1823
FORMAT_XLSX_MULTIPLE,
@@ -30,4 +35,9 @@
3035
"FORMAT_XML",
3136
"APIException",
3237
"Client",
38+
"EXTRACTOR_STANDARD",
39+
"EXTRACTOR_AI_1",
40+
"EXTRACTOR_AI_2",
41+
"EXTRACT_TABLES",
42+
"EXTRACT_TABLES_PARAGRAPHS",
3343
]

pdftables_api/pdftables_api.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,71 @@
4242
}
4343
_STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML}
4444

45+
# Names of the extraction engines a Client can be configured with.
EXTRACTOR_STANDARD = "standard"
EXTRACTOR_AI_1 = "ai-1"
EXTRACTOR_AI_2 = "ai-2"

# Values accepted for the ``extract`` option.
EXTRACT_TABLES = "tables"
EXTRACT_TABLES_PARAGRAPHS = "tables-paragraphs"

# Maps every extractor to the ``extract`` values it accepts. An empty tuple
# means the extractor takes no ``extract`` option at all. Tuples rather than
# sets are used so that error messages list the values in a stable order.
_VALID_EXTRACTOR_VALUES = {
    EXTRACTOR_STANDARD: (),
    EXTRACTOR_AI_1: (EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS),
    EXTRACTOR_AI_2: (EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS),
}

# Every known extractor name, in the order declared above.
_VALID_EXTRACTORS = tuple(_VALID_EXTRACTOR_VALUES)
67+
4568

4669
class Client:
47-
def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT):
70+
def __init__(
    self,
    api_key,
    api_url=_API_URL,
    timeout=_DEFAULT_TIMEOUT,
    extractor=EXTRACTOR_STANDARD,
    extract=None,
):
    """Store the connection settings and the extractor configuration.

    Args:
        api_key: Stored as ``self.api_key``.
        api_url: Stored as ``self.api_url``; defaults to ``_API_URL``.
        timeout: Stored as ``self.timeout``; defaults to
            ``_DEFAULT_TIMEOUT``.
        extractor: Extraction engine name; defaults to
            ``EXTRACTOR_STANDARD``.
        extract: Optional extract value for the chosen extractor, or
            ``None`` to leave it unspecified.

    Raises:
        ValueError: If ``_validate_extractor`` rejects the
            ``extractor``/``extract`` combination.
    """
    self.api_key = api_key
    self.api_url = api_url
    self.timeout = timeout

    # Check the combination first so that only an accepted pair is stored.
    self._validate_extractor(extractor, extract)
    self.extractor = extractor
    self.extract = extract
86+
87+
@staticmethod
def _validate_extractor(extractor, extract):
    """Ensure ``extractor`` is known and ``extract`` is permitted for it.

    Args:
        extractor: Extractor name; must be one of ``_VALID_EXTRACTORS``.
        extract: Extract value to check, or ``None`` when unspecified
            (always accepted).

    Raises:
        ValueError: If ``extractor`` is unknown, if ``extract`` is given
            for an extractor that accepts no extract values, or if
            ``extract`` is not among the values the extractor accepts.
    """
    if extractor not in _VALID_EXTRACTORS:
        known_extractors = ", ".join(_VALID_EXTRACTORS)
        raise ValueError(
            f'Invalid extractor "{extractor}". Valid options are: {known_extractors}'
        )

    allowed = _VALID_EXTRACTOR_VALUES[extractor]

    # Nothing further to check when the value is absent or acceptable.
    if extract is None or extract in allowed:
        return

    # An empty tuple of allowed values means the option itself is unsupported.
    if not allowed:
        raise ValueError(
            f'Extractor "{extractor}" does not support extract parameter'
        )

    allowed_str = ", ".join(map(str, allowed))
    raise ValueError(
        f'Invalid extract value "{extract}" for extractor "{extractor}". Valid values are: {allowed_str}'
    )
109+
52110
def xlsx(self, pdf_path, xlsx_path=None):
53111
"""
54112
Convenience method to convert PDF to XLSX multiple sheets.
@@ -147,7 +205,17 @@ def request(self, pdf_fo, out_format=None, query_params=None, **requests_params)
147205
url = self.api_url
148206
files = {"f": ("file.pdf", pdf_fo)}
149207
params = query_params if query_params else {}
150-
params.update({"key": self.api_key, "format": out_format})
208+
params.update(
209+
{
210+
"key": self.api_key,
211+
"format": out_format,
212+
"extractor": self.extractor,
213+
}
214+
)
215+
216+
# Add extract value if specified
217+
if self.extract:
218+
params.update({"extract": self.extract})
151219

152220
response = requests.post(
153221
url, files=files, stream=True, params=params, **requests_params

test/test_pdftables_api.py

Lines changed: 160 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,15 @@
2020
import pytest
2121
import requests_mock
2222

23-
from pdftables_api import APIException, Client
23+
from pdftables_api import (
24+
EXTRACT_TABLES,
25+
EXTRACT_TABLES_PARAGRAPHS,
26+
EXTRACTOR_AI_1,
27+
EXTRACTOR_AI_2,
28+
EXTRACTOR_STANDARD,
29+
APIException,
30+
Client,
31+
)
2432

2533

2634
class TestEnsureExtFormat(TestCase):
@@ -181,6 +189,157 @@ def test_response_unknown_file_format(self):
181189
c.dump(png_fo)
182190

183191

192+
class TestExtractorParameters(TestCase):
    """Check how ``Client`` validates and transmits extractor settings.

    The mocked URLs only prove that the listed query parameters were sent:
    requests-mock treats a registered query string as a subset match, so a
    request carrying additional parameters would still be accepted. Each
    conversion test therefore also inspects the query string of the request
    that was actually made.
    """

    def _convert_and_get_query(
        self, client, mocked_url, response_text, **convert_kwargs
    ):
        """Convert a scratch PDF against a mocked endpoint.

        Args:
            client: The ``Client`` under test.
            mocked_url: URL (including query string) registered with the mock.
            response_text: Body the mocked endpoint responds with.
            **convert_kwargs: Extra keyword arguments for ``client.convert``.

        Returns:
            The parsed query string of the last request the mock received,
            mapping each parameter name to a list of values.
        """
        with requests_mock.mock() as m:
            m.post(mocked_url, text=response_text)

            with NamedTemporaryFile(suffix="test.pdf") as tf:
                tf.write(b"Hello world")
                tf.file.close()
                client.convert(tf.name, **convert_kwargs)

            return m.last_request.qs

    def test_default_extractor(self):
        """Test that default extractor is 'standard' with no extract parameter."""
        qs = self._convert_and_get_query(
            Client("fake_key"),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=standard",
            "xlsx output",
        )
        self.assertNotIn("extract", qs)

    def test_ai1_extractor_with_no_extract(self):
        """Test ai-1 extractor with no extract parameter."""
        qs = self._convert_and_get_query(
            Client("fake_key", extractor=EXTRACTOR_AI_1),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1",
            "xlsx output",
        )
        self.assertNotIn("extract", qs)

    def test_ai1_extractor_with_tables(self):
        """Test ai-1 extractor with 'tables' extract parameter."""
        qs = self._convert_and_get_query(
            Client("fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1&extract=tables",
            "xlsx output",
        )
        self.assertEqual(qs["extract"], ["tables"])

    def test_ai1_extractor_with_tables_paragraphs(self):
        """Test ai-1 extractor with 'tables-paragraphs' extract parameter."""
        qs = self._convert_and_get_query(
            Client(
                "fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES_PARAGRAPHS
            ),
            "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-1&extract=tables-paragraphs",
            "csv output",
            out_format="csv",
        )
        self.assertEqual(qs["extract"], ["tables-paragraphs"])

    def test_ai2_extractor_with_no_extract(self):
        """Test ai-2 extractor with no extract parameter."""
        qs = self._convert_and_get_query(
            Client("fake_key", extractor=EXTRACTOR_AI_2),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2",
            "xlsx output",
        )
        self.assertNotIn("extract", qs)

    def test_ai2_extractor_with_tables(self):
        """Test ai-2 extractor with 'tables' extract parameter."""
        qs = self._convert_and_get_query(
            Client("fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2&extract=tables",
            "xlsx output",
        )
        self.assertEqual(qs["extract"], ["tables"])

    def test_ai2_extractor_with_tables_paragraphs(self):
        """Test ai-2 extractor with 'tables-paragraphs' extract parameter."""
        qs = self._convert_and_get_query(
            Client(
                "fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS
            ),
            "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-2&extract=tables-paragraphs",
            "csv output",
            out_format="csv",
        )
        self.assertEqual(qs["extract"], ["tables-paragraphs"])

    def test_standard_extractor_no_extract_param_in_url(self):
        """Test that standard extractor doesn't include extract parameter in URL."""
        qs = self._convert_and_get_query(
            Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=None),
            "https://pdftables.com/api?key=fake_key&format=csv&extractor=standard",
            "csv output",
            out_format="csv",
        )
        # The URL match alone would tolerate an extra parameter, so check
        # the sent query string explicitly.
        self.assertNotIn("extract", qs)

    def test_invalid_extractor_raises_error(self):
        """Test that invalid extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extractor "invalid"\. Valid options are: standard, ai-1, ai-2$',
        ):
            Client("fake_key", extractor="invalid")

    def test_invalid_extract_for_standard_raises_error(self):
        """Test that providing extract parameter for standard extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match='^Extractor "standard" does not support extract parameter$',
        ):
            Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=EXTRACT_TABLES)

    def test_invalid_extract_for_ai_raises_error(self):
        """Test that invalid extract value for AI extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extract value "invalid" for extractor "ai-1"\. Valid values are: tables, tables-paragraphs$',
        ):
            Client("fake_key", extractor=EXTRACTOR_AI_1, extract="invalid")

    def test_invalid_extract_for_ai2_raises_error(self):
        """Test that invalid extract value for AI-2 extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extract value "invalid" for extractor "ai-2"\. Valid values are: tables, tables-paragraphs$',
        ):
            Client("fake_key", extractor=EXTRACTOR_AI_2, extract="invalid")
341+
342+
184343
def consume(s):
185344
r = b""
186345
for chunk in s:

0 commit comments

Comments
 (0)