# Source: pdftables/python-pdftables-api, pdftables_api/pdftables_api.py at e36bb206ba79f5b672b13261f642d4e42329b979
# Copyright 2026 Cantabular Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from shutil import copyfileobj

import requests

# Output format identifiers (sent as the `format` query parameter).
FORMAT_CSV = "csv"
FORMAT_HTML = "html"
FORMAT_XLSX_MULTIPLE = "xlsx-multiple"
FORMAT_XLSX_SINGLE = "xlsx-single"
# Plain XLSX is an alias for the multiple-sheet variant.
FORMAT_XLSX = FORMAT_XLSX_MULTIPLE
FORMAT_XML = "xml"

_API_URL = "https://pdftables.com/api"
# Seconds, as a (connect, read) pair.
_DEFAULT_TIMEOUT = (10, 300)

# Output format -> file extension appended to output paths.
_FORMATS_EXT = {
    FORMAT_CSV: ".csv",
    FORMAT_HTML: ".html",
    FORMAT_XLSX: ".xlsx",
    FORMAT_XLSX_MULTIPLE: ".xlsx",
    FORMAT_XLSX_SINGLE: ".xlsx",
    FORMAT_XML: ".xml",
}

# File extension -> output format guessed when no format is given.
_EXT_FORMATS = {
    ".csv": FORMAT_CSV,
    ".html": FORMAT_HTML,
    ".xlsx": FORMAT_XLSX,
    ".xml": FORMAT_XML,
}

# Formats whose converted output is returned as text rather than bytes.
_STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML}

# Extractor identifiers.
EXTRACTOR_STANDARD = "standard"
EXTRACTOR_AI_1 = "ai-1"
EXTRACTOR_AI_2 = "ai-2"

# Values accepted for the `extract` option.
EXTRACT_TABLES = "tables"
EXTRACT_TABLES_PARAGRAPHS = "tables-paragraphs"

# Both AI extractors accept the same `extract` values. A tuple keeps the
# order stable in error messages.
_AI_EXTRACT_VALUES = (EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS)

# Valid `extract` values for each extractor; the standard extractor has none.
_VALID_EXTRACTOR_VALUES = {
    EXTRACTOR_STANDARD: (),
    EXTRACTOR_AI_1: _AI_EXTRACT_VALUES,
    EXTRACTOR_AI_2: _AI_EXTRACT_VALUES,
}

# Valid extractor identifiers, in declaration order.
_VALID_EXTRACTORS = tuple(_VALID_EXTRACTOR_VALUES)


class Client:
    """
    Client for the PDFTables conversion API.

    Stores the API key, endpoint URL, default request timeout and extractor
    settings, and offers helpers that upload a PDF and return or save the
    converted output.
    """

    def __init__(
        self,
        api_key,
        api_url=_API_URL,
        timeout=_DEFAULT_TIMEOUT,
        extractor=EXTRACTOR_STANDARD,
        extract=None,
    ):
        """
        Create a client.

        `api_key` is sent as the `key` query parameter of every request.
        `api_url` is the conversion endpoint; "/remaining" is appended to it
        for quota queries. `timeout` is handed to `requests` whenever the
        caller does not pass a `timeout` of their own. `extractor` and
        `extract` are sent with every conversion request.

        Raises ValueError if `extractor` is not a known extractor or
        `extract` is not valid for it.
        """
        self.api_key = api_key
        self.api_url = api_url
        self.timeout = timeout

        # Validate and set extractor configuration
        self._validate_extractor(extractor, extract)
        self.extractor = extractor
        self.extract = extract

    @staticmethod
    def _validate_extractor(extractor, extract):
        """
        Validate extractor and extract parameters.

        Raises ValueError if `extractor` is unknown, or if `extract` is not
        None and is not one of the values allowed for `extractor`.
        """
        if extractor not in _VALID_EXTRACTORS:
            valid_extractors = ", ".join(_VALID_EXTRACTORS)
            raise ValueError(
                f'Invalid extractor "{extractor}". Valid options are: {valid_extractors}'
            )

        valid_extract_values = _VALID_EXTRACTOR_VALUES[extractor]
        if extract is None or extract in valid_extract_values:
            return

        # An extractor with no allowed values does not take `extract` at all.
        if not valid_extract_values:
            raise ValueError(
                f'Extractor "{extractor}" does not support extract parameter'
            )

        valid_extract_values_str = ", ".join(str(opt) for opt in valid_extract_values)
        raise ValueError(
            f'Invalid extract value "{extract}" for extractor "{extractor}". Valid values are: {valid_extract_values_str}'
        )

    def xlsx(self, pdf_path, xlsx_path=None):
        """
        Convenience method to convert PDF to XLSX multiple sheets.

        If xlsx_path is None, returns the output as a byte string.
        """
        return self.xlsx_multiple(pdf_path, xlsx_path)

    def xlsx_single(self, pdf_path, xlsx_path=None):
        """
        Convenience method to convert PDF to XLSX single sheet.

        If xlsx_path is None, returns the output as a byte string.
        """
        return self.convert(pdf_path, xlsx_path, out_format=FORMAT_XLSX_SINGLE)

    def xlsx_multiple(self, pdf_path, xlsx_path=None):
        """
        Convenience method to convert PDF to XLSX multiple sheets.

        If xlsx_path is None, returns the output as a byte string.
        """
        return self.convert(pdf_path, xlsx_path, out_format=FORMAT_XLSX_MULTIPLE)

    def xml(self, pdf_path, xml_path=None):
        """
        Convenience method to convert PDF to XML.

        If xml_path is None, returns the output as a string.
        """
        return self.convert(pdf_path, xml_path, out_format=FORMAT_XML)

    def csv(self, pdf_path, csv_path=None):
        """
        Convenience method to convert PDF to CSV.

        If csv_path is None, returns the output as a string.
        """
        return self.convert(pdf_path, csv_path, out_format=FORMAT_CSV)

    def html(self, pdf_path, html_path=None):
        """
        Convenience method to convert PDF to HTML.

        If html_path is None, returns the output as a string.
        """
        return self.convert(pdf_path, html_path, out_format=FORMAT_HTML)

    def convert(
        self,
        pdf_path,
        out_path=None,
        out_format=None,
        query_params=None,
        **requests_params,
    ):
        """
        Convert PDF given by `pdf_path` into `out_format` at `out_path`.

        If `out_path` is None, returns a string containing the contents, or
        bytes for binary output types (e.g. XLSX). Otherwise the converted
        output is written to `out_path` (with the extension matching
        `out_format` appended when needed) and None is returned.

        `query_params` and `requests_params` are forwarded to `request`.
        """
        (out_path, out_format) = Client.ensure_format_ext(out_path, out_format)

        with open(pdf_path, "rb") as pdf_fo:
            response = self.request(pdf_fo, out_format, query_params, **requests_params)

        # The response is streamed; close it on every path so the connection
        # is released even if opening or writing the output file fails.
        try:
            if out_path is None:
                if out_format in _STRING_FORMATS:
                    return response.text
                return response.content

            with open(out_path, "wb") as out_fo:
                converted_fo = response.raw
                # Ensure that gzip content is decoded.
                converted_fo.decode_content = True
                copyfileobj(converted_fo, out_fo)
        finally:
            response.close()

    def dump(self, pdf_fo, out_format=None, query_params=None, **requests_params):
        """
        Convert PDF file object given by `pdf_fo` into an output stream iterator.

        The iterator yields the converted output in chunks of up to 4096
        bytes.
        """
        response = self.request(pdf_fo, out_format, query_params, **requests_params)

        return response.iter_content(chunk_size=4096)

    def request(self, pdf_fo, out_format=None, query_params=None, **requests_params):
        """
        Convert PDF file object given by `pdf_fo`, returning the streamed
        requests.Response object.

        `query_params` holds extra query parameters; it is copied, never
        modified. `requests_params` are passed through to `requests.post`;
        the client's default timeout is used when none is given.

        Raises APIException if the API key is empty or the API answers with
        status 400, 401, 402 or 403, and ValueError if `out_format` is not a
        known format. Other error statuses are raised by
        `Response.raise_for_status`.
        """
        if self.api_key == "":
            raise APIException("Invalid API key")

        requests_params.setdefault("timeout", self.timeout)

        (_, out_format) = Client.ensure_format_ext(None, out_format)

        # Copy so the caller's mapping is not mutated and never ends up
        # holding the API key.
        params = dict(query_params) if query_params else {}
        params.update(
            {
                "key": self.api_key,
                "format": out_format,
                "extractor": self.extractor,
                "extract": self.extract,
            }
        )

        response = requests.post(
            self.api_url,
            files={"f": ("file.pdf", pdf_fo)},
            stream=True,
            params=params,
            **requests_params,
        )

        known_errors = {
            400: "Unknown file format",
            401: "Unauthorized API key",
            402: "Usage limit exceeded",
            403: "Unknown format requested",
        }
        message = known_errors.get(response.status_code)
        if message is not None:
            # The response is not attached to the exception, so release the
            # streamed connection before raising.
            response.close()
            raise APIException(message)
        response.raise_for_status()

        return response

    def remaining(self, query_params=None, **requests_params):
        """
        Provide information of remaining pages quota.

        Returns the response body of `<api_url>/remaining` converted to int.
        `query_params` is copied, never modified. `requests_params` are
        passed through to `requests.get`; the client's default timeout is
        used when none is given.

        Raises APIException if the API key is empty or the API answers with
        status 401. Other error statuses are raised by
        `Response.raise_for_status`.
        """
        if self.api_key == "":
            raise APIException("Invalid API key")

        # Same default as `request`, so a quota check cannot block forever.
        requests_params.setdefault("timeout", self.timeout)

        url = self.api_url + "/remaining"
        params = dict(query_params) if query_params else {}
        params.update({"key": self.api_key})

        response = requests.get(url, params=params, **requests_params)

        if response.status_code == 401:
            raise APIException("Unauthorized API key")
        response.raise_for_status()

        return int(response.content)

    @staticmethod
    def ensure_format_ext(out_path, out_format):
        """
        Ensure the appropriate file extension and format is given. If not
        provided, try to guess either.

        Returns a `(out_path, out_format)` tuple. When `out_path` is None it
        stays None and `out_format` defaults to XLSX multiple sheets. When
        `out_format` is None it is guessed from the extension of `out_path`,
        falling back to the same default. The extension for `out_format` is
        appended to `out_path` unless the path already ends with it.

        Raises ValueError if `out_format` is given but not a known format.
        """
        if out_format is not None and out_format not in _FORMATS_EXT:
            raise ValueError("Invalid output format")

        default_format = FORMAT_XLSX_MULTIPLE

        # Check if stdout is desired
        if out_path is None:
            return (None, default_format if out_format is None else out_format)

        _, ext = os.path.splitext(out_path)

        # Guess output format by file extension
        if out_format is None:
            out_format = _EXT_FORMATS.get(ext, default_format)

        # Ensure correct file extension by output format
        expected_ext = _FORMATS_EXT[out_format]
        if ext != expected_ext:
            # os.fspath lets path-like objects be extended as well as str.
            out_path = os.fspath(out_path) + expected_ext

        return (out_path, out_format)


class APIException(Exception):
    """
    Error raised by Client for an empty API key or a 400-403 API response.
    """