Skip to content

Commit 7c2e132

Browse files
akshowhiniExtractTable
authored andcommitted
v1.2.0 (#9)
* Updated tests * Accept URL inputs, user defined pages * Added sample images * Accept URLs in filepath * v1.1.0 * v1.2.0
1 parent aec0c2e commit 7c2e132

File tree

10 files changed

+144
-26
lines changed

10 files changed

+144
-26
lines changed

ExtractTable/FileOperations/__init__.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
"""
44
import typing as ty
55
import os
6+
import shutil
7+
import tempfile
8+
9+
import requests
10+
import PyPDF2
11+
612
from ..exceptions import ClientFileError
713

814

@@ -26,3 +32,111 @@ def size_error(self) -> ty.Union[Exception, None]:
2632
if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
2733
return
2834
raise ClientFileError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
35+
36+
37+
class PrepareInput:
    """
    Stage a local file or a URL as a local file ready to be uploaded.

    A temporary directory is created per instance. URL inputs are downloaded
    into it, and when specific pages of a PDF are requested a new PDF holding
    only those pages is written into it. ``self.filepath`` always points at
    the file that should be sent. Use as a context manager so the temporary
    directory is removed on exit.
    """

    def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
        """
        :param filepath: local path (str or PathLike) or http(s) URL of the input
        :param pages: "all", or comma-separated page numbers / ranges,
            e.g. '1,3,4' or '1,4-end'
        :raises EOFError: when a requested page is outside the PDF
        """
        # Normalise PathLike objects so the str methods used below work.
        self.filepath = os.fspath(filepath)
        self.pages = pages
        self.temp_dir = tempfile.mkdtemp()
        try:
            if self.filepath.startswith(("http://", "https://")):
                self.filepath = self.download_file(self.filepath)
            # Save time by using the real file,
            # if "all" pages or an image file
            if pages != "all" and self.filepath.lower().endswith(".pdf"):
                print("[Info]: Aggregating user defined pages..", self.pages)
                # Read the *local* file: the original argument may be a URL.
                gather_pages = self._get_pages(self.filepath, pages)
                self.filepath = self.pdf_separator(gather_pages)
            CheckFile(self.filepath)
        except BaseException:
            # __exit__ never runs when __init__ fails, so clean up here.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the temporary directory created for an instance"""
        if self.temp_dir:
            # ignore_errors: a cleanup failure must not mask an in-flight exception.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None

    def pdf_separator(self, gather_pages: set):
        """
        Write the requested pages of ``self.filepath`` to a new PDF.

        :param gather_pages: 1-based page numbers to keep
        :return: path of the new PDF inside the temporary directory
        :raises EOFError: when a page number is not within 1..number of pages
        """
        # strict=False matches the reader settings used when counting pages.
        pdf_reader = PyPDF2.PdfFileReader(self.filepath, strict=False)
        total_pages = pdf_reader.numPages
        pdf_writer = PyPDF2.PdfFileWriter()
        # Sets are unordered: sort so the output keeps document order.
        for page in sorted(gather_pages):
            # Explicit bounds check: page 0 would otherwise become index -1
            # and silently select the last page.
            if not 1 <= page <= total_pages:
                raise EOFError(f"File has only {total_pages} pages, but asked for {self.pages}")
            pdf_writer.addPage(pdf_reader.getPage(page - 1))

        merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath))
        # Create the output only after every page has been validated.
        with open(merged_pdf, 'wb') as out_file:
            pdf_writer.write(out_file)
        return merged_pdf

    @staticmethod
    def _count_pages(filepath) -> int:
        """Return the number of pages PyPDF2 reports for the PDF at ``filepath``."""
        with open(filepath, "rb") as file_obj:
            return PyPDF2.PdfFileReader(file_obj, strict=False).getNumPages()

    @staticmethod
    def _get_pages(filepath: os.PathLike, pages: str) -> set:
        # Credits to camelot library - customized
        """Converts pages string to a set of ints.

        Parameters
        ----------
        filepath : Pathlike
            Local path of the PDF file; only opened when the page count
            is needed ('all' or an 'end' range bound).
        pages : str
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : set
            Set of int page numbers (1-based).

        Raises
        ------
        ValueError
            If a token is not a number or a single 'start-end' range.

        """
        if pages == "all":
            return set(range(1, PrepareInput._count_pages(filepath) + 1))

        pages_needed = set()
        last_page = None  # looked up lazily, at most once
        for token in pages.split(","):
            token = token.strip()
            if "-" in token:
                start, end = (part.strip() for part in token.split("-"))
                if end == "end":
                    if last_page is None:
                        last_page = PrepareInput._count_pages(filepath)
                    end = last_page
                pages_needed.update(range(int(start), int(end) + 1))
            else:
                pages_needed.add(int(token))
        return pages_needed

    @staticmethod
    def _local_name(url: str, content_type: str) -> str:
        """
        Build the local file name for a download.

        The extension is the subtype of ``content_type`` (parameters such as
        ``; charset=...`` are dropped). When the header is missing or is the
        generic ``octet-stream``, the extension of the URL path is used instead.

        :param url: URL being downloaded
        :param content_type: value of the Content-Type response header ('' if absent)
        :return: "<name>.<ext>" in lower-case extension, or "<name>" when no
            extension can be determined
        """
        from urllib.parse import urlsplit

        # Use the path component only, so query strings/fragments are ignored.
        fname, url_ext = os.path.splitext(os.path.basename(urlsplit(url).path))
        media_type = (content_type or "").split(";", 1)[0].strip()
        ext = media_type.rsplit("/", 1)[-1] if "/" in media_type else ""
        if ext.lower() in ("", "octet-stream"):
            ext = url_ext.lstrip(".")
        ext = ext.lower()
        fname = fname or "download"
        return f"{fname}.{ext}" if ext else fname

    def download_file(self, url: str):
        """
        Download file to local
        :param url: PDF file path
        :return: downloaded file local filepath
        """
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # TODO use filetype lib to find extension
            tmp_fname = os.path.join(
                self.temp_dir,
                self._local_name(url, r.headers.get('Content-Type', ''))
            )
            with open(tmp_fname, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        return tmp_fname

ExtractTable/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
VERSION = (1, 0, 2)
1+
VERSION = (1, 2, 0)
22
PRERELEASE = None # "alpha", "beta" or "rc"
3-
REVISION = 1
3+
REVISION = None
44

55

66
def generate_version(version, prerelease=None, revision=None):

ExtractTable/client.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import requests as rq
1212

13-
from .FileOperations import CheckFile
13+
from .FileOperations import PrepareInput
1414
from .config import HOST, JobStatus
1515
from .parsers import ResponseParser, OutputParser
1616
from .common import ConvertTo
@@ -119,21 +119,24 @@ def process_file(
119119
:param dup_check: Idempotent requests handler
120120
:param indexing: If row index is needed
121121
:param kwargs:
122-
123-
max_wait_time: (int) 300 default; Maximum Time to wait before returning to the client
124-
anyother form-data to be sent to the server
122+
max_wait_time: int, optional (default: 300);
123+
Maximum Time to wait before returning to the client
124+
pages : str, optional (default: '1')
125+
Comma-separated page numbers.
126+
Example: '1,3,4' or '1,4-end' or 'all'.
127+
any other form-data to be sent to the server for future considerations
125128
:return: user requested output in list;
126129
"""
127-
CheckFile(filepath)
128130
# Raise a warning if unknown format is requested
129131
if output_format not in self._OUTPUT_FORMATS:
130132
default_format = "dict"
131133
warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
132134
f"Assigned default format: {default_format}"
133135
warnings.warn(warn_msg)
134136

135-
with open(filepath, 'rb') as fp:
136-
server_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
137+
with PrepareInput(filepath, pages=kwargs.pop("pages", "1")) as infile:
138+
with open(infile.filepath, 'rb') as fp:
139+
server_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
137140

138141
result = ConvertTo(data=server_resp, fmt=output_format, index=indexing).output
139142
return result

example-code.ipynb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,10 @@
183183
"colab": {}
184184
},
185185
"source": [
186-
"filepath = r'testimages/chervolet.jpg'\n",
186+
"# filepath = \"image_path_or_image_url_with_tables\"\n",
187+
"# filepath = r'samples/BlurryImage.jpg'\n",
188+
"filepath = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\"\n",
189+
"\n",
187190
"table_data = et_sess.process_file(filepath)"
188191
],
189192
"execution_count": 0,

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
requests>=2.21
22
pandas>=0.24
3+
PyPDF2>=1.26

samples/BlurryImage.jpg

287 KB
Loading

samples/QualityImage.jpg

134 KB
Loading

tests/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
API_KEY = ''
22
FILE_PATH = r''
3-
RESULTS_FOLDER = r''

tests/test_client.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,35 @@
1+
import os
12
import io
23

34
import pytest
45

56
from ExtractTable.client import ExtractTable
6-
from ExtractTable.common import UsageStats
77
from ExtractTable.exceptions import ServiceError
8-
from tests.constants import API_KEY, FILE_PATH, RESULTS_FOLDER
8+
from tests.constants import API_KEY, FILE_PATH
99

1010

1111
@pytest.fixture
def client():
    """Provide an ``ExtractTable`` instance built from the ``API_KEY`` test constant."""
    session = ExtractTable(API_KEY)
    return session
1414

1515

16-
def test_process_file(client: ExtractTable):
17-
assert not (client.process_file(FILE_PATH, RESULTS_FOLDER))
18-
19-
20-
def test_process_file_index(client: ExtractTable):
21-
assert not (client.process_file(FILE_PATH, RESULTS_FOLDER, True))
22-
23-
24-
def test_check_usage(client: ExtractTable):
25-
assert isinstance(client.check_usage(), UsageStats)
16+
def test_get_result_fail(client: ExtractTable):
    """``get_result`` called with an empty string is expected to raise ``ServiceError``."""
    empty_arg = ''
    with pytest.raises(ServiceError):
        outcome = client.get_result(empty_arg)
        assert not outcome
2619

2720

2821
def test_trigger_process_fail(client: ExtractTable):
    """``trigger_process`` given an empty in-memory stream is expected to raise ``ServiceError``."""
    empty_stream = io.BytesIO()
    with pytest.raises(ServiceError):
        client.trigger_process(empty_stream)
3124

3225

33-
def test_get_result_fail(client: ExtractTable):
34-
with pytest.raises(ServiceError):
35-
assert client.get_result('')
26+
def test_check_usage(client: ExtractTable):
    """``check_usage`` is expected to return a ``dict``."""
    usage = client.check_usage()
    assert isinstance(usage, dict)
28+
29+
30+
def test_process_file(client: ExtractTable):
    """``process_file`` with ``output_format="df"`` is expected to return a ``list``."""
    tables = client.process_file(filepath=FILE_PATH, output_format="df")
    assert isinstance(tables, list)
3632

3733

34+
def test_process_file_csv(client: ExtractTable, fmt="csv"):
    """
    Every item returned by ``process_file`` for ``output_format=fmt`` must be
    an existing path ending with ``fmt``.

    The check is asserted (not returned): pytest ignores a test's return
    value, so a returned boolean could never make the test fail.
    """
    outputs = client.process_file(filepath=FILE_PATH, output_format=fmt)
    assert all(os.path.exists(path) and path.endswith(fmt) for path in outputs)

0 commit comments

Comments
 (0)