Skip to content

Commit 62efb5e

Browse files
committed
Accept URL inputs, user defined pages
1 parent 7d76f21 commit 62efb5e

File tree

3 files changed

+125
-7
lines changed

3 files changed

+125
-7
lines changed

ExtractTable/FileOperations/__init__.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
"""
44
import typing as ty
55
import os
6+
import shutil
7+
import tempfile
8+
9+
import requests
10+
import PyPDF2
11+
612
from ..exceptions import ClientFileError
713

814

@@ -26,3 +32,111 @@ def size_error(self) -> ty.Union[Exception, None]:
2632
if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
2733
return
2834
raise ClientFileError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
35+
36+
37+
class PrepareInput:
    """
    Prepare a user supplied input (local path or URL) for upload.

    A private temporary directory is created per instance. Remote inputs are
    downloaded into it and, for PDF inputs with a page selection other than
    ``"all"``, a reduced PDF holding only the requested pages is written there.
    ``self.filepath`` always points at the file that should be sent.

    Use as a context manager so the temporary directory is removed on exit::

        with PrepareInput(path_or_url, pages="1,3-end") as infile:
            with open(infile.filepath, "rb") as fp:
                ...
    """

    def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
        """
        Parameters
        ----------
        filepath : Pathlike or str
            Local path or ``http(s)://`` URL of the input file.
        pages : str
            Comma-separated page numbers, e.g. '1,3,4', '1,4-end' or 'all'.
            Only used when the input is a PDF.
        """
        self.temp_dir = tempfile.mkdtemp()
        try:
            # Accept os.PathLike objects as the annotation promises;
            # the str methods used below would fail on them otherwise.
            self.filepath = os.fspath(filepath)
            self.pages = pages
            if self.filepath.startswith(("http://", "https://")):
                self.filepath = self.download_file(self.filepath)
            # Save time by using the real file,
            # if "all" pages or an image file
            if pages != "all" and self.filepath.lower().endswith(".pdf"):
                print("[Info]: Aggregating user defined pages..", self.pages)
                # Use the resolved local path: the original argument may be a URL.
                gather_pages = self._get_pages(self.filepath, pages)
                self.filepath = self.pdf_separator(gather_pages)
            CheckFile(self.filepath)
        except BaseException:
            # __exit__ is never called when __init__ fails, so clean up here.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None
            raise

    def __enter__(self):
        return self

    def pdf_separator(self, gather_pages: set):
        """
        Write the requested pages of ``self.filepath`` into a new PDF.

        Parameters
        ----------
        gather_pages : set
            1-based page numbers to keep.

        Returns
        -------
        str
            Path of the reduced PDF inside the temporary directory.

        Raises
        ------
        ValueError
            If no page is selected or a page number is smaller than 1.
        EOFError
            If a requested page is beyond the end of the document.
        """
        if not gather_pages:
            raise ValueError(f"No pages selected by {self.pages}")
        merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath))
        # The reader resolves page objects lazily, so the source stream has to
        # stay open until the writer has produced the output file.
        with open(self.filepath, 'rb') as in_file:
            pdf_reader = PyPDF2.PdfFileReader(in_file)
            pdf_writer = PyPDF2.PdfFileWriter()
            # Sets are unordered; sort so the output keeps document order.
            for page in sorted(gather_pages):
                if page < 1:
                    # getPage(-1) would silently return the last page.
                    raise ValueError(f"Page numbers start at 1, but asked for {self.pages}")
                try:
                    pdf_writer.addPage(pdf_reader.getPage(page - 1))
                except IndexError:
                    raise EOFError(f"File has only {pdf_reader.numPages} pages, but asked for {self.pages}")
            with open(merged_pdf, 'wb') as out_file:
                pdf_writer.write(out_file)
        return merged_pdf

    @staticmethod
    def _get_pages(filepath: os.PathLike, pages: str) -> set:
        # Credits to camelot library - customized
        """Converts pages string to a set of ints.

        The PDF is only opened when the page count is actually needed, i.e.
        for 'all' or a range ending in 'end'.

        Parameters
        ----------
        filepath : Pathlike
            Local filepath of the PDF file.
        pages : str
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : set
            Set of int page numbers (1-based).

        Raises
        ------
        ValueError
            If a token is not a number or a 'start-end' range.
        """
        spec = pages.strip()

        # Split every token into its (start, end) strings first, so we know
        # whether the document has to be opened at all.
        bounds = []
        if spec != "all":
            for token in spec.split(","):
                start, dash, end = token.strip().partition("-")
                bounds.append((start.strip(), end.strip() if dash else start.strip()))

        total_pages = None
        if spec == "all" or any(end == "end" for _, end in bounds):
            with open(filepath, "rb") as file_obj:
                total_pages = PyPDF2.PdfFileReader(file_obj, strict=False).getNumPages()

        if spec == "all":
            return set(range(1, total_pages + 1))

        pages_needed = set()
        for start, end in bounds:
            last = total_pages if end == "end" else int(end)
            pages_needed.update(range(int(start), last + 1))
        return pages_needed

    @staticmethod
    def _local_filename(url: str, content_type: str) -> str:
        """Derive a local file name (with extension) for a downloaded URL."""
        from urllib.parse import unquote, urlparse

        # "application/pdf; charset=binary" -> "pdf"
        subtype = content_type.split(";", 1)[0].strip().rpartition("/")[2].lower()
        # Drop query string / fragment before looking at the name.
        base = os.path.basename(unquote(urlparse(url).path))
        fname, dot_ext = os.path.splitext(base)
        url_ext = dot_ext.lstrip(".").lower()
        # A generic binary type says nothing about the format; trust the URL then.
        if subtype in ("", "octet-stream") and url_ext:
            ext = url_ext
        else:
            ext = subtype or url_ext
        fname = fname or "download"
        return f"{fname}.{ext}" if ext else fname

    def download_file(self, url: str):
        """
        Download file to local

        :param url: PDF file path
        :return: downloaded file local filepath
        :raises requests.HTTPError: when the server answers with an error status
        """
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # TODO use filetype lib to find extension
            tmp_fname = os.path.join(
                self.temp_dir,
                self._local_filename(url, r.headers.get('Content-Type', '')),
            )
            with open(tmp_fname, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        return tmp_fname

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the temporary directory created for an instance"""
        if self.temp_dir:
            # Never let a cleanup failure mask the exception being propagated.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None

ExtractTable/client.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import requests as rq
1212

13-
from .FileOperations import CheckFile
13+
from .FileOperations import PrepareInput
1414
from .config import HOST, JobStatus
1515
from .parsers import ResponseParser, OutputParser
1616
from .common import ConvertTo
@@ -119,21 +119,24 @@ def process_file(
119119
:param dup_check: Idempotent requests handler
120120
:param indexing: If row index is needed
121121
:param kwargs:
122-
123-
max_wait_time: (int) 300 default; Maximum Time to wait before returning to the client
124-
anyother form-data to be sent to the server
122+
max_wait_time: int, optional (default: 300);
123+
Maximum Time to wait before returning to the client
124+
pages : str, optional (default: '1')
125+
Comma-separated page numbers.
126+
Example: '1,3,4' or '1,4-end' or 'all'.
127+
anyother form-data to be sent to the server for future considerations
125128
:return: user requested output in list;
126129
"""
127-
CheckFile(filepath)
128130
# Raise a warning if unknown format is requested
129131
if output_format not in self._OUTPUT_FORMATS:
130132
default_format = "dict"
131133
warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
132134
f"Assigned default format: {default_format}"
133135
warnings.warn(warn_msg)
134136

135-
with open(filepath, 'rb') as fp:
136-
server_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
137+
with PrepareInput(filepath, pages=kwargs.pop("pages", "1")) as infile:
138+
with open(infile.filepath, 'rb') as fp:
139+
server_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
137140

138141
result = ConvertTo(data=server_resp, fmt=output_format, index=indexing).output
139142
return result

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
requests>=2.21
22
pandas>=0.24
3+
PyPDF2>=1.26

0 commit comments

Comments
 (0)