v1.2.0 (#9)

akshowhini · ExtractTable · commit 7c2e132b1361 · 2019-10-20T18:51:03.000-04:00
* Updated tests

* Accept URL inputs, user defined pages

* Added sample images

* Accept URLs in filepath

* v1.1.0

* v1.2.0
diff --git a/ExtractTable/FileOperations/__init__.py b/ExtractTable/FileOperations/__init__.py
@@ -3,6 +3,12 @@
 """
 import typing as ty
 import os
+import shutil
+import tempfile
+
+import requests
+import PyPDF2
+
 from ..exceptions import ClientFileError
 
 
@@ -26,3 +32,111 @@ def size_error(self) -> ty.Union[Exception, None]:
         if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
             return
         raise ClientFileError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
+
+
+class PrepareInput:
+    """
+    Prepare a user supplied input (local path or URL) for upload.
+
+    A URL is downloaded into a temporary directory owned by the instance.
+    For a PDF with a page selection other than "all", the requested pages
+    are copied into a new PDF in that same directory. After construction
+    ``filepath`` points at the file that should be opened and sent.
+
+    Use the instance as a context manager so that the temporary directory
+    is removed on exit.
+    """
+
+    def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
+        """
+        :param filepath: local file path or http(s) URL of the input file
+        :param pages: pages to keep from a PDF, e.g. '1,3,4', '1,4-end' or 'all';
+            not used when the input does not end with ".pdf"
+        """
+        # os.fspath lets path objects through the str-only checks below
+        self.filepath = os.fspath(filepath)
+        self.pages = pages
+        self.temp_dir = tempfile.mkdtemp()
+        try:
+            if self.filepath.startswith(("http://", "https://")):
+                self.filepath = self.download_file(self.filepath)
+            # Save time by using the real file,
+            # if "all" pages or an image file
+            if pages != "all" and self.filepath.lower().endswith(".pdf"):
+                print("[Info]: Aggregating user defined pages..", self.pages)
+                # Parse the local copy: the caller's argument may have been a URL
+                gather_pages = self._get_pages(self.filepath, pages)
+                self.filepath = self.pdf_separator(gather_pages)
+            CheckFile(self.filepath)
+        except BaseException:
+            # __exit__ never runs when __init__ fails, so clean up here
+            shutil.rmtree(self.temp_dir, ignore_errors=True)
+            raise
+
+    def __enter__(self):
+        return self
+
+    def pdf_separator(self, gather_pages: set):
+        """
+        Copy the requested pages of ``self.filepath`` into a new PDF.
+
+        :param gather_pages: 1-based page numbers to keep
+        :return: path of the new PDF inside the temporary directory
+        :raises EOFError: if a requested page number is outside the document
+        """
+        merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath))
+        # NOTE(review): PyPDF2 appears to read page content on demand, so the
+        # source stays open until the writer has finished writing.
+        with open(self.filepath, 'rb') as in_file:
+            pdf_reader = PyPDF2.PdfFileReader(in_file)
+            total_pages = pdf_reader.numPages
+            pdf_writer = PyPDF2.PdfFileWriter()
+            # A set has no guaranteed order; sort to keep document order
+            for page in sorted(gather_pages):
+                # Without this check page 0 would become index -1, the last page
+                if not 1 <= page <= total_pages:
+                    raise EOFError(f"File has only {total_pages} pages, but asked for {self.pages}")
+                pdf_writer.addPage(pdf_reader.getPage(page - 1))
+            with open(merged_pdf, 'wb') as out_file:
+                pdf_writer.write(out_file)
+        return merged_pdf
+
+    @staticmethod
+    def _get_pages(filepath: os.PathLike, pages: str) -> set:
+        # Credits to camelot library - customized
+        """Converts pages string to a set of ints.
+
+        Parameters
+        ----------
+        filepath : Pathlike
+            Local filepath of the PDF file. Only opened when the page count
+            is needed, i.e. for 'all' or an '-end' range.
+        pages : str
+            Comma-separated page numbers.
+            Example: '1,3,4' or '1,4-end' or 'all'.
+
+        Returns
+        -------
+        P : set
+            Set of int page numbers (1-based).
+
+        Raises
+        ------
+        ValueError
+            If a page number or range cannot be parsed as integers.
+
+        """
+        total_pages = None
+
+        def last_page() -> int:
+            # Parse the PDF at most once, and only if the count is needed
+            nonlocal total_pages
+            if total_pages is None:
+                with open(filepath, "rb") as file_obj:
+                    total_pages = PyPDF2.PdfFileReader(file_obj, strict=False).getNumPages()
+            return total_pages
+
+        pages = pages.strip()
+        if pages == "all":
+            return set(range(1, last_page() + 1))
+
+        pages_needed = set()
+        for part in pages.split(","):
+            start, dash, end = part.partition("-")
+            start, end = start.strip(), end.strip()
+            if not dash:
+                # single page number
+                end = start
+            elif end == "end":
+                end = last_page()
+            pages_needed.update(range(int(start), int(end) + 1))
+        return pages_needed
+
+    def download_file(self, url: str):
+        """
+        Download file to local
+        :param url: http(s) URL of the file
+        :return: downloaded file local filepath
+        """
+        from urllib.parse import urlsplit
+
+        # Name the file after the URL path only, ignoring any ?query or #fragment
+        fname, f_ext = os.path.splitext(os.path.basename(urlsplit(url).path))
+        fname = fname or "download"
+        f_ext = f_ext.lstrip(".")
+        with requests.get(url, stream=True) as r:
+            r.raise_for_status()
+            # e.g. "application/pdf; charset=binary" -> "pdf"
+            content_type = r.headers.get('Content-Type', '').split(';', 1)[0].strip()
+            r_ext = content_type.rpartition('/')[2]
+            if r_ext == "octet-stream" and f_ext:
+                # a generic binary type does not name the format; use the URL's extension
+                r_ext = ""
+            ext = (r_ext if r_ext else f_ext).lower()
+            # TODO use filetype lib to find extension
+            tmp_fname = os.path.join(self.temp_dir, f"{fname}.{ext}" if ext else fname)
+            with open(tmp_fname, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:   # filter out keep-alive new chunks
+                        f.write(chunk)
+        return tmp_fname
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Delete the temporary directory created for an instance"""
+        if self.temp_dir:
+            shutil.rmtree(self.temp_dir)
+            # a second exit must not try to remove the directory again
+            self.temp_dir = None
diff --git a/ExtractTable/__version__.py b/ExtractTable/__version__.py
@@ -1,6 +1,6 @@
-VERSION = (1, 0, 2)
+VERSION = (1, 2, 0)
 PRERELEASE = None  # "alpha", "beta" or "rc"
-REVISION = 1
+REVISION = None
 
 
 def generate_version(version, prerelease=None, revision=None):
diff --git a/ExtractTable/client.py b/ExtractTable/client.py
@@ -10,7 +10,7 @@
 
 import requests as rq
 
-from .FileOperations import CheckFile
+from .FileOperations import PrepareInput
 from .config import HOST, JobStatus
 from .parsers import ResponseParser, OutputParser
 from .common import ConvertTo
@@ -119,21 +119,24 @@ def process_file(
         :param dup_check: Idempotent requests handler
         :param indexing: If row index is needed
         :param kwargs:
-
-            max_wait_time: (int) 300 default; Maximum Time to wait before returning to the client
-            anyother form-data to be sent to the server
+            max_wait_time: int, optional (default: 300);
+                Maximum Time to wait before returning to the client
+            pages : str, optional (default: '1')
+                Comma-separated page numbers.
+                Example: '1,3,4' or '1,4-end' or 'all'.
+            anyother form-data to be sent to the server for future considerations
         :return: user requested output in list;
         """
-        CheckFile(filepath)
         # Raise a warning if unknown format is requested
         if output_format not in self._OUTPUT_FORMATS:
             default_format = "dict"
             warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
                        f"Assigned default format: {default_format}"
             warnings.warn(warn_msg)
 
-        with open(filepath, 'rb') as fp:
-            server_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
+        with PrepareInput(filepath, pages=kwargs.pop("pages", "1")) as infile:
+            with open(infile.filepath, 'rb') as fp:
+                server_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
 
         result = ConvertTo(data=server_resp, fmt=output_format, index=indexing).output
         return result
diff --git a/example-code.ipynb b/example-code.ipynb
@@ -183,7 +183,10 @@
     "colab": {}
    },
    "source": [
-    "filepath = r'testimages/chervolet.jpg'\n",
+    "# filepath = \"image_path_or_image_url_with_tables\"\n",
+    "# filepath = r'samples/BlurryImage.jpg'\n",
+    "filepath = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\"\n",
+    "\n",
     "table_data = et_sess.process_file(filepath)"
    ],
    "execution_count": 0,
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
 requests>=2.21
 pandas>=0.24
+PyPDF2>=1.26
diff --git a/samples/BlurryImage.jpg b/samples/BlurryImage.jpg
diff --git a/samples/QualityImage.jpg b/samples/QualityImage.jpg
diff --git a/samples/SmallFont.jpg b/samples/SmallFont.jpg
diff --git a/tests/constants.py b/tests/constants.py
@@ -1,3 +1,2 @@
 API_KEY = ''
 FILE_PATH = r''
-RESULTS_FOLDER = r''
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -1,37 +1,35 @@
+import os
 import io
 
 import pytest
 
 from ExtractTable.client import ExtractTable
-from ExtractTable.common import UsageStats
 from ExtractTable.exceptions import ServiceError
-from tests.constants import API_KEY, FILE_PATH, RESULTS_FOLDER
+from tests.constants import API_KEY, FILE_PATH
 
 
 @pytest.fixture
 def client():
     """Provide an ExtractTable client built from the API_KEY test constant."""
     return ExtractTable(API_KEY)
 
 
-def test_process_file(client: ExtractTable):
-    assert not (client.process_file(FILE_PATH, RESULTS_FOLDER))
-
-
-def test_process_file_index(client: ExtractTable):
-    assert not (client.process_file(FILE_PATH, RESULTS_FOLDER, True))
-
-
-def test_check_usage(client: ExtractTable):
-    assert isinstance(client.check_usage(), UsageStats)
+def test_get_result_fail(client: ExtractTable):
+    """get_result with an empty job id is expected to raise ServiceError."""
+    # No assert on the return value: pytest.raises already fails the test
+    # when nothing is raised, and a raised error never reaches an assert.
+    with pytest.raises(ServiceError):
+        client.get_result('')
 
 
 def test_trigger_process_fail(client: ExtractTable):
     """trigger_process with an empty byte stream is expected to raise ServiceError."""
     with pytest.raises(ServiceError):
         client.trigger_process(io.BytesIO())
 
 
-def test_get_result_fail(client: ExtractTable):
-    with pytest.raises(ServiceError):
-        assert client.get_result('')
+def test_check_usage(client: ExtractTable):
+    """check_usage is expected to return a dict."""
+    usage = client.check_usage()
+    assert isinstance(usage, dict)
+
+
+def test_process_file(client: ExtractTable):
+    """process_file with the "df" output format is expected to return a list."""
+    tables = client.process_file(filepath=FILE_PATH, output_format="df")
+    assert isinstance(tables, list)
 
 
+def test_process_file_csv(client: ExtractTable, fmt="csv"):
+    """Every item returned for the CSV format must be an existing path ending in ``fmt``."""
+    outputs = client.process_file(filepath=FILE_PATH, output_format=fmt)
+    # A returned boolean is not a check, so the condition must be asserted
+    assert all(os.path.exists(x) and x.endswith(fmt) for x in outputs)

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`requests>=2.21`
`2`	`2`	`pandas>=0.24`
	`3`	`+PyPDF2>=1.26`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`API_KEY = ''`
`2`	`2`	`FILE_PATH = r''`
`3`		`-RESULTS_FOLDER = r''`