33"""
44import typing as ty
55import os
6+ import shutil
7+ import tempfile
8+
9+ import requests
10+ import PyPDF2
11+
612from ..exceptions import ClientFileError
713
814
@@ -26,3 +32,111 @@ def size_error(self) -> ty.Union[Exception, None]:
2632 if os .stat (self .filepath ).st_size <= self .__THRESHOLD_SIZE__ * 1027 * 1027 :
2733 return
2834 raise ClientFileError (Message = f"File Size greater than the threshold { self .__THRESHOLD_SIZE__ } Mb." )
35+
36+
class PrepareInput:
    """Prepare a local input file (PDF or image) inside a context manager.

    On construction the instance:

    1. creates a private temporary directory,
    2. downloads ``filepath`` into it when it is an ``http(s)://`` URL,
    3. for PDFs with a page selection other than ``"all"``, writes a new PDF
       containing only the requested pages into the temporary directory,
    4. passes the resulting path to ``CheckFile`` (defined elsewhere in this
       module -- presumably size/type validation; confirm against its code).

    ``self.filepath`` always points at the file that should be consumed.
    Leaving the ``with`` block removes the temporary directory.
    """

    def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
        """Resolve ``filepath`` to a ready-to-use local file.

        Parameters
        ----------
        filepath : PathLike or str
            Local path or ``http(s)://`` URL of the input file.
        pages : str
            Page selection, e.g. ``'1,3,4'``, ``'1,4-end'`` or ``'all'``.
            Ignored for non-PDF files.
        """
        self.pages = pages
        self.temp_dir = tempfile.mkdtemp()
        try:
            # Normalise PathLike objects so the str methods below work.
            self.filepath = os.fspath(filepath)
            if self.filepath.startswith(("http://", "https://")):
                self.filepath = self.download_file(self.filepath)
            # Save time by using the real file if "all" pages are wanted
            # or the input is not a PDF (e.g. an image).
            if pages != "all" and self.filepath.lower().endswith(".pdf"):
                print("[Info]: Aggregating user defined pages..", self.pages)
                # Use the resolved local path: the original argument may
                # have been a URL, which cannot be opened as a file.
                gather_pages = self._get_pages(self.filepath, pages)
                self.filepath = self.pdf_separator(gather_pages)
            CheckFile(self.filepath)
        except BaseException:
            # __exit__ never runs when __init__ fails, so clean up here to
            # avoid leaking the temporary directory, then re-raise.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None
            raise

    def __enter__(self):
        """Return the prepared instance itself."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Delete the temporary directory created for this instance.

        Cleanup is best effort and idempotent so that it can never mask an
        exception raised inside the ``with`` body.
        """
        if self.temp_dir:
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None

    def pdf_separator(self, gather_pages: set):
        """Write the selected pages of ``self.filepath`` to a new PDF.

        Parameters
        ----------
        gather_pages : set
            1-based page numbers to keep.

        Returns
        -------
        str
            Path of the new PDF inside the temporary directory.

        Raises
        ------
        EOFError
            If a requested page is beyond the end of the document.
        """
        merged_pdf = os.path.join(
            self.temp_dir, str(self.pages) + os.path.basename(self.filepath)
        )
        pdf_reader = PyPDF2.PdfFileReader(self.filepath)
        pdf_writer = PyPDF2.PdfFileWriter()
        # Sets are unordered; sort so the output keeps document order.
        for page in sorted(gather_pages):
            try:
                pdf_writer.addPage(pdf_reader.getPage(page - 1))
            except IndexError as err:
                raise EOFError(
                    f"File has only {pdf_reader.numPages} pages, "
                    f"but asked for {self.pages}"
                ) from err
        # Only create the output file once every page has been validated.
        with open(merged_pdf, "wb") as out_file:
            pdf_writer.write(out_file)
        return merged_pdf

    @staticmethod
    def _get_pages(filepath: ty.Union[os.PathLike, str], pages: str) -> set:
        # Credits to camelot library - customized
        """Convert a pages string to a set of 1-based page numbers.

        The PDF is only opened when its page count is actually needed,
        i.e. for ``'all'`` or a range ending in ``'end'``.

        Parameters
        ----------
        filepath : PathLike or str
            Local path of the PDF file.
        pages : str
            Comma-separated page numbers or ranges.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        set
            Set of int page numbers. A reversed range such as '5-3'
            contributes no pages.

        Raises
        ------
        ValueError
            If an item is not a number/range or a page number is below 1.
        """
        total_pages = None

        def last_page() -> int:
            # Read the page count lazily and at most once.
            nonlocal total_pages
            if total_pages is None:
                with open(filepath, "rb") as file_obj:
                    reader = PyPDF2.PdfFileReader(file_obj, strict=False)
                    total_pages = reader.getNumPages()
            return total_pages

        if pages == "all":
            return set(range(1, last_page() + 1))

        pages_needed = set()
        for item in pages.split(","):
            item = item.strip()
            if "-" in item:
                start, end = item.split("-")
                end = end.strip()
                first = int(start)
                last = last_page() if end == "end" else int(end)
            else:
                first = last = int(item)
            if first < 1:
                # Page 0 would become index -1 and silently select the
                # last page of the document.
                raise ValueError(
                    f"Page numbers start at 1, got {item!r} in {pages!r}"
                )
            pages_needed.update(range(first, last + 1))
        return pages_needed

    @staticmethod
    def _download_name(url: str, content_type: str) -> str:
        """Build the local file name for a download.

        The extension comes from the Content-Type subtype (parameters such
        as ``; charset=...`` are dropped). When the header is missing or is
        the generic ``application/octet-stream``, the extension of the URL
        path is used instead. The URL query string never leaks into the name.
        """
        from urllib.parse import urlsplit

        base = os.path.basename(urlsplit(url).path)
        fname, url_ext = os.path.splitext(base)
        mime = content_type.split(";", 1)[0].strip()
        mime_ext = mime.rsplit("/", 1)[-1] if "/" in mime else ""
        if mime_ext.lower() == "octet-stream":
            mime_ext = ""
        ext = (mime_ext or url_ext.lstrip(".")).lower()
        fname = fname or "download"
        # TODO use filetype lib to find extension
        return f"{fname}.{ext}" if ext else fname

    def download_file(self, url: str):
        """Download ``url`` into the instance's temporary directory.

        Parameters
        ----------
        url : str
            ``http(s)://`` address of the file.

        Returns
        -------
        str
            Local path of the downloaded file.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status.
        """
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            name = self._download_name(url, r.headers.get("Content-Type", ""))
            tmp_fname = os.path.join(self.temp_dir, name)
            with open(tmp_fname, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        return tmp_fname
0 commit comments