55import com .mindee .MindeeException ;
66import com .mindee .input .InputSourceUtils ;
77import com .mindee .input .LocalInputSource ;
8- import com .mindee .v1 .product .invoicesplitter .InvoiceSplitterV1InvoicePageGroup ;
98import java .awt .image .BufferedImage ;
109import java .io .ByteArrayInputStream ;
1110import java .io .IOException ;
1211import java .util .ArrayList ;
13- import java .util .Iterator ;
1412import java .util .List ;
15- import java .util .stream .Collectors ;
1613import javax .imageio .ImageIO ;
1714import org .apache .pdfbox .Loader ;
1815import org .apache .pdfbox .pdmodel .PDDocument ;
2421/**
2522 * PDF extraction class.
2623 */
27- public class PDFExtractor {
28- private final PDDocument sourcePdf ;
29- private final String filename ;
30-
31- /**
32- * Init from a path.
33- *
34- * @param filePath Path to the file.
35- * @throws IOException Throws if the file can't be accessed.
36- */
37- public PDFExtractor (String filePath ) throws IOException {
38- this (new LocalInputSource (filePath ));
39- }
24+ public class BasePDFExtractor {
25+ protected final PDDocument sourcePdf ;
26+ protected final String filename ;
4027
4128 /**
4229 * Init from a {@link LocalInputSource}.
4330 *
4431 * @param source The local source.
4532 * @throws IOException Throws if the file can't be accessed.
4633 */
47- public PDFExtractor (LocalInputSource source ) throws IOException {
34+ protected BasePDFExtractor (LocalInputSource source ) throws IOException {
4835 this .filename = source .getFilename ();
4936 if (source .isPdf ()) {
5037 this .sourcePdf = Loader .loadPDF (source .getFile ());
5138 } else {
52- PDDocument document = new PDDocument ();
53- PDPage page = new PDPage ();
39+ var document = new PDDocument ();
40+ var page = new PDPage ();
5441 document .addPage (page );
5542 BufferedImage bufferedImage = byteArrayToBufferedImage (source .getFile ());
5643 PDImageXObject pdImage = LosslessFactory .createFromImage (document , bufferedImage );
@@ -65,7 +52,6 @@ public PDFExtractor(LocalInputSource source) throws IOException {
6552 );
6653 }
6754 this .sourcePdf = document ;
68-
6955 }
7056 }
7157
@@ -101,7 +87,7 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO
10187 public List <ExtractedPDF > extractSubDocuments (
10288 List <List <Integer >> pageIndexes
10389 ) throws IOException {
104- List < ExtractedPDF > extractedPDFs = new ArrayList <>();
90+ var extractedPDFs = new ArrayList <ExtractedPDF >();
10591
10692 for (List <Integer > pageIndexElement : pageIndexes ) {
10793 if (pageIndexElement .isEmpty ()) {
@@ -126,65 +112,4 @@ public List<ExtractedPDF> extractSubDocuments(
126112 }
127113 return extractedPDFs ;
128114 }
129-
130- /**
131- * Extract invoices from the given page indexes (from an invoice-splitter prediction).
132- *
133- * @param pageIndexes List of page indexes.
134- * @return a list of extracted files.
135- * @throws IOException Throws if the file can't be accessed.
136- */
137- public List <ExtractedPDF > extractInvoices (
138- List <InvoiceSplitterV1InvoicePageGroup > pageIndexes
139- ) throws IOException {
140-
141- List <List <Integer >> indexes = pageIndexes
142- .stream ()
143- .map (InvoiceSplitterV1InvoicePageGroup ::getPageIndexes )
144- .collect (Collectors .toList ());
145-
146- return extractSubDocuments (indexes );
147- }
148-
149- /**
150- * Extract invoices from the given page indexes (from an invoice-splitter prediction).
151- *
152- * @param pageIndexes List of page indexes.
153- * @param strict Whether the extraction should strictly follow the confidence scores or not.
154- * @return a list of extracted files.
155- * @throws IOException Throws if the file can't be accessed.
156- */
157- public List <ExtractedPDF > extractInvoices (
158- List <InvoiceSplitterV1InvoicePageGroup > pageIndexes ,
159- boolean strict
160- ) throws IOException {
161- List <List <Integer >> correctPageIndexes = new ArrayList <>();
162- if (!strict ) {
163- return extractInvoices (pageIndexes );
164- }
165- Iterator <InvoiceSplitterV1InvoicePageGroup > iterator = pageIndexes .iterator ();
166- List <Integer > currentList = new ArrayList <>();
167- Double previousConfidence = null ;
168- while (iterator .hasNext ()) {
169- InvoiceSplitterV1InvoicePageGroup pageIndex = iterator .next ();
170- Double confidence = pageIndex .getConfidence ();
171- List <Integer > pageList = pageIndex .getPageIndexes ();
172-
173- if (confidence == 1.0 && previousConfidence == null ) {
174- currentList = new ArrayList <>(pageList );
175- } else if (confidence == 1.0 ) {
176- correctPageIndexes .add (currentList );
177- currentList = new ArrayList <>(pageList );
178- } else if (confidence == 0.0 && !iterator .hasNext ()) {
179- currentList .addAll (pageList );
180- correctPageIndexes .add (currentList );
181- } else {
182- correctPageIndexes .add (currentList );
183- correctPageIndexes .add (pageList );
184- }
185- previousConfidence = confidence ;
186- }
187- return extractSubDocuments (correctPageIndexes );
188- }
189-
190115}
0 commit comments