|
3 | 3 | import com.mindee.MindeeException; |
4 | 4 | import com.mindee.input.PageOptions; |
5 | 5 | import java.awt.image.BufferedImage; |
| 6 | +import java.io.ByteArrayInputStream; |
6 | 7 | import java.io.ByteArrayOutputStream; |
7 | 8 | import java.io.IOException; |
8 | 9 | import java.util.ArrayList; |
|
13 | 14 | import java.util.stream.Collectors; |
14 | 15 | import java.util.stream.IntStream; |
15 | 16 | import org.apache.pdfbox.Loader; |
| 17 | +import org.apache.pdfbox.io.RandomAccessReadBuffer; |
16 | 18 | import org.apache.pdfbox.pdmodel.PDDocument; |
17 | 19 | import org.apache.pdfbox.pdmodel.common.PDRectangle; |
18 | 20 | import org.apache.pdfbox.rendering.ImageType; |
19 | 21 | import org.apache.pdfbox.rendering.PDFRenderer; |
| 22 | +import org.apache.pdfbox.text.PDFTextStripper; |
20 | 23 |
|
21 | 24 | /** |
22 | 25 | * Allows performing various operations on PDFs. |
23 | 26 | */ |
24 | | -public final class PDFBoxApi implements PDFOperation { |
| 27 | +public final class PDFBoxApi implements InputSourcePDFOperation { |
25 | 28 |
|
26 | 29 | @Override |
27 | 30 | public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException { |
@@ -61,33 +64,76 @@ public int getNumberOfPages(byte[] fileBytes) throws IOException { |
61 | 64 | return pageCount; |
62 | 65 | } |
63 | 66 |
|
| 67 | + /** |
| 68 | + * Returns true if the file is a PDF. |
| 69 | + */ |
64 | 70 | @Override |
65 | | - public PdfPageImage pdfPageToImage( |
66 | | - byte[] fileBytes, |
67 | | - String filename, |
68 | | - int pageNumber |
69 | | - ) throws IOException { |
70 | | - int index = pageNumber - 1; |
71 | | - PDDocument document = Loader.loadPDF(fileBytes); |
72 | | - var pdfRenderer = new PDFRenderer(document); |
73 | | - BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); |
74 | | - document.close(); |
75 | | - return new PdfPageImage(imageBuffer, index, filename, "jpg"); |
| 71 | + public boolean isPdf(byte[] fileBytes) { |
| 72 | + try { |
| 73 | + Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); |
| 74 | + } catch (IOException e) { |
| 75 | + return false; |
| 76 | + } |
| 77 | + return true; |
76 | 78 | } |
77 | 79 |
|
| 80 | + /** |
| 81 | + * Returns true if the source PDF has source text inside. Returns false for images. |
| 82 | + * |
| 83 | + * @param fileBytes A byte array representing a PDF. |
| 84 | + * @return True if at least one character exists in one page. |
| 85 | + * @throws MindeeException if the file could not be read. |
| 86 | + */ |
78 | 87 | @Override |
79 | | - public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException { |
80 | | - PDDocument document = Loader.loadPDF(fileBytes); |
81 | | - var pdfRenderer = new PDFRenderer(document); |
82 | | - List<PdfPageImage> pdfPageImages = new ArrayList<>(); |
83 | | - for (int i = 0; i < document.getNumberOfPages(); i++) { |
84 | | - var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); |
85 | | - pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); |
| 88 | + public boolean hasSourceText(byte[] fileBytes) { |
| 89 | + try { |
| 90 | + PDDocument document = Loader |
| 91 | + .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); |
| 92 | + PDFTextStripper stripper = new PDFTextStripper(); |
| 93 | + |
| 94 | + for (int i = 0; i < document.getNumberOfPages(); i++) { |
| 95 | + stripper.setStartPage(i + 1); |
| 96 | + stripper.setEndPage(i + 1); |
| 97 | + String pageText = stripper.getText(document); |
| 98 | + if (!pageText.trim().isEmpty()) { |
| 99 | + document.close(); |
| 100 | + return true; |
| 101 | + } |
| 102 | + } |
| 103 | + document.close(); |
| 104 | + } catch (IOException e) { |
| 105 | + return false; |
86 | 106 | } |
87 | | - document.close(); |
88 | | - return pdfPageImages; |
| 107 | + return false; |
89 | 108 | } |
90 | 109 |
|
| 110 | +// @Override |
| 111 | +// public PdfPageImage pdfPageToImage( |
| 112 | +// byte[] fileBytes, |
| 113 | +// String filename, |
| 114 | +// int pageNumber |
| 115 | +// ) throws IOException { |
| 116 | +// int index = pageNumber - 1; |
| 117 | +// PDDocument document = Loader.loadPDF(fileBytes); |
| 118 | +// var pdfRenderer = new PDFRenderer(document); |
| 119 | +// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); |
| 120 | +// document.close(); |
| 121 | +// return new PdfPageImage(imageBuffer, index, filename, "jpg"); |
| 122 | +// } |
| 123 | + |
| 124 | +// @Override |
| 125 | +// public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException { |
| 126 | +// PDDocument document = Loader.loadPDF(fileBytes); |
| 127 | +// var pdfRenderer = new PDFRenderer(document); |
| 128 | +// List<PdfPageImage> pdfPageImages = new ArrayList<>(); |
| 129 | +// for (int i = 0; i < document.getNumberOfPages(); i++) { |
| 130 | +// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); |
| 131 | +// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); |
| 132 | +// } |
| 133 | +// document.close(); |
| 134 | +// return pdfPageImages; |
| 135 | +// } |
| 136 | + |
91 | 137 | private BufferedImage pdfPageToImageBuffer( |
92 | 138 | int index, |
93 | 139 | PDDocument document, |
@@ -128,10 +174,10 @@ private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPag |
128 | 174 | } |
129 | 175 | } |
130 | 176 |
|
131 | | - private boolean checkPdfOpen(byte[] documentFile) { |
| 177 | + private boolean checkPdfOpen(byte[] fileBytes) { |
132 | 178 | boolean opens = false; |
133 | 179 | try { |
134 | | - Loader.loadPDF(documentFile).close(); |
| 180 | + Loader.loadPDF(fileBytes).close(); |
135 | 181 | opens = true; |
136 | 182 | } catch (IOException e) { |
137 | 183 | e.printStackTrace(); |
|
0 commit comments