Skip to content

Commit 276dc8a

Browse files
committed
♻️ use interfaces for PDF operations (#319)
1 parent bae7239 commit 276dc8a

17 files changed

Lines changed: 245 additions & 335 deletions

src/main/java/com/mindee/image/ImageExtractor.java

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,17 @@
44
import com.mindee.geometry.PositionDataField;
55
import com.mindee.input.InputSourceUtils;
66
import com.mindee.input.LocalInputSource;
7-
import com.mindee.pdf.PDFBoxApi;
8-
import com.mindee.pdf.PDFOperation;
9-
import com.mindee.pdf.PdfPageImage;
107
import java.awt.image.BufferedImage;
118
import java.io.ByteArrayInputStream;
129
import java.io.IOException;
1310
import java.util.ArrayList;
1411
import java.util.List;
1512
import javax.imageio.ImageIO;
13+
import org.apache.pdfbox.Loader;
14+
import org.apache.pdfbox.pdmodel.PDDocument;
15+
import org.apache.pdfbox.pdmodel.common.PDRectangle;
16+
import org.apache.pdfbox.rendering.ImageType;
17+
import org.apache.pdfbox.rendering.PDFRenderer;
1618

1719
/**
1820
* Extract sub-images from an image.
@@ -22,14 +24,14 @@ public class ImageExtractor {
2224
private final String filename;
2325
private final String saveFormat;
2426

25-
public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException {
27+
public ImageExtractor(LocalInputSource source) throws IOException {
2628
this.filename = source.getFilename();
2729
this.pageImages = new ArrayList<>();
2830

29-
if (source.isPdf()) {
31+
if (source.isPDF()) {
3032
this.saveFormat = "jpg";
31-
var pdfPageImages = pdfOperation.pdfToImages(source);
32-
for (PdfPageImage pdfPageImage : pdfPageImages) {
33+
var pdfPageImages = pdfToImages(source.getFile(), this.filename);
34+
for (PDFPageImage pdfPageImage : pdfPageImages) {
3335
this.pageImages.add(pdfPageImage.getImage());
3436
}
3537
} else {
@@ -41,14 +43,34 @@ public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws
4143
}
4244
}
4345

44-
/**
45-
* Init from a {@link LocalInputSource}.
46-
*
47-
* @param source The local source.
48-
* @throws IOException Throws if the file can't be accessed.
49-
*/
50-
public ImageExtractor(LocalInputSource source) throws IOException {
51-
this(source, new PDFBoxApi());
46+
public List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
47+
PDDocument document = Loader.loadPDF(fileBytes);
48+
var pdfRenderer = new PDFRenderer(document);
49+
List<PDFPageImage> pdfPageImages = new ArrayList<>();
50+
for (int i = 0; i < document.getNumberOfPages(); i++) {
51+
var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
52+
pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg"));
53+
}
54+
document.close();
55+
return pdfPageImages;
56+
}
57+
58+
private BufferedImage pdfPageToImageBuffer(
59+
int index,
60+
PDDocument document,
61+
PDFRenderer pdfRenderer
62+
) throws IOException {
63+
PDRectangle bbox = document.getPage(index).getBBox();
64+
float dimension = bbox.getWidth() * bbox.getHeight();
65+
int dpi;
66+
if (dimension < 200000) {
67+
dpi = 300;
68+
} else if (dimension < 300000) {
69+
dpi = 250;
70+
} else {
71+
dpi = 200;
72+
}
73+
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
5274
}
5375

5476
/**

src/main/java/com/mindee/pdf/PdfPageImage.java renamed to src/main/java/com/mindee/image/PDFPageImage.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package com.mindee.pdf;
1+
package com.mindee.image;
22

33
import com.mindee.MindeeException;
44
import com.mindee.input.InputSourceUtils;
@@ -16,13 +16,13 @@
1616
* A page in a PDF extracted as an image.
1717
*/
1818
@Getter
19-
public class PdfPageImage {
19+
public class PDFPageImage {
2020
private final BufferedImage image;
2121
private final int originalIndex;
2222
private final String saveFormat;
2323
private final String originalFilename;
2424

25-
public PdfPageImage(
25+
public PDFPageImage(
2626
BufferedImage image,
2727
int originalIndex,
2828
String originalFilename,

src/main/java/com/mindee/input/InputSourceUtils.java

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
package com.mindee.input;
22

33
import com.mindee.MindeeException;
4-
import java.io.ByteArrayInputStream;
5-
import java.io.IOException;
6-
import org.apache.pdfbox.Loader;
7-
import org.apache.pdfbox.io.RandomAccessReadBuffer;
8-
import org.apache.pdfbox.pdmodel.PDDocument;
9-
import org.apache.pdfbox.text.PDFTextStripper;
104

115
/**
126
* Utilities for working with files.
@@ -65,46 +59,4 @@ public static String[] splitNameStrict(String filename) throws MindeeException {
6559
}
6660
return new String[] { name, extension };
6761
}
68-
69-
/**
70-
* Returns true if the file is a PDF.
71-
*/
72-
public static boolean isPdf(byte[] fileBytes) {
73-
try {
74-
Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
75-
} catch (IOException e) {
76-
return false;
77-
}
78-
return true;
79-
}
80-
81-
/**
82-
* Returns true if the source PDF has source text inside. Returns false for images.
83-
*
84-
* @param fileBytes A byte array representing a PDF.
85-
* @return True if at least one character exists in one page.
86-
* @throws MindeeException if the file could not be read.
87-
*/
88-
public static boolean hasSourceText(byte[] fileBytes) {
89-
try {
90-
PDDocument document = Loader
91-
.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
92-
PDFTextStripper stripper = new PDFTextStripper();
93-
94-
for (int i = 0; i < document.getNumberOfPages(); i++) {
95-
stripper.setStartPage(i + 1);
96-
stripper.setEndPage(i + 1);
97-
String pageText = stripper.getText(document);
98-
if (!pageText.trim().isEmpty()) {
99-
document.close();
100-
return true;
101-
}
102-
}
103-
document.close();
104-
} catch (IOException e) {
105-
return false;
106-
}
107-
108-
return false;
109-
}
11062
}

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
package com.mindee.input;
22

33
import com.mindee.image.ImageCompressor;
4-
import com.mindee.pdf.PDFBoxApi;
4+
import com.mindee.pdf.PDFCompression;
55
import com.mindee.pdf.PDFCompressor;
6-
import com.mindee.pdf.PDFOperation;
6+
import com.mindee.pdf.PDFInputOperation;
7+
import com.mindee.pdf.PDFInputOperator;
78
import java.io.File;
89
import java.io.IOException;
910
import java.io.InputStream;
@@ -17,14 +18,18 @@
1718
/**
1819
* A source document for Mindee API operations.
1920
*/
20-
public final class LocalInputSource {
21+
public class LocalInputSource {
2122

2223
@Getter
2324
private byte[] file;
2425
@Getter
2526
private final String filename;
2627
@Setter
27-
private PDFOperation pdfOperation;
28+
private PDFInputOperation pdfInputOperator;
29+
@Setter
30+
private PDFCompression pdfCompressor;
31+
// Store here to avoid recalculating every time.
32+
private Boolean isPDF;
2833

2934
public LocalInputSource(InputStream file, String filename) throws IOException {
3035
this.file = IOUtils.toByteArray(file);
@@ -57,11 +62,18 @@ public LocalInputSource(String fileAsBase64, String filename) {
5762
this.filename = filename;
5863
}
5964

60-
public PDFOperation getPdfOperation() {
61-
if (this.pdfOperation == null) {
62-
this.pdfOperation = new PDFBoxApi();
65+
private PDFInputOperation getPDFInputOperator() {
66+
if (this.pdfInputOperator == null) {
67+
this.pdfInputOperator = new PDFInputOperator();
68+
}
69+
return this.pdfInputOperator;
70+
}
71+
72+
private PDFCompression getPDFCompressor() {
73+
if (this.pdfCompressor == null) {
74+
this.pdfCompressor = new PDFCompressor();
6375
}
64-
return this.pdfOperation;
76+
return this.pdfCompressor;
6577
}
6678

6779
/**
@@ -71,10 +83,10 @@ public PDFOperation getPdfOperation() {
7183
* @throws IOException If an I/O error occurs during the PDF operation.
7284
*/
7385
public int getPageCount() throws IOException {
74-
if (!this.isPdf()) {
86+
if (!this.isPDF()) {
7587
return 1;
7688
}
77-
return getPdfOperation().getNumberOfPages(this.file);
89+
return getPDFInputOperator().getPageCount(this.file);
7890
}
7991

8092
/**
@@ -84,51 +96,58 @@ public int getPageCount() throws IOException {
8496
* @throws IOException If an I/O error occurs during the PDF operation.
8597
*/
8698
public void applyPageOptions(PageOptions pageOptions) throws IOException {
87-
if (pageOptions != null && this.isPdf()) {
88-
this.file = getPdfOperation().split(this.file, pageOptions).getFile();
99+
if (pageOptions != null && this.isPDF()) {
100+
this.file = getPDFInputOperator().split(this.file, pageOptions).getFile();
89101
}
90102
}
91103

92-
public boolean isPdf() {
93-
return InputSourceUtils.isPdf(this.file);
94-
}
95-
96-
public boolean hasSourceText() {
97-
return InputSourceUtils.hasSourceText(this.file);
104+
/**
105+
* Returns true if the file is a PDF.
106+
*/
107+
public boolean isPDF() {
108+
if (this.isPDF == null) {
109+
this.isPDF = getPDFInputOperator().isPDF(this.file);
110+
}
111+
return this.isPDF;
98112
}
99113

100114
public void compress(
101-
Integer quality,
115+
int quality,
102116
Integer maxWidth,
103117
Integer maxHeight,
104118
Boolean forceSourceText,
105119
Boolean disableSourceText
106120
) throws IOException {
107-
if (isPdf()) {
108-
this.file = PDFCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText);
121+
if (isPDF()) {
122+
this.file = getPDFCompressor()
123+
.compressPDF(this.file, quality, forceSourceText, disableSourceText);
109124
} else {
110125
this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight);
111126
}
112127
}
113128

114129
public void compress(
115-
Integer quality,
130+
int quality,
116131
Integer maxWidth,
117132
Integer maxHeight,
118133
Boolean forceSourceText
119134
) throws IOException {
120135
this.compress(quality, maxWidth, maxHeight, forceSourceText, true);
121136
}
122137

123-
public void compress(Integer quality, Integer maxWidth, Integer maxHeight) throws IOException {
124-
this.compress(quality, maxWidth, maxHeight, false, true);
138+
public void compress(
139+
int quality,
140+
boolean forceSourceText,
141+
boolean disableSourceText
142+
) throws IOException {
143+
this.compress(quality, null, null, forceSourceText, disableSourceText);
125144
}
126145

127-
public void compress(Integer quality, Integer maxWidth) throws IOException {
128-
this.compress(quality, maxWidth, null, false, true);
146+
public void compress(int quality, Integer maxWidth, Integer maxHeight) throws IOException {
147+
this.compress(quality, maxWidth, maxHeight, false, true);
129148
}
130149

131-
public void compress(Integer quality) throws IOException {
150+
public void compress(int quality) throws IOException {
132151
this.compress(quality, null, null, false, true);
133152
}
134153

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import java.awt.image.BufferedImage;
77
import java.io.ByteArrayInputStream;
88
import java.io.ByteArrayOutputStream;
9-
import java.io.File;
109
import java.io.IOException;
1110
import java.util.ArrayList;
1211
import java.util.List;
@@ -33,9 +32,9 @@ public class BasePDFExtractor {
3332
* @param source The local source.
3433
* @throws IOException Throws if the file can't be accessed.
3534
*/
36-
protected BasePDFExtractor(LocalInputSource source) throws IOException {
35+
public BasePDFExtractor(LocalInputSource source) throws IOException {
3736
this.filename = source.getFilename();
38-
if (source.isPdf()) {
37+
if (source.isPDF()) {
3938
this.sourcePdf = Loader.loadPDF(source.getFile());
4039
} else {
4140
var document = new PDDocument();
@@ -57,15 +56,6 @@ protected BasePDFExtractor(LocalInputSource source) throws IOException {
5756
}
5857
}
5958

60-
/**
61-
* Get the number of pages in the PDF file.
62-
*
63-
* @return The number of pages in the PDF file.
64-
*/
65-
public int getPageCount() {
66-
return sourcePdf.getNumberOfPages();
67-
}
68-
6959
/**
7060
* Converts an array to a buffered image.
7161
*
@@ -106,10 +96,7 @@ public List<ExtractedPDF> extractSubDocuments(
10696
+ splitName[1];
10797
extractedPDFs
10898
.add(
109-
new ExtractedPDF(
110-
Loader.loadPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false)),
111-
fieldFilename
112-
)
99+
new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename)
113100
);
114101
}
115102
return extractedPDFs;
@@ -149,25 +136,7 @@ private static byte[] createPdfFromExistingPdf(
149136
return output;
150137
}
151138

152-
/**
153-
* Merge specified PDF pages together.
154-
*
155-
* @param file The PDF file.
156-
* @param pageNumbers List of page numbers to merge together.
157-
*/
158-
public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
159-
PDDocument document = Loader.loadPDF(file);
160-
return createPdfFromExistingPdf(document, pageNumbers, true);
161-
}
162-
163-
public static byte[] mergePdfPages(
164-
PDDocument document,
165-
List<Integer> pageNumbers
166-
) throws IOException {
167-
return mergePdfPages(document, pageNumbers, true);
168-
}
169-
170-
public static byte[] mergePdfPages(
139+
public byte[] mergePdfPages(
171140
PDDocument document,
172141
List<Integer> pageNumbers,
173142
boolean closeOriginal

0 commit comments

Comments
 (0)