Skip to content

Commit 1500d02

Browse files
committed
will it work?
1 parent 4dc08e0 commit 1500d02

16 files changed

Lines changed: 138 additions & 321 deletions

src/main/java/com/mindee/image/ImageExtractor.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ public ImageExtractor(LocalInputSource source) throws IOException {
2929
this.filename = source.getFilename();
3030
this.pageImages = new ArrayList<>();
3131

32-
if (source.isPdf()) {
32+
if (source.isPDF()) {
3333
this.saveFormat = "jpg";
3434
var pdfPageImages = pdfToImages(source.getFile(), this.filename);
3535
for (PdfPageImage pdfPageImage : pdfPageImages) {

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 26 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,8 @@
33
import com.mindee.image.ImageCompressor;
44
import com.mindee.pdf.PDFCompression;
55
import com.mindee.pdf.PDFCompressor;
6-
import com.mindee.pdf.PDFInputSource;
7-
import com.mindee.pdf.PDFInputSourcer;
6+
import com.mindee.pdf.PDFInputOperation;
7+
import com.mindee.pdf.PDFInputOperator;
88
import java.io.File;
99
import java.io.IOException;
1010
import java.io.InputStream;
@@ -18,16 +18,18 @@
1818
/**
1919
* A source document for Mindee API operations.
2020
*/
21-
public final class LocalInputSource {
21+
public class LocalInputSource {
2222

2323
@Getter
2424
private byte[] file;
2525
@Getter
2626
private final String filename;
2727
@Setter
28-
private PDFInputSource pdfOperation;
28+
private PDFInputOperation pdfInputOperator;
2929
@Setter
30-
private PDFCompressor pdfCompressor;
30+
private PDFCompression pdfCompressor;
31+
// Store here to avoid recalculating every time.
32+
private Boolean isPDF;
3133

3234
public LocalInputSource(InputStream file, String filename) throws IOException {
3335
this.file = IOUtils.toByteArray(file);
@@ -60,14 +62,14 @@ public LocalInputSource(String fileAsBase64, String filename) {
6062
this.filename = filename;
6163
}
6264

63-
public PDFInputSource getPdfOperation() {
64-
if (this.pdfOperation == null) {
65-
this.pdfOperation = new PDFInputSourcer();
65+
private PDFInputOperation getPdfInputOperator() {
66+
if (this.pdfInputOperator == null) {
67+
this.pdfInputOperator = new PDFInputOperator();
6668
}
67-
return this.pdfOperation;
69+
return this.pdfInputOperator;
6870
}
6971

70-
public PDFCompression getPdfCompressor() {
72+
private PDFCompression getPdfCompressor() {
7173
if (this.pdfCompressor == null) {
7274
this.pdfCompressor = new PDFCompressor();
7375
}
@@ -81,10 +83,10 @@ public PDFCompression getPdfCompressor() {
8183
* @throws IOException If an I/O error occurs during the PDF operation.
8284
*/
8385
public int getPageCount() throws IOException {
84-
if (!this.isPdf()) {
86+
if (!this.isPDF()) {
8587
return 1;
8688
}
87-
return getPdfOperation().getNumberOfPages(this.file);
89+
return getPdfInputOperator().getPageCount(this.file);
8890
}
8991

9092
/**
@@ -94,17 +96,19 @@ public int getPageCount() throws IOException {
9496
* @throws IOException If an I/O error occurs during the PDF operation.
9597
*/
9698
public void applyPageOptions(PageOptions pageOptions) throws IOException {
97-
if (pageOptions != null && this.isPdf()) {
98-
this.file = getPdfOperation().split(this.file, pageOptions).getFile();
99+
if (pageOptions != null && this.isPDF()) {
100+
this.file = getPdfInputOperator().split(this.file, pageOptions).getFile();
99101
}
100102
}
101103

102-
public boolean isPdf() {
103-
return getPdfOperation().isPdf(this.file);
104-
}
105-
106-
public boolean hasSourceText() {
107-
return getPdfOperation().hasSourceText(this.file);
104+
/**
105+
* Returns true if the file is a PDF.
106+
*/
107+
public boolean isPDF() {
108+
if (this.isPDF == null) {
109+
this.isPDF = getPdfInputOperator().isPDF(this.file);
110+
}
111+
return this.isPDF;
108112
}
109113

110114
public LocalInputSource compress(
@@ -114,9 +118,9 @@ public LocalInputSource compress(
114118
Boolean forceSourceText,
115119
Boolean disableSourceText
116120
) throws IOException {
117-
if (isPdf()) {
121+
if (isPDF()) {
118122
this.file = getPdfCompressor()
119-
.compressPdf(this.file, quality, forceSourceText, disableSourceText);
123+
.compressPDF(this.file, quality, forceSourceText, disableSourceText);
120124
} else {
121125
this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight);
122126
}

src/main/java/com/mindee/pdf/PDFExtractorBase.java renamed to src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 3 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
import java.awt.image.BufferedImage;
77
import java.io.ByteArrayInputStream;
88
import java.io.ByteArrayOutputStream;
9-
import java.io.File;
109
import java.io.IOException;
1110
import java.util.ArrayList;
1211
import java.util.List;
@@ -17,16 +16,13 @@
1716
import org.apache.pdfbox.pdmodel.PDDocument;
1817
import org.apache.pdfbox.pdmodel.PDPage;
1918
import org.apache.pdfbox.pdmodel.PDPageContentStream;
20-
import org.apache.pdfbox.pdmodel.common.PDRectangle;
2119
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
2220
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
23-
import org.apache.pdfbox.rendering.ImageType;
24-
import org.apache.pdfbox.rendering.PDFRenderer;
2521

2622
/**
2723
* PDF extraction class.
2824
*/
29-
public class PDFExtractorBase implements PDFExtraction {
25+
public class BasePDFExtractor {
3026
protected final PDDocument sourcePdf;
3127
protected final String filename;
3228

@@ -36,9 +32,9 @@ public class PDFExtractorBase implements PDFExtraction {
3632
* @param source The local source.
3733
* @throws IOException Throws if the file can't be accessed.
3834
*/
39-
public PDFExtractorBase(LocalInputSource source) throws IOException {
35+
public BasePDFExtractor(LocalInputSource source) throws IOException {
4036
this.filename = source.getFilename();
41-
if (source.isPdf()) {
37+
if (source.isPDF()) {
4238
this.sourcePdf = Loader.loadPDF(source.getFile());
4339
} else {
4440
var document = new PDDocument();
@@ -60,60 +56,6 @@ public PDFExtractorBase(LocalInputSource source) throws IOException {
6056
}
6157
}
6258

63-
// @Override
64-
// public PdfPageImage pdfPageToImage(
65-
// byte[] fileBytes,
66-
// String filename,
67-
// int pageNumber
68-
// ) throws IOException {
69-
// int index = pageNumber - 1;
70-
// PDDocument document = Loader.loadPDF(fileBytes);
71-
// var pdfRenderer = new PDFRenderer(document);
72-
// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
73-
// document.close();
74-
// return new PdfPageImage(imageBuffer, index, filename, "jpg");
75-
// }
76-
//
77-
// @Override
78-
// public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
79-
// PDDocument document = Loader.loadPDF(fileBytes);
80-
// var pdfRenderer = new PDFRenderer(document);
81-
// List<PdfPageImage> pdfPageImages = new ArrayList<>();
82-
// for (int i = 0; i < document.getNumberOfPages(); i++) {
83-
// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
84-
// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
85-
// }
86-
// document.close();
87-
// return pdfPageImages;
88-
// }
89-
90-
private BufferedImage pdfPageToImageBuffer(
91-
int index,
92-
PDDocument document,
93-
PDFRenderer pdfRenderer
94-
) throws IOException {
95-
PDRectangle bbox = document.getPage(index).getBBox();
96-
float dimension = bbox.getWidth() * bbox.getHeight();
97-
int dpi;
98-
if (dimension < 200000) {
99-
dpi = 300;
100-
} else if (dimension < 300000) {
101-
dpi = 250;
102-
} else {
103-
dpi = 200;
104-
}
105-
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
106-
}
107-
108-
/**
109-
* Get the number of pages in the PDF file.
110-
*
111-
* @return The number of pages in the PDF file.
112-
*/
113-
public int getPageCount() {
114-
return sourcePdf.getNumberOfPages();
115-
}
116-
11759
/**
11860
* Converts an array to a buffered image.
11961
*
@@ -197,24 +139,6 @@ private static byte[] createPdfFromExistingPdf(
197139
return output;
198140
}
199141

200-
/**
201-
* Merge specified PDF pages together.
202-
*
203-
* @param file The PDF file.
204-
* @param pageNumbers Lit of page numbers to merge together.
205-
*/
206-
@Override
207-
public byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
208-
PDDocument document = Loader.loadPDF(file);
209-
return mergePdfPages(document, pageNumbers, true);
210-
}
211-
212-
@Override
213-
public byte[] mergePdfPages(PDDocument document, List<Integer> pageNumbers) throws IOException {
214-
return mergePdfPages(document, pageNumbers, true);
215-
}
216-
217-
@Override
218142
public byte[] mergePdfPages(
219143
PDDocument document,
220144
List<Integer> pageNumbers,

src/main/java/com/mindee/pdf/PDFCompression.java

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3,26 +3,26 @@
33
import java.io.IOException;
44

55
public interface PDFCompression {
6-
byte[] compressPdf(
7-
byte[] pdfData,
6+
byte[] compressPDF(
7+
byte[] fileBytes,
88
Integer imageQuality,
99
Boolean forceSourceTextCompression,
1010
Boolean disableSourceText
1111
) throws IOException;
1212

13-
default byte[] compressPdf(
14-
byte[] pdfData,
13+
default byte[] compressPDF(
14+
byte[] fileBytes,
1515
Integer imageQuality,
1616
Boolean forceSourceTextCompression
1717
) throws IOException {
18-
return compressPdf(pdfData, imageQuality, forceSourceTextCompression, true);
18+
return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true);
1919
}
2020

21-
default byte[] compressPdf(byte[] pdfData, Integer imageQuality) throws IOException {
22-
return compressPdf(pdfData, imageQuality, false, true);
21+
default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException {
22+
return compressPDF(fileBytes, imageQuality, false, true);
2323
}
2424

25-
default byte[] compressPdf(byte[] pdfData) throws IOException {
26-
return compressPdf(pdfData, 85, false, true);
25+
default byte[] compressPDF(byte[] fileBytes) throws IOException {
26+
return compressPDF(fileBytes, 85, false, true);
2727
}
2828
}

src/main/java/com/mindee/pdf/PDFCompressor.java

Lines changed: 43 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,14 @@
11
package com.mindee.pdf;
22

3+
import com.mindee.MindeeException;
34
import java.awt.*;
45
import java.awt.image.BufferedImage;
6+
import java.io.ByteArrayInputStream;
57
import java.io.ByteArrayOutputStream;
68
import java.io.IOException;
79
import java.util.List;
810
import org.apache.pdfbox.Loader;
11+
import org.apache.pdfbox.io.RandomAccessReadBuffer;
912
import org.apache.pdfbox.pdmodel.PDDocument;
1013
import org.apache.pdfbox.pdmodel.PDPage;
1114
import org.apache.pdfbox.pdmodel.PDPageContentStream;
@@ -24,21 +27,21 @@
2427
* PDF compression class.
2528
*/
2629
public class PDFCompressor implements PDFCompression {
27-
PDFInputSourcer pdfInputSourcer;
30+
private final PDFInputOperator pdfInputOperator;
2831

2932
public PDFCompressor() {
30-
pdfInputSourcer = new PDFInputSourcer();
33+
this.pdfInputOperator = new PDFInputOperator();
3134
}
3235

3336
@Override
34-
public byte[] compressPdf(
35-
byte[] pdfData,
37+
public byte[] compressPDF(
38+
byte[] fileBytes,
3639
Integer imageQuality,
3740
Boolean forceSourceTextCompression,
3841
Boolean disableSourceText
3942
) throws IOException {
40-
if (!pdfInputSourcer.isPdf(pdfData)) {
41-
return pdfData;
43+
if (!pdfInputOperator.isPDF(fileBytes)) {
44+
return fileBytes;
4245
}
4346

4447
if (forceSourceTextCompression == null) {
@@ -47,14 +50,14 @@ public byte[] compressPdf(
4750
if (disableSourceText == null) {
4851
disableSourceText = true;
4952
}
50-
if (!forceSourceTextCompression && pdfInputSourcer.hasSourceText(pdfData)) {
53+
if (!forceSourceTextCompression && hasSourceText(fileBytes)) {
5154
System.out
5255
.println(
5356
"MINDEE WARNING: Found text inside of the provided PDF file. Compression operation aborted."
5457
);
55-
return pdfData;
58+
return fileBytes;
5659
}
57-
try (PDDocument inputDoc = Loader.loadPDF(pdfData); PDDocument outputDoc = new PDDocument()) {
60+
try (var inputDoc = Loader.loadPDF(fileBytes); PDDocument outputDoc = new PDDocument()) {
5861

5962
var pdfRenderer = new PDFRenderer(inputDoc);
6063

@@ -79,6 +82,35 @@ public byte[] compressPdf(
7982
}
8083
}
8184

85+
/**
86+
* Returns true if the source PDF has source text inside. Returns false for images.
87+
*
88+
* @param fileBytes A byte array representing a PDF.
89+
* @return True if at least one character exists in one page.
90+
* @throws MindeeException if the file could not be read.
91+
*/
92+
private boolean hasSourceText(byte[] fileBytes) {
93+
try {
94+
PDDocument document = Loader
95+
.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
96+
var stripper = new PDFTextStripper();
97+
98+
for (int i = 0; i < document.getNumberOfPages(); i++) {
99+
stripper.setStartPage(i + 1);
100+
stripper.setEndPage(i + 1);
101+
String pageText = stripper.getText(document);
102+
if (!pageText.trim().isEmpty()) {
103+
document.close();
104+
return true;
105+
}
106+
}
107+
document.close();
108+
} catch (IOException e) {
109+
return false;
110+
}
111+
return false;
112+
}
113+
82114
private static byte[] documentToBytes(PDDocument document) throws IOException {
83115
var outputStream = new ByteArrayOutputStream();
84116
document.save(outputStream);
@@ -122,9 +154,9 @@ protected void writeString(String text, List<TextPosition> textPositions) throws
122154
return;
123155
}
124156

125-
TextPosition firstPosition = textPositions.get(0);
157+
var firstPosition = textPositions.get(0);
126158
float fontSize = firstPosition.getFontSizeInPt();
127-
PDColor color = getGraphicsState().getNonStrokingColor();
159+
var color = getGraphicsState().getNonStrokingColor();
128160
contentStream.beginText();
129161
contentStream.setFont(firstPosition.getFont(), fontSize);
130162
contentStream.setNonStrokingColor(convertToAwtColor(color));

0 commit comments

Comments
 (0)