Skip to content

Commit 4013379

Browse files
committed
will it work?
1 parent 4dc08e0 commit 4013379

14 files changed

Lines changed: 87 additions & 278 deletions

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import com.mindee.image.ImageCompressor;
44
import com.mindee.pdf.PDFCompression;
55
import com.mindee.pdf.PDFCompressor;
6-
import com.mindee.pdf.PDFInputSource;
7-
import com.mindee.pdf.PDFInputSourcer;
6+
import com.mindee.pdf.PDFInputOperation;
7+
import com.mindee.pdf.PDFInputOperator;
88
import java.io.File;
99
import java.io.IOException;
1010
import java.io.InputStream;
@@ -18,14 +18,14 @@
1818
/**
1919
* A source document for Mindee API operations.
2020
*/
21-
public final class LocalInputSource {
21+
public class LocalInputSource {
2222

2323
@Getter
2424
private byte[] file;
2525
@Getter
2626
private final String filename;
2727
@Setter
28-
private PDFInputSource pdfOperation;
28+
private PDFInputOperation pdfInputOperation;
2929
@Setter
3030
private PDFCompressor pdfCompressor;
3131

@@ -60,14 +60,14 @@ public LocalInputSource(String fileAsBase64, String filename) {
6060
this.filename = filename;
6161
}
6262

63-
public PDFInputSource getPdfOperation() {
64-
if (this.pdfOperation == null) {
65-
this.pdfOperation = new PDFInputSourcer();
63+
private PDFInputOperation getPdfInputOperation() {
64+
if (this.pdfInputOperation == null) {
65+
this.pdfInputOperation = new PDFInputOperator();
6666
}
67-
return this.pdfOperation;
67+
return this.pdfInputOperation;
6868
}
6969

70-
public PDFCompression getPdfCompressor() {
70+
private PDFCompression getPdfCompressor() {
7171
if (this.pdfCompressor == null) {
7272
this.pdfCompressor = new PDFCompressor();
7373
}
@@ -84,7 +84,7 @@ public int getPageCount() throws IOException {
8484
if (!this.isPdf()) {
8585
return 1;
8686
}
87-
return getPdfOperation().getNumberOfPages(this.file);
87+
return getPdfInputOperation().getNumberOfPages(this.file);
8888
}
8989

9090
/**
@@ -95,16 +95,12 @@ public int getPageCount() throws IOException {
9595
*/
9696
public void applyPageOptions(PageOptions pageOptions) throws IOException {
9797
if (pageOptions != null && this.isPdf()) {
98-
this.file = getPdfOperation().split(this.file, pageOptions).getFile();
98+
this.file = getPdfInputOperation().split(this.file, pageOptions).getFile();
9999
}
100100
}
101101

102102
public boolean isPdf() {
103-
return getPdfOperation().isPdf(this.file);
104-
}
105-
106-
public boolean hasSourceText() {
107-
return getPdfOperation().hasSourceText(this.file);
103+
return getPdfInputOperation().isPDF(this.file);
108104
}
109105

110106
public LocalInputSource compress(
@@ -116,7 +112,7 @@ public LocalInputSource compress(
116112
) throws IOException {
117113
if (isPdf()) {
118114
this.file = getPdfCompressor()
119-
.compressPdf(this.file, quality, forceSourceText, disableSourceText);
115+
.compressPDF(this.file, quality, forceSourceText, disableSourceText);
120116
} else {
121117
this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight);
122118
}

src/main/java/com/mindee/pdf/PDFExtractorBase.java renamed to src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 2 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import java.awt.image.BufferedImage;
77
import java.io.ByteArrayInputStream;
88
import java.io.ByteArrayOutputStream;
9-
import java.io.File;
109
import java.io.IOException;
1110
import java.util.ArrayList;
1211
import java.util.List;
@@ -17,16 +16,13 @@
1716
import org.apache.pdfbox.pdmodel.PDDocument;
1817
import org.apache.pdfbox.pdmodel.PDPage;
1918
import org.apache.pdfbox.pdmodel.PDPageContentStream;
20-
import org.apache.pdfbox.pdmodel.common.PDRectangle;
2119
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
2220
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
23-
import org.apache.pdfbox.rendering.ImageType;
24-
import org.apache.pdfbox.rendering.PDFRenderer;
2521

2622
/**
2723
* PDF extraction class.
2824
*/
29-
public class PDFExtractorBase implements PDFExtraction {
25+
public class BasePDFExtractor {
3026
protected final PDDocument sourcePdf;
3127
protected final String filename;
3228

@@ -36,7 +32,7 @@ public class PDFExtractorBase implements PDFExtraction {
3632
* @param source The local source.
3733
* @throws IOException Throws if the file can't be accessed.
3834
*/
39-
public PDFExtractorBase(LocalInputSource source) throws IOException {
35+
public BasePDFExtractor(LocalInputSource source) throws IOException {
4036
this.filename = source.getFilename();
4137
if (source.isPdf()) {
4238
this.sourcePdf = Loader.loadPDF(source.getFile());
@@ -60,60 +56,6 @@ public PDFExtractorBase(LocalInputSource source) throws IOException {
6056
}
6157
}
6258

63-
// @Override
64-
// public PdfPageImage pdfPageToImage(
65-
// byte[] fileBytes,
66-
// String filename,
67-
// int pageNumber
68-
// ) throws IOException {
69-
// int index = pageNumber - 1;
70-
// PDDocument document = Loader.loadPDF(fileBytes);
71-
// var pdfRenderer = new PDFRenderer(document);
72-
// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
73-
// document.close();
74-
// return new PdfPageImage(imageBuffer, index, filename, "jpg");
75-
// }
76-
//
77-
// @Override
78-
// public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
79-
// PDDocument document = Loader.loadPDF(fileBytes);
80-
// var pdfRenderer = new PDFRenderer(document);
81-
// List<PdfPageImage> pdfPageImages = new ArrayList<>();
82-
// for (int i = 0; i < document.getNumberOfPages(); i++) {
83-
// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
84-
// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
85-
// }
86-
// document.close();
87-
// return pdfPageImages;
88-
// }
89-
90-
private BufferedImage pdfPageToImageBuffer(
91-
int index,
92-
PDDocument document,
93-
PDFRenderer pdfRenderer
94-
) throws IOException {
95-
PDRectangle bbox = document.getPage(index).getBBox();
96-
float dimension = bbox.getWidth() * bbox.getHeight();
97-
int dpi;
98-
if (dimension < 200000) {
99-
dpi = 300;
100-
} else if (dimension < 300000) {
101-
dpi = 250;
102-
} else {
103-
dpi = 200;
104-
}
105-
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
106-
}
107-
108-
/**
109-
* Get the number of pages in the PDF file.
110-
*
111-
* @return The number of pages in the PDF file.
112-
*/
113-
public int getPageCount() {
114-
return sourcePdf.getNumberOfPages();
115-
}
116-
11759
/**
11860
* Converts an array to a buffered image.
11961
*
@@ -197,24 +139,6 @@ private static byte[] createPdfFromExistingPdf(
197139
return output;
198140
}
199141

200-
/**
201-
* Merge specified PDF pages together.
202-
*
203-
* @param file The PDF file.
204-
* @param pageNumbers List of page numbers to merge together.
205-
*/
206-
@Override
207-
public byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
208-
PDDocument document = Loader.loadPDF(file);
209-
return mergePdfPages(document, pageNumbers, true);
210-
}
211-
212-
@Override
213-
public byte[] mergePdfPages(PDDocument document, List<Integer> pageNumbers) throws IOException {
214-
return mergePdfPages(document, pageNumbers, true);
215-
}
216-
217-
@Override
218142
public byte[] mergePdfPages(
219143
PDDocument document,
220144
List<Integer> pageNumbers,

src/main/java/com/mindee/pdf/PDFCompression.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,26 @@
33
import java.io.IOException;
44

55
public interface PDFCompression {
6-
byte[] compressPdf(
7-
byte[] pdfData,
6+
byte[] compressPDF(
7+
byte[] fileBytes,
88
Integer imageQuality,
99
Boolean forceSourceTextCompression,
1010
Boolean disableSourceText
1111
) throws IOException;
1212

13-
default byte[] compressPdf(
14-
byte[] pdfData,
13+
default byte[] compressPDF(
14+
byte[] fileBytes,
1515
Integer imageQuality,
1616
Boolean forceSourceTextCompression
1717
) throws IOException {
18-
return compressPdf(pdfData, imageQuality, forceSourceTextCompression, true);
18+
return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true);
1919
}
2020

21-
default byte[] compressPdf(byte[] pdfData, Integer imageQuality) throws IOException {
22-
return compressPdf(pdfData, imageQuality, false, true);
21+
default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException {
22+
return compressPDF(fileBytes, imageQuality, false, true);
2323
}
2424

25-
default byte[] compressPdf(byte[] pdfData) throws IOException {
26-
return compressPdf(pdfData, 85, false, true);
25+
default byte[] compressPDF(byte[] fileBytes) throws IOException {
26+
return compressPDF(fileBytes, 85, false, true);
2727
}
2828
}

src/main/java/com/mindee/pdf/PDFCompressor.java

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
package com.mindee.pdf;
22

3+
import com.mindee.MindeeException;
34
import java.awt.*;
45
import java.awt.image.BufferedImage;
6+
import java.io.ByteArrayInputStream;
57
import java.io.ByteArrayOutputStream;
68
import java.io.IOException;
79
import java.util.List;
810
import org.apache.pdfbox.Loader;
11+
import org.apache.pdfbox.io.RandomAccessReadBuffer;
912
import org.apache.pdfbox.pdmodel.PDDocument;
1013
import org.apache.pdfbox.pdmodel.PDPage;
1114
import org.apache.pdfbox.pdmodel.PDPageContentStream;
@@ -24,21 +27,21 @@
2427
* PDF compression class.
2528
*/
2629
public class PDFCompressor implements PDFCompression {
27-
PDFInputSourcer pdfInputSourcer;
30+
PDFInputOperator pdfInputOperator;
2831

2932
public PDFCompressor() {
30-
pdfInputSourcer = new PDFInputSourcer();
33+
pdfInputOperator = new PDFInputOperator();
3134
}
3235

3336
@Override
34-
public byte[] compressPdf(
35-
byte[] pdfData,
37+
public byte[] compressPDF(
38+
byte[] fileBytes,
3639
Integer imageQuality,
3740
Boolean forceSourceTextCompression,
3841
Boolean disableSourceText
3942
) throws IOException {
40-
if (!pdfInputSourcer.isPdf(pdfData)) {
41-
return pdfData;
43+
if (!pdfInputOperator.isPDF(fileBytes)) {
44+
return fileBytes;
4245
}
4346

4447
if (forceSourceTextCompression == null) {
@@ -47,14 +50,14 @@ public byte[] compressPdf(
4750
if (disableSourceText == null) {
4851
disableSourceText = true;
4952
}
50-
if (!forceSourceTextCompression && pdfInputSourcer.hasSourceText(pdfData)) {
53+
if (!forceSourceTextCompression && hasSourceText(fileBytes)) {
5154
System.out
5255
.println(
5356
"MINDEE WARNING: Found text inside of the provided PDF file. Compression operation aborted."
5457
);
55-
return pdfData;
58+
return fileBytes;
5659
}
57-
try (PDDocument inputDoc = Loader.loadPDF(pdfData); PDDocument outputDoc = new PDDocument()) {
60+
try (PDDocument inputDoc = Loader.loadPDF(fileBytes); PDDocument outputDoc = new PDDocument()) {
5861

5962
var pdfRenderer = new PDFRenderer(inputDoc);
6063

@@ -79,6 +82,35 @@ public byte[] compressPdf(
7982
}
8083
}
8184

85+
/**
86+
* Returns true if the source PDF has source text inside. Returns false for images.
87+
*
88+
* @param fileBytes A byte array representing a PDF.
89+
* @return True if at least one character exists in one page.
90+
* @throws MindeeException declared here, but as written a read failure (IOException) is caught and reported as false rather than thrown.
91+
*/
92+
private boolean hasSourceText(byte[] fileBytes) {
93+
try {
94+
PDDocument document = Loader
95+
.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
96+
PDFTextStripper stripper = new PDFTextStripper();
97+
98+
for (int i = 0; i < document.getNumberOfPages(); i++) {
99+
stripper.setStartPage(i + 1);
100+
stripper.setEndPage(i + 1);
101+
String pageText = stripper.getText(document);
102+
if (!pageText.trim().isEmpty()) {
103+
document.close();
104+
return true;
105+
}
106+
}
107+
document.close();
108+
} catch (IOException e) {
109+
return false;
110+
}
111+
return false;
112+
}
113+
82114
private static byte[] documentToBytes(PDDocument document) throws IOException {
83115
var outputStream = new ByteArrayOutputStream();
84116
document.save(outputStream);

src/main/java/com/mindee/pdf/PDFExtraction.java

Lines changed: 0 additions & 38 deletions
This file was deleted.

0 commit comments

Comments
 (0)