Skip to content

Commit bbe4ace

Browse files
committed
again with the PDF interfaces
1 parent 1bf5df7 commit bbe4ace

9 files changed

Lines changed: 174 additions & 158 deletions

File tree

src/main/java/com/mindee/image/ImageExtractor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import com.mindee.input.InputSourceUtils;
66
import com.mindee.input.LocalInputSource;
77
import com.mindee.pdf.PDFBoxApi;
8-
import com.mindee.pdf.PDFOperation;
8+
import com.mindee.pdf.ExtractionPDFOperation;
99
import com.mindee.pdf.PdfPageImage;
1010
import java.awt.image.BufferedImage;
1111
import java.io.ByteArrayInputStream;
@@ -22,7 +22,7 @@ public class ImageExtractor {
2222
private final String filename;
2323
private final String saveFormat;
2424

25-
public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException {
25+
public ImageExtractor(LocalInputSource source, ExtractionPDFOperation pdfOperation) throws IOException {
2626
this.filename = source.getFilename();
2727
this.pageImages = new ArrayList<>();
2828

src/main/java/com/mindee/input/InputSourceUtils.java

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
package com.mindee.input;
22

33
import com.mindee.MindeeException;
4-
import java.io.ByteArrayInputStream;
5-
import java.io.IOException;
6-
import org.apache.pdfbox.Loader;
7-
import org.apache.pdfbox.io.RandomAccessReadBuffer;
8-
import org.apache.pdfbox.pdmodel.PDDocument;
9-
import org.apache.pdfbox.text.PDFTextStripper;
104

115
/**
126
* Utilities for working with files.
@@ -65,46 +59,4 @@ public static String[] splitNameStrict(String filename) throws MindeeException {
6559
}
6660
return new String[] { name, extension };
6761
}
68-
69-
/**
70-
* Returns true if the file is a PDF.
71-
*/
72-
public static boolean isPdf(byte[] fileBytes) {
73-
try {
74-
Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
75-
} catch (IOException e) {
76-
return false;
77-
}
78-
return true;
79-
}
80-
81-
/**
82-
* Returns true if the source PDF has source text inside. Returns false for images.
83-
*
84-
* @param fileBytes A byte array representing a PDF.
85-
* @return True if at least one character exists in one page.
86-
* @throws MindeeException if the file could not be read.
87-
*/
88-
public static boolean hasSourceText(byte[] fileBytes) {
89-
try {
90-
PDDocument document = Loader
91-
.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
92-
PDFTextStripper stripper = new PDFTextStripper();
93-
94-
for (int i = 0; i < document.getNumberOfPages(); i++) {
95-
stripper.setStartPage(i + 1);
96-
stripper.setEndPage(i + 1);
97-
String pageText = stripper.getText(document);
98-
if (!pageText.trim().isEmpty()) {
99-
document.close();
100-
return true;
101-
}
102-
}
103-
document.close();
104-
} catch (IOException e) {
105-
return false;
106-
}
107-
108-
return false;
109-
}
11062
}

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package com.mindee.input;
22

33
import com.mindee.image.ImageCompressor;
4+
import com.mindee.pdf.InputSourcePDFOperation;
45
import com.mindee.pdf.PDFBoxApi;
56
import com.mindee.pdf.PDFCompressor;
6-
import com.mindee.pdf.PDFOperation;
77
import java.io.File;
88
import java.io.IOException;
99
import java.io.InputStream;
@@ -24,7 +24,7 @@ public final class LocalInputSource {
2424
@Getter
2525
private final String filename;
2626
@Setter
27-
private PDFOperation pdfOperation;
27+
private InputSourcePDFOperation pdfOperation;
2828

2929
public LocalInputSource(InputStream file, String filename) throws IOException {
3030
this.file = IOUtils.toByteArray(file);
@@ -57,7 +57,7 @@ public LocalInputSource(String fileAsBase64, String filename) {
5757
this.filename = filename;
5858
}
5959

60-
public PDFOperation getPdfOperation() {
60+
public InputSourcePDFOperation getPdfOperation() {
6161
if (this.pdfOperation == null) {
6262
this.pdfOperation = new PDFBoxApi();
6363
}
@@ -90,11 +90,11 @@ public void applyPageOptions(PageOptions pageOptions) throws IOException {
9090
}
9191

9292
public boolean isPdf() {
93-
return InputSourceUtils.isPdf(this.file);
93+
return getPdfOperation().isPdf(this.file);
9494
}
9595

9696
public boolean hasSourceText() {
97-
return InputSourceUtils.hasSourceText(this.file);
97+
return getPdfOperation().hasSourceText(this.file);
9898
}
9999

100100
public void compress(

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ private static byte[] createPdfFromExistingPdf(
157157
*/
158158
public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
159159
PDDocument document = Loader.loadPDF(file);
160-
return createPdfFromExistingPdf(document, pageNumbers, true);
160+
return mergePdfPages(document, pageNumbers, true);
161161
}
162162

163163
public static byte[] mergePdfPages(
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.mindee.pdf;
2+
3+
import com.mindee.input.LocalInputSource;
4+
5+
import java.io.IOException;
6+
import java.util.List;
7+
8+
public interface ExtractionPDFOperation {
9+
// /**
10+
// * Render a single page of a PDF as an image.
11+
// */
12+
// PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException;
13+
//
14+
// default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException {
15+
// return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber);
16+
// }
17+
18+
/**
19+
* Render all pages of a PDF as images.
20+
*/
21+
List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException;
22+
23+
default List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
24+
return pdfToImages(source.getFile(), source.getFilename());
25+
}
26+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package com.mindee.pdf;
2+
3+
import com.mindee.MindeeException;
4+
import com.mindee.input.LocalInputSource;
5+
import com.mindee.input.PageOptions;
6+
import java.io.IOException;
7+
8+
public interface InputSourcePDFOperation {
9+
10+
/**
11+
* Split a PDF file.
12+
*/
13+
SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException;
14+
15+
/**
16+
* Get the number of pages in a PDF file.
17+
*/
18+
int getNumberOfPages(byte[] fileBytes) throws IOException;
19+
20+
default int getNumberOfPages(LocalInputSource inputSource) throws IOException {
21+
return getNumberOfPages(inputSource.getFile());
22+
}
23+
24+
/**
25+
* Returns true if the file is a PDF.
26+
*/
27+
boolean isPdf(byte[] fileBytes);
28+
29+
/**
30+
* Returns true if the source PDF has source text inside. Returns false for images.
31+
*
32+
* @param fileBytes A byte array representing a PDF.
33+
* @return True if at least one character exists in one page.
34+
* @throws MindeeException if the file could not be read.
35+
*/
36+
boolean hasSourceText(byte[] fileBytes);
37+
}

src/main/java/com/mindee/pdf/PDFBoxApi.java

Lines changed: 69 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import com.mindee.MindeeException;
44
import com.mindee.input.PageOptions;
55
import java.awt.image.BufferedImage;
6+
import java.io.ByteArrayInputStream;
67
import java.io.ByteArrayOutputStream;
78
import java.io.IOException;
89
import java.util.ArrayList;
@@ -13,15 +14,17 @@
1314
import java.util.stream.Collectors;
1415
import java.util.stream.IntStream;
1516
import org.apache.pdfbox.Loader;
17+
import org.apache.pdfbox.io.RandomAccessReadBuffer;
1618
import org.apache.pdfbox.pdmodel.PDDocument;
1719
import org.apache.pdfbox.pdmodel.common.PDRectangle;
1820
import org.apache.pdfbox.rendering.ImageType;
1921
import org.apache.pdfbox.rendering.PDFRenderer;
22+
import org.apache.pdfbox.text.PDFTextStripper;
2023

2124
/**
2225
* Allows performing various operations on PDFs.
2326
*/
24-
public final class PDFBoxApi implements PDFOperation {
27+
public final class PDFBoxApi implements InputSourcePDFOperation {
2528

2629
@Override
2730
public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException {
@@ -61,33 +64,76 @@ public int getNumberOfPages(byte[] fileBytes) throws IOException {
6164
return pageCount;
6265
}
6366

67+
/**
68+
* Returns true if the bytes can be loaded as a PDF document, false if loading fails.
69+
*/
6470
@Override
65-
public PdfPageImage pdfPageToImage(
66-
byte[] fileBytes,
67-
String filename,
68-
int pageNumber
69-
) throws IOException {
70-
int index = pageNumber - 1;
71-
PDDocument document = Loader.loadPDF(fileBytes);
72-
var pdfRenderer = new PDFRenderer(document);
73-
BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
74-
document.close();
75-
return new PdfPageImage(imageBuffer, index, filename, "jpg");
71+
public boolean isPdf(byte[] fileBytes) {
72+
try {
73+
Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
74+
} catch (IOException e) {
75+
return false;
76+
}
77+
return true;
7678
}
7779

80+
/**
81+
* Returns true if the source PDF has source text inside. Returns false for images.
82+
*
83+
* @param fileBytes A byte array representing a PDF.
84+
* @return True if at least one character exists in one page.
85+
* False when no page has text or when the bytes cannot be read as a PDF (the IOException is caught).
86+
*/
7887
@Override
79-
public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
80-
PDDocument document = Loader.loadPDF(fileBytes);
81-
var pdfRenderer = new PDFRenderer(document);
82-
List<PdfPageImage> pdfPageImages = new ArrayList<>();
83-
for (int i = 0; i < document.getNumberOfPages(); i++) {
84-
var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
85-
pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
88+
public boolean hasSourceText(byte[] fileBytes) {
89+
try {
90+
PDDocument document = Loader
91+
.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
92+
PDFTextStripper stripper = new PDFTextStripper();
93+
94+
for (int i = 0; i < document.getNumberOfPages(); i++) {
95+
stripper.setStartPage(i + 1);
96+
stripper.setEndPage(i + 1);
97+
String pageText = stripper.getText(document);
98+
if (!pageText.trim().isEmpty()) {
99+
document.close();
100+
return true;
101+
}
102+
}
103+
document.close();
104+
} catch (IOException e) {
105+
return false;
86106
}
87-
document.close();
88-
return pdfPageImages;
107+
return false;
89108
}
90109

110+
// @Override
111+
// public PdfPageImage pdfPageToImage(
112+
// byte[] fileBytes,
113+
// String filename,
114+
// int pageNumber
115+
// ) throws IOException {
116+
// int index = pageNumber - 1;
117+
// PDDocument document = Loader.loadPDF(fileBytes);
118+
// var pdfRenderer = new PDFRenderer(document);
119+
// BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
120+
// document.close();
121+
// return new PdfPageImage(imageBuffer, index, filename, "jpg");
122+
// }
123+
124+
// @Override
125+
// public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
126+
// PDDocument document = Loader.loadPDF(fileBytes);
127+
// var pdfRenderer = new PDFRenderer(document);
128+
// List<PdfPageImage> pdfPageImages = new ArrayList<>();
129+
// for (int i = 0; i < document.getNumberOfPages(); i++) {
130+
// var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
131+
// pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
132+
// }
133+
// document.close();
134+
// return pdfPageImages;
135+
// }
136+
91137
private BufferedImage pdfPageToImageBuffer(
92138
int index,
93139
PDDocument document,
@@ -128,10 +174,10 @@ private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPag
128174
}
129175
}
130176

131-
private boolean checkPdfOpen(byte[] documentFile) {
177+
private boolean checkPdfOpen(byte[] fileBytes) {
132178
boolean opens = false;
133179
try {
134-
Loader.loadPDF(documentFile).close();
180+
Loader.loadPDF(fileBytes).close();
135181
opens = true;
136182
} catch (IOException e) {
137183
e.printStackTrace();

src/main/java/com/mindee/pdf/PDFOperation.java

Lines changed: 0 additions & 44 deletions
This file was deleted.

0 commit comments

Comments
 (0)