Skip to content

Commit b91646a

Browse files
committed
♻️ 💥 use interfaces for all PDF operations (#317)
1 parent 9f48e54 commit b91646a

13 files changed

Lines changed: 348 additions & 515 deletions

File tree

src/main/java/com/mindee/image/ImageExtractor.java

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import com.mindee.geometry.PositionDataField;
55
import com.mindee.input.InputSourceUtils;
66
import com.mindee.input.LocalInputSource;
7-
import com.mindee.pdf.PDFUtils;
7+
import com.mindee.pdf.PDFBoxApi;
8+
import com.mindee.pdf.PDFOperation;
89
import com.mindee.pdf.PdfPageImage;
910
import java.awt.image.BufferedImage;
1011
import java.io.ByteArrayInputStream;
@@ -21,29 +22,13 @@ public class ImageExtractor {
2122
private final String filename;
2223
private final String saveFormat;
2324

24-
/**
25-
* Init from a path.
26-
*
27-
* @param filePath Path to the file.
28-
* @throws IOException Throws if the file can't be accessed.
29-
*/
30-
public ImageExtractor(String filePath) throws IOException {
31-
this(new LocalInputSource(filePath));
32-
}
33-
34-
/**
35-
* Init from a {@link LocalInputSource}.
36-
*
37-
* @param source The local source.
38-
* @throws IOException Throws if the file can't be accessed.
39-
*/
40-
public ImageExtractor(LocalInputSource source) throws IOException {
25+
public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException {
4126
this.filename = source.getFilename();
4227
this.pageImages = new ArrayList<>();
4328

4429
if (source.isPdf()) {
4530
this.saveFormat = "jpg";
46-
var pdfPageImages = PDFUtils.pdfToImages(source);
31+
var pdfPageImages = pdfOperation.pdfToImages(source);
4732
for (PdfPageImage pdfPageImage : pdfPageImages) {
4833
this.pageImages.add(pdfPageImage.getImage());
4934
}
@@ -56,6 +41,16 @@ public ImageExtractor(LocalInputSource source) throws IOException {
5641
}
5742
}
5843

44+
/**
 * Builds an extractor for a {@link LocalInputSource}, delegating to the two-argument
 * constructor with a freshly created {@link PDFBoxApi} as the PDF backend.
 *
 * @param source The local source.
 * @throws IOException Throws if the file can't be accessed.
 */
public ImageExtractor(LocalInputSource source) throws IOException {
  this(source, new PDFBoxApi());
}
53+
5954
/**
6055
* Get the number of pages in the file.
6156
*

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
import com.mindee.pdf.PDFBoxApi;
55
import com.mindee.pdf.PDFCompressor;
66
import com.mindee.pdf.PDFOperation;
7-
import com.mindee.pdf.PDFUtils;
8-
import com.mindee.pdf.SplitQuery;
97
import java.io.File;
108
import java.io.IOException;
119
import java.io.InputStream;
@@ -76,7 +74,7 @@ public int getPageCount() throws IOException {
7674
if (!this.isPdf()) {
7775
return 1;
7876
}
79-
return PDFUtils.getNumberOfPages(this.file);
77+
return getPdfOperation().getNumberOfPages(this.file);
8078
}
8179

8280
/**
@@ -87,7 +85,7 @@ public int getPageCount() throws IOException {
8785
*/
8886
public void applyPageOptions(PageOptions pageOptions) throws IOException {
8987
if (pageOptions != null && this.isPdf()) {
90-
this.file = getPdfOperation().split(new SplitQuery(this.file, pageOptions)).getFile();
88+
this.file = getPdfOperation().split(this.file, pageOptions).getFile();
9189
}
9290
}
9391

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
package com.mindee.pdf;
22

3-
import static com.mindee.pdf.PDFUtils.mergePdfPages;
4-
53
import com.mindee.MindeeException;
64
import com.mindee.input.InputSourceUtils;
75
import com.mindee.input.LocalInputSource;
86
import java.awt.image.BufferedImage;
97
import java.io.ByteArrayInputStream;
8+
import java.io.ByteArrayOutputStream;
9+
import java.io.File;
1010
import java.io.IOException;
1111
import java.util.ArrayList;
1212
import java.util.List;
1313
import javax.imageio.ImageIO;
1414
import org.apache.pdfbox.Loader;
15+
import org.apache.pdfbox.cos.COSDictionary;
16+
import org.apache.pdfbox.cos.COSName;
1517
import org.apache.pdfbox.pdmodel.PDDocument;
1618
import org.apache.pdfbox.pdmodel.PDPage;
1719
import org.apache.pdfbox.pdmodel.PDPageContentStream;
@@ -112,4 +114,64 @@ public List<ExtractedPDF> extractSubDocuments(
112114
}
113115
return extractedPDFs;
114116
}
117+
118+
private static PDPage clonePage(PDPage page) {
119+
120+
COSDictionary pageDict = page.getCOSObject();
121+
COSDictionary newPageDict = new COSDictionary(pageDict);
122+
123+
newPageDict.removeItem(COSName.ANNOTS);
124+
125+
return new PDPage(newPageDict);
126+
}
127+
128+
private static byte[] createPdfFromExistingPdf(
129+
PDDocument document,
130+
List<Integer> pageNumbers,
131+
boolean closeOriginal
132+
) throws IOException {
133+
var outputStream = new ByteArrayOutputStream();
134+
var newDocument = new PDDocument();
135+
int pageCount = document.getNumberOfPages();
136+
pageNumbers
137+
.stream()
138+
.filter(i -> i < pageCount)
139+
.forEach(i -> newDocument.addPage(clonePage(document.getPage(i))));
140+
141+
newDocument.save(outputStream);
142+
newDocument.close();
143+
if (closeOriginal) {
144+
document.close();
145+
}
146+
147+
byte[] output = outputStream.toByteArray();
148+
outputStream.close();
149+
return output;
150+
}
151+
152+
/**
153+
* Merge specified PDF pages together.
154+
*
155+
* @param file The PDF file.
156+
* @param pageNumbers Lit of page numbers to merge together.
157+
*/
158+
public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
159+
PDDocument document = Loader.loadPDF(file);
160+
return createPdfFromExistingPdf(document, pageNumbers, true);
161+
}
162+
163+
public static byte[] mergePdfPages(
164+
PDDocument document,
165+
List<Integer> pageNumbers
166+
) throws IOException {
167+
return mergePdfPages(document, pageNumbers, true);
168+
}
169+
170+
public static byte[] mergePdfPages(
171+
PDDocument document,
172+
List<Integer> pageNumbers,
173+
boolean closeOriginal
174+
) throws IOException {
175+
return createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
176+
}
115177
}

src/main/java/com/mindee/pdf/PDFBoxApi.java

Lines changed: 65 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import com.mindee.MindeeException;
44
import com.mindee.input.PageOptions;
5+
import java.awt.image.BufferedImage;
56
import java.io.ByteArrayOutputStream;
67
import java.io.IOException;
78
import java.util.ArrayList;
@@ -13,28 +14,31 @@
1314
import java.util.stream.IntStream;
1415
import org.apache.pdfbox.Loader;
1516
import org.apache.pdfbox.pdmodel.PDDocument;
17+
import org.apache.pdfbox.pdmodel.common.PDRectangle;
18+
import org.apache.pdfbox.rendering.ImageType;
19+
import org.apache.pdfbox.rendering.PDFRenderer;
1620

1721
/**
1822
* Allows performing various operations on PDFs.
1923
*/
2024
public final class PDFBoxApi implements PDFOperation {
2125

2226
@Override
23-
public SplitPDF split(SplitQuery splitQuery) throws IOException {
27+
public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException {
2428

25-
if (!checkPdfOpen(splitQuery.getFile())) {
29+
if (!checkPdfOpen(fileBytes)) {
2630
throw new MindeeException("This document cannot be open and cannot be split.");
2731
}
2832

29-
try (var originalDocument = Loader.loadPDF(splitQuery.getFile())) {
33+
try (var originalDocument = Loader.loadPDF(fileBytes)) {
3034
try (var splitDocument = new PDDocument()) {
31-
int totalOriginalPages = countPages(splitQuery.getFile());
35+
int totalOriginalPages = getNumberOfPages(fileBytes);
3236

33-
if (totalOriginalPages < splitQuery.getPageOptions().getOnMinPages()) {
34-
return new SplitPDF(splitQuery.getFile(), totalOriginalPages);
37+
if (totalOriginalPages < pageOptions.getOnMinPages()) {
38+
return new SplitPDF(fileBytes, totalOriginalPages);
3539
}
3640

37-
var pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages);
41+
var pageRange = getPageRanges(pageOptions, totalOriginalPages);
3842
pageRange
3943
.stream()
4044
.filter(i -> i < totalOriginalPages)
@@ -43,12 +47,65 @@ public SplitPDF split(SplitQuery splitQuery) throws IOException {
4347
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
4448
splitDocument.save(outputStream);
4549
byte[] splitPdf = outputStream.toByteArray();
46-
return new SplitPDF(splitPdf, countPages(splitPdf));
50+
return new SplitPDF(splitPdf, getNumberOfPages(splitPdf));
4751
}
4852
}
4953
}
5054
}
5155

56+
@Override
57+
public int getNumberOfPages(byte[] fileBytes) throws IOException {
58+
var document = Loader.loadPDF(fileBytes);
59+
int pageCount = document.getNumberOfPages();
60+
document.close();
61+
return pageCount;
62+
}
63+
64+
@Override
65+
public PdfPageImage pdfPageToImage(
66+
byte[] fileBytes,
67+
String filename,
68+
int pageNumber
69+
) throws IOException {
70+
int index = pageNumber - 1;
71+
PDDocument document = Loader.loadPDF(fileBytes);
72+
var pdfRenderer = new PDFRenderer(document);
73+
BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
74+
document.close();
75+
return new PdfPageImage(imageBuffer, index, filename, "jpg");
76+
}
77+
78+
@Override
79+
public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
80+
PDDocument document = Loader.loadPDF(fileBytes);
81+
var pdfRenderer = new PDFRenderer(document);
82+
List<PdfPageImage> pdfPageImages = new ArrayList<>();
83+
for (int i = 0; i < document.getNumberOfPages(); i++) {
84+
var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
85+
pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
86+
}
87+
document.close();
88+
return pdfPageImages;
89+
}
90+
91+
private BufferedImage pdfPageToImageBuffer(
92+
int index,
93+
PDDocument document,
94+
PDFRenderer pdfRenderer
95+
) throws IOException {
96+
PDRectangle bbox = document.getPage(index).getBBox();
97+
float dimension = bbox.getWidth() * bbox.getHeight();
98+
int dpi;
99+
if (dimension < 200000) {
100+
dpi = 300;
101+
} else if (dimension < 300000) {
102+
dpi = 250;
103+
} else {
104+
dpi = 200;
105+
}
106+
return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
107+
}
108+
52109
private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPages) {
53110

54111
Set<Integer> pages = Optional
@@ -81,8 +138,4 @@ private boolean checkPdfOpen(byte[] documentFile) {
81138
}
82139
return opens;
83140
}
84-
85-
private int countPages(byte[] documentFile) throws IOException {
86-
return PDFUtils.getNumberOfPages(documentFile);
87-
}
88141
}

0 commit comments

Comments
 (0)