-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathImageExtractor.java
More file actions
209 lines (193 loc) · 6.88 KB
/
ImageExtractor.java
File metadata and controls
209 lines (193 loc) · 6.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
package com.mindee.image;
import com.mindee.geometry.Bbox;
import com.mindee.geometry.PositionDataField;
import com.mindee.input.InputSourceUtils;
import com.mindee.input.LocalInputSource;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
/**
 * Extract sub-images from an image.
 *
 * <p>The source is loaded once at construction time: a PDF is rendered to one image per page,
 * any other input is decoded as a single-page image. Sub-images are then cropped from those
 * in-memory pages using the position data carried by fields.
 */
public class ImageExtractor {
  /** Format used for pages rendered from a PDF. */
  private static final String PDF_SAVE_FORMAT = "jpg";

  private final List<BufferedImage> pageImages;
  private final String filename;
  private final String saveFormat;

  /**
   * Load all pages of the given source as images.
   *
   * @param source The local input source, either a PDF or an image file.
   * @throws IOException If the PDF cannot be loaded or rendered, or if the image data cannot be
   *     decoded.
   */
  public ImageExtractor(LocalInputSource source) throws IOException {
    this.pageImages = new ArrayList<>();

    if (source.isPDF()) {
      this.saveFormat = PDF_SAVE_FORMAT;
      var pdfPageImages = pdfToImages(source.getFile(), source.getFilename());
      for (PDFPageImage pdfPageImage : pdfPageImages) {
        this.pageImages.add(pdfPageImage.getImage());
      }
      // The rendered pages are images, so the image extension is appended to the PDF name.
      this.filename = source.getFilename() + "." + this.saveFormat;
    } else {
      this.filename = source.getFilename();
      String[] splitName = InputSourceUtils.splitNameStrict(this.filename);
      // Locale.ROOT keeps the extension stable whatever the default locale is
      // (e.g. "GIF" must not become "gıf" under a Turkish locale).
      this.saveFormat = splitName[1].toLowerCase(Locale.ROOT);
      BufferedImage image = ImageIO.read(new ByteArrayInputStream(source.getFile()));
      if (image == null) {
        // ImageIO.read returns null when no registered reader can decode the data;
        // fail here instead of storing a null page that would blow up on extraction.
        throw new IOException("Unable to decode image data from file: " + this.filename);
      }
      this.pageImages.add(image);
    }
  }

  /**
   * Render every page of a PDF to an image.
   *
   * @param fileBytes The raw PDF bytes.
   * @param filename The name of the originating file, passed on to each {@link PDFPageImage}.
   * @return One {@link PDFPageImage} per page, in page order.
   * @throws IOException If the PDF cannot be loaded or a page cannot be rendered.
   */
  private List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
    // try-with-resources guarantees the document is closed even when rendering fails.
    try (PDDocument document = Loader.loadPDF(fileBytes)) {
      var pdfRenderer = new PDFRenderer(document);
      int pageCount = document.getNumberOfPages();
      List<PDFPageImage> pdfPageImages = new ArrayList<>(pageCount);
      for (int i = 0; i < pageCount; i++) {
        var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
        pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, PDF_SAVE_FORMAT));
      }
      return pdfPageImages;
    }
  }

  /**
   * Render a single PDF page as an RGB image.
   *
   * <p>The rendering resolution decreases as the area of the page bounding box grows:
   * 300 DPI below 200000, 250 DPI below 300000, 200 DPI otherwise.
   *
   * @param index The 0-based index of the page to render.
   * @param document The loaded PDF document.
   * @param pdfRenderer The renderer bound to {@code document}.
   * @return The rendered page.
   * @throws IOException If the page cannot be rendered.
   */
  private BufferedImage pdfPageToImageBuffer(
      int index,
      PDDocument document,
      PDFRenderer pdfRenderer
  ) throws IOException {
    PDRectangle bbox = document.getPage(index).getBBox();
    float dimension = bbox.getWidth() * bbox.getHeight();
    int dpi;
    if (dimension < 200000) {
      dpi = 300;
    } else if (dimension < 300000) {
      dpi = 250;
    } else {
      dpi = 200;
    }
    return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
  }

  /**
   * Get the number of pages in the file.
   *
   * @return The number of pages in the file.
   */
  public int getPageCount() {
    return this.pageImages.size();
  }

  /**
   * Extract multiple images on a given page from a list of fields having position data.
   * The output names are derived from the name of the source file.
   *
   * @param <FieldT> Type of field (needs to support positioning data).
   * @param fields List of Fields to extract.
   * @param pageIndex The page index to extract, begins at 0.
   * @return The extracted images, as an {@link ExtractedImages}.
   */
  public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
      List<FieldT> fields,
      int pageIndex
  ) {
    return extractImagesFromPage(fields, pageIndex, this.filename);
  }

  /**
   * Extract multiple images on a given page from a list of fields having position data.
   *
   * @param <FieldT> Type of field (needs to support positioning data).
   * @param fields List of Fields to extract.
   * @param pageIndex The page index to extract, begins at 0.
   * @param outputName The base output filename, must have an image extension.
   * @return The extracted images, as an {@link ExtractedImages}.
   */
  public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
      List<FieldT> fields,
      int pageIndex,
      String outputName
  ) {
    String filename;
    if (this.getPageCount() > 1) {
      // With several pages, the extension of the output name is replaced by the save format.
      String[] splitName = InputSourceUtils.splitNameStrict(outputName);
      filename = splitName[0] + "." + this.saveFormat;
    } else {
      filename = outputName;
    }
    return extractFromPage(fields, pageIndex, filename);
  }

  /**
   * Extract every field of a page, naming the results {@code <base>_page-NNN_MMM.<ext>}.
   *
   * @param <FieldT> Type of field (needs to support positioning data).
   * @param fields List of Fields to extract.
   * @param pageIndex The page index to extract, begins at 0.
   * @param outputName The base output filename, including its extension.
   * @return The extracted images; fields without a polygon are skipped.
   */
  private <FieldT extends PositionDataField> ExtractedImages extractFromPage(
      List<FieldT> fields,
      int pageIndex,
      String outputName
  ) {
    String[] splitName = InputSourceUtils.splitNameStrict(outputName);
    // Only the page number is zero-padded: padding the whole formatted string would
    // also turn any space of the caller's base name into a '0'.
    var filename = splitName[0] + "_page-" + zeroPad(pageIndex + 1) + "." + splitName[1];

    var extractedImages = new ExtractedImages();
    for (int i = 0; i < fields.size(); i++) {
      ExtractedImage extractedImage = extractImage(fields.get(i), pageIndex, i + 1, filename);
      if (extractedImage != null) {
        extractedImages.add(extractedImage);
      }
    }
    return extractedImages;
  }

  /**
   * Extract a single image from a field having position data.
   *
   * @param <FieldT> Type of field (needs to support positioning data).
   * @param field The field to extract.
   * @param pageIndex The page index to extract, begins at 0.
   * @param index The index to use for naming the extracted image.
   * @param filename Name of the file, including its extension.
   * @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid
   *     position data.
   */
  public <FieldT extends PositionDataField> ExtractedImage extractImage(
      FieldT field,
      int pageIndex,
      int index,
      String filename
  ) {
    String[] splitName = InputSourceUtils.splitNameStrict(filename);
    // Locale.ROOT: the extension must not depend on the default locale.
    String saveFormat = splitName[1].toLowerCase(Locale.ROOT);
    var polygon = field.getPolygon();
    if (polygon == null) {
      return null;
    }
    String fieldFilename = splitName[0] + "_" + zeroPad(index) + "." + saveFormat;
    return new ExtractedImage(
      extractImage(polygon.getAsBbox(), pageIndex),
      fieldFilename,
      saveFormat,
      pageIndex
    );
  }

  /**
   * Extract a single image from a field having position data.
   * The output name is derived from the name of the source file.
   *
   * @param <FieldT> Type of field (needs to support positioning data).
   * @param field The field to extract.
   * @param pageIndex The 0-based page index to extract.
   * @param index The index to use for naming the extracted image.
   * @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid
   *     position data.
   */
  public <FieldT extends PositionDataField> ExtractedImage extractImage(
      FieldT field,
      int pageIndex,
      int index
  ) {
    return extractImage(field, pageIndex, index, this.filename);
  }

  /**
   * Crop the area covered by a bounding box out of a page.
   *
   * <p>The box coordinates are multiplied by the page width and height to obtain pixel
   * positions, which are then clamped to the page so a box slightly overflowing the page
   * does not make the crop fail.
   *
   * @param bbox The bounding box, in coordinates relative to the page size.
   * @param pageIndex The 0-based index of the page to crop.
   * @return The cropped region of the page image.
   */
  private BufferedImage extractImage(Bbox bbox, int pageIndex) {
    var image = this.pageImages.get(pageIndex);
    int width = image.getWidth();
    int height = image.getHeight();
    int minX = clamp((int) Math.round(bbox.getMinX() * width), 0, width);
    int maxX = clamp((int) Math.round(bbox.getMaxX() * width), 0, width);
    int minY = clamp((int) Math.round(bbox.getMinY() * height), 0, height);
    int maxY = clamp((int) Math.round(bbox.getMaxY() * height), 0, height);
    return image.getSubimage(minX, minY, maxX - minX, maxY - minY);
  }

  /** Restrict {@code value} to the inclusive range {@code [lower, upper]}. */
  private static int clamp(int value, int lower, int upper) {
    return Math.max(lower, Math.min(upper, value));
  }

  /** Left-pad a number with zeros to at least 3 characters, e.g. {@code 7} gives "007". */
  private static String zeroPad(int number) {
    return String.format("%3s", number).replace(' ', '0');
  }
}