66import com .mindee .input .InputSourceUtils ;
77import com .mindee .input .LocalInputSource ;
88import com .mindee .parsing .standard .PositionData ;
9+ import com .mindee .pdf .PDFUtils ;
10+ import com .mindee .pdf .PdfPageImage ;
911import java .awt .image .BufferedImage ;
1012import java .io .ByteArrayInputStream ;
11- import java .io .File ;
1213import java .io .IOException ;
1314import java .util .ArrayList ;
1415import java .util .List ;
 * Extract sub-images from an image, or from the pages of a PDF.
1920 */
2021public class ImageExtractor {
21- private final BufferedImage bufferedImage ;
22+ private final List < BufferedImage > pageImages ;
2223 private final String filename ;
24+ private final String saveFormat ;
2325
/**
 * Init from a path.
 * Wraps the path in a {@link LocalInputSource} and delegates to the source-based constructor.
 * @param filePath Path to the file.
 * @throws IOException If creating the input source or loading its pages fails.
 */
public ImageExtractor(String filePath) throws IOException {
  this(new LocalInputSource(filePath));
}
3333
3434 /**
@@ -37,29 +37,79 @@ public ImageExtractor(String filePath) throws IOException {
3737 */
3838 public ImageExtractor (LocalInputSource source ) throws IOException {
3939 this .filename = source .getFilename ();
40- ByteArrayInputStream input = new ByteArrayInputStream (source .getFile ());
41- this .bufferedImage = ImageIO .read (input );
40+ this .pageImages = new ArrayList <>();
41+
42+ if (source .isPdf ()) {
43+ this .saveFormat = "jpg" ;
44+ List <PdfPageImage > pdfPageImages = PDFUtils .pdfToImages (source );
45+ for (PdfPageImage pdfPageImage : pdfPageImages ) {
46+ this .pageImages .add (pdfPageImage .getImage ());
47+ }
48+ } else {
49+ String [] splitName = InputSourceUtils .splitNameStrict (this .filename );
50+ this .saveFormat = splitName [1 ].toLowerCase ();
51+
52+ ByteArrayInputStream input = new ByteArrayInputStream (source .getFile ());
53+ this .pageImages .add (ImageIO .read (input ));
54+ }
55+ }
56+
57+ /**
58+ * @return The number of pages in the file.
59+ */
60+ public int getPageCount () {
61+ return this .pageImages .size ();
4262 }
4363
4464 /**
4565 * Extract images from a list of fields having position data.
66+ * Use this when the input file is a PDF with multiple pages.
4667 * @param fields List of Fields to extract.
68+ * @param pageIndex The page index to extract, begins at 0.
4769 * @return A list of {@link ExtractedImage}.
4870 */
49- public <FieldT extends PositionData > List <ExtractedImage > extractImages (List <FieldT > fields ) {
50- return extractImages (fields , this .filename );
71+ public <FieldT extends PositionData > List <ExtractedImage > extractImagesFromPage (
72+ List <FieldT > fields ,
73+ int pageIndex
74+ ) {
75+ return extractImagesFromPage (fields , pageIndex , this .filename );
5176 }
5277
5378 /**
5479 * Extract images from a list of fields having position data.
80+ * Use this when the input file is a PDF with multiple pages.
5581 * @param fields List of Fields to extract.
56- * @param filename The base output filename.
82+ * @param pageIndex The page index to extract, begins at 0.
83+ * @param outputName The base output filename, must have an image extension.
5784 * @return A list of {@link ExtractedImage}.
5885 */
59- public <FieldT extends PositionData > List <ExtractedImage > extractImages (List <FieldT > fields , String filename ) {
86+ public <FieldT extends PositionData > List <ExtractedImage > extractImagesFromPage (
87+ List <FieldT > fields ,
88+ int pageIndex ,
89+ String outputName
90+ ) {
91+ String filename ;
92+ if (this .getPageCount () > 1 ) {
93+ String [] splitName = InputSourceUtils .splitNameStrict (outputName );
94+ filename = splitName [0 ] + "." + this .saveFormat ;
95+ } else {
96+ filename = outputName ;
97+ }
98+ return extractFromPage (fields , pageIndex , filename );
99+ }
100+
101+ private <FieldT extends PositionData > List <ExtractedImage > extractFromPage (
102+ List <FieldT > fields ,
103+ int pageIndex ,
104+ String outputName
105+ ) {
106+ String [] splitName = InputSourceUtils .splitNameStrict (outputName );
107+ String filename = String .format ("%s_page-%3s.%s" , splitName [0 ], pageIndex + 1 , splitName [1 ])
108+ .replace (" " , "0" );
109+
60110 List <ExtractedImage > extractedImages = new ArrayList <>();
61111 for (int i = 0 ; i < fields .size (); i ++) {
62- ExtractedImage extractedImage = extractImage (fields .get (i ), filename , i +1 );
112+ ExtractedImage extractedImage = extractImage (fields .get (i ), pageIndex , i +1 , filename );
63113 if (extractedImage != null ) {
64114 extractedImages .add (extractedImage );
65115 }
@@ -71,9 +121,15 @@ public <FieldT extends PositionData> List<ExtractedImage> extractImages(List<Fie
71121 * Extract an image from a field having position data.
72122 * @param field The field to extract.
73123 * @param index The index to use for naming the extracted image.
124+ * @param pageIndex The page index to extract, begins at 0.
74125 * @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid position data.
75126 */
76- public <FieldT extends PositionData > ExtractedImage extractImage (FieldT field , String filename , int index ) {
127+ public <FieldT extends PositionData > ExtractedImage extractImage (
128+ FieldT field ,
129+ int pageIndex ,
130+ int index ,
131+ String filename
132+ ) {
77133 String [] splitName = InputSourceUtils .splitNameStrict (filename );
78134 String saveFormat = splitName [1 ].toLowerCase ();
79135 Polygon boundingBox = field .getBoundingBox ();
@@ -84,27 +140,29 @@ public <FieldT extends PositionData> ExtractedImage extractImage(FieldT field, S
84140 String fieldFilename = splitName [0 ]
85141 + String .format ("_%3s" , index ).replace (" " , "0" )
86142 + "."
87- + splitName [ 1 ] ;
88- return new ExtractedImage (extractImage (bbox ), fieldFilename , saveFormat );
143+ + saveFormat ;
144+ return new ExtractedImage (extractImage (bbox , pageIndex ), fieldFilename , saveFormat );
89145 }
90146
91147 /**
92148 * Extract an image from a field having position data.
93149 * @param field The field to extract.
94150 * @param index The index to use for naming the extracted image.
151+ * @param pageIndex The page index to extract, begins at 0.
95152 * @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid position data.
96153 */
97- public <FieldT extends PositionData > ExtractedImage extractImage (FieldT field , int index ) {
98- return extractImage (field , this . filename , index );
154+ public <FieldT extends PositionData > ExtractedImage extractImage (FieldT field , int pageIndex , int index ) {
155+ return extractImage (field , pageIndex , index , this . filename );
99156 }
100157
101- private BufferedImage extractImage (Bbox bbox ) {
102- int width = this .bufferedImage .getWidth ();
103- int height = this .bufferedImage .getHeight ();
158+ private BufferedImage extractImage (Bbox bbox , int pageIndex ) {
159+ BufferedImage image = this .pageImages .get (pageIndex );
160+ int width = image .getWidth ();
161+ int height = image .getHeight ();
104162 int minX = (int ) Math .round (bbox .getMinX () * width );
105163 int maxX = (int ) Math .round (bbox .getMaxX () * width );
106164 int minY = (int ) Math .round (bbox .getMinY () * height );
107165 int maxY = (int ) Math .round (bbox .getMaxY () * height );
108- return this . bufferedImage .getSubimage (minX , minY , maxX - minX , maxY - minY );
166+ return image .getSubimage (minX , minY , maxX - minX , maxY - minY );
109167 }
110168}
0 commit comments