again with the PDF interfaces

ianardee · ianardee · commit bbe4aceb0a6c · 2026-04-28T10:59:39.000+02:00
diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java
@@ -5,7 +5,7 @@
 import com.mindee.input.InputSourceUtils;
 import com.mindee.input.LocalInputSource;
 import com.mindee.pdf.PDFBoxApi;
-import com.mindee.pdf.PDFOperation;
+import com.mindee.pdf.ExtractionPDFOperation;
 import com.mindee.pdf.PdfPageImage;
 import java.awt.image.BufferedImage;
 import java.io.ByteArrayInputStream;
@@ -22,7 +22,7 @@ public class ImageExtractor {
   private final String filename;
   private final String saveFormat;
 
-  public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException {
+  public ImageExtractor(LocalInputSource source, ExtractionPDFOperation pdfOperation) throws IOException {
     this.filename = source.getFilename();
     this.pageImages = new ArrayList<>();
 
diff --git a/src/main/java/com/mindee/input/InputSourceUtils.java b/src/main/java/com/mindee/input/InputSourceUtils.java
@@ -1,12 +1,6 @@
 package com.mindee.input;
 
 import com.mindee.MindeeException;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.io.RandomAccessReadBuffer;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.text.PDFTextStripper;
 
 /**
  * Utilities for working with files.
@@ -65,46 +59,4 @@ public static String[] splitNameStrict(String filename) throws MindeeException {
     }
     return new String[] { name, extension };
   }
-
-  /**
-   * Returns true if the file is a PDF.
-   */
-  public static boolean isPdf(byte[] fileBytes) {
-    try {
-      Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
-    } catch (IOException e) {
-      return false;
-    }
-    return true;
-  }
-
-  /**
-   * Returns true if the source PDF has source text inside. Returns false for images.
-   *
-   * @param fileBytes A byte array representing a PDF.
-   * @return True if at least one character exists in one page.
-   * @throws MindeeException if the file could not be read.
-   */
-  public static boolean hasSourceText(byte[] fileBytes) {
-    try {
-      PDDocument document = Loader
-        .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)));
-      PDFTextStripper stripper = new PDFTextStripper();
-
-      for (int i = 0; i < document.getNumberOfPages(); i++) {
-        stripper.setStartPage(i + 1);
-        stripper.setEndPage(i + 1);
-        String pageText = stripper.getText(document);
-        if (!pageText.trim().isEmpty()) {
-          document.close();
-          return true;
-        }
-      }
-      document.close();
-    } catch (IOException e) {
-      return false;
-    }
-
-    return false;
-  }
 }
diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java
@@ -1,9 +1,9 @@
 package com.mindee.input;
 
 import com.mindee.image.ImageCompressor;
+import com.mindee.pdf.InputSourcePDFOperation;
 import com.mindee.pdf.PDFBoxApi;
 import com.mindee.pdf.PDFCompressor;
-import com.mindee.pdf.PDFOperation;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -24,7 +24,7 @@ public final class LocalInputSource {
   @Getter
   private final String filename;
   @Setter
-  private PDFOperation pdfOperation;
+  private InputSourcePDFOperation pdfOperation;
 
   public LocalInputSource(InputStream file, String filename) throws IOException {
     this.file = IOUtils.toByteArray(file);
@@ -57,7 +57,7 @@ public LocalInputSource(String fileAsBase64, String filename) {
     this.filename = filename;
   }
 
-  public PDFOperation getPdfOperation() {
+  public InputSourcePDFOperation getPdfOperation() {
     if (this.pdfOperation == null) {
       this.pdfOperation = new PDFBoxApi();
     }
@@ -90,11 +90,11 @@ public void applyPageOptions(PageOptions pageOptions) throws IOException {
   }
 
   public boolean isPdf() {
-    return InputSourceUtils.isPdf(this.file);
+    return getPdfOperation().isPdf(this.file);
   }
 
   public boolean hasSourceText() {
-    return InputSourceUtils.hasSourceText(this.file);
+    return getPdfOperation().hasSourceText(this.file);
   }
 
   public void compress(
diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java
@@ -157,7 +157,7 @@ private static byte[] createPdfFromExistingPdf(
    */
   public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
     PDDocument document = Loader.loadPDF(file);
-    return createPdfFromExistingPdf(document, pageNumbers, true);
+    return mergePdfPages(document, pageNumbers, true);
   }
 
   public static byte[] mergePdfPages(
diff --git a/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java b/src/main/java/com/mindee/pdf/ExtractionPDFOperation.java
@@ -0,0 +1,39 @@
+package com.mindee.pdf;
+
+import com.mindee.input.LocalInputSource;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * PDF operations needed for image extraction: rendering the pages of a PDF as images.
+ */
+public interface ExtractionPDFOperation {
+  // NOTE(review): the single-page variant below is currently disabled — confirm whether it
+  // should be restored or removed.
+//  /**
+//   * Render a single page of a PDF as an image.
+//   */
+//  PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException;
+//
+//  default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException {
+//    return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber);
+//  }
+
+  /**
+   * Render all pages of a PDF as images.
+   *
+   * @param fileBytes Raw bytes of the PDF.
+   * @param filename Name of the file the bytes come from.
+   * @return The rendered page images.
+   * @throws IOException if the implementation fails to read or render the PDF.
+   */
+  List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException;
+
+  /**
+   * Render all pages of a local input source as images, using its file bytes and filename.
+   */
+  default List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
+    return pdfToImages(source.getFile(), source.getFilename());
+  }
+}
diff --git a/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java b/src/main/java/com/mindee/pdf/InputSourcePDFOperation.java
@@ -0,0 +1,44 @@
+package com.mindee.pdf;
+
+import com.mindee.MindeeException;
+import com.mindee.input.LocalInputSource;
+import com.mindee.input.PageOptions;
+import java.io.IOException;
+
+/**
+ * PDF operations backing {@code LocalInputSource}: splitting, page counting and PDF checks.
+ */
+public interface InputSourcePDFOperation {
+
+  /**
+   * Split a PDF file.
+   */
+  SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException;
+
+  /**
+   * Get the number of pages in a PDF file.
+   */
+  int getNumberOfPages(byte[] fileBytes) throws IOException;
+
+  /**
+   * Get the number of pages of a local input source, using its file bytes.
+   */
+  default int getNumberOfPages(LocalInputSource inputSource) throws IOException {
+    return getNumberOfPages(inputSource.getFile());
+  }
+
+  /**
+   * Returns true if the file is a PDF.
+   */
+  boolean isPdf(byte[] fileBytes);
+
+  /**
+   * Returns true if the source PDF has source text inside. Returns false for images.
+   *
+   * @param fileBytes A byte array representing a PDF.
+   * @return True if at least one character exists in one page.
+   * @throws MindeeException if the file could not be read. NOTE(review): the PDFBoxApi
+   *     implementation returns false on a read failure instead of throwing — confirm.
+   */
+  boolean hasSourceText(byte[] fileBytes);
+}
diff --git a/src/main/java/com/mindee/pdf/PDFBoxApi.java b/src/main/java/com/mindee/pdf/PDFBoxApi.java
@@ -3,6 +3,7 @@
 import com.mindee.MindeeException;
 import com.mindee.input.PageOptions;
 import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -13,15 +14,17 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
 
 /**
  * Allows performing various operations on PDFs.
  */
-public final class PDFBoxApi implements PDFOperation {
+public final class PDFBoxApi implements InputSourcePDFOperation {
 
   @Override
   public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException {
@@ -61,33 +64,76 @@ public int getNumberOfPages(byte[] fileBytes) throws IOException {
     return pageCount;
   }
 
+  /**
+   * Returns true if the bytes load as a PDF; the loaded document is closed before returning.
+   */
   @Override
-  public PdfPageImage pdfPageToImage(
-      byte[] fileBytes,
-      String filename,
-      int pageNumber
-  ) throws IOException {
-    int index = pageNumber - 1;
-    PDDocument document = Loader.loadPDF(fileBytes);
-    var pdfRenderer = new PDFRenderer(document);
-    BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
-    document.close();
-    return new PdfPageImage(imageBuffer, index, filename, "jpg");
+  public boolean isPdf(byte[] fileBytes) {
+    try (PDDocument ignored = Loader
+      .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)))) {
+      return true;
+    } catch (IOException e) {
+      return false;
+    }
   }
 
+  /**
+   * Returns true if the source PDF has source text inside. Returns false for images.
+   * Also returns false, rather than throwing, when the bytes cannot be read as a PDF.
+   *
+   * @param fileBytes A byte array representing a PDF.
+   * @return True if at least one page contains non-blank text.
+   */
   @Override
-  public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
-    PDDocument document = Loader.loadPDF(fileBytes);
-    var pdfRenderer = new PDFRenderer(document);
-    List<PdfPageImage> pdfPageImages = new ArrayList<>();
-    for (int i = 0; i < document.getNumberOfPages(); i++) {
-      var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
-      pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
+  public boolean hasSourceText(byte[] fileBytes) {
+    // try-with-resources closes the document on every exit path, including stripper failures.
+    try (
+      PDDocument document = Loader
+        .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)))
+    ) {
+      PDFTextStripper stripper = new PDFTextStripper();
+      for (int i = 0; i < document.getNumberOfPages(); i++) {
+        stripper.setStartPage(i + 1);
+        stripper.setEndPage(i + 1);
+        String pageText = stripper.getText(document);
+        if (!pageText.trim().isEmpty()) {
+          return true;
+        }
+      }
+    } catch (IOException e) {
+      // Unreadable input is reported as "no source text" rather than raised.
+      return false;
     }
-    document.close();
-    return pdfPageImages;
+    return false;
   }
 
+//  @Override
+//  public PdfPageImage pdfPageToImage(
+//      byte[] fileBytes,
+//      String filename,
+//      int pageNumber
+//  ) throws IOException {
+//    int index = pageNumber - 1;
+//    PDDocument document = Loader.loadPDF(fileBytes);
+//    var pdfRenderer = new PDFRenderer(document);
+//    BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
+//    document.close();
+//    return new PdfPageImage(imageBuffer, index, filename, "jpg");
+//  }
+
+//  @Override
+//  public List<PdfPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
+//    PDDocument document = Loader.loadPDF(fileBytes);
+//    var pdfRenderer = new PDFRenderer(document);
+//    List<PdfPageImage> pdfPageImages = new ArrayList<>();
+//    for (int i = 0; i < document.getNumberOfPages(); i++) {
+//      var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
+//      pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg"));
+//    }
+//    document.close();
+//    return pdfPageImages;
+//  }
+
   private BufferedImage pdfPageToImageBuffer(
       int index,
       PDDocument document,
@@ -128,10 +174,10 @@ private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPag
     }
   }
 
-  private boolean checkPdfOpen(byte[] documentFile) {
+  private boolean checkPdfOpen(byte[] fileBytes) {
     boolean opens = false;
     try {
-      Loader.loadPDF(documentFile).close();
+      Loader.loadPDF(fileBytes).close();
       opens = true;
     } catch (IOException e) {
       e.printStackTrace();
diff --git a/src/main/java/com/mindee/pdf/PDFOperation.java b/src/main/java/com/mindee/pdf/PDFOperation.java
diff --git a/src/test/java/com/mindee/pdf/PDFOperationTest.java b/src/test/java/com/mindee/pdf/PDFOperationTest.java

Original file line number	Diff line number	Diff line change
`@@ -157,7 +157,7 @@ private static byte[] createPdfFromExistingPdf(`
`157`	`157`	`*/`
`158`	`158`	`public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {`
`159`	`159`	`PDDocument document = Loader.loadPDF(file);`
`160`		`- return createPdfFromExistingPdf(document, pageNumbers, true);`
	`160`	`+ return mergePdfPages(document, pageNumbers, true);`
`161`	`161`	`}`
`162`	`162`
`163`	`163`	`public static byte[] mergePdfPages(`