From bf25dccd71d13d7372d94db91e1727eec348bbf6 Mon Sep 17 00:00:00 2001
From: Fernando Blat
Date: Wed, 20 May 2026 16:07:23 +0200
Subject: [PATCH] fix: install JAI Image I/O JPEG2000 plugin for PDFBox

PDFBox failed to rasterize PDF pages with JPEG2000-encoded image
streams, logging "Cannot read JPEG2000 image: Java Advanced Imaging
(JAI) Image I/O Tools are not installed" and skipping OCR on those
pages.

Drop the jai-imageio-core and jai-imageio-jpeg2000 JARs into
/tika-extras/, which the apache/tika base image already includes on
the server classpath, so the J2KImageReaderSpi is auto-discovered via
the ImageIO ServiceLoader.
---
Reviewer notes (this section is not part of the commit message):

* The hunk adds two build arguments, JAI_IMAGEIO_CORE_VERSION and
  JAI_IMAGEIO_JPEG2000_VERSION, both defaulting to 1.4.0. Each one is
  interpolated into the directory and file name of the matching
  repo1.maven.org download URL.
* A single RUN step creates /tika-extras and downloads both JARs with
  "wget -q -O", saving them under version-less names
  (jai-imageio-core.jar, jai-imageio-jpeg2000.jar). The commands are
  chained with "&&", so a failed download fails the build step.
* The step is inserted after the apt-get layer and before
  "COPY tika-config.xml" and "USER 35002:35002".
* NOTE(review): the downloads are not checked against a pinned digest.
  Consider adding a sha256sum check once the expected values have been
  recorded from a trusted source.
* NOTE(review): wget is assumed to be available in the image; the
  package list of the preceding apt-get layer is not visible in this
  hunk. Confirm.
* NOTE(review): the RUN step presumably executes as a user allowed to
  create /tika-extras, since it precedes the USER instruction. Confirm
  against the full Dockerfile.
* NOTE(review): leading whitespace on the context and continuation
  lines below was reconstructed. If the context does not match the
  target file, apply with "git am --ignore-whitespace".

 Dockerfile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 616239f..2736986 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,14 @@ RUN apt-get update \
         tesseract-ocr-eus \
     && rm -rf /var/lib/apt/lists/*
 
+ARG JAI_IMAGEIO_CORE_VERSION=1.4.0
+ARG JAI_IMAGEIO_JPEG2000_VERSION=1.4.0
+RUN mkdir -p /tika-extras \
+    && wget -q -O /tika-extras/jai-imageio-core.jar \
+        "https://repo1.maven.org/maven2/com/github/jai-imageio/jai-imageio-core/${JAI_IMAGEIO_CORE_VERSION}/jai-imageio-core-${JAI_IMAGEIO_CORE_VERSION}.jar" \
+    && wget -q -O /tika-extras/jai-imageio-jpeg2000.jar \
+        "https://repo1.maven.org/maven2/com/github/jai-imageio/jai-imageio-jpeg2000/${JAI_IMAGEIO_JPEG2000_VERSION}/jai-imageio-jpeg2000-${JAI_IMAGEIO_JPEG2000_VERSION}.jar"
+
 COPY tika-config.xml /opt/tika-config.xml
 
 USER 35002:35002