Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import no.ndla.imageapi.model.api.{ImageMetaDomainDumpDTO, ImageMetaInformationV
import no.ndla.imageapi.model.domain.ImageMetaInformation
import no.ndla.imageapi.repository.ImageRepository
import no.ndla.imageapi.service.search.{ImageIndexService, TagIndexService}
import no.ndla.imageapi.service.{ConverterService, ReadService}
import no.ndla.imageapi.service.{ConverterService, ReadService, WriteService}
import no.ndla.imageapi.Props
import no.ndla.network.clients.MyNDLAApiClient
import no.ndla.network.tapir.NoNullJsonPrinter.jsonBody
Expand All @@ -33,6 +33,7 @@ import scala.util.{Failure, Success}
class InternController(using
imageRepository: ImageRepository,
readService: ReadService,
writeService: WriteService,
converterService: ConverterService,
imageIndexService: ImageIndexService,
tagIndexService: TagIndexService,
Expand All @@ -42,13 +43,23 @@ class InternController(using
myNDLAApiClient: MyNDLAApiClient,
) extends TapirController
with StrictLogging {

import errorHelpers.*

override val prefix: EndpointInput[Unit] = "intern"
override val enableSwagger = false
private val stringInternalServerError = statusCode(StatusCode.InternalServerError).and(stringBody)

override val endpoints: List[ServerEndpoint[Any, Eff]] =
List(postIndex, deleteIndex, getExternImageId, getDomainImageFromUrl, dumpImages, dumpSingleImage, postDump)
override val endpoints: List[ServerEndpoint[Any, Eff]] = List(
postIndex,
deleteIndex,
getExternImageId,
getDomainImageFromUrl,
dumpImages,
dumpSingleImage,
postDump,
startExifDataMigration,
)

def postIndex: ServerEndpoint[Any, Eff] = endpoint
.post
Expand Down Expand Up @@ -130,7 +141,8 @@ class InternController(using
}

val urlQueryParam: EndpointInput.Query[Option[String]] = query[Option[String]]("url")
def getDomainImageFromUrl: ServerEndpoint[Any, Eff] = endpoint

def getDomainImageFromUrl: ServerEndpoint[Any, Eff] = endpoint
.get
.in("domain_image_from_url")
.in(urlQueryParam)
Expand Down Expand Up @@ -174,4 +186,23 @@ class InternController(using
.serverLogicPure { imageMeta =>
imageRepository.insert(imageMeta)
}

/** POST /intern/migrate/exif
  *
  * Kicks off the EXIF-extraction batch job for all existing images and returns
  * immediately. The actual work runs on a detached virtual thread, so success
  * or failure of the migration is only reported through the logs, not to the
  * HTTP caller.
  *
  * NOTE(review): `Thread.ofVirtual()` requires JDK 21+ — confirm the runtime target.
  */
def startExifDataMigration: ServerEndpoint[Any, Eff] = endpoint
  .post
  .in("migrate" / "exif")
  .out(jsonBody[String])
  .serverLogicPure { _ =>
    logger.info("Starting EXIF data extraction for all existing images...")

    // Fire-and-forget: the thread handle is intentionally discarded; the job's
    // outcome is logged inside the thread itself.
    Thread
      .ofVirtual()
      .start(() => {
        writeService.extractAndStoreExifDataForExistingImages() match {
          case Success(_)  => logger.info("Successfully finished EXIF data extraction for all existing images")
          case Failure(ex) => logger.error("Failed to extract EXIF data for existing images", ex)
        }
      })

    // Respond immediately while the migration continues in the background.
    "Started EXIF data extraction for all existing images".asRight
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ case class ImageFileDTO(
variants: Seq[ImageVariantDTO],
@description(LanguageDocString)
language: String,
@description("Date image was taken, if available")
originalDate: Option[String],
)

object ImageFileDTO {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ case class UploadedImage(
contentType: ImageContentType,
dimensions: Option[ImageDimensions],
variants: Seq[ImageVariant],
originalDate: Option[String],
)

object UploadedImage {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ case class ImageFileData(
contentType: ImageContentType,
dimensions: Option[ImageDimensions],
variants: Seq[ImageVariant],
originalDate: Option[String],
override val language: String,
) extends WithLanguage {
def getFileStem: String = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class ConverterService(using clock: Clock, props: Props) extends StrictLogging {
dimensions = dimensions,
variants = variants,
language = image.language,
originalDate = image.originalDate,
)
}

Expand Down Expand Up @@ -347,6 +348,7 @@ class ConverterService(using clock: Clock, props: Props) extends StrictLogging {
contentType = upload.contentType,
dimensions = upload.dimensions,
variants = upload.variants,
originalDate = upload.originalDate,
language = language,
)
}
Expand Down
60 changes: 60 additions & 0 deletions image-api/src/main/scala/no/ndla/imageapi/service/ExifUtil.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Part of NDLA image-api
* Copyright (C) 2026 NDLA
*
* See LICENSE
*
*/

package no.ndla.imageapi.service

import com.sksamuel.scrimage.metadata.ImageMetadata
import com.typesafe.scalalogging.StrictLogging

import java.io.InputStream
import scala.util.Try

object ExifUtil extends StrictLogging {

  // Common EXIF date/time tags checked when extracting the original capture date.
  // "Date/Time Original" (SubIFD) is the capture time; IFD0 "Date/Time" is used
  // as a fallback when the former is absent.
  private val ExifDateTimeOriginal = "Exif SubIFD:Date/Time Original"
  private val ExifDateTime         = "Exif IFD0:Date/Time"

  /** Returns the original capture date from extracted EXIF data, if present.
    *
    * Prefers the SubIFD "Date/Time Original" tag, falling back to IFD0 "Date/Time".
    *
    * Fixed: the previous `isEmpty` branch was dead code (`flatMap` on `None` is
    * already `None`) and the trailing `.orElse(None)` was a no-op — both removed;
    * behavior is unchanged.
    */
  def extractDate(exifData: Option[Map[String, String]]): Option[String] =
    exifData.flatMap(data => data.get(ExifDateTimeOriginal).orElse(data.get(ExifDateTime)))

  /** Extracts all EXIF key-value pairs from an ImageMetadata.
    *
    * Keys are on the form "<directory name>:<tag name>". Tags whose raw value is
    * null or empty are dropped. Returns None when no usable tags are found.
    */
  def extractMetadataMap(metadata: ImageMetadata): Option[Map[String, String]] = {
    val values = metadata
      .getDirectories
      .flatMap { directory =>
        directory
          .getTags
          .flatMap { tag =>
            val name  = s"${directory.getName}:${tag.getName}"
            val value = tag.getRawValue
            Option.when(value != null && value.nonEmpty)(name -> value)
          }
      }
      .toMap
    Option.when(values.nonEmpty)(values)
  }

  /** Extracts all EXIF key-value pairs from an InputStream.
    *
    * Any failure while reading is logged at warn level and yields None, so
    * callers never see an exception. The stream is NOT closed by this method.
    */
  def extractExifDataFromStream(stream: InputStream): Option[Map[String, String]] = {
    Try {
      extractMetadataMap(ImageMetadata.fromStream(stream))
    }.recover { case ex =>
      logger.warn(s"Failed to extract EXIF data from stream: ${ex.getMessage}", ex)
      None
    }
      .getOrElse(None)
  }
}
135 changes: 121 additions & 14 deletions image-api/src/main/scala/no/ndla/imageapi/service/WriteService.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ import com.sksamuel.scrimage.webp.WebpWriter
import com.typesafe.scalalogging.StrictLogging
import no.ndla.common.Clock
import no.ndla.common.aws.NdlaCloudFrontClient
import no.ndla.common.errors.{MissingIdException, ValidationException}
import no.ndla.common.errors.{MissingBucketKeyException, MissingIdException, ValidationException}
import no.ndla.common.implicits.*
import no.ndla.common.model.api.{Deletable, Delete, Missing, UpdateWith}
import no.ndla.common.model.domain.UploadedFile
import no.ndla.common.model.{NDLADate, domain as common}
import no.ndla.database.DBUtility
import no.ndla.imageapi.Props
import no.ndla.imageapi.model.*
import no.ndla.imageapi.model.api.{
Expand All @@ -34,6 +35,7 @@ import no.ndla.language.Language
import no.ndla.language.Language.{mergeLanguageFields, sortByLanguagePriority}
import no.ndla.language.model.LanguageField
import no.ndla.network.tapir.auth.TokenUser
import scalikejdbc.DBSession

import java.util.concurrent.Executors
import scala.concurrent.duration.DurationInt
Expand All @@ -50,6 +52,7 @@ class WriteService(using
imageConverter: ImageConverter,
tagIndexService: TagIndexService,
cloudFrontClient: NdlaCloudFrontClient,
dbUtility: DBUtility,
clock: Clock,
random: Random,
props: Props,
Expand Down Expand Up @@ -414,7 +417,7 @@ class WriteService(using
val processableStream = imageConverter.uploadedFileToImageStream(file, fileName) match {
case Success(stream: ImageStream.Processable) => stream
case Success(stream: ImageStream.Gif) => return uploadGifImageStream(stream)
case Success(stream: ImageStream.Unprocessable) => return uploadImageStream(stream, None)
case Success(stream: ImageStream.Unprocessable) => return uploadImageStream(stream, None, None)
case Failure(ex) => return Failure(ex)
}

Expand All @@ -423,6 +426,8 @@ class WriteService(using
case Failure(ex) => return Failure(ex)
}
val dimensions = ImageDimensions(processableImage.image.width, processableImage.image.height)
val exifData = ExifUtil.extractMetadataMap(processableImage.image.getMetadata)

// Since a stream cannot be read from twice, we need to create a new stream for uploading the original image.
// At this point we know that the image is processable, so we can safely create a new processable ImageStream.
val originalImageStream =
Expand All @@ -432,8 +437,9 @@ class WriteService(using
case given ExecutionContext =>
val variantsFuture =
generateAndUploadVariantsAsync(processableImage, dimensions, uniqueFileStem, processableImage.format)
val maybeUploadedOriginalImage = uploadImageStream(originalImageStream, Some(dimensions))
val maybeVariants = Try(Await.result(variantsFuture, 1.minute))
val maybeUploadedOriginalImage =
uploadImageStream(originalImageStream, Some(dimensions), ExifUtil.extractDate(exifData))
val maybeVariants = Try(Await.result(variantsFuture, 1.minute))

(maybeUploadedOriginalImage, maybeVariants) match {
case (Success(uploadedImage), Success(variants)) => Success(uploadedImage.copy(variants = variants))
Expand Down Expand Up @@ -490,16 +496,28 @@ class WriteService(using

private def uploadGifImageStream(gifImageStream: ImageStream.Gif): Try[UploadedImage] = ScrimageUtil
.getDimensionsFromGifStream(gifImageStream)
.flatMap((stream, dim) => uploadImageStream(stream, dim.some))

private def uploadImageStream(stream: ImageStream, dimensions: Option[ImageDimensions]): Try[UploadedImage] =
Using(stream) { imageStream =>
val contentLength = imageStream.contentLength
val contentType = imageStream.contentType
imageStorage
.uploadFromStream(imageStream.fileName, imageStream.stream, contentLength, contentType)
.map(bucketKey => UploadedImage(bucketKey, contentLength, contentType, dimensions, Seq.empty))
}.flatten
.flatMap((stream, dim) => uploadImageStream(stream, dim.some, None))

/** Uploads a single image stream to the image bucket.
  *
  * @param stream       the stream to upload; closed by this method (via Using) when done
  * @param dimensions   pixel dimensions when known (None for unprocessable formats)
  * @param originalDate EXIF capture date to store on the resulting image, if any
  * @return the uploaded image metadata with no variants attached, or the failure
  *         from reading the stream / uploading to storage
  */
private def uploadImageStream(
    stream: ImageStream,
    dimensions: Option[ImageDimensions],
    originalDate: Option[String],
): Try[UploadedImage] = Using(stream) { imageStream =>
  val contentLength = imageStream.contentLength
  val contentType = imageStream.contentType
  imageStorage
    .uploadFromStream(imageStream.fileName, imageStream.stream, contentLength, contentType)
    .map(bucketKey =>
      UploadedImage(
        fileName = bucketKey, // the storage layer returns the bucket key used as file name
        size = contentLength,
        contentType = contentType,
        dimensions = dimensions,
        variants = Seq.empty, // variants are generated and attached by the caller
        originalDate = originalDate,
      )
    )
}.flatten // collapse Try[Try[UploadedImage]] produced by Using around the upload result

private def deleteImageAndVariants(image: ImageFileData): Try[Unit] = {
val variantsResult = imageStorage.deleteObjects(image.variants.map(_.bucketKey))
Expand All @@ -511,6 +529,95 @@ class WriteService(using
}
}

/** Batch job that fetches all images from S3, extracts EXIF data, and updates the database. Images that already have
  * non-empty exifData are skipped. Missing S3 objects are ignored.
  *
  * Batches of image metadata are processed sequentially; within a batch the S3
  * downloads and EXIF extractions run in parallel on a dedicated fixed-size thread
  * pool, after which the merged file lists are written back in one DB session per
  * image meta. Processing stops at the first failed batch and its error is returned.
  */
def extractAndStoreExifDataForExistingImages(): Try[Unit] = {
  val batchSize = 20
  val batchIterator = imageRepository.getImageMetaBatched(batchSize) match {
    case Success(it) => it
    case Failure(ex) => return Failure(ex)
  }
  // NOTE(review): Iterator.knownSize is -1 when the size is unknown, in which case
  // the progress log below reads "batch N of -1" — confirm getImageMetaBatched
  // reports a known size.
  val totalBatchCount = batchIterator.knownSize
  // Pool sized to the batch so a full batch can be in flight at once; Using(...)
  // guarantees the executor service is shut down when the job finishes or fails.
  Using(ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(batchSize))) {
    case given ExecutionContext => batchIterator
      .zipWithIndex
      .map { (batch, index) =>
        logger.info(s"EXIF migration: Processing batch ${index + 1} of $totalBatchCount (batch size = $batchSize)")

        // Only metas with an id and at least one file missing originalDate are
        // considered; each (meta, file) pair is extracted concurrently.
        val batchFuture = Future.traverse {
          batch
            .filter(meta => meta.id.nonEmpty && meta.images.exists(_.originalDate.isEmpty))
            .flatMap(meta => meta.images.filter(_.originalDate.isEmpty).map(meta -> _))
        } { (imageMeta, imageFile) =>
          extractExifForImageFileAsync(imageMeta, imageFile)
        }

        val storeResultsFuture = batchFuture.map { results =>
          results
            .sequence
            .flatMap { metasWithFiles =>
              metasWithFiles
                .groupMap(_._1)(_._2)
                .toSeq
                .traverse { (imageMeta, updatedFiles) =>
                  // Replace only the re-extracted files, keyed by (fileName, language);
                  // all other files of the meta are kept untouched.
                  val updatedFileMap = updatedFiles.map(f => (f.fileName, f.language) -> f).toMap
                  val mergedImageFiles = imageMeta
                    .images
                    .map { existing =>
                      updatedFileMap.getOrElse((existing.fileName, existing.language), existing)
                    }
                  dbUtility.writeSession { case given DBSession =>
                    // id.get is safe here: metas without an id were filtered out above.
                    val updatedMeta = imageMeta.copy(images = mergedImageFiles)
                    imageRepository.update(updatedMeta, updatedMeta.id.get).map(_ => ())
                  }
                }
            }
        }

        // Block until this batch (extraction + DB writes) completes before moving on.
        Try(Await.result(storeResultsFuture, 5.minutes)).flatten
      }
      // The iterator is lazy: collectFirst drives the batch processing and
      // short-circuits at the first failed batch.
      .collectFirst { case Failure(ex) =>
        ex
      } match {
        case Some(ex) => Failure(ex)
        case None => Success(())
      }
  }.flatten
}

/** Downloads a single image file from the bucket and extracts its EXIF capture date.
  *
  * Runs asynchronously on the given ExecutionContext. On success the meta is
  * returned paired with an updated copy of the file. Stream-read failures and
  * missing bucket objects are logged and treated as best-effort no-ops (the
  * original file is returned unchanged); any other storage error fails the Try.
  *
  * Fixed: the `MissingBucketKeyException` case bound an `ex` it never used
  * (an unused-binding compiler warning) — changed to a type-only pattern.
  */
private def extractExifForImageFileAsync(imageMeta: ImageMetaInformation, imageFile: ImageFileData)(using
    ExecutionContext
): Future[Try[(ImageMetaInformation, ImageFileData)]] = Future {
  imageStorage
    .getRaw(imageFile.fileName)
    .map { s3Object =>
      // Using closes the S3 object stream regardless of extraction outcome.
      Using(s3Object.stream) { stream =>
        val exifData    = ExifUtil.extractExifDataFromStream(stream)
        val updatedFile = imageFile.copy(originalDate = ExifUtil.extractDate(exifData))
        imageMeta -> updatedFile
      }.getOrElse {
        // Best effort: a broken stream must not abort the whole migration batch.
        // id.get is safe: callers only pass metas that have an id.
        logger.warn(
          s"EXIF migration: Failed to extract EXIF data for image due to error reading stream (imageMetaId = ${imageMeta.id.get}, fileName = ${imageFile.fileName})"
        )
        imageMeta -> imageFile
      }
    }
    .recoverWith {
      case _: MissingBucketKeyException =>
        // The object no longer exists in the bucket — skip it without failing the job.
        logger.warn(
          s"EXIF migration: Ignoring missing bucket object for image (imageMetaId = ${imageMeta.id.get}, fileName = ${imageFile.fileName})"
        )
        Success(imageMeta -> imageFile)
      case ex =>
        // Any other storage error is propagated so the batch fails visibly.
        logger.error(
          s"EXIF migration: Failed to extract EXIF data for image (imageMetaId = ${imageMeta.id.get}, fileName = ${imageFile.fileName})",
          ex,
        )
        Failure(ex)
    }
}

private def moveImageAndVariants(image: ImageFileData, newBucketPrefix: String): Try[ImageFileData] = {
val variantKeysToNewVariants = image
.variants
Expand Down
Loading
Loading