Add mpp correction factor and highest res .dcm selection.

blanca-pablos · blanca-pablos · commit a3597c50cd85 · 2025-11-20T09:45:32.000+01:00
* Add mpp correction factor for TIFFs written by older versions of libvips
(< 8.8.3), which had a bug that wrote the resolution as px / mm instead of px / cm.
* For DICOM series split across multiple files, select only the highest-resolution file.
diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py
@@ -3,55 +3,38 @@
 import base64
 import re
 import time
+from collections import defaultdict
 from collections.abc import Callable, Generator
 from http import HTTPStatus
 from importlib.util import find_spec
 from pathlib import Path
 from typing import Any
 
 import google_crc32c
+import pydicom
 import requests
 from loguru import logger
 
 from aignostics.bucket import Service as BucketService
-from aignostics.constants import (
-    TEST_APP_APPLICATION_ID,
-)
-from aignostics.platform import (
-    LIST_APPLICATION_RUNS_MAX_PAGE_SIZE,
-    ApiException,
-    Application,
-    ApplicationSummary,
-    ApplicationVersion,
-    Client,
-    InputArtifact,
-    InputItem,
-    NotFoundException,
-    Run,
-    RunData,
-    RunOutput,
-    RunState,
-)
-from aignostics.platform import (
-    Service as PlatformService,
-)
+from aignostics.constants import TEST_APP_APPLICATION_ID
+from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE,
+                                 ApiException, Application, ApplicationSummary,
+                                 ApplicationVersion, Client, InputArtifact,
+                                 InputItem, NotFoundException, Run, RunData,
+                                 RunOutput, RunState)
+from aignostics.platform import Service as PlatformService
 from aignostics.utils import BaseService, Health, sanitize_path_component
 from aignostics.wsi import Service as WSIService
 
-from ._download import (
-    download_available_items,
-    download_url_to_file_with_progress,
-    extract_filename_from_url,
-    update_progress,
-)
+from ._download import (download_available_items,
+                        download_url_to_file_with_progress,
+                        extract_filename_from_url, update_progress)
 from ._models import DownloadProgress, DownloadProgressState
 from ._settings import Settings
-from ._utils import (
-    get_mime_type_for_artifact,
-    get_supported_extensions_for_application,
-    is_not_terminated_with_deadline_exceeded,
-    validate_due_date,
-)
+from ._utils import (get_mime_type_for_artifact,
+                     get_supported_extensions_for_application,
+                     is_not_terminated_with_deadline_exceeded,
+                     validate_due_date)
 
 has_qupath_extra = find_spec("ijson")
 if has_qupath_extra:
@@ -312,6 +295,55 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None
             for key_value in key_value_pairs:
                 Service._process_key_value_pair(entry, key_value, external_id)
 
+    @staticmethod
+    def _filter_dicom_series_files(source_directory: Path) -> set[Path]:
+        """Find redundant DICOM files of multi-file series below a directory.
+
+        Groups every ``*.dcm`` file by SeriesInstanceUID. For each series with more
+        than one file, only the file covering the most pixels is kept; its siblings
+        are returned so the caller can skip them. OpenSlide will find other files in
+        the same directory when needed.
+
+        Files that cannot be read, or that lack a SeriesInstanceUID, are treated as
+        standalone and are never excluded.
+
+        Args:
+            source_directory: The directory to scan recursively.
+
+        Returns:
+            set[Path]: Set of DICOM files to exclude from processing.
+        """
+        series_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list)
+
+        # Group by SeriesInstanceUID; sorted() makes size ties independent of glob order.
+        for dcm_file in sorted(source_directory.glob("**/*.dcm")):
+            try:
+                ds = pydicom.dcmread(dcm_file, stop_before_pixels=True)
+                # For tiled whole-slide images Rows/Columns describe a single tile, which is
+                # usually equal on every pyramid level; TotalPixelMatrix* is the level extent.
+                rows = int(getattr(ds, "TotalPixelMatrixRows", 0) or getattr(ds, "Rows", 0) or 0)
+                cols = int(getattr(ds, "TotalPixelMatrixColumns", 0) or getattr(ds, "Columns", 0) or 0)
+                series_groups[ds.SeriesInstanceUID].append((dcm_file, rows, cols))
+            except Exception as e:
+                # Treat as standalone - don't exclude
+                logger.debug(f"Could not read DICOM {dcm_file}: {e}")
+
+        # For each series with multiple files, keep only the highest resolution one
+        files_to_exclude: set[Path] = set()
+        for series_uid, files_with_dims in series_groups.items():
+            if len(files_with_dims) < 2:
+                continue
+
+            # Largest pixel area (rows * cols) wins; max() keeps the first file on a tie.
+            file_to_keep, rows, cols = max(files_with_dims, key=lambda x: x[1] * x[2])
+            files_to_exclude.update(path for path, _, _ in files_with_dims if path != file_to_keep)
+            logger.debug(
+                f"DICOM series {series_uid}: keeping {file_to_keep.name} "
+                f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files"
+            )
+
+        return files_to_exclude
+
     @staticmethod
     def generate_metadata_from_source_directory(  # noqa: PLR0913, PLR0917
         source_directory: Path,
@@ -366,10 +398,17 @@ def generate_metadata_from_source_directory(  # noqa: PLR0913, PLR0917
 
         metadata = []
 
+        # Pre-filter: exclude redundant DICOM files from multi-file series
+        dicom_files_to_exclude = Service._filter_dicom_series_files(source_directory)
+
         try:
             extensions = get_supported_extensions_for_application(application_id)
             for extension in extensions:
                 for file_path in source_directory.glob(f"**/*{extension}"):
+                    # Skip excluded DICOM files
+                    if file_path in dicom_files_to_exclude:
+                        continue
+
                     # Generate CRC32C checksum with google_crc32c and encode as base64
                     hash_sum = google_crc32c.Checksum()  # type: ignore[no-untyped-call]
                     with file_path.open("rb") as f:
diff --git a/src/aignostics/wsi/_openslide_handler.py b/src/aignostics/wsi/_openslide_handler.py
@@ -1,12 +1,14 @@
 """Handler for wsi files using OpenSlide."""
 
+import re
 from pathlib import Path
 from typing import Any
 
 import defusedxml.ElementTree as ET  # noqa: N817
 import openslide
 from loguru import logger
 from openslide import ImageSlide, OpenSlide, open_slide
+from packaging import version
 from PIL.Image import Image
 
 TIFF_IMAGE_DESCRIPTION = "tiff.ImageDescription"
@@ -62,6 +64,34 @@ def _detect_format(self) -> str | None:
 
         return base_format
 
+    def _get_mpp_correction_factor(self, props: dict[str, Any]) -> float:
+        """Handle a scaling bug in libvips<8.8.3 for tiff files.
+
+        libvips<8.8.3 had a bug which wrote the tiff.XResolution as px / mm, but it should be
+        px / cm. Therefore, the resolution is 10x smaller than expected. To counteract, one has
+        to multiply the mpp with 0.1. Source: https://github.com/libvips/libvips/issues/1421
+
+        Args:
+            props: Slide properties as reported by OpenSlide.
+
+        Returns:
+            float: Correction factor (0.1 for buggy or undetectable versions, 1.0 otherwise).
+        """
+        legacy_mpp_factor = 1 / 10
+
+        # Slides without a TIFF resolution tag carry nothing to correct.
+        if "tiff.XResolution" not in props:
+            return 1.0
+
+        xml_string = props.get(TIFF_IMAGE_DESCRIPTION)
+        if not isinstance(xml_string, str):
+            return legacy_mpp_factor
+        # Match custom metadata for library version used during export
+        match = re.search(r"libVips-version.*?(\d+\.\d+\.\d+)", xml_string, re.DOTALL)
+        if match is not None and version.parse(match.group(1)) >= version.parse("8.8.3"):
+            # Bug-free libvips version was used during initial pyramid export
+            return 1.0
+        return legacy_mpp_factor
+
     def get_thumbnail(self) -> Image:
         """Get thumbnail of the slide.
 
@@ -122,7 +152,7 @@ def _parse_xml_image_description(self, xml_string: str) -> dict[str, Any]:  # no
         except ET.ParseError:
             return {}
 
-    def _get_level_info(self) -> list[dict[str, Any]]:
+    def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]:
         """Get detailed information for each level.
 
         Returns:
@@ -131,8 +161,9 @@ def _get_level_info(self) -> list[dict[str, Any]]:
         """
         levels = []
         props = dict(self.slide.properties)
-        base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0))
-        base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0))
+        mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0
+        base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor
+        base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor
 
         for level in range(self.slide.level_count):
             width, height = self.slide.level_dimensions[level]
@@ -178,6 +209,7 @@ def get_metadata(self) -> dict[str, Any]:
         props = dict(self.slide.properties)
         file_size = self.path.stat().st_size
         base_width, base_height = self.slide.dimensions
+        mpp_correction_factor = self._get_mpp_correction_factor(props)
 
         metadata = {
             "format": self._detect_format(),
@@ -188,8 +220,8 @@ def get_metadata(self) -> dict[str, Any]:
             },
             "dimensions": {"width": base_width, "height": base_height},
             "resolution": {
-                "mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)),
-                "mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)),
+                "mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor,
+                "mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor,
                 "unit": props.get("tiff.ResolutionUnit", "unknown"),
                 "x_resolution": float(props.get("tiff.XResolution", 0)),
                 "y_resolution": float(props.get("tiff.YResolution", 0)),
@@ -204,7 +236,7 @@ def get_metadata(self) -> dict[str, Any]:
                 "width": int(props.get("openslide.level[0].tile-width", 256)),
                 "height": int(props.get("openslide.level[0].tile-height", 256)),
             },
-            "levels": {"count": self.slide.level_count, "data": self._get_level_info()},
+            "levels": {"count": self.slide.level_count, "data": self._get_level_info(mpp_correction_factor)},
             "extra": ", ".join([
                 props.get("dicom.ImageType[0]", "0"),
                 props.get("dicom.ImageType[1]", "1"),