Skip to content

Commit 9e89446

Browse files
committed
Add mpp correction factor and highest res .dcm selection.
* Add an mpp correction factor for TIFFs written by older versions of libvips (< 8.8.3), which contained a bug that wrote the resolution as px / mm instead of px / cm. * Select only the highest-resolution file from a DICOM series that is spread across multiple files.
1 parent 2ba3248 commit 9e89446

File tree

2 files changed

+98
-12
lines changed

2 files changed

+98
-12
lines changed

src/aignostics/application/_service.py

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
import base64
44
import re
55
import time
6+
from collections import defaultdict
67
from collections.abc import Callable, Generator
78
from http import HTTPStatus
89
from importlib.util import find_spec
910
from pathlib import Path
1011
from typing import Any
1112

1213
import google_crc32c
14+
import pydicom
1315
import requests
1416
from loguru import logger
1517

1618
from aignostics.bucket import Service as BucketService
17-
from aignostics.constants import (
18-
TEST_APP_APPLICATION_ID,
19-
)
19+
from aignostics.constants import TEST_APP_APPLICATION_ID
2020
from aignostics.platform import (
2121
LIST_APPLICATION_RUNS_MAX_PAGE_SIZE,
2222
ApiException,
@@ -32,9 +32,7 @@
3232
RunOutput,
3333
RunState,
3434
)
35-
from aignostics.platform import (
36-
Service as PlatformService,
37-
)
35+
from aignostics.platform import Service as PlatformService
3836
from aignostics.utils import BaseService, Health, sanitize_path_component
3937
from aignostics.wsi import Service as WSIService
4038

@@ -312,6 +310,55 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None
312310
for key_value in key_value_pairs:
313311
Service._process_key_value_pair(entry, key_value, external_id)
314312

313+
@staticmethod
def _filter_dicom_series_files(source_directory: Path) -> set[Path]:
    """Determine which DICOM files of multi-file series are redundant.

    Files below ``source_directory`` ending in ``.dcm`` are grouped by their
    SeriesInstanceUID. For every series consisting of more than one file,
    only the file with the largest pixel area is kept; all other files of
    that series are reported for exclusion.
    OpenSlide will find other files in the same directory when needed.

    The pixel area is taken from TotalPixelMatrixRows / TotalPixelMatrixColumns
    when both are present. For tiled whole-slide images, Rows / Columns only
    describe the size of a single frame (tile), which is commonly identical
    across all pyramid levels and therefore cannot be used to rank them.
    Rows / Columns are used as a fallback for files without a total pixel matrix.

    Args:
        source_directory: The directory to scan (recursively).

    Returns:
        set[Path]: Set of DICOM files to exclude from processing. Files that
            cannot be read as DICOM are never part of this set.
    """
    series_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list)

    # Group by SeriesInstanceUID with dimensions. Sorting makes the choice
    # between files of equal size independent of file system ordering.
    for dcm_file in sorted(source_directory.glob("**/*.dcm")):
        try:
            ds = pydicom.dcmread(dcm_file, stop_before_pixels=True)
            series_uid = ds.SeriesInstanceUID
            total_rows = getattr(ds, "TotalPixelMatrixRows", None)
            total_cols = getattr(ds, "TotalPixelMatrixColumns", None)
            if total_rows and total_cols:
                # Full image extent of a tiled whole-slide image level
                rows, cols = int(total_rows), int(total_cols)
            else:
                # Not tiled (or attributes missing): frame size is the image size
                rows = int(getattr(ds, "Rows", 0))
                cols = int(getattr(ds, "Columns", 0))
        except Exception as e:
            # Best effort: treat unreadable files as standalone - don't exclude
            logger.debug(f"Could not read DICOM {dcm_file}: {e}")
            continue
        series_groups[series_uid].append((dcm_file, rows, cols))

    # For each series with multiple files, keep only the highest resolution one
    files_to_exclude: set[Path] = set()
    for series_uid, files_with_dims in series_groups.items():
        if len(files_with_dims) <= 1:
            continue

        # Find the file with the largest dimensions (rows * cols = total pixels)
        file_to_keep, rows, cols = max(files_with_dims, key=lambda x: x[1] * x[2])

        # Exclude all others
        files_to_exclude.update(
            file_path for file_path, _, _ in files_with_dims if file_path != file_to_keep
        )

        logger.debug(
            f"DICOM series {series_uid}: keeping {file_to_keep.name} "
            f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files"
        )

    return files_to_exclude
361+
315362
@staticmethod
316363
def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917
317364
source_directory: Path,
@@ -366,10 +413,17 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917
366413

367414
metadata = []
368415

416+
# Pre-filter: exclude redundant DICOM files from multi-file series
417+
dicom_files_to_exclude = Service._filter_dicom_series_files(source_directory)
418+
369419
try:
370420
extensions = get_supported_extensions_for_application(application_id)
371421
for extension in extensions:
372422
for file_path in source_directory.glob(f"**/*{extension}"):
423+
# Skip excluded DICOM files
424+
if file_path in dicom_files_to_exclude:
425+
continue
426+
373427
# Generate CRC32C checksum with google_crc32c and encode as base64
374428
hash_sum = google_crc32c.Checksum() # type: ignore[no-untyped-call]
375429
with file_path.open("rb") as f:

src/aignostics/wsi/_openslide_handler.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
"""Handler for wsi files using OpenSlide."""
22

3+
import re
34
from pathlib import Path
45
from typing import Any
56

67
import defusedxml.ElementTree as ET # noqa: N817
78
import openslide
89
from loguru import logger
910
from openslide import ImageSlide, OpenSlide, open_slide
11+
from packaging import version
1012
from PIL.Image import Image
1113

1214
TIFF_IMAGE_DESCRIPTION = "tiff.ImageDescription"
@@ -62,6 +64,34 @@ def _detect_format(self) -> str | None:
6264

6365
return base_format
6466

67+
@staticmethod
def _get_mpp_correction_factor(props: dict[str, Any]) -> float:
    """Handle a scaling bug in libvips<8.8.3 for tiff files.

    libvips<8.8.3 had a bug which wrote the tiff.XResolution as px / mm, but it should be
    px / cm. Therefore, the resolution is 10x smaller than expected. To counteract, one has
    to multiply the mpp with 0.1. Source: https://github.com/libvips/libvips/issues/1421

    The bug only concerns the TIFF resolution tag, so slides whose properties do not
    contain ``tiff.XResolution`` are never corrected. This keeps every caller consistent,
    whether or not it checks for the tag itself.

    Args:
        props: Slide properties as a plain dictionary.

    Returns:
        float: Correction factor. 1.0 if there is no ``tiff.XResolution`` property or if
            the image description names a libvips version >= 8.8.3; 0.1 if it names an
            older version or if no libvips version can be found in the image description.
    """
    LEGACY_MPP_FACTOR = 1 / 10  # noqa: N806

    # Without a TIFF resolution tag the libvips bug cannot have affected the slide.
    if "tiff.XResolution" not in props:
        return 1.0

    xml_string = props.get(TIFF_IMAGE_DESCRIPTION)
    if not isinstance(xml_string, str):
        # No usable image description: fall back to the legacy factor
        return LEGACY_MPP_FACTOR

    # Match custom metadata for library version used during export
    libvips_version_match = re.search(r"libVips-version.*?(\d+\.\d+\.\d+)", xml_string, re.DOTALL)
    if libvips_version_match is None:
        return LEGACY_MPP_FACTOR

    if version.parse(libvips_version_match.group(1)) >= version.parse("8.8.3"):
        # Bug-free libvips version was used during initial pyramid export
        return 1.0
    return LEGACY_MPP_FACTOR
94+
6595
def get_thumbnail(self) -> Image:
6696
"""Get thumbnail of the slide.
6797
@@ -122,7 +152,7 @@ def _parse_xml_image_description(self, xml_string: str) -> dict[str, Any]: # no
122152
except ET.ParseError:
123153
return {}
124154

125-
def _get_level_info(self) -> list[dict[str, Any]]:
155+
def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]:
126156
"""Get detailed information for each level.
127157
128158
Returns:
@@ -131,8 +161,9 @@ def _get_level_info(self) -> list[dict[str, Any]]:
131161
"""
132162
levels = []
133163
props = dict(self.slide.properties)
134-
base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0))
135-
base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0))
164+
mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0
165+
base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor
166+
base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor
136167

137168
for level in range(self.slide.level_count):
138169
width, height = self.slide.level_dimensions[level]
@@ -178,6 +209,7 @@ def get_metadata(self) -> dict[str, Any]:
178209
props = dict(self.slide.properties)
179210
file_size = self.path.stat().st_size
180211
base_width, base_height = self.slide.dimensions
212+
mpp_correction_factor = self._get_mpp_correction_factor(props)
181213

182214
metadata = {
183215
"format": self._detect_format(),
@@ -188,8 +220,8 @@ def get_metadata(self) -> dict[str, Any]:
188220
},
189221
"dimensions": {"width": base_width, "height": base_height},
190222
"resolution": {
191-
"mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)),
192-
"mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)),
223+
"mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor,
224+
"mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor,
193225
"unit": props.get("tiff.ResolutionUnit", "unknown"),
194226
"x_resolution": float(props.get("tiff.XResolution", 0)),
195227
"y_resolution": float(props.get("tiff.YResolution", 0)),
@@ -204,7 +236,7 @@ def get_metadata(self) -> dict[str, Any]:
204236
"width": int(props.get("openslide.level[0].tile-width", 256)),
205237
"height": int(props.get("openslide.level[0].tile-height", 256)),
206238
},
207-
"levels": {"count": self.slide.level_count, "data": self._get_level_info()},
239+
"levels": {"count": self.slide.level_count, "data": self._get_level_info(mpp_correction_factor)},
208240
"extra": ", ".join([
209241
props.get("dicom.ImageType[0]", "0"),
210242
props.get("dicom.ImageType[1]", "1"),

0 commit comments

Comments
 (0)