Skip to content

Commit a3597c5

Browse files
committed
Add mpp correction factor and highest res .dcm selection.
* Add an MPP correction factor for TIFFs written by older libvips versions (< 8.8.3), which contained a bug that wrote the resolution in px/mm instead of px/cm. * For a DICOM series stored across multiple files, select only the highest-resolution file.
1 parent 2ba3248 commit a3597c5

File tree

2 files changed

+110
-39
lines changed

2 files changed

+110
-39
lines changed

src/aignostics/application/_service.py

Lines changed: 72 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,55 +3,38 @@
33
import base64
44
import re
55
import time
6+
from collections import defaultdict
67
from collections.abc import Callable, Generator
78
from http import HTTPStatus
89
from importlib.util import find_spec
910
from pathlib import Path
1011
from typing import Any
1112

1213
import google_crc32c
14+
import pydicom
1315
import requests
1416
from loguru import logger
1517

1618
from aignostics.bucket import Service as BucketService
17-
from aignostics.constants import (
18-
TEST_APP_APPLICATION_ID,
19-
)
20-
from aignostics.platform import (
21-
LIST_APPLICATION_RUNS_MAX_PAGE_SIZE,
22-
ApiException,
23-
Application,
24-
ApplicationSummary,
25-
ApplicationVersion,
26-
Client,
27-
InputArtifact,
28-
InputItem,
29-
NotFoundException,
30-
Run,
31-
RunData,
32-
RunOutput,
33-
RunState,
34-
)
35-
from aignostics.platform import (
36-
Service as PlatformService,
37-
)
19+
from aignostics.constants import TEST_APP_APPLICATION_ID
20+
from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE,
21+
ApiException, Application, ApplicationSummary,
22+
ApplicationVersion, Client, InputArtifact,
23+
InputItem, NotFoundException, Run, RunData,
24+
RunOutput, RunState)
25+
from aignostics.platform import Service as PlatformService
3826
from aignostics.utils import BaseService, Health, sanitize_path_component
3927
from aignostics.wsi import Service as WSIService
4028

41-
from ._download import (
42-
download_available_items,
43-
download_url_to_file_with_progress,
44-
extract_filename_from_url,
45-
update_progress,
46-
)
29+
from ._download import (download_available_items,
30+
download_url_to_file_with_progress,
31+
extract_filename_from_url, update_progress)
4732
from ._models import DownloadProgress, DownloadProgressState
4833
from ._settings import Settings
49-
from ._utils import (
50-
get_mime_type_for_artifact,
51-
get_supported_extensions_for_application,
52-
is_not_terminated_with_deadline_exceeded,
53-
validate_due_date,
54-
)
34+
from ._utils import (get_mime_type_for_artifact,
35+
get_supported_extensions_for_application,
36+
is_not_terminated_with_deadline_exceeded,
37+
validate_due_date)
5538

5639
has_qupath_extra = find_spec("ijson")
5740
if has_qupath_extra:
@@ -312,6 +295,55 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None
312295
for key_value in key_value_pairs:
313296
Service._process_key_value_pair(entry, key_value, external_id)
314297

298+
@staticmethod
def _filter_dicom_series_files(source_directory: Path) -> set[Path]:
    """Determine which DICOM files of multi-file series are redundant.

    Files are grouped by ``SeriesInstanceUID``. For every series that consists
    of more than one file, the file with the largest pixel area is kept and
    all other files of that series are marked for exclusion.

    Files that cannot be read as DICOM (or lack a ``SeriesInstanceUID``) are
    never excluded; the failure is logged at debug level.

    Args:
        source_directory: The directory to scan recursively for ``*.dcm`` files.

    Returns:
        set[Path]: Set of DICOM files to exclude from processing.
    """
    # Sorted so that ties between equally sized files resolve deterministically
    # instead of depending on filesystem iteration order.
    dicom_files = sorted(source_directory.glob("**/*.dcm"))
    series_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list)

    # Group by SeriesInstanceUID with dimensions
    for dcm_file in dicom_files:
        try:
            ds = pydicom.dcmread(dcm_file, stop_before_pixels=True)
            series_uid = ds.SeriesInstanceUID
            # In tiled whole-slide DICOM, Rows/Columns describe a single frame
            # (tile), which is usually identical across pyramid levels. The size
            # of the whole level is TotalPixelMatrixRows/Columns, so prefer it
            # and only fall back to Rows/Columns when it is absent.
            rows = int(getattr(ds, "TotalPixelMatrixRows", None) or getattr(ds, "Rows", 0) or 0)
            cols = int(getattr(ds, "TotalPixelMatrixColumns", None) or getattr(ds, "Columns", 0) or 0)
            series_groups[series_uid].append((dcm_file, rows, cols))
        except Exception as e:
            # Deliberate best-effort: an unreadable file is treated as standalone
            # and therefore not excluded.
            logger.debug(f"Could not read DICOM {dcm_file}: {e}")

    # For each series with multiple files, keep only the highest resolution one
    files_to_exclude: set[Path] = set()
    for series_uid, files_with_dims in series_groups.items():
        if len(files_with_dims) < 2:
            continue

        # Largest pixel area (rows * cols) wins.
        file_to_keep, rows, cols = max(files_with_dims, key=lambda item: item[1] * item[2])
        files_to_exclude.update(path for path, _, _ in files_with_dims if path != file_to_keep)

        logger.debug(
            f"DICOM series {series_uid}: keeping {file_to_keep.name} "
            f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files"
        )

    return files_to_exclude
346+
315347
@staticmethod
316348
def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917
317349
source_directory: Path,
@@ -366,10 +398,17 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917
366398

367399
metadata = []
368400

401+
# Pre-filter: exclude redundant DICOM files from multi-file series
402+
dicom_files_to_exclude = Service._filter_dicom_series_files(source_directory)
403+
369404
try:
370405
extensions = get_supported_extensions_for_application(application_id)
371406
for extension in extensions:
372407
for file_path in source_directory.glob(f"**/*{extension}"):
408+
# Skip excluded DICOM files
409+
if file_path in dicom_files_to_exclude:
410+
continue
411+
373412
# Generate CRC32C checksum with google_crc32c and encode as base64
374413
hash_sum = google_crc32c.Checksum() # type: ignore[no-untyped-call]
375414
with file_path.open("rb") as f:

src/aignostics/wsi/_openslide_handler.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
"""Handler for wsi files using OpenSlide."""
22

3+
import re
34
from pathlib import Path
45
from typing import Any
56

67
import defusedxml.ElementTree as ET # noqa: N817
78
import openslide
89
from loguru import logger
910
from openslide import ImageSlide, OpenSlide, open_slide
11+
from packaging import version
1012
from PIL.Image import Image
1113

1214
TIFF_IMAGE_DESCRIPTION = "tiff.ImageDescription"
@@ -62,6 +64,34 @@ def _detect_format(self) -> str | None:
6264

6365
return base_format
6466

67+
def _get_mpp_correction_factor(self, props: dict[str, Any]) -> float:
    """Handle a scaling bug in libvips<8.8.3 for tiff files.

    libvips<8.8.3 had a bug which wrote the tiff.XResolution as px / mm, but it should be
    px / cm. Therefore, the resolution is 10x smaller than expected. To counteract, one has
    to multiply the mpp with 0.1. Source: https://github.com/libvips/libvips/issues/1421

    Args:
        props: Slide properties. Only the ``tiff.XResolution`` key (presence) and the
            TIFF image description are consulted.

    Returns:
        float: Correction factor. 1.0 when the slide carries no ``tiff.XResolution``
            property (the bug cannot apply) or when the image description reports
            libvips >= 8.8.3; 0.1 otherwise, including when no libvips version can be
            found in the image description.
    """
    legacy_mpp_factor = 0.1

    # The bug only concerns the TIFF resolution tag. Slides without it (e.g. non-TIFF
    # formats) must not be rescaled; this mirrors the guard used when computing the
    # per-level MPP values so that both call sites agree.
    if "tiff.XResolution" not in props:
        return 1.0

    xml_string = props.get(TIFF_IMAGE_DESCRIPTION)
    if not isinstance(xml_string, str):
        return legacy_mpp_factor

    # Match custom metadata for library version used during export
    libvips_version_match = re.search(r"libVips-version.*?(\d+\.\d+\.\d+)", xml_string, re.DOTALL)
    if libvips_version_match is None:
        return legacy_mpp_factor

    if version.parse(libvips_version_match.group(1)) >= version.parse("8.8.3"):
        # Bug-free libvips version was used during initial pyramid export
        return 1.0
    return legacy_mpp_factor
94+
6595
def get_thumbnail(self) -> Image:
6696
"""Get thumbnail of the slide.
6797
@@ -122,7 +152,7 @@ def _parse_xml_image_description(self, xml_string: str) -> dict[str, Any]: # no
122152
except ET.ParseError:
123153
return {}
124154

125-
def _get_level_info(self) -> list[dict[str, Any]]:
155+
def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]:
126156
"""Get detailed information for each level.
127157
128158
Returns:
@@ -131,8 +161,9 @@ def _get_level_info(self) -> list[dict[str, Any]]:
131161
"""
132162
levels = []
133163
props = dict(self.slide.properties)
134-
base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0))
135-
base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0))
164+
mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0
165+
base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor
166+
base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor
136167

137168
for level in range(self.slide.level_count):
138169
width, height = self.slide.level_dimensions[level]
@@ -178,6 +209,7 @@ def get_metadata(self) -> dict[str, Any]:
178209
props = dict(self.slide.properties)
179210
file_size = self.path.stat().st_size
180211
base_width, base_height = self.slide.dimensions
212+
mpp_correction_factor = self._get_mpp_correction_factor(props)
181213

182214
metadata = {
183215
"format": self._detect_format(),
@@ -188,8 +220,8 @@ def get_metadata(self) -> dict[str, Any]:
188220
},
189221
"dimensions": {"width": base_width, "height": base_height},
190222
"resolution": {
191-
"mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)),
192-
"mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)),
223+
"mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor,
224+
"mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor,
193225
"unit": props.get("tiff.ResolutionUnit", "unknown"),
194226
"x_resolution": float(props.get("tiff.XResolution", 0)),
195227
"y_resolution": float(props.get("tiff.YResolution", 0)),
@@ -204,7 +236,7 @@ def get_metadata(self) -> dict[str, Any]:
204236
"width": int(props.get("openslide.level[0].tile-width", 256)),
205237
"height": int(props.get("openslide.level[0].tile-height", 256)),
206238
},
207-
"levels": {"count": self.slide.level_count, "data": self._get_level_info()},
239+
"levels": {"count": self.slide.level_count, "data": self._get_level_info(mpp_correction_factor)},
208240
"extra": ", ".join([
209241
props.get("dicom.ImageType[0]", "0"),
210242
props.get("dicom.ImageType[1]", "1"),

0 commit comments

Comments
 (0)