From 9e89446b06e00e0ff6598eff34e0ccedb83bea32 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Tue, 21 Oct 2025 22:04:45 +0200 Subject: [PATCH 01/18] Add mpp correction factor and highest res .dcm selection. * Add mpp correction factor for tiffs written using older versions of vips containing a bug that wrote resolution as px / mm instead of cm. * Select only highest resolution file from DICOM series in multiple files. --- src/aignostics/application/_service.py | 66 +++++++++++++++++++++--- src/aignostics/wsi/_openslide_handler.py | 44 +++++++++++++--- 2 files changed, 98 insertions(+), 12 deletions(-) diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index a2bd25272..ee5d11a74 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -3,6 +3,7 @@ import base64 import re import time +from collections import defaultdict from collections.abc import Callable, Generator from http import HTTPStatus from importlib.util import find_spec @@ -10,13 +11,12 @@ from typing import Any import google_crc32c +import pydicom import requests from loguru import logger from aignostics.bucket import Service as BucketService -from aignostics.constants import ( - TEST_APP_APPLICATION_ID, -) +from aignostics.constants import TEST_APP_APPLICATION_ID from aignostics.platform import ( LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, ApiException, @@ -32,9 +32,7 @@ RunOutput, RunState, ) -from aignostics.platform import ( - Service as PlatformService, -) +from aignostics.platform import Service as PlatformService from aignostics.utils import BaseService, Health, sanitize_path_component from aignostics.wsi import Service as WSIService @@ -312,6 +310,55 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None for key_value in key_value_pairs: Service._process_key_value_pair(entry, key_value, external_id) + @staticmethod + def _filter_dicom_series_files(source_directory: Path) -> set[Path]: + 
"""Filter DICOM files to keep only one representative per series. + + For multi-file DICOM series, keeps only the highest resolution file. + OpenSlide will find other files in the same directory when needed. + + Args: + source_directory: The directory to scan. + + Returns: + set[Path]: Set of DICOM files to exclude from processing. + """ + dicom_files = list(source_directory.glob("**/*.dcm")) + series_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) + + # Group by SeriesInstanceUID with dimensions + for dcm_file in dicom_files: + try: + ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) + series_uid = ds.SeriesInstanceUID + # Get dimensions (Rows and Columns from DICOM) + rows = int(getattr(ds, "Rows", 0)) + cols = int(getattr(ds, "Columns", 0)) + series_groups[series_uid].append((dcm_file, rows, cols)) + except Exception as e: + logger.debug(f"Could not read DICOM {dcm_file}: {e}") + # Treat as standalone - don't exclude + + # For each series with multiple files, keep only the highest resolution one + files_to_exclude = set() + for series_uid, files_with_dims in series_groups.items(): + if len(files_with_dims) > 1: + # Find the file with the largest dimensions (rows * cols = total pixels) + highest_res_file = max(files_with_dims, key=lambda x: x[1] * x[2]) + file_to_keep, rows, cols = highest_res_file + + # Exclude all others + for file_path, _, _ in files_with_dims: + if file_path != file_to_keep: + files_to_exclude.add(file_path) + + logger.debug( + f"DICOM series {series_uid}: keeping {file_to_keep.name} " + f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" + ) + + return files_to_exclude + @staticmethod def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 source_directory: Path, @@ -366,10 +413,17 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 metadata = [] + # Pre-filter: exclude redundant DICOM files from multi-file series + dicom_files_to_exclude = 
Service._filter_dicom_series_files(source_directory) + try: extensions = get_supported_extensions_for_application(application_id) for extension in extensions: for file_path in source_directory.glob(f"**/*{extension}"): + # Skip excluded DICOM files + if file_path in dicom_files_to_exclude: + continue + # Generate CRC32C checksum with google_crc32c and encode as base64 hash_sum = google_crc32c.Checksum() # type: ignore[no-untyped-call] with file_path.open("rb") as f: diff --git a/src/aignostics/wsi/_openslide_handler.py b/src/aignostics/wsi/_openslide_handler.py index 46c74259a..5333ed2f1 100644 --- a/src/aignostics/wsi/_openslide_handler.py +++ b/src/aignostics/wsi/_openslide_handler.py @@ -1,5 +1,6 @@ """Handler for wsi files using OpenSlide.""" +import re from pathlib import Path from typing import Any @@ -7,6 +8,7 @@ import openslide from loguru import logger from openslide import ImageSlide, OpenSlide, open_slide +from packaging import version from PIL.Image import Image TIFF_IMAGE_DESCRIPTION = "tiff.ImageDescription" @@ -62,6 +64,34 @@ def _detect_format(self) -> str | None: return base_format + @staticmethod + def _get_mpp_correction_factor(props: dict[str, Any]) -> float: + """Handle a scaling bug in libvips<8.8.3 for tiff files. + + libvips<8.8.3 had a bug which wrote the tiff.XResolution as px / mm, but it should be + px / cm. Therefore, the resolution is 10x smaller than expected. To counteract, one has + to multiply the mpp with 0.1. Source: https://github.com/libvips/libvips/issues/1421 + + Returns: + float: Correction factor (0.1 for buggy versions, 1.0 otherwise). 
+ """ + LEGACY_MPP_FACTOR = 1 / 10 # noqa: N806 + + try: + xml_string = props[TIFF_IMAGE_DESCRIPTION] + + # Match custom metadata for library version used during export + libvips_version_match = re.findall(r"libVips-version.*?(\d+\.\d+\.\d+)", xml_string, re.DOTALL) + if not libvips_version_match: + return LEGACY_MPP_FACTOR + + if version.parse(libvips_version_match[0]) >= version.parse("8.8.3"): + # Bug-free libvips version was used during initial pyramid export + return 1.0 + return LEGACY_MPP_FACTOR + except Exception: + return LEGACY_MPP_FACTOR + def get_thumbnail(self) -> Image: """Get thumbnail of the slide. @@ -122,7 +152,7 @@ def _parse_xml_image_description(self, xml_string: str) -> dict[str, Any]: # no except ET.ParseError: return {} - def _get_level_info(self) -> list[dict[str, Any]]: + def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]: """Get detailed information for each level. Returns: @@ -131,8 +161,9 @@ def _get_level_info(self) -> list[dict[str, Any]]: """ levels = [] props = dict(self.slide.properties) - base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) - base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) + mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0 + base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor + base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor for level in range(self.slide.level_count): width, height = self.slide.level_dimensions[level] @@ -178,6 +209,7 @@ def get_metadata(self) -> dict[str, Any]: props = dict(self.slide.properties) file_size = self.path.stat().st_size base_width, base_height = self.slide.dimensions + mpp_correction_factor = self._get_mpp_correction_factor(props) metadata = { "format": self._detect_format(), @@ -188,8 +220,8 @@ def get_metadata(self) -> dict[str, Any]: }, "dimensions": {"width": base_width, "height": base_height}, 
"resolution": { - "mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)), - "mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)), + "mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor, + "mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor, "unit": props.get("tiff.ResolutionUnit", "unknown"), "x_resolution": float(props.get("tiff.XResolution", 0)), "y_resolution": float(props.get("tiff.YResolution", 0)), @@ -204,7 +236,7 @@ def get_metadata(self) -> dict[str, Any]: "width": int(props.get("openslide.level[0].tile-width", 256)), "height": int(props.get("openslide.level[0].tile-height", 256)), }, - "levels": {"count": self.slide.level_count, "data": self._get_level_info()}, + "levels": {"count": self.slide.level_count, "data": self._get_level_info(mpp_correction_factor)}, "extra": ", ".join([ props.get("dicom.ImageType[0]", "0"), props.get("dicom.ImageType[1]", "1"), From 55c4343e46b01da72412aad943bdbcfbf0f0530e Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Thu, 20 Nov 2025 09:24:30 +0100 Subject: [PATCH 02/18] Update wsi info command in README --- src/aignostics/CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aignostics/CLAUDE.md b/src/aignostics/CLAUDE.md index cb7974053..d08c02342 100644 --- a/src/aignostics/CLAUDE.md +++ b/src/aignostics/CLAUDE.md @@ -246,7 +246,7 @@ aignostics application run submit --application-id heta --files slide.svs aignostics dataset download --collection-id TCGA-LUAD --output-dir ./data # Get WSI info -aignostics wsi info slide.svs +aignostics wsi inspect slide.svs ``` ## GUI Launch From d13b8a1f099f3ec723ebe38b709e3c9fdad6f5d8 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Fri, 28 Nov 2025 17:29:53 +0100 Subject: [PATCH 03/18] Add tests for DCM pyramid selection --- tests/aignostics/application/service_test.py | 112 +++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git 
a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index 17653e9de..fb6ec1918 100644 --- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -1,8 +1,11 @@ """Tests to verify the service functionality of the application module.""" +from collections.abc import Callable from datetime import UTC, datetime, timedelta +from pathlib import Path from unittest.mock import MagicMock, patch +import pydicom import pytest from typer.testing import CliRunner @@ -439,3 +442,112 @@ def test_application_run_update_item_custom_metadata_not_found(mock_get_client: with pytest.raises(NotFoundException, match="not found"): service.application_run_update_item_custom_metadata("run-123", "invalid-item-id", {"key": "value"}) + + +@pytest.fixture +def create_dicom() -> Callable[..., pydicom.Dataset]: + """Fixture that returns a function to create minimal but valid DICOM datasets.""" + + def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: + """Create a minimal but valid DICOM dataset. 
+ + Args: + series_uid: The series instance UID + rows: Number of rows (height) + cols: Number of columns (width) + + Returns: + A valid pydicom Dataset + """ + ds = pydicom.Dataset() + + # File Meta Information + ds.file_meta = pydicom.Dataset() + ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian + ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.77.1.6" # VL Whole Slide Microscopy + ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() + + # Required DICOM attributes + ds.SeriesInstanceUID = series_uid + ds.SOPInstanceUID = pydicom.uid.generate_uid() + ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID + ds.StudyInstanceUID = pydicom.uid.generate_uid() + ds.Modality = "SM" + ds.Rows = rows + ds.Columns = cols + + return ds + + return _create_dicom + + +@pytest.mark.unit +def test_filter_dicom_series_files_single_file( + tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset], +) -> None: + """Test that single DICOM files are not filtered.""" + ds = create_dicom("1.2.3.4.5", 1024, 1024) + dcm_file = tmp_path / "test.dcm" + ds.save_as(dcm_file, write_like_original=False) + + excluded = ApplicationService._filter_dicom_series_files(tmp_path) + assert len(excluded) == 0 + + +@pytest.mark.unit +def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: + """Test that for multi-file DICOM series, only the highest resolution file is kept.""" + series_uid = "1.2.3.4.5" + + # Create low resolution DICOM file + ds_low = create_dicom(series_uid, 512, 512) + dcm_file_low = tmp_path / "test_low.dcm" + ds_low.save_as(dcm_file_low, write_like_original=False) + + # Create medium resolution DICOM file + ds_med = create_dicom(series_uid, 1024, 1024) + dcm_file_med = tmp_path / "test_med.dcm" + ds_med.save_as(dcm_file_med, write_like_original=False) + + # Create high resolution DICOM file + ds_high = create_dicom(series_uid, 2048, 2048) + dcm_file_high = tmp_path / 
"test_high.dcm" + ds_high.save_as(dcm_file_high, write_like_original=False) + + # Filter the series + excluded = ApplicationService._filter_dicom_series_files(tmp_path) + + # Should exclude 2 files (low and medium), keeping only the highest resolution + assert len(excluded) == 2 + assert dcm_file_low in excluded + assert dcm_file_med in excluded + assert dcm_file_high not in excluded + + +@pytest.mark.unit +def test_filter_dicom_series_files_multiple_series( + tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset] +) -> None: + """Test that files from different series are not filtered against each other.""" + # Series 1 - two files + ds1_low = create_dicom("1.2.3.4.5", 512, 512) + dcm_file1_low = tmp_path / "series1_low.dcm" + ds1_low.save_as(dcm_file1_low, write_like_original=False) + + ds1_high = create_dicom("1.2.3.4.5", 1024, 1024) + dcm_file1_high = tmp_path / "series1_high.dcm" + ds1_high.save_as(dcm_file1_high, write_like_original=False) + + # Series 2 - single file (should not be filtered) + ds2 = create_dicom("6.7.8.9.0", 512, 512) + dcm_file2 = tmp_path / "series2.dcm" + ds2.save_as(dcm_file2, write_like_original=False) + + excluded = ApplicationService._filter_dicom_series_files(tmp_path) + + # Should exclude only the low-res file from series 1 + assert len(excluded) == 1 + assert dcm_file1_low in excluded + assert dcm_file1_high not in excluded + assert dcm_file2 not in excluded From ed6d28e523e922834a3d9682bb8740d870f91250 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Sat, 29 Nov 2025 14:32:04 +0100 Subject: [PATCH 04/18] Fix mpp factor being applied to DICOM --- src/aignostics/wsi/_openslide_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aignostics/wsi/_openslide_handler.py b/src/aignostics/wsi/_openslide_handler.py index ffc31e109..8514b2f0c 100644 --- a/src/aignostics/wsi/_openslide_handler.py +++ b/src/aignostics/wsi/_openslide_handler.py @@ -229,7 +229,7 @@ def get_metadata(self) -> dict[str, Any]: 
props = dict(self.slide.properties) file_size = self.path.stat().st_size base_width, base_height = self.slide.dimensions - mpp_correction_factor = self._get_mpp_correction_factor(props) + mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0 metadata = { "format": self._detect_format(), From fc728b58b6dcdfe3ae76b22cd9b9d5e6c26ba721 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Sat, 29 Nov 2025 14:37:12 +0100 Subject: [PATCH 05/18] Remove line recalculating mpp factor --- src/aignostics/wsi/_openslide_handler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aignostics/wsi/_openslide_handler.py b/src/aignostics/wsi/_openslide_handler.py index 8514b2f0c..f203d1e72 100644 --- a/src/aignostics/wsi/_openslide_handler.py +++ b/src/aignostics/wsi/_openslide_handler.py @@ -181,7 +181,6 @@ def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]: """ levels = [] props = dict(self.slide.properties) - mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0 base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor From 477333dd63db426c519bf8806d69a7ec35ef5750 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Sat, 29 Nov 2025 14:41:32 +0100 Subject: [PATCH 06/18] Make variable lowercase to pass ruff --- src/aignostics/wsi/_openslide_handler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/aignostics/wsi/_openslide_handler.py b/src/aignostics/wsi/_openslide_handler.py index f203d1e72..6793b0fcc 100644 --- a/src/aignostics/wsi/_openslide_handler.py +++ b/src/aignostics/wsi/_openslide_handler.py @@ -76,7 +76,7 @@ def _get_mpp_correction_factor(props: dict[str, Any]) -> float: Returns: float: Correction factor (0.1 for buggy versions, 1.0 otherwise). 
""" - LEGACY_MPP_FACTOR = 1 / 10 # noqa: N806 + legacy_mpp_factor = 1 / 10 try: xml_string = props[TIFF_IMAGE_DESCRIPTION] @@ -84,14 +84,14 @@ def _get_mpp_correction_factor(props: dict[str, Any]) -> float: # Match custom metadata for library version used during export libvips_version_match = re.findall(r"libVips-version.*?(\d+\.\d+\.\d+)", xml_string, re.DOTALL) if not libvips_version_match: - return LEGACY_MPP_FACTOR + return legacy_mpp_factor if version.parse(libvips_version_match[0]) >= version.parse("8.8.3"): # Bug-free libvips version was used during initial pyramid export return 1.0 - return LEGACY_MPP_FACTOR + return legacy_mpp_factor except Exception: - return LEGACY_MPP_FACTOR + return legacy_mpp_factor def get_thumbnail(self, max_safe_dimension: int = DEFAULT_MAX_SAFE_DIMENSION) -> Image: """Get thumbnail of the slide. From 9d08efda15d9a1e2bcaec25018d130cc635bf80b Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Sat, 29 Nov 2025 14:51:46 +0100 Subject: [PATCH 07/18] docs(wsi): Update with DICOM filtering logic. --- src/aignostics/application/CLAUDE.md | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/aignostics/application/CLAUDE.md b/src/aignostics/application/CLAUDE.md index 3f260066e..4981cf7b9 100644 --- a/src/aignostics/application/CLAUDE.md +++ b/src/aignostics/application/CLAUDE.md @@ -163,6 +163,38 @@ APPLICATION_RUN_UPLOAD_CHUNK_SIZE = 1024 * 1024 # 1MB APPLICATION_RUN_DOWNLOAD_SLEEP_SECONDS = 5 # Wait between status checks ``` +### DICOM Series Handling + +**Multi-File DICOM Series Filtering (`_service.py`):** + +The service automatically handles multi-file DICOM series (e.g., whole slide images stored as multiple DICOM files) by selecting only the highest resolution file from each series. This prevents redundant processing since OpenSlide can automatically find related files in the same directory. 
+```python +@staticmethod +def _filter_dicom_series_files(source_directory: Path) -> set[Path]: + """Filter DICOM files to keep only one representative per series. + + For multi-file DICOM series, keeps only the highest resolution file. + OpenSlide will find other files in the same directory when needed. + + Implementation: + - Groups DICOM files by SeriesInstanceUID + - For each series with multiple files, selects file with largest dimensions (rows * cols) + - Returns set of files to exclude from processing + - Standalone DICOM files (not part of multi-file series) are never excluded + + Used automatically in: generate_metadata_from_source_directory() + """ +``` + +**Key Behaviors:** + +- Files are grouped by `SeriesInstanceUID` DICOM tag +- Highest resolution determined by `Rows × Columns` (total pixel count) +- Excluded files are logged at DEBUG level with series details +- Filtering occurs before metadata generation, checksum calculation, or upload +- Only affects DICOM files (`.dcm` extension); other formats unaffected + + ### Progress State Management **Actual DownloadProgress Model (`_models.py`):** From e03e72209a2cbfdd8ae3674907f467c3946a4c2b Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Mon, 1 Dec 2025 15:15:41 +0100 Subject: [PATCH 08/18] fix(application): Fix wrong image size in multi-file DICOM filtering. 
--- src/aignostics/application/_service.py | 50 ++++++-------- tests/aignostics/application/service_test.py | 70 +++++++++++++++----- 2 files changed, 73 insertions(+), 47 deletions(-) diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index 4b8ecf6bb..b80fc6ac7 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -17,39 +17,24 @@ from aignostics.bucket import Service as BucketService from aignostics.constants import TEST_APP_APPLICATION_ID -from aignostics.platform import ( - LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, - ApiException, - Application, - ApplicationSummary, - ApplicationVersion, - Client, - InputArtifact, - InputItem, - NotFoundException, - Run, - RunData, - RunOutput, - RunState, -) +from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, + ApiException, Application, ApplicationSummary, + ApplicationVersion, Client, InputArtifact, + InputItem, NotFoundException, Run, RunData, + RunOutput, RunState) from aignostics.platform import Service as PlatformService from aignostics.utils import BaseService, Health, sanitize_path_component from aignostics.wsi import Service as WSIService -from ._download import ( - download_available_items, - download_url_to_file_with_progress, - extract_filename_from_url, - update_progress, -) +from ._download import (download_available_items, + download_url_to_file_with_progress, + extract_filename_from_url, update_progress) from ._models import DownloadProgress, DownloadProgressState from ._settings import Settings -from ._utils import ( - get_mime_type_for_artifact, - get_supported_extensions_for_application, - is_not_terminated_with_deadline_exceeded, - validate_due_date, -) +from ._utils import (get_mime_type_for_artifact, + get_supported_extensions_for_application, + is_not_terminated_with_deadline_exceeded, + validate_due_date) has_qupath_extra = find_spec("ijson") if has_qupath_extra: @@ -331,9 +316,11 @@ def 
_filter_dicom_series_files(source_directory: Path) -> set[Path]: try: ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) series_uid = ds.SeriesInstanceUID - # Get dimensions (Rows and Columns from DICOM) - rows = int(getattr(ds, "Rows", 0)) - cols = int(getattr(ds, "Columns", 0)) + + # These represent the full image dimensions across all frames + rows = int(ds.TotalPixelMatrixRows) + cols = int(ds.TotalPixelMatrixColumns) + series_groups[series_uid].append((dcm_file, rows, cols)) except Exception as e: logger.debug(f"Could not read DICOM {dcm_file}: {e}") @@ -1118,7 +1105,8 @@ def application_run_submit( # noqa: PLR0913, PLR0917, PLR0912, C901, PLR0915 # Validate pipeline configuration if present if "pipeline" in sdk_metadata: - from aignostics.platform._sdk_metadata import PipelineConfig # noqa: PLC0415 + from aignostics.platform._sdk_metadata import \ + PipelineConfig # noqa: PLC0415 try: PipelineConfig.model_validate(sdk_metadata["pipeline"]) diff --git a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index fb6ec1918..da4c7ccf4 100644 --- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -13,10 +13,8 @@ from aignostics.application._utils import validate_due_date from aignostics.platform import NotFoundException, RunData, RunOutput from tests.constants_test import ( - HETA_APPLICATION_ID, - HETA_APPLICATION_VERSION, - TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP, -) + HETA_APPLICATION_ID, HETA_APPLICATION_VERSION, + TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP) @pytest.mark.unit @@ -449,15 +447,15 @@ def create_dicom() -> Callable[..., pydicom.Dataset]: """Fixture that returns a function to create minimal but valid DICOM datasets.""" def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: - """Create a minimal but valid DICOM dataset. + """Create a minimal but valid DICOM dataset for WSI. 
Args: series_uid: The series instance UID - rows: Number of rows (height) - cols: Number of columns (width) + rows: Total image rows (TotalPixelMatrixRows) for the full WSI + cols: Total image columns (TotalPixelMatrixColumns) for the full WSI Returns: - A valid pydicom Dataset + A valid pydicom Dataset for whole slide imaging """ ds = pydicom.Dataset() @@ -473,8 +471,15 @@ def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID ds.StudyInstanceUID = pydicom.uid.generate_uid() ds.Modality = "SM" - ds.Rows = rows - ds.Columns = cols + + # Tile dimensions (typically 256x256 for WSI) + ds.Rows = 256 + ds.Columns = 256 + + # CRITICAL: Total image dimensions for whole slide imaging + # These represent the full image size and are what differentiate pyramid levels + ds.TotalPixelMatrixRows = rows + ds.TotalPixelMatrixColumns = cols return ds @@ -496,11 +501,14 @@ def test_filter_dicom_series_files_single_file( @pytest.mark.unit -def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: +def test_filter_dicom_series_files_pyramid( + tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset] +) -> None: """Test that for multi-file DICOM series, only the highest resolution file is kept.""" series_uid = "1.2.3.4.5" - # Create low resolution DICOM file + # Create low resolution DICOM file (smallest pyramid level) ds_low = create_dicom(series_uid, 512, 512) dcm_file_low = tmp_path / "test_low.dcm" ds_low.save_as(dcm_file_low, write_like_original=False) @@ -510,7 +518,7 @@ def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callabl dcm_file_med = tmp_path / "test_med.dcm" ds_med.save_as(dcm_file_med, write_like_original=False) - # Create high resolution DICOM file + # Create high resolution DICOM file (base layer - highest resolution) ds_high = create_dicom(series_uid, 2048, 2048) dcm_file_high = tmp_path / "test_high.dcm" 
ds_high.save_as(dcm_file_high, write_like_original=False) @@ -527,10 +535,11 @@ def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callabl @pytest.mark.unit def test_filter_dicom_series_files_multiple_series( - tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset] + tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset] ) -> None: """Test that files from different series are not filtered against each other.""" - # Series 1 - two files + # Series 1 - two files (pyramid with 2 levels) ds1_low = create_dicom("1.2.3.4.5", 512, 512) dcm_file1_low = tmp_path / "series1_low.dcm" ds1_low.save_as(dcm_file1_low, write_like_original=False) @@ -539,7 +548,7 @@ def test_filter_dicom_series_files_multiple_series( dcm_file1_high = tmp_path / "series1_high.dcm" ds1_high.save_as(dcm_file1_high, write_like_original=False) - # Series 2 - single file (should not be filtered) + # Series 2 - single file (standalone, no pyramid) ds2 = create_dicom("6.7.8.9.0", 512, 512) dcm_file2 = tmp_path / "series2.dcm" ds2.save_as(dcm_file2, write_like_original=False) @@ -551,3 +560,32 @@ def test_filter_dicom_series_files_multiple_series( assert dcm_file1_low in excluded assert dcm_file1_high not in excluded assert dcm_file2 not in excluded + + +@pytest.mark.unit +def test_filter_dicom_series_files_missing_wsi_attributes( + tmp_path: Path, +) -> None: + """Test that DICOM files without WSI-specific attributes are skipped.""" + # Create a non-WSI DICOM (missing TotalPixelMatrix attributes) + ds = pydicom.Dataset() + ds.file_meta = pydicom.Dataset() + ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian + ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" # CT Image Storage + ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() + + ds.SeriesInstanceUID = "1.2.3.4.5" + ds.SOPInstanceUID = pydicom.uid.generate_uid() + ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID + ds.StudyInstanceUID = pydicom.uid.generate_uid() 
+ ds.Modality = "CT" + ds.Rows = 512 + ds.Columns = 512 + # Note: No TotalPixelMatrixRows/Columns + + dcm_file = tmp_path / "non_wsi.dcm" + ds.save_as(dcm_file, write_like_original=False) + + # Should not crash, and should not exclude anything (file is skipped) + excluded = ApplicationService._filter_dicom_series_files(tmp_path) + assert len(excluded) == 0 From d17fdcd5a93746e1c6f30f08ba36230445a7e9f9 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Mon, 1 Dec 2025 15:16:39 +0100 Subject: [PATCH 09/18] fix(wsi): Revert changes to apply MPP factor, not in scope. --- src/aignostics/wsi/_openslide_handler.py | 43 ++++-------------------- 1 file changed, 6 insertions(+), 37 deletions(-) diff --git a/src/aignostics/wsi/_openslide_handler.py b/src/aignostics/wsi/_openslide_handler.py index 6793b0fcc..f118f6c96 100644 --- a/src/aignostics/wsi/_openslide_handler.py +++ b/src/aignostics/wsi/_openslide_handler.py @@ -1,6 +1,5 @@ """Handler for wsi files using OpenSlide.""" -import re from pathlib import Path from typing import Any @@ -8,7 +7,6 @@ import openslide from loguru import logger from openslide import ImageSlide, OpenSlide, open_slide -from packaging import version from PIL.Image import Image TIFF_IMAGE_DESCRIPTION = "tiff.ImageDescription" @@ -65,34 +63,6 @@ def _detect_format(self) -> str | None: return base_format - @staticmethod - def _get_mpp_correction_factor(props: dict[str, Any]) -> float: - """Handle a scaling bug in libvips<8.8.3 for tiff files. - - libvips<8.8.3 had a bug which wrote the tiff.XResolution as px / mm, but it should be - px / cm. Therefore, the resolution is 10x smaller than expected. To counteract, one has - to multiply the mpp with 0.1. Source: https://github.com/libvips/libvips/issues/1421 - - Returns: - float: Correction factor (0.1 for buggy versions, 1.0 otherwise). 
- """ - legacy_mpp_factor = 1 / 10 - - try: - xml_string = props[TIFF_IMAGE_DESCRIPTION] - - # Match custom metadata for library version used during export - libvips_version_match = re.findall(r"libVips-version.*?(\d+\.\d+\.\d+)", xml_string, re.DOTALL) - if not libvips_version_match: - return legacy_mpp_factor - - if version.parse(libvips_version_match[0]) >= version.parse("8.8.3"): - # Bug-free libvips version was used during initial pyramid export - return 1.0 - return legacy_mpp_factor - except Exception: - return legacy_mpp_factor - def get_thumbnail(self, max_safe_dimension: int = DEFAULT_MAX_SAFE_DIMENSION) -> Image: """Get thumbnail of the slide. @@ -172,7 +142,7 @@ def _parse_xml_image_description(self, xml_string: str) -> dict[str, Any]: # no except ET.ParseError: return {} - def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]: + def _get_level_info(self) -> list[dict[str, Any]]: """Get detailed information for each level. Returns: @@ -181,8 +151,8 @@ def _get_level_info(self, mpp_correction_factor: float) -> list[dict[str, Any]]: """ levels = [] props = dict(self.slide.properties) - base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor - base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor + base_mpp_x = float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) + base_mpp_y = float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) for level in range(self.slide.level_count): width, height = self.slide.level_dimensions[level] @@ -228,7 +198,6 @@ def get_metadata(self) -> dict[str, Any]: props = dict(self.slide.properties) file_size = self.path.stat().st_size base_width, base_height = self.slide.dimensions - mpp_correction_factor = self._get_mpp_correction_factor(props) if "tiff.XResolution" in props else 1.0 metadata = { "format": self._detect_format(), @@ -239,8 +208,8 @@ def get_metadata(self) -> dict[str, Any]: }, "dimensions": {"width": base_width, "height": 
base_height}, "resolution": { - "mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)) * mpp_correction_factor, - "mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)) * mpp_correction_factor, + "mpp_x": float(props.get(openslide.PROPERTY_NAME_MPP_X, 0)), + "mpp_y": float(props.get(openslide.PROPERTY_NAME_MPP_Y, 0)), "unit": props.get("tiff.ResolutionUnit", "unknown"), "x_resolution": float(props.get("tiff.XResolution", 0)), "y_resolution": float(props.get("tiff.YResolution", 0)), @@ -255,7 +224,7 @@ def get_metadata(self) -> dict[str, Any]: "width": int(props.get("openslide.level[0].tile-width", 256)), "height": int(props.get("openslide.level[0].tile-height", 256)), }, - "levels": {"count": self.slide.level_count, "data": self._get_level_info(mpp_correction_factor)}, + "levels": {"count": self.slide.level_count, "data": self._get_level_info()}, "extra": ", ".join([ props.get("dicom.ImageType[0]", "0"), props.get("dicom.ImageType[1]", "1"), From 98179aa3758d393c05382aacd858f756cf116841 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Mon, 1 Dec 2025 16:28:31 +0100 Subject: [PATCH 10/18] fix(application): Lint --- src/aignostics/application/_service.py | 42 +++++++++++++------- tests/aignostics/application/service_test.py | 22 +++++----- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index b80fc6ac7..d07cccb6d 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -17,24 +17,39 @@ from aignostics.bucket import Service as BucketService from aignostics.constants import TEST_APP_APPLICATION_ID -from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, - ApiException, Application, ApplicationSummary, - ApplicationVersion, Client, InputArtifact, - InputItem, NotFoundException, Run, RunData, - RunOutput, RunState) +from aignostics.platform import ( + LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, + ApiException, + 
Application, + ApplicationSummary, + ApplicationVersion, + Client, + InputArtifact, + InputItem, + NotFoundException, + Run, + RunData, + RunOutput, + RunState, +) from aignostics.platform import Service as PlatformService from aignostics.utils import BaseService, Health, sanitize_path_component from aignostics.wsi import Service as WSIService -from ._download import (download_available_items, - download_url_to_file_with_progress, - extract_filename_from_url, update_progress) +from ._download import ( + download_available_items, + download_url_to_file_with_progress, + extract_filename_from_url, + update_progress, +) from ._models import DownloadProgress, DownloadProgressState from ._settings import Settings -from ._utils import (get_mime_type_for_artifact, - get_supported_extensions_for_application, - is_not_terminated_with_deadline_exceeded, - validate_due_date) +from ._utils import ( + get_mime_type_for_artifact, + get_supported_extensions_for_application, + is_not_terminated_with_deadline_exceeded, + validate_due_date, +) has_qupath_extra = find_spec("ijson") if has_qupath_extra: @@ -1105,8 +1120,7 @@ def application_run_submit( # noqa: PLR0913, PLR0917, PLR0912, C901, PLR0915 # Validate pipeline configuration if present if "pipeline" in sdk_metadata: - from aignostics.platform._sdk_metadata import \ - PipelineConfig # noqa: PLC0415 + from aignostics.platform._sdk_metadata import PipelineConfig # noqa: PLC0415 try: PipelineConfig.model_validate(sdk_metadata["pipeline"]) diff --git a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index da4c7ccf4..aa14d236e 100644 --- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -13,8 +13,10 @@ from aignostics.application._utils import validate_due_date from aignostics.platform import NotFoundException, RunData, RunOutput from tests.constants_test import ( - HETA_APPLICATION_ID, HETA_APPLICATION_VERSION, - 
TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP) + HETA_APPLICATION_ID, + HETA_APPLICATION_VERSION, + TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP, +) @pytest.mark.unit @@ -471,7 +473,7 @@ def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID ds.StudyInstanceUID = pydicom.uid.generate_uid() ds.Modality = "SM" - + # Tile dimensions (typically 256x256 for WSI) ds.Rows = 256 ds.Columns = 256 @@ -501,10 +503,7 @@ def test_filter_dicom_series_files_single_file( @pytest.mark.unit -def test_filter_dicom_series_files_pyramid( - tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset] -) -> None: +def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: """Test that for multi-file DICOM series, only the highest resolution file is kept.""" series_uid = "1.2.3.4.5" @@ -535,8 +534,7 @@ def test_filter_dicom_series_files_pyramid( @pytest.mark.unit def test_filter_dicom_series_files_multiple_series( - tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset] + tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset] ) -> None: """Test that files from different series are not filtered against each other.""" # Series 1 - two files (pyramid with 2 levels) @@ -573,7 +571,7 @@ def test_filter_dicom_series_files_missing_wsi_attributes( ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" # CT Image Storage ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() - + ds.SeriesInstanceUID = "1.2.3.4.5" ds.SOPInstanceUID = pydicom.uid.generate_uid() ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID @@ -582,10 +580,10 @@ def test_filter_dicom_series_files_missing_wsi_attributes( ds.Rows = 512 ds.Columns = 512 # Note: No TotalPixelMatrixRows/Columns - + dcm_file = tmp_path / "non_wsi.dcm" ds.save_as(dcm_file, write_like_original=False) - + 
# Should not crash, and should not exclude anything (file is skipped) excluded = ApplicationService._filter_dicom_series_files(tmp_path) assert len(excluded) == 0 From ec0c6de9066566f3e29f818e8d60e3e49d34a0eb Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Tue, 2 Dec 2025 18:11:41 +0100 Subject: [PATCH 11/18] fix(application): Filter out non-WSI .dcm files --- src/aignostics/application/CLAUDE.md | 78 ++- src/aignostics/application/_service.py | 66 +- tests/aignostics/application/service_test.py | 643 +++++-------------- 3 files changed, 282 insertions(+), 505 deletions(-) diff --git a/src/aignostics/application/CLAUDE.md b/src/aignostics/application/CLAUDE.md index 4981cf7b9..b3f701695 100644 --- a/src/aignostics/application/CLAUDE.md +++ b/src/aignostics/application/CLAUDE.md @@ -163,24 +163,36 @@ APPLICATION_RUN_UPLOAD_CHUNK_SIZE = 1024 * 1024 # 1MB APPLICATION_RUN_DOWNLOAD_SLEEP_SECONDS = 5 # Wait between status checks ``` -### DICOM Series Handling +### DICOM Pyramid Handling -**Multi-File DICOM Series Filtering (`_service.py`):** +**Multi-File DICOM Pyramid Filtering (`_service.py`):** -The service automatically handles multi-file DICOM series (e.g., whole slide images stored as multiple DICOM files) by selecting only the highest resolution file from each series. This prevents redundant processing since OpenSlide can automatically find related files in the same directory. +The service automatically handles multi-file DICOM pyramids (whole slide images stored across multiple DICOM instances) by selecting only the highest resolution file from each pyramid. This prevents redundant processing since OpenSlide can automatically find related pyramid files in the same directory. ```python @staticmethod -def _filter_dicom_series_files(source_directory: Path) -> set[Path]: - """Filter DICOM files to keep only one representative per series. 
+def _filter_dicom_pyramid_files(source_directory: Path) -> set[Path]: + """Filter DICOM files to keep only one representative per pyramid. - For multi-file DICOM series, keeps only the highest resolution file. - OpenSlide will find other files in the same directory when needed. + For multi-file DICOM pyramids (WSI images split across multiple instances), + keeps only the highest resolution file. Excludes segmentations, annotations, + thumbnails, and other non-image DICOM files. - Implementation: - - Groups DICOM files by SeriesInstanceUID - - For each series with multiple files, selects file with largest dimensions (rows * cols) - - Returns set of files to exclude from processing - - Standalone DICOM files (not part of multi-file series) are never excluded + Filtering Strategy: + 1. SOPClassUID filtering - Only process VL Whole Slide Microscopy Image Storage + - Include: 1.2.840.10008.5.1.4.1.1.77.1.6 (VL WSI) + - Exclude: 1.2.840.10008.5.1.4.1.1.66.4 (Segmentation Storage) + - Exclude: Other non-WSI DICOM types + + 2. ImageType filtering - Exclude auxiliary images + - THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, LOCALIZER + + 3. PyramidUID grouping - Group multi-file pyramids + - Files with same PyramidUID are part of one logical WSI + - Files without PyramidUID are treated as standalone WSIs + + 4. 
Resolution selection - Keep highest resolution per pyramid + - Based on TotalPixelMatrixRows × TotalPixelMatrixColumns + - Excludes all lower resolution levels Used automatically in: generate_metadata_from_source_directory() """ @@ -188,11 +200,43 @@ def _filter_dicom_series_files(source_directory: Path) -> set[Path]: **Key Behaviors:** -- Files are grouped by `SeriesInstanceUID` DICOM tag -- Highest resolution determined by `Rows × Columns` (total pixel count) -- Excluded files are logged at DEBUG level with series details -- Filtering occurs before metadata generation, checksum calculation, or upload -- Only affects DICOM files (`.dcm` extension); other formats unaffected +- **SOPClassUID validation**: Only processes VL Whole Slide Microscopy Image Storage files (1.2.840.10008.5.1.4.1.1.77.1.6) +- **Non-WSI exclusion**: Automatically excludes segmentations (1.2.840.10008.5.1.4.1.1.66.4), annotations, and other DICOM object types +- **ImageType filtering**: Excludes THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, and LOCALIZER image types +- **PyramidUID grouping**: Groups files by PyramidUID (DICOM tag identifying multi-resolution pyramids) +- **Resolution selection**: For each pyramid, keeps only the file with largest TotalPixelMatrixRows × TotalPixelMatrixColumns +- **Standalone handling**: Files without PyramidUID are treated as standalone WSI images and preserved +- **Graceful degradation**: Files with missing attributes are logged and treated as standalone (not excluded) +- **Debug logging**: Excluded files are logged at DEBUG level with pyramid/exclusion details +- **Pre-processing filter**: Filtering occurs before metadata generation, checksum calculation, or upload +- **Format-specific**: Only affects DICOM files (`.dcm` extension); other formats (SVS, TIFF) unaffected + +**DICOM WSI Structure:** + +In the DICOM Whole Slide Imaging standard: +- **PyramidUID**: Uniquely identifies a single multi-resolution pyramid that may span multiple files +- 
**SeriesInstanceUID**: Groups related images (may include multiple pyramids, thumbnails, labels) +- **TotalPixelMatrixRows/Columns**: Represents full image dimensions at the highest resolution level + +**Example Scenario:** +``` +Input Directory: +├── pyramid_level_0.dcm (10000×10000 px, PyramidUID: ABC123) ← KEPT +├── pyramid_level_1.dcm (5000×5000 px, PyramidUID: ABC123) ← EXCLUDED +├── pyramid_level_2.dcm (2500×2500 px, PyramidUID: ABC123) ← EXCLUDED +├── thumbnail.dcm (256×256 px, PyramidUID: ABC123, ImageType: THUMBNAIL) ← EXCLUDED +├── segmentation.dcm (10000×10000 px, SOPClassUID: Segmentation) ← EXCLUDED +└── standalone.dcm (8000×8000 px, No PyramidUID) ← KEPT + +Result: Only pyramid_level_0.dcm and standalone.dcm are processed +``` + +**Error Handling:** + +- Files with missing SOPClassUID are logged as warnings and excluded (malformed DICOM) +- Files with PyramidUID but missing TotalPixelMatrix* attributes are treated as standalone +- Files that cannot be read by pydicom are logged at DEBUG level and skipped +- AttributeError and general exceptions are caught to prevent processing pipeline failure ### Progress State Management diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index f0af191a1..0eb556caf 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -311,11 +311,22 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None Service._process_key_value_pair(entry, key_value, external_id) @staticmethod - def _filter_dicom_series_files(source_directory: Path) -> set[Path]: - """Filter DICOM files to keep only one representative per series. - - For multi-file DICOM series, keeps only the highest resolution file. - OpenSlide will find other files in the same directory when needed. + def _filter_dicom_pyramid_files(source_directory: Path) -> set[Path]: # noqa: C901 + """Filter DICOM files to keep only one representative per pyramid. 
+ + For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), + keeps only the highest resolution file. Excludes segmentations, annotations, + thumbnails, labels, and other non-image DICOM files. OpenSlide will automatically + find related pyramid files in the same directory when needed. + + Filtering Strategy: + - Only processes VL Whole Slide Microscopy Image Storage + (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6, see here: + https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html) + - Excludes thumbnails, labels, overviews by ImageType DICOM attribute + - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) + - Selects highest resolution based on TotalPixelMatrixRows x TotalPixelMatrixColumns + - Preserves standalone WSI files without PyramidUID Args: source_directory: The directory to scan. @@ -324,26 +335,49 @@ def _filter_dicom_series_files(source_directory: Path) -> set[Path]: set[Path]: Set of DICOM files to exclude from processing. 
""" dicom_files = list(source_directory.glob("**/*.dcm")) - series_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) + pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) + files_to_exclude = set() - # Group by SeriesInstanceUID with dimensions + # Group by PyramidUID with dimensions for dcm_file in dicom_files: try: ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) - series_uid = ds.SeriesInstanceUID + + # Exclude non-WSI image files by SOPClassUID + # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) + if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": + logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") + files_to_exclude.add(dcm_file) + continue + + # Exclude thumbnails, labels, and overview images by ImageType + if hasattr(ds, "ImageType"): + image_type = [t.upper() for t in ds.ImageType] + exclude_types = {"THUMBNAIL", "LABEL", "OVERVIEW", "MACRO", "ANNOTATION", "LOCALIZER"} + if any(excluded in image_type for excluded in exclude_types): + logger.debug(f"Excluding {dcm_file.name} - ImageType: {image_type}") + files_to_exclude.add(dcm_file) + continue + + # Now process valid WSI images with PyramidUID + if not hasattr(ds, "PyramidUID"): + logger.debug(f"DICOM {dcm_file.name} has no PyramidUID - treating as standalone") + continue + + pyramid_uid = ds.PyramidUID # These represent the full image dimensions across all frames rows = int(ds.TotalPixelMatrixRows) cols = int(ds.TotalPixelMatrixColumns) - series_groups[series_uid].append((dcm_file, rows, cols)) + pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) + except AttributeError as e: + logger.debug(f"DICOM {dcm_file} missing required attributes: {e}") except Exception as e: logger.debug(f"Could not read DICOM {dcm_file}: {e}") - # Treat as standalone - don't exclude - # For each series with multiple files, keep only the highest resolution one - files_to_exclude = set() - for 
series_uid, files_with_dims in series_groups.items(): + # For each pyramid with multiple files, keep only the highest resolution one + for pyramid_uid, files_with_dims in pyramid_groups.items(): if len(files_with_dims) > 1: # Find the file with the largest dimensions (rows * cols = total pixels) highest_res_file = max(files_with_dims, key=lambda x: x[1] * x[2]) @@ -355,7 +389,7 @@ def _filter_dicom_series_files(source_directory: Path) -> set[Path]: files_to_exclude.add(file_path) logger.debug( - f"DICOM series {series_uid}: keeping {file_to_keep.name} " + f"DICOM pyramid {pyramid_uid}: keeping {file_to_keep.name} " f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" ) @@ -415,8 +449,8 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 metadata = [] - # Pre-filter: exclude redundant DICOM files from multi-file series - dicom_files_to_exclude = Service._filter_dicom_series_files(source_directory) + # Pre-filter: exclude redundant DICOM files from multi-file pyramids + dicom_files_to_exclude = Service._filter_dicom_pyramid_files(source_directory) try: extensions = get_supported_extensions_for_application(application_id) diff --git a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index aa14d236e..68d4f9e43 100644 --- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -1,460 +1,33 @@ """Tests to verify the service functionality of the application module.""" from collections.abc import Callable -from datetime import UTC, datetime, timedelta from pathlib import Path -from unittest.mock import MagicMock, patch import pydicom import pytest -from typer.testing import CliRunner from aignostics.application import Service as ApplicationService -from aignostics.application._utils import validate_due_date -from aignostics.platform import NotFoundException, RunData, RunOutput -from tests.constants_test import ( - HETA_APPLICATION_ID, - 
HETA_APPLICATION_VERSION, - TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP, -) - - -@pytest.mark.unit -def test_validate_due_date_none() -> None: - """Test that None is accepted (optional parameter).""" - # Should not raise any exception - validate_due_date(None) - - -@pytest.mark.unit -def test_validate_due_date_valid_formats() -> None: - """Test that valid ISO 8601 formats in the future are accepted.""" - # Create a datetime 2 hours in the future - future_time = datetime.now(tz=UTC) + timedelta(hours=2) - - valid_formats = [ - future_time.isoformat(), # With timezone offset like +00:00 - future_time.strftime("%Y-%m-%dT%H:%M:%S") + "Z", # With Z suffix - future_time.strftime("%Y-%m-%dT%H:%M:%S.%f") + "Z", # With microseconds and Z - future_time.strftime("%Y-%m-%dT%H:%M:%S.%f%z"), # With microseconds and timezone - ] - - for time_str in valid_formats: - # Should not raise any exception - try: - validate_due_date(time_str) - except ValueError as e: - pytest.fail(f"Valid ISO 8601 format '{time_str}' was rejected: {e}") - - -@pytest.mark.unit -def test_validate_due_date_invalid_format() -> None: - """Test that invalid ISO 8601 formats are rejected.""" - invalid_formats = [ - "2025-10-19", # Date only - "19:53:00", # Time only - "2025/10/19 19:53:00", # Wrong separators - "2025-10-19 19:53:00", # Space instead of T - "not-a-date", # Completely invalid - "2025-13-45T25:70:99Z", # Invalid values - ] - - for time_str in invalid_formats: - with pytest.raises(ValueError, match=r"Invalid ISO 8601 format"): - validate_due_date(time_str) - - -@pytest.mark.unit -def test_validate_due_date_past_datetime() -> None: - """Test that datetimes in the past are rejected.""" - # Create a datetime 2 hours in the past - past_time = datetime.now(tz=UTC) - timedelta(hours=2) - - past_formats = [ - past_time.isoformat(), - past_time.strftime("%Y-%m-%dT%H:%M:%S") + "Z", - ] - - for time_str in past_formats: - with pytest.raises(ValueError, match=r"due_date must be in the future"): - 
validate_due_date(time_str) - - -@pytest.mark.unit -def test_validate_due_date_current_time() -> None: - """Test that current time (not future) is rejected.""" - # Get current time - should be rejected as it's not in the future - current_time = datetime.now(tz=UTC) - current_time_str = current_time.isoformat() - - with pytest.raises(ValueError, match=r"due_date must be in the future"): - validate_due_date(current_time_str) - - -@pytest.mark.unit -def test_validate_due_date_edge_case_one_second_future() -> None: - """Test that a datetime 1 second in the future is accepted.""" - # Create a datetime 1 second in the future - future_time = datetime.now(tz=UTC) + timedelta(seconds=1) - future_time_str = future_time.isoformat() - - # Should not raise any exception - try: - validate_due_date(future_time_str) - except ValueError as e: - pytest.fail(f"Future datetime '{future_time_str}' was rejected: {e}") - - -@pytest.mark.e2e -def test_application_version_valid_semver_formats(runner: CliRunner) -> None: - """Test that valid semver formats are accepted.""" - from aignostics.application import Service as ApplicationService - - service = ApplicationService() - - # These should work if the application exists - valid_formats = [ - "test-app:v1.0.0", - "test-app:v1.2.3", - "test-app:v10.20.30", - "test-app:v1.1.2-prerelease+meta", - "test-app:v1.1.2+meta", - "test-app:v1.1.2+meta-valid", - "test-app:v1.0.0-alpha", - "test-app:v1.0.0-beta", - "test-app:v1.0.0-alpha.beta", - "test-app:v1.0.0-alpha.1", - "test-app:v1.0.0-alpha0.beta", - "test-app:v1.0.0-alpha.alpha", - "test-app:v1.0.0-alpha+metadata", - "test-app:v1.0.0-rc.1+meta", - ] - - for version_id in valid_formats: - try: - service.application_version(version_id) - except ValueError as e: - pytest.fail(f"Valid semver format '{version_id}' was rejected: {e}") - except NotFoundException: - pytest.skip(f"Application '{version_id.split(':')[0]}' not found, skipping test for this version format.") - - -@pytest.mark.unit -def 
test_application_version_invalid_semver_formats(runner: CliRunner, record_property) -> None: - """Test that invalid semver formats are rejected with ValueError.""" - record_property("tested-item-id", "SPEC-APPLICATION-SERVICE") - from aignostics.application import Service as ApplicationService - - service = ApplicationService() - - invalid_application_versions = [ - "test-app:v1.0.0", # legacy format - "bla", # not semver - ] - - for application_version in invalid_application_versions: - with pytest.raises(ValueError, match=r"not compliant with semantic versioning"): - service.application_version("test-app", application_version) - - -@pytest.mark.e2e -@pytest.mark.skipif( - TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP, - reason="Skipping test that uses 'latest' application version if so configured for given platform environment.", -) -def test_application_version_use_latest_fallback(runner: CliRunner, record_property) -> None: - """Test that latest version works and tested.""" - record_property("tested-item-id", "SPEC-APPLICATION-SERVICE") - service = ApplicationService() - - try: - app_version = service.application_version(HETA_APPLICATION_ID) - assert app_version is not None - assert app_version.version_number == HETA_APPLICATION_VERSION - except NotFoundException as e: - if "No versions found for application" in str(e): - pass # This is expected behavior - except ValueError as e: - pytest.fail(f"Unexpected error: {e}") - - with pytest.raises(ValueError, match=r"not compliant with semantic versioning"): - service.application_version(HETA_APPLICATION_ID, "invalid-format") - - -@pytest.mark.e2e -@pytest.mark.timeout(timeout=60 * 2) -def test_application_versions_are_unique(runner: CliRunner) -> None: - """Check that application versions are unique (currently fails due to backend bug).""" - # Get all applications - service = ApplicationService() - applications = service.applications() - - # Check each application for duplicate versions - for app in 
applications: - versions = service.application_versions(app.application_id) - - # Extract version numbers - version_numbers = [v.version_number for v in versions] - - # Check for duplicates - unique_versions = set(version_numbers) - assert len(version_numbers) == len(unique_versions), ( - f"Application '{app.application_id}' has duplicate versions. " - f"Found {len(version_numbers)} versions but only {len(unique_versions)} unique: {version_numbers}" - ) - - -@pytest.mark.unit -def test_application_runs_query_with_note_regex_raises() -> None: - """Test that using query with note_regex raises ValueError.""" - service = ApplicationService() - - with pytest.raises(ValueError, match=r"Cannot use 'query' parameter together with 'note_regex' parameter"): - service.application_runs(query="test", note_regex="test.*") - - -@pytest.mark.unit -def test_application_runs_query_with_tags_raises() -> None: - """Test that using query with tags raises ValueError.""" - service = ApplicationService() - - with pytest.raises(ValueError, match=r"Cannot use 'query' parameter together with 'tags' parameter"): - service.application_runs(query="test", tags={"tag1", "tag2"}) - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_runs_query_searches_note_and_tags(mock_get_client: MagicMock) -> None: - """Test that query parameter searches both note and tags with union semantics.""" - # Create mock runs - run_from_note = MagicMock(spec=RunData) - run_from_note.run_id = "run-note-123" - run_from_note.output = RunOutput.FULL - - run_from_tag = MagicMock(spec=RunData) - run_from_tag.run_id = "run-tag-456" - run_from_tag.output = RunOutput.FULL - - run_from_both = MagicMock(spec=RunData) - run_from_both.run_id = "run-both-789" - run_from_both.output = RunOutput.FULL - - # Mock the platform client to return different runs for note and tag searches - mock_client = MagicMock() - mock_runs = MagicMock() - - # First call returns runs matching 
note, second call returns runs matching tags - mock_runs.list_data.side_effect = [ - iter([run_from_note, run_from_both]), # Note search results - iter([run_from_tag]), # Tag search results (run_from_both already in note results, so not added) - ] - - mock_client.runs = mock_runs - mock_get_client.return_value = mock_client - - service = ApplicationService() - results = service.application_runs(query="test") - - # Verify we got union of both searches (3 unique runs) - assert len(results) == 3 - assert run_from_note in results - assert run_from_tag in results - assert run_from_both in results - - # Verify that list_data was called twice (once for note, once for tags) - assert mock_runs.list_data.call_count == 2 - - # Verify the custom_metadata parameters contain the escaped query with case insensitive flag - calls = mock_runs.list_data.call_args_list - note_call_kwargs = calls[0][1] - tag_call_kwargs = calls[1][1] - - assert "custom_metadata" in note_call_kwargs - assert "$.sdk.note" in note_call_kwargs["custom_metadata"] - assert 'like_regex "test"' in note_call_kwargs["custom_metadata"] - assert 'flag "i"' in note_call_kwargs["custom_metadata"] - - assert "custom_metadata" in tag_call_kwargs - assert "$.sdk.tags" in tag_call_kwargs["custom_metadata"] - assert 'like_regex "test"' in tag_call_kwargs["custom_metadata"] - assert 'flag "i"' in tag_call_kwargs["custom_metadata"] - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_runs_query_deduplicates_results(mock_get_client: MagicMock) -> None: - """Test that query parameter deduplicates runs that match both note and tags.""" - # Create mock run that matches both searches - run_from_both = MagicMock(spec=RunData) - run_from_both.run_id = "run-both-123" - run_from_both.output = RunOutput.FULL - - # Mock the platform client to return the same run from both searches - mock_client = MagicMock() - mock_runs = MagicMock() - - # Both searches return the same 
run - mock_runs.list_data.side_effect = [ - iter([run_from_both]), # Note search results - iter([run_from_both]), # Tag search results (should be deduplicated) - ] - - mock_client.runs = mock_runs - mock_get_client.return_value = mock_client - - service = ApplicationService() - results = service.application_runs(query="test") - - # Verify we only got one run (deduplicated) - assert len(results) == 1 - assert results[0].run_id == "run-both-123" - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_runs_query_respects_limit(mock_get_client: MagicMock) -> None: - """Test that query parameter respects the limit parameter.""" - # Create mock runs - runs = [] - for i in range(10): - run = MagicMock(spec=RunData) - run.run_id = f"run-{i}" - run.output = RunOutput.FULL - runs.append(run) - - # Mock the platform client to return many runs - mock_client = MagicMock() - mock_runs = MagicMock() - - # Note search returns 5 runs, tag search returns 5 runs - mock_runs.list_data.side_effect = [ - iter(runs[:5]), # Note search results - iter(runs[5:]), # Tag search results - ] - - mock_client.runs = mock_runs - mock_get_client.return_value = mock_client - - service = ApplicationService() - results = service.application_runs(query="test", limit=3) - - # Verify we only got 3 runs despite having 10 total - assert len(results) == 3 - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_runs_query_escapes_special_characters(mock_get_client: MagicMock) -> None: - """Test that query parameter properly escapes special regex characters.""" - # Mock the platform client - mock_client = MagicMock() - mock_runs = MagicMock() - mock_runs.list_data.side_effect = [ - iter([]), # Note search results - iter([]), # Tag search results - ] - mock_client.runs = mock_runs - mock_get_client.return_value = mock_client - - service = ApplicationService() - # Use query with special 
characters that need escaping - service.application_runs(query='test"value\\path') - - # Verify the custom_metadata parameters contain properly escaped query - calls = mock_runs.list_data.call_args_list - note_call_kwargs = calls[0][1] - tag_call_kwargs = calls[1][1] - - # Check that double quotes and backslashes are properly escaped - assert 'test\\"value\\\\path' in note_call_kwargs["custom_metadata"] - assert 'test\\"value\\\\path' in tag_call_kwargs["custom_metadata"] - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_run_update_custom_metadata_success(mock_get_client: MagicMock) -> None: - """Test successful update of run custom metadata.""" - mock_client = MagicMock() - mock_run = MagicMock() - mock_client.run.return_value = mock_run - mock_get_client.return_value = mock_client - - service = ApplicationService() - custom_metadata = {"key": "value", "tags": ["tag1", "tag2"]} - - # Should not raise any exception - service.application_run_update_custom_metadata("run-123", custom_metadata) - - # Verify the run() method was called with correct run_id - mock_client.run.assert_called_once_with("run-123") - # Verify the update_custom_metadata method was called with correct arguments - mock_run.update_custom_metadata.assert_called_once_with(custom_metadata) - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_run_update_custom_metadata_not_found(mock_get_client: MagicMock) -> None: - """Test update metadata with non-existent run.""" - mock_client = MagicMock() - mock_run = MagicMock() - mock_run.update_custom_metadata.side_effect = NotFoundException("Run not found") - mock_client.run.return_value = mock_run - mock_get_client.return_value = mock_client - - service = ApplicationService() - - with pytest.raises(NotFoundException, match="not found"): - service.application_run_update_custom_metadata("invalid-run-id", {"key": "value"}) - - 
-@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_run_update_item_custom_metadata_success(mock_get_client: MagicMock) -> None: - """Test successful update of item custom metadata.""" - mock_client = MagicMock() - mock_run = MagicMock() - mock_client.run.return_value = mock_run - mock_get_client.return_value = mock_client - - service = ApplicationService() - custom_metadata = {"key": "value", "note": "test note"} - - # Should not raise any exception - service.application_run_update_item_custom_metadata("run-123", "item-ext-id", custom_metadata) - - # Verify the run() method was called with correct run_id - mock_client.run.assert_called_once_with("run-123") - # Verify the update_item_custom_metadata method was called with correct arguments - mock_run.update_item_custom_metadata.assert_called_once_with("item-ext-id", custom_metadata) - - -@pytest.mark.unit -@patch("aignostics.application._service.Service._get_platform_client") -def test_application_run_update_item_custom_metadata_not_found(mock_get_client: MagicMock) -> None: - """Test update item metadata with non-existent run or item.""" - mock_client = MagicMock() - mock_run = MagicMock() - mock_run.update_item_custom_metadata.side_effect = NotFoundException("Item not found") - mock_client.run.return_value = mock_run - mock_get_client.return_value = mock_client - - service = ApplicationService() - - with pytest.raises(NotFoundException, match="not found"): - service.application_run_update_item_custom_metadata("run-123", "invalid-item-id", {"key": "value"}) @pytest.fixture def create_dicom() -> Callable[..., pydicom.Dataset]: """Fixture that returns a function to create minimal but valid DICOM datasets.""" - def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: + def _create_dicom( + pyramid_uid: str | None, + rows: int, + cols: int, + sop_class_uid: str = "1.2.840.10008.5.1.4.1.1.77.1.6", # VL WSI by default + image_type: 
list[str] | None = None, + ) -> pydicom.Dataset: """Create a minimal but valid DICOM dataset for WSI. Args: - series_uid: The series instance UID + pyramid_uid: The pyramid UID (None for standalone images) rows: Total image rows (TotalPixelMatrixRows) for the full WSI cols: Total image columns (TotalPixelMatrixColumns) for the full WSI + sop_class_uid: SOP Class UID (defaults to VL WSI) + image_type: Optional ImageType attribute Returns: A valid pydicom Dataset for whole slide imaging @@ -464,14 +37,14 @@ def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: # File Meta Information ds.file_meta = pydicom.Dataset() ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian - ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.77.1.6" # VL Whole Slide Microscopy + ds.file_meta.MediaStorageSOPClassUID = sop_class_uid ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() # Required DICOM attributes - ds.SeriesInstanceUID = series_uid ds.SOPInstanceUID = pydicom.uid.generate_uid() - ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID + ds.SOPClassUID = sop_class_uid ds.StudyInstanceUID = pydicom.uid.generate_uid() + ds.SeriesInstanceUID = pydicom.uid.generate_uid() ds.Modality = "SM" # Tile dimensions (typically 256x256 for WSI) @@ -483,47 +56,69 @@ def _create_dicom(series_uid: str, rows: int, cols: int) -> pydicom.Dataset: ds.TotalPixelMatrixRows = rows ds.TotalPixelMatrixColumns = cols + # Add PyramidUID if provided (optional for standalone images) + if pyramid_uid: + ds.PyramidUID = pyramid_uid + + # Add ImageType if provided + if image_type: + ds.ImageType = image_type + return ds return _create_dicom @pytest.mark.unit -def test_filter_dicom_series_files_single_file( +def test_filter_dicom_pyramid_single_file( tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset], ) -> None: - """Test that single DICOM files are not filtered.""" + """Test that single DICOM files with PyramidUID are not filtered.""" 
ds = create_dicom("1.2.3.4.5", 1024, 1024) dcm_file = tmp_path / "test.dcm" ds.save_as(dcm_file, write_like_original=False) - excluded = ApplicationService._filter_dicom_series_files(tmp_path) + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) assert len(excluded) == 0 @pytest.mark.unit -def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: - """Test that for multi-file DICOM series, only the highest resolution file is kept.""" - series_uid = "1.2.3.4.5" +def test_filter_dicom_pyramid_standalone_no_pyramid_uid( + tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset], +) -> None: + """Test that standalone DICOM files without PyramidUID are not filtered.""" + ds = create_dicom(None, 1024, 1024) # No PyramidUID + dcm_file = tmp_path / "test.dcm" + ds.save_as(dcm_file, write_like_original=False) + + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + assert len(excluded) == 0 + + +@pytest.mark.unit +def test_filter_dicom_pyramid_multi_file(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: + """Test that for multi-file DICOM pyramid, only the highest resolution file is kept.""" + pyramid_uid = "1.2.3.4.5" # Create low resolution DICOM file (smallest pyramid level) - ds_low = create_dicom(series_uid, 512, 512) + ds_low = create_dicom(pyramid_uid, 512, 512) dcm_file_low = tmp_path / "test_low.dcm" ds_low.save_as(dcm_file_low, write_like_original=False) # Create medium resolution DICOM file - ds_med = create_dicom(series_uid, 1024, 1024) + ds_med = create_dicom(pyramid_uid, 1024, 1024) dcm_file_med = tmp_path / "test_med.dcm" ds_med.save_as(dcm_file_med, write_like_original=False) # Create high resolution DICOM file (base layer - highest resolution) - ds_high = create_dicom(series_uid, 2048, 2048) + ds_high = create_dicom(pyramid_uid, 2048, 2048) dcm_file_high = tmp_path / "test_high.dcm" ds_high.save_as(dcm_file_high, write_like_original=False) - 
# Filter the series - excluded = ApplicationService._filter_dicom_series_files(tmp_path) + # Filter the pyramid + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) # Should exclude 2 files (low and medium), keeping only the highest resolution assert len(excluded) == 2 @@ -533,27 +128,25 @@ def test_filter_dicom_series_files_pyramid(tmp_path: Path, create_dicom: Callabl @pytest.mark.unit -def test_filter_dicom_series_files_multiple_series( - tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset] -) -> None: - """Test that files from different series are not filtered against each other.""" - # Series 1 - two files (pyramid with 2 levels) +def test_filter_dicom_pyramid_multiple_pyramids(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: + """Test that files from different pyramids are not filtered against each other.""" + # Pyramid 1 - two files (pyramid with 2 levels) ds1_low = create_dicom("1.2.3.4.5", 512, 512) - dcm_file1_low = tmp_path / "series1_low.dcm" + dcm_file1_low = tmp_path / "pyramid1_low.dcm" ds1_low.save_as(dcm_file1_low, write_like_original=False) ds1_high = create_dicom("1.2.3.4.5", 1024, 1024) - dcm_file1_high = tmp_path / "series1_high.dcm" + dcm_file1_high = tmp_path / "pyramid1_high.dcm" ds1_high.save_as(dcm_file1_high, write_like_original=False) - # Series 2 - single file (standalone, no pyramid) + # Pyramid 2 - single file (standalone, single level) ds2 = create_dicom("6.7.8.9.0", 512, 512) - dcm_file2 = tmp_path / "series2.dcm" + dcm_file2 = tmp_path / "pyramid2.dcm" ds2.save_as(dcm_file2, write_like_original=False) - excluded = ApplicationService._filter_dicom_series_files(tmp_path) + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) - # Should exclude only the low-res file from series 1 + # Should exclude only the low-res file from pyramid 1 assert len(excluded) == 1 assert dcm_file1_low in excluded assert dcm_file1_high not in excluded @@ -561,29 +154,135 @@ def 
test_filter_dicom_series_files_multiple_series( @pytest.mark.unit -def test_filter_dicom_series_files_missing_wsi_attributes( +def test_filter_dicom_pyramid_exclude_non_wsi( tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset], ) -> None: - """Test that DICOM files without WSI-specific attributes are skipped.""" - # Create a non-WSI DICOM (missing TotalPixelMatrix attributes) + """Test that non-WSI DICOM files (e.g., segmentations) are excluded.""" + # Create a segmentation storage DICOM + ds_seg = create_dicom( + "1.2.3.4.5", + 1024, + 1024, + sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4", # Segmentation Storage + ) + dcm_file_seg = tmp_path / "segmentation.dcm" + ds_seg.save_as(dcm_file_seg, write_like_original=False) + + # Create a valid WSI + ds_wsi = create_dicom("1.2.3.4.5", 1024, 1024) + dcm_file_wsi = tmp_path / "wsi.dcm" + ds_wsi.save_as(dcm_file_wsi, write_like_original=False) + + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + + # Should exclude only the segmentation file + assert len(excluded) == 1 + assert dcm_file_seg in excluded + assert dcm_file_wsi not in excluded + + +@pytest.mark.unit +def test_filter_dicom_pyramid_exclude_thumbnails( + tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset], +) -> None: + """Test that thumbnail images are excluded.""" + # Create a thumbnail + ds_thumb = create_dicom( + "1.2.3.4.5", + 256, + 256, + image_type=["DERIVED", "PRIMARY", "THUMBNAIL", "RESAMPLED"], + ) + dcm_file_thumb = tmp_path / "thumbnail.dcm" + ds_thumb.save_as(dcm_file_thumb, write_like_original=False) + + # Create a regular WSI image + ds_wsi = create_dicom( + "1.2.3.4.5", + 1024, + 1024, + image_type=["DERIVED", "PRIMARY", "VOLUME", "NONE"], + ) + dcm_file_wsi = tmp_path / "wsi.dcm" + ds_wsi.save_as(dcm_file_wsi, write_like_original=False) + + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + + # Should exclude only the thumbnail + assert len(excluded) == 1 + assert dcm_file_thumb in 
excluded + assert dcm_file_wsi not in excluded + + +@pytest.mark.unit +def test_filter_dicom_pyramid_missing_attributes( + tmp_path: Path, +) -> None: + """Test that DICOM files without required WSI attributes are skipped gracefully.""" + # Create a DICOM without TotalPixelMatrix attributes ds = pydicom.Dataset() ds.file_meta = pydicom.Dataset() ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian - ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" # CT Image Storage + ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.77.1.6" ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() - ds.SeriesInstanceUID = "1.2.3.4.5" ds.SOPInstanceUID = pydicom.uid.generate_uid() - ds.SOPClassUID = ds.file_meta.MediaStorageSOPClassUID + ds.SOPClassUID = "1.2.840.10008.5.1.4.1.1.77.1.6" ds.StudyInstanceUID = pydicom.uid.generate_uid() - ds.Modality = "CT" + ds.SeriesInstanceUID = pydicom.uid.generate_uid() + ds.Modality = "SM" ds.Rows = 512 ds.Columns = 512 + ds.PyramidUID = "1.2.3.4.5" # Note: No TotalPixelMatrixRows/Columns - dcm_file = tmp_path / "non_wsi.dcm" + dcm_file = tmp_path / "incomplete_wsi.dcm" ds.save_as(dcm_file, write_like_original=False) - # Should not crash, and should not exclude anything (file is skipped) - excluded = ApplicationService._filter_dicom_series_files(tmp_path) + # Should not crash, and should not exclude anything (file is skipped gracefully) + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) assert len(excluded) == 0 + + +@pytest.mark.unit +def test_filter_dicom_pyramid_mixed_scenario( + tmp_path: Path, + create_dicom: Callable[..., pydicom.Dataset], +) -> None: + """Test complex scenario with multiple pyramids, thumbnails, and segmentations.""" + # Pyramid 1: 3 levels (keep highest) + ds1_low = create_dicom("pyramid1", 512, 512) + dcm_file1_low = tmp_path / "p1_low.dcm" + ds1_low.save_as(dcm_file1_low, write_like_original=False) + + ds1_high = create_dicom("pyramid1", 
2048, 2048) + dcm_file1_high = tmp_path / "p1_high.dcm" + ds1_high.save_as(dcm_file1_high, write_like_original=False) + + # Thumbnail for pyramid 1 (exclude) + ds1_thumb = create_dicom("pyramid1", 256, 256, image_type=["DERIVED", "PRIMARY", "THUMBNAIL"]) + dcm_file1_thumb = tmp_path / "p1_thumb.dcm" + ds1_thumb.save_as(dcm_file1_thumb, write_like_original=False) + + # Segmentation file (exclude) + ds_seg = create_dicom("pyramid1", 2048, 2048, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4") + dcm_file_seg = tmp_path / "seg.dcm" + ds_seg.save_as(dcm_file_seg, write_like_original=False) + + # Standalone WSI without PyramidUID (keep) + ds_standalone = create_dicom(None, 1024, 1024) + dcm_file_standalone = tmp_path / "standalone.dcm" + ds_standalone.save_as(dcm_file_standalone, write_like_original=False) + + excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + + # Should exclude: low-res from pyramid1, thumbnail, segmentation + assert len(excluded) == 3 + assert dcm_file1_low in excluded + assert dcm_file1_thumb in excluded + assert dcm_file_seg in excluded + # Should keep: high-res from pyramid1, standalone + assert dcm_file1_high not in excluded + assert dcm_file_standalone not in excluded From 291f1e6076624285d58fd1603d6047e269303f56 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Thu, 4 Dec 2025 15:54:29 +0100 Subject: [PATCH 12/18] task(application): Address Oliver's review. 
--- src/aignostics/application/_service.py | 214 ++++--- tests/aignostics/application/service_test.py | 619 ++++++++++++++++--- 2 files changed, 663 insertions(+), 170 deletions(-) diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index 0eb556caf..fdb614c36 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -4,7 +4,7 @@ import re import time from collections import defaultdict -from collections.abc import Callable, Generator +from collections.abc import Callable, Generator, Iterable from http import HTTPStatus from importlib.util import find_spec from pathlib import Path @@ -311,8 +311,8 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None Service._process_key_value_pair(entry, key_value, external_id) @staticmethod - def _filter_dicom_pyramid_files(source_directory: Path) -> set[Path]: # noqa: C901 - """Filter DICOM files to keep only one representative per pyramid. + def _select_dicom_files_to_process(source_directory: Path) -> list[Path]: + """Select DICOM files to process, excluding auxiliary and redundant files. For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), keeps only the highest resolution file. 
Excludes segmentations, annotations, @@ -321,25 +321,30 @@ def _filter_dicom_pyramid_files(source_directory: Path) -> set[Path]: # noqa: C Filtering Strategy: - Only processes VL Whole Slide Microscopy Image Storage - (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6, see here: - https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html) - - Excludes thumbnails, labels, overviews by ImageType DICOM attribute + (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html + - Excludes auxiliary images by ImageType Value 3 (keeps only VOLUME images) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.12.4.html#sect_C.8.12.4.1.1 - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) - - Selects highest resolution based on TotalPixelMatrixRows x TotalPixelMatrixColumns + - For pyramids with multiple files, selects only the highest resolution based + on TotalPixelMatrixRows x TotalPixelMatrixColumns - Preserves standalone WSI files without PyramidUID Args: - source_directory: The directory to scan. + source_directory: The directory to recursively scan for DICOM files. Returns: - set[Path]: Set of DICOM files to exclude from processing. + List of DICOM file paths to process. All other DICOM files are excluded. + + Note: + This function reads DICOM metadata but not pixel data (stop_before_pixels=True). + Files that cannot be read or are missing required attributes are logged and skipped. 
""" - dicom_files = list(source_directory.glob("**/*.dcm")) pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) - files_to_exclude = set() + included_dicom_files: list[Path] = [] - # Group by PyramidUID with dimensions - for dcm_file in dicom_files: + # Scan all DICOM files and filter based on SOPClassUID and ImageType + for dcm_file in source_directory.glob("**/*.dcm"): try: ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) @@ -347,53 +352,112 @@ def _filter_dicom_pyramid_files(source_directory: Path) -> set[Path]: # noqa: C # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") - files_to_exclude.add(dcm_file) continue - # Exclude thumbnails, labels, and overview images by ImageType - if hasattr(ds, "ImageType"): - image_type = [t.upper() for t in ds.ImageType] - exclude_types = {"THUMBNAIL", "LABEL", "OVERVIEW", "MACRO", "ANNOTATION", "LOCALIZER"} - if any(excluded in image_type for excluded in exclude_types): - logger.debug(f"Excluding {dcm_file.name} - ImageType: {image_type}") - files_to_exclude.add(dcm_file) + # Exclude auxiliary images by ImageType Value 3 + # Per DICOM PS3.3 C.8.12.4.1.1: Value 3 should be VOLUME, THUMBNAIL, LABEL, or OVERVIEW. 
+ # We only want VOLUME images + if hasattr(ds, "ImageType") and len(ds.ImageType) >= 3: # noqa: PLR2004 + image_type_value_3 = ds.ImageType[2].upper() + + if image_type_value_3 != "VOLUME": + logger.debug(f"Excluding {dcm_file.name} - ImageType Value 3: {image_type_value_3}") continue + else: + # No ImageType Value 3 - could be standalone WSI or non-standard file + logger.debug(f"DICOM {dcm_file.name} has no ImageType Value 3 - treating as standalone") + + # Try to group by PyramidUID for pyramid resolution selection + pyramid_info = Service._extract_pyramid_info(dcm_file) + if pyramid_info: + pyramid_uid, rows, cols = pyramid_info + pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) + else: + # No PyramidUID or missing dimensions - treat as standalone file + included_dicom_files.append(dcm_file) - # Now process valid WSI images with PyramidUID - if not hasattr(ds, "PyramidUID"): - logger.debug(f"DICOM {dcm_file.name} has no PyramidUID - treating as standalone") - continue + except Exception as e: + logger.debug(f"Could not process DICOM file {dcm_file}: {e}") + continue - pyramid_uid = ds.PyramidUID + # For each pyramid with multiple files, select only the highest resolution + pyramid_files_to_include = Service._find_highest_resolution_files(pyramid_groups) + included_dicom_files.extend(pyramid_files_to_include) - # These represent the full image dimensions across all frames - rows = int(ds.TotalPixelMatrixRows) - cols = int(ds.TotalPixelMatrixColumns) + return included_dicom_files - pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) - except AttributeError as e: - logger.debug(f"DICOM {dcm_file} missing required attributes: {e}") - except Exception as e: - logger.debug(f"Could not read DICOM {dcm_file}: {e}") + @staticmethod + def _extract_pyramid_info(dcm_file: Path) -> tuple[str, int, int] | None: + """Extract pyramid information from a DICOM file. 
- # For each pyramid with multiple files, keep only the highest resolution one - for pyramid_uid, files_with_dims in pyramid_groups.items(): - if len(files_with_dims) > 1: - # Find the file with the largest dimensions (rows * cols = total pixels) - highest_res_file = max(files_with_dims, key=lambda x: x[1] * x[2]) - file_to_keep, rows, cols = highest_res_file + Attempts to read PyramidUID and image dimensions (TotalPixelMatrixRows/Columns) + from a DICOM file. These attributes are used to group multi-file pyramids + and select the highest resolution file. - # Exclude all others - for file_path, _, _ in files_with_dims: - if file_path != file_to_keep: - files_to_exclude.add(file_path) + Args: + dcm_file: Path to the DICOM file to extract information from. + Returns: + Tuple of (PyramidUID, rows, cols) if the file is part of a pyramid, + None if the file is standalone or required attributes are missing. + + Note: + Files without PyramidUID are treated as standalone WSI images. + Missing TotalPixelMatrix attributes also result in None (file treated as standalone). + """ + try: + ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) + + # PyramidUID is Type 1C (conditional) - required if part of a multi-file pyramid + pyramid_uid = ds.PyramidUID + + # TotalPixelMatrix attributes are Type 1 for WSI - should always be present + rows = int(ds.TotalPixelMatrixRows) + cols = int(ds.TotalPixelMatrixColumns) + + return (pyramid_uid, rows, cols) + + except AttributeError as e: + logger.debug(f"DICOM {dcm_file.name} missing pyramid attributes: {e}") + return None + except Exception as e: + logger.debug(f"Could not extract pyramid info from {dcm_file}: {e}") + return None + + @staticmethod + def _find_highest_resolution_files( + pyramid_groups: dict[str, list[tuple[Path, int, int]]], + ) -> list[Path]: + """Find the highest resolution file for each multi-file pyramid. 
+ + For each pyramid (identified by PyramidUID), selects the file with the + largest total pixel count (rows x cols). All other files in the pyramid + are excluded, as OpenSlide can find them automatically. + + Args: + pyramid_groups: Dictionary mapping PyramidUID to list of (file_path, rows, cols). + + Returns: + List of file paths to keep (one per pyramid, the highest resolution). + + Note: + Single-file pyramids (only one file per PyramidUID) are included without + comparison. + """ + files_to_keep: list[Path] = [] + + for pyramid_uid, files_with_dims in pyramid_groups.items(): + highest_res_file_with_dims = max(files_with_dims, key=lambda x: x[1] * x[2]) + highest_res_file, rows, cols = highest_res_file_with_dims + files_to_keep.append(highest_res_file) + + if len(files_with_dims) > 1: logger.debug( - f"DICOM pyramid {pyramid_uid}: keeping {file_to_keep.name} " + f"DICOM pyramid {pyramid_uid}: keeping {highest_res_file.name} " f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" ) - return files_to_exclude + return files_to_keep @staticmethod def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 @@ -407,39 +471,35 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 """Generate metadata from the source directory. Steps: - 1. Recursively files ending with supported extensions in the source directory - 2. Creates a dict with the following columns + 1. Recursively scans files ending with supported extensions in the source directory + 2. For DICOM files (.dcm), filters out auxiliary and redundant files + 3. 
Creates a dict for each file with the following fields: - external_id (str): The external_id of the file, by default equivalent to the absolute file name - source (str): The absolute filename - - checksum_base64_crc32c (str): The CRC32C checksum of the file constructed, base64 encoded + - checksum_base64_crc32c (str): The CRC32C checksum of the file, base64 encoded - resolution_mpp (float): The microns per pixel, inspecting the base layer - - height_px: The height of the image in pixels, inspecting the base layer - - width_px: The width of the image in pixels, inspecting the base layer - - Further attributes depending on the application and it's version - 3. Applies the optional mappings to fill in additional metadata fields in the dict. + - height_px (int): The height of the image in pixels, inspecting the base layer + - width_px (int): The width of the image in pixels, inspecting the base layer + - Further attributes depending on the application and its version + 4. Applies the optional mappings to fill in additional metadata fields in the dict Args: - source_directory (Path): The source directory to generate metadata from. - application_id (str): The ID of the application. - application_version (str|None): The version of the application (semver). - If not given latest version is used. - with_gui_metadata (bool): If True, include additional metadata for GUI. - mappings (list[str]): Mappings of the form ':=,=,...'. + source_directory: The source directory to generate metadata from. + application_id: The ID of the application. + application_version: The version of the application (semver). + If not given, latest version is used. + with_gui_metadata: If True, include additional metadata for GUI display. + mappings: Mappings of the form ':=,=,...'. The regular expression is matched against the external_id attribute of the entry. The key/value pairs are applied to the entry if the pattern matches. 
- with_extra_metadata (bool): If True, include extra metadata from the WSIService. + with_extra_metadata: If True, include extra metadata from the WSIService. Returns: - dict[str, Any]: The generated metadata. + List of metadata dictionaries, one per processable file found. Raises: - Exception: If the metadata cannot be generated. - - Raises: - NotFoundError: If the application version with the given ID is not found. - ValueError: If - the source directory does not exist - or is not a directory. + NotFoundException: If the application version with the given ID is not found. + ValueError: If the source directory does not exist or is not a directory. RuntimeError: If the metadata generation fails unexpectedly. """ logger.trace("Generating metadata from source directory: {}", source_directory) @@ -447,25 +507,27 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 # TODO(Helmut): Use it _ = Service().application_version(application_id, application_version) - metadata = [] - - # Pre-filter: exclude redundant DICOM files from multi-file pyramids - dicom_files_to_exclude = Service._filter_dicom_pyramid_files(source_directory) + metadata: list[dict[str, Any]] = [] try: extensions = get_supported_extensions_for_application(application_id) for extension in extensions: - for file_path in source_directory.glob(f"**/*{extension}"): - # Skip excluded DICOM files - if file_path in dicom_files_to_exclude: - continue + # Special handling for DICOM files - filter out auxiliary and redundant files + files_to_process: Iterable[Path] + if extension == ".dcm": + files_to_process = Service._select_dicom_files_to_process(source_directory) + else: + # For non-DICOM formats, process all files with this extension + files_to_process = source_directory.glob(f"**/*{extension}") + for file_path in files_to_process: # Generate CRC32C checksum with google_crc32c and encode as base64 hash_sum = google_crc32c.Checksum() # type: ignore[no-untyped-call] with file_path.open("rb") 
as f: while chunk := f.read(1024): hash_sum.update(chunk) # type: ignore[no-untyped-call] checksum = str(base64.b64encode(hash_sum.digest()), "UTF-8") # type: ignore[no-untyped-call] + try: image_metadata = WSIService().get_metadata(file_path) width = image_metadata["dimensions"]["width"] diff --git a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index 68d4f9e43..ce2a8ed4c 100644 --- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -1,17 +1,457 @@ """Tests to verify the service functionality of the application module.""" from collections.abc import Callable +from datetime import UTC, datetime, timedelta from pathlib import Path +from unittest.mock import MagicMock, patch import pydicom import pytest +from typer.testing import CliRunner from aignostics.application import Service as ApplicationService +from aignostics.application._utils import validate_due_date +from aignostics.platform import NotFoundException, RunData, RunOutput +from tests.constants_test import ( + HETA_APPLICATION_ID, + HETA_APPLICATION_VERSION, + TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP, +) + + +@pytest.mark.unit +def test_validate_due_date_none() -> None: + """Test that None is accepted (optional parameter).""" + # Should not raise any exception + validate_due_date(None) + + +@pytest.mark.unit +def test_validate_due_date_valid_formats() -> None: + """Test that valid ISO 8601 formats in the future are accepted.""" + # Create a datetime 2 hours in the future + future_time = datetime.now(tz=UTC) + timedelta(hours=2) + + valid_formats = [ + future_time.isoformat(), # With timezone offset like +00:00 + future_time.strftime("%Y-%m-%dT%H:%M:%S") + "Z", # With Z suffix + future_time.strftime("%Y-%m-%dT%H:%M:%S.%f") + "Z", # With microseconds and Z + future_time.strftime("%Y-%m-%dT%H:%M:%S.%f%z"), # With microseconds and timezone + ] + + for time_str in valid_formats: + # Should not raise any 
exception + try: + validate_due_date(time_str) + except ValueError as e: + pytest.fail(f"Valid ISO 8601 format '{time_str}' was rejected: {e}") + + +@pytest.mark.unit +def test_validate_due_date_invalid_format() -> None: + """Test that invalid ISO 8601 formats are rejected.""" + invalid_formats = [ + "2025-10-19", # Date only + "19:53:00", # Time only + "2025/10/19 19:53:00", # Wrong separators + "2025-10-19 19:53:00", # Space instead of T + "not-a-date", # Completely invalid + "2025-13-45T25:70:99Z", # Invalid values + ] + + for time_str in invalid_formats: + with pytest.raises(ValueError, match=r"Invalid ISO 8601 format"): + validate_due_date(time_str) + + +@pytest.mark.unit +def test_validate_due_date_past_datetime() -> None: + """Test that datetimes in the past are rejected.""" + # Create a datetime 2 hours in the past + past_time = datetime.now(tz=UTC) - timedelta(hours=2) + + past_formats = [ + past_time.isoformat(), + past_time.strftime("%Y-%m-%dT%H:%M:%S") + "Z", + ] + + for time_str in past_formats: + with pytest.raises(ValueError, match=r"due_date must be in the future"): + validate_due_date(time_str) + + +@pytest.mark.unit +def test_validate_due_date_current_time() -> None: + """Test that current time (not future) is rejected.""" + # Get current time - should be rejected as it's not in the future + current_time = datetime.now(tz=UTC) + current_time_str = current_time.isoformat() + + with pytest.raises(ValueError, match=r"due_date must be in the future"): + validate_due_date(current_time_str) + + +@pytest.mark.unit +def test_validate_due_date_edge_case_one_second_future() -> None: + """Test that a datetime 1 second in the future is accepted.""" + # Create a datetime 1 second in the future + future_time = datetime.now(tz=UTC) + timedelta(seconds=1) + future_time_str = future_time.isoformat() + + # Should not raise any exception + try: + validate_due_date(future_time_str) + except ValueError as e: + pytest.fail(f"Future datetime '{future_time_str}' was 
rejected: {e}") + + +@pytest.mark.e2e +def test_application_version_valid_semver_formats(runner: CliRunner) -> None: + """Test that valid semver formats are accepted.""" + from aignostics.application import Service as ApplicationService + + service = ApplicationService() + + # These should work if the application exists + valid_formats = [ + "test-app:v1.0.0", + "test-app:v1.2.3", + "test-app:v10.20.30", + "test-app:v1.1.2-prerelease+meta", + "test-app:v1.1.2+meta", + "test-app:v1.1.2+meta-valid", + "test-app:v1.0.0-alpha", + "test-app:v1.0.0-beta", + "test-app:v1.0.0-alpha.beta", + "test-app:v1.0.0-alpha.1", + "test-app:v1.0.0-alpha0.beta", + "test-app:v1.0.0-alpha.alpha", + "test-app:v1.0.0-alpha+metadata", + "test-app:v1.0.0-rc.1+meta", + ] + + for version_id in valid_formats: + try: + service.application_version(version_id) + except ValueError as e: + pytest.fail(f"Valid semver format '{version_id}' was rejected: {e}") + except NotFoundException: + pytest.skip(f"Application '{version_id.split(':')[0]}' not found, skipping test for this version format.") + + +@pytest.mark.unit +def test_application_version_invalid_semver_formats(runner: CliRunner, record_property) -> None: + """Test that invalid semver formats are rejected with ValueError.""" + record_property("tested-item-id", "SPEC-APPLICATION-SERVICE") + from aignostics.application import Service as ApplicationService + + service = ApplicationService() + + invalid_application_versions = [ + "test-app:v1.0.0", # legacy format + "bla", # not semver + ] + + for application_version in invalid_application_versions: + with pytest.raises(ValueError, match=r"not compliant with semantic versioning"): + service.application_version("test-app", application_version) + + +@pytest.mark.e2e +@pytest.mark.skipif( + TEST_APPLICATION_VERSION_USE_LATEST_FALLBACK_SKIP, + reason="Skipping test that uses 'latest' application version if so configured for given platform environment.", +) +def 
test_application_version_use_latest_fallback(runner: CliRunner, record_property) -> None: + """Test that latest version works and tested.""" + record_property("tested-item-id", "SPEC-APPLICATION-SERVICE") + service = ApplicationService() + + try: + app_version = service.application_version(HETA_APPLICATION_ID) + assert app_version is not None + assert app_version.version_number == HETA_APPLICATION_VERSION + except NotFoundException as e: + if "No versions found for application" in str(e): + pass # This is expected behavior + except ValueError as e: + pytest.fail(f"Unexpected error: {e}") + + with pytest.raises(ValueError, match=r"not compliant with semantic versioning"): + service.application_version(HETA_APPLICATION_ID, "invalid-format") + + +@pytest.mark.e2e +@pytest.mark.timeout(timeout=60 * 2) +def test_application_versions_are_unique(runner: CliRunner) -> None: + """Check that application versions are unique (currently fails due to backend bug).""" + # Get all applications + service = ApplicationService() + applications = service.applications() + + # Check each application for duplicate versions + for app in applications: + versions = service.application_versions(app.application_id) + + # Extract version numbers + version_numbers = [v.version_number for v in versions] + + # Check for duplicates + unique_versions = set(version_numbers) + assert len(version_numbers) == len(unique_versions), ( + f"Application '{app.application_id}' has duplicate versions. 
" + f"Found {len(version_numbers)} versions but only {len(unique_versions)} unique: {version_numbers}" + ) + + +@pytest.mark.unit +def test_application_runs_query_with_note_regex_raises() -> None: + """Test that using query with note_regex raises ValueError.""" + service = ApplicationService() + + with pytest.raises(ValueError, match=r"Cannot use 'query' parameter together with 'note_regex' parameter"): + service.application_runs(query="test", note_regex="test.*") + + +@pytest.mark.unit +def test_application_runs_query_with_tags_raises() -> None: + """Test that using query with tags raises ValueError.""" + service = ApplicationService() + + with pytest.raises(ValueError, match=r"Cannot use 'query' parameter together with 'tags' parameter"): + service.application_runs(query="test", tags={"tag1", "tag2"}) + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_runs_query_searches_note_and_tags(mock_get_client: MagicMock) -> None: + """Test that query parameter searches both note and tags with union semantics.""" + # Create mock runs + run_from_note = MagicMock(spec=RunData) + run_from_note.run_id = "run-note-123" + run_from_note.output = RunOutput.FULL + + run_from_tag = MagicMock(spec=RunData) + run_from_tag.run_id = "run-tag-456" + run_from_tag.output = RunOutput.FULL + + run_from_both = MagicMock(spec=RunData) + run_from_both.run_id = "run-both-789" + run_from_both.output = RunOutput.FULL + + # Mock the platform client to return different runs for note and tag searches + mock_client = MagicMock() + mock_runs = MagicMock() + + # First call returns runs matching note, second call returns runs matching tags + mock_runs.list_data.side_effect = [ + iter([run_from_note, run_from_both]), # Note search results + iter([run_from_tag]), # Tag search results (run_from_both already in note results, so not added) + ] + + mock_client.runs = mock_runs + mock_get_client.return_value = mock_client + + service = 
ApplicationService() + results = service.application_runs(query="test") + + # Verify we got union of both searches (3 unique runs) + assert len(results) == 3 + assert run_from_note in results + assert run_from_tag in results + assert run_from_both in results + + # Verify that list_data was called twice (once for note, once for tags) + assert mock_runs.list_data.call_count == 2 + + # Verify the custom_metadata parameters contain the escaped query with case insensitive flag + calls = mock_runs.list_data.call_args_list + note_call_kwargs = calls[0][1] + tag_call_kwargs = calls[1][1] + + assert "custom_metadata" in note_call_kwargs + assert "$.sdk.note" in note_call_kwargs["custom_metadata"] + assert 'like_regex "test"' in note_call_kwargs["custom_metadata"] + assert 'flag "i"' in note_call_kwargs["custom_metadata"] + + assert "custom_metadata" in tag_call_kwargs + assert "$.sdk.tags" in tag_call_kwargs["custom_metadata"] + assert 'like_regex "test"' in tag_call_kwargs["custom_metadata"] + assert 'flag "i"' in tag_call_kwargs["custom_metadata"] + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_runs_query_deduplicates_results(mock_get_client: MagicMock) -> None: + """Test that query parameter deduplicates runs that match both note and tags.""" + # Create mock run that matches both searches + run_from_both = MagicMock(spec=RunData) + run_from_both.run_id = "run-both-123" + run_from_both.output = RunOutput.FULL + + # Mock the platform client to return the same run from both searches + mock_client = MagicMock() + mock_runs = MagicMock() + + # Both searches return the same run + mock_runs.list_data.side_effect = [ + iter([run_from_both]), # Note search results + iter([run_from_both]), # Tag search results (should be deduplicated) + ] + + mock_client.runs = mock_runs + mock_get_client.return_value = mock_client + + service = ApplicationService() + results = service.application_runs(query="test") + + # Verify 
we only got one run (deduplicated) + assert len(results) == 1 + assert results[0].run_id == "run-both-123" + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_runs_query_respects_limit(mock_get_client: MagicMock) -> None: + """Test that query parameter respects the limit parameter.""" + # Create mock runs + runs = [] + for i in range(10): + run = MagicMock(spec=RunData) + run.run_id = f"run-{i}" + run.output = RunOutput.FULL + runs.append(run) + + # Mock the platform client to return many runs + mock_client = MagicMock() + mock_runs = MagicMock() + + # Note search returns 5 runs, tag search returns 5 runs + mock_runs.list_data.side_effect = [ + iter(runs[:5]), # Note search results + iter(runs[5:]), # Tag search results + ] + + mock_client.runs = mock_runs + mock_get_client.return_value = mock_client + + service = ApplicationService() + results = service.application_runs(query="test", limit=3) + + # Verify we only got 3 runs despite having 10 total + assert len(results) == 3 + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_runs_query_escapes_special_characters(mock_get_client: MagicMock) -> None: + """Test that query parameter properly escapes special regex characters.""" + # Mock the platform client + mock_client = MagicMock() + mock_runs = MagicMock() + mock_runs.list_data.side_effect = [ + iter([]), # Note search results + iter([]), # Tag search results + ] + mock_client.runs = mock_runs + mock_get_client.return_value = mock_client + + service = ApplicationService() + # Use query with special characters that need escaping + service.application_runs(query='test"value\\path') + + # Verify the custom_metadata parameters contain properly escaped query + calls = mock_runs.list_data.call_args_list + note_call_kwargs = calls[0][1] + tag_call_kwargs = calls[1][1] + + # Check that double quotes and backslashes are properly escaped + assert 
'test\\"value\\\\path' in note_call_kwargs["custom_metadata"] + assert 'test\\"value\\\\path' in tag_call_kwargs["custom_metadata"] + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_run_update_custom_metadata_success(mock_get_client: MagicMock) -> None: + """Test successful update of run custom metadata.""" + mock_client = MagicMock() + mock_run = MagicMock() + mock_client.run.return_value = mock_run + mock_get_client.return_value = mock_client + + service = ApplicationService() + custom_metadata = {"key": "value", "tags": ["tag1", "tag2"]} + + # Should not raise any exception + service.application_run_update_custom_metadata("run-123", custom_metadata) + + # Verify the run() method was called with correct run_id + mock_client.run.assert_called_once_with("run-123") + # Verify the update_custom_metadata method was called with correct arguments + mock_run.update_custom_metadata.assert_called_once_with(custom_metadata) + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_run_update_custom_metadata_not_found(mock_get_client: MagicMock) -> None: + """Test update metadata with non-existent run.""" + mock_client = MagicMock() + mock_run = MagicMock() + mock_run.update_custom_metadata.side_effect = NotFoundException("Run not found") + mock_client.run.return_value = mock_run + mock_get_client.return_value = mock_client + + service = ApplicationService() + + with pytest.raises(NotFoundException, match="not found"): + service.application_run_update_custom_metadata("invalid-run-id", {"key": "value"}) + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_run_update_item_custom_metadata_success(mock_get_client: MagicMock) -> None: + """Test successful update of item custom metadata.""" + mock_client = MagicMock() + mock_run = MagicMock() + mock_client.run.return_value = mock_run + 
mock_get_client.return_value = mock_client + + service = ApplicationService() + custom_metadata = {"key": "value", "note": "test note"} + + # Should not raise any exception + service.application_run_update_item_custom_metadata("run-123", "item-ext-id", custom_metadata) + + # Verify the run() method was called with correct run_id + mock_client.run.assert_called_once_with("run-123") + # Verify the update_item_custom_metadata method was called with correct arguments + mock_run.update_item_custom_metadata.assert_called_once_with("item-ext-id", custom_metadata) + + +@pytest.mark.unit +@patch("aignostics.application._service.Service._get_platform_client") +def test_application_run_update_item_custom_metadata_not_found(mock_get_client: MagicMock) -> None: + """Test update item metadata with non-existent run or item.""" + mock_client = MagicMock() + mock_run = MagicMock() + mock_run.update_item_custom_metadata.side_effect = NotFoundException("Item not found") + mock_client.run.return_value = mock_run + mock_get_client.return_value = mock_client + + service = ApplicationService() + + with pytest.raises(NotFoundException, match="not found"): + service.application_run_update_item_custom_metadata("run-123", "invalid-item-id", {"key": "value"}) @pytest.fixture -def create_dicom() -> Callable[..., pydicom.Dataset]: - """Fixture that returns a function to create minimal but valid DICOM datasets.""" +def dicom_factory() -> Callable[..., pydicom.Dataset]: + """Factory fixture for creating DICOM datasets with custom parameters. + + The nested function returns a factory that lets us create multiple DICOMs + with different parameters in each test (e.g., different pyramid UIDs, + resolutions, and image types). 
+ """ def _create_dicom( pyramid_uid: str | None, @@ -70,126 +510,129 @@ def _create_dicom( @pytest.mark.unit -def test_filter_dicom_pyramid_single_file( +def test_select_dicom_pyramid_single_file( tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset], + dicom_factory: Callable[..., pydicom.Dataset], ) -> None: - """Test that single DICOM files with PyramidUID are not filtered.""" - ds = create_dicom("1.2.3.4.5", 1024, 1024) + """Test that single DICOM files with PyramidUID are included.""" + ds = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file = tmp_path / "test.dcm" ds.save_as(dcm_file, write_like_original=False) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) - assert len(excluded) == 0 + included = ApplicationService._select_dicom_files_to_process(tmp_path) + assert len(included) == 1 + assert dcm_file in included @pytest.mark.unit -def test_filter_dicom_pyramid_standalone_no_pyramid_uid( +def test_select_dicom_pyramid_standalone_no_pyramid_uid( tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset], + dicom_factory: Callable[..., pydicom.Dataset], ) -> None: - """Test that standalone DICOM files without PyramidUID are not filtered.""" - ds = create_dicom(None, 1024, 1024) # No PyramidUID + """Test that standalone DICOM files without PyramidUID are included.""" + ds = dicom_factory(None, 1024, 1024) # No PyramidUID, no ImageType Value 3 dcm_file = tmp_path / "test.dcm" ds.save_as(dcm_file, write_like_original=False) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) - assert len(excluded) == 0 + included = ApplicationService._select_dicom_files_to_process(tmp_path) + assert len(included) == 1 + assert dcm_file in included @pytest.mark.unit -def test_filter_dicom_pyramid_multi_file(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: +def test_select_dicom_pyramid_multi_file(tmp_path: Path, dicom_factory: Callable[..., pydicom.Dataset]) -> 
None: """Test that for multi-file DICOM pyramid, only the highest resolution file is kept.""" pyramid_uid = "1.2.3.4.5" # Create low resolution DICOM file (smallest pyramid level) - ds_low = create_dicom(pyramid_uid, 512, 512) + ds_low = dicom_factory(pyramid_uid, 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file_low = tmp_path / "test_low.dcm" ds_low.save_as(dcm_file_low, write_like_original=False) # Create medium resolution DICOM file - ds_med = create_dicom(pyramid_uid, 1024, 1024) + ds_med = dicom_factory(pyramid_uid, 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file_med = tmp_path / "test_med.dcm" ds_med.save_as(dcm_file_med, write_like_original=False) # Create high resolution DICOM file (base layer - highest resolution) - ds_high = create_dicom(pyramid_uid, 2048, 2048) + ds_high = dicom_factory(pyramid_uid, 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file_high = tmp_path / "test_high.dcm" ds_high.save_as(dcm_file_high, write_like_original=False) - # Filter the pyramid - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + # Select files to process + included = ApplicationService._select_dicom_files_to_process(tmp_path) - # Should exclude 2 files (low and medium), keeping only the highest resolution - assert len(excluded) == 2 - assert dcm_file_low in excluded - assert dcm_file_med in excluded - assert dcm_file_high not in excluded + # Should include only 1 file (the highest resolution) + assert len(included) == 1 + assert dcm_file_high in included + assert dcm_file_low not in included + assert dcm_file_med not in included @pytest.mark.unit -def test_filter_dicom_pyramid_multiple_pyramids(tmp_path: Path, create_dicom: Callable[..., pydicom.Dataset]) -> None: +def test_select_dicom_pyramid_multiple_pyramids(tmp_path: Path, dicom_factory: Callable[..., pydicom.Dataset]) -> None: """Test that files from different pyramids are not filtered against each other.""" # Pyramid 1 - two files (pyramid with 2 
levels) - ds1_low = create_dicom("1.2.3.4.5", 512, 512) + ds1_low = dicom_factory("1.2.3.4.5", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file1_low = tmp_path / "pyramid1_low.dcm" ds1_low.save_as(dcm_file1_low, write_like_original=False) - ds1_high = create_dicom("1.2.3.4.5", 1024, 1024) + ds1_high = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file1_high = tmp_path / "pyramid1_high.dcm" ds1_high.save_as(dcm_file1_high, write_like_original=False) # Pyramid 2 - single file (standalone, single level) - ds2 = create_dicom("6.7.8.9.0", 512, 512) + ds2 = dicom_factory("6.7.8.9.0", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file2 = tmp_path / "pyramid2.dcm" ds2.save_as(dcm_file2, write_like_original=False) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + included = ApplicationService._select_dicom_files_to_process(tmp_path) - # Should exclude only the low-res file from pyramid 1 - assert len(excluded) == 1 - assert dcm_file1_low in excluded - assert dcm_file1_high not in excluded - assert dcm_file2 not in excluded + # Should include 2 files: highest from pyramid 1 and single from pyramid 2 + assert len(included) == 2 + assert dcm_file1_high in included + assert dcm_file2 in included + assert dcm_file1_low not in included @pytest.mark.unit -def test_filter_dicom_pyramid_exclude_non_wsi( +def test_select_dicom_exclude_non_wsi( tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset], + dicom_factory: Callable[..., pydicom.Dataset], ) -> None: """Test that non-WSI DICOM files (e.g., segmentations) are excluded.""" # Create a segmentation storage DICOM - ds_seg = create_dicom( + ds_seg = dicom_factory( "1.2.3.4.5", 1024, 1024, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4", # Segmentation Storage + image_type=["DERIVED", "PRIMARY", "VOLUME"], ) dcm_file_seg = tmp_path / "segmentation.dcm" ds_seg.save_as(dcm_file_seg, write_like_original=False) # Create a valid WSI - ds_wsi = 
create_dicom("1.2.3.4.5", 1024, 1024) + ds_wsi = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file_wsi = tmp_path / "wsi.dcm" ds_wsi.save_as(dcm_file_wsi, write_like_original=False) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + included = ApplicationService._select_dicom_files_to_process(tmp_path) - # Should exclude only the segmentation file - assert len(excluded) == 1 - assert dcm_file_seg in excluded - assert dcm_file_wsi not in excluded + # Should include only the WSI file + assert len(included) == 1 + assert dcm_file_wsi in included + assert dcm_file_seg not in included @pytest.mark.unit -def test_filter_dicom_pyramid_exclude_thumbnails( +def test_select_dicom_exclude_non_volumes( tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset], + dicom_factory: Callable[..., pydicom.Dataset], ) -> None: - """Test that thumbnail images are excluded.""" + """Test that non-volume images are excluded.""" # Create a thumbnail - ds_thumb = create_dicom( + ds_thumb = dicom_factory( "1.2.3.4.5", 256, 256, @@ -199,7 +642,7 @@ def test_filter_dicom_pyramid_exclude_thumbnails( ds_thumb.save_as(dcm_file_thumb, write_like_original=False) # Create a regular WSI image - ds_wsi = create_dicom( + ds_wsi = dicom_factory( "1.2.3.4.5", 1024, 1024, @@ -208,81 +651,69 @@ def test_filter_dicom_pyramid_exclude_thumbnails( dcm_file_wsi = tmp_path / "wsi.dcm" ds_wsi.save_as(dcm_file_wsi, write_like_original=False) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + included = ApplicationService._select_dicom_files_to_process(tmp_path) - # Should exclude only the thumbnail - assert len(excluded) == 1 - assert dcm_file_thumb in excluded - assert dcm_file_wsi not in excluded + # Should include only the VOLUME image + assert len(included) == 1 + assert dcm_file_wsi in included + assert dcm_file_thumb not in included @pytest.mark.unit -def test_filter_dicom_pyramid_missing_attributes( +def 
test_select_dicom_no_image_type_value_3( tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], ) -> None: - """Test that DICOM files without required WSI attributes are skipped gracefully.""" - # Create a DICOM without TotalPixelMatrix attributes - ds = pydicom.Dataset() - ds.file_meta = pydicom.Dataset() - ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian - ds.file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.77.1.6" - ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() - - ds.SOPInstanceUID = pydicom.uid.generate_uid() - ds.SOPClassUID = "1.2.840.10008.5.1.4.1.1.77.1.6" - ds.StudyInstanceUID = pydicom.uid.generate_uid() - ds.SeriesInstanceUID = pydicom.uid.generate_uid() - ds.Modality = "SM" - ds.Rows = 512 - ds.Columns = 512 - ds.PyramidUID = "1.2.3.4.5" - # Note: No TotalPixelMatrixRows/Columns - - dcm_file = tmp_path / "incomplete_wsi.dcm" + """Test that files without ImageType Value 3 are included as standalone.""" + # Create file with only 2 ImageType values + ds = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY"]) + dcm_file = tmp_path / "no_value3.dcm" ds.save_as(dcm_file, write_like_original=False) - # Should not crash, and should not exclude anything (file is skipped gracefully) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) - assert len(excluded) == 0 + included = ApplicationService._select_dicom_files_to_process(tmp_path) + + # Should include as standalone (no Value 3 to check) + assert len(included) == 1 + assert dcm_file in included @pytest.mark.unit -def test_filter_dicom_pyramid_mixed_scenario( +def test_select_dicom_mixed_scenario( tmp_path: Path, - create_dicom: Callable[..., pydicom.Dataset], + dicom_factory: Callable[..., pydicom.Dataset], ) -> None: """Test complex scenario with multiple pyramids, thumbnails, and segmentations.""" - # Pyramid 1: 3 levels (keep highest) - ds1_low = create_dicom("pyramid1", 512, 512) + # Pyramid 1: 2 levels 
(keep highest) + ds1_low = dicom_factory("pyramid1", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file1_low = tmp_path / "p1_low.dcm" ds1_low.save_as(dcm_file1_low, write_like_original=False) - ds1_high = create_dicom("pyramid1", 2048, 2048) + ds1_high = dicom_factory("pyramid1", 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file1_high = tmp_path / "p1_high.dcm" ds1_high.save_as(dcm_file1_high, write_like_original=False) # Thumbnail for pyramid 1 (exclude) - ds1_thumb = create_dicom("pyramid1", 256, 256, image_type=["DERIVED", "PRIMARY", "THUMBNAIL"]) + ds1_thumb = dicom_factory("pyramid1", 256, 256, image_type=["DERIVED", "PRIMARY", "THUMBNAIL"]) dcm_file1_thumb = tmp_path / "p1_thumb.dcm" ds1_thumb.save_as(dcm_file1_thumb, write_like_original=False) # Segmentation file (exclude) - ds_seg = create_dicom("pyramid1", 2048, 2048, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4") + ds_seg = dicom_factory("pyramid1", 2048, 2048, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4") dcm_file_seg = tmp_path / "seg.dcm" ds_seg.save_as(dcm_file_seg, write_like_original=False) # Standalone WSI without PyramidUID (keep) - ds_standalone = create_dicom(None, 1024, 1024) + ds_standalone = dicom_factory(None, 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) dcm_file_standalone = tmp_path / "standalone.dcm" ds_standalone.save_as(dcm_file_standalone, write_like_original=False) - excluded = ApplicationService._filter_dicom_pyramid_files(tmp_path) + included = ApplicationService._select_dicom_files_to_process(tmp_path) - # Should exclude: low-res from pyramid1, thumbnail, segmentation - assert len(excluded) == 3 - assert dcm_file1_low in excluded - assert dcm_file1_thumb in excluded - assert dcm_file_seg in excluded - # Should keep: high-res from pyramid1, standalone - assert dcm_file1_high not in excluded - assert dcm_file_standalone not in excluded + # Should include: high-res from pyramid1, standalone + assert len(included) == 2 + assert dcm_file1_high in 
included + assert dcm_file_standalone in included + # Should exclude: low-res, thumbnail, segmentation + assert dcm_file1_low not in included + assert dcm_file1_thumb not in included + assert dcm_file_seg not in included From 326dfa6cbdce3b9cd0f82ef72401622277d88693 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Fri, 5 Dec 2025 17:09:37 +0100 Subject: [PATCH 13/18] task(wsi): Move DICOM filtering logic to pydicom handler, align CLI. --- src/aignostics/application/_service.py | 206 ++----------------- src/aignostics/wsi/_cli.py | 3 +- src/aignostics/wsi/_pydicom_handler.py | 158 +++++++++++++- src/aignostics/wsi/_service.py | 27 +++ tests/aignostics/application/service_test.py | 65 ------ tests/aignostics/wsi/service_test.py | 198 ++++++++++++++++++ 6 files changed, 399 insertions(+), 258 deletions(-) diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index fdb614c36..4e1d107a5 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -3,53 +3,36 @@ import base64 import re import time -from collections import defaultdict -from collections.abc import Callable, Generator, Iterable +from collections.abc import Callable, Generator from http import HTTPStatus from importlib.util import find_spec from pathlib import Path from typing import Any import google_crc32c -import pydicom import requests from loguru import logger from aignostics.bucket import Service as BucketService from aignostics.constants import TEST_APP_APPLICATION_ID -from aignostics.platform import ( - LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, - ApiException, - Application, - ApplicationSummary, - ApplicationVersion, - Client, - InputArtifact, - InputItem, - NotFoundException, - Run, - RunData, - RunOutput, - RunState, -) +from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, + ApiException, Application, ApplicationSummary, + ApplicationVersion, Client, InputArtifact, + InputItem, NotFoundException, Run, 
RunData, + RunOutput, RunState) from aignostics.platform import Service as PlatformService from aignostics.utils import BaseService, Health, sanitize_path_component from aignostics.wsi import Service as WSIService -from ._download import ( - download_available_items, - download_url_to_file_with_progress, - extract_filename_from_url, - update_progress, -) +from ._download import (download_available_items, + download_url_to_file_with_progress, + extract_filename_from_url, update_progress) from ._models import DownloadProgress, DownloadProgressState from ._settings import Settings -from ._utils import ( - get_mime_type_for_artifact, - get_supported_extensions_for_application, - is_not_terminated_with_deadline_exceeded, - validate_due_date, -) +from ._utils import (get_mime_type_for_artifact, + get_supported_extensions_for_application, + is_not_terminated_with_deadline_exceeded, + validate_due_date) has_qupath_extra = find_spec("ijson") if has_qupath_extra: @@ -310,155 +293,6 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None for key_value in key_value_pairs: Service._process_key_value_pair(entry, key_value, external_id) - @staticmethod - def _select_dicom_files_to_process(source_directory: Path) -> list[Path]: - """Select DICOM files to process, excluding auxiliary and redundant files. - - For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), - keeps only the highest resolution file. Excludes segmentations, annotations, - thumbnails, labels, and other non-image DICOM files. OpenSlide will automatically - find related pyramid files in the same directory when needed. 
- - Filtering Strategy: - - Only processes VL Whole Slide Microscopy Image Storage - (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6) - Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html - - Excludes auxiliary images by ImageType Value 3 (keeps only VOLUME images) - Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.12.4.html#sect_C.8.12.4.1.1 - - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) - - For pyramids with multiple files, selects only the highest resolution based - on TotalPixelMatrixRows x TotalPixelMatrixColumns - - Preserves standalone WSI files without PyramidUID - - Args: - source_directory: The directory to recursively scan for DICOM files. - - Returns: - List of DICOM file paths to process. All other DICOM files are excluded. - - Note: - This function reads DICOM metadata but not pixel data (stop_before_pixels=True). - Files that cannot be read or are missing required attributes are logged and skipped. - """ - pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) - included_dicom_files: list[Path] = [] - - # Scan all DICOM files and filter based on SOPClassUID and ImageType - for dcm_file in source_directory.glob("**/*.dcm"): - try: - ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) - - # Exclude non-WSI image files by SOPClassUID - # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) - if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": - logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") - continue - - # Exclude auxiliary images by ImageType Value 3 - # Per DICOM PS3.3 C.8.12.4.1.1: Value 3 should be VOLUME, THUMBNAIL, LABEL, or OVERVIEW. 
- # We only want VOLUME images - if hasattr(ds, "ImageType") and len(ds.ImageType) >= 3: # noqa: PLR2004 - image_type_value_3 = ds.ImageType[2].upper() - - if image_type_value_3 != "VOLUME": - logger.debug(f"Excluding {dcm_file.name} - ImageType Value 3: {image_type_value_3}") - continue - else: - # No ImageType Value 3 - could be standalone WSI or non-standard file - logger.debug(f"DICOM {dcm_file.name} has no ImageType Value 3 - treating as standalone") - - # Try to group by PyramidUID for pyramid resolution selection - pyramid_info = Service._extract_pyramid_info(dcm_file) - if pyramid_info: - pyramid_uid, rows, cols = pyramid_info - pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) - else: - # No PyramidUID or missing dimensions - treat as standalone file - included_dicom_files.append(dcm_file) - - except Exception as e: - logger.debug(f"Could not process DICOM file {dcm_file}: {e}") - continue - - # For each pyramid with multiple files, select only the highest resolution - pyramid_files_to_include = Service._find_highest_resolution_files(pyramid_groups) - included_dicom_files.extend(pyramid_files_to_include) - - return included_dicom_files - - @staticmethod - def _extract_pyramid_info(dcm_file: Path) -> tuple[str, int, int] | None: - """Extract pyramid information from a DICOM file. - - Attempts to read PyramidUID and image dimensions (TotalPixelMatrixRows/Columns) - from a DICOM file. These attributes are used to group multi-file pyramids - and select the highest resolution file. - - Args: - dcm_file: Path to the DICOM file to extract information from. - - Returns: - Tuple of (PyramidUID, rows, cols) if the file is part of a pyramid, - None if the file is standalone or required attributes are missing. - - Note: - Files without PyramidUID are treated as standalone WSI images. - Missing TotalPixelMatrix attributes also result in None (file treated as standalone). 
- """ - try: - ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) - - # PyramidUID is Type 1C (conditional) - required if part of a multi-file pyramid - pyramid_uid = ds.PyramidUID - - # TotalPixelMatrix attributes are Type 1 for WSI - should always be present - rows = int(ds.TotalPixelMatrixRows) - cols = int(ds.TotalPixelMatrixColumns) - - return (pyramid_uid, rows, cols) - - except AttributeError as e: - logger.debug(f"DICOM {dcm_file.name} missing pyramid attributes: {e}") - return None - except Exception as e: - logger.debug(f"Could not extract pyramid info from {dcm_file}: {e}") - return None - - @staticmethod - def _find_highest_resolution_files( - pyramid_groups: dict[str, list[tuple[Path, int, int]]], - ) -> list[Path]: - """Find the highest resolution file for each multi-file pyramid. - - For each pyramid (identified by PyramidUID), selects the file with the - largest total pixel count (rows x cols). All other files in the pyramid - are excluded, as OpenSlide can find them automatically. - - Args: - pyramid_groups: Dictionary mapping PyramidUID to list of (file_path, rows, cols). - - Returns: - List of file paths to keep (one per pyramid, the highest resolution). - - Note: - Single-file pyramids (only one file per PyramidUID) are included without - comparison. 
- """ - files_to_keep: list[Path] = [] - - for pyramid_uid, files_with_dims in pyramid_groups.items(): - highest_res_file_with_dims = max(files_with_dims, key=lambda x: x[1] * x[2]) - highest_res_file, rows, cols = highest_res_file_with_dims - files_to_keep.append(highest_res_file) - - if len(files_with_dims) > 1: - logger.debug( - f"DICOM pyramid {pyramid_uid}: keeping {highest_res_file.name} " - f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" - ) - - return files_to_keep - @staticmethod def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 source_directory: Path, @@ -508,17 +342,12 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 _ = Service().application_version(application_id, application_version) metadata: list[dict[str, Any]] = [] + wsi_service = WSIService() try: extensions = get_supported_extensions_for_application(application_id) for extension in extensions: - # Special handling for DICOM files - filter out auxiliary and redundant files - files_to_process: Iterable[Path] - if extension == ".dcm": - files_to_process = Service._select_dicom_files_to_process(source_directory) - else: - # For non-DICOM formats, process all files with this extension - files_to_process = source_directory.glob(f"**/*{extension}") + files_to_process = wsi_service.get_wsi_files_to_process(source_directory, extension) for file_path in files_to_process: # Generate CRC32C checksum with google_crc32c and encode as base64 @@ -529,7 +358,7 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 checksum = str(base64.b64encode(hash_sum.digest()), "UTF-8") # type: ignore[no-untyped-call] try: - image_metadata = WSIService().get_metadata(file_path) + image_metadata = wsi_service.get_metadata(file_path) width = image_metadata["dimensions"]["width"] height = image_metadata["dimensions"]["height"] mpp = image_metadata["resolution"]["mpp_x"] @@ -1210,7 +1039,8 @@ def application_run_submit( # noqa: PLR0913, PLR0917, 
PLR0912, C901, PLR0915 # Validate pipeline configuration if present if "pipeline" in sdk_metadata: - from aignostics.platform._sdk_metadata import PipelineConfig # noqa: PLC0415 + from aignostics.platform._sdk_metadata import \ + PipelineConfig # noqa: PLC0415 try: PipelineConfig.model_validate(sdk_metadata["pipeline"]) diff --git a/src/aignostics/wsi/_cli.py b/src/aignostics/wsi/_cli.py index 92bb4d47a..e751e757c 100644 --- a/src/aignostics/wsi/_cli.py +++ b/src/aignostics/wsi/_cli.py @@ -105,13 +105,14 @@ def dicom_inspect( ], verbose: Annotated[bool, typer.Option(help="Verbose output")] = False, summary: Annotated[bool, typer.Option(help="Show only summary information")] = False, + wsi_only: Annotated[bool, typer.Option(help="Filter to WSI files only")] = False, ) -> None: # pylint: disable=W0613 """Inspect DICOM files at any hierarchy level.""" from ._pydicom_handler import PydicomHandler # noqa: PLC0415 try: with PydicomHandler.from_file(str(path)) as handler: - metadata = handler.get_metadata(verbose) + metadata = handler.get_metadata(verbose, wsi_only) if metadata["type"] == "empty": console.print("[bold red]No DICOM files found in the specified path.[/bold red]") diff --git a/src/aignostics/wsi/_pydicom_handler.py b/src/aignostics/wsi/_pydicom_handler.py index f00a4669d..d72ca976b 100644 --- a/src/aignostics/wsi/_pydicom_handler.py +++ b/src/aignostics/wsi/_pydicom_handler.py @@ -33,16 +33,166 @@ def from_file(cls, path: str | Path) -> "PydicomHandler": """ return cls(Path(path)) - def get_metadata(self, verbose: bool = False) -> dict[str, Any]: - files = self._scan_files(verbose) + def get_metadata(self, verbose: bool = False, wsi_only: bool = False) -> dict[str, Any]: + """Get DICOM metadata from files in the configured path. + + Args: + verbose: If True, include detailed annotation group data (coordinates, counts) + for annotation files. Defaults to False. 
+ wsi_only: If True, filter to only WSI DICOM files (highest resolution per + pyramid), excluding thumbnails, labels, segmentations, and redundant + pyramid levels. Defaults to False. + + Returns: + Dictionary containing hierarchical DICOM metadata + """ + files = self._scan_files(verbose, wsi_only) return self._organize_by_hierarchy(files) - def _scan_files(self, verbose: bool = False) -> list[dict[str, Any]]: # noqa: C901, PLR0912, PLR0914, PLR0915 + def select_wsi_files(self) -> list[Path]: + """Select WSI files only from the path, excluding auxiliary and redundant files. + + For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), + keeps only the highest resolution file. Excludes segmentations, annotations, + thumbnails, labels, and other non-image DICOM files. OpenSlide will automatically + find related pyramid files in the same directory when needed. + + See here for more info on the DICOM data model: + https://dicom.nema.org/medical/dicom/current/output/chtml/part03/chapter_7.html + + Filtering Strategy: + - Only processes VL Whole Slide Microscopy Image Storage + (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html + - Excludes auxiliary images by ImageType Value 3 (keeps only VOLUME images) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.12.4.html#sect_C.8.12.4.1.1 + - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) + - For pyramids with multiple files, selects only the highest resolution based + on TotalPixelMatrixRows x TotalPixelMatrixColumns + - Preserves standalone WSI files without PyramidUID + + Returns: + List of DICOM file paths to process. All other DICOM files are excluded. + + Note: + Files that cannot be read or are missing required attributes are logged and skipped. 
+ """ + pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) + included_dicom_files: list[Path] = [] + + # Scan all DICOM files and filter based on SOPClassUID and ImageType + for dcm_file in self.path.glob("**/*.dcm"): + try: + ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) + + # Exclude non-WSI image files by SOPClassUID + # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) + if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": + logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") + continue + + # Exclude auxiliary images by ImageType Value 3 + # Per DICOM PS3.3 C.8.12.4.1.1: Value 3 should be VOLUME, THUMBNAIL, LABEL, or OVERVIEW. + # We only want VOLUME images + if hasattr(ds, "ImageType") and len(ds.ImageType) >= 3: # noqa: PLR2004 + image_type_value_3 = ds.ImageType[2].upper() + + if image_type_value_3 != "VOLUME": + logger.debug(f"Excluding {dcm_file.name} - ImageType Value 3: {image_type_value_3}") + continue + else: + # No ImageType Value 3 - could be standalone WSI or non-standard file + logger.debug(f"DICOM {dcm_file.name} has no ImageType Value 3 - treating as standalone") + + # Try to group by PyramidUID for pyramid resolution selection + if hasattr(ds, "PyramidUID"): + # PyramidUID is Type 1C (conditional) - required if part of a multi-file pyramid + # TotalPixelMatrix attributes are Type 1 for WSI - should always be present + pyramid_uid = ds.PyramidUID + rows = int(ds.TotalPixelMatrixRows) + cols = int(ds.TotalPixelMatrixColumns) + + pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) + else: + # Treat as standalone file + included_dicom_files.append(dcm_file) + + except Exception as e: + logger.debug(f"Could not process DICOM file {dcm_file}: {e}") + continue + + # For each pyramid with multiple files, select only the highest resolution + pyramid_files_to_include = PydicomHandler._find_highest_resolution_files(pyramid_groups) + 
included_dicom_files.extend(pyramid_files_to_include) + + return included_dicom_files + + @staticmethod + def _find_highest_resolution_files( + pyramid_groups: dict[str, list[tuple[Path, int, int]]], + ) -> list[Path]: + """Find the highest resolution file for each multi-file pyramid. + + For each pyramid (identified by PyramidUID), selects the file with the + largest total pixel count (rows x cols). All other files in the pyramid + are excluded, as OpenSlide can find them automatically. + + Args: + pyramid_groups: Dictionary mapping PyramidUID to list of (file_path, rows, cols). + + Returns: + List of file paths to keep (one per pyramid, the highest resolution). + + Note: + Single-file pyramids (only one file per PyramidUID) are included without + comparison. + """ + files_to_keep: list[Path] = [] + + for pyramid_uid, files_with_dims in pyramid_groups.items(): + if len(files_with_dims) > 1: + highest_res_file_with_dims = max(files_with_dims, key=lambda x: x[1] * x[2]) + highest_res_file, rows, cols = highest_res_file_with_dims + files_to_keep.append(highest_res_file) + + logger.debug( + f"DICOM pyramid {pyramid_uid}: keeping {highest_res_file.name} " + f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" + ) + else: + files_to_keep.append(files_with_dims[0][0]) + + return files_to_keep + + def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dict[str, Any]]: # noqa: C901, PLR0912, PLR0914, PLR0915 + """Scan DICOM files and extract metadata. + + Recursively scans the path for DICOM files, reads their metadata (without loading + pixel data), and returns structured information about each file including modality, + dimensions, and type-specific details. + + Args: + verbose: If True, include detailed annotation group data (coordinates, counts). + wsi_only: If True, filter to only WSI files (highest resolution ones only in + multi-file pyramid case). + + Returns: + List of dictionaries containing file metadata. 
Each dict includes:
+            - Basic info: path, study_uid, series_uid, modality, type, size
+            - Modality-specific data (e.g., pyramid info for SM/WSI, annotations for ANN)
+            - Patient metadata where available
+
+        Note:
+            Invalid DICOM files are logged and skipped.
+        """
         dicom_files = []
 
-        # Determine which files to process based on whether the `self.path` points to a single file or a directory
+        # Determine which files to process
         if self.path.is_file():
             files_to_process = [self.path] if self.path.suffix.lower() == ".dcm" else []
+        elif wsi_only:
+            # Filter to WSI image files only (highest resolution per pyramid)
+            files_to_process = self.select_wsi_files()
         else:
             files_to_process = list(self.path.rglob("*.dcm"))
 
diff --git a/src/aignostics/wsi/_service.py b/src/aignostics/wsi/_service.py
index 1e77f37fd..b745fc04f 100644
--- a/src/aignostics/wsi/_service.py
+++ b/src/aignostics/wsi/_service.py
@@ -1,6 +1,7 @@
 """Service of the wsi module."""
 
 import io
+from collections.abc import Iterable
 from pathlib import Path
 from typing import Any
 
@@ -11,6 +12,7 @@
 from aignostics.utils import BaseService, Health
 
 from ._openslide_handler import DEFAULT_MAX_SAFE_DIMENSION
+from ._pydicom_handler import PydicomHandler
 
 TIMEOUT = 60  # 1 minutes
 
@@ -175,3 +177,28 @@ def get_tiff_as_jpg(url: str) -> bytes:
             error_msg = f"Unexpected error converting TIFF to JPEG: {e!s}."
             logger.exception(error_msg)
             raise RuntimeError(error_msg) from e
+
+    @staticmethod
+    def get_wsi_files_to_process(path: Path, extension: str) -> Iterable[Path]:
+        """Get WSI files to process for the specified extension.
+
+        For DICOM files (.dcm), applies filtering to only include WSI files and select
+        only the highest resolution file from multi-file pyramids. For other formats,
+        returns all files matching the extension.
+
+        Args:
+            path: Root directory to search for WSI files.
+            extension: File extension to filter (e.g., ".dcm", ".tiff", ".svs").
+                Must include the leading dot.
+
+        Returns:
+            Iterable of Path objects for files to process.
+ """ + files_to_process: Iterable[Path] + if extension == ".dcm": + # Special handling for DICOM files - filter out auxiliary and redundant files + files_to_process = PydicomHandler.from_file(str(path)).select_wsi_files() + else: + # For non-DICOM formats, process all files with this extension + files_to_process = path.glob(f"**/*{extension}") + return files_to_process diff --git a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index ce2a8ed4c..7ac25dd03 100644 --- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -444,71 +444,6 @@ def test_application_run_update_item_custom_metadata_not_found(mock_get_client: service.application_run_update_item_custom_metadata("run-123", "invalid-item-id", {"key": "value"}) -@pytest.fixture -def dicom_factory() -> Callable[..., pydicom.Dataset]: - """Factory fixture for creating DICOM datasets with custom parameters. - - The nested function returns a factory that lets us create multiple DICOMs - with different parameters in each test (e.g., different pyramid UIDs, - resolutions, and image types). - """ - - def _create_dicom( - pyramid_uid: str | None, - rows: int, - cols: int, - sop_class_uid: str = "1.2.840.10008.5.1.4.1.1.77.1.6", # VL WSI by default - image_type: list[str] | None = None, - ) -> pydicom.Dataset: - """Create a minimal but valid DICOM dataset for WSI. 
- - Args: - pyramid_uid: The pyramid UID (None for standalone images) - rows: Total image rows (TotalPixelMatrixRows) for the full WSI - cols: Total image columns (TotalPixelMatrixColumns) for the full WSI - sop_class_uid: SOP Class UID (defaults to VL WSI) - image_type: Optional ImageType attribute - - Returns: - A valid pydicom Dataset for whole slide imaging - """ - ds = pydicom.Dataset() - - # File Meta Information - ds.file_meta = pydicom.Dataset() - ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian - ds.file_meta.MediaStorageSOPClassUID = sop_class_uid - ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() - - # Required DICOM attributes - ds.SOPInstanceUID = pydicom.uid.generate_uid() - ds.SOPClassUID = sop_class_uid - ds.StudyInstanceUID = pydicom.uid.generate_uid() - ds.SeriesInstanceUID = pydicom.uid.generate_uid() - ds.Modality = "SM" - - # Tile dimensions (typically 256x256 for WSI) - ds.Rows = 256 - ds.Columns = 256 - - # CRITICAL: Total image dimensions for whole slide imaging - # These represent the full image size and are what differentiate pyramid levels - ds.TotalPixelMatrixRows = rows - ds.TotalPixelMatrixColumns = cols - - # Add PyramidUID if provided (optional for standalone images) - if pyramid_uid: - ds.PyramidUID = pyramid_uid - - # Add ImageType if provided - if image_type: - ds.ImageType = image_type - - return ds - - return _create_dicom - - @pytest.mark.unit def test_select_dicom_pyramid_single_file( tmp_path: Path, diff --git a/tests/aignostics/wsi/service_test.py b/tests/aignostics/wsi/service_test.py index 76f9177b5..30390488e 100644 --- a/tests/aignostics/wsi/service_test.py +++ b/tests/aignostics/wsi/service_test.py @@ -4,15 +4,19 @@ import http.server import os import threading +from collections.abc import Callable from io import BytesIO from pathlib import Path +import pydicom import pytest from fastapi.testclient import TestClient from nicegui import app from nicegui.testing import User from PIL 
import Image +from aignostics.wsi import Service as WSIService + CONTENT_LENGTH_FALLBACK = 32066 # Fallback image size in bytes @@ -285,3 +289,197 @@ def test_serve_tiff_to_jpeg_fails_on_tiff_url_broken(user: User, record_property assert response.status_code == 200 assert int(response.headers["Content-Length"]) == CONTENT_LENGTH_FALLBACK + + +@pytest.fixture +def dicom_factory() -> Callable[..., pydicom.Dataset]: + """Factory fixture for creating DICOM datasets with custom parameters. + + The nested function returns a factory that lets us create multiple DICOMs + with different parameters in each test (e.g., different pyramid UIDs, + resolutions, and image types). + """ + + def _create_dicom( + pyramid_uid: str | None, + rows: int, + cols: int, + sop_class_uid: str = "1.2.840.10008.5.1.4.1.1.77.1.6", # VL WSI by default + image_type: list[str] | None = None, + ) -> pydicom.Dataset: + """Create a minimal but valid DICOM dataset for WSI. + + Args: + pyramid_uid: The pyramid UID (None for standalone images) + rows: Total image rows (TotalPixelMatrixRows) for the full WSI + cols: Total image columns (TotalPixelMatrixColumns) for the full WSI + sop_class_uid: SOP Class UID (defaults to VL WSI) + image_type: Optional ImageType attribute + + Returns: + A valid pydicom Dataset for whole slide imaging + """ + ds = pydicom.Dataset() + + # File Meta Information + ds.file_meta = pydicom.Dataset() + ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian + ds.file_meta.MediaStorageSOPClassUID = sop_class_uid + ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() + + # Required DICOM attributes + ds.SOPInstanceUID = pydicom.uid.generate_uid() + ds.SOPClassUID = sop_class_uid + ds.StudyInstanceUID = pydicom.uid.generate_uid() + ds.SeriesInstanceUID = pydicom.uid.generate_uid() + ds.Modality = "SM" + + # Tile dimensions (typically 256x256 for WSI) + ds.Rows = 256 + ds.Columns = 256 + + # CRITICAL: Total image dimensions for whole slide imaging + # These 
represent the full image size and are what differentiate pyramid levels + ds.TotalPixelMatrixRows = rows + ds.TotalPixelMatrixColumns = cols + + # Add PyramidUID if provided (optional for standalone images) + if pyramid_uid: + ds.PyramidUID = pyramid_uid + + # Add ImageType if provided + if image_type: + ds.ImageType = image_type + + return ds + + return _create_dicom + + +@pytest.mark.unit +def test_get_wsi_files_to_process_dicom_multi_file_pyramid( + tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], +) -> None: + """Test service filters multi-file DICOM pyramid to highest resolution only.""" + pyramid_uid = "1.2.3.4.5" + + # Create low resolution (should be excluded) + ds_low = dicom_factory(pyramid_uid, 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file_low = tmp_path / "test_low.dcm" + ds_low.save_as(dcm_file_low, write_like_original=False) + + # Create high resolution (should be kept) + ds_high = dicom_factory(pyramid_uid, 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file_high = tmp_path / "test_high.dcm" + ds_high.save_as(dcm_file_high, write_like_original=False) + + # Get filtered files + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".dcm")) + + # Should include only highest resolution + assert len(files) == 1 + assert dcm_file_high in files + assert dcm_file_low not in files + + +@pytest.mark.unit +def test_get_wsi_files_to_process_dicom_excludes_thumbnails( + tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], +) -> None: + """Test service excludes DICOM thumbnail images.""" + from aignostics.wsi import Service as WSIService + + # Create thumbnail (should be excluded) + ds_thumb = dicom_factory( + "1.2.3.4.5", + 256, + 256, + image_type=["DERIVED", "PRIMARY", "THUMBNAIL"], + ) + dcm_file_thumb = tmp_path / "thumbnail.dcm" + ds_thumb.save_as(dcm_file_thumb, write_like_original=False) + + # Create volume image (should be kept) + ds_volume = dicom_factory( + "1.2.3.4.5", + 1024, + 
1024, + image_type=["DERIVED", "PRIMARY", "VOLUME"], + ) + dcm_file_volume = tmp_path / "volume.dcm" + ds_volume.save_as(dcm_file_volume, write_like_original=False) + + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".dcm")) + + assert len(files) == 1 + assert dcm_file_volume in files + assert dcm_file_thumb not in files + + +@pytest.mark.unit +def test_get_wsi_files_to_process_dicom_mixed_scenario( + tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], +) -> None: + """Test service handles complex scenario with multiple pyramids and file types.""" + from aignostics.wsi import Service as WSIService + + # Pyramid 1: 2 levels (keep highest only) + ds1_low = dicom_factory("pyramid1", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file1_low = tmp_path / "p1_low.dcm" + ds1_low.save_as(dcm_file1_low, write_like_original=False) + + ds1_high = dicom_factory("pyramid1", 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file1_high = tmp_path / "p1_high.dcm" + ds1_high.save_as(dcm_file1_high, write_like_original=False) + + # Thumbnail file (exclude) + ds_thumb = dicom_factory("pyramid1", 256, 256, image_type=["DERIVED", "PRIMARY", "THUMBNAIL"]) + dcm_file_thumb = tmp_path / "p1_thumb.dcm" + ds_thumb.save_as(dcm_file_thumb, write_like_original=False) + + # Segmentation file (exclude by SOPClassUID) + ds_seg = dicom_factory("pyramid1", 2048, 2048, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4") + dcm_file_seg = tmp_path / "seg.dcm" + ds_seg.save_as(dcm_file_seg, write_like_original=False) + + # Standalone WSI (keep) + ds_standalone = dicom_factory(None, 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file_standalone = tmp_path / "standalone.dcm" + ds_standalone.save_as(dcm_file_standalone, write_like_original=False) + + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".dcm")) + + # Should include: high-res from pyramid1, standalone + assert len(files) == 2 + assert dcm_file1_high in files + assert 
dcm_file_standalone in files + + # Should exclude: low-res, thumbnail, segmentation + assert dcm_file1_low not in files + assert dcm_file_thumb not in files + assert dcm_file_seg not in files + + +@pytest.mark.unit +def test_get_wsi_files_to_process_non_dicom_passthrough(tmp_path: Path) -> None: + """Test service passes through non-DICOM files without filtering.""" + from aignostics.wsi import Service as WSIService + + # Create some TIFF files + tiff1 = tmp_path / "image1.tiff" + tiff2 = tmp_path / "image2.tiff" + tiff1.touch() + tiff2.touch() + + # Non-DICOM files should all be returned + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".tiff")) + + assert len(files) == 2 + assert tiff1 in files + assert tiff2 in files + assert tiff2 in files + assert tiff2 in files + assert tiff2 in files From fa54a37caeabf0b5d06abc030498608cbd8e67af Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Fri, 5 Dec 2025 17:20:44 +0100 Subject: [PATCH 14/18] task(docs): Update docs after move to WSI --- src/aignostics/application/CLAUDE.md | 74 ----------------- src/aignostics/application/_service.py | 42 ++++++---- src/aignostics/wsi/CLAUDE.md | 107 +++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 88 deletions(-) diff --git a/src/aignostics/application/CLAUDE.md b/src/aignostics/application/CLAUDE.md index b3f701695..0632f04ae 100644 --- a/src/aignostics/application/CLAUDE.md +++ b/src/aignostics/application/CLAUDE.md @@ -163,80 +163,6 @@ APPLICATION_RUN_UPLOAD_CHUNK_SIZE = 1024 * 1024 # 1MB APPLICATION_RUN_DOWNLOAD_SLEEP_SECONDS = 5 # Wait between status checks ``` -### DICOM Pyramid Handling - -**Multi-File DICOM Pyramid Filtering (`_service.py`):** - -The service automatically handles multi-file DICOM pyramids (whole slide images stored across multiple DICOM instances) by selecting only the highest resolution file from each pyramid. This prevents redundant processing since OpenSlide can automatically find related pyramid files in the same directory. 
-```python -@staticmethod -def _filter_dicom_pyramid_files(source_directory: Path) -> set[Path]: - """Filter DICOM files to keep only one representative per pyramid. - - For multi-file DICOM pyramids (WSI images split across multiple instances), - keeps only the highest resolution file. Excludes segmentations, annotations, - thumbnails, and other non-image DICOM files. - - Filtering Strategy: - 1. SOPClassUID filtering - Only process VL Whole Slide Microscopy Image Storage - - Include: 1.2.840.10008.5.1.4.1.1.77.1.6 (VL WSI) - - Exclude: 1.2.840.10008.5.1.4.1.1.66.4 (Segmentation Storage) - - Exclude: Other non-WSI DICOM types - - 2. ImageType filtering - Exclude auxiliary images - - THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, LOCALIZER - - 3. PyramidUID grouping - Group multi-file pyramids - - Files with same PyramidUID are part of one logical WSI - - Files without PyramidUID are treated as standalone WSIs - - 4. Resolution selection - Keep highest resolution per pyramid - - Based on TotalPixelMatrixRows × TotalPixelMatrixColumns - - Excludes all lower resolution levels - - Used automatically in: generate_metadata_from_source_directory() - """ -``` - -**Key Behaviors:** - -- **SOPClassUID validation**: Only processes VL Whole Slide Microscopy Image Storage files (1.2.840.10008.5.1.4.1.1.77.1.6) -- **Non-WSI exclusion**: Automatically excludes segmentations (1.2.840.10008.5.1.4.1.1.66.4), annotations, and other DICOM object types -- **ImageType filtering**: Excludes THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, and LOCALIZER image types -- **PyramidUID grouping**: Groups files by PyramidUID (DICOM tag identifying multi-resolution pyramids) -- **Resolution selection**: For each pyramid, keeps only the file with largest TotalPixelMatrixRows × TotalPixelMatrixColumns -- **Standalone handling**: Files without PyramidUID are treated as standalone WSI images and preserved -- **Graceful degradation**: Files with missing attributes are logged and treated as standalone 
(not excluded) -- **Debug logging**: Excluded files are logged at DEBUG level with pyramid/exclusion details -- **Pre-processing filter**: Filtering occurs before metadata generation, checksum calculation, or upload -- **Format-specific**: Only affects DICOM files (`.dcm` extension); other formats (SVS, TIFF) unaffected - -**DICOM WSI Structure:** - -In the DICOM Whole Slide Imaging standard: -- **PyramidUID**: Uniquely identifies a single multi-resolution pyramid that may span multiple files -- **SeriesInstanceUID**: Groups related images (may include multiple pyramids, thumbnails, labels) -- **TotalPixelMatrixRows/Columns**: Represents full image dimensions at the highest resolution level - -**Example Scenario:** -``` -Input Directory: -├── pyramid_level_0.dcm (10000×10000 px, PyramidUID: ABC123) ← KEPT -├── pyramid_level_1.dcm (5000×5000 px, PyramidUID: ABC123) ← EXCLUDED -├── pyramid_level_2.dcm (2500×2500 px, PyramidUID: ABC123) ← EXCLUDED -├── thumbnail.dcm (256×256 px, PyramidUID: ABC123, ImageType: THUMBNAIL) ← EXCLUDED -├── segmentation.dcm (10000×10000 px, SOPClassUID: Segmentation) ← EXCLUDED -└── standalone.dcm (8000×8000 px, No PyramidUID) ← KEPT - -Result: Only pyramid_level_0.dcm and standalone.dcm are processed -``` - -**Error Handling:** - -- Files with missing SOPClassUID are logged as warnings and excluded (malformed DICOM) -- Files with PyramidUID but missing TotalPixelMatrix* attributes are treated as standalone -- Files that cannot be read by pydicom are logged at DEBUG level and skipped -- AttributeError and general exceptions are caught to prevent processing pipeline failure ### Progress State Management diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index 4e1d107a5..e30db9f9b 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -15,24 +15,39 @@ from aignostics.bucket import Service as BucketService from aignostics.constants import 
TEST_APP_APPLICATION_ID -from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, - ApiException, Application, ApplicationSummary, - ApplicationVersion, Client, InputArtifact, - InputItem, NotFoundException, Run, RunData, - RunOutput, RunState) +from aignostics.platform import ( + LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, + ApiException, + Application, + ApplicationSummary, + ApplicationVersion, + Client, + InputArtifact, + InputItem, + NotFoundException, + Run, + RunData, + RunOutput, + RunState, +) from aignostics.platform import Service as PlatformService from aignostics.utils import BaseService, Health, sanitize_path_component from aignostics.wsi import Service as WSIService -from ._download import (download_available_items, - download_url_to_file_with_progress, - extract_filename_from_url, update_progress) +from ._download import ( + download_available_items, + download_url_to_file_with_progress, + extract_filename_from_url, + update_progress, +) from ._models import DownloadProgress, DownloadProgressState from ._settings import Settings -from ._utils import (get_mime_type_for_artifact, - get_supported_extensions_for_application, - is_not_terminated_with_deadline_exceeded, - validate_due_date) +from ._utils import ( + get_mime_type_for_artifact, + get_supported_extensions_for_application, + is_not_terminated_with_deadline_exceeded, + validate_due_date, +) has_qupath_extra = find_spec("ijson") if has_qupath_extra: @@ -1039,8 +1054,7 @@ def application_run_submit( # noqa: PLR0913, PLR0917, PLR0912, C901, PLR0915 # Validate pipeline configuration if present if "pipeline" in sdk_metadata: - from aignostics.platform._sdk_metadata import \ - PipelineConfig # noqa: PLC0415 + from aignostics.platform._sdk_metadata import PipelineConfig # noqa: PLC0415 try: PipelineConfig.model_validate(sdk_metadata["pipeline"]) diff --git a/src/aignostics/wsi/CLAUDE.md b/src/aignostics/wsi/CLAUDE.md index 28a7b9f77..abc2d759d 100644 --- a/src/aignostics/wsi/CLAUDE.md +++ 
b/src/aignostics/wsi/CLAUDE.md @@ -210,6 +210,113 @@ def get_tile( return tile ``` + +### DICOM WSI File Filtering + +**Multi-File DICOM Pyramid Selection (`PydicomHandler.select_wsi_files()`):** + +The WSI module automatically handles multi-file DICOM pyramids (whole slide images stored across multiple DICOM instances) by selecting only the highest resolution file from each pyramid. This prevents redundant processing since OpenSlide can automatically find related pyramid files in the same directory. + +**Service Integration (`Service.get_wsi_files_to_process()`):** +```python +from aignostics.wsi import Service +from pathlib import Path + +# Get filtered DICOM files +files = Service.get_wsi_files_to_process( + path=Path("/data/dicoms"), + extension=".dcm" +) +# Returns only highest resolution WSI files + +# For non-DICOM formats, returns all files +tiff_files = Service.get_wsi_files_to_process( + path=Path("/data/slides"), + extension=".tiff" +) +# Returns all .tiff files (no filtering) +``` + +**Filtering Strategy:** +```python +def select_wsi_files(self) -> list[Path]: + """Select WSI files only, excluding auxiliary and redundant files. + + Filtering Strategy: + 1. SOPClassUID filtering - Only process VL Whole Slide Microscopy Image Storage + - Include: 1.2.840.10008.5.1.4.1.1.77.1.6 (VL WSI) + - Exclude: 1.2.840.10008.5.1.4.1.1.66.4 (Segmentation Storage) + - Exclude: Other non-WSI DICOM types + + 2. ImageType filtering - Exclude auxiliary images + - Keep: VOLUME images only + - Exclude: THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, LOCALIZER + + 3. PyramidUID grouping - Group multi-file pyramids + - Files with same PyramidUID are part of one logical WSI + - Files without PyramidUID are treated as standalone WSIs + + 4. 
Resolution selection - Keep highest resolution per pyramid + - Based on TotalPixelMatrixRows × TotalPixelMatrixColumns + - Excludes all lower resolution levels + + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/chapter_7.html + """ +``` + +**Key Behaviors:** + +- **SOPClassUID validation**: Only processes VL Whole Slide Microscopy Image Storage files (1.2.840.10008.5.1.4.1.1.77.1.6) +- **Non-WSI exclusion**: Automatically excludes segmentations (1.2.840.10008.5.1.4.1.1.66.4), annotations, and other DICOM object types +- **ImageType filtering**: Excludes THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, and LOCALIZER image types +- **PyramidUID grouping**: Groups files by PyramidUID (DICOM tag identifying multi-resolution pyramids) +- **Resolution selection**: For each pyramid, keeps only the file with largest TotalPixelMatrixRows × TotalPixelMatrixColumns +- **Standalone handling**: Files without PyramidUID are treated as standalone WSI images and preserved +- **Graceful degradation**: Files with missing attributes are logged and treated as standalone (not excluded) +- **Debug logging**: Excluded files are logged at DEBUG level with pyramid/exclusion details + +**DICOM WSI Structure:** + +In the DICOM Whole Slide Imaging standard: +- **PyramidUID**: Uniquely identifies a single multi-resolution pyramid that may span multiple files +- **SeriesInstanceUID**: Groups related images (may include multiple pyramids, thumbnails, labels) +- **TotalPixelMatrixRows/Columns**: Represents full image dimensions at the highest resolution level + +**Example Scenario:** +``` +Input Directory: +├── pyramid_level_0.dcm (10000×10000 px, PyramidUID: ABC123) ← KEPT +├── pyramid_level_1.dcm (5000×5000 px, PyramidUID: ABC123) ← EXCLUDED +├── pyramid_level_2.dcm (2500×2500 px, PyramidUID: ABC123) ← EXCLUDED +├── thumbnail.dcm (256×256 px, PyramidUID: ABC123, ImageType: THUMBNAIL) ← EXCLUDED +├── segmentation.dcm (10000×10000 px, SOPClassUID: Segmentation) ← 
EXCLUDED +└── standalone.dcm (8000×8000 px, No PyramidUID) ← KEPT + +Result: Only pyramid_level_0.dcm and standalone.dcm are processed +``` + +**Error Handling:** + +- Files with missing SOPClassUID are logged as warnings and excluded (malformed DICOM) +- Files with PyramidUID but missing TotalPixelMatrix* attributes are treated as standalone +- Files that cannot be read by pydicom are logged at DEBUG level and skipped +- AttributeError and general exceptions are caught to prevent processing pipeline failure + +**Integration with Application Module:** + +The Application module uses this filtering automatically when generating metadata: +```python +# In Application Service +from aignostics.wsi import Service as WSIService + +# Filtering happens automatically for DICOM files +files = WSIService.get_wsi_files_to_process(source_directory, ".dcm") +for file_path in files: + # Only highest resolution WSI files are processed + metadata = WSIService.get_metadata(file_path) +``` + + ## Usage Patterns ### Basic WSI Operations From 5adaeee319f1a75817265762726d998114430c67 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Fri, 5 Dec 2025 17:32:32 +0100 Subject: [PATCH 15/18] task(testing): Clean up redundant tests --- src/aignostics/application/CLAUDE.md | 2 - tests/aignostics/application/service_test.py | 213 ------------------- tests/aignostics/wsi/service_test.py | 3 - 3 files changed, 218 deletions(-) diff --git a/src/aignostics/application/CLAUDE.md b/src/aignostics/application/CLAUDE.md index 0632f04ae..3f260066e 100644 --- a/src/aignostics/application/CLAUDE.md +++ b/src/aignostics/application/CLAUDE.md @@ -163,8 +163,6 @@ APPLICATION_RUN_UPLOAD_CHUNK_SIZE = 1024 * 1024 # 1MB APPLICATION_RUN_DOWNLOAD_SLEEP_SECONDS = 5 # Wait between status checks ``` - - ### Progress State Management **Actual DownloadProgress Model (`_models.py`):** diff --git a/tests/aignostics/application/service_test.py b/tests/aignostics/application/service_test.py index 7ac25dd03..17653e9de 100644 
--- a/tests/aignostics/application/service_test.py +++ b/tests/aignostics/application/service_test.py @@ -1,11 +1,8 @@ """Tests to verify the service functionality of the application module.""" -from collections.abc import Callable from datetime import UTC, datetime, timedelta -from pathlib import Path from unittest.mock import MagicMock, patch -import pydicom import pytest from typer.testing import CliRunner @@ -442,213 +439,3 @@ def test_application_run_update_item_custom_metadata_not_found(mock_get_client: with pytest.raises(NotFoundException, match="not found"): service.application_run_update_item_custom_metadata("run-123", "invalid-item-id", {"key": "value"}) - - -@pytest.mark.unit -def test_select_dicom_pyramid_single_file( - tmp_path: Path, - dicom_factory: Callable[..., pydicom.Dataset], -) -> None: - """Test that single DICOM files with PyramidUID are included.""" - ds = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file = tmp_path / "test.dcm" - ds.save_as(dcm_file, write_like_original=False) - - included = ApplicationService._select_dicom_files_to_process(tmp_path) - assert len(included) == 1 - assert dcm_file in included - - -@pytest.mark.unit -def test_select_dicom_pyramid_standalone_no_pyramid_uid( - tmp_path: Path, - dicom_factory: Callable[..., pydicom.Dataset], -) -> None: - """Test that standalone DICOM files without PyramidUID are included.""" - ds = dicom_factory(None, 1024, 1024) # No PyramidUID, no ImageType Value 3 - dcm_file = tmp_path / "test.dcm" - ds.save_as(dcm_file, write_like_original=False) - - included = ApplicationService._select_dicom_files_to_process(tmp_path) - assert len(included) == 1 - assert dcm_file in included - - -@pytest.mark.unit -def test_select_dicom_pyramid_multi_file(tmp_path: Path, dicom_factory: Callable[..., pydicom.Dataset]) -> None: - """Test that for multi-file DICOM pyramid, only the highest resolution file is kept.""" - pyramid_uid = "1.2.3.4.5" - - # Create low 
resolution DICOM file (smallest pyramid level) - ds_low = dicom_factory(pyramid_uid, 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file_low = tmp_path / "test_low.dcm" - ds_low.save_as(dcm_file_low, write_like_original=False) - - # Create medium resolution DICOM file - ds_med = dicom_factory(pyramid_uid, 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file_med = tmp_path / "test_med.dcm" - ds_med.save_as(dcm_file_med, write_like_original=False) - - # Create high resolution DICOM file (base layer - highest resolution) - ds_high = dicom_factory(pyramid_uid, 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file_high = tmp_path / "test_high.dcm" - ds_high.save_as(dcm_file_high, write_like_original=False) - - # Select files to process - included = ApplicationService._select_dicom_files_to_process(tmp_path) - - # Should include only 1 file (the highest resolution) - assert len(included) == 1 - assert dcm_file_high in included - assert dcm_file_low not in included - assert dcm_file_med not in included - - -@pytest.mark.unit -def test_select_dicom_pyramid_multiple_pyramids(tmp_path: Path, dicom_factory: Callable[..., pydicom.Dataset]) -> None: - """Test that files from different pyramids are not filtered against each other.""" - # Pyramid 1 - two files (pyramid with 2 levels) - ds1_low = dicom_factory("1.2.3.4.5", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file1_low = tmp_path / "pyramid1_low.dcm" - ds1_low.save_as(dcm_file1_low, write_like_original=False) - - ds1_high = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file1_high = tmp_path / "pyramid1_high.dcm" - ds1_high.save_as(dcm_file1_high, write_like_original=False) - - # Pyramid 2 - single file (standalone, single level) - ds2 = dicom_factory("6.7.8.9.0", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file2 = tmp_path / "pyramid2.dcm" - ds2.save_as(dcm_file2, write_like_original=False) - - included = 
ApplicationService._select_dicom_files_to_process(tmp_path) - - # Should include 2 files: highest from pyramid 1 and single from pyramid 2 - assert len(included) == 2 - assert dcm_file1_high in included - assert dcm_file2 in included - assert dcm_file1_low not in included - - -@pytest.mark.unit -def test_select_dicom_exclude_non_wsi( - tmp_path: Path, - dicom_factory: Callable[..., pydicom.Dataset], -) -> None: - """Test that non-WSI DICOM files (e.g., segmentations) are excluded.""" - # Create a segmentation storage DICOM - ds_seg = dicom_factory( - "1.2.3.4.5", - 1024, - 1024, - sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4", # Segmentation Storage - image_type=["DERIVED", "PRIMARY", "VOLUME"], - ) - dcm_file_seg = tmp_path / "segmentation.dcm" - ds_seg.save_as(dcm_file_seg, write_like_original=False) - - # Create a valid WSI - ds_wsi = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file_wsi = tmp_path / "wsi.dcm" - ds_wsi.save_as(dcm_file_wsi, write_like_original=False) - - included = ApplicationService._select_dicom_files_to_process(tmp_path) - - # Should include only the WSI file - assert len(included) == 1 - assert dcm_file_wsi in included - assert dcm_file_seg not in included - - -@pytest.mark.unit -def test_select_dicom_exclude_non_volumes( - tmp_path: Path, - dicom_factory: Callable[..., pydicom.Dataset], -) -> None: - """Test that non-volume images are excluded.""" - # Create a thumbnail - ds_thumb = dicom_factory( - "1.2.3.4.5", - 256, - 256, - image_type=["DERIVED", "PRIMARY", "THUMBNAIL", "RESAMPLED"], - ) - dcm_file_thumb = tmp_path / "thumbnail.dcm" - ds_thumb.save_as(dcm_file_thumb, write_like_original=False) - - # Create a regular WSI image - ds_wsi = dicom_factory( - "1.2.3.4.5", - 1024, - 1024, - image_type=["DERIVED", "PRIMARY", "VOLUME", "NONE"], - ) - dcm_file_wsi = tmp_path / "wsi.dcm" - ds_wsi.save_as(dcm_file_wsi, write_like_original=False) - - included = 
ApplicationService._select_dicom_files_to_process(tmp_path) - - # Should include only the VOLUME image - assert len(included) == 1 - assert dcm_file_wsi in included - assert dcm_file_thumb not in included - - -@pytest.mark.unit -def test_select_dicom_no_image_type_value_3( - tmp_path: Path, - dicom_factory: Callable[..., pydicom.Dataset], -) -> None: - """Test that files without ImageType Value 3 are included as standalone.""" - # Create file with only 2 ImageType values - ds = dicom_factory("1.2.3.4.5", 1024, 1024, image_type=["DERIVED", "PRIMARY"]) - dcm_file = tmp_path / "no_value3.dcm" - ds.save_as(dcm_file, write_like_original=False) - - included = ApplicationService._select_dicom_files_to_process(tmp_path) - - # Should include as standalone (no Value 3 to check) - assert len(included) == 1 - assert dcm_file in included - - -@pytest.mark.unit -def test_select_dicom_mixed_scenario( - tmp_path: Path, - dicom_factory: Callable[..., pydicom.Dataset], -) -> None: - """Test complex scenario with multiple pyramids, thumbnails, and segmentations.""" - # Pyramid 1: 2 levels (keep highest) - ds1_low = dicom_factory("pyramid1", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file1_low = tmp_path / "p1_low.dcm" - ds1_low.save_as(dcm_file1_low, write_like_original=False) - - ds1_high = dicom_factory("pyramid1", 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file1_high = tmp_path / "p1_high.dcm" - ds1_high.save_as(dcm_file1_high, write_like_original=False) - - # Thumbnail for pyramid 1 (exclude) - ds1_thumb = dicom_factory("pyramid1", 256, 256, image_type=["DERIVED", "PRIMARY", "THUMBNAIL"]) - dcm_file1_thumb = tmp_path / "p1_thumb.dcm" - ds1_thumb.save_as(dcm_file1_thumb, write_like_original=False) - - # Segmentation file (exclude) - ds_seg = dicom_factory("pyramid1", 2048, 2048, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4") - dcm_file_seg = tmp_path / "seg.dcm" - ds_seg.save_as(dcm_file_seg, write_like_original=False) - - # Standalone WSI 
without PyramidUID (keep) - ds_standalone = dicom_factory(None, 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) - dcm_file_standalone = tmp_path / "standalone.dcm" - ds_standalone.save_as(dcm_file_standalone, write_like_original=False) - - included = ApplicationService._select_dicom_files_to_process(tmp_path) - - # Should include: high-res from pyramid1, standalone - assert len(included) == 2 - assert dcm_file1_high in included - assert dcm_file_standalone in included - # Should exclude: low-res, thumbnail, segmentation - assert dcm_file1_low not in included - assert dcm_file1_thumb not in included - assert dcm_file_seg not in included diff --git a/tests/aignostics/wsi/service_test.py b/tests/aignostics/wsi/service_test.py index 30390488e..64074ae4f 100644 --- a/tests/aignostics/wsi/service_test.py +++ b/tests/aignostics/wsi/service_test.py @@ -480,6 +480,3 @@ def test_get_wsi_files_to_process_non_dicom_passthrough(tmp_path: Path) -> None: assert len(files) == 2 assert tiff1 in files assert tiff2 in files - assert tiff2 in files - assert tiff2 in files - assert tiff2 in files From 97c5bfe0448adbcb5908a91a98ad3a93d7882549 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Mon, 8 Dec 2025 18:32:41 +0100 Subject: [PATCH 16/18] chore(wsi): Refactor scan_files to pass SonarCloud complexity check --- src/aignostics/wsi/CLAUDE.md | 11 +- src/aignostics/wsi/_pydicom_handler.py | 335 +++++++++++++++---------- 2 files changed, 203 insertions(+), 143 deletions(-) diff --git a/src/aignostics/wsi/CLAUDE.md b/src/aignostics/wsi/CLAUDE.md index abc2d759d..d203bf8dc 100644 --- a/src/aignostics/wsi/CLAUDE.md +++ b/src/aignostics/wsi/CLAUDE.md @@ -19,8 +19,8 @@ The WSI module provides comprehensive support for medical imaging files, particu **CLI Commands (`_cli.py`):** - `wsi inspect` - Display WSI file metadata and properties -- `wsi dicom-inspect` - Inspect DICOM-specific metadata -- `wsi dicom-geojson-import` - Import GeoJSON annotations to DICOM +- `wsi dicom inspect` 
- Inspect DICOM-specific metadata +- `wsi dicom geojson_import` - Import GeoJSON annotations to DICOM **GUI Component (`_gui.py`):** @@ -371,13 +371,10 @@ for wsi in wsi_files: aignostics wsi inspect slide.svs # Inspect DICOM metadata -aignostics wsi dicom-inspect scan.dcm +aignostics wsi dicom inspect scan.dcm # Import GeoJSON annotations -aignostics wsi dicom-geojson-import \ - --dicom-file scan.dcm \ - --geojson-file annotations.json \ - --output annotated.dcm +aignostics wsi dicom geojson_import scan.dcm annotations.json ``` ## Dependencies & Integration diff --git a/src/aignostics/wsi/_pydicom_handler.py b/src/aignostics/wsi/_pydicom_handler.py index d72ca976b..e2552c195 100644 --- a/src/aignostics/wsi/_pydicom_handler.py +++ b/src/aignostics/wsi/_pydicom_handler.py @@ -164,7 +164,7 @@ def _find_highest_resolution_files( return files_to_keep - def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dict[str, Any]]: # noqa: C901, PLR0912, PLR0914, PLR0915 + def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dict[str, Any]]: """Scan DICOM files and extract metadata. Recursively scans the path for DICOM files, reads their metadata (without loading @@ -185,149 +185,19 @@ def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dic Note: Invalid DICOM files are logged and skipped. 
""" + files_to_process = self._get_files_to_process(wsi_only) dicom_files = [] - # Determine which files to process - if self.path.is_file(): - files_to_process = [self.path] if self.path.suffix.lower() == ".dcm" else [] - elif wsi_only: - # Use your filtering logic - files_to_process = self._select_wsi_files(self.path) - else: - files_to_process = list(self.path.rglob("*.dcm")) - - for file_path in files_to_process: # noqa: PLR1702 + for file_path in files_to_process: if not file_path.is_file(): continue try: print(file_path) ds = pydicom.dcmread(str(file_path), stop_before_pixels=True) - # TODO(Helmut): Uncomment when DICOM is implemented - # print(ds["Modality"].value) # noqa: ERA001 - # print(getattr(ds, "Modality", "unknown")) # noqa: ERA001 - # sys.exit() # noqa: ERA001 - - # Basic required DICOM fields - file_info: dict[str, Any] = { - "path": str(file_path), - "study_uid": str(getattr(ds, "StudyInstanceUID", "unknown")), - "container_id": str(getattr(ds, "ContainerIdentifier", "unknown")), - "series_uid": str(getattr(ds, "SeriesInstanceUID", "unknown")), - "modality": str(getattr(ds, "Modality", "unknown")), - "type": "unknown", - "frame_of_reference_uid": str(getattr(ds, "FrameOfReferenceUID", "unknown")), - } - - # Try to determine file type using highdicom - try: - # TODO(Helmut): Check below, hd.sr.is_microscopy_bulk_simple_annotation(ds), - # hd.sr.is_microscopy_measurement(ds) for type annotation/measurement - if getattr(ds, "Modality", "") in {"SM", "WSI"}: - file_info["type"] = "image" - except Exception: - logger.exception("Failed to analyze DICOM file with highdicom") - # If highdicom analysis fails, keep 'unknown' type - - # Add size and basic metadata - file_info["size"] = file_path.stat().st_size - file_info["metadata"] = { - "PatientID": str(getattr(ds, "PatientID", "unknown")), - "StudyDate": str(getattr(ds, "StudyDate", "unknown")), - "SeriesDescription": str(getattr(ds, "SeriesDescription", "")), - } - - # Add to file_info dictionary after 
basic metadata - if getattr(ds, "Modality", "") in {"SM", "WSI"}: - file_info.update({ - "modality": getattr(ds, "Modality", ""), - "is_pyramidal": True, - "num_frames": int(getattr(ds, "NumberOfFrames", 1)), - "optical_paths": len(getattr(ds, "OpticalPathSequence", [])), - "focal_planes": len(getattr(ds, "DimensionIndexSequence", [])), - "total_pixel_matrix": ( - int(getattr(ds, "TotalPixelMatrixColumns", 0)), - int(getattr(ds, "TotalPixelMatrixRows", 0)), - ), - }) - elif getattr(ds, "Modality", "") == "ANN": - ann = hd.ann.MicroscopyBulkSimpleAnnotations.from_dataset(ds) - group_infos = [] - groups = ann.get_annotation_groups() - for group in groups: - # Calculate min/max coordinates for all points - col_min = row_min = float("inf") # Initialize to positive infinity - col_max = row_max = float("-inf") # Initialize to negative infinity - graphic_data_len = float("-inf") - first = None - - if verbose: - graphic_data = group.get_graphic_data(ann.annotation_coordinate_type) - graphic_data_len = len(graphic_data) - first = graphic_data[0] - if graphic_data: - if group.graphic_type == hd.ann.GraphicTypeValues.POINT: - # For points, each item is a single coordinate - for point in graphic_data: - col_min = min(col_min, point[0]) - col_max = max(col_max, point[0]) - row_min = min(row_min, point[1]) - row_max = max(row_max, point[1]) - else: - # For polygons/polylines, process all polygons - for polygon in graphic_data: - for point in polygon: - col_min = min(col_min, point[0]) - col_max = max(col_max, point[0]) - row_min = min(row_min, point[1]) - row_max = max(row_max, point[1]) - - group_infos.append({ - "uid": group.uid, - "label": group.label, - "property_type": group.annotated_property_type, - "graphic_type": group.graphic_type, - "count": graphic_data_len, - "first": first, - "col_min": float(col_min), - "row_min": float(row_min), - "col_max": float(col_max), - "row_max": float(row_max), - }) - file_info.update({ - "modality": "ANN", - "coordinate_type": 
ann.annotation_coordinate_type, - "annotation_groups": group_infos, - }) - - # Extract pyramid levels from frame organization - if getattr(ds, "DimensionOrganizationSequence", None): - # Get frame organization - frame_groups = {} - for frame in getattr(ds, "PerFrameFunctionalGroupsSequence", []): - level_idx = frame.DimensionIndexValues[0] - if level_idx not in frame_groups: - frame_groups[level_idx] = { - "count": 0, - "rows": int(frame.get("Rows", 0)), - "columns": int(frame.get("Columns", 0)), - } - frame_groups[level_idx]["count"] += 1 - - # Sort and store pyramid level information - pyramid_info = [ - { - "level": int(level_idx), - "frame_count": frame_groups[level_idx]["count"], - "frame_size": ( - frame_groups[level_idx]["columns"], - frame_groups[level_idx]["rows"], - ), - } - for level_idx in sorted(frame_groups.keys()) - ] - file_info["pyramid_info"] = pyramid_info - + file_info = PydicomHandler._extract_basic_metadata(file_path, ds) + PydicomHandler._add_modality_specific_data(file_info, ds, verbose) + self._add_pyramid_info(file_info, ds) dicom_files.append(file_info) except pydicom.errors.InvalidDicomError: @@ -335,6 +205,199 @@ def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dic return dicom_files + def _get_files_to_process(self, wsi_only: bool) -> list[Path]: + """Determine which DICOM files to process. + + Returns: + List of file paths to process. + """ + if self.path.is_file(): + return [self.path] if self.path.suffix.lower() == ".dcm" else [] + if wsi_only: + return self.select_wsi_files() + return list(self.path.rglob("*.dcm")) + + @staticmethod + def _extract_basic_metadata(file_path: Path, ds: pydicom.Dataset) -> dict[str, Any]: + """Extract basic DICOM metadata fields. + + Returns: + Dictionary containing basic DICOM metadata. 
+ """ + file_info: dict[str, Any] = { + "path": str(file_path), + "study_uid": str(getattr(ds, "StudyInstanceUID", "unknown")), + "container_id": str(getattr(ds, "ContainerIdentifier", "unknown")), + "series_uid": str(getattr(ds, "SeriesInstanceUID", "unknown")), + "modality": str(getattr(ds, "Modality", "unknown")), + "type": "unknown", + "frame_of_reference_uid": str(getattr(ds, "FrameOfReferenceUID", "unknown")), + } + + # Try to determine file type using highdicom + try: + # TODO(Helmut): Check below, hd.sr.is_microscopy_bulk_simple_annotation(ds), + # hd.sr.is_microscopy_measurement(ds) for type annotation/measurement + if getattr(ds, "Modality", "") in {"SM", "WSI"}: + file_info["type"] = "image" + except Exception: + logger.exception("Failed to analyze DICOM file with highdicom") + # If highdicom analysis fails, keep 'unknown' type + + # Add size and basic metadata + file_info["size"] = file_path.stat().st_size + file_info["metadata"] = { + "PatientID": str(getattr(ds, "PatientID", "unknown")), + "StudyDate": str(getattr(ds, "StudyDate", "unknown")), + "SeriesDescription": str(getattr(ds, "SeriesDescription", "")), + } + + return file_info + + @staticmethod + def _add_modality_specific_data(file_info: dict[str, Any], ds: pydicom.Dataset, verbose: bool) -> None: + """Add modality-specific metadata to file_info.""" + modality = getattr(ds, "Modality", "") + + if modality in {"SM", "WSI"}: + PydicomHandler._add_wsi_metadata(file_info, ds) + elif modality == "ANN": + PydicomHandler._add_annotation_metadata(file_info, ds, verbose) + + @staticmethod + def _add_wsi_metadata(file_info: dict[str, Any], ds: pydicom.Dataset) -> None: + """Add WSI-specific metadata.""" + file_info.update({ + "modality": getattr(ds, "Modality", ""), + "is_pyramidal": True, + "num_frames": int(getattr(ds, "NumberOfFrames", 1)), + "optical_paths": len(getattr(ds, "OpticalPathSequence", [])), + "focal_planes": len(getattr(ds, "DimensionIndexSequence", [])), + "total_pixel_matrix": ( + 
int(getattr(ds, "TotalPixelMatrixColumns", 0)), + int(getattr(ds, "TotalPixelMatrixRows", 0)), + ), + }) + + @staticmethod + def _add_annotation_metadata(file_info: dict[str, Any], ds: pydicom.Dataset, verbose: bool) -> None: + """Add annotation-specific metadata.""" + ann = hd.ann.MicroscopyBulkSimpleAnnotations.from_dataset(ds) + groups = ann.get_annotation_groups() + group_infos = [PydicomHandler._process_annotation_group(group, ann, verbose) for group in groups] + + file_info.update({ + "modality": "ANN", + "coordinate_type": ann.annotation_coordinate_type, + "annotation_groups": group_infos, + }) + + @staticmethod + def _process_annotation_group( + group: hd.ann.AnnotationGroup, ann: hd.ann.MicroscopyBulkSimpleAnnotations, verbose: bool + ) -> dict[str, Any]: + """Process a single annotation group. + + Returns: + Dictionary containing annotation group metadata. + """ + col_min = row_min = float("inf") + col_max = row_max = float("-inf") + graphic_data_len = float("-inf") + first = None + + if verbose: + graphic_data = group.get_graphic_data(ann.annotation_coordinate_type) + graphic_data_len = len(graphic_data) + first = graphic_data[0] if graphic_data else None + + if graphic_data: + col_min, row_min, col_max, row_max = PydicomHandler._calculate_annotation_bounds( + graphic_data, group.graphic_type + ) + + return { + "uid": group.uid, + "label": group.label, + "property_type": group.annotated_property_type, + "graphic_type": group.graphic_type, + "count": graphic_data_len, + "first": first, + "col_min": float(col_min), + "row_min": float(row_min), + "col_max": float(col_max), + "row_max": float(row_max), + } + + @staticmethod + def _calculate_annotation_bounds( + graphic_data: list[Any], graphic_type: hd.ann.GraphicTypeValues + ) -> tuple[float, float, float, float]: + """Calculate bounding box for annotation graphic data. + + Returns: + Tuple of (col_min, row_min, col_max, row_max). 
+ """ + col_min = row_min = float("inf") + col_max = row_max = float("-inf") + + if graphic_type == hd.ann.GraphicTypeValues.POINT: + for point in graphic_data: + col_min = min(col_min, point[0]) + col_max = max(col_max, point[0]) + row_min = min(row_min, point[1]) + row_max = max(row_max, point[1]) + else: + # For polygons/polylines, process all polygons + for polygon in graphic_data: + for point in polygon: + col_min = min(col_min, point[0]) + col_max = max(col_max, point[0]) + row_min = min(row_min, point[1]) + row_max = max(row_max, point[1]) + + return col_min, row_min, col_max, row_max + + def _add_pyramid_info(self, file_info: dict[str, Any], ds: pydicom.Dataset) -> None: + """Extract and add pyramid level information if available.""" + if not getattr(ds, "DimensionOrganizationSequence", None): + return + + frame_groups = self._extract_frame_groups(ds) + pyramid_info = [ + { + "level": int(level_idx), + "frame_count": frame_groups[level_idx]["count"], + "frame_size": ( + frame_groups[level_idx]["columns"], + frame_groups[level_idx]["rows"], + ), + } + for level_idx in sorted(frame_groups.keys()) + ] + file_info["pyramid_info"] = pyramid_info + + @staticmethod + def _extract_frame_groups(ds: pydicom.Dataset) -> dict[int, dict[str, int]]: + """Extract frame organization grouped by pyramid level. + + Returns: + Dictionary mapping level index to frame count and dimensions. 
+ """ + frame_groups: dict[int, dict[str, int]] = {} + + for frame in getattr(ds, "PerFrameFunctionalGroupsSequence", []): + level_idx = frame.DimensionIndexValues[0] + if level_idx not in frame_groups: + frame_groups[level_idx] = { + "count": 0, + "rows": int(frame.get("Rows", 0)), + "columns": int(frame.get("Columns", 0)), + } + frame_groups[level_idx]["count"] += 1 + + return frame_groups + @staticmethod def _organize_by_hierarchy(files: list[dict[str, Any]]) -> dict[str, Any]: if not files: From a13ce8b44f4eb44c888be416072ffd0fcf729928 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Wed, 10 Dec 2025 17:20:32 +0100 Subject: [PATCH 17/18] fix(wsi): Remove PydicomHandler dependency from dcm file scanning --- src/aignostics/wsi/_pydicom_handler.py | 119 +------------------------ src/aignostics/wsi/_service.py | 6 +- src/aignostics/wsi/_utils.py | 119 +++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 119 deletions(-) diff --git a/src/aignostics/wsi/_pydicom_handler.py b/src/aignostics/wsi/_pydicom_handler.py index e2552c195..dbdcbfe2f 100644 --- a/src/aignostics/wsi/_pydicom_handler.py +++ b/src/aignostics/wsi/_pydicom_handler.py @@ -16,6 +16,8 @@ from aignostics.utils import console +from ._utils import select_dicom_files + class PydicomHandler: def __init__(self, path: Path) -> None: @@ -49,121 +51,6 @@ def get_metadata(self, verbose: bool = False, wsi_only: bool = False) -> dict[st files = self._scan_files(verbose, wsi_only) return self._organize_by_hierarchy(files) - def select_wsi_files(self) -> list[Path]: - """Select WSI files only from the path, excluding auxiliary and redundant files. - - For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), - keeps only the highest resolution file. Excludes segmentations, annotations, - thumbnails, labels, and other non-image DICOM files. OpenSlide will automatically - find related pyramid files in the same directory when needed. 
- - See here for more info on the DICOM data model: - https://dicom.nema.org/medical/dicom/current/output/chtml/part03/chapter_7.html - - Filtering Strategy: - - Only processes VL Whole Slide Microscopy Image Storage - (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6) - Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html - - Excludes auxiliary images by ImageType Value 3 (keeps only VOLUME images) - Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.12.4.html#sect_C.8.12.4.1.1 - - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) - - For pyramids with multiple files, selects only the highest resolution based - on TotalPixelMatrixRows x TotalPixelMatrixColumns - - Preserves standalone WSI files without PyramidUID - - Returns: - List of DICOM file paths to process. All other DICOM files are excluded. - - Note: - Files that cannot be read or are missing required attributes are logged and skipped. - """ - pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) - included_dicom_files: list[Path] = [] - - # Scan all DICOM files and filter based on SOPClassUID and ImageType - for dcm_file in self.path.glob("**/*.dcm"): - try: - ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) - - # Exclude non-WSI image files by SOPClassUID - # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) - if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": - logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") - continue - - # Exclude auxiliary images by ImageType Value 3 - # Per DICOM PS3.3 C.8.12.4.1.1: Value 3 should be VOLUME, THUMBNAIL, LABEL, or OVERVIEW. 
- # We only want VOLUME images - if hasattr(ds, "ImageType") and len(ds.ImageType) >= 3: # noqa: PLR2004 - image_type_value_3 = ds.ImageType[2].upper() - - if image_type_value_3 != "VOLUME": - logger.debug(f"Excluding {dcm_file.name} - ImageType Value 3: {image_type_value_3}") - continue - else: - # No ImageType Value 3 - could be standalone WSI or non-standard file - logger.debug(f"DICOM {dcm_file.name} has no ImageType Value 3 - treating as standalone") - - # Try to group by PyramidUID for pyramid resolution selection - if hasattr(ds, "PyramidUID"): - # PyramidUID is Type 1C (conditional) - required if part of a multi-file pyramid - # TotalPixelMatrix attributes are Type 1 for WSI - should always be present - pyramid_uid = ds.PyramidUID - rows = int(ds.TotalPixelMatrixRows) - cols = int(ds.TotalPixelMatrixColumns) - - pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) - else: - # Treat as standalone file - included_dicom_files.append(dcm_file) - - except Exception as e: - logger.debug(f"Could not process DICOM file {dcm_file}: {e}") - continue - - # For each pyramid with multiple files, select only the highest resolution - pyramid_files_to_include = PydicomHandler._find_highest_resolution_files(pyramid_groups) - included_dicom_files.extend(pyramid_files_to_include) - - return included_dicom_files - - @staticmethod - def _find_highest_resolution_files( - pyramid_groups: dict[str, list[tuple[Path, int, int]]], - ) -> list[Path]: - """Find the highest resolution file for each multi-file pyramid. - - For each pyramid (identified by PyramidUID), selects the file with the - largest total pixel count (rows x cols). All other files in the pyramid - are excluded, as OpenSlide can find them automatically. - - Args: - pyramid_groups: Dictionary mapping PyramidUID to list of (file_path, rows, cols). - - Returns: - List of file paths to keep (one per pyramid, the highest resolution). 
- - Note: - Single-file pyramids (only one file per PyramidUID) are included without - comparison. - """ - files_to_keep: list[Path] = [] - - for pyramid_uid, files_with_dims in pyramid_groups.items(): - if len(files_with_dims) > 1: - highest_res_file_with_dims = max(files_with_dims, key=lambda x: x[1] * x[2]) - highest_res_file, rows, cols = highest_res_file_with_dims - files_to_keep.append(highest_res_file) - - logger.debug( - f"DICOM pyramid {pyramid_uid}: keeping {highest_res_file.name} " - f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" - ) - else: - files_to_keep.append(files_with_dims[0][0]) - - return files_to_keep - def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dict[str, Any]]: """Scan DICOM files and extract metadata. @@ -214,7 +101,7 @@ def _get_files_to_process(self, wsi_only: bool) -> list[Path]: if self.path.is_file(): return [self.path] if self.path.suffix.lower() == ".dcm" else [] if wsi_only: - return self.select_wsi_files() + return select_dicom_files(self.path) return list(self.path.rglob("*.dcm")) @staticmethod diff --git a/src/aignostics/wsi/_service.py b/src/aignostics/wsi/_service.py index b745fc04f..702ace1fb 100644 --- a/src/aignostics/wsi/_service.py +++ b/src/aignostics/wsi/_service.py @@ -12,7 +12,7 @@ from aignostics.utils import BaseService, Health from ._openslide_handler import DEFAULT_MAX_SAFE_DIMENSION -from ._pydicom_handler import PydicomHandler +from ._utils import select_dicom_files TIMEOUT = 60 # 1 minutes @@ -195,9 +195,9 @@ def get_wsi_files_to_process(path: Path, extension: str) -> Iterable[Path]: Iterable of Path objects for files to process. 
""" files_to_process: Iterable[Path] - if extension == ".dcm": + if extension == ".dcm": # noqa: SIM108 # Special handling for DICOM files - filter out auxiliary and redundant files - files_to_process = PydicomHandler.from_file(str(path)).select_wsi_files() + files_to_process = select_dicom_files(path) else: # For non-DICOM formats, process all files with this extension files_to_process = path.glob(f"**/*{extension}") diff --git a/src/aignostics/wsi/_utils.py b/src/aignostics/wsi/_utils.py index 9cc92e456..6a8437975 100644 --- a/src/aignostics/wsi/_utils.py +++ b/src/aignostics/wsi/_utils.py @@ -1,9 +1,12 @@ """Utils for printing wsi files.""" +from collections import defaultdict from pathlib import Path from typing import Any import humanize +import pydicom +from loguru import logger from aignostics.utils import console @@ -189,3 +192,119 @@ def print_slide_info(slide_data: dict[str, Any], indent: int = 0, verbose: bool for file_info in series_data["files"]: console.print() print_file_info(file_info, indent=indent + 3) + + +def _find_highest_resolution_files( + pyramid_groups: dict[str, list[tuple[Path, int, int]]], +) -> list[Path]: + """Find the highest resolution file for each multi-file pyramid. + + For each pyramid (identified by PyramidUID), selects the file with the + largest total pixel count (rows x cols). All other files in the pyramid + are excluded, as OpenSlide can find them automatically. + + Args: + pyramid_groups: Dictionary mapping PyramidUID to list of (file_path, rows, cols). + + Returns: + List of file paths to keep (one per pyramid, the highest resolution). + + Note: + Single-file pyramids (only one file per PyramidUID) are included without + comparison. 
+ """ + files_to_keep: list[Path] = [] + + for pyramid_uid, files_with_dims in pyramid_groups.items(): + if len(files_with_dims) > 1: + highest_res_file_with_dims = max(files_with_dims, key=lambda x: x[1] * x[2]) + highest_res_file, rows, cols = highest_res_file_with_dims + files_to_keep.append(highest_res_file) + + logger.debug( + f"DICOM pyramid {pyramid_uid}: keeping {highest_res_file.name} " + f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" + ) + else: + files_to_keep.append(files_with_dims[0][0]) + + return files_to_keep + + +def select_dicom_files(path: Path) -> list[Path]: + """Select WSI files only from the path, excluding auxiliary and redundant files. + + For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), + keeps only the highest resolution file. Excludes segmentations, annotations, + thumbnails, labels, and other non-image DICOM files. OpenSlide will automatically + find related pyramid files in the same directory when needed. + + See here for more info on the DICOM data model: + https://dicom.nema.org/medical/dicom/current/output/chtml/part03/chapter_7.html + + Filtering Strategy: + - Only processes VL Whole Slide Microscopy Image Storage + (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html + - Excludes auxiliary images by ImageType Value 3 (keeps only VOLUME images) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.12.4.html#sect_C.8.12.4.1.1 + - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) + - For pyramids with multiple files, selects only the highest resolution based + on TotalPixelMatrixRows x TotalPixelMatrixColumns + - Preserves standalone WSI files without PyramidUID + + Returns: + List of DICOM file paths to process. All other DICOM files are excluded. 
+ + Note: + Files that cannot be read or are missing required attributes are logged and skipped. + """ + pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) + included_dicom_files: list[Path] = [] + + # Scan all DICOM files and filter based on SOPClassUID and ImageType + for dcm_file in path.glob("**/*.dcm"): + try: + ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) + + # Exclude non-WSI image files by SOPClassUID + # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) + if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": + logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") + continue + + # Exclude auxiliary images by ImageType Value 3 + # Per DICOM PS3.3 C.8.12.4.1.1: Value 3 should be VOLUME, THUMBNAIL, LABEL, or OVERVIEW. + # We only want VOLUME images + if hasattr(ds, "ImageType") and len(ds.ImageType) >= 3: # noqa: PLR2004 + image_type_value_3 = ds.ImageType[2].upper() + + if image_type_value_3 != "VOLUME": + logger.debug(f"Excluding {dcm_file.name} - ImageType Value 3: {image_type_value_3}") + continue + else: + # No ImageType Value 3 - could be standalone WSI or non-standard file + logger.debug(f"DICOM {dcm_file.name} has no ImageType Value 3 - treating as standalone") + + # Try to group by PyramidUID for pyramid resolution selection + if hasattr(ds, "PyramidUID"): + # PyramidUID is Type 1C (conditional) - required if part of a multi-file pyramid + # TotalPixelMatrix attributes are Type 1 for WSI - should always be present + pyramid_uid = ds.PyramidUID + rows = int(ds.TotalPixelMatrixRows) + cols = int(ds.TotalPixelMatrixColumns) + + pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) + else: + # Treat as standalone file + included_dicom_files.append(dcm_file) + + except Exception as e: + logger.debug(f"Could not process DICOM file {dcm_file}: {e}") + continue + + # For each pyramid with multiple files, select only the highest resolution + 
pyramid_files_to_include = _find_highest_resolution_files(pyramid_groups) + included_dicom_files.extend(pyramid_files_to_include) + + return included_dicom_files From dedb618cbcb701ddc86554c9e7eb42152e65b8c0 Mon Sep 17 00:00:00 2001 From: Blanca Pablos Date: Wed, 10 Dec 2025 23:58:22 +0100 Subject: [PATCH 18/18] Update doc --- src/aignostics/wsi/CLAUDE.md | 40 +++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/aignostics/wsi/CLAUDE.md b/src/aignostics/wsi/CLAUDE.md index d203bf8dc..5a08cb10b 100644 --- a/src/aignostics/wsi/CLAUDE.md +++ b/src/aignostics/wsi/CLAUDE.md @@ -213,10 +213,14 @@ def get_tile( ### DICOM WSI File Filtering -**Multi-File DICOM Pyramid Selection (`PydicomHandler.select_wsi_files()`):** +**Multi-File DICOM Pyramid Selection (`_utils.select_dicom_files()`):** The WSI module automatically handles multi-file DICOM pyramids (whole slide images stored across multiple DICOM instances) by selecting only the highest resolution file from each pyramid. This prevents redundant processing since OpenSlide can automatically find related pyramid files in the same directory. +**Implementation Location:** + +The DICOM file selection logic is implemented in `_utils.py` as `select_dicom_files()`. This function **only depends on pydicom** (not highdicom), making it compatible with Python 3.14+ where highdicom is not available. 
+ **Service Integration (`Service.get_wsi_files_to_process()`):** ```python from aignostics.wsi import Service @@ -237,9 +241,19 @@ tiff_files = Service.get_wsi_files_to_process( # Returns all .tiff files (no filtering) ``` +**Direct Usage (Advanced):** +```python +from aignostics.wsi._utils import select_dicom_files +from pathlib import Path + +# Directly filter DICOM files (used internally by Service) +dicom_files = select_dicom_files(Path("/data/dicoms")) +# Returns only highest resolution WSI files +``` + **Filtering Strategy:** ```python -def select_wsi_files(self) -> list[Path]: +def select_dicom_files(path: Path) -> list[Path]: """Select WSI files only, excluding auxiliary and redundant files. Filtering Strategy: @@ -316,6 +330,17 @@ for file_path in files: metadata = WSIService.get_metadata(file_path) ``` +**Module Architecture:** + +The DICOM file selection functionality is organized as follows: +- **`_utils.py`**: Contains `select_dicom_files()` and `_find_highest_resolution_files()` helper + - Only depends on `pydicom`, `pathlib`, `collections.defaultdict`, and `loguru` + - Compatible with Python 3.14+ (no highdicom dependency) +- **`_service.py`**: Uses `select_dicom_files()` in `get_wsi_files_to_process()` +- **`_pydicom_handler.py`**: Uses `select_dicom_files()` for metadata extraction with `wsi_only=True` + - This module still requires highdicom for annotation/measurement features + - Only the CLI commands that need highdicom (geojson import, detailed inspection) use PydicomHandler + ## Usage Patterns @@ -389,8 +414,17 @@ aignostics wsi dicom geojson_import scan.dcm annotations.json - `openslide-python` - Core WSI reading functionality - `Pillow` - Image processing and thumbnail generation -- `pydicom` - DICOM file handling +- `pydicom` - DICOM file handling (required for basic DICOM WSI operations) - `numpy` - Array manipulation for pixel data +- `highdicom` - DICOM annotation/measurement features (optional, not available on Python 3.14+) + 
+**Python 3.14+ Compatibility:** + +The core WSI functionality (thumbnail generation, metadata extraction, DICOM file selection) works on Python 3.14+ without highdicom. Only the following CLI commands require highdicom and are unavailable on Python 3.14+: +- `aignostics wsi dicom geojson_import` - Import GeoJSON to DICOM annotations +- Detailed annotation/measurement inspection features + +The DICOM file selection logic (`select_dicom_files()`) works on all Python versions since it only depends on `pydicom`. ### Format Support Matrix