diff --git a/src/aignostics/CLAUDE.md b/src/aignostics/CLAUDE.md index cb7974053..d08c02342 100644 --- a/src/aignostics/CLAUDE.md +++ b/src/aignostics/CLAUDE.md @@ -246,7 +246,7 @@ aignostics application run submit --application-id heta --files slide.svs aignostics dataset download --collection-id TCGA-LUAD --output-dir ./data # Get WSI info -aignostics wsi info slide.svs +aignostics wsi inspect slide.svs ``` ## GUI Launch diff --git a/src/aignostics/application/_service.py b/src/aignostics/application/_service.py index ae065cfb7..61d27191d 100644 --- a/src/aignostics/application/_service.py +++ b/src/aignostics/application/_service.py @@ -14,9 +14,7 @@ from loguru import logger from aignostics.bucket import Service as BucketService -from aignostics.constants import ( - TEST_APP_APPLICATION_ID, -) +from aignostics.constants import TEST_APP_APPLICATION_ID from aignostics.platform import ( LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, ApiException, @@ -32,9 +30,7 @@ RunOutput, RunState, ) -from aignostics.platform import ( - Service as PlatformService, -) +from aignostics.platform import Service as PlatformService from aignostics.utils import BaseService, Health, sanitize_path_component from aignostics.wsi import Service as WSIService @@ -324,39 +320,35 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 """Generate metadata from the source directory. Steps: - 1. Recursively files ending with supported extensions in the source directory - 2. Creates a dict with the following columns + 1. Recursively scans files ending with supported extensions in the source directory + 2. For DICOM files (.dcm), filters out auxiliary and redundant files + 3. Creates a dict for each file with the following fields: - external_id (str): The external_id of the file, by default equivalent to the absolute file name - source (str): The absolute filename - - checksum_base64_crc32c (str): The CRC32C checksum of the file constructed, base64 encoded + - checksum_base64_crc32c (str): The CRC32C checksum of the file, base64 encoded - resolution_mpp (float): The microns per pixel, inspecting the base layer - - height_px: The height of the image in pixels, inspecting the base layer - - width_px: The width of the image in pixels, inspecting the base layer - - Further attributes depending on the application and it's version - 3. Applies the optional mappings to fill in additional metadata fields in the dict. + - height_px (int): The height of the image in pixels, inspecting the base layer + - width_px (int): The width of the image in pixels, inspecting the base layer + - Further attributes depending on the application and its version + 4. Applies the optional mappings to fill in additional metadata fields in the dict Args: - source_directory (Path): The source directory to generate metadata from. - application_id (str): The ID of the application. - application_version (str|None): The version of the application (semver). - If not given latest version is used. - with_gui_metadata (bool): If True, include additional metadata for GUI. - mappings (list[str]): Mappings of the form ':=,=,...'. + source_directory: The source directory to generate metadata from. + application_id: The ID of the application. + application_version: The version of the application (semver). + If not given, latest version is used. + with_gui_metadata: If True, include additional metadata for GUI display. + mappings: Mappings of the form ':=,=,...'. The regular expression is matched against the external_id attribute of the entry. The key/value pairs are applied to the entry if the pattern matches. - with_extra_metadata (bool): If True, include extra metadata from the WSIService. + with_extra_metadata: If True, include extra metadata from the WSIService. Returns: - dict[str, Any]: The generated metadata. + List of metadata dictionaries, one per processable file found. Raises: - Exception: If the metadata cannot be generated. - - Raises: - NotFoundError: If the application version with the given ID is not found. - ValueError: If - the source directory does not exist - or is not a directory. + NotFoundException: If the application version with the given ID is not found. + ValueError: If the source directory does not exist or is not a directory. RuntimeError: If the metadata generation fails unexpectedly. """ logger.trace("Generating metadata from source directory: {}", source_directory) @@ -364,20 +356,24 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 # TODO(Helmut): Use it _ = Service().application_version(application_id, application_version) - metadata = [] + metadata: list[dict[str, Any]] = [] + wsi_service = WSIService() try: extensions = get_supported_extensions_for_application(application_id) for extension in extensions: - for file_path in source_directory.glob(f"**/*{extension}"): + files_to_process = wsi_service.get_wsi_files_to_process(source_directory, extension) + + for file_path in files_to_process: # Generate CRC32C checksum with crc32c and encode as base64 hash_sum = crc32c.CRC32CHash() with file_path.open("rb") as f: while chunk := f.read(1024): hash_sum.update(chunk) checksum = str(base64.b64encode(hash_sum.digest()), "UTF-8") + try: - image_metadata = WSIService().get_metadata(file_path) + image_metadata = wsi_service.get_metadata(file_path) width = image_metadata["dimensions"]["width"] height = image_metadata["dimensions"]["height"] mpp = image_metadata["resolution"]["mpp_x"] diff --git a/src/aignostics/wsi/CLAUDE.md b/src/aignostics/wsi/CLAUDE.md index 28a7b9f77..5a08cb10b 100644 --- a/src/aignostics/wsi/CLAUDE.md +++ b/src/aignostics/wsi/CLAUDE.md @@ -19,8 +19,8 @@ The WSI module provides comprehensive support for medical imaging files, particu **CLI Commands (`_cli.py`):** - `wsi inspect` - Display WSI file metadata and properties -- `wsi dicom-inspect` - Inspect DICOM-specific metadata -- `wsi dicom-geojson-import` - Import GeoJSON annotations to DICOM +- `wsi dicom inspect` - Inspect DICOM-specific metadata +- `wsi dicom geojson_import` - Import GeoJSON annotations to DICOM **GUI Component (`_gui.py`):** @@ -210,6 +210,138 @@ def get_tile( return tile ``` + +### DICOM WSI File Filtering + +**Multi-File DICOM Pyramid Selection (`_utils.select_dicom_files()`):** + +The WSI module automatically handles multi-file DICOM pyramids (whole slide images stored across multiple DICOM instances) by selecting only the highest resolution file from each pyramid. This prevents redundant processing since OpenSlide can automatically find related pyramid files in the same directory. + +**Implementation Location:** + +The DICOM file selection logic is implemented in `_utils.py` as `select_dicom_files()`. This function **only depends on pydicom** (not highdicom), making it compatible with Python 3.14+ where highdicom is not available. + +**Service Integration (`Service.get_wsi_files_to_process()`):** +```python +from aignostics.wsi import Service +from pathlib import Path + +# Get filtered DICOM files +files = Service.get_wsi_files_to_process( + path=Path("/data/dicoms"), + extension=".dcm" +) +# Returns only highest resolution WSI files + +# For non-DICOM formats, returns all files +tiff_files = Service.get_wsi_files_to_process( + path=Path("/data/slides"), + extension=".tiff" +) +# Returns all .tiff files (no filtering) +``` + +**Direct Usage (Advanced):** +```python +from aignostics.wsi._utils import select_dicom_files +from pathlib import Path + +# Directly filter DICOM files (used internally by Service) +dicom_files = select_dicom_files(Path("/data/dicoms")) +# Returns only highest resolution WSI files +``` + +**Filtering Strategy:** +```python +def select_dicom_files(path: Path) -> list[Path]: + """Select WSI files only, excluding auxiliary and redundant files. + + Filtering Strategy: + 1. SOPClassUID filtering - Only process VL Whole Slide Microscopy Image Storage + - Include: 1.2.840.10008.5.1.4.1.1.77.1.6 (VL WSI) + - Exclude: 1.2.840.10008.5.1.4.1.1.66.4 (Segmentation Storage) + - Exclude: Other non-WSI DICOM types + + 2. ImageType filtering - Exclude auxiliary images + - Keep: VOLUME images only + - Exclude: THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, LOCALIZER + + 3. PyramidUID grouping - Group multi-file pyramids + - Files with same PyramidUID are part of one logical WSI + - Files without PyramidUID are treated as standalone WSIs + + 4. Resolution selection - Keep highest resolution per pyramid + - Based on TotalPixelMatrixRows × TotalPixelMatrixColumns + - Excludes all lower resolution levels + + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/chapter_7.html + """ +``` + +**Key Behaviors:** + +- **SOPClassUID validation**: Only processes VL Whole Slide Microscopy Image Storage files (1.2.840.10008.5.1.4.1.1.77.1.6) +- **Non-WSI exclusion**: Automatically excludes segmentations (1.2.840.10008.5.1.4.1.1.66.4), annotations, and other DICOM object types +- **ImageType filtering**: Excludes THUMBNAIL, LABEL, OVERVIEW, MACRO, ANNOTATION, and LOCALIZER image types +- **PyramidUID grouping**: Groups files by PyramidUID (DICOM tag identifying multi-resolution pyramids) +- **Resolution selection**: For each pyramid, keeps only the file with largest TotalPixelMatrixRows × TotalPixelMatrixColumns +- **Standalone handling**: Files without PyramidUID are treated as standalone WSI images and preserved +- **Graceful degradation**: Files with missing attributes are logged and treated as standalone (not excluded) +- **Debug logging**: Excluded files are logged at DEBUG level with pyramid/exclusion details + +**DICOM WSI Structure:** + +In the DICOM Whole Slide Imaging standard: +- **PyramidUID**: Uniquely identifies a single multi-resolution pyramid that may span multiple files +- **SeriesInstanceUID**: Groups related images (may include multiple pyramids, thumbnails, labels) +- **TotalPixelMatrixRows/Columns**: Represents full image dimensions at the highest resolution level + +**Example Scenario:** +``` +Input Directory: +├── pyramid_level_0.dcm (10000×10000 px, PyramidUID: ABC123) ← KEPT +├── pyramid_level_1.dcm (5000×5000 px, PyramidUID: ABC123) ← EXCLUDED +├── pyramid_level_2.dcm (2500×2500 px, PyramidUID: ABC123) ← EXCLUDED +├── thumbnail.dcm (256×256 px, PyramidUID: ABC123, ImageType: THUMBNAIL) ← EXCLUDED +├── segmentation.dcm (10000×10000 px, SOPClassUID: Segmentation) ← EXCLUDED +└── standalone.dcm (8000×8000 px, No PyramidUID) ← KEPT + +Result: Only pyramid_level_0.dcm and standalone.dcm are processed +``` + +**Error Handling:** + +- Files with missing SOPClassUID are logged as warnings and excluded (malformed DICOM) +- Files with PyramidUID but missing TotalPixelMatrix* attributes are treated as standalone +- Files that cannot be read by pydicom are logged at DEBUG level and skipped +- AttributeError and general exceptions are caught to prevent processing pipeline failure + +**Integration with Application Module:** + +The Application module uses this filtering automatically when generating metadata: +```python +# In Application Service +from aignostics.wsi import Service as WSIService + +# Filtering happens automatically for DICOM files +files = WSIService.get_wsi_files_to_process(source_directory, ".dcm") +for file_path in files: + # Only highest resolution WSI files are processed + metadata = WSIService.get_metadata(file_path) +``` + +**Module Architecture:** + +The DICOM file selection functionality is organized as follows: +- **`_utils.py`**: Contains `select_dicom_files()` and `_find_highest_resolution_files()` helper + - Only depends on `pydicom`, `pathlib`, `collections.defaultdict`, and `loguru` + - Compatible with Python 3.14+ (no highdicom dependency) +- **`_service.py`**: Uses `select_dicom_files()` in `get_wsi_files_to_process()` +- **`_pydicom_handler.py`**: Uses `select_dicom_files()` for metadata extraction with `wsi_only=True` + - This module still requires highdicom for annotation/measurement features + - Only the CLI commands that need highdicom (geojson import, detailed inspection) use PydicomHandler + + ## Usage Patterns ### Basic WSI Operations @@ -264,13 +396,10 @@ for wsi in wsi_files: aignostics wsi inspect slide.svs # Inspect DICOM metadata -aignostics wsi dicom-inspect scan.dcm +aignostics wsi dicom inspect scan.dcm # Import GeoJSON annotations -aignostics wsi dicom-geojson-import \ - --dicom-file scan.dcm \ - --geojson-file annotations.json \ - --output annotated.dcm +aignostics wsi dicom geojson_import scan.dcm annotations.json ``` ## Dependencies & Integration @@ -285,8 +414,17 @@ aignostics wsi dicom-geojson-import \ - `openslide-python` - Core WSI reading functionality - `Pillow` - Image processing and thumbnail generation -- `pydicom` - DICOM file handling +- `pydicom` - DICOM file handling (required for basic DICOM WSI operations) - `numpy` - Array manipulation for pixel data +- `highdicom` - DICOM annotation/measurement features (optional, not available on Python 3.14+) + +**Python 3.14+ Compatibility:** + +The core WSI functionality (thumbnail generation, metadata extraction, DICOM file selection) works on Python 3.14+ without highdicom. Only the following CLI commands require highdicom and are unavailable on Python 3.14+: +- `aignostics wsi dicom geojson_import` - Import GeoJSON to DICOM annotations +- Detailed annotation/measurement inspection features + +The DICOM file selection logic (`select_dicom_files()`) works on all Python versions since it only depends on `pydicom`. ### Format Support Matrix diff --git a/src/aignostics/wsi/_cli.py b/src/aignostics/wsi/_cli.py index 71a74d4d7..f70518468 100644 --- a/src/aignostics/wsi/_cli.py +++ b/src/aignostics/wsi/_cli.py @@ -132,6 +132,7 @@ def dicom_inspect( ], verbose: Annotated[bool, typer.Option(help="Verbose output")] = False, summary: Annotated[bool, typer.Option(help="Show only summary information")] = False, + wsi_only: Annotated[bool, typer.Option(help="Filter to WSI files only")] = False, ) -> None: # pylint: disable=W0613 """Inspect DICOM files at any hierarchy level.""" if not _check_highdicom_available(): @@ -142,7 +143,7 @@ def dicom_inspect( try: with PydicomHandler.from_file(str(path)) as handler: - metadata = handler.get_metadata(verbose) + metadata = handler.get_metadata(verbose, wsi_only) if metadata["type"] == "empty": console.print("[bold red]No DICOM files found in the specified path.[/bold red]") diff --git a/src/aignostics/wsi/_pydicom_handler.py b/src/aignostics/wsi/_pydicom_handler.py index f00a4669d..dbdcbfe2f 100644 --- a/src/aignostics/wsi/_pydicom_handler.py +++ b/src/aignostics/wsi/_pydicom_handler.py @@ -16,6 +16,8 @@ from aignostics.utils import console +from ._utils import select_dicom_files + class PydicomHandler: def __init__(self, path: Path) -> None: @@ -33,151 +35,56 @@ def from_file(cls, path: str | Path) -> "PydicomHandler": """ return cls(Path(path)) - def get_metadata(self, verbose: bool = False) -> dict[str, Any]: - files = self._scan_files(verbose) + def get_metadata(self, verbose: bool = False, wsi_only: bool = False) -> dict[str, Any]: + """Get DICOM metadata from files in the configured path. + + Args: + verbose: If True, include detailed annotation group data (coordinates, counts) + for annotation files. Defaults to False. + wsi_only: If True, filter to only WSI DICOM files (highest resolution per + pyramid), excluding thumbnails, labels, segmentations, and redundant + pyramid levels. Defaults to False. + + Returns: + Dictionary containing hierarchical DICOM metadata + """ + files = self._scan_files(verbose, wsi_only) return self._organize_by_hierarchy(files) - def _scan_files(self, verbose: bool = False) -> list[dict[str, Any]]: # noqa: C901, PLR0912, PLR0914, PLR0915 - dicom_files = [] + def _scan_files(self, verbose: bool = False, wsi_only: bool = False) -> list[dict[str, Any]]: + """Scan DICOM files and extract metadata. - # Determine which files to process based on whether the `self.path` points to a single file or a directory - if self.path.is_file(): - files_to_process = [self.path] if self.path.suffix.lower() == ".dcm" else [] - else: - files_to_process = list(self.path.rglob("*.dcm")) + Recursively scans the path for DICOM files, reads their metadata (without loading + pixel data), and returns structured information about each file including modality, + dimensions, and type-specific details. + + Args: + verbose: If True, include detailed annotation group data (coordinates, counts). + wsi_only: If True, filter to only WSI files (highest resolution ones only in + multi-file pyramid case). + + Returns: + List of dictionaries containing file metadata. Each dict includes: + - Basic info: path, study_uid, series_uid, modality, type, size + - Modality-specific data (e.g., pyramid info for SM/WSI, annotations for ANN) + - Patient metadata where available + + Note: + Invalid DICOM files are logged and skipped. + """ + files_to_process = self._get_files_to_process(wsi_only) + dicom_files = [] - for file_path in files_to_process: # noqa: PLR1702 + for file_path in files_to_process: if not file_path.is_file(): continue try: print(file_path) ds = pydicom.dcmread(str(file_path), stop_before_pixels=True) - # TODO(Helmut): Uncomment when DICOM is implemented - # print(ds["Modality"].value) # noqa: ERA001 - # print(getattr(ds, "Modality", "unknown")) # noqa: ERA001 - # sys.exit() # noqa: ERA001 - - # Basic required DICOM fields - file_info: dict[str, Any] = { - "path": str(file_path), - "study_uid": str(getattr(ds, "StudyInstanceUID", "unknown")), - "container_id": str(getattr(ds, "ContainerIdentifier", "unknown")), - "series_uid": str(getattr(ds, "SeriesInstanceUID", "unknown")), - "modality": str(getattr(ds, "Modality", "unknown")), - "type": "unknown", - "frame_of_reference_uid": str(getattr(ds, "FrameOfReferenceUID", "unknown")), - } - - # Try to determine file type using highdicom - try: - # TODO(Helmut): Check below, hd.sr.is_microscopy_bulk_simple_annotation(ds), - # hd.sr.is_microscopy_measurement(ds) for type annotation/measurement - if getattr(ds, "Modality", "") in {"SM", "WSI"}: - file_info["type"] = "image" - except Exception: - logger.exception("Failed to analyze DICOM file with highdicom") - # If highdicom analysis fails, keep 'unknown' type - - # Add size and basic metadata - file_info["size"] = file_path.stat().st_size - file_info["metadata"] = { - "PatientID": str(getattr(ds, "PatientID", "unknown")), - "StudyDate": str(getattr(ds, "StudyDate", "unknown")), - "SeriesDescription": str(getattr(ds, "SeriesDescription", "")), - } - - # Add to file_info dictionary after basic metadata - if getattr(ds, "Modality", "") in {"SM", "WSI"}: - file_info.update({ - "modality": getattr(ds, "Modality", ""), - "is_pyramidal": True, - "num_frames": int(getattr(ds, "NumberOfFrames", 1)), - "optical_paths": len(getattr(ds, "OpticalPathSequence", [])), - "focal_planes": len(getattr(ds, "DimensionIndexSequence", [])), - "total_pixel_matrix": ( - int(getattr(ds, "TotalPixelMatrixColumns", 0)), - int(getattr(ds, "TotalPixelMatrixRows", 0)), - ), - }) - elif getattr(ds, "Modality", "") == "ANN": - ann = hd.ann.MicroscopyBulkSimpleAnnotations.from_dataset(ds) - group_infos = [] - groups = ann.get_annotation_groups() - for group in groups: - # Calculate min/max coordinates for all points - col_min = row_min = float("inf") # Initialize to positive infinity - col_max = row_max = float("-inf") # Initialize to negative infinity - graphic_data_len = float("-inf") - first = None - - if verbose: - graphic_data = group.get_graphic_data(ann.annotation_coordinate_type) - graphic_data_len = len(graphic_data) - first = graphic_data[0] - if graphic_data: - if group.graphic_type == hd.ann.GraphicTypeValues.POINT: - # For points, each item is a single coordinate - for point in graphic_data: - col_min = min(col_min, point[0]) - col_max = max(col_max, point[0]) - row_min = min(row_min, point[1]) - row_max = max(row_max, point[1]) - else: - # For polygons/polylines, process all polygons - for polygon in graphic_data: - for point in polygon: - col_min = min(col_min, point[0]) - col_max = max(col_max, point[0]) - row_min = min(row_min, point[1]) - row_max = max(row_max, point[1]) - - group_infos.append({ - "uid": group.uid, - "label": group.label, - "property_type": group.annotated_property_type, - "graphic_type": group.graphic_type, - "count": graphic_data_len, - "first": first, - "col_min": float(col_min), - "row_min": float(row_min), - "col_max": float(col_max), - "row_max": float(row_max), - }) - file_info.update({ - "modality": "ANN", - "coordinate_type": ann.annotation_coordinate_type, - "annotation_groups": group_infos, - }) - - # Extract pyramid levels from frame organization - if getattr(ds, "DimensionOrganizationSequence", None): - # Get frame organization - frame_groups = {} - for frame in getattr(ds, "PerFrameFunctionalGroupsSequence", []): - level_idx = frame.DimensionIndexValues[0] - if level_idx not in frame_groups: - frame_groups[level_idx] = { - "count": 0, - "rows": int(frame.get("Rows", 0)), - "columns": int(frame.get("Columns", 0)), - } - frame_groups[level_idx]["count"] += 1 - - # Sort and store pyramid level information - pyramid_info = [ - { - "level": int(level_idx), - "frame_count": frame_groups[level_idx]["count"], - "frame_size": ( - frame_groups[level_idx]["columns"], - frame_groups[level_idx]["rows"], - ), - } - for level_idx in sorted(frame_groups.keys()) - ] - file_info["pyramid_info"] = pyramid_info - + file_info = PydicomHandler._extract_basic_metadata(file_path, ds) + PydicomHandler._add_modality_specific_data(file_info, ds, verbose) + self._add_pyramid_info(file_info, ds) dicom_files.append(file_info) except pydicom.errors.InvalidDicomError: @@ -185,6 +92,199 @@ def _scan_files(self, verbose: bool = False) -> list[dict[str, Any]]: # noqa: C return dicom_files + def _get_files_to_process(self, wsi_only: bool) -> list[Path]: + """Determine which DICOM files to process. + + Returns: + List of file paths to process. + """ + if self.path.is_file(): + return [self.path] if self.path.suffix.lower() == ".dcm" else [] + if wsi_only: + return select_dicom_files(self.path) + return list(self.path.rglob("*.dcm")) + + @staticmethod + def _extract_basic_metadata(file_path: Path, ds: pydicom.Dataset) -> dict[str, Any]: + """Extract basic DICOM metadata fields. + + Returns: + Dictionary containing basic DICOM metadata. + """ + file_info: dict[str, Any] = { + "path": str(file_path), + "study_uid": str(getattr(ds, "StudyInstanceUID", "unknown")), + "container_id": str(getattr(ds, "ContainerIdentifier", "unknown")), + "series_uid": str(getattr(ds, "SeriesInstanceUID", "unknown")), + "modality": str(getattr(ds, "Modality", "unknown")), + "type": "unknown", + "frame_of_reference_uid": str(getattr(ds, "FrameOfReferenceUID", "unknown")), + } + + # Try to determine file type using highdicom + try: + # TODO(Helmut): Check below, hd.sr.is_microscopy_bulk_simple_annotation(ds), + # hd.sr.is_microscopy_measurement(ds) for type annotation/measurement + if getattr(ds, "Modality", "") in {"SM", "WSI"}: + file_info["type"] = "image" + except Exception: + logger.exception("Failed to analyze DICOM file with highdicom") + # If highdicom analysis fails, keep 'unknown' type + + # Add size and basic metadata + file_info["size"] = file_path.stat().st_size + file_info["metadata"] = { + "PatientID": str(getattr(ds, "PatientID", "unknown")), + "StudyDate": str(getattr(ds, "StudyDate", "unknown")), + "SeriesDescription": str(getattr(ds, "SeriesDescription", "")), + } + + return file_info + + @staticmethod + def _add_modality_specific_data(file_info: dict[str, Any], ds: pydicom.Dataset, verbose: bool) -> None: + """Add modality-specific metadata to file_info.""" + modality = getattr(ds, "Modality", "") + + if modality in {"SM", "WSI"}: + PydicomHandler._add_wsi_metadata(file_info, ds) + elif modality == "ANN": + PydicomHandler._add_annotation_metadata(file_info, ds, verbose) + + @staticmethod + def _add_wsi_metadata(file_info: dict[str, Any], ds: pydicom.Dataset) -> None: + """Add WSI-specific metadata.""" + file_info.update({ + "modality": getattr(ds, "Modality", ""), + "is_pyramidal": True, + "num_frames": int(getattr(ds, "NumberOfFrames", 1)), + "optical_paths": len(getattr(ds, "OpticalPathSequence", [])), + "focal_planes": len(getattr(ds, "DimensionIndexSequence", [])), + "total_pixel_matrix": ( + int(getattr(ds, "TotalPixelMatrixColumns", 0)), + int(getattr(ds, "TotalPixelMatrixRows", 0)), + ), + }) + + @staticmethod + def _add_annotation_metadata(file_info: dict[str, Any], ds: pydicom.Dataset, verbose: bool) -> None: + """Add annotation-specific metadata.""" + ann = hd.ann.MicroscopyBulkSimpleAnnotations.from_dataset(ds) + groups = ann.get_annotation_groups() + group_infos = [PydicomHandler._process_annotation_group(group, ann, verbose) for group in groups] + + file_info.update({ + "modality": "ANN", + "coordinate_type": ann.annotation_coordinate_type, + "annotation_groups": group_infos, + }) + + @staticmethod + def _process_annotation_group( + group: hd.ann.AnnotationGroup, ann: hd.ann.MicroscopyBulkSimpleAnnotations, verbose: bool + ) -> dict[str, Any]: + """Process a single annotation group. + + Returns: + Dictionary containing annotation group metadata. + """ + col_min = row_min = float("inf") + col_max = row_max = float("-inf") + graphic_data_len = float("-inf") + first = None + + if verbose: + graphic_data = group.get_graphic_data(ann.annotation_coordinate_type) + graphic_data_len = len(graphic_data) + first = graphic_data[0] if graphic_data else None + + if graphic_data: + col_min, row_min, col_max, row_max = PydicomHandler._calculate_annotation_bounds( + graphic_data, group.graphic_type + ) + + return { + "uid": group.uid, + "label": group.label, + "property_type": group.annotated_property_type, + "graphic_type": group.graphic_type, + "count": graphic_data_len, + "first": first, + "col_min": float(col_min), + "row_min": float(row_min), + "col_max": float(col_max), + "row_max": float(row_max), + } + + @staticmethod + def _calculate_annotation_bounds( + graphic_data: list[Any], graphic_type: hd.ann.GraphicTypeValues + ) -> tuple[float, float, float, float]: + """Calculate bounding box for annotation graphic data. + + Returns: + Tuple of (col_min, row_min, col_max, row_max). + """ + col_min = row_min = float("inf") + col_max = row_max = float("-inf") + + if graphic_type == hd.ann.GraphicTypeValues.POINT: + for point in graphic_data: + col_min = min(col_min, point[0]) + col_max = max(col_max, point[0]) + row_min = min(row_min, point[1]) + row_max = max(row_max, point[1]) + else: + # For polygons/polylines, process all polygons + for polygon in graphic_data: + for point in polygon: + col_min = min(col_min, point[0]) + col_max = max(col_max, point[0]) + row_min = min(row_min, point[1]) + row_max = max(row_max, point[1]) + + return col_min, row_min, col_max, row_max + + def _add_pyramid_info(self, file_info: dict[str, Any], ds: pydicom.Dataset) -> None: + """Extract and add pyramid level information if available.""" + if not getattr(ds, "DimensionOrganizationSequence", None): + return + + frame_groups = self._extract_frame_groups(ds) + pyramid_info = [ + { + "level": int(level_idx), + "frame_count": frame_groups[level_idx]["count"], + "frame_size": ( + frame_groups[level_idx]["columns"], + frame_groups[level_idx]["rows"], + ), + } + for level_idx in sorted(frame_groups.keys()) + ] + file_info["pyramid_info"] = pyramid_info + + @staticmethod + def _extract_frame_groups(ds: pydicom.Dataset) -> dict[int, dict[str, int]]: + """Extract frame organization grouped by pyramid level. + + Returns: + Dictionary mapping level index to frame count and dimensions. + """ + frame_groups: dict[int, dict[str, int]] = {} + + for frame in getattr(ds, "PerFrameFunctionalGroupsSequence", []): + level_idx = frame.DimensionIndexValues[0] + if level_idx not in frame_groups: + frame_groups[level_idx] = { + "count": 0, + "rows": int(frame.get("Rows", 0)), + "columns": int(frame.get("Columns", 0)), + } + frame_groups[level_idx]["count"] += 1 + + return frame_groups + @staticmethod def _organize_by_hierarchy(files: list[dict[str, Any]]) -> dict[str, Any]: if not files: diff --git a/src/aignostics/wsi/_service.py b/src/aignostics/wsi/_service.py index 1e77f37fd..702ace1fb 100644 --- a/src/aignostics/wsi/_service.py +++ b/src/aignostics/wsi/_service.py @@ -1,6 +1,7 @@ """Service of the wsi module.""" import io +from collections.abc import Iterable from pathlib import Path from typing import Any @@ -11,6 +12,7 @@ from aignostics.utils import BaseService, Health from ._openslide_handler import DEFAULT_MAX_SAFE_DIMENSION +from ._utils import select_dicom_files TIMEOUT = 60 # 1 minutes @@ -175,3 +177,28 @@ def get_tiff_as_jpg(url: str) -> bytes: error_msg = f"Unexpected error converting TIFF to JPEG: {e!s}." logger.exception(error_msg) raise RuntimeError(error_msg) from e + + @staticmethod + def get_wsi_files_to_process(path: Path, extension: str) -> Iterable[Path]: + """Get WSI files to process for the specified extension. + + For DICOM files (.dcm), applies filtering to only include WSI files and select + only the highest resolution file from multi-file pyramids. For other formats, + returns all files matching the extension. + + Args: + path: Root directory to search for WSI files. + extension: File extension to filter (e.g., ".dcm", ".tiff", ".svs"). + Must include the leading dot. + + Returns: + Iterable of Path objects for files to process. + """ + files_to_process: Iterable[Path] + if extension == ".dcm": # noqa: SIM108 + # Special handling for DICOM files - filter out auxiliary and redundant files + files_to_process = select_dicom_files(path) + else: + # For non-DICOM formats, process all files with this extension + files_to_process = path.glob(f"**/*{extension}") + return files_to_process diff --git a/src/aignostics/wsi/_utils.py b/src/aignostics/wsi/_utils.py index 9cc92e456..6a8437975 100644 --- a/src/aignostics/wsi/_utils.py +++ b/src/aignostics/wsi/_utils.py @@ -1,9 +1,12 @@ """Utils for printing wsi files.""" +from collections import defaultdict from pathlib import Path from typing import Any import humanize +import pydicom +from loguru import logger from aignostics.utils import console @@ -189,3 +192,119 @@ def print_slide_info(slide_data: dict[str, Any], indent: int = 0, verbose: bool for file_info in series_data["files"]: console.print() print_file_info(file_info, indent=indent + 3) + + +def _find_highest_resolution_files( + pyramid_groups: dict[str, list[tuple[Path, int, int]]], +) -> list[Path]: + """Find the highest resolution file for each multi-file pyramid. + + For each pyramid (identified by PyramidUID), selects the file with the + largest total pixel count (rows x cols). All other files in the pyramid + are excluded, as OpenSlide can find them automatically. + + Args: + pyramid_groups: Dictionary mapping PyramidUID to list of (file_path, rows, cols). + + Returns: + List of file paths to keep (one per pyramid, the highest resolution). + + Note: + Single-file pyramids (only one file per PyramidUID) are included without + comparison. + """ + files_to_keep: list[Path] = [] + + for pyramid_uid, files_with_dims in pyramid_groups.items(): + if len(files_with_dims) > 1: + highest_res_file_with_dims = max(files_with_dims, key=lambda x: x[1] * x[2]) + highest_res_file, rows, cols = highest_res_file_with_dims + files_to_keep.append(highest_res_file) + + logger.debug( + f"DICOM pyramid {pyramid_uid}: keeping {highest_res_file.name} " + f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files" + ) + else: + files_to_keep.append(files_with_dims[0][0]) + + return files_to_keep + + +def select_dicom_files(path: Path) -> list[Path]: + """Select WSI files only from the path, excluding auxiliary and redundant files. + + For multi-file DICOM pyramids (WSI images split across multiple DICOM instances), + keeps only the highest resolution file. Excludes segmentations, annotations, + thumbnails, labels, and other non-image DICOM files. OpenSlide will automatically + find related pyramid files in the same directory when needed. + + See here for more info on the DICOM data model: + https://dicom.nema.org/medical/dicom/current/output/chtml/part03/chapter_7.html + + Filtering Strategy: + - Only processes VL Whole Slide Microscopy Image Storage + (SOPClassUID 1.2.840.10008.5.1.4.1.1.77.1.6) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part04/sect_b.5.html + - Excludes auxiliary images by ImageType Value 3 (keeps only VOLUME images) + Reference: https://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.8.12.4.html#sect_C.8.12.4.1.1 + - Groups files by PyramidUID (unique identifier for multi-resolution pyramids) + - For pyramids with multiple files, selects only the highest resolution based + on TotalPixelMatrixRows x TotalPixelMatrixColumns + - Preserves standalone WSI files without PyramidUID + + Returns: + List of DICOM file paths to process. All other DICOM files are excluded. + + Note: + Files that cannot be read or are missing required attributes are logged and skipped. + """ + pyramid_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list) + included_dicom_files: list[Path] = [] + + # Scan all DICOM files and filter based on SOPClassUID and ImageType + for dcm_file in path.glob("**/*.dcm"): + try: + ds = pydicom.dcmread(dcm_file, stop_before_pixels=True) + + # Exclude non-WSI image files by SOPClassUID + # Only process VL Whole Slide Microscopy Image Storage (1.2.840.10008.5.1.4.1.1.77.1.6) + if ds.SOPClassUID != "1.2.840.10008.5.1.4.1.1.77.1.6": + logger.debug(f"Excluding {dcm_file.name} - not a WSI image (SOPClassUID: {ds.SOPClassUID})") + continue + + # Exclude auxiliary images by ImageType Value 3 + # Per DICOM PS3.3 C.8.12.4.1.1: Value 3 should be VOLUME, THUMBNAIL, LABEL, or OVERVIEW. + # We only want VOLUME images + if hasattr(ds, "ImageType") and len(ds.ImageType) >= 3: # noqa: PLR2004 + image_type_value_3 = ds.ImageType[2].upper() + + if image_type_value_3 != "VOLUME": + logger.debug(f"Excluding {dcm_file.name} - ImageType Value 3: {image_type_value_3}") + continue + else: + # No ImageType Value 3 - could be standalone WSI or non-standard file + logger.debug(f"DICOM {dcm_file.name} has no ImageType Value 3 - treating as standalone") + + # Try to group by PyramidUID for pyramid resolution selection + if hasattr(ds, "PyramidUID"): + # PyramidUID is Type 1C (conditional) - required if part of a multi-file pyramid + # TotalPixelMatrix attributes are Type 1 for WSI - should always be present + pyramid_uid = ds.PyramidUID + rows = int(ds.TotalPixelMatrixRows) + cols = int(ds.TotalPixelMatrixColumns) + + pyramid_groups[pyramid_uid].append((dcm_file, rows, cols)) + else: + # Treat as standalone file + included_dicom_files.append(dcm_file) + + except Exception as e: + logger.debug(f"Could not process DICOM file {dcm_file}: {e}") + continue + + # For each pyramid with multiple files, select only the highest resolution + pyramid_files_to_include = _find_highest_resolution_files(pyramid_groups) + included_dicom_files.extend(pyramid_files_to_include) + + return included_dicom_files diff --git a/tests/aignostics/wsi/service_test.py b/tests/aignostics/wsi/service_test.py index 76f9177b5..64074ae4f 100644 --- a/tests/aignostics/wsi/service_test.py +++ b/tests/aignostics/wsi/service_test.py @@ -4,15 +4,19 @@ import http.server import os import threading +from collections.abc import Callable from io import BytesIO from pathlib import Path +import pydicom import pytest from fastapi.testclient import TestClient from nicegui import app from nicegui.testing import User from PIL import Image +from aignostics.wsi import Service as WSIService + CONTENT_LENGTH_FALLBACK = 32066 # Fallback image size in bytes @@ -285,3 +289,194 @@ def test_serve_tiff_to_jpeg_fails_on_tiff_url_broken(user: User, record_property assert response.status_code == 200 assert int(response.headers["Content-Length"]) == CONTENT_LENGTH_FALLBACK + + +@pytest.fixture +def dicom_factory() -> Callable[..., pydicom.Dataset]: + """Factory fixture for creating DICOM datasets with custom parameters. + + The nested function returns a factory that lets us create multiple DICOMs + with different parameters in each test (e.g., different pyramid UIDs, + resolutions, and image types). + """ + + def _create_dicom( + pyramid_uid: str | None, + rows: int, + cols: int, + sop_class_uid: str = "1.2.840.10008.5.1.4.1.1.77.1.6", # VL WSI by default + image_type: list[str] | None = None, + ) -> pydicom.Dataset: + """Create a minimal but valid DICOM dataset for WSI. + + Args: + pyramid_uid: The pyramid UID (None for standalone images) + rows: Total image rows (TotalPixelMatrixRows) for the full WSI + cols: Total image columns (TotalPixelMatrixColumns) for the full WSI + sop_class_uid: SOP Class UID (defaults to VL WSI) + image_type: Optional ImageType attribute + + Returns: + A valid pydicom Dataset for whole slide imaging + """ + ds = pydicom.Dataset() + + # File Meta Information + ds.file_meta = pydicom.Dataset() + ds.file_meta.TransferSyntaxUID = pydicom.uid.ImplicitVRLittleEndian + ds.file_meta.MediaStorageSOPClassUID = sop_class_uid + ds.file_meta.MediaStorageSOPInstanceUID = pydicom.uid.generate_uid() + + # Required DICOM attributes + ds.SOPInstanceUID = pydicom.uid.generate_uid() + ds.SOPClassUID = sop_class_uid + ds.StudyInstanceUID = pydicom.uid.generate_uid() + ds.SeriesInstanceUID = pydicom.uid.generate_uid() + ds.Modality = "SM" + + # Tile dimensions (typically 256x256 for WSI) + ds.Rows = 256 + ds.Columns = 256 + + # CRITICAL: Total image dimensions for whole slide imaging + # These represent the full image size and are what differentiate pyramid levels + ds.TotalPixelMatrixRows = rows + ds.TotalPixelMatrixColumns = cols + + # Add PyramidUID if provided (optional for standalone images) + if pyramid_uid: + ds.PyramidUID = pyramid_uid + + # Add ImageType if provided + if image_type: + ds.ImageType = image_type + + return ds + + return _create_dicom + + +@pytest.mark.unit +def test_get_wsi_files_to_process_dicom_multi_file_pyramid( + tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], +) -> None: + """Test service filters multi-file DICOM pyramid to highest resolution only.""" + pyramid_uid = "1.2.3.4.5" + + # Create low resolution (should be excluded) + ds_low = dicom_factory(pyramid_uid, 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file_low = tmp_path / "test_low.dcm" + ds_low.save_as(dcm_file_low, write_like_original=False) + + # Create high resolution (should be kept) + ds_high = dicom_factory(pyramid_uid, 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file_high = tmp_path / "test_high.dcm" + ds_high.save_as(dcm_file_high, write_like_original=False) + + # Get filtered files + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".dcm")) + + # Should include only highest resolution + assert len(files) == 1 + assert dcm_file_high in files + assert dcm_file_low not in files + + +@pytest.mark.unit +def test_get_wsi_files_to_process_dicom_excludes_thumbnails( + tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], +) -> None: + """Test service excludes DICOM thumbnail images.""" + from aignostics.wsi import Service as WSIService + + # Create thumbnail (should be excluded) + ds_thumb = dicom_factory( + "1.2.3.4.5", + 256, + 256, + image_type=["DERIVED", "PRIMARY", "THUMBNAIL"], + ) + dcm_file_thumb = tmp_path / "thumbnail.dcm" + ds_thumb.save_as(dcm_file_thumb, write_like_original=False) + + # Create volume image (should be kept) + ds_volume = dicom_factory( + "1.2.3.4.5", + 1024, + 1024, + image_type=["DERIVED", "PRIMARY", "VOLUME"], + ) + dcm_file_volume = tmp_path / "volume.dcm" + ds_volume.save_as(dcm_file_volume, write_like_original=False) + + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".dcm")) + + assert len(files) == 1 + assert dcm_file_volume in files + assert dcm_file_thumb not in files + + +@pytest.mark.unit +def test_get_wsi_files_to_process_dicom_mixed_scenario( + tmp_path: Path, + dicom_factory: Callable[..., pydicom.Dataset], +) -> None: + """Test service handles complex scenario with multiple pyramids and file types.""" + from aignostics.wsi import Service as WSIService + + # Pyramid 1: 2 levels (keep highest only) + ds1_low = dicom_factory("pyramid1", 512, 512, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file1_low = tmp_path / "p1_low.dcm" + ds1_low.save_as(dcm_file1_low, write_like_original=False) + + ds1_high = dicom_factory("pyramid1", 2048, 2048, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file1_high = tmp_path / "p1_high.dcm" + ds1_high.save_as(dcm_file1_high, write_like_original=False) + + # Thumbnail file (exclude) + ds_thumb = dicom_factory("pyramid1", 256, 256, image_type=["DERIVED", "PRIMARY", "THUMBNAIL"]) + dcm_file_thumb = tmp_path / "p1_thumb.dcm" + ds_thumb.save_as(dcm_file_thumb, write_like_original=False) + + # Segmentation file (exclude by SOPClassUID) + ds_seg = dicom_factory("pyramid1", 2048, 2048, sop_class_uid="1.2.840.10008.5.1.4.1.1.66.4") + dcm_file_seg = tmp_path / "seg.dcm" + ds_seg.save_as(dcm_file_seg, write_like_original=False) + + # Standalone WSI (keep) + ds_standalone = dicom_factory(None, 1024, 1024, image_type=["DERIVED", "PRIMARY", "VOLUME"]) + dcm_file_standalone = tmp_path / "standalone.dcm" + ds_standalone.save_as(dcm_file_standalone, write_like_original=False) + + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".dcm")) + + # Should include: high-res from pyramid1, standalone + assert len(files) == 2 + assert dcm_file1_high in files + assert dcm_file_standalone in files + + # Should exclude: low-res, thumbnail, segmentation + assert dcm_file1_low not in files + assert dcm_file_thumb not in files + assert dcm_file_seg not in files + + +@pytest.mark.unit +def test_get_wsi_files_to_process_non_dicom_passthrough(tmp_path: Path) -> None: + """Test service passes through non-DICOM files without filtering.""" + from aignostics.wsi import Service as WSIService + + # Create some TIFF files + tiff1 = tmp_path / "image1.tiff" + tiff2 = tmp_path / "image2.tiff" + tiff1.touch() + tiff2.touch() + + # Non-DICOM files should all be returned + files = list(WSIService.get_wsi_files_to_process(tmp_path, ".tiff")) + + assert len(files) == 2 + assert tiff1 in files + assert tiff2 in files