|
3 | 3 | import base64 |
4 | 4 | import re |
5 | 5 | import time |
| 6 | +from collections import defaultdict |
6 | 7 | from collections.abc import Callable, Generator |
7 | 8 | from http import HTTPStatus |
8 | 9 | from importlib.util import find_spec |
9 | 10 | from pathlib import Path |
10 | 11 | from typing import Any |
11 | 12 |
|
12 | 13 | import google_crc32c |
| 14 | +import pydicom |
13 | 15 | import requests |
14 | 16 | from loguru import logger |
15 | 17 |
|
16 | 18 | from aignostics.bucket import Service as BucketService |
17 | | -from aignostics.constants import ( |
18 | | - TEST_APP_APPLICATION_ID, |
19 | | -) |
20 | | -from aignostics.platform import ( |
21 | | - LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, |
22 | | - ApiException, |
23 | | - Application, |
24 | | - ApplicationSummary, |
25 | | - ApplicationVersion, |
26 | | - Client, |
27 | | - InputArtifact, |
28 | | - InputItem, |
29 | | - NotFoundException, |
30 | | - Run, |
31 | | - RunData, |
32 | | - RunOutput, |
33 | | - RunState, |
34 | | -) |
35 | | -from aignostics.platform import ( |
36 | | - Service as PlatformService, |
37 | | -) |
| 19 | +from aignostics.constants import TEST_APP_APPLICATION_ID |
| 20 | +from aignostics.platform import (LIST_APPLICATION_RUNS_MAX_PAGE_SIZE, |
| 21 | + ApiException, Application, ApplicationSummary, |
| 22 | + ApplicationVersion, Client, InputArtifact, |
| 23 | + InputItem, NotFoundException, Run, RunData, |
| 24 | + RunOutput, RunState) |
| 25 | +from aignostics.platform import Service as PlatformService |
38 | 26 | from aignostics.utils import BaseService, Health, sanitize_path_component |
39 | 27 | from aignostics.wsi import Service as WSIService |
40 | 28 |
|
41 | | -from ._download import ( |
42 | | - download_available_items, |
43 | | - download_url_to_file_with_progress, |
44 | | - extract_filename_from_url, |
45 | | - update_progress, |
46 | | -) |
| 29 | +from ._download import (download_available_items, |
| 30 | + download_url_to_file_with_progress, |
| 31 | + extract_filename_from_url, update_progress) |
47 | 32 | from ._models import DownloadProgress, DownloadProgressState |
48 | 33 | from ._settings import Settings |
49 | | -from ._utils import ( |
50 | | - get_mime_type_for_artifact, |
51 | | - get_supported_extensions_for_application, |
52 | | - is_not_terminated_with_deadline_exceeded, |
53 | | - validate_due_date, |
54 | | -) |
| 34 | +from ._utils import (get_mime_type_for_artifact, |
| 35 | + get_supported_extensions_for_application, |
| 36 | + is_not_terminated_with_deadline_exceeded, |
| 37 | + validate_due_date) |
55 | 38 |
|
56 | 39 | has_qupath_extra = find_spec("ijson") |
57 | 40 | if has_qupath_extra: |
@@ -312,6 +295,55 @@ def _apply_mappings_to_entry(entry: dict[str, Any], mappings: list[str]) -> None |
312 | 295 | for key_value in key_value_pairs: |
313 | 296 | Service._process_key_value_pair(entry, key_value, external_id) |
314 | 297 |
|
@staticmethod
def _filter_dicom_series_files(source_directory: Path) -> set[Path]:
    """Determine which DICOM files are redundant members of a multi-file series.

    All ``*.dcm`` files below ``source_directory`` are grouped by their
    SeriesInstanceUID. For every series consisting of more than one file, the
    file describing the largest image is kept as the representative and all
    other files of that series are reported for exclusion.
    OpenSlide will find other files in the same directory when needed.

    Image size is taken from TotalPixelMatrixRows / TotalPixelMatrixColumns
    when both are present. In tiled whole-slide DICOM, Rows / Columns describe
    the size of a single tile (frame), which is usually the same for every
    pyramid level, so they cannot be used to tell levels apart. Rows / Columns
    are only used as a fallback for files without the total pixel matrix
    attributes.

    Args:
        source_directory: The directory to scan recursively for ``*.dcm`` files.

    Returns:
        set[Path]: Set of DICOM files to exclude from processing. Files that
            could not be read, and files that are the only member of their
            series, are never part of this set.
    """
    series_groups: dict[str, list[tuple[Path, int, int]]] = defaultdict(list)

    # Group by SeriesInstanceUID, remembering each file's image dimensions
    for dcm_file in source_directory.glob("**/*.dcm"):
        try:
            ds = pydicom.dcmread(dcm_file, stop_before_pixels=True)
            series_uid = ds.SeriesInstanceUID
            total_rows = getattr(ds, "TotalPixelMatrixRows", None)
            total_cols = getattr(ds, "TotalPixelMatrixColumns", None)
            if total_rows and total_cols:
                # Whole-slide image: full extent of this pyramid level
                rows, cols = int(total_rows), int(total_cols)
            else:
                # Non-tiled image: Rows/Columns are the full image extent
                rows = int(getattr(ds, "Rows", 0))
                cols = int(getattr(ds, "Columns", 0))
            series_groups[series_uid].append((dcm_file, rows, cols))
        except Exception as e:
            # Deliberately best-effort: an unreadable or incomplete file is
            # treated as standalone, i.e. it is never excluded.
            logger.debug(f"Could not read DICOM {dcm_file}: {e}")

    # For each series with multiple files, keep only the highest resolution one
    files_to_exclude: set[Path] = set()
    for series_uid, files_with_dims in series_groups.items():
        if len(files_with_dims) < 2:
            continue

        # Largest total pixel count wins; on ties the first file found is kept
        file_to_keep, rows, cols = max(files_with_dims, key=lambda item: item[1] * item[2])
        files_to_exclude.update(path for path, _, _ in files_with_dims if path != file_to_keep)

        logger.debug(
            f"DICOM series {series_uid}: keeping {file_to_keep.name} "
            f"({rows}x{cols}), excluding {len(files_with_dims) - 1} related files"
        )

    return files_to_exclude
| 346 | + |
315 | 347 | @staticmethod |
316 | 348 | def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 |
317 | 349 | source_directory: Path, |
@@ -366,10 +398,17 @@ def generate_metadata_from_source_directory( # noqa: PLR0913, PLR0917 |
366 | 398 |
|
367 | 399 | metadata = [] |
368 | 400 |
|
| 401 | + # Pre-filter: exclude redundant DICOM files from multi-file series |
| 402 | + dicom_files_to_exclude = Service._filter_dicom_series_files(source_directory) |
| 403 | + |
369 | 404 | try: |
370 | 405 | extensions = get_supported_extensions_for_application(application_id) |
371 | 406 | for extension in extensions: |
372 | 407 | for file_path in source_directory.glob(f"**/*{extension}"): |
| 408 | + # Skip excluded DICOM files |
| 409 | + if file_path in dicom_files_to_exclude: |
| 410 | + continue |
| 411 | + |
373 | 412 | # Generate CRC32C checksum with google_crc32c and encode as base64 |
374 | 413 | hash_sum = google_crc32c.Checksum() # type: ignore[no-untyped-call] |
375 | 414 | with file_path.open("rb") as f: |
|
0 commit comments