From 467defa779416ae40c4d0f9689a6988b1690ecc8 Mon Sep 17 00:00:00 2001 From: rchevrier Date: Fri, 8 May 2026 18:08:41 +0000 Subject: [PATCH 1/6] First implementation for half orbit mixin --- src/fcollections/core/__init__.py | 2 + src/fcollections/core/_filesdb.py | 97 +++++++++++++------ src/fcollections/core/_mixins.py | 77 +++++++++++++-- .../implementations/_converters.py | 4 + .../implementations/_l3_lr_ssh.py | 11 ++- .../implementations/optional/_predicates.py | 4 + 6 files changed, 159 insertions(+), 36 deletions(-) diff --git a/src/fcollections/core/__init__.py b/src/fcollections/core/__init__.py index f02ffff..fcf69af 100644 --- a/src/fcollections/core/__init__.py +++ b/src/fcollections/core/__init__.py @@ -59,6 +59,7 @@ from ._mixins import ( DiscreteTimesMixin, DownloadMixin, + HalfOrbitMixin, ITemporalMixin, PeriodMixin, ) @@ -91,6 +92,7 @@ "DownloadMixin", "CaseType", "PeriodMixin", + "HalfOrbitMixin", "GroupMetadata", "group_metadata_from_netcdf", "VariableMetadata", diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index eacf56e..fe2b6b7 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -498,7 +498,20 @@ def _files( with warnings.catch_warnings(): warnings.simplefilter("error", category=PerformanceWarning) try: - subset_filters = self.unmixer.pick_subset(self.subsets, **kwargs) + # Sanitize field before + file_name_convention = self.layouts[0].conventions[-1] + + sanitized_subset_parameters = { + field_name: file_name_convention.get_field(field_name).sanitize( + reference_value + ) + for field_name, reference_value in kwargs.items() + if field_name in self.unmixer.partition_keys + } + + subset_filters = self.unmixer.pick_subset( + self.subsets, **sanitized_subset_parameters + ) kwargs |= subset_filters unmix = False except IndexError: @@ -509,6 +522,40 @@ def _files( "Subset unmixing could not be done before the files scan: it will be done after." ) + predicates, kwargs = self._auto_build_predicates_and_filters(predicates, kwargs) + + df = self.discoverer.to_dataframe( + predicates=predicates, + stat_fields=stat_fields, + enable_layouts=self.enable_layouts, + **{k: kwargs[k] for k in kwargs if k in self.listing_parameters}, + ) + + postprocesses = map( + lambda item: item[1], + filter( + lambda item: item[0], + [ + (unmix and self.unmixer is not None, self.unmixer), + (deduplicate and self.deduplicator is not None, self.deduplicator), + ( + sort and self.sort_keys is not None, + lambda df: df.sort_values(self.sort_keys, ignore_index=True), + ), + ], + ), + ) + + for postprocess in postprocesses: + df = postprocess(df) + + return df + + def _auto_build_predicates_and_filters( + self, + predicates: tp.Iterable[tp.Callable[[tuple[tp.Any, ...]], bool]], + kwargs, + ): # Auto-build declared predicates and additionnal filters. predicates = list(predicates) if self.filter_builders is not None: @@ -559,32 +606,7 @@ def _files( filters.keys(), ) - df = self.discoverer.to_dataframe( - predicates=predicates, - stat_fields=stat_fields, - enable_layouts=self.enable_layouts, - **{k: kwargs[k] for k in kwargs if k in self.listing_parameters}, - ) - - postprocesses = map( - lambda item: item[1], - filter( - lambda item: item[0], - [ - (unmix and self.unmixer is not None, self.unmixer), - (deduplicate and self.deduplicator is not None, self.deduplicator), - ( - sort and self.sort_keys is not None, - lambda df: df.sort_values(self.sort_keys, ignore_index=True), - ), - ], - ), - ) - - for postprocess in postprocesses: - df = postprocess(df) - - return df + return predicates, kwargs def _query(self, **kwargs) -> xr_t.Dataset | None: """Query a dataset by reading selected files in file system. @@ -836,10 +858,24 @@ def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: # warning will be emitted if unmix: try: - kwargs |= self.unmixer.pick_subset(self.subsets, **kwargs) + # Sanitize field before + file_name_convention = self.layouts[0].conventions[-1] + + sanitized_subset_parameters = { + field_name: file_name_convention.get_field(field_name).sanitize( + reference_value + ) + for field_name, reference_value in kwargs.items() + if field_name in self.unmixer.partition_keys + } + + kwargs |= self.unmixer.pick_subset( + self.subsets, **sanitized_subset_parameters + ) except IndexError: logger.debug("No subset, nothing to unmix") + _, kwargs = self._auto_build_predicates_and_filters([], kwargs) return {x[0] for x in metadata_collector.discover(**kwargs)} except LayoutMismatchError: msg = ( @@ -1134,3 +1170,8 @@ def build_filter(cls, *args: tp.Any) -> dict[str, tp.Any]: @abc.abstractmethod def parameter(cls) -> FileNameField: """Initialization parameter for the class.""" + + @classmethod + @abc.abstractmethod + def target_fields(cls) -> tuple[str, ...]: + """""" diff --git a/src/fcollections/core/_mixins.py b/src/fcollections/core/_mixins.py index cd1ca90..afc53ef 100644 --- a/src/fcollections/core/_mixins.py +++ b/src/fcollections/core/_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations import abc +import functools import logging import os import typing as tp @@ -16,17 +17,29 @@ times_holes, ) +from ._filesdb import PerformanceWarning + if tp.TYPE_CHECKING: # pragma: no cover import numpy as np - import pandas as pda_t logger = logging.getLogger(__name__) +def suppress_performance_warning(func): + + @functools.wraps(func) + def suppressed(*args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", PerformanceWarning) + return func(*args, **kwargs) + + return suppressed + + class ITemporalMixin(abc.ABC): @abc.abstractmethod - def list_files(self, *args, **kwargs) -> pda_t.DataFrame: + def filter_values(self, filter_name: str, **kwargs) -> set[tp.Any]: """The mixin relies on this method to build new functionalities.""" @abc.abstractmethod @@ -52,16 +65,22 @@ def time_coverage(self, **filters) -> Period | None: class PeriodMixin(ITemporalMixin): + @suppress_performance_warning def time_holes(self, **filters) -> tp.Generator[Period, None, None]: - periods = sorted(self.list_files(**filters)["time"].values) + periods = sorted(self.filter_values("time", **filters)) + if len(periods) == 0: + logger.info("All data filtered out with %s", filters) return [] reduced = fuse_successive_periods(periods) return periods_holes(reduced) + @suppress_performance_warning def time_coverage(self, **filters) -> Period | None: - periods = sorted(self.list_files(**filters)["time"].values) + periods = sorted(self.filter_values("time", **filters)) + if len(periods) == 0: + logger.info("All data filtered out with %s", filters) return None return periods_envelop(periods) @@ -71,24 +90,70 @@ class DiscreteTimesMixin(ITemporalMixin): def __init__(self, sampling: np.timedelta64 | None = None): self.sampling = sampling + @suppress_performance_warning def time_holes(self, **filters) -> tp.Generator[Period, None, None]: if self.sampling is None: msg = """No sampling specified, holes detection in the time serie cannot proceed""" warnings.warn(msg) return [] - times = sorted(self.list_files(**filters)["time"].values) + + times = sorted(self.filter_values("time", **filters)) + if len(times) == 0: + logger.info("All data filtered out with %s", filters) return [] return times_holes(times, self.sampling) + @suppress_performance_warning def time_coverage(self, **filters) -> Period | None: - times = sorted(self.list_files(**filters)["time"].values) + times = sorted(self.filter_values("time", **filters)) + if len(times) == 0: + logger.info("All data filtered out with %s", filters) return None return Period(times[0], times[-1]) +class HalfOrbitMixin: + + def cycle_range(self, **filters) -> tuple[int, int]: + cycles = sorted(self.filter_values("cycle_number", **filters)) + return cycles[0], cycles[-1] + + @suppress_performance_warning + def half_orbit_range(self, **filters) -> tuple[tuple[int, int], tuple[int, int]]: + first_cycle, last_cycle = self.cycle_range(**filters) + + for filter_builder in self.filter_builders: + if filter_builder.target_fields() == ("cycle_number",): + filters.pop(filter_builder.parameter().name, None) + + filters["cycle_number"] = first_cycle + first_pass = sorted(self.filter_values("pass_number", **filters))[0] + + filters["cycle_number"] = last_cycle + last_pass = sorted(self.filter_values("pass_number", **filters))[-1] + + return (first_cycle, first_pass), (last_cycle, last_pass) + + def time_coverage(self, **filters) -> Period | None: + with warnings.catch_warnings(): + warnings.simplefilter("error", PerformanceWarning) + + try: + cycle_range = self.cycle_range(**filters) + for filter_builder in self.filter_builders: + if filter_builder.target_fields() == ("cycle_number",): + filters.pop(filter_builder.parameter().name, None) + filters["cycle_number"] = list(cycle_range) + except PerformanceWarning: + # Don't try to accelerate if we must fall back to a slow listing + pass + + return super().time_coverage(**filters) + + class DownloadMixin(abc.ABC): @property diff --git a/src/fcollections/implementations/_converters.py b/src/fcollections/implementations/_converters.py index 4573924..ad038ec 100644 --- a/src/fcollections/implementations/_converters.py +++ b/src/fcollections/implementations/_converters.py @@ -49,3 +49,7 @@ def parameter(cls) -> FileNameField: case_type_decoded=CaseType.upper, case_type_encoded=CaseType.lower, ) + + @classmethod + def target_fields(cls) -> tuple[str, ...]: + return ("cycle_number",) diff --git a/src/fcollections/implementations/_l3_lr_ssh.py b/src/fcollections/implementations/_l3_lr_ssh.py index eeeea4b..febc4e7 100644 --- a/src/fcollections/implementations/_l3_lr_ssh.py +++ b/src/fcollections/implementations/_l3_lr_ssh.py @@ -10,6 +10,7 @@ FileNameFieldPeriod, FileNameFieldString, FilesDatabase, + HalfOrbitMixin, Layout, PeriodMixin, SubsetsUnmixer, @@ -112,9 +113,15 @@ def encode(self, a: str) -> str: ) -class BasicNetcdfFilesDatabaseSwotLRL3(FilesDatabase, PeriodMixin): +class BasicNetcdfFilesDatabaseSwotLRL3(FilesDatabase, HalfOrbitMixin, PeriodMixin): """Database mapping to select and read Swot LR L3 Netcdf files in a local - file system.""" + file system. + + Note + ---- + HalfOrbitMixin overrides the temporal coverage access (it is usually faster) + so it must be declared prior to the PeriodMixin + """ layouts = [ Layout([FileNameConventionSwotL3()]), diff --git a/src/fcollections/implementations/optional/_predicates.py b/src/fcollections/implementations/optional/_predicates.py index 837bca7..67a4d19 100644 --- a/src/fcollections/implementations/optional/_predicates.py +++ b/src/fcollections/implementations/optional/_predicates.py @@ -105,3 +105,7 @@ def parameter(cls) -> FileNameField: "retrieved" ), ) + + @classmethod + def target_fields(cls) -> tuple[str, ...]: + return ("cycle_number", "pass_number") From acf6d0494315728ce85f73c1e9e0e7248aac41ed Mon Sep 17 00:00:00 2001 From: rchevrier Date: Fri, 8 May 2026 18:36:43 +0000 Subject: [PATCH 2/6] small refacto --- src/fcollections/core/_filesdb.py | 89 ++++++++++++++++++------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index fe2b6b7..3ed07f9 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -498,24 +498,7 @@ def _files( with warnings.catch_warnings(): warnings.simplefilter("error", category=PerformanceWarning) try: - # Sanitize field before - file_name_convention = self.layouts[0].conventions[-1] - - sanitized_subset_parameters = { - field_name: file_name_convention.get_field(field_name).sanitize( - reference_value - ) - for field_name, reference_value in kwargs.items() - if field_name in self.unmixer.partition_keys - } - - subset_filters = self.unmixer.pick_subset( - self.subsets, **sanitized_subset_parameters - ) - kwargs |= subset_filters - unmix = False - except IndexError: - logger.debug("No subset, nothing to unmix") + self._pick_subset_before_files_scan(kwargs) unmix = False except PerformanceWarning: logger.debug( @@ -856,24 +839,8 @@ def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: # there is an ambiguity. We need the subsets list whether the listing # is quick or slow. In case of a slow computation of subsets, a # warning will be emitted - if unmix: - try: - # Sanitize field before - file_name_convention = self.layouts[0].conventions[-1] - - sanitized_subset_parameters = { - field_name: file_name_convention.get_field(field_name).sanitize( - reference_value - ) - for field_name, reference_value in kwargs.items() - if field_name in self.unmixer.partition_keys - } - - kwargs |= self.unmixer.pick_subset( - self.subsets, **sanitized_subset_parameters - ) - except IndexError: - logger.debug("No subset, nothing to unmix") + if unmix and self.unmixer is not None: + self._pick_subset_before_files_scan(kwargs) _, kwargs = self._auto_build_predicates_and_filters([], kwargs) return {x[0] for x in metadata_collector.discover(**kwargs)} @@ -888,6 +855,43 @@ def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: warnings.warn(msg, PerformanceWarning) return set(self.list_files(**kwargs, unmix=unmix)[filter_name]) + def _pick_subset_before_files_scan(self, filters: dict[str, tp.Any]): + """Pick a subset without listing the files metadata. + + Listing the files metadata can be costly. If possible, we wish to + determine the subset by parsing the information in the folders. + + Parameters + ---------- + filters + Filters that needs to be applied on the files. These should also + contain the mandatory filters for subset selection. This parameter + is modified in place to add the automatically set filters for the + subset (refer to :attr:`SubsetUnmixer.auto_pick_last`) + + Warns + ----- + PerformanceWarning + In case the subset information cannot be found in the folders. + """ + try: + # Sanitize field before + file_name_convention = self.layouts[0].conventions[-1] + + sanitized_subset_parameters = { + field_name: file_name_convention.get_field(field_name).sanitize( + reference_value + ) + for field_name, reference_value in filters.items() + if field_name in self.unmixer.partition_keys + } + + filters |= self.unmixer.pick_subset( + self.subsets, **sanitized_subset_parameters + ) + except IndexError: + logger.debug("No subset, nothing to unmix") + def _validate_field(self, filter_name: str): """Check a field is declared in one of the layouts. @@ -1174,4 +1178,15 @@ def parameter(cls) -> FileNameField: @classmethod @abc.abstractmethod def target_fields(cls) -> tuple[str, ...]: - """""" + """Target fields of the predicate. + + The target fields determines which part of a metadata record (related + information about one file) is used. This can be useful to detect + incompatibilities between the predicate filtering, and more classic + filtering. + + Returns + ------- + tuple[str, ...] + Field names used by the predicate to filter a record. + """ From d705c8d8adabbc2cf73676fd7d79647331e28983 Mon Sep 17 00:00:00 2001 From: rchevrier Date: Fri, 8 May 2026 19:10:27 +0000 Subject: [PATCH 3/6] repair tests and add docstrings --- src/fcollections/core/_mixins.py | 115 ++++++++++++++++-- .../implementations/_l3_lr_ssh.py | 2 +- src/fcollections/time/_periods.py | 13 +- tests/core/test_mixins.py | 90 ++++++-------- 4 files changed, 156 insertions(+), 64 deletions(-) diff --git a/src/fcollections/core/_mixins.py b/src/fcollections/core/_mixins.py index afc53ef..1c24441 100644 --- a/src/fcollections/core/_mixins.py +++ b/src/fcollections/core/_mixins.py @@ -25,7 +25,14 @@ logger = logging.getLogger(__name__) -def suppress_performance_warning(func): +def suppress_performance_warning(func: tp.Callable) -> tp.Callable: + """Suppress PerformanceWarning when calling the input function. + + Returns + ------- + tp.Callable + The patched function with suppressed PerformanceWarning + """ @functools.wraps(func) def suppressed(*args, **kwargs): @@ -116,13 +123,47 @@ def time_coverage(self, **filters) -> Period | None: class HalfOrbitMixin: + """Mixin extending FilesDatabase with methods working on half orbits.""" + + def filter_values(self, filter_name: str, **kwargs) -> set[tp.Any]: + """The mixin relies on this method to build new functionalities.""" def cycle_range(self, **filters) -> tuple[int, int]: + """Extract the cycle range. + + Parameters + ---------- + filters + Set of filters to apply prior to extract the cycle range. This can + be used to pass the mandatory filters for selecting a single subset, + or to extract the cycle range for a single mission phase. + + Returns + ------- + tuple[int, int] + The first and last cycle matching the selection. + """ cycles = sorted(self.filter_values("cycle_number", **filters)) return cycles[0], cycles[-1] @suppress_performance_warning def half_orbit_range(self, **filters) -> tuple[tuple[int, int], tuple[int, int]]: + """Extract the half orbits range. + + Parameters + ---------- + filters + Set of filters to apply prior to extract the half orbits range. This + can be used to pass the mandatory filters for selecting a single + subset, or to extract the half orbits range for a single mission + phase. + + Returns + ------- + tuple[tuple[int, int], tuple[int, int]] + Two pairs of (cycle_number, pass_number) numbering the first and + last half orbit of the selection. + """ first_cycle, last_cycle = self.cycle_range(**filters) for filter_builder in self.filter_builders: @@ -138,20 +179,78 @@ def half_orbit_range(self, **filters) -> tuple[tuple[int, int], tuple[int, int]] return (first_cycle, first_pass), (last_cycle, last_pass) def time_coverage(self, **filters) -> Period | None: + """Extract the time coverage. + + The mixin implementation expects that the files will be grouped by + cycles in folders. This property can be used to first get the first and + last cycles, before listing the times for these two cycles. This is much + faster than getting the times for all the cycles. + + In case the hypothesis is not True (ie. folders do not contain the cycle + number information), we fall back to the classic implementation which is + slower. + + In addition, `cycle_number` ordering can break if multiple mission + phases are mixed in the selection. This will usually lead to an + inconsistent Period, which will again make the method fall back to the + default implementation. + + Parameters + ---------- + filters + Set of filters to apply prior to extract the time coverage. This + can be used to pass the mandatory filters for selecting a single + subset, or to extract the time coverage for a single mission phase. + + Returns + ------- + tuple[tuple[int, int], tuple[int, int]] + Two pairs of (cycle_number, pass_number) numbering the first and + last half orbit of the selection. + """ with warnings.catch_warnings(): warnings.simplefilter("error", PerformanceWarning) try: cycle_range = self.cycle_range(**filters) + + # The input filters will probably give a range for selecting a + # mission phase. Mission phase filters work on the cycle_number + # variable, and giving filters on the same variable will raise + # an error in the filter_values method. We must remove all + # filters working on the cycle_number variable. + edited_filters = filters.copy() for filter_builder in self.filter_builders: - if filter_builder.target_fields() == ("cycle_number",): - filters.pop(filter_builder.parameter().name, None) - filters["cycle_number"] = list(cycle_range) + if "cycle_number" in filter_builder.target_fields(): + logger.debug( + "Removed filter `%s` working on the " + "`cycle_number` variable", + filter_builder.parameter().name, + ) + edited_filters.pop(filter_builder.parameter().name, None) + edited_filters["cycle_number"] = list(cycle_range) except PerformanceWarning: - # Don't try to accelerate if we must fall back to a slow listing - pass - - return super().time_coverage(**filters) + # Don't try to accelerate, we must fall back to a slow listing + logger.debug( + "Shortcut using the `cycle_number` variable failed, " + "falling back listing `time` values without filters." + ) + + try: + return super().time_coverage(**edited_filters) + except ValueError: + # ValueError is raised if the period start > stop. This can arise if + # the cycle_number variable has a different order than the time + # variable. An example is the SWOT mission where the first mission + # phase CALVAL is numbered [400-600] whereas the second mission + # phase SCIENCE is numbered [1-399]. This sorting break will cause + # an inconsistent period, in which case we need to fall back to a + # full scan + logger.debug( + "Shortcut using the `cycle_number` variable failed, " + "falling back listing `time` values without filters." + ) + return super().time_coverage(**filters) class DownloadMixin(abc.ABC): diff --git a/src/fcollections/implementations/_l3_lr_ssh.py b/src/fcollections/implementations/_l3_lr_ssh.py index febc4e7..7fa8e44 100644 --- a/src/fcollections/implementations/_l3_lr_ssh.py +++ b/src/fcollections/implementations/_l3_lr_ssh.py @@ -120,7 +120,7 @@ class BasicNetcdfFilesDatabaseSwotLRL3(FilesDatabase, HalfOrbitMixin, PeriodMixi Note ---- HalfOrbitMixin overrides the temporal coverage access (it is usually faster) - so it must be declared prior to the PeriodMixin + so it must be declared prior to the PeriodMixin. """ layouts = [ diff --git a/src/fcollections/time/_periods.py b/src/fcollections/time/_periods.py index f63e878..1468ed4 100644 --- a/src/fcollections/time/_periods.py +++ b/src/fcollections/time/_periods.py @@ -7,7 +7,13 @@ @dc.dataclass(frozen=True) class Period: - """Period representation.""" + """Period representation. + + Raises + ------ + ValueError + If the period start > stop. + """ start: np.datetime64 """Date representing the start of the period.""" @@ -18,6 +24,11 @@ class Period: include_stop: bool = True """Inclusive (True) or strict (False) end selection.""" + def __post_init__(self): + if self.start > self.stop: + msg = "Cannot create a period with start > stop." + raise ValueError(msg) + @property def center(self) -> np.datetime64: return self.start + np.timedelta64((self.stop - self.start).item() / 2) diff --git a/tests/core/test_mixins.py b/tests/core/test_mixins.py index 93628b2..300aafb 100644 --- a/tests/core/test_mixins.py +++ b/tests/core/test_mixins.py @@ -1,9 +1,9 @@ +import typing as tp from pathlib import Path from unittest.mock import Mock import fsspec.implementations.memory as fs_mem import numpy as np -import pandas as pda import pytest from fcollections.core import DiscreteTimesMixin, DownloadMixin, PeriodMixin @@ -12,8 +12,8 @@ class PeriodMixinEmpty(PeriodMixin): - def list_files(self, *args, **kwargs): - return pda.DataFrame([], columns=["time", "filename"]) + def filter_values(self, filter_name: str, *args, **kwargs) -> set[tp.Any]: + return set() def test_period_mixin_empty(): @@ -24,45 +24,30 @@ def test_period_mixin_empty(): class PeriodMixinStub(PeriodMixin): - def list_files(self, *args, **kwargs): - return pda.DataFrame( - [ - ( - Period( - np.datetime64("2024-01-01"), - np.datetime64("2024-01-02"), - include_stop=False, - ), - "f1", - ), - ( - Period( - np.datetime64("2024-01-02"), - np.datetime64("2024-01-03"), - include_stop=False, - ), - "f2", - ), - ( - Period( - np.datetime64("2024-01-04"), - np.datetime64("2024-01-05"), - include_stop=False, - ), - "f3", - ), - ( - Period( - np.datetime64("2024-01-10"), - np.datetime64("2024-01-20"), - include_start=False, - include_stop=False, - ), - "f4", - ), - ], - columns=["time", "filename"], - ) + def filter_values(self, field_name: str, *args, **kwargs) -> set[tp.Any]: + return { + Period( + np.datetime64("2024-01-01"), + np.datetime64("2024-01-02"), + include_stop=False, + ), + Period( + np.datetime64("2024-01-02"), + np.datetime64("2024-01-03"), + include_stop=False, + ), + Period( + np.datetime64("2024-01-04"), + np.datetime64("2024-01-05"), + include_stop=False, + ), + Period( + np.datetime64("2024-01-10"), + np.datetime64("2024-01-20"), + include_start=False, + include_stop=False, + ), + } def test_period_mixin(): @@ -80,8 +65,8 @@ def test_period_mixin(): class DiscreteTimesEmpty(DiscreteTimesMixin): - def list_files(self, *args, **kwargs): - return pda.DataFrame([], columns=["time", "filename"]) + def filter_values(self, field_name: str, *args, **kwargs) -> set[tp.Any]: + return set() def test_discrete_times_mixin_empty(): @@ -92,16 +77,13 @@ def test_discrete_times_mixin_empty(): class DiscreteTimesStub(DiscreteTimesMixin): - def list_files(self, *args, **kwargs): - return pda.DataFrame( - [ - (np.datetime64("2024-01-01"), "f1"), - (np.datetime64("2024-01-02"), "f2"), - (np.datetime64("2024-01-04"), "f3"), - (np.datetime64("2024-01-10"), "f4"), - ], - columns=["time", "filename"], - ) + def filter_values(self, *args, **kwargs) -> set[tp.Any]: + return { + np.datetime64("2024-01-01"), + np.datetime64("2024-01-02"), + np.datetime64("2024-01-04"), + np.datetime64("2024-01-10"), + } def test_discrete_times_mixin(): From 94e6fe10056790679fdcfb98b8757a6da40a7ee1 Mon Sep 17 00:00:00 2001 From: rchevrier Date: Fri, 8 May 2026 20:37:54 +0000 Subject: [PATCH 4/6] Add core tests --- src/fcollections/core/_mixins.py | 113 +++++++++++++++++-------------- tests/core/test_mixins.py | 79 ++++++++++++++++++++- tests/time/test_periods.py | 5 ++ 3 files changed, 147 insertions(+), 50 deletions(-) diff --git a/src/fcollections/core/_mixins.py b/src/fcollections/core/_mixins.py index 1c24441..2558c84 100644 --- a/src/fcollections/core/_mixins.py +++ b/src/fcollections/core/_mixins.py @@ -122,13 +122,14 @@ def time_coverage(self, **filters) -> Period | None: return Period(times[0], times[-1]) -class HalfOrbitMixin: +class HalfOrbitMixin(abc.ABC): """Mixin extending FilesDatabase with methods working on half orbits.""" + @abc.abstractmethod def filter_values(self, filter_name: str, **kwargs) -> set[tp.Any]: """The mixin relies on this method to build new functionalities.""" - def cycle_range(self, **filters) -> tuple[int, int]: + def cycle_range(self, **filters) -> tuple[int, int] | None: """Extract the cycle range. Parameters @@ -140,14 +141,17 @@ def cycle_range(self, **filters) -> tuple[int, int]: Returns ------- - tuple[int, int] - The first and last cycle matching the selection. + tuple[int, int] | None + The first and last cycle matching the selection, or None if there is + no data. """ cycles = sorted(self.filter_values("cycle_number", **filters)) - return cycles[0], cycles[-1] + return (cycles[0], cycles[-1]) if len(cycles) >= 1 else None @suppress_performance_warning - def half_orbit_range(self, **filters) -> tuple[tuple[int, int], tuple[int, int]]: + def half_orbit_range( + self, **filters + ) -> tuple[tuple[int, int], tuple[int, int]] | None: """Extract the half orbits range. Parameters @@ -160,21 +164,23 @@ def half_orbit_range(self, **filters) -> tuple[tuple[int, int], tuple[int, int]] Returns ------- - tuple[tuple[int, int], tuple[int, int]] + tuple[tuple[int, int], tuple[int, int]] | None Two pairs of (cycle_number, pass_number) numbering the first and - last half orbit of the selection. + last half orbit of the selection, or None if there are no half + orbits. """ - first_cycle, last_cycle = self.cycle_range(**filters) + try: + first_cycle, last_cycle = self.cycle_range(**filters) + except TypeError: + return None - for filter_builder in self.filter_builders: - if filter_builder.target_fields() == ("cycle_number",): - filters.pop(filter_builder.parameter().name, None) + edited_filters = self._clean_filters(filters) - filters["cycle_number"] = first_cycle - first_pass = sorted(self.filter_values("pass_number", **filters))[0] + edited_filters["cycle_number"] = first_cycle + first_pass = sorted(self.filter_values("pass_number", **edited_filters))[0] - filters["cycle_number"] = last_cycle - last_pass = sorted(self.filter_values("pass_number", **filters))[-1] + edited_filters["cycle_number"] = last_cycle + last_pass = sorted(self.filter_values("pass_number", **edited_filters))[-1] return (first_cycle, first_pass), (last_cycle, last_pass) @@ -206,51 +212,60 @@ def time_coverage(self, **filters) -> Period | None: ------- tuple[tuple[int, int], tuple[int, int]] Two pairs of (cycle_number, pass_number) numbering the first and - last half orbit of the selection. + last half orbit of the selection, or None if there is no data. """ with warnings.catch_warnings(): warnings.simplefilter("error", PerformanceWarning) try: cycle_range = self.cycle_range(**filters) + if cycle_range is None: + return None + + edited_filters = self._clean_filters(filters) + edited_filters["cycle_number"] = min(cycle_range) + first_period = super().time_coverage(**edited_filters) + + edited_filters["cycle_number"] = max(cycle_range) + last_period = super().time_coverage(**edited_filters) - # The input filters will probably give a range for selecting a - # mission phase. Mission phase filters work on the cycle_number - # variable, and giving filters on the same variable will raise - # an error in the filter_values method. We must remove all - # filters working on the cycle_number variable. - edited_filters = filters.copy() - for filter_builder in self.filter_builders: - if "cycle_number" in filter_builder.target_fields(): - logger.debug( - "Removed filter `%s` working on the " - "`cycle_number` variable", - filter_builder.parameter().name, - ) - edited_filters.pop(filter_builder.parameter().name, None) - edited_filters["cycle_number"] = list(cycle_range) - except PerformanceWarning: + return Period( + first_period.start, + last_period.stop, + include_start=first_period.include_start, + include_stop=last_period.include_stop, + ) + + except (PerformanceWarning, ValueError): # Don't try to accelerate, we must fall back to a slow listing + # ValueError is raised if the period start > stop. This can arise if + # the cycle_number variable has a different order than the time + # variable. An example is the SWOT mission where the first mission + # phase CALVAL is numbered [400-600] whereas the second mission + # phase SCIENCE is numbered [1-399]. This sorting break will cause + # an inconsistent period, in which case we need to fall back to a + # full scan logger.debug( "Shortcut using the `cycle_number` variable failed, " "falling back listing `time` values without filters." ) - - try: - return super().time_coverage(**edited_filters) - except ValueError: - # ValueError is raised if the period start > stop. This can arise if - # the cycle_number variable has a different order than the time - # variable. An example is the SWOT mission where the first mission - # phase CALVAL is numbered [400-600] whereas the second mission - # phase SCIENCE is numbered [1-399]. This sorting break will cause - # an inconsistent period, in which case we need to fall back to a - # full scan - logger.debug( - "Shortcut using the `cycle_number` variable failed, " - "falling back listing `time` values without filters." - ) - return super().time_coverage(**filters) + return super().time_coverage(**filters) + + def _clean_filters(self, filters: dict[str, tp.Any]) -> dict[str, tp.Any]: + # The input filters will probably give a range for selecting a + # mission phase. Mission phase filters work on the cycle_number + # variable, and giving filters on the same variable will raise + # an error in the filter_values method. We must remove all + # filters working on the cycle_number variable. + edited_filters = filters.copy() + for filter_builder in self.filter_builders: + if "cycle_number" in filter_builder.target_fields(): + logger.debug( + "Removed filter `%s` working on the " "`cycle_number` variable", + filter_builder.parameter().name, + ) + edited_filters.pop(filter_builder.parameter().name, None) + return edited_filters class DownloadMixin(abc.ABC): diff --git a/tests/core/test_mixins.py b/tests/core/test_mixins.py index 300aafb..36a26f0 100644 --- a/tests/core/test_mixins.py +++ b/tests/core/test_mixins.py @@ -1,4 +1,5 @@ import typing as tp +import warnings from pathlib import Path from unittest.mock import Mock @@ -6,7 +7,13 @@ import numpy as np import pytest -from fcollections.core import DiscreteTimesMixin, DownloadMixin, PeriodMixin +from fcollections.core import ( + DiscreteTimesMixin, + DownloadMixin, + HalfOrbitMixin, + PerformanceWarning, + PeriodMixin, +) from fcollections.time import Period @@ -116,6 +123,76 @@ def test_discrete_times_mixin_no_sampling(): assert list(mixin.time_holes()) == [] +class HalfOrbitMixinEmpty(HalfOrbitMixin, PeriodMixinStub): + + filter_builders = [] + + def filter_values(self, field_name: str, *args, **kwargs) -> set[tp.Any]: + return set() + + +def test_half_orbit_mixin_empty(): + mixin = HalfOrbitMixinEmpty() + assert mixin.cycle_range() is None + assert mixin.half_orbit_range() is None + assert mixin.time_coverage() is None + assert len(list(mixin.time_holes())) == 0 + + +class HalfOrbitMixinStub(HalfOrbitMixin, PeriodMixinStub): + + filter_builders = [] + + def filter_values(self, filter_name, **kwargs): + if filter_name == "time": + return PeriodMixinStub.filter_values(self, filter_name) + elif filter_name == "cycle_number": + return {1, 2, 3, 500} + elif filter_name == "pass_number": + return {x * kwargs["cycle_number"] for x in range(1, 4)} + + +def test_half_orbit_mixin(): + mixin_temporal = PeriodMixinStub() + mixin = HalfOrbitMixinStub() + assert mixin.cycle_range() == (1, 500) + assert mixin.half_orbit_range() == ((1, 1), (500, 1500)) + assert mixin.time_coverage() == mixin_temporal.time_coverage() + + +class HalfOrbitMixinStubWithWarnings(HalfOrbitMixinStub): + + def filter_values(self, filter_name, **kwargs): + warnings.warn("Slow listing", PerformanceWarning) + return super().filter_values(filter_name, **kwargs) + + +def test_half_orbit_mixin_slow_time_coverage(): + mixin_temporal = PeriodMixinStub() + mixin = HalfOrbitMixinStubWithWarnings() + assert mixin.time_coverage() == mixin_temporal.time_coverage() + + +FILTER_BUILDER_MOCK = Mock() +FILTER_BUILDER_MOCK.parameter().name = "phase" +FILTER_BUILDER_MOCK.target_fields = Mock(return_value=("cycle_number", "pass_number")) + + +class HalfOrbitMixinStubWithFilterBuilders(HalfOrbitMixinStub): + filter_builders = [FILTER_BUILDER_MOCK] + + def filter_values(self, filter_name, **kwargs): + if filter_name != "cycle_number" and "phase" in kwargs: + raise ValueError() + return super().filter_values(filter_name, **kwargs) + + +def test_half_orbit_mixin_filter_builder_clean(): + mixin = HalfOrbitMixinStubWithFilterBuilders() + assert mixin.time_coverage(phase="CALVAL") == mixin.time_coverage() + assert mixin.half_orbit_range(phase="CALVAL") == mixin.half_orbit_range() + + class DownloadMixinMemory(DownloadMixin): @property diff --git a/tests/time/test_periods.py b/tests/time/test_periods.py index 1e0fc8b..2d57e5d 100644 --- a/tests/time/test_periods.py +++ b/tests/time/test_periods.py @@ -11,6 +11,11 @@ import numpy.typing as np_t +def test_inconsistent_period(): + with pytest.raises(ValueError): + Period(np.datetime64("2023-01-02"), np.datetime64("2023-01-01")) + + @pytest.fixture def reference() -> Period: return Period(np.datetime64("2023-01-01"), np.datetime64("2023-02-01")) From 41e52b3c7312685add4cd6401edeccd041a311a6 Mon Sep 17 00:00:00 2001 From: rchevrier Date: Tue, 12 May 2026 09:08:27 +0000 Subject: [PATCH 5/6] Add half orbit mixin to implementations --- src/fcollections/core/_filesdb.py | 7 +- .../implementations/_l2_lr_ssh.py | 11 ++- src/fcollections/implementations/_l3_lr_ww.py | 8 +- tests/fixtures/_l2_lr_ssh.py | 2 +- .../collections/test_l2_lr_ssh.py | 60 +++++++++++++ .../collections/test_l3_lr_ssh.py | 60 +++++++++++++ .../collections/test_l3_lr_windwave.py | 89 ++++++++++++++++++- 7 files changed, 230 insertions(+), 7 deletions(-) diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index 3ed07f9..ed25a86 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -842,8 +842,11 @@ def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: if unmix and self.unmixer is not None: self._pick_subset_before_files_scan(kwargs) - _, kwargs = self._auto_build_predicates_and_filters([], kwargs) - return {x[0] for x in metadata_collector.discover(**kwargs)} + edited_filters = kwargs.copy() + _, edited_filters = self._auto_build_predicates_and_filters( + [], edited_filters + ) + return {x[0] for x in metadata_collector.discover(**edited_filters)} except LayoutMismatchError: msg = ( "Layouts are enabled and should contain information about " diff --git a/src/fcollections/implementations/_l2_lr_ssh.py b/src/fcollections/implementations/_l2_lr_ssh.py index e49c210..e3a1d02 100644 --- a/src/fcollections/implementations/_l2_lr_ssh.py +++ b/src/fcollections/implementations/_l2_lr_ssh.py @@ -16,6 +16,7 @@ FileNameFieldPeriod, FileNameFieldString, FilesDatabase, + HalfOrbitMixin, ICodec, Layout, PeriodMixin, @@ -561,9 +562,15 @@ def __init__(self): ) -class BasicNetcdfFilesDatabaseSwotLRL2(FilesDatabase, PeriodMixin): +class BasicNetcdfFilesDatabaseSwotLRL2(FilesDatabase, HalfOrbitMixin, PeriodMixin): """Database mapping to select and read Swot LR L2 Netcdf files in a local - file system.""" + file system. + + Note + ---- + HalfOrbitMixin overrides the temporal coverage access (it is usually faster) + so it must be declared prior to the PeriodMixin. + """ layouts = [Layout([FileNameConventionSwotL2()]), AVISO_L2_LR_SSH_LAYOUT] reader = SwotReaderL2LRSSH() diff --git a/src/fcollections/implementations/_l3_lr_ww.py b/src/fcollections/implementations/_l3_lr_ww.py index 777ccb3..a95386f 100644 --- a/src/fcollections/implementations/_l3_lr_ww.py +++ b/src/fcollections/implementations/_l3_lr_ww.py @@ -9,6 +9,7 @@ FileNameFieldPeriod, FileNameFieldString, FilesDatabase, + HalfOrbitMixin, Layout, PeriodMixin, SubsetsUnmixer, @@ -62,9 +63,14 @@ def __init__(self): ) -class BasicNetcdfFilesDatabaseSwotLRWW(FilesDatabase, PeriodMixin): +class BasicNetcdfFilesDatabaseSwotLRWW(FilesDatabase, HalfOrbitMixin, PeriodMixin): """Database mapping to explore and read the L3_LR_WIND_WAVE product. + Note + ---- + HalfOrbitMixin overrides the temporal coverage access (it is usually faster) + so it must be declared prior to the PeriodMixin. + See Also -------- fcollections.implementations.AVISO_L3_LR_WINDWAVE_LAYOUT diff --git a/tests/fixtures/_l2_lr_ssh.py b/tests/fixtures/_l2_lr_ssh.py index 3e8e539..b3e3846 100644 --- a/tests/fixtures/_l2_lr_ssh.py +++ b/tests/fixtures/_l2_lr_ssh.py @@ -181,7 +181,7 @@ def l2_lr_ssh_dir( # Create fake data for the netcdf files, do not just use touch() to simulate # the file tree - root_dir = Path(tmpdir_factory.mktemp("l3_lr_ssh")) + root_dir = Path(tmpdir_factory.mktemp("l2_lr_ssh")) for file in l2_lr_ssh_files: relative_path = Path(file) if "Basic" in relative_path.name: diff --git a/tests/implementations/collections/test_l2_lr_ssh.py b/tests/implementations/collections/test_l2_lr_ssh.py index 480f460..c926e10 100644 --- a/tests/implementations/collections/test_l2_lr_ssh.py +++ b/tests/implementations/collections/test_l2_lr_ssh.py @@ -736,3 +736,63 @@ def test_subsets_flat(l2_lr_ssh_dir_empty_files: Path): with pytest.warns(PerformanceWarning): assert len(db.subsets) == len(expected) assert all([x in expected for x in db.subsets]) + + +class TestHalfOrbitMixin: + + def test_subsets_mixed(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + with pytest.raises(ValueError, match="unmixed"): + db.half_orbit_range() + + def test_half_orbit_range(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + assert db.half_orbit_range(subset="Basic") == ((482, 11), (546, 11)) + + def test_half_orbit_range_filtered(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + assert db.half_orbit_range(subset="Basic", version="P?C?") == ( + (482, 11), + (483, 26), + ) + + def test_half_orbit_range_phase(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + assert db.half_orbit_range(subset="Expert", phase="SCIENCE") == ( + (6, 11), + (7, 533), + ) + + def test_half_orbit_range_phase_empty(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + assert db.half_orbit_range(subset="Basic", phase="SCIENCE") is None + + def test_half_orbit_range_phase_layout( + self, l2_lr_ssh_dir_empty_files_layout: Path + ): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir_empty_files_layout) + assert db.half_orbit_range(subset="Expert", phase="SCIENCE") == ( + (6, 11), + (7, 533), + ) + + def test_temporal_coverage(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + assert db.time_coverage(subset="Unsmoothed") == Period( + np.datetime64("2024-01-25T02:53:52"), np.datetime64("2024-01-25T03:44:38") + ) + + def test_temporal_coverage_layout(self, l2_lr_ssh_dir_empty_files_layout: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir_empty_files_layout) + assert db.time_coverage(subset="Unsmoothed") == Period( + np.datetime64("2024-01-25T02:53:52"), np.datetime64("2024-01-25T03:44:38") + ) + + def test_cycle_range(self, l2_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir, enable_layouts=False) + with pytest.warns(PerformanceWarning): + assert db.cycle_range(subset="Expert") == (6, 577) + + def test_cycle_range_layout(self, l2_lr_ssh_dir_empty_files_layout: Path): + db = NetcdfFilesDatabaseSwotLRL2(l2_lr_ssh_dir_empty_files_layout) + assert db.cycle_range(subset="Expert") == (6, 577) diff --git a/tests/implementations/collections/test_l3_lr_ssh.py b/tests/implementations/collections/test_l3_lr_ssh.py index 69f1ced..8986dfc 100644 --- a/tests/implementations/collections/test_l3_lr_ssh.py +++ b/tests/implementations/collections/test_l3_lr_ssh.py @@ -785,3 +785,63 @@ def test_subsets(l3_lr_ssh_dir: Path): with pytest.warns(PerformanceWarning): assert len(db.subsets) == len(expected) assert all([subset in expected for subset in db.subsets]) + + +class TestHalfOrbitMixin: + + def test_subsets_mixed(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + with pytest.raises(ValueError, match="unmixed"): + db.half_orbit_range() + + def test_half_orbit_range(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + assert db.half_orbit_range(subset="Basic") == ((531, 25), (532, 26)) + + def test_half_orbit_range_filtered(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + assert db.half_orbit_range(subset="Basic", pass_number=25) == ( + (531, 25), + (532, 25), + ) + + def test_half_orbit_range_phase(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + assert db.half_orbit_range(subset="Basic", phase="CALVAL") == ( + (531, 25), + (532, 26), + ) + + def test_half_orbit_range_phase_empty(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + assert db.half_orbit_range(subset="Basic", phase="SCIENCE") is None + + def test_half_orbit_range_phase_layout( + self, l3_lr_ssh_dir_empty_files_layout: Path + ): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir_empty_files_layout) + assert db.half_orbit_range(subset="Basic", phase="CALVAL") == ( + (531, 25), + (532, 26), + ) + + def test_temporal_coverage(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + assert db.time_coverage(subset="Unsmoothed") == Period( + np.datetime64("2024-01-25T02:53:52"), np.datetime64("2024-01-25T03:44:38") + ) + + def test_temporal_coverage_layout(self, l3_lr_ssh_dir_empty_files_layout: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir_empty_files_layout) + assert db.time_coverage(subset="Unsmoothed") == Period( + np.datetime64("2024-01-25T02:53:52"), np.datetime64("2024-01-25T03:44:38") + ) + + def test_cycle_range(self, l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + with pytest.warns(PerformanceWarning): + assert db.cycle_range(subset="Expert") == (532, 533) + + def test_cycle_range_layout(self, l3_lr_ssh_dir_empty_files_layout: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir_empty_files_layout) + assert db.cycle_range(subset="Basic") == (531, 532) diff --git a/tests/implementations/collections/test_l3_lr_windwave.py b/tests/implementations/collections/test_l3_lr_windwave.py index 265941a..e1e737f 100644 --- a/tests/implementations/collections/test_l3_lr_windwave.py +++ b/tests/implementations/collections/test_l3_lr_windwave.py @@ -1,5 +1,6 @@ import os import typing as tp +from contextlib import nullcontext from pathlib import Path import numpy as np @@ -8,7 +9,7 @@ from fsspec.implementations.local import LocalFileSystem from utils import brute_force_geographical_selection -from fcollections.core import DirNode, FileSystemMetadataCollector +from fcollections.core import DirNode, FileSystemMetadataCollector, PerformanceWarning from fcollections.implementations import ( AVISO_L3_LR_WINDWAVE_LAYOUT, NetcdfFilesDatabaseSwotLRWW, @@ -412,3 +413,89 @@ def test_list_layout( expected = {os.path.basename(l3_lr_ww_files[ii]) for ii in expected} assert len(expected) > 0 assert expected == actual + + +class TestHalfOrbitMixin: + + @pytest.mark.parametrize( + "enable_layouts, filters, context, expected", + [ + (True, {}, pytest.raises(ValueError, match="unmixed"), None), + (False, {}, pytest.raises(ValueError, match="unmixed"), None), + (True, {"subset": "Light"}, nullcontext(), ((482, 11), (482, 12))), + (False, {"subset": "Light"}, nullcontext(), ((482, 11), (482, 12))), + ( + True, + {"subset": "Light", "pass_number": 11}, + nullcontext(), + ((482, 11), (482, 11)), + ), + ( + False, + {"subset": "Light", "pass_number": 11}, + nullcontext(), + ((482, 11), (482, 11)), + ), + ( + True, + {"subset": "Extended", "phase": "SCIENCE"}, + nullcontext(), + ((10, 10), (10, 10)), + ), + ( + False, + {"subset": "Extended", "phase": "SCIENCE"}, + nullcontext(), + ((10, 10), (10, 10)), + ), + (True, {"subset": "Extended", "phase": "CALVAL"}, nullcontext(), None), + (False, {"subset": "Extended", "phase": "CALVAL"}, nullcontext(), None), + ], + ids=[ + "missing_subset_key_layout", + "missing_subset_key_no_layout", + "auto_version_key_layout", + "auto_version_key_no_layout_warns", + "filtered_range_layout", + "filtered_range_no_layout", + "range_for_phase_layout", + "range_for_phase_no_layout", + "no_data_layout", + "no_data_no_layout", + ], + ) + def test_half_orbit_range( + self, + l3_lr_ww_dir_layout: Path, + enable_layouts: bool, + filters: dict[str, tp.Any], + context, + expected: tuple[tuple[int, int], tuple[int, int]] | None, + ): + db = NetcdfFilesDatabaseSwotLRWW( + l3_lr_ww_dir_layout, enable_layouts=enable_layouts + ) + with context: + assert db.half_orbit_range(**filters) == expected + + @pytest.mark.parametrize("enable_layouts", [True, False]) + def test_temporal_coverage(self, l3_lr_ww_dir_layout: Path, enable_layouts: bool): + db = NetcdfFilesDatabaseSwotLRWW( + l3_lr_ww_dir_layout, enable_layouts=enable_layouts + ) + assert db.time_coverage(subset="Extended") == Period( + np.datetime64("2024-01-25T02:53:52"), np.datetime64("2024-01-25T03:44:38") + ) + + @pytest.mark.parametrize( + "enable_layouts, context", + [(True, nullcontext()), (False, pytest.warns(PerformanceWarning))], + ) + def test_cycle_range( + self, l3_lr_ww_dir_layout: Path, enable_layouts: bool, context + ): + db = NetcdfFilesDatabaseSwotLRWW( + l3_lr_ww_dir_layout, enable_layouts=enable_layouts + ) + with context: + assert db.cycle_range(subset="Light") == (482, 482) From 0e12536e9a6517bb51589eed17ac56793975d89e Mon Sep 17 00:00:00 2001 From: rchevrier Date: Tue, 12 May 2026 09:25:08 +0000 Subject: [PATCH 6/6] Add documentation for half orbit mixin --- docs/custom.md | 8 ++++++-- docs/implementations/l2_lr_ssh.md | 29 +++++++++++++++++++++++++++++ docs/implementations/l3_lr_ssh.md | 29 +++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/docs/custom.md b/docs/custom.md index 9be1ba8..04d691e 100644 --- a/docs/custom.md +++ b/docs/custom.md @@ -394,6 +394,8 @@ set this field in the subset unmixer instead, enforcing one version per subset ``` +(mixins)= + ## Mixins A custom {class}`FilesDatabase ` may need additional functionalities apart from @@ -404,10 +406,12 @@ classes adding the functionalities are abstract, they should be mixed with other classes to get a complete implementation: these abstract classes are then called ``mixins``. -Two mixins are currently available: +Three mixins are currently available: - {class}`PeriodMixin `: works with time series - and can analyze the data to get the time coverage or detect holes + and can analyze the data to get the time coverage or detect holes. +- {class}`HalfOrbitMixin `: works with half + orbit granules to extract the half orbit range. - {class}`DownloadMixin `. Appends a download endpoint to a remote database. diff --git a/docs/implementations/l2_lr_ssh.md b/docs/implementations/l2_lr_ssh.md index fcdcdf7..19306f7 100644 --- a/docs/implementations/l2_lr_ssh.md +++ b/docs/implementations/l2_lr_ssh.md @@ -126,6 +126,35 @@ The following examples can be used to build complex queries ::: :::: +## Generic information + +Generic information about the files set can be extracted. This available +information is specific to the [mixins](#mixins) used to build +{class}`fcollections.implementations.NetcdfFilesDatabaseSwotLRL2` + +::::{tab-set} +:::{tab-item} PeriodMixin + - Time coverage + ```python + fc.time_coverage(subset='Expert', version='P?D?', phase='SCIENCE') + ``` + - Time holes + ```python + fc.time_holes(subset='Expert', version='P?D?', phase='SCIENCE') + ``` +::: +:::{tab-item} HalfOrbitMixin + - Half orbit range + ```python + fc.half_orbit_range(subset='Expert', version='P?D?', phase='SCIENCE') + ``` + - Cycle range + ```python + fc.cycle_range(subset='Expert', version='P?D?', phase='SCIENCE') + ``` +::: +:::: + ## Stack for temporal analysis diff --git a/docs/implementations/l3_lr_ssh.md b/docs/implementations/l3_lr_ssh.md index 82660c2..124a4e6 100644 --- a/docs/implementations/l3_lr_ssh.md +++ b/docs/implementations/l3_lr_ssh.md @@ -114,6 +114,35 @@ The following examples can be used to build complex queries ::: :::: +## Generic information + +Generic information about the files set can be extracted. This available +information is specific to the [mixins](#mixins) used to build +{class}`fcollections.implementations.NetcdfFilesDatabaseSwotLRL3` + +::::{tab-set} +:::{tab-item} PeriodMixin + - Time coverage + ```python + fc.time_coverage(subset='Expert') + ``` + - Time holes + ```python + fc.time_holes(subset='Expert') + ``` +::: +:::{tab-item} HalfOrbitMixin + - Half orbit range + ```python + fc.half_orbit_range(subset='Expert', phase='SCIENCE') + ``` + - Cycle range + ```python + fc.cycle_range(subset='Expert', phase='SCIENCE') + ``` +::: +:::: + ## Stack for temporal analysis The most prominent functionality is the ability to stack the half orbits when