From 3cee2ca75616ad4a433466a94034a558717844ed Mon Sep 17 00:00:00 2001 From: rchevrier Date: Fri, 1 May 2026 15:34:16 +0000 Subject: [PATCH 1/9] First implementation --- src/fcollections/core/_filesdb.py | 128 +++++++++++++++++++++++++++++- src/fcollections/core/_listing.py | 18 ++++- 2 files changed, 143 insertions(+), 3 deletions(-) diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index c72996d..eb9df61 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -10,6 +10,7 @@ import typing as tp import warnings from abc import ABCMeta +from copy import copy import docstring_parser as dcs import pandas as pda @@ -45,6 +46,7 @@ def __new__(cls, clsname, bases, attrs): _create_method(attrs, "query", "_query") _create_method(attrs, "list_files", "_files") _create_method(attrs, "variables_info", "_variables_info") + _create_method(attrs, "filter_info", "_filter_info") _create_method(attrs, "map", "_map") new_class = super().__new__(cls, clsname, bases, attrs) @@ -88,7 +90,13 @@ def __new__(cls, clsname, bases, attrs): new_class._variables_info.__doc__, *method_parameters["variables_info"], ) - + _patch_method( + new_class, + "filter_info", + "_filter_info", + new_class._filter_info.__doc__, + *method_parameters["filter_info"], + ) return new_class @@ -159,6 +167,7 @@ def _is_subset_key( map(lambda x: x[1], filter(_is_subset_key, parameters["convention"][1].items())) ) out["variables_info"] = (info_docstring, info_signature) + out["filter_info"] = (info_docstring, info_signature) return out @@ -604,6 +613,11 @@ def _variables_info(self, **kwargs) -> GroupMetadata | None: ValueError In case if one unique and homogeneous subset could not be extracted from the files metadata table + + See Also + -------- + subsets + To list the subsets keys. """ # This docstring will be superseded by the metaclass unknown = kwargs.keys() - ( @@ -667,6 +681,118 @@ def wrapped(record: dict[str, tp.Any]): bag = dask.bag.core.from_sequence(df.to_records()) return bag.map(wrapped) + def _filter_info(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: + """Returns the possible values for a given filter. + + Because the files collection may mix multiple subsets, we want to ensure + that we return the filter values for one subset only. The parameters of + this method are the subset partitioning keys and can be given by the + user. + + Parameters + ---------- + filter_name + Name of the filter for which the possible values should be + requested. + + Returns + ------- + : + A set containing the possible values for a given filter. + + Warns + ----- + UserWarning + If the requested filter information cannot be extracted from the + layouts' intermediate node. In this case, a full scan is triggered, + which will slow down the computation. + UserWarning + If layouts are not enabled for this instance. In this case, a full + scan is triggered, which will slow down the computation. + + Raises + ------ + LayoutMismatchError + In case ``enable_layouts`` is True and a mismatch between the + layouts and the actual files is detected + ValueError + In case if one unique and homogeneous subset could not be extracted + from the files metadata table + + See Also + -------- + subsets + To list the subsets keys. + """ + missing_partition_keys = set(self.unmixer.partition_keys) - set(kwargs.keys()) + if ( + filter_name not in self.unmixer.partition_keys + and len(self.subsets) > 1 + and len(missing_partition_keys) > 0 + ): + msg = ( + "Cannot work on heterogenous datasets, subset should be chosen " + f"by providing the following filters: {self.unmixer.partition_keys} " + "(subsets can be displayed using the 'subsets' property)." + ) + raise ValueError(msg) + + if not self.enable_layouts: + msg = ( + "Layouts are not enabled, full scan is necessary to deduce " + f"possible values of filter {filter_name}. Enable layouts to " + "improve performance." + ) + warnings.warn(msg) + return set(self.list_files(**kwargs)[filter_name]) + + selected_layouts = [] + for layout in self.layouts: + for ii, convention in enumerate(layout.conventions[:-1]): + # We could also handle a more complex case with a + # record containing multiple pieces of information. However, the + # current payload returned by the Visitors must be reviewed + # beforehand + if ( + len(convention.fields) == 1 + and convention.fields[0].name == filter_name + ): + selected_layouts.append(Layout(layout.conventions[: ii + 1])) + break + + if len(selected_layouts) == 0: + msg = ( + "Layouts do not contain intermediate nodes (directories) " + f"with information about filter {filter_name}. Falling back " + "to full scan to deduce the filters' possible values (the " + "performance is degraded)." + ) + warnings.warn(msg) + return set(self.list_files(**kwargs)[filter_name]) + + metadata_collector = FileSystemMetadataCollector( + selected_layouts, self.discoverer.root_node + ) + return {x[0] for x in metadata_collector.discover(**kwargs)} + + @property + def subsets(self) -> list[dict[str, tp.Any]]: + """List of the subsets combinations.""" + return list(self._subsets(0)) + + def _subsets( + self, current_index: int, **higher_partition_values: tp.Any + ) -> tp.Generator[dict[str, tp.Any], None, None]: + current_key = self.unmixer.partition_keys[current_index] + current_values = self.filter_info(current_key, **higher_partition_values) + for current_value in current_values: + result = copy(higher_partition_values) + result[current_key] = current_value + if current_index < len(self.unmixer.partition_keys) - 1: + yield from self._subsets(current_index + 1, **result) + else: + yield result + @dc.dataclass class SubsetsUnmixer: diff --git a/src/fcollections/core/_listing.py b/src/fcollections/core/_listing.py index 2d2dfd1..4a2d589 100644 --- a/src/fcollections/core/_listing.py +++ b/src/fcollections/core/_listing.py @@ -576,6 +576,7 @@ def visit_dir(self, dir_node: DirNode) -> VisitResult: layouts_for_children: list[Layout] = [] record = None + return_payload = False for layout in self.layouts: # Prune non matching layouts for this directory. We need to test all # layouts to eliminate non matching layouts as early as possible in @@ -583,6 +584,15 @@ def visit_dir(self, dir_node: DirNode) -> VisitResult: result = layout.parse_node(dir_node.level - 1, dir_node.name) if result is not None: layouts_for_children.append(layout) + + # In case the layout leaf is a folder, the exploration must be + # stopped even though the actual DirNode has children. This is + # because the layout won't be able to go deeper. This case arises + # when we want to extract information at a given level in the file + # system hierarchy. + if result is not None and dir_node.level == len(layout.conventions): + return_payload = True + if record is None: # Do not override a valid record with a None record = result @@ -603,8 +613,12 @@ def visit_dir(self, dir_node: DirNode) -> VisitResult: ) return VisitResult(False) - # Don't return a payload for dir nodes (will be subject to change later) - return VisitResult(True, None, layouts_for_children) + # Return payload for dir nodes only if dir nodes are the last convention + # defined in the layouts. + if return_payload: + return VisitResult(False, record, []) + else: + return VisitResult(True, None, layouts_for_children) def visit_file(self, file_node: FileNode) -> VisitResult: """Visits a file node. From 25e53e0ba49f5bc4d81e05e9924c362283162c1f Mon Sep 17 00:00:00 2001 From: rchevrier Date: Mon, 4 May 2026 08:29:33 +0000 Subject: [PATCH 2/9] Split L2 and L3 data in doc --- .gitignore | 2 ++ docs/implementations/l2_lr_ssh.md | 2 +- docs/implementations/l3_lr_ssh.md | 2 +- .../scripts/pull_data_l2_lr_ssh.py | 2 +- .../scripts/pull_data_l3_lr_ssh.py | 2 +- src/fcollections/core/_filesdb.py | 20 +++++++++---------- src/fcollections/core/_listing.py | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index bd62804..860ba5d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ dist coverage.xml jupyter_execute docs/implementations/data +docs/implementations/data_l2_lr_ssh +docs/implementations/data_l3_lr_ssh diff --git a/docs/implementations/l2_lr_ssh.md b/docs/implementations/l2_lr_ssh.md index ab69416..f2f2849 100644 --- a/docs/implementations/l2_lr_ssh.md +++ b/docs/implementations/l2_lr_ssh.md @@ -140,7 +140,7 @@ There are currently three modes for stacking the half orbits are cropped and we need an additional dimension to reflect the spatial jump ```{code-cell} -fc = NetcdfFilesDatabaseSwotLRL2("data") +fc = NetcdfFilesDatabaseSwotLRL2("data_l2_lr_ssh") ds = fc.query(stack='CYCLES', cycle_number=[9, 10, 11], pass_number=10, subset='Basic') ds.ssha_karin_2.data ``` diff --git a/docs/implementations/l3_lr_ssh.md b/docs/implementations/l3_lr_ssh.md index a81ab61..02d201c 100644 --- a/docs/implementations/l3_lr_ssh.md +++ b/docs/implementations/l3_lr_ssh.md @@ -127,7 +127,7 @@ There are currently three modes for stacking the half orbits are cropped and we need an additional dimension to reflect the spatial jump ```{code-cell} -fc = NetcdfFilesDatabaseSwotLRL3("data") +fc = NetcdfFilesDatabaseSwotLRL3("data_l3_lr_ssh") ds = fc.query(stack='CYCLES', version='2.0.1', cycle_number=[1, 2, 3], pass_number=10, subset='Basic') ds.ssha_filtered.data ``` diff --git a/docs/implementations/scripts/pull_data_l2_lr_ssh.py b/docs/implementations/scripts/pull_data_l2_lr_ssh.py index 32358a1..b0a8a53 100644 --- a/docs/implementations/scripts/pull_data_l2_lr_ssh.py +++ b/docs/implementations/scripts/pull_data_l2_lr_ssh.py @@ -6,7 +6,7 @@ logging.basicConfig() logging.getLogger("altimetry_downloader_aviso").setLevel("INFO") -DATA_DIR = Path(__file__).resolve().parent.parent / "data" +DATA_DIR = Path(__file__).resolve().parent.parent / "data_l2_lr_ssh" DATA_DIR.mkdir(exist_ok=True) if __name__ == "__main__": diff --git a/docs/implementations/scripts/pull_data_l3_lr_ssh.py b/docs/implementations/scripts/pull_data_l3_lr_ssh.py index fe02351..58d17c8 100644 --- a/docs/implementations/scripts/pull_data_l3_lr_ssh.py +++ b/docs/implementations/scripts/pull_data_l3_lr_ssh.py @@ -6,7 +6,7 @@ logging.basicConfig() logging.getLogger("altimetry_downloader_aviso").setLevel("INFO") -DATA_DIR = Path(__file__).resolve().parent.parent / "data" +DATA_DIR = Path(__file__).resolve().parent.parent / "data_l3_lr_ssh" DATA_DIR.mkdir(exist_ok=True) if __name__ == "__main__": diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index eb9df61..9cca37c 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -590,34 +590,32 @@ def _query(self, **kwargs) -> xr_t.Dataset | None: return ds def _variables_info(self, **kwargs) -> GroupMetadata | None: - """Returns the variables metadata. + """Return metadata describing the variables. Because the files collection may mix multiple subsets, we want to ensure that we return the variables of one subset only. The parameters of this method are the subset partitioning keys and can be given by the user to ensure a consistent set of variables. If the input parameters are not sufficient to unmix the subsets, the user will be notified with a - ValueError + ``ValueError``. Returns ------- + GroupMetadata | None A GroupMetadata containing the variables, dimensions, attributes and subgroups. None is returned in case no files is found for the given subset + See Also + -------- + FilesDatabase.subsets: To list the subsets keys. + Raises ------ LayoutMismatchError - In case ``enable_layouts`` is True and a mismatch between the - layouts and the actual files is detected + Raised if ``enable_layouts`` is True and a mismatch is detected. ValueError - In case if one unique and homogeneous subset could not be extracted - from the files metadata table - - See Also - -------- - subsets - To list the subsets keys. + Raised if a unique and homogeneous subset cannot be extracted. """ # This docstring will be superseded by the metaclass unknown = kwargs.keys() - ( diff --git a/src/fcollections/core/_listing.py b/src/fcollections/core/_listing.py index 4a2d589..f306d66 100644 --- a/src/fcollections/core/_listing.py +++ b/src/fcollections/core/_listing.py @@ -523,7 +523,7 @@ def __init__( layouts: list[Layout], stat_fields: tp.Iterable[str] = tuple(), on_mismatch_directory: LayoutMismatchHandling = LayoutMismatchHandling.RAISE, - on_mismatch_file: LayoutMismatchHandling = LayoutMismatchHandling.IGNORE, + on_mismatch_file: LayoutMismatchHandling = LayoutMismatchHandling.RAISE, ): self.layouts = layouts self.stat_fields = list(stat_fields) From a60b8359667ea72f70c96ae109b375c48e6dcee6 Mon Sep 17 00:00:00 2001 From: rchevrier Date: Mon, 4 May 2026 08:30:57 +0000 Subject: [PATCH 3/9] Rename filter_info in filter_values --- src/fcollections/core/_filesdb.py | 35 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index 9cca37c..13aeddf 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -18,7 +18,7 @@ from fsspec.implementations.local import LocalFileSystem from ._filenames import FileNameConvention -from ._listing import DirNode, FileSystemMetadataCollector, Layout +from ._listing import DirNode, FileSystemMetadataCollector, Layout, LayoutMismatchError from ._metadata import GroupMetadata from ._readers import IFilesReader @@ -46,7 +46,7 @@ def __new__(cls, clsname, bases, attrs): _create_method(attrs, "query", "_query") _create_method(attrs, "list_files", "_files") _create_method(attrs, "variables_info", "_variables_info") - _create_method(attrs, "filter_info", "_filter_info") + _create_method(attrs, "filter_values", "_filter_values") _create_method(attrs, "map", "_map") new_class = super().__new__(cls, clsname, bases, attrs) @@ -92,10 +92,10 @@ def __new__(cls, clsname, bases, attrs): ) _patch_method( new_class, - "filter_info", - "_filter_info", - new_class._filter_info.__doc__, - *method_parameters["filter_info"], + "filter_values", + "_filter_values", + new_class._filter_values.__doc__, + *method_parameters["filter_values"], ) return new_class @@ -167,7 +167,7 @@ def _is_subset_key( map(lambda x: x[1], filter(_is_subset_key, parameters["convention"][1].items())) ) out["variables_info"] = (info_docstring, info_signature) - out["filter_info"] = (info_docstring, info_signature) + out["filter_values"] = (info_docstring, info_signature) return out @@ -679,7 +679,7 @@ def wrapped(record: dict[str, tp.Any]): bag = dask.bag.core.from_sequence(df.to_records()) return bag.map(wrapped) - def _filter_info(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: + def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: """Returns the possible values for a given filter. Because the files collection may mix multiple subsets, we want to ensure @@ -738,7 +738,7 @@ def _filter_info(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: if not self.enable_layouts: msg = ( "Layouts are not enabled, full scan is necessary to deduce " - f"possible values of filter {filter_name}. Enable layouts to " + f"possible values of filter '{filter_name}'. Enable layouts to " "improve performance." ) warnings.warn(msg) @@ -761,7 +761,7 @@ def _filter_info(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: if len(selected_layouts) == 0: msg = ( "Layouts do not contain intermediate nodes (directories) " - f"with information about filter {filter_name}. Falling back " + f"with information about filter '{filter_name}'. Falling back " "to full scan to deduce the filters' possible values (the " "performance is degraded)." ) @@ -771,7 +771,18 @@ def _filter_info(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: metadata_collector = FileSystemMetadataCollector( selected_layouts, self.discoverer.root_node ) - return {x[0] for x in metadata_collector.discover(**kwargs)} + + try: + return {x[0] for x in metadata_collector.discover(**kwargs)} + except LayoutMismatchError: + logger.info( + "Layouts are enabled and should contain information about " + "filter '%s' in their intermediate nodes. However," + " the actual file organization does not match the layouts. " + "Falling back to full scan (the performance is degraded).", + filter_name, + ) + return set(self.list_files(**kwargs)[filter_name]) @property def subsets(self) -> list[dict[str, tp.Any]]: @@ -782,7 +793,7 @@ def _subsets( self, current_index: int, **higher_partition_values: tp.Any ) -> tp.Generator[dict[str, tp.Any], None, None]: current_key = self.unmixer.partition_keys[current_index] - current_values = self.filter_info(current_key, **higher_partition_values) + current_values = self._filter_values(current_key, **higher_partition_values) for current_value in current_values: result = copy(higher_partition_values) result[current_key] = current_value From 8cda056f3ac5145760cd38421bd19312a186ad2c Mon Sep 17 00:00:00 2001 From: rchevrier Date: Mon, 4 May 2026 10:45:21 +0000 Subject: [PATCH 4/9] add tests --- src/fcollections/core/_filesdb.py | 24 ++++ tests/core/test_filesdb.py | 132 +++++++++++++++--- tests/core/test_listing.py | 39 +++++- .../collections/test_l3_lr_ssh.py | 24 ++++ 4 files changed, 199 insertions(+), 20 deletions(-) diff --git a/src/fcollections/core/_filesdb.py b/src/fcollections/core/_filesdb.py index 13aeddf..40cafcb 100644 --- a/src/fcollections/core/_filesdb.py +++ b/src/fcollections/core/_filesdb.py @@ -4,6 +4,7 @@ import abc import dataclasses as dc +import functools import inspect import logging import textwrap @@ -716,12 +717,33 @@ def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: ValueError In case if one unique and homogeneous subset could not be extracted from the files metadata table + ValueError + In case the input filter name is in none of the configured layouts. See Also -------- subsets To list the subsets keys. """ + if self.enable_layouts: + conventions = map(lambda l: l.conventions, self.layouts) + conventions = functools.reduce(lambda a, b: a + b, conventions) + fields = map( + lambda convention: {f.name for f in convention.fields}, conventions + ) + field_names = functools.reduce(lambda a, b: a | b, fields) + else: + field_names = { + field.name for field in self.layouts[0].conventions[-1].fields + } + + if filter_name not in field_names: + msg = ( + f"Unknown filter name {filter_name}. Possible values are " + f"{field_names}" + ) + raise ValueError(msg) + missing_partition_keys = set(self.unmixer.partition_keys) - set(kwargs.keys()) if ( filter_name not in self.unmixer.partition_keys @@ -787,6 +809,8 @@ def _filter_values(self, filter_name: str, **kwargs: tp.Any) -> set[tp.Any]: @property def subsets(self) -> list[dict[str, tp.Any]]: """List of the subsets combinations.""" + if self.unmixer is None: + return [] return list(self._subsets(0)) def _subsets( diff --git a/tests/core/test_filesdb.py b/tests/core/test_filesdb.py index d8b665c..67a6e90 100644 --- a/tests/core/test_filesdb.py +++ b/tests/core/test_filesdb.py @@ -28,8 +28,6 @@ ) if tp.TYPE_CHECKING: - from pathlib import Path - import fsspec @@ -82,6 +80,10 @@ class FilesDatabaseTestNoUnmixer(FilesDatabase): Layout([FileNameConventionTest()]), Layout( [ + FileNameConvention( + regex=re.compile(r"^a_(?P\d{3})$"), + fields=[FileNameFieldInteger("a_number")], + ), FileNameConvention( regex=re.compile(r"^(?Pfoo|bar)$"), fields=[FileNameFieldString("b_string")], @@ -461,9 +463,20 @@ def test_query_metadata_injection(db_good_dim: FilesDatabaseTestGoodDim): @pytest.fixture(scope="session") def db_with_files_bad_layout() -> FilesDatabaseTest: fs = fs_mem.MemoryFileSystem() - fs.touch("baz/foo/a_file_001_20250101.nc") - fs.touch("baz/bar/a_file_002_20250101.nc") - db = FilesDatabaseTest(path="/", fs=fs) + fs.touch("/bad_layout/baz/a_001/foo/a_file_001_20250101.nc") + fs.touch("/bad_layout/baz/a_001/bar/a_file_001_20250101.nc") + fs.touch("/bad_layout/baz/a_002/bar/a_file_002_20250101.nc") + db = FilesDatabaseTest(path="/bad_layout", fs=fs) + return db + + +@pytest.fixture(scope="session") +def db_with_files_good_layout( + db_with_files_bad_layout: FilesDatabaseTest, +) -> FilesDatabaseTest: + fs = db_with_files_bad_layout.fs + fixed_path = Path(db_with_files_bad_layout.path) / "baz" + db = FilesDatabaseTest(fixed_path, fs, enable_layouts=True) return db @@ -472,31 +485,29 @@ def test_query_bad_layout(db_with_files_bad_layout: FilesDatabaseTest): db_with_files_bad_layout.query() -def test_query_bad_layout_fallback(db_with_files_bad_layout: FilesDatabaseTest): - fs = db_with_files_bad_layout.fs - fixed_path = Path(db_with_files_bad_layout.path) / "baz" - db = FilesDatabaseTest(fixed_path, fs, enable_layouts=True) - reference = db.query(a_number=1) +def test_query_bad_layout_fallback( + db_with_files_bad_layout: FilesDatabaseTest, + db_with_files_good_layout: FilesDatabaseTest, +): + reference = db_with_files_good_layout.query(a_number=1) assert reference is not None - db = FilesDatabaseTest(db_with_files_bad_layout.path, fs, enable_layouts=False) + db = FilesDatabaseTest( + db_with_files_bad_layout.path, db_with_files_bad_layout.fs, enable_layouts=False + ) actual = db.query(a_number=1) xr.testing.assert_equal(reference, actual) -def test_query_layout_parameter_not_known(db_with_files_bad_layout: FilesDatabaseTest): - fs = db_with_files_bad_layout.fs - fixed_path = Path(db_with_files_bad_layout.path) / "baz" - db = FilesDatabaseTest(fixed_path, fs, enable_layouts=True) - +def test_query_layout_parameter_not_known(db_with_files_good_layout: FilesDatabaseTest): # Low level interface knows of layout filters - df = db.discoverer.to_dataframe(b_string="foo") + df = db_with_files_good_layout.discoverer.to_dataframe(b_string="foo") assert len(df) > 0 # But higher level interface does not (yet) with pytest.raises(ValueError): - db.query(b_string="foo") + db_with_files_good_layout.query(b_string="foo") def test_map_no_dask(monkeypatch: pytest.MonkeyPatch, db_with_files: FilesDatabaseTest): @@ -528,5 +539,88 @@ def test_map_wrong_parameter( db_with_files.map(lambda x, y: None, **{parameter: value}) -def test_map_empty(db_with_files: FileDatabaseTest): +def test_map_empty(db_with_files: FilesDatabaseTest): assert db_with_files.map(lambda x, y: x, a_number=-1).compute() == [] + + +def test_subsets_no_unmixer(db_with_files_good_layout: FilesDatabaseTest): + db = FilesDatabaseTestNoUnmixer( + db_with_files_good_layout.path, db_with_files_good_layout.fs + ) + assert len(db.subsets) == 0 + + +def test_subsets_empty_dir(db_with_files_good_layout: FilesDatabaseTest): + db_with_files_good_layout.fs.mkdir("empty") + db = FilesDatabaseTest("empty", db_with_files_good_layout.fs) + assert len(db.subsets) == 0 + + +def test_subsets(db_with_files_good_layout: FilesDatabaseTest): + assert len(db_with_files_good_layout.subsets) == 2 + assert all( + [ + subset in [{"a_number": 1}, {"a_number": 2}] + for subset in db_with_files_good_layout.subsets + ] + ) + + +def test_filters_value_full_scan_filter_not_in_layout( + db_with_files_good_layout: FilesDatabaseTest, +): + with pytest.warns(UserWarning, match="intermediate"): + values = db_with_files_good_layout.filter_values("time", a_number=1) + assert values == {np.datetime64("2025-01-01")} + + +def test_filters_value_layouts_disabled_unknown_field( + db_with_files_bad_layout: FilesDatabaseTest, +): + db = FilesDatabaseTest( + db_with_files_bad_layout.path, db_with_files_bad_layout.fs, enable_layouts=False + ) + with pytest.raises(ValueError, match="Unknown"): + db.filter_values("b_string", a_number=1) + + +def test_filters_value_layouts_disabled_full_scan( + db_with_files_bad_layout: FilesDatabaseTest, +): + db = FilesDatabaseTest( + db_with_files_bad_layout.path, db_with_files_bad_layout.fs, enable_layouts=False + ) + with pytest.warns(UserWarning, match="enabled"): + values = db.filter_values( + "a_number", + ) + assert values == {1, 2} + + +def test_filters_value_full_scan_flat(db_with_files: FilesDatabaseTest): + values = db_with_files.filter_values("a_number") + assert values == {1, 2} + + +def test_filters_value_full_scan_layouts_mismatch( + db_with_files_bad_layout: FilesDatabaseTest, +): + with pytest.raises(LayoutMismatchError): + db_with_files_bad_layout.filter_values("b_string", a_number=1) + + +def test_filters_value_layout(db_with_files_good_layout: FilesDatabaseTest): + values = db_with_files_good_layout.filter_values("b_string", a_number=1) + assert values == {"foo", "bar"} + + +def test_filters_value_unknown(db_with_files_good_layout: FilesDatabaseTest): + with pytest.raises(ValueError, match="Unknown filter"): + db_with_files_good_layout.filter_values("c_unknown", a_number=1) + + +def test_filters_value_missing_subset_selection( + db_with_files_good_layout: FilesDatabaseTest, +): + with pytest.raises(ValueError, match="heterogenous datasets"): + db_with_files_good_layout.filter_values("b_string") diff --git a/tests/core/test_listing.py b/tests/core/test_listing.py index 46f3a06..6d2054d 100644 --- a/tests/core/test_listing.py +++ b/tests/core/test_listing.py @@ -456,6 +456,41 @@ def test_layout_visit_dir( assert result.surviving_layouts == [layouts_v2[ii] for ii in layouts_selection] +@pytest.fixture(scope="session") +def layouts_v3(layout: Layout) -> list[Layout]: + return [ + Layout([layout.conventions[0]]), + Layout([layout.conventions[1]]), + ] + + +@pytest.mark.parametrize( + "path, level, expected_explore_next, payload", + [ + ("root", 0, True, None), + ("root/RED", 1, False, (Color.RED,)), + ("root/HR_009", 1, False, ("HR", 9)), + ], +) +def test_layout_visit_dir_payload( + layouts_v3: list[Layout], + memory_fs: MemoryFileSystem, + memory_root: Path, + path: str, + level: int, + expected_explore_next: bool, + payload: None | tuple[tp.Any, ...], +): + + path = memory_root / path + node = DirNode(path.name, {"name": path.as_posix()}, memory_fs, level) + + visitor = LayoutVisitor(layouts_v3) + result = visitor.visit_dir(node) + assert result.explore_next == expected_explore_next + assert result.payload == payload + + @pytest.mark.parametrize( "path, context, on_mismatch", [ @@ -613,7 +648,9 @@ def test_walk_layout( for layout in layouts_v2: layout.set_filters(**filters) visitor = LayoutVisitor( - layouts_v2, on_mismatch_directory=LayoutMismatchHandling.IGNORE + layouts_v2, + on_mismatch_directory=LayoutMismatchHandling.IGNORE, + on_mismatch_file=LayoutMismatchHandling.IGNORE, ) root_str = (memory_root / "root").as_posix() root_node = DirNode(root_str, {"name": root_str}, memory_fs, 0) diff --git a/tests/implementations/collections/test_l3_lr_ssh.py b/tests/implementations/collections/test_l3_lr_ssh.py index 43aa574..b5d5018 100644 --- a/tests/implementations/collections/test_l3_lr_ssh.py +++ b/tests/implementations/collections/test_l3_lr_ssh.py @@ -738,3 +738,27 @@ def test_list_swot_lr_l3_layout( assert set(map(tuple, actual.to_numpy())) == set( map(tuple, expected.to_numpy()) ) + + +def test_subsets(l3_lr_ssh_dir: Path): + db = NetcdfFilesDatabaseSwotLRL3(l3_lr_ssh_dir) + expected = [ + { + "subset": ProductSubset.Unsmoothed, + "version": "2.0.1", + }, + { + "subset": ProductSubset.Expert, + "version": "2.0.1", + }, + { + "subset": ProductSubset.Expert, + "version": "1.0.2", + }, + { + "subset": ProductSubset.Basic, + "version": "1.0.2", + }, + ] + assert len(db.subsets) == len(expected) + assert all([subset in expected for subset in db.subsets]) From cfab26c14f80ae5d2782b1e198b49c19a50594ce Mon Sep 17 00:00:00 2001 From: rchevrier Date: Tue, 5 May 2026 08:35:09 +0000 Subject: [PATCH 5/9] fix tests --- tests/fixtures/_ww.py | 3 +-- tests/implementations/test_docstrings.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/fixtures/_ww.py b/tests/fixtures/_ww.py index 4b42af1..97f1e90 100644 --- a/tests/fixtures/_ww.py +++ b/tests/fixtures/_ww.py @@ -128,7 +128,6 @@ def l3_lr_ww_files() -> list[str]: "v2_0/Light/cycle_482/SWOT_L3_LR_WIND_WAVE_482_011_20230406T051804_20230406T060909_v2.0.nc", "v2_0/Light/cycle_482/SWOT_L3_LR_WIND_WAVE_482_012_20230406T060910_20230406T070014_v2.0.nc", "v3_0/Extended/cycle_010/SWOT_L3_LR_WIND_WAVE_Extended_010_010_20240125T025352_20240125T034438_v3.0.nc", - "v2_0/Light/cycle_482/SWOT_L3_LR_WIND_WAVE_482_012_20230406T060910_20230406T070014_PIC2_v2.0.nc", ] @@ -136,7 +135,7 @@ def l3_lr_ww_files() -> list[str]: def l3_lr_ww_dir_layout( tmp_path_factory: pytest.TempPathFactory, l3_lr_ww_files: list[str] ) -> Path: - root = tmp_path_factory.mktemp("l2_lr_ssh") + root = tmp_path_factory.mktemp("l3_lr_ww") # create test folder for filepath in l3_lr_ww_files: diff --git a/tests/implementations/test_docstrings.py b/tests/implementations/test_docstrings.py index eb8246a..468746b 100644 --- a/tests/implementations/test_docstrings.py +++ b/tests/implementations/test_docstrings.py @@ -135,10 +135,10 @@ def test_query_docstring(db, keywords, request): @pytest.mark.parametrize( "db, keywords", [ - ("swot_lr_l2_db", ["variables metadata", "level", "subset"]), - ("swot_lr_l3_db", ["variables metadata"]), - ("nadir_db", ["variables metadata", "resolution", "sensor"]), - ("sst_db", ["variables metadata"]), + ("swot_lr_l2_db", ["metadata describing the variables", "level", "subset"]), + ("swot_lr_l3_db", ["metadata describing the variables"]), + ("nadir_db", ["metadata describing the variables", "resolution", "sensor"]), + ("sst_db", ["metadata describing the variables"]), ], ) def test_variables_info_docstring(db, keywords, request): From abebd5c6e68331083ad03283ef258d8d6a051887 Mon Sep 17 00:00:00 2001 From: rchevrier Date: Tue, 5 May 2026 11:30:32 +0000 Subject: [PATCH 6/9] Update documentation --- docs/advanced.md | 1 + docs/getting_started.md | 61 ++++++++++++++++--- src/fcollections/core/_listing.py | 12 ++++ .../implementations/_l2_lr_ssh.py | 15 ++--- tests/core/test_listing.py | 8 +++ .../collections/test_l2_lr_ssh.py | 26 ++++++++ tests/implementations/test_docstrings.py | 4 +- 7 files changed, 108 insertions(+), 19 deletions(-) diff --git a/docs/advanced.md b/docs/advanced.md index dc3d8e0..f1088c9 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -48,6 +48,7 @@ Remote file system listing can be quite long. Implementations are usually shipped with layouts for an improved listing speed. See the {ref}`Layout ` introduction if listing performance becomes an issue. +(disable-layouts)= ## Disable layouts diff --git a/docs/getting_started.md b/docs/getting_started.md index 54ee7fa..853ab7c 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -48,6 +48,7 @@ given criterias ```{code-cell} from fcollections.implementations import NetcdfFilesDatabaseSwotLRL2 + fc = NetcdfFilesDatabaseSwotLRL2(path) fc.list_files(cycle_number=1) ``` @@ -83,6 +84,8 @@ ds = fc.query(selected_variables=['ssha']) list(ds.variables) ``` +### Filter types + Each implementation has its own filters. By order of availability, the user should consult: @@ -95,23 +98,42 @@ should consult: fc.query? ``` +### Filter values + +Possible values for a given filter can be displayed + +```{code-cell} +fc.filter_values('version') +``` + +Only filters whose information are contained in the intermediated folders can be +scanned in a quick way, other will trigger a full scan. As such, to ensure +optimal performance, this method should be called with the layouts enabled, with +files organized with folders (see the [advanced section](#disable-layouts)), and +on filters whose information is encoded in the folders. + ## Access metadata The database can display information about the variables and attributes contained in the files' collection using the ``variables_info`` method ```{code-cell} -fc.variables_info(subset='Expert') +# Use the enumeration name for filtering a specific subset +fc.variables_info() ``` It will offer a simple collapsible tree view with multiple levels of nesting depending on the data you manipulate -In order to return consistent metadata, the method ensures that only one -homogeneous subset is selected. In case you handle unmixable data (for example -Expert and Unsmoothed datasets), you must give proper filters on the subset -partitioning keys ``fc.unmixer.partition_keys``. If these filters are missing, -an error with the possible choices will be raised. +## Subsets + +### Errors on mixed subsets + +In order to return consistent results, most methods must work on an homogeneous +subset of data. In case multiple subsets are mixed (for example Expert and +Unsmoothed datasets), proper filters matching the partitioning keys must be +given. If these filters are missing, an error with the possible choices will be +raised. ```{code-cell} :tags: [raises-exception] @@ -123,7 +145,32 @@ ds.to_netcdf(f'{path}/SWOT_L2_LR_SSH_Unsmoothed_001_012_20240101T030000_20240101 fc.variables_info() ``` +### Compatibility matrix + +The following table summarizes which methods can work on mixed data. Most +methods need homogeneous data and will require filtering the subset. + +| Method | Works on mixed data ? | +|--------------------|-----------------------| +| ``list_files`` | Yes | +| ``variables_info`` | No | +| ``filter_values`` | No | +| ``query`` | No | +| ``map`` | No | + +### Listing subsets + +Subsets that are on the file system can be listed using the +{meth}`subsets ` property. + +```{code-cell} +fc.subsets +``` + +One of the returned choices must be selected and used as a filter to work on an +homogeneous dataset. + ```{code-cell} -# Use the enumeration name for filtering +# Use the enumeration name for filtering a specific subset fc.variables_info(subset='Expert') ``` diff --git a/src/fcollections/core/_listing.py b/src/fcollections/core/_listing.py index f306d66..c2aafcc 100644 --- a/src/fcollections/core/_listing.py +++ b/src/fcollections/core/_listing.py @@ -264,6 +264,10 @@ def children(self) -> tp.Iterator[INode]: The child nodes, either files or folders """ + @abc.abstractmethod + def clear(self): + """Clear child nodes.""" + class FileNode(INode): """File node of a file system tree.""" @@ -281,6 +285,9 @@ def children(self) -> tp.Iterator[INode]: """ return [] + def clear(self): + pass + class DirNode(INode): """Directory node of a file system tree. @@ -326,6 +333,9 @@ def children(self) -> tp.Iterable[INode]: self._children = list(self._compute_children()) return self._children + def clear(self): + self._children = None + def _compute_children(self) -> tp.Iterator[INode]: # return list of FileNode or DirNode instances # Block of code extracted from fsspec and simplified (no topbottom @@ -798,6 +808,7 @@ def walk(node: INode, visitor: IVisitor) -> tp.Iterator[tp.Any]: return for child in node.children(): + logger.debug("child %s", child.name) yield from walk(child, visitor.advance(result)) @@ -929,6 +940,7 @@ def discover( # to modify the Layout interface layout.set_filters(**filters) + self.root_node.clear() if enable_layouts: logger.debug("Using layouts to speed up listing") visitor = LayoutVisitor(self.layouts, stat_fields) diff --git a/src/fcollections/implementations/_l2_lr_ssh.py b/src/fcollections/implementations/_l2_lr_ssh.py index dde774a..c43c06c 100644 --- a/src/fcollections/implementations/_l2_lr_ssh.py +++ b/src/fcollections/implementations/_l2_lr_ssh.py @@ -31,7 +31,7 @@ SWOT_L2_PATTERN = re.compile( - r"SWOT_(?P.*)_LR_SSH_(?P.*)_(?P\d{3})_(?P\d{3})_" + r"SWOT_L2_LR_SSH_(?P.*)_(?P\d{3})_(?P\d{3})_" r"(?P