diff --git a/changes/3612.feature.md b/changes/3612.feature.md
new file mode 100644
index 0000000000..821159d418
--- /dev/null
+++ b/changes/3612.feature.md
@@ -0,0 +1,3 @@
+Added a convenience method `copy_to` to `zarr.Group` for copying a group to a
+destination store, which may be of a different type than the store of the source
+`zarr.Group`. The group's metadata is copied over as-is.
\ No newline at end of file
diff --git a/docs/user-guide/groups.md b/docs/user-guide/groups.md
index 57201216b6..78f1f309b9 100644
--- a/docs/user-guide/groups.md
+++ b/docs/user-guide/groups.md
@@ -134,4 +134,12 @@ print(root.tree())
 ```
 
 !!! note
-    [`zarr.Group.tree`][] requires the optional [rich](https://rich.readthedocs.io/en/stable/) dependency. It can be installed with the `[tree]` extra.
\ No newline at end of file
+    [`zarr.Group.tree`][] requires the optional [rich](https://rich.readthedocs.io/en/stable/) dependency. It can be installed with the `[tree]` extra.
+
+You can copy a group, including its consolidated metadata, to a new destination store
+(whose type may differ from the source store's) using the [`zarr.Group.copy_to`][] method:
+
+```python exec="true" session="groups" source="above" result="ansi"
+destination_store = zarr.storage.MemoryStore()
+new_group = root.copy_to(destination_store, overwrite=True)
+```
\ No newline at end of file
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index 9b5fee275b..14c49f338f 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -472,6 +472,7 @@ async def from_store(
         store: StoreLike,
         *,
         attributes: dict[str, Any] | None = None,
+        consolidated_metadata: ConsolidatedMetadata | None = None,
         overwrite: bool = False,
         zarr_format: ZarrFormat = 3,
     ) -> AsyncGroup:
@@ -486,7 +487,11 @@ async def from_store(
         await ensure_no_existing_node(store_path, zarr_format=zarr_format)
         attributes = attributes or {}
         group = cls(
-            metadata=GroupMetadata(attributes=attributes, zarr_format=zarr_format),
+            metadata=GroupMetadata(
+                attributes=attributes,
+                consolidated_metadata=consolidated_metadata,
+                zarr_format=zarr_format,
+            ),
             store_path=store_path,
         )
         await group._save_metadata(ensure_parents=True)
@@ -697,6 +702,99 @@ def from_dict(
             store_path=store_path,
         )
 
+    async def copy_to(
+        self,
+        store: StoreLike,
+        *,
+        overwrite: bool = False,
+        use_consolidated_for_children: bool = True,
+    ) -> AsyncGroup:
+        """
+        Copy this group and all its contents to a new store.
+
+        Parameters
+        ----------
+        store : StoreLike
+            The store to copy to.
+        overwrite : bool, optional
+            If True, overwrite any existing data in the target store. Default is False.
+        use_consolidated_for_children : bool, default True
+            Whether to use the consolidated metadata of child groups when iterating over the store contents.
+            Note that this only affects groups loaded from the store. If the current Group already has
+            consolidated metadata, it will always be used.
+
+        Returns
+        -------
+        AsyncGroup
+            The new group in the target store.
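+
+        Examples
+        --------
+        A minimal sketch using in-memory stores (illustrative only: any
+        ``StoreLike`` works here, and the source and destination store types
+        may differ):
+
+        >>> import zarr
+        >>> src = await zarr.api.asynchronous.group(store=zarr.storage.MemoryStore())  # doctest: +SKIP
+        >>> copied = await src.copy_to(zarr.storage.MemoryStore(), overwrite=True)  # doctest: +SKIP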
+ """ + target_zarr_format = self.metadata.zarr_format + + new_group = await self.from_store( + store, + overwrite=overwrite, + attributes=self.metadata.attributes, + consolidated_metadata=self.metadata.consolidated_metadata, + zarr_format=target_zarr_format, + ) + + async for _, member in self.members( + max_depth=None, use_consolidated_for_children=use_consolidated_for_children + ): + child_path = member.store_path.path + target_path = StorePath(store=new_group.store, path=child_path) + + if isinstance(member, AsyncGroup): + await self.from_store( + store=target_path, + zarr_format=target_zarr_format, + overwrite=overwrite, + attributes=member.metadata.attributes, + consolidated_metadata=member.metadata.consolidated_metadata, + ) + else: + kwargs = {} + if target_zarr_format == 3: + kwargs["chunk_key_encoding"] = member.metadata.chunk_key_encoding + kwargs["dimension_names"] = member.metadata.dimension_names + else: + kwargs["chunk_key_encoding"] = { + "name": "v2", + "separator": member.metadata.dimension_separator, + } + # Serializer done this way in case of having zarr_format 2, otherwise mypy complains. + new_array = await new_group.create_array( + name=child_path, + shape=member.shape, + dtype=member.dtype, + chunks=member.chunks, + shards=member.shards, + filters=member.filters, + compressors=member.compressors, + serializer=member.serializer if member.serializer is not None else "auto", + fill_value=member.metadata.fill_value, + attributes=member.attrs, + overwrite=overwrite, + config={"order": member.order}, + **kwargs, + ) + + for region in member._iter_shard_regions(): + data = await member.getitem(selection=region) + await new_array.setitem(selection=region, value=data) + + return new_group + async def setitem(self, key: str, value: Any) -> None: """ Fastpath for creating a new array @@ -945,6 +1033,7 @@ async def create_group( *, overwrite: bool = False, attributes: dict[str, Any] | None = None, + consolidated_metadata: ConsolidatedMetadata | None = None, ) -> AsyncGroup: """Create a sub-group. @@ -956,6 +1045,9 @@ async def create_group( If True, do not raise an error if the group already exists. attributes : dict, optional Group attributes. + consolidated_metadata : ConsolidatedMetadata, optional + Consolidated Zarr metadata mapping that represents the entire hierarchy's + group and array metadata collected into a single dictionary. Returns ------- @@ -965,6 +1057,7 @@ async def create_group( return await type(self).from_store( self.store_path / name, attributes=attributes, + consolidated_metadata=consolidated_metadata, overwrite=overwrite, zarr_format=self.metadata.zarr_format, ) @@ -1810,6 +1903,7 @@ def from_store( store: StoreLike, *, attributes: dict[str, Any] | None = None, + consolidated_metadata: ConsolidatedMetadata | None = None, zarr_format: ZarrFormat = 3, overwrite: bool = False, ) -> Group: @@ -1823,6 +1917,8 @@ def from_store( for a description of all valid StoreLike values. attributes : dict, optional A dictionary of JSON-serializable values with user-defined attributes. + consolidated_metadata : ConsolidatedMetadata, optional + Consolidated Metadata for this Group. This should contain metadata of child nodes below this group. zarr_format : {2, 3}, optional Zarr storage format version. 
         overwrite : bool, optional
@@ -1842,6 +1938,7 @@ def from_store(
             AsyncGroup.from_store(
                 store,
                 attributes=attributes,
+                consolidated_metadata=consolidated_metadata,
                 overwrite=overwrite,
                 zarr_format=zarr_format,
             ),
@@ -1874,6 +1971,42 @@ def open(
         obj = sync(AsyncGroup.open(store, zarr_format=zarr_format))
         return cls(obj)
 
+    def copy_to(
+        self,
+        store: StoreLike,
+        *,
+        overwrite: bool = False,
+        use_consolidated_for_children: bool = True,
+    ) -> Group:
+        """
+        Copy this group and all its contents to a new store.
+
+        Parameters
+        ----------
+        store : StoreLike
+            The store to copy to.
+        overwrite : bool, optional
+            If True, overwrite any existing data in the target store. Default is False.
+        use_consolidated_for_children : bool, default True
+            Whether to use the consolidated metadata of child groups when iterating over the store contents.
+            Note that this only affects groups loaded from the store. If the current Group already has
+            consolidated metadata, it will always be used.
+
+        Returns
+        -------
+        Group
+            The new group in the target store.
+        """
+        return Group(
+            sync(
+                self._async_group.copy_to(
+                    store=store,
+                    overwrite=overwrite,
+                    use_consolidated_for_children=use_consolidated_for_children,
+                )
+            )
+        )
+
     def __getitem__(self, path: str) -> AnyArray | Group:
         """Obtain a group member.
 
@@ -2392,13 +2525,26 @@ def tree(self, expand: bool | None = None, level: int | None = None) -> Any:
         """
         return self._sync(self._async_group.tree(expand=expand, level=level))
 
-    def create_group(self, name: str, **kwargs: Any) -> Group:
+    def create_group(
+        self,
+        name: str,
+        overwrite: bool = False,
+        attributes: dict[str, Any] | None = None,
+        consolidated_metadata: ConsolidatedMetadata | None = None,
+    ) -> Group:
         """Create a sub-group.
 
         Parameters
         ----------
         name : str
             Name of the new subgroup.
+        overwrite : bool, optional
+            If True, do not raise an error if the group already exists.
+        attributes : dict, optional
+            Group attributes.
+        consolidated_metadata : ConsolidatedMetadata, optional
+            Consolidated Zarr metadata mapping that represents the entire hierarchy's
+            group and array metadata collected into a single dictionary.
 
         Returns
         -------
@@ -2412,7 +2558,16 @@ def create_group(self, name: str, **kwargs: Any) -> Group:
 
         >>> subgroup
         """
-        return Group(self._sync(self._async_group.create_group(name, **kwargs)))
+        return Group(
+            self._sync(
+                self._async_group.create_group(
+                    name,
+                    overwrite=overwrite,
+                    attributes=attributes,
+                    consolidated_metadata=consolidated_metadata,
+                )
+            )
+        )
 
     def require_group(self, name: str, **kwargs: Any) -> Group:
         """Obtain a sub-group, creating one if it doesn't exist.
diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
index 5eb17214fe..41deb04e6e 100644
--- a/src/zarr/testing/strategies.py
+++ b/src/zarr/testing/strategies.py
@@ -352,7 +352,8 @@ def basic_indices(
     allow_ellipsis: bool = True,
 ) -> Any:
     """Basic indices without unsupported negative slices."""
-    strategy = npst.basic_indices(
+    # Safe to ignore: the error comes from the numpy type hints using Literal[True]/Literal[False] overload variants.
+    strategy = npst.basic_indices(  # type: ignore[call-overload]
        shape=shape,
        min_dims=min_dims,
        max_dims=max_dims,
@@ -362,7 +363,7 @@ def basic_indices(
         lambda idxr: (
             not (
                 is_negative_slice(idxr)
-                or (isinstance(idxr, tuple) and any(is_negative_slice(idx) for idx in idxr))  # type: ignore[redundant-expr]
+                or (isinstance(idxr, tuple) and any(is_negative_slice(idx) for idx in idxr))
             )
         )
     )
diff --git a/tests/test_group.py b/tests/test_group.py
index 6f1f4e68fa..37af174ab7 100644
--- a/tests/test_group.py
+++ b/tests/test_group.py
@@ -63,6 +63,89 @@
 from zarr.core.common import JSON, ZarrFormat
 
 
+def is_deprecated(method):
+    """Check if a method is marked as deprecated."""
+    # Check for the @deprecated decorator, both directly and on wrapped methods.
+    return hasattr(method, "__deprecated__") or (
+        hasattr(method, "__wrapped__") and hasattr(method.__wrapped__, "__deprecated__")
+    )
+
+
+def get_method_names(cls):
+    """Extract public method names from a class, excluding deprecated methods."""
+    return [
+        name
+        for name, method in inspect.getmembers(cls, predicate=inspect.isfunction)
+        if not name.startswith("_") and not is_deprecated(method)
+    ]
+
+
+def get_method_signature(cls, method_name: str):
+    """Get the signature of a method from a class."""
+    method = getattr(cls, method_name)
+    sig = inspect.signature(method)
+    return {name: param for name, param in sig.parameters.items() if name != "self"}
+
+
+# TODO: Go through the methods in skip_methods one by one and fix the mismatches.
+@pytest.mark.parametrize(
+    ("sync_class", "async_class", "skip_methods"),
+    [
+        (
+            Group,
+            AsyncGroup,
+            ["create", "update_attributes_async", "get", "require_array", "require_group"],
+        )
+    ],
+)
+def test_class_method_parameters_match(sync_class, async_class, skip_methods) -> None:
+    """
+    Test that sync classes and their async counterparts expose matching methods.
+
+    For each public method, checks that the sync and async parameters match
+    in name, annotation, and default value.
+    """
+
+    method_names = get_method_names(sync_class)
+    for method in skip_methods:
+        method_names.remove(method)
+
+    for method_name in method_names:
+        assert hasattr(async_class, method_name), (
+            f"Async class {async_class.__name__} missing method '{method_name}'"
+        )
+
+        sync_params = get_method_signature(sync_class, method_name)
+        async_params = get_method_signature(async_class, method_name)
+
+        sync_param_names = set(sync_params.keys())
+        async_param_names = set(async_params.keys())
+
+        assert sync_param_names == async_param_names, (
+            f"Parameter names don't match for '{method_name}'. "
" + f"Sync: {sync_param_names}, Async: {async_param_names}" + ) + + mismatches = [] + for param_name in sync_params: + sync_param = sync_params[param_name] + async_param = async_params[param_name] + + if sync_param.annotation != async_param.annotation: + mismatches.append( + f"{param_name}: annotation mismatch " + f"(sync: {sync_param.annotation}, async: {async_param.annotation})" + ) + + if sync_param.default != async_param.default: + mismatches.append( + f"{param_name}: default mismatch " + f"(sync: {sync_param.default}, async: {async_param.default})" + ) + + assert mismatches == [], f"Parameter mismatches in '{method_name}': {mismatches}" + + @pytest.fixture(params=["local", "memory", "zip"]) async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: result = await parse_store(request.param, str(tmpdir)) @@ -250,6 +333,78 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad members_observed = group.members(max_depth=-1) +@pytest.mark.parametrize( + ("zarr_format", "shards", "consolidate_metadata"), + [ + (2, None, False), + (2, None, True), + (3, (50,), False), + (3, (50,), True), + ], +) +def test_copy_to(zarr_format: int, shards: tuple[int, ...], consolidate_metadata: bool) -> None: + src_store = MemoryStore() + src = Group.from_store(src_store, attributes={"root": True}, zarr_format=zarr_format) + + subgroup = src.create_group("subgroup", attributes={"subgroup": True}) + + subgroup_arr_data = np.arange(50) + subgroup.create_array( + "subgroup_dataset", + shape=(50,), + chunks=(10,), + shards=shards, + dtype=subgroup_arr_data.dtype, + ) + subgroup["subgroup_dataset"] = subgroup_arr_data + + arr_data = np.arange(100) + src.create_array( + "dataset", + shape=(100,), + chunks=(10,), + shards=shards, + dtype=arr_data.dtype, + ) + src["dataset"] = arr_data + + if consolidate_metadata: + if zarr_format == 3: + with pytest.warns(ZarrUserWarning, match="Consolidated metadata is currently"): + src = zarr.consolidate_metadata(src_store) + with pytest.warns(ZarrUserWarning, match="Consolidated metadata is currently"): + zarr.consolidate_metadata(src_store, path="subgroup") + else: + src = zarr.consolidate_metadata(src_store) + zarr.consolidate_metadata(src_store, path="subgroup") + + dst_store = MemoryStore() + + dst = src.copy_to(dst_store, overwrite=True) + + assert dst.attrs.get("root") is True + + subgroup = dst["subgroup"] + assert isinstance(subgroup, Group) + assert subgroup.attrs.get("subgroup") is True + + copied_arr = dst["dataset"] + copied_data = copied_arr[:] + assert np.array_equal(copied_data, arr_data) + + copied_subgroup_arr = subgroup["subgroup_dataset"] + copied_subgroup_data = copied_subgroup_arr[:] + assert np.array_equal(copied_subgroup_data, subgroup_arr_data) + + if consolidate_metadata: + assert zarr.open_group(dst_store).metadata.consolidated_metadata + if zarr_format == 3: + assert zarr.open_group(dst_store, path="subgroup").metadata.consolidated_metadata + else: + assert not zarr.open_group(dst_store).metadata.consolidated_metadata + assert not zarr.open_group(dst_store, path="subgroup").metadata.consolidated_metadata + + def test_group(store: Store, zarr_format: ZarrFormat) -> None: """ Test basic Group routines.