diff --git a/Cargo.lock b/Cargo.lock index 9bc5b87..a5d03a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3095,6 +3095,7 @@ dependencies = [ "chrono", "chrono-tz", "indexmap 2.14.0", + "inventory", "libc", "once_cell", "portable-atomic", diff --git a/Cargo.toml b/Cargo.toml index 0f63e8b..8eac120 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ icechunk = { version = "2.0.0", default-features = false } ndarray = "0.17.2" numpy = { version = "0.29", features = ["half"] } object_store = "0.13.2" -pyo3 = { version = "0.29", features = ["macros", "abi3-py311"] } +pyo3 = { version = "0.29", features = ["macros", "abi3-py311", "multiple-pymethods"] } pyo3-arrow = "0.19" pyo3-async-runtimes = { version = "0.29", features = ["tokio-runtime"] } pyo3-bytes = "0.7.1" diff --git a/dev-docs/specs/2026-06-22-shared-pymethods-macros-design.md b/dev-docs/specs/2026-06-22-shared-pymethods-macros-design.md new file mode 100644 index 0000000..67d94dd --- /dev/null +++ b/dev-docs/specs/2026-06-22-shared-pymethods-macros-design.md @@ -0,0 +1,112 @@ +# Shared `#[pymethods]` macros for sync/async Array & Group + +**Date:** 2026-06-22 +**Status:** Approved + +## Problem + +`PyArray`/`PyAsyncArray` (and `PyGroup`/`PyAsyncGroup`) duplicate a set of +metadata accessors that are byte-for-byte identical between the sync and async +variants. These accessors only read from `self.inner` and perform no I/O, so +they do not differ across the sync/async split — yet they are copy-pasted into +both `#[pymethods]` blocks. As more upstream `zarrs` `Group` methods get +exposed, this duplication will grow. + +## Constraints / why not the obvious alternatives + +- **No generic Rust base pyclass.** `#[pyclass]` cannot be generic; pyo3 must + register one concrete type with the interpreter. `#[pyclass] struct + PyBaseArray` will not compile. +- **pyo3 inheritance does not share logic here.** A base `#[pymethods]` block + can only access *base* fields, but the storage-typed `inner` lives in the + subclass. A Rust base could provide `isinstance` but could not implement + `shape`/`dtype`/etc. +- **Python API stays unchanged.** Goal is internal Rust DRY only. A + Python-visible base class / Protocol is explicitly *not* a requirement and is + out of scope for this change (could be added later as a pure-Python layer). + +## Design + +Use a `macro_rules!` macro per node type that expands to a **complete, separate +`#[pymethods] impl $ty` block**. The macro is parameterized over the concrete +type (`$ty:ty`) and invoked at module level (not inside an existing +`#[pymethods]` block), so it re-expands against each concrete type and sidesteps +the generics limitation. This works for both sync and async because async's +`inner: Arc>` derefs to `Array<…>`, so `self.inner.shape()` etc. +compile identically. + +### Why a whole `#[pymethods]` block (and `multiple-pymethods`) + +A macro invocation *inside* a `#[pymethods]` block does not work: pyo3's +proc-macro processes the impl block before the `macro_rules!` invocation is +expanded, so it never sees the generated methods (`error: macros cannot be used +as items in #[pymethods] impl blocks`). The macro must therefore emit its own +`#[::pyo3::pymethods] impl $ty { … }` block. + +Having two `#[pymethods]` blocks for one type (the macro-generated one plus the +hand-written type-specific one) requires the **`multiple-pymethods`** pyo3 +feature. This is enabled in `Cargo.toml`. It is verified compatible with the +crate's `abi3-py311` feature, and pulls in `inventory`, which is already a +transitive dependency via pyo3 — so no meaningful new dependency. + +### `src/array/shared.rs` + +Defines `array_metadata_accessors!($ty)` emitting a `#[pymethods] impl $ty` +block with these 9 `#[getter]` accessors: + +- `attrs`, `chunk_grid`, `codecs`, `dimension_names`, `dtype`, `metadata`, + `ndim`, `path`, `shape` + +`metadata` returns `PyArrayMetadata` (the metadata newtype handles +pythonization via its `IntoPyObject` impl) rather than calling `pythonize` +inline, mirroring the group accessor. + +Exported via `pub(crate) use array_metadata_accessors;`. Uses `$crate::`-rooted +paths for referenced types (`PyChunkGrid`, `PyCodecChain`, `PyDataType`, +`PyArrayMetadata`) and fully-qualified `::pyo3::` / `::pythonize::` paths so the +macro is hygienic regardless of the caller's imports. + +### `src/group/shared.rs` + +Defines `group_metadata_accessors!($ty)` emitting a `#[pymethods] impl $ty` +block with the storage-agnostic, no-I/O `Group` accessors: + +- `attrs` (`#[getter]`, pythonized attributes map) +- `metadata` (`#[getter]`, returns `PyGroupMetadata` — the newtype's + `IntoPyObject` impl handles pythonization, so the getter just clones + `self.inner.metadata()` and `.into()`s it; no inline `pythonize`) +- `consolidated_metadata` (`#[getter]`, returns + `Option` — `self.inner.consolidated_metadata()` + already returns an owned `Option`, so it just `.map(Into::into)`s) +- `path` (`#[getter]`, `self.inner.path().as_str()`) + +`PyConsolidatedMetadata` is added to `src/metadata.rs` via the existing +`pythonized_metadata!` macro (wrapping +`zarrs::metadata_ext::group::consolidated_metadata::ConsolidatedMetadata`, +which derives `Serialize`/`Deserialize`/`Clone`). + +Grows further as more storage-agnostic `zarrs` `Group` methods are exposed. +I/O methods (`array_keys`/`group_keys`, child navigation) stay per-type. + +### Wiring + +- `src/array/mod.rs` and `src/group/mod.rs` each gain `mod shared;`. +- `sync.rs` / `async.rs` for both array and group `use` the macro and invoke it + at module level (between the struct's inherent `impl` and the type-specific + `#[pymethods]` block), with a one-line comment pointing at `shared.rs`. The + duplicated accessors are deleted from both files. + +## Out of scope (stays per-type) + +- `__repr__` — differs by type-name string (`"Array"` vs `"AsyncArray"`). +- All I/O methods — `open`/`open_async`, `retrieve_*`, `__getitem__`, + `array_keys`/`group_keys` — different signatures and bodies (sync returns a + value; async takes `py` and returns `future_into_py(...)`). +- Any Python-visible base class / Protocol. + +## Success criteria + +- `cargo build` succeeds; the macro-generated methods are present on all four + Python classes (verified via existing behavior / a quick attribute check). +- No change to the Python-visible API surface. +- The 9 array accessors and group `attrs` exist in exactly one place each. diff --git a/python/zarrista/_group.pyi b/python/zarrista/_group.pyi index bec3a23..c1da61d 100644 --- a/python/zarrista/_group.pyi +++ b/python/zarrista/_group.pyi @@ -1,4 +1,4 @@ -from zarr_metadata import JSONValue +from zarr_metadata import ConsolidatedMetadataV3, GroupMetadataV3, JSONValue from ._array import Array, AsyncArray from ._store import AsyncStore, FilesystemStore, MemoryStore @@ -12,6 +12,15 @@ class Group: @property def attrs(self) -> dict[str, JSONValue]: """The group's user attributes as a dict.""" + @property + def metadata(self) -> GroupMetadataV3: + """The group's full Zarr v3 metadata.""" + @property + def consolidated_metadata(self) -> ConsolidatedMetadataV3 | None: + """The consolidated metadata, if present in the group metadata.""" + @property + def path(self) -> str: + """The group's path in the store.""" def array_keys(self) -> list[str]: """Names of the direct child arrays.""" def group_keys(self) -> list[str]: @@ -31,6 +40,15 @@ class AsyncGroup: @property def attrs(self) -> dict[str, JSONValue]: """The group's user attributes as a dict.""" + @property + def metadata(self) -> GroupMetadataV3: + """The group's full Zarr v3 metadata.""" + @property + def consolidated_metadata(self) -> ConsolidatedMetadataV3 | None: + """The consolidated metadata, if present in the group metadata.""" + @property + def path(self) -> str: + """The group's path in the store.""" async def array_keys(self) -> list[str]: """Names of the direct child arrays.""" async def group_keys(self) -> list[str]: diff --git a/src/array/async.rs b/src/array/async.rs index b8e6a80..0fb5095 100644 --- a/src/array/async.rs +++ b/src/array/async.rs @@ -3,19 +3,16 @@ use std::sync::Arc; use crate::array::selection::PySelection; +use crate::array::shared::array_metadata_accessors; use crate::array::util::PyChunkIndices; -use crate::chunks::PyChunkGrid; -use crate::codec::{PyCodecChain, PyCodecOptions}; +use crate::codec::PyCodecOptions; use crate::decoded_array::DecodedArray; -use crate::dtype::PyDataType; use crate::error::ZarristaError; use crate::node::PyNodePath; use crate::storage::PyAsyncStorage; use pyo3::prelude::*; use pyo3_async_runtimes::tokio::future_into_py; use pyo3_bytes::PyBytes; -use pythonize::pythonize; -use pythonize::Result as PythonizeResult; use zarrs::array::Array; use zarrs::storage::AsyncReadableWritableListableStorageTraits; @@ -31,6 +28,9 @@ impl PyAsyncArray { } } +// Metadata accessors shared with `PyArray`; see `array/shared.rs`. +array_metadata_accessors!(PyAsyncArray); + #[pymethods] impl PyAsyncArray { /// Read a region with numpy-style basic indexing, e.g. `await arr[0:10, :, 5]`. @@ -70,51 +70,6 @@ impl PyAsyncArray { }) } - /// The array's user attributes as a dict. - #[getter] - fn attrs<'py>(&self, py: Python<'py>) -> PythonizeResult> { - pythonize(py, self.inner.attributes()) - } - - #[getter] - fn chunk_grid(&self) -> PyChunkGrid { - self.inner.chunk_grid().clone().into() - } - - #[getter] - fn codecs(&self) -> PyCodecChain { - self.inner.codecs().into() - } - - /// The dimension names, if any were specified. - #[getter] - fn dimension_names(&self) -> &Option>> { - self.inner.dimension_names() - } - - /// The Zarr data-type - #[getter] - fn dtype(&self) -> PyDataType { - self.inner.data_type().clone().into() - } - - #[getter] - fn metadata<'py>(&self, py: Python<'py>) -> Bound<'py, PyAny> { - pythonize(py, self.inner.metadata()).unwrap() - } - - /// The number of dimensions. - #[getter] - fn ndim(&self) -> usize { - self.inner.dimensionality() - } - - /// The array's path in the store. - #[getter] - fn path(&self) -> &str { - self.inner.path().as_str() - } - /// Read a region of the array as `Data`, using numpy-style basic indexing. fn retrieve_array_subset<'py>( &self, @@ -168,10 +123,4 @@ impl PyAsyncArray { Ok(encoded.map(PyBytes::new)) }) } - - /// The array shape. - #[getter] - fn shape(&self) -> &[u64] { - self.inner.shape() - } } diff --git a/src/array/mod.rs b/src/array/mod.rs index eccdc51..fc97157 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -1,5 +1,6 @@ mod r#async; mod selection; +mod shared; mod sync; mod util; diff --git a/src/array/shared.rs b/src/array/shared.rs new file mode 100644 index 0000000..0314d2e --- /dev/null +++ b/src/array/shared.rs @@ -0,0 +1,67 @@ +//! Shared `#[pymethods]` for `PyArray` / `PyAsyncArray`. +//! +//! These accessors only read from `self.inner` and perform no I/O, so they are +//! identical between the sync and async variants. The macro emits a separate +//! `#[pymethods]` block (requires the `multiple-pymethods` pyo3 feature). +macro_rules! array_metadata_accessors { + ($ty:ty) => { + #[::pyo3::pymethods] + impl $ty { + /// The array's user attributes as a dict. + #[getter] + fn attrs<'py>( + &self, + py: ::pyo3::Python<'py>, + ) -> ::pythonize::Result<::pyo3::Bound<'py, ::pyo3::PyAny>> { + ::pythonize::pythonize(py, self.inner.attributes()) + } + + #[getter] + fn chunk_grid(&self) -> $crate::chunks::PyChunkGrid { + self.inner.chunk_grid().clone().into() + } + + #[getter] + fn codecs(&self) -> $crate::codec::PyCodecChain { + self.inner.codecs().into() + } + + /// The dimension names, if any were specified. + #[getter] + fn dimension_names(&self) -> &Option>> { + self.inner.dimension_names() + } + + /// The Zarr data-type + #[getter] + fn dtype(&self) -> $crate::dtype::PyDataType { + self.inner.data_type().clone().into() + } + + #[getter] + fn metadata(&self) -> $crate::metadata::PyArrayMetadata { + self.inner.metadata().clone().into() + } + + /// The number of dimensions. + #[getter] + fn ndim(&self) -> usize { + self.inner.dimensionality() + } + + /// The array's path in the store. + #[getter] + fn path(&self) -> &str { + self.inner.path().as_str() + } + + /// The array shape. + #[getter] + fn shape(&self) -> &[u64] { + self.inner.shape() + } + } + }; +} + +pub(crate) use array_metadata_accessors; diff --git a/src/array/sync.rs b/src/array/sync.rs index 32387c2..f7c95b1 100644 --- a/src/array/sync.rs +++ b/src/array/sync.rs @@ -1,18 +1,15 @@ //! The `Array` Python class: metadata accessors and numpy-style reads. use crate::array::selection::PySelection; +use crate::array::shared::array_metadata_accessors; use crate::array::util::PyChunkIndices; -use crate::chunks::PyChunkGrid; -use crate::codec::{PyCodecChain, PyCodecOptions}; +use crate::codec::PyCodecOptions; use crate::decoded_array::DecodedArray; -use crate::dtype::PyDataType; use crate::error::ZarristaResult; use crate::node::PyNodePath; use crate::storage::PySyncStorage; use pyo3::prelude::*; use pyo3_bytes::PyBytes; -use pythonize::pythonize; -use pythonize::Result as PythonizeResult; use zarrs::array::Array; use zarrs::storage::ReadableWritableListableStorageTraits; @@ -28,6 +25,9 @@ impl PyArray { } } +// Metadata accessors shared with `PyAsyncArray`; see `array/shared.rs`. +array_metadata_accessors!(PyArray); + #[pymethods] impl PyArray { /// Read a region with numpy-style basic indexing, e.g. `arr[0:10, :, 5]`. @@ -54,51 +54,6 @@ impl PyArray { Ok(Self::new(inner)) } - /// The array's user attributes as a dict. - #[getter] - fn attrs<'py>(&self, py: Python<'py>) -> PythonizeResult> { - pythonize(py, self.inner.attributes()) - } - - #[getter] - fn chunk_grid(&self) -> PyChunkGrid { - self.inner.chunk_grid().clone().into() - } - - #[getter] - fn codecs(&self) -> PyCodecChain { - self.inner.codecs().into() - } - - /// The dimension names, if any were specified. - #[getter] - fn dimension_names(&self) -> &Option>> { - self.inner.dimension_names() - } - - /// The Zarr data-type - #[getter] - fn dtype(&self) -> PyDataType { - self.inner.data_type().clone().into() - } - - #[getter] - fn metadata<'py>(&self, py: Python<'py>) -> Bound<'py, PyAny> { - pythonize(py, self.inner.metadata()).unwrap() - } - - /// The number of dimensions. - #[getter] - fn ndim(&self) -> usize { - self.inner.dimensionality() - } - - /// The array's path in the store. - #[getter] - fn path(&self) -> &str { - self.inner.path().as_str() - } - /// Read a region of the array, using numpy-style basic indexing. /// /// Returns one of the decoded result classes (`Tensor`, `VariableArray`, @@ -129,10 +84,4 @@ impl PyArray { let encoded = self.inner.retrieve_encoded_chunk(chunk_indices.as_ref())?; Ok(encoded.map(|buf| PyBytes::new(buf.into()))) } - - /// The array shape. - #[getter] - fn shape(&self) -> &[u64] { - self.inner.shape() - } } diff --git a/src/group/async.rs b/src/group/async.rs index ba168d4..cf69b6e 100644 --- a/src/group/async.rs +++ b/src/group/async.rs @@ -2,12 +2,11 @@ use std::sync::Arc; use super::last_segment; use crate::error::ZarristaError; +use crate::group::shared::group_metadata_accessors; use crate::node::{open_node_async, PyNodePath}; use crate::storage::PyAsyncStorage; use pyo3::prelude::*; use pyo3_async_runtimes::tokio::future_into_py; -use pythonize::pythonize; -use pythonize::Result as PythonizeResult; use zarrs::group::Group; use zarrs::node::NodePath; use zarrs::storage::AsyncReadableWritableListableStorageTraits; @@ -34,6 +33,9 @@ impl PyAsyncGroup { } } +// Metadata accessors shared with `PyGroup`; see `group/shared.rs`. +group_metadata_accessors!(PyAsyncGroup); + #[pymethods] impl PyAsyncGroup { /// Open the group stored at `path` in `store`. @@ -56,12 +58,6 @@ impl PyAsyncGroup { }) } - /// The group's user attributes as a dict. - #[getter] - fn attrs<'py>(&self, py: Python<'py>) -> PythonizeResult> { - pythonize(py, self.inner.attributes()) - } - /// Names of the direct child arrays. fn array_keys<'py>(&self, py: Python<'py>) -> PyResult> { let inner = self.inner.clone(); diff --git a/src/group/mod.rs b/src/group/mod.rs index 7667608..b501ad8 100644 --- a/src/group/mod.rs +++ b/src/group/mod.rs @@ -1,4 +1,5 @@ mod r#async; +mod shared; mod sync; pub use r#async::PyAsyncGroup; diff --git a/src/group/shared.rs b/src/group/shared.rs new file mode 100644 index 0000000..94bb6e2 --- /dev/null +++ b/src/group/shared.rs @@ -0,0 +1,39 @@ +//! Shared `#[pymethods]` for `PyGroup` / `PyAsyncGroup`. +//! +//! These accessors only read from `self.inner` and perform no I/O, so they are +//! identical between the sync and async variants. The macro emits a separate +//! `#[pymethods]` block (requires the `multiple-pymethods` pyo3 feature). +macro_rules! group_metadata_accessors { + ($ty:ty) => { + #[::pyo3::pymethods] + impl $ty { + /// The group's user attributes as a dict. + #[getter] + fn attrs<'py>( + &self, + py: ::pyo3::Python<'py>, + ) -> ::pythonize::Result<::pyo3::Bound<'py, ::pyo3::PyAny>> { + ::pythonize::pythonize(py, self.inner.attributes()) + } + + #[getter] + fn metadata(&self) -> $crate::metadata::PyGroupMetadata { + self.inner.metadata().clone().into() + } + + /// The consolidated metadata, if present in the group metadata. + #[getter] + fn consolidated_metadata(&self) -> Option<$crate::metadata::PyConsolidatedMetadata> { + self.inner.consolidated_metadata().map(Into::into) + } + + /// The group's path in the store. + #[getter] + fn path(&self) -> &str { + self.inner.path().as_str() + } + } + }; +} + +pub(crate) use group_metadata_accessors; diff --git a/src/group/sync.rs b/src/group/sync.rs index d23ff0b..70df26d 100644 --- a/src/group/sync.rs +++ b/src/group/sync.rs @@ -4,11 +4,10 @@ use std::sync::Arc; use super::last_segment; use crate::error::ZarristaResult; +use crate::group::shared::group_metadata_accessors; use crate::node::{open_node, Node, PyNodePath}; use crate::storage::PySyncStorage; use pyo3::prelude::*; -use pythonize::pythonize; -use pythonize::Result as PythonizeResult; use zarrs::group::Group; use zarrs::node::NodePath; use zarrs::storage::ReadableWritableListableStorageTraits; @@ -35,6 +34,9 @@ impl PyGroup { } } +// Metadata accessors shared with `PyAsyncGroup`; see `group/shared.rs`. +group_metadata_accessors!(PyGroup); + #[pymethods] impl PyGroup { /// Open the group stored at `path` in `store`. @@ -49,12 +51,6 @@ impl PyGroup { Ok(Self::new(store, path.into(), inner)) } - /// The group's user attributes as a dict. - #[getter] - fn attrs<'py>(&self, py: Python<'py>) -> PythonizeResult> { - pythonize(py, self.inner.attributes()) - } - /// Names of the direct child arrays. fn array_keys(&self) -> ZarristaResult> { let paths = self.inner.child_array_paths()?; diff --git a/src/metadata.rs b/src/metadata.rs index 2637835..f04a673 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -3,6 +3,7 @@ use pythonize::{depythonize, pythonize, PythonizeError}; use zarrs::metadata::v2::{ArrayMetadataV2, GroupMetadataV2, MetadataV2}; use zarrs::metadata::v3::{ArrayMetadataV3, GroupMetadataV3, MetadataV3}; use zarrs::metadata::{ArrayMetadata, GroupMetadata}; +use zarrs::metadata_ext::group::consolidated_metadata::ConsolidatedMetadata; /// Generate a pythonize-compatible newtype wrapper around a zarrs metadata type. macro_rules! pythonized_metadata { @@ -63,3 +64,4 @@ pythonized_metadata!(PyArrayMetadataV3, ArrayMetadataV3); pythonized_metadata!(PyGroupMetadata, GroupMetadata); pythonized_metadata!(PyGroupMetadataV2, GroupMetadataV2); pythonized_metadata!(PyGroupMetadataV3, GroupMetadataV3); +pythonized_metadata!(PyConsolidatedMetadata, ConsolidatedMetadata); diff --git a/src/node/node_path.rs b/src/node/node_path.rs index 21688d8..de28a9e 100644 --- a/src/node/node_path.rs +++ b/src/node/node_path.rs @@ -17,10 +17,10 @@ impl PyNodePath { } } -impl<'a, 'py> FromPyObject<'a, 'py> for PyNodePath { +impl FromPyObject<'_, '_> for PyNodePath { type Error = PyErr; - fn extract(obj: Borrowed<'a, 'py, PyAny>) -> Result { + fn extract(obj: Borrowed<'_, '_, PyAny>) -> Result { let path = obj.extract::()?; NodePath::new(&path) .map(PyNodePath)