From 2bce36610fac96b485e7a9b7cec49dde7f5fc30f Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 23 Mar 2026 16:10:39 +0000 Subject: [PATCH 1/7] Add function to ensure optional anndata groups in subset operations --- src/h5ad/core/subset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py index d9e7829..da66a4b 100644 --- a/src/h5ad/core/subset.py +++ b/src/h5ad/core/subset.py @@ -44,6 +44,11 @@ def _group_get(parent: Any, key: str) -> Any | None: return parent[key] if key in parent else None +def _ensure_optional_anndata_groups(dst: Any) -> None: + for key in ("layers", "obsm", "obsp", "varm", "varp"): + _ensure_group(dst, key) + + def _decode_attr(value: Any) -> Any: if isinstance(value, bytes): return value.decode("utf-8") @@ -517,6 +522,8 @@ def subset_h5ad( total=1, ) + _ensure_optional_anndata_groups(dst) + if inplace: if file.exists(): if file.is_dir(): From 8553664f4ed9d24205164891fd0c66291dc140d9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 23 Mar 2026 16:10:45 +0000 Subject: [PATCH 2/7] Add functions to ensure and validate AnnData root attributes in store operations --- src/h5ad/storage/__init__.py | 70 ++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py index 43d876d..6998344 100644 --- a/src/h5ad/storage/__init__.py +++ b/src/h5ad/storage/__init__.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any, Iterable, Optional, Sequence import shutil +import warnings import h5py @@ -15,6 +16,10 @@ import numpy as np +ROOT_ENCODING_TYPE = "anndata" +ROOT_ENCODING_VERSION = "0.1.0" + + @dataclass class Store: backend: str @@ -96,19 +101,63 @@ def open_store(path: Path, mode: str) -> Store: if backend == "zarr": _require_zarr() root = zarr.open_group(str(path), mode=mode) + if _is_writable_mode(mode): + ensure_anndata_root_attrs(root) + else: + warn_if_missing_anndata_root_attrs(root, 
path=path) return Store(backend="zarr", root=root, path=path) root = h5py.File(path, mode) + if _is_writable_mode(mode): + ensure_anndata_root_attrs(root) + else: + warn_if_missing_anndata_root_attrs(root, path=path) return Store(backend="hdf5", root=root, path=path) +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def _is_writable_mode(mode: str) -> bool: + return any(flag in mode for flag in ("w", "a", "+", "x")) + + +def has_valid_anndata_root_attrs(root: Any) -> bool: + enc_type = _decode_attr(root.attrs.get("encoding-type", None)) + enc_ver = _decode_attr(root.attrs.get("encoding-version", None)) + return enc_type == ROOT_ENCODING_TYPE and enc_ver == ROOT_ENCODING_VERSION + + +def ensure_anndata_root_attrs(root: Any) -> None: + root.attrs["encoding-type"] = ROOT_ENCODING_TYPE + root.attrs["encoding-version"] = ROOT_ENCODING_VERSION + + +def warn_if_missing_anndata_root_attrs(root: Any, *, path: Path) -> None: + if has_valid_anndata_root_attrs(root): + return + + enc_type = _decode_attr(root.attrs.get("encoding-type", None)) + enc_ver = _decode_attr(root.attrs.get("encoding-version", None)) + warnings.warn( + ( + f"Store '{path}' root is missing required AnnData attrs " + f"(encoding-type='anndata', encoding-version='0.1.0'). " + f"Found encoding-type={enc_type!r}, encoding-version={enc_ver!r}." 
+ ), + UserWarning, + stacklevel=2, + ) + + def _normalize_attr_value(value: Any, target_backend: str) -> Any: if target_backend == "zarr": if isinstance(value, bytes): return value.decode("utf-8") if isinstance(value, (list, tuple)): - return [ - v.decode("utf-8") if isinstance(v, bytes) else v for v in value - ] + return [v.decode("utf-8") if isinstance(v, bytes) else v for v in value] if isinstance(value, np.ndarray): if value.dtype.kind in ("S", "O"): return [ @@ -187,7 +236,9 @@ def create_dataset( if zarr_format == 3: kwargs = dict(kwargs) kwargs.pop("compressor", None) - elif zarr_format == 2 and "compressors" in kwargs and "compressor" not in kwargs: + elif ( + zarr_format == 2 and "compressors" in kwargs and "compressor" not in kwargs + ): kwargs = dict(kwargs) compressors = kwargs.pop("compressors") if isinstance(compressors, (list, tuple)) and len(compressors) == 1: @@ -234,8 +285,12 @@ def copy_dataset(src: Any, dst_group: Any, name: str) -> Any: return ds -def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = ()) -> Any: - if is_hdf5_group(dst_group) and (is_hdf5_group(src_obj) or is_hdf5_dataset(src_obj)): +def copy_tree( + src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = () +) -> Any: + if is_hdf5_group(dst_group) and ( + is_hdf5_group(src_obj) or is_hdf5_dataset(src_obj) + ): if not exclude: dst_group.copy(src_obj, dst_group, name) return dst_group[name] @@ -256,6 +311,9 @@ def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] def copy_store_contents(src_root: Any, dst_root: Any) -> None: + target_backend = "zarr" if is_zarr_group(dst_root) else "hdf5" + copy_attrs(src_root.attrs, dst_root.attrs, target_backend=target_backend) + ensure_anndata_root_attrs(dst_root) for key in src_root.keys(): copy_tree(src_root[key], dst_root, key) From 796e74cfaee49da8c8facc1c4e43fcdefa2928f6 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 23 Mar 2026 16:10:51 +0000 Subject: [PATCH 3/7] Add 
tests for AnnData root encoding attributes enforcement and warnings --- tests/test_storage_root_attrs.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/test_storage_root_attrs.py diff --git a/tests/test_storage_root_attrs.py b/tests/test_storage_root_attrs.py new file mode 100644 index 0000000..13f4621 --- /dev/null +++ b/tests/test_storage_root_attrs.py @@ -0,0 +1,42 @@ +"""Tests for AnnData root encoding attributes enforcement/warnings.""" + +from pathlib import Path + +import h5py +import pytest + +from h5ad.storage import open_store + + +def _make_minimal_h5ad(path: Path) -> None: + with h5py.File(path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset("obs_names", data=[b"cell_1"]) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset("var_names", data=[b"gene_1"]) + + f.create_dataset("X", data=[[1.0]]) + + +def test_open_store_read_warns_for_missing_root_attrs(temp_dir: Path) -> None: + file_path = temp_dir / "missing_root_attrs.h5ad" + _make_minimal_h5ad(file_path) + + with pytest.warns(UserWarning, match=r"missing .* AnnData attrs"): + with open_store(file_path, "r"): + pass + + +def test_open_store_writable_mode_sets_root_attrs(temp_dir: Path) -> None: + file_path = temp_dir / "set_root_attrs.h5ad" + _make_minimal_h5ad(file_path) + + with open_store(file_path, "a"): + pass + + with h5py.File(file_path, "r") as f: + assert f.attrs.get("encoding-type") == "anndata" + assert f.attrs.get("encoding-version") == "0.1.0" From 4d44e4639c9a225a58ba364e746ea72826ff50e6 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 23 Mar 2026 16:10:57 +0000 Subject: [PATCH 4/7] Add test for optional empty groups in subset_h5ad function --- tests/test_subset.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_subset.py b/tests/test_subset.py index 78c5cf8..a21fc20 100644 --- a/tests/test_subset.py +++
b/tests/test_subset.py @@ -284,6 +284,28 @@ def test_subset_sparse_empty_result(self, sample_sparse_csr_h5ad, temp_dir): class TestSubsetH5ad: """Integration tests for subset_h5ad function.""" + def test_subset_h5ad_creates_optional_empty_groups(self, sample_h5ad_file, temp_dir): + """Subset output should include optional AnnData groups even if absent in source.""" + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=sample_h5ad_file, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + for key in ("layers", "obsm", "obsp", "varm", "varp"): + assert key in f + assert isinstance(f[key], h5py.Group) + def test_subset_h5ad_obs_only(self, sample_h5ad_file, temp_dir): """Test subsetting h5ad file by obs only.""" obs_file = temp_dir / "obs_names.txt" From 1911cdc7652e194be573865ec5326742b84a3976 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 23 Mar 2026 16:11:02 +0000 Subject: [PATCH 5/7] Refactor test_subset_h5ad to improve readability and ensure optional empty groups are included in the subset output --- tests/test_subset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index a21fc20..42d5ec6 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -284,7 +284,9 @@ def test_subset_sparse_empty_result(self, sample_sparse_csr_h5ad, temp_dir): class TestSubsetH5ad: """Integration tests for subset_h5ad function.""" - def test_subset_h5ad_creates_optional_empty_groups(self, sample_h5ad_file, temp_dir): + def test_subset_h5ad_creates_optional_empty_groups( + self, sample_h5ad_file, temp_dir + ): """Subset output should include optional AnnData groups even if absent in source.""" obs_file = temp_dir / "obs_names.txt" obs_file.write_text("cell_1\ncell_3\n") @@ -460,7 +462,9 @@ def 
test_subset_h5ad_obsp_sparse_group(self, temp_dir): conn.attrs["shape"] = np.array([4, 4], dtype=np.int64) conn.create_dataset("data", data=np.array([1.0, 2.0, 3.0, 4.0])) conn.create_dataset("indices", data=np.array([0, 1, 2, 3], dtype=np.int64)) - conn.create_dataset("indptr", data=np.array([0, 1, 2, 3, 4], dtype=np.int64)) + conn.create_dataset( + "indptr", data=np.array([0, 1, 2, 3, 4], dtype=np.int64) + ) obs_file = temp_dir / "obs_names.txt" obs_file.write_text("cell_1\ncell_3\n") @@ -584,7 +588,8 @@ def _csr_group(parent, name, shape): obs = f.create_group("obs") obs.attrs["_index"] = "obs_names" obs.create_dataset( - "obs_names", data=np.array(["cell_1", "cell_2", "cell_3", "cell_4"], dtype="S") + "obs_names", + data=np.array(["cell_1", "cell_2", "cell_3", "cell_4"], dtype="S"), ) var = f.create_group("var") From a62f6e0d437ae7937d973216ba0a83e27d04398f Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 23 Mar 2026 16:11:20 +0000 Subject: [PATCH 6/7] Bump h5ad package version to 0.3.1 --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index cb342bd..7884068 100644 --- a/uv.lock +++ b/uv.lock @@ -134,7 +134,7 @@ wheels = [ [[package]] name = "h5ad" -version = "0.3.0" +version = "0.3.1" source = { editable = "." 
} dependencies = [ { name = "h5py" }, From 7ba61c9ca538d3cf412e82cabd2a72d189841685 Mon Sep 17 00:00:00 2001 From: Aljes Binkevich Date: Mon, 23 Mar 2026 16:51:29 +0000 Subject: [PATCH 7/7] Update src/h5ad/storage/__init__.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/h5ad/storage/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py index 6998344..29c2227 100644 --- a/src/h5ad/storage/__init__.py +++ b/src/h5ad/storage/__init__.py @@ -143,8 +143,8 @@ def warn_if_missing_anndata_root_attrs(root: Any, *, path: Path) -> None: enc_ver = _decode_attr(root.attrs.get("encoding-version", None)) warnings.warn( ( - f"Store '{path}' root is missing required AnnData attrs " - f"(encoding-type='anndata', encoding-version='0.1.0'). " + f"Store '{path}' root has missing or invalid AnnData attrs " + f"(encoding-type={ROOT_ENCODING_TYPE!r}, encoding-version={ROOT_ENCODING_VERSION!r}). " f"Found encoding-type={enc_type!r}, encoding-version={enc_ver!r}." ), UserWarning,