From 6b71964850be0d0d6b65b2245021314fdbf3983b Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 10 Mar 2026 19:30:20 +0100 Subject: [PATCH 1/6] test: add RED tests for znh5md <-> asebytes full round-trip coverage Parametrized tests over all s22_* fixtures verify: - znh5md write -> asebytes read (catches H5MD path mapping bug with per-atom calc results like energies, stresses, magmoms) - asebytes write -> znh5md read - Full bidirectional round-trip - *.h5 extension with H5MD content (catches registry routing bug) 9 tests fail RED, exposing two bugs: 1. H5MDStore path mapping loses actual H5MD paths on round-trip 2. Registry sends *.h5 H5MD files to RaggedColumnarBackend Co-Authored-By: Claude Opus 4.6 --- tests/contract/test_znh5md_roundtrip.py | 145 ++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 tests/contract/test_znh5md_roundtrip.py diff --git a/tests/contract/test_znh5md_roundtrip.py b/tests/contract/test_znh5md_roundtrip.py new file mode 100644 index 0000000..adc1112 --- /dev/null +++ b/tests/contract/test_znh5md_roundtrip.py @@ -0,0 +1,145 @@ +"""znh5md <-> asebytes full round-trip tests. + +Covers ALL s22_* fixtures to ensure every property type (positions, numbers, +cell, pbc, all calc results, custom info, custom per-atom arrays, velocities, +mixed pbc/cell, sparse properties) round-trips correctly between znh5md and +asebytes. + +The existing interop tests only checked positions+numbers+forces, which use +standard H5MD element names and missed mapping bugs where per-atom calc +results placed by znh5md in particles/ (with ASE_ENTRY_ORIGIN=calc) could +not be found on read. +""" + +from __future__ import annotations + +import numpy as np +import pytest +import znh5md + +from asebytes import ASEIO + +from .conftest import assert_atoms_equal + +# All s22_* fixture names from conftest that produce list[ase.Atoms] +S22_FIXTURES = [ + "s22", + "s22_energy", + "s22_energy_forces", + "s22_all_properties", + "s22_info_arrays_calc", + "s22_mixed_pbc_cell", + "s22_info_arrays_calc_missing_inbetween", +] + + +# --------------------------------------------------------------------------- +# znh5md -> asebytes +# --------------------------------------------------------------------------- + + +class TestZnH5MDToAsebytes: + """Verify asebytes can read files written by znh5md.""" + + @pytest.mark.parametrize("fixture_name", S22_FIXTURES) + def test_read(self, tmp_path, fixture_name, request): + frames = request.getfixturevalue(fixture_name) + path = str(tmp_path / "znh5md.h5md") + znh5md.IO(path).extend(frames) + + db = ASEIO(path) + assert len(db) == len(frames) + for i, expected in enumerate(frames): + result = db[i] + assert_atoms_equal(result, expected) + + +# --------------------------------------------------------------------------- +# asebytes -> znh5md +# --------------------------------------------------------------------------- + + +class TestAsebytesToZnH5MD: + """Verify znh5md can read files written by asebytes.""" + + @pytest.mark.parametrize("fixture_name", S22_FIXTURES) + def test_write(self, tmp_path, fixture_name, request): + frames = request.getfixturevalue(fixture_name) + path = str(tmp_path / "asebytes.h5md") + ASEIO(path).extend(frames) + + zio = znh5md.IO(path) + assert len(zio) == len(frames) + for i, expected in enumerate(frames): + result = zio[i] + n = len(expected) + np.testing.assert_allclose( + result.positions[:n], expected.positions, atol=1e-6, + err_msg=f"Frame {i}: positions mismatch", + ) + np.testing.assert_array_equal( + result.numbers[:n], expected.numbers, + err_msg=f"Frame {i}: numbers mismatch", + ) + + +# --------------------------------------------------------------------------- +# Full bidirectional: znh5md -> asebytes -> znh5md +# --------------------------------------------------------------------------- + + +class TestBidirectionalRoundtrip: + """Write with znh5md, read with asebytes, write back, read with znh5md.""" + + @pytest.mark.parametrize("fixture_name", S22_FIXTURES) + def test_roundtrip(self, tmp_path, fixture_name, request): + frames = request.getfixturevalue(fixture_name) + + # Step 1: znh5md write -> asebytes read + zpath = str(tmp_path / "step1.h5md") + znh5md.IO(zpath).extend(frames) + db = ASEIO(zpath) + intermediate = [db[i] for i in range(len(db))] + + # Step 2: asebytes write -> znh5md read + apath = str(tmp_path / "step2.h5md") + ASEIO(apath).extend(intermediate) + zio = znh5md.IO(apath) + assert len(zio) == len(frames) + for i, expected in enumerate(frames): + result = zio[i] + n = len(expected) + np.testing.assert_allclose( + result.positions[:n], expected.positions, atol=1e-6, + err_msg=f"Frame {i}: positions mismatch", + ) + np.testing.assert_array_equal( + result.numbers[:n], expected.numbers, + err_msg=f"Frame {i}: numbers mismatch", + ) + + +# --------------------------------------------------------------------------- +# Registry: *.h5 files with H5MD content +# --------------------------------------------------------------------------- + + +class TestH5ExtensionWithH5MDContent: + """*.h5 files containing H5MD data should be readable via ASEIO. + + znh5md writes valid H5MD regardless of file extension. The registry + currently maps *.h5 -> RaggedColumnarBackend, which can't read H5MD. + It should detect the h5md group and use H5MDBackend instead. + """ + + @pytest.mark.parametrize("fixture_name", S22_FIXTURES) + def test_h5_extension(self, tmp_path, fixture_name, request): + frames = request.getfixturevalue(fixture_name) + path = str(tmp_path / "data.h5") + znh5md.IO(path).extend(frames) + + db = ASEIO(path) + assert len(db) == len(frames) + for i, expected in enumerate(frames): + result = db[i] + assert_atoms_equal(result, expected) From a9458d5c61ac89b40a2b6f6f4e9d4a68f8fa1bb7 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 10 Mar 2026 19:49:13 +0100 Subject: [PATCH 2/6] feat(fix-h5md): add H5MD content sniffing for *.h5 files - Add _is_h5md_file() helper that checks for "h5md" group in HDF5 files - Insert sniffing logic in resolve_backend to redirect *.h5 files containing H5MD metadata to the H5MD backend Co-Authored-By: Claude Opus 4.6 --- src/asebytes/_registry.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/asebytes/_registry.py b/src/asebytes/_registry.py index 42a4a14..060b50c 100644 --- a/src/asebytes/_registry.py +++ b/src/asebytes/_registry.py @@ -127,6 +127,21 @@ def parse_uri(path: str) -> tuple[str | None, str]: # --------------------------------------------------------------------------- +def _is_h5md_file(path: str) -> bool: + """Check if an HDF5 file contains H5MD metadata.""" + from pathlib import Path + + if not Path(path).is_file(): + return False + try: + import h5py + + with h5py.File(path, "r") as f: + return "h5md" in f + except Exception: + return False + + def _pick_class(entry: _RegistryEntry, path: str, writable: bool | None): """Import the module from *entry* and return the appropriate class.""" mod = _import_module(entry.module_path) @@ -209,6 +224,27 @@ def resolve_backend( candidates.append(entry) + # -- Sniff *.h5 files for H5MD content --------------------------------- + if ( + scheme is None + and layer == "object" + and candidates + and any( + e.match_type == "pattern" and e.match_value == "*.h5" + for e in candidates + ) + and _is_h5md_file(path_or_uri) + ): + # Replace with H5MD backend entry + h5md_entries = [ + e for e in _REGISTRY + if e.match_type == "pattern" + and e.match_value == "*.h5md" + and e.layer == layer + ] + if h5md_entries: + candidates = h5md_entries + # -- No direct match -> cross-layer adapter wrapping -------------------- if not candidates: if _allow_fallback: From b56cf74a6c547b945878510c9d63496249f510bd Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 10 Mar 2026 19:50:56 +0100 Subject: [PATCH 3/6] fix(h5md): cache discovered H5 paths to fix round-trip with foreign properties Properties like energies, stresses, magmoms placed by znh5md in /particles/ were lost on read because _column_to_h5() statically mapped them to /observables/. Now _walk_elements() caches the actual H5 path during discovery, and _get_ds() checks the cache first. Co-Authored-By: Claude Opus 4.6 --- src/asebytes/h5md/_store.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/asebytes/h5md/_store.py b/src/asebytes/h5md/_store.py index 00c08ce..5381d45 100644 --- a/src/asebytes/h5md/_store.py +++ b/src/asebytes/h5md/_store.py @@ -108,6 +108,7 @@ def __init__( self._compression_opts = compression_opts self._chunk_frames = chunk_frames self._ds_cache: dict[str, Any] = {} # column name -> h5py.Dataset + self._path_cache: dict[str, str] = {} # column name -> actual h5 path # ------------------------------------------------------------------ # Path translation @@ -214,7 +215,10 @@ def _get_ds(self, key: str) -> Any: """Return cached h5py.Dataset for the ``value`` sub-dataset.""" ds = self._ds_cache.get(key) if ds is None: - h5_path, _ = self._column_to_h5(key) + # Check path cache first (populated by list_arrays discovery) + h5_path = self._path_cache.get(key) + if h5_path is None: + h5_path, _ = self._column_to_h5(key) if h5_path is None: raise KeyError(f"Unknown column: {key!r}") ds = self._file[f"{h5_path}/value"] @@ -445,9 +449,11 @@ def _walk_elements( if isinstance(child, h5py.Group): if "value" in child: # This is an H5MD element - col = self._h5_to_column(f"{path}/{child_name}") + h5_path = f"{path}/{child_name}" + col = self._h5_to_column(h5_path) if col is not None: out.append(col) + self._path_cache[col] = h5_path else: # Recurse (e.g. into box/) self._walk_elements(child, f"{path}/{child_name}", out) From 229b6baf20af67caf9c2da8b9fc3f74abfa8287f Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 10 Mar 2026 20:06:51 +0100 Subject: [PATCH 4/6] fix(h5md): remove redundant has_array guard in per-atom detection In _discover(), columns from list_arrays() are known to exist, so the has_array() check was redundant and broke per-atom detection for columns whose _column_to_h5() mapping disagreed with the actual H5 path (e.g. per-atom energies placed in /particles/ by znh5md). Co-Authored-By: Claude Opus 4.6 --- src/asebytes/h5md/_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/asebytes/h5md/_backend.py b/src/asebytes/h5md/_backend.py index a146404..11d50e3 100644 --- a/src/asebytes/h5md/_backend.py +++ b/src/asebytes/h5md/_backend.py @@ -199,8 +199,8 @@ def _discover(self) -> None: if col.startswith("_"): continue self._columns.append(col) - # Per-atom detection: particles/ columns with ndim >= 2 - if col not in ("cell", "pbc") and self._store.has_array(col): + # Per-atom detection: columns with ndim >= 2 + if col not in ("cell", "pbc"): shape = self._store.get_shape(col) if len(shape) >= 2 and shape[1] > 1: self._per_atom_cols.add(col) From 5d95065d0f5ecc9661ca77ad5dce78edebeda2b0 Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 10 Mar 2026 21:02:56 +0100 Subject: [PATCH 5/6] fix(h5md): address PR #16 CodeRabbit review comments - Use H5 path (/particles/ vs /observables/) for per-atom detection instead of shape heuristic that misclassified frame-level vectors - Centralize path resolution with _resolve_h5_path helper used by both _get_ds and has_array to fix foreign column lookups - Remove test slicing ([:n]) that masked potential padding leaks and add explicit atom count assertions Co-Authored-By: Claude Opus 4.6 --- src/asebytes/h5md/_backend.py | 10 ++++++---- src/asebytes/h5md/_store.py | 15 ++++++++++----- tests/contract/test_znh5md_roundtrip.py | 16 ++++++++++------ 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/asebytes/h5md/_backend.py b/src/asebytes/h5md/_backend.py index 11d50e3..f15ec10 100644 --- a/src/asebytes/h5md/_backend.py +++ b/src/asebytes/h5md/_backend.py @@ -199,11 +199,13 @@ def _discover(self) -> None: if col.startswith("_"): continue self._columns.append(col) - # Per-atom detection: columns with ndim >= 2 + # Per-atom detection: columns in /particles/ with ndim >= 2 if col not in ("cell", "pbc"): - shape = self._store.get_shape(col) - if len(shape) >= 2 and shape[1] > 1: - self._per_atom_cols.add(col) + h5_path = self._store._path_cache.get(col) + if h5_path is not None and h5_path.startswith("/particles/"): + shape = self._store.get_shape(col) + if len(shape) >= 2: + self._per_atom_cols.add(col) # Rebuild known_arrays and shapes self._known_arrays = set() diff --git a/src/asebytes/h5md/_store.py b/src/asebytes/h5md/_store.py index 5381d45..a6cf87d 100644 --- a/src/asebytes/h5md/_store.py +++ b/src/asebytes/h5md/_store.py @@ -211,14 +211,19 @@ def _get_element_origin(self, h5_path: str) -> str | None: # Internal helpers # ------------------------------------------------------------------ + def _resolve_h5_path(self, key: str) -> str | None: + """Resolve column name to H5 path, checking cache first.""" + h5_path = self._path_cache.get(key) + if h5_path is not None: + return h5_path + h5_path, _ = self._column_to_h5(key) + return h5_path + def _get_ds(self, key: str) -> Any: """Return cached h5py.Dataset for the ``value`` sub-dataset.""" ds = self._ds_cache.get(key) if ds is None: - # Check path cache first (populated by list_arrays discovery) - h5_path = self._path_cache.get(key) - if h5_path is None: - h5_path, _ = self._column_to_h5(key) + h5_path = self._resolve_h5_path(key) if h5_path is None: raise KeyError(f"Unknown column: {key!r}") ds = self._file[f"{h5_path}/value"] @@ -401,7 +406,7 @@ def has_array(self, name: str) -> bool: return name in self._file[meta_path] except KeyError: return False - h5_path, _ = self._column_to_h5(name) + h5_path = self._resolve_h5_path(name) if h5_path is None: return False try: diff --git a/tests/contract/test_znh5md_roundtrip.py b/tests/contract/test_znh5md_roundtrip.py index adc1112..d4e3850 100644 --- a/tests/contract/test_znh5md_roundtrip.py +++ b/tests/contract/test_znh5md_roundtrip.py @@ -72,13 +72,15 @@ def test_write(self, tmp_path, fixture_name, request): assert len(zio) == len(frames) for i, expected in enumerate(frames): result = zio[i] - n = len(expected) + assert len(result) == len(expected), ( + f"Frame {i}: atom count mismatch ({len(result)} != {len(expected)})" + ) np.testing.assert_allclose( - result.positions[:n], expected.positions, atol=1e-6, + result.positions, expected.positions, atol=1e-6, err_msg=f"Frame {i}: positions mismatch", ) np.testing.assert_array_equal( - result.numbers[:n], expected.numbers, + result.numbers, expected.numbers, err_msg=f"Frame {i}: numbers mismatch", ) @@ -108,13 +110,15 @@ def test_roundtrip(self, tmp_path, fixture_name, request): assert len(zio) == len(frames) for i, expected in enumerate(frames): result = zio[i] - n = len(expected) + assert len(result) == len(expected), ( + f"Frame {i}: atom count mismatch ({len(result)} != {len(expected)})" + ) np.testing.assert_allclose( - result.positions[:n], expected.positions, atol=1e-6, + result.positions, expected.positions, atol=1e-6, err_msg=f"Frame {i}: positions mismatch", ) np.testing.assert_array_equal( - result.numbers[:n], expected.numbers, + result.numbers, expected.numbers, err_msg=f"Frame {i}: numbers mismatch", ) From 20ce0a0dc9b449068e828434fedaf178af87757f Mon Sep 17 00:00:00 2001 From: Fabian Zills Date: Tue, 10 Mar 2026 21:35:12 +0100 Subject: [PATCH 6/6] test(h5md): use full assert_atoms_equal in all round-trip tests Replace weak positions/numbers-only assertions in TestAsebytesToZnH5MD and TestBidirectionalRoundtrip with assert_atoms_equal to catch regressions in cell, pbc, info, arrays, and calc properties. Merge TestH5ExtensionWithH5MDContent into TestZnH5MDToAsebytes as a parametrized extension (.h5md/.h5) to remove duplication. Co-Authored-By: Claude Opus 4.6 --- tests/contract/test_znh5md_roundtrip.py | 64 +++++-------------------- 1 file changed, 11 insertions(+), 53 deletions(-) diff --git a/tests/contract/test_znh5md_roundtrip.py b/tests/contract/test_znh5md_roundtrip.py index d4e3850..8a7a60a 100644 --- a/tests/contract/test_znh5md_roundtrip.py +++ b/tests/contract/test_znh5md_roundtrip.py @@ -13,7 +13,6 @@ from __future__ import annotations -import numpy as np import pytest import znh5md @@ -34,17 +33,22 @@ # --------------------------------------------------------------------------- -# znh5md -> asebytes +# znh5md -> asebytes (*.h5md and *.h5 via content sniffing) # --------------------------------------------------------------------------- class TestZnH5MDToAsebytes: - """Verify asebytes can read files written by znh5md.""" + """Verify asebytes can read files written by znh5md. + + Tests both the native *.h5md extension and *.h5 files that require + content sniffing to route to H5MDBackend instead of ColumnarBackend. + """ @pytest.mark.parametrize("fixture_name", S22_FIXTURES) - def test_read(self, tmp_path, fixture_name, request): + @pytest.mark.parametrize("ext", [".h5md", ".h5"]) + def test_read(self, tmp_path, fixture_name, ext, request): frames = request.getfixturevalue(fixture_name) - path = str(tmp_path / "znh5md.h5md") + path = str(tmp_path / f"znh5md{ext}") znh5md.IO(path).extend(frames) db = ASEIO(path) @@ -72,17 +76,7 @@ def test_write(self, tmp_path, fixture_name, request): assert len(zio) == len(frames) for i, expected in enumerate(frames): result = zio[i] - assert len(result) == len(expected), ( - f"Frame {i}: atom count mismatch ({len(result)} != {len(expected)})" - ) - np.testing.assert_allclose( - result.positions, expected.positions, atol=1e-6, - err_msg=f"Frame {i}: positions mismatch", - ) - np.testing.assert_array_equal( - result.numbers, expected.numbers, - err_msg=f"Frame {i}: numbers mismatch", - ) + assert_atoms_equal(result, expected, atol=1e-6) # --------------------------------------------------------------------------- @@ -110,40 +104,4 @@ def test_roundtrip(self, tmp_path, fixture_name, request): assert len(zio) == len(frames) for i, expected in enumerate(frames): result = zio[i] - assert len(result) == len(expected), ( - f"Frame {i}: atom count mismatch ({len(result)} != {len(expected)})" - ) - np.testing.assert_allclose( - result.positions, expected.positions, atol=1e-6, - err_msg=f"Frame {i}: positions mismatch", - ) - np.testing.assert_array_equal( - result.numbers, expected.numbers, - err_msg=f"Frame {i}: numbers mismatch", - ) - - -# --------------------------------------------------------------------------- -# Registry: *.h5 files with H5MD content -# --------------------------------------------------------------------------- - - -class TestH5ExtensionWithH5MDContent: - """*.h5 files containing H5MD data should be readable via ASEIO. - - znh5md writes valid H5MD regardless of file extension. The registry - currently maps *.h5 -> RaggedColumnarBackend, which can't read H5MD. - It should detect the h5md group and use H5MDBackend instead. - """ - - @pytest.mark.parametrize("fixture_name", S22_FIXTURES) - def test_h5_extension(self, tmp_path, fixture_name, request): - frames = request.getfixturevalue(fixture_name) - path = str(tmp_path / "data.h5") - znh5md.IO(path).extend(frames) - - db = ASEIO(path) - assert len(db) == len(frames) - for i, expected in enumerate(frames): - result = db[i] - assert_atoms_equal(result, expected) + assert_atoms_equal(result, expected, atol=1e-6)