From d16dc2b9ad2211bd732a8fc34407f72b69dc82c6 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 11 Feb 2026 13:25:33 +0000 Subject: [PATCH 1/5] Fixes to dataset equivalence testing on xarray loads. --- .../test_xarray_load_and_save_equivalence.py | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_xarray_load_and_save_equivalence.py b/tests/integration/test_xarray_load_and_save_equivalence.py index 0030dca..8b532f0 100644 --- a/tests/integration/test_xarray_load_and_save_equivalence.py +++ b/tests/integration/test_xarray_load_and_save_equivalence.py @@ -6,6 +6,7 @@ (2) check equivalence of files : xarray -> file VS xarray->ncdata->file """ +import numpy as np import pytest import xarray @@ -13,7 +14,6 @@ from ncdata.threadlock_sharing import lockshare_context from ncdata.utils import dataset_differences from ncdata.xarray import from_xarray, to_xarray - from tests.data_testcase_schemas import ( BAD_LOADSAVE_TESTCASES, session_testdir, @@ -38,6 +38,35 @@ def use_xarraylock(): yield +def equivalence_fix_datasets( + ds_from: xarray.Dataset, ds_to: xarray.Dataset +) -> (xarray.Dataset, xarray.Dataset): + """ + Modify datasets in legitimate ways to make "ds_from.identical(ds_to)". + + The key differences are due to coordinates remaining lazy in loading via ncdata, but + have data fetched in the "normal" load. + The coordinates apparently remain 'identical', but it affects the dataset indexes. + + Minimum found necessary : where in 'ds_from' we find a lazy coordinate, which is a + real one in 'ds_to', remove the associated index from 'ds_to'. + """ + drop_indices = [] + for varname, var in ds_from.variables.items(): + if hasattr(var.data, "compute"): + var_other = ds_to.variables.get(varname, None) + if isinstance(var_other.data, np.ndarray): + # This is lazy, but the reference var is real : replace with real data. 
+ if varname in ds_to.indexes: + drop_indices.append(varname) + + # NB drop_indexes is *not* an inplace operation! + # So replace returned 'ds_to' with new dataset. + ds_to = ds_to.drop_indexes(drop_indices) + # NB: as it currently is, we do *not* ever have to modify/replace 'ds_from'. + return ds_from, ds_to + + def test_load_direct_vs_viancdata(standard_testcase, use_xarraylock, tmp_path): source_filepath = standard_testcase.filepath ncdata = from_nc4(source_filepath) @@ -52,7 +81,15 @@ def test_load_direct_vs_viancdata(standard_testcase, use_xarraylock, tmp_path): # Load same, via ncdata xr_ncdata_ds = to_xarray(ncdata) - # Treat as OK if it passes xarray comparison + # Check that datasets are "equal" : but NB this only compares values + assert xr_ds.equals(xr_ncdata_ds) + + # 'Fix' equivalence, by making lazy vars real + removing missing indices. + # These are the expected differences due to ncdata passing lazy arrays. + # This should then make "Dataset.identical" true. + xr_ncdata_ds, xr_ds = equivalence_fix_datasets( + ds_from=xr_ncdata_ds, ds_to=xr_ds + ) assert xr_ds.identical(xr_ncdata_ds) From 08879a0d520e1dc242c5cb30fdba8f5c3b5114c4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 11 Feb 2026 16:00:32 +0000 Subject: [PATCH 2/5] Added towncrier fragment. --- docs/changelog_fragments/195.dev.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/changelog_fragments/195.dev.rst diff --git a/docs/changelog_fragments/195.dev.rst b/docs/changelog_fragments/195.dev.rst new file mode 100644 index 0000000..e8a68a4 --- /dev/null +++ b/docs/changelog_fragments/195.dev.rst @@ -0,0 +1 @@ +Fixed xarray load tests for new behaviour of xarray.Dataset.identical. From 38861c25db0d72bd1d38cec03b78a6d7b2c0d934 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 12 Feb 2026 17:12:48 +0000 Subject: [PATCH 3/5] Use own concept of 'dataset equivalence'. 
--- .../test_xarray_load_and_save_equivalence.py | 81 +++++++++++-------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/tests/integration/test_xarray_load_and_save_equivalence.py b/tests/integration/test_xarray_load_and_save_equivalence.py index 8b532f0..9290b62 100644 --- a/tests/integration/test_xarray_load_and_save_equivalence.py +++ b/tests/integration/test_xarray_load_and_save_equivalence.py @@ -38,33 +38,57 @@ def use_xarraylock(): yield -def equivalence_fix_datasets( - ds_from: xarray.Dataset, ds_to: xarray.Dataset -) -> (xarray.Dataset, xarray.Dataset): +def check_load_equivalence(ds1: xarray.Dataset, ds2: xarray.Dataset): """ - Modify datasets in legitimate ways to make "ds_from.identical(ds_to)". + Check that datasets differ only in "expected" ways. The key differences are due to coordinates remaining lazy in loading via ncdata, but - have data fetched in the "normal" load. - The coordinates apparently remain 'identical', but it affects the dataset indexes. - - Minimum found necessary : where in 'ds_from' we find a lazy coordinate, which is a - real one in 'ds_to', remove the associated index from 'ds_to'. + having real data in a "normal" load. This also affects which coords have indexes, + but we are not checking that here anyway. """ - drop_indices = [] - for varname, var in ds_from.variables.items(): - if hasattr(var.data, "compute"): - var_other = ds_to.variables.get(varname, None) - if isinstance(var_other.data, np.ndarray): - # This is lazy, but the reference var is real : replace with real data. - if varname in ds_to.indexes: - drop_indices.append(varname) - - # NB drop_indexes is *not* an inplace operation! - # So replace returned 'ds_to' with new dataset. - ds_to = ds_to.drop_indexes(drop_indices) - # NB: as it currently is, we do *not* ever have to modify/replace 'ds_from'. - return ds_from, ds_to + + def check_attrs_equivalent(attrs1, attrs2): + # Because dict-eq does not work when values can be arrays (!) 
+ okay = set(attrs1.keys()) == set(attrs2.keys()) + if okay: + for attr in attrs1: + okay = np.all(attrs1[attr] == attrs2[attr]) + if not okay: + break + assert okay + + def check_vars_equivalent(v1, v2): + check_attrs_equivalent(v1.attrs, v2.attrs) + assert v1.dims == v2.dims + assert v1.dtype == v2.dtype + if v1.dtype.kind not in ("iufM"): + # Nonnumeric cases are relatively simple + result = np.all(v1.data == v2.data) + else: + # Numeric cases must allow for NaNs, which don't compare + d1, d2 = v1.data, v2.data + if d1.ndim == 0: + # awkward special case where indexing operations otherwise fail + d1, d2 = [a.reshape((a.size,)) for a in (d1, d2)] + data_diff = d1 - d2 + # Account for NaN -or "NaT" for time types + data_diff = data_diff[np.logical_not(np.isnan(data_diff))] + # Note: not entirely happy with exact equality, but the time types make this + if data_diff.dtype.kind == "f": + # Slight tolerance on floats + result = np.allclose(data_diff, 0) + else: + # Exact equality - including time types, which allclose can't handle. + result = np.all(data_diff == 0) + if hasattr(result, "compute"): + result = result.compute() + assert result + + check_attrs_equivalent(ds1.attrs, ds2.attrs) + assert ds1.dims == ds2.dims + assert list(ds1.variables) == list(ds2.variables) + for varname in ds1.variables: + check_vars_equivalent(ds1.variables[varname], ds2.variables[varname]) def test_load_direct_vs_viancdata(standard_testcase, use_xarraylock, tmp_path): @@ -81,16 +105,7 @@ def test_load_direct_vs_viancdata(standard_testcase, use_xarraylock, tmp_path): # Load same, via ncdata xr_ncdata_ds = to_xarray(ncdata) - # Check that datasets are "equal" : but NB this only compares values - assert xr_ds.equals(xr_ncdata_ds) - - # 'Fix' equivalence, by making lazy vars real + removing missing indices. - # These are the expected differences due to ncdata passing lazy arrays. - # This should then make "Dataset.identical" true. 
- xr_ncdata_ds, xr_ds = equivalence_fix_datasets( - ds_from=xr_ncdata_ds, ds_to=xr_ds - ) - assert xr_ds.identical(xr_ncdata_ds) + check_load_equivalence(xr_ds, xr_ncdata_ds) def test_save_direct_vs_viancdata(standard_testcase, tmp_path): From 3a85ebbc2ca036f193815c7376581e0a2b266f3c Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 13 Feb 2026 18:19:40 +0000 Subject: [PATCH 4/5] Simplify xr.Dataset equivalence test. --- .../test_xarray_load_and_save_equivalence.py | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/tests/integration/test_xarray_load_and_save_equivalence.py b/tests/integration/test_xarray_load_and_save_equivalence.py index 9290b62..aa236db 100644 --- a/tests/integration/test_xarray_load_and_save_equivalence.py +++ b/tests/integration/test_xarray_load_and_save_equivalence.py @@ -14,6 +14,7 @@ from ncdata.threadlock_sharing import lockshare_context from ncdata.utils import dataset_differences from ncdata.xarray import from_xarray, to_xarray + from tests.data_testcase_schemas import ( BAD_LOADSAVE_TESTCASES, session_testdir, @@ -61,25 +62,11 @@ def check_vars_equivalent(v1, v2): check_attrs_equivalent(v1.attrs, v2.attrs) assert v1.dims == v2.dims assert v1.dtype == v2.dtype - if v1.dtype.kind not in ("iufM"): - # Nonnumeric cases are relatively simple - result = np.all(v1.data == v2.data) - else: - # Numeric cases must allow for NaNs, which don't compare - d1, d2 = v1.data, v2.data - if d1.ndim == 0: - # awkward special case where indexing operations otherwise fail - d1, d2 = [a.reshape((a.size,)) for a in (d1, d2)] - data_diff = d1 - d2 - # Account for NaN -or "NaT" for time types - data_diff = data_diff[np.logical_not(np.isnan(data_diff))] - # Note: not entirely happy with exact equality, but the time types make this - if data_diff.dtype.kind == "f": - # Slight tolerance on floats - result = np.allclose(data_diff, 0) - else: - # Exact equality - including time types, which allclose can't handle. 
- result = np.all(data_diff == 0) + # Numeric compare may need to allow for NaNs : floats *and datetimes* + equal_nan = ( + v1.dtype.kind in "fM" + ) # cannot set kwarg when not applicable + result = np.array_equal(v1.data, v2.data, equal_nan=equal_nan) if hasattr(result, "compute"): result = result.compute() assert result From bcc5ed6fa95cc6cea6e627e8d2d2785df15d0153 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 13 Feb 2026 18:25:44 +0000 Subject: [PATCH 5/5] Reconfigure docs to ref SciTools. --- docs/conf.py | 6 +++--- pyproject.toml | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 32f3f87..f354243 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,8 +19,8 @@ # -- Project information ----------------------------------------------------- project = "ncdata" -copyright = "2023, pp-mo" -author = "pp-mo" +copyright = "2023, SciTools" +author = "SciTools" # The complete version, including alpha/beta/rc tags version_parts = [str(part) for part in version_tuple] @@ -149,7 +149,7 @@ html_context = { # Possibly needed for pydata_theme? "github_repo": "ncdata", - "github_user": "pp-mo", + "github_user": "SciTools", "github_version": "main", "doc_path": "docs", # Default light/dark mode. diff --git a/pyproject.toml b/pyproject.toml index 8043c4e..da7d6af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta" [project] name = "ncdata" authors = [ - {name = "Patrick Peglar", email = "patrick.peglar@metoffice.gov.uk"}, + {name = "Iris Contributors", email = "scitools.pub@gmail.com"} ] description = "Abstract NetCDF data objects, providing fast data transfer between analysis packages." 
+ requires-python = ">=3.10" @@ -50,10 +50,10 @@ dependencies = ["numpy", "dask", "netCDF4"] [project.urls] -Code = "https://github.com/pp-mo/ncdata" -Discussions = "https://github.com/pp-mo/ncdata/discussions" +Code = "https://github.com/SciTools/ncdata" +Discussions = "https://github.com/SciTools/ncdata/discussions" Documentation = "https://ncdata.readthedocs.io" -Issues = "https://github.com/pp-mo/ncdata/issues" +Issues = "https://github.com/SciTools/ncdata/issues" [tool.setuptools] license-files = ["LICENSE"] @@ -88,7 +88,7 @@ package = "ncdata" package_dir = "lib" directory = "docs/changelog_fragments" filename = "docs/change_log.rst" -issue_format = "`ISSUE#{issue} <https://github.com/pp-mo/ncdata/pull/{issue}>`_" +issue_format = "`ISSUE#{issue} <https://github.com/SciTools/ncdata/pull/{issue}>`_" underlines = ["~", "^", "*", "+"] [[tool.towncrier.type]]