From dc8d846c448815bc9402bc7a1333c83afa987226 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 17 May 2026 10:07:56 -0700 Subject: [PATCH 1/3] geotiff: parity matrix harness scaffold (#1985) PR 1 of the issue #1985 plan. Adds xrspatial/geotiff/tests/test_backend_parity_matrix.py: a single parametrized harness that asserts pixel array, dtype, dims, coord values, transform tuple, CRS, nodata sentinel, masking state, and a small canonical-attrs subset for one (fixture, backend) cell at a time. Wired up end-to-end on one fixture (int16 single-band, no nodata, EPSG:4326) against one backend (eager numpy). The fixture list and backend list are module-level constants so follow-up PRs in the plan add a row each instead of editing the harness body. The "selected canonical attrs" subset and the masked_nodata assertion are gated on issues #1984 and #1988 respectively; both are stubbed so this PR does not depend on either landing. --- .../tests/test_backend_parity_matrix.py | 354 ++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_backend_parity_matrix.py diff --git a/xrspatial/geotiff/tests/test_backend_parity_matrix.py b/xrspatial/geotiff/tests/test_backend_parity_matrix.py new file mode 100644 index 000000000..b0d81b897 --- /dev/null +++ b/xrspatial/geotiff/tests/test_backend_parity_matrix.py @@ -0,0 +1,354 @@ +"""Required backend parity matrix per high-risk fixture (issue #1985). + +Single source of truth for "does backend X still match the reference on +fixture Z." Existing scattered parity files (``test_attrs_parity_1548.py``, +``test_backend_pixel_parity_matrix_1813.py``, etc.) stay in place as +named regression markers for their bug numbers; new parity assertions go +here. + +Harness contract +---------------- + +Every cell calls a single :func:`assert_parity` helper that checks the +same set of fields on the same fixture across every wired-up backend: + +* pixel array (byte-equal for int, NaN-aware closeness for float) +* dtype +* dims and dim order +* coord values and coord dtype (per axis) +* transform tuple (rasterio 6-tuple) +* CRS as EPSG int when present, plus ``crs_wkt`` string +* declared nodata sentinel +* masking state (``attrs.get('masked_nodata')`` once issue #1988 lands; + until then we accept absence as "unspecified" rather than asserting a + value) +* a small subset of canonical attrs whose round-trip semantics are + already settled in the module (``raster_type``, ``transform``, + ``crs``, ``crs_wkt``). The "selected canonical attrs" list from issue + #1985 will be tightened in a follow-up once issue #1984's PR 4 lands + the contract version stamp. + +Scope of this PR (matrix scaffold) +---------------------------------- + +PR 1 of the issue #1985 plan: harness end-to-end on one fixture (int16 +single-band, no nodata) against one backend (eager numpy). Follow-up +PRs add fixtures and backends without touching the harness itself. +""" +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff + + +# --------------------------------------------------------------------------- +# Backend selection +# --------------------------------------------------------------------------- + +# Each backend entry maps a stable id to the ``open_geotiff`` kwargs that +# select it. Follow-up PRs add dask+numpy, gpu, dask+gpu, http, and vrt +# entries. The matrix is parametrized over this list so wiring up a new +# backend means appending one row, not editing every test. +_BACKENDS = [ + pytest.param({}, id="numpy"), +] + + +# --------------------------------------------------------------------------- +# Fixture descriptors +# --------------------------------------------------------------------------- + +@dataclass(frozen=True) +class _FixtureSpec: + """Declarative description of one high-risk fixture. + + Attributes + ---------- + fix_id + Stable id used in the parametrize call. Appears in test names. + dtype + Pixel dtype of the underlying array (and the on-disk SampleFormat). + expected_dims + Tuple of dim names in expected order. + expected_crs_epsg + EPSG int the read path should emit under ``attrs['crs']``. + expected_nodata + Declared nodata sentinel that the read path should surface under + ``attrs['nodata']``. ``None`` means the fixture has no declared + nodata; the harness then asserts ``'nodata' not in attrs``. + expected_masked + Tri-valued. ``True`` / ``False`` pin ``attrs['masked_nodata']`` + once issue #1988 has landed and a backend opts in. ``None`` + means "do not assert" -- used for fixtures without nodata or + until the masking-state attr is wired up. + builder + Callable receiving a directory ``Path`` and returning the full + on-disk path. The builder is responsible for writing the file + and any sidecars (e.g. a ``.vrt`` over auxiliary tiles). + """ + + fix_id: str + dtype: np.dtype + expected_dims: tuple[str, ...] + expected_crs_epsg: int | None + expected_nodata: object + expected_masked: bool | None + builder: Callable[[Path], Path] + + +def _wrap_2d(arr: np.ndarray, *, crs: int | None) -> xr.DataArray: + """Wrap a 2-D numpy array as a writer-ready DataArray. + + The y/x coords are unit pixels, which keeps the read-back transform + tuple bit-exact (``(1.0, 0.0, 0.0, 0.0, -1.0, height)`` for the + descending-y convention used elsewhere in the test suite). + """ + height, width = arr.shape + da = xr.DataArray( + arr, dims=["y", "x"], + coords={ + "y": np.arange(height - 1, -1, -1, dtype=np.float64), + "x": np.arange(width, dtype=np.float64), + }, + attrs={}, + ) + if crs is not None: + da.attrs["crs"] = crs + return da + + +def _build_int16_single_band(dir_path: Path) -> Path: + """High-risk fixture: int16 single-band stripped TIFF, EPSG:4326, no nodata. + + Integer dtype catches sign-extension and predictor bugs that float + fixtures hide. Seeded so the byte-equal pixel assertion is reproducible. + """ + rng = np.random.default_rng(seed=19850) + arr = rng.integers(-30000, 30000, size=(32, 32), dtype=np.int16) + path = dir_path / "parity_1985_int16_single_band.tif" + to_geotiff( + _wrap_2d(arr, crs=4326), str(path), + compression="none", tiled=False, + ) + return path + + +_FIXTURES: list[_FixtureSpec] = [ + _FixtureSpec( + fix_id="int16-single-band", + dtype=np.dtype("int16"), + expected_dims=("y", "x"), + expected_crs_epsg=4326, + expected_nodata=None, + expected_masked=None, + builder=_build_int16_single_band, + ), +] + + +@pytest.fixture(scope="session") +def _parity_matrix_dir(tmp_path_factory): + """Session-scoped scratch dir, one write per fixture id. + + Tests reuse files across cells (12 fixtures times 6 backends would + otherwise be 72 redundant writes per pytest run). + """ + return tmp_path_factory.mktemp("parity_matrix_1985") + + +@pytest.fixture +def parity_fixture(_parity_matrix_dir): + """Resolve a :class:`_FixtureSpec` to an on-disk path. + + Files are cached across the session: a fixture already present on + disk is returned without rewriting. + """ + dir_path = _parity_matrix_dir + + def _resolve(spec: _FixtureSpec) -> Path: + path = dir_path / f"parity_1985_{spec.fix_id}.tif" + if path.exists(): + return path + built = spec.builder(dir_path) + # Builders that emit non-tiff suffixes (vrt) return their own + # path; rename only when the builder used a different name. + if built != path and built.suffix == ".tif": + built.rename(path) + return path + return built + return _resolve + + +# --------------------------------------------------------------------------- +# Materialisation + comparison helpers +# --------------------------------------------------------------------------- + +def _materialise(da: xr.DataArray) -> np.ndarray: + """Return a numpy view of ``da.data`` regardless of backend. + + Handles dask (``.compute``) and cupy (``.get``) without forcing the + caller to know which backend produced the DataArray. + """ + raw = da.data + if hasattr(raw, "compute"): + raw = raw.compute() + if hasattr(raw, "get"): + raw = raw.get() + return np.asarray(raw) + + +def _coord_view(da: xr.DataArray, name: str) -> np.ndarray: + return np.asarray(da.coords[name].values) + + +def _assert_pixels_equal(ref: np.ndarray, actual: np.ndarray, *, label: str) -> None: + """Pixel equality, dtype-aware. + + Integer arrays must be byte-identical; float arrays compare NaN-aware + with ``equal_nan=True``. Diverging dtypes always fail -- a backend + that silently upcasts has a bug. + """ + assert ref.dtype == actual.dtype, ( + f"{label}: dtype differs ref={ref.dtype} actual={actual.dtype}" + ) + assert ref.shape == actual.shape, ( + f"{label}: shape differs ref={ref.shape} actual={actual.shape}" + ) + if ref.dtype.kind == "f": + assert np.array_equal(ref, actual, equal_nan=True), ( + f"{label}: float pixels differ (NaN-aware)" + ) + else: + assert ref.tobytes() == actual.tobytes(), ( + f"{label}: integer pixel bytes differ" + ) + + +# --------------------------------------------------------------------------- +# The matrix cell +# --------------------------------------------------------------------------- + +def assert_parity( + da: xr.DataArray, + spec: _FixtureSpec, + *, + path: Path, + label: str, +) -> None: + """Assert every parity field for one (fixture, backend) cell. + + Run against an already-read DataArray rather than re-opening here so + the same helper applies to both ``open_geotiff(path, **kwargs)`` and + the explicit ``read_geotiff_dask`` / ``read_geotiff_gpu`` / + ``read_vrt`` entry points wired up in follow-up PRs. ``path`` is the + on-disk fixture, used to build the eager-numpy reference. + + The eager-numpy read of the same file is the reference for the pixel + array, coord values, dtype, dims, and transform tuple. Attrs are + asserted against the spec rather than against the reference, so a + bug that drops an attr in *every* backend still fails this cell. + """ + ref = open_geotiff(str(path)) + + # Pixel array, dtype, shape. + _assert_pixels_equal( + _materialise(ref), _materialise(da), label=label, + ) + + # Dims + order. + assert da.dims == spec.expected_dims, ( + f"{label}: dims {da.dims!r} != expected {spec.expected_dims!r}" + ) + + # Coord values and coord dtype, per axis. + for axis in spec.expected_dims: + if axis not in ref.coords: + continue + ref_c = _coord_view(ref, axis) + actual_c = _coord_view(da, axis) + assert ref_c.dtype == actual_c.dtype, ( + f"{label}: coord {axis!r} dtype " + f"ref={ref_c.dtype} actual={actual_c.dtype}" + ) + assert ref_c.tobytes() == actual_c.tobytes(), ( + f"{label}: coord {axis!r} bytes differ" + ) + + # Transform tuple. + ref_t = ref.attrs.get("transform") + actual_t = da.attrs.get("transform") + assert ref_t == actual_t, ( + f"{label}: transform tuple differs ref={ref_t!r} actual={actual_t!r}" + ) + + # CRS: EPSG int + WKT string. + if spec.expected_crs_epsg is not None: + assert da.attrs.get("crs") == spec.expected_crs_epsg, ( + f"{label}: attrs['crs'] {da.attrs.get('crs')!r} != " + f"expected {spec.expected_crs_epsg!r}" + ) + ref_wkt = ref.attrs.get("crs_wkt") + actual_wkt = da.attrs.get("crs_wkt") + assert ref_wkt == actual_wkt, ( + f"{label}: crs_wkt differs ref={ref_wkt!r} actual={actual_wkt!r}" + ) + + # Nodata sentinel + masking state. + if spec.expected_nodata is None: + assert "nodata" not in da.attrs, ( + f"{label}: fixture declares no nodata but attrs['nodata']=" + f"{da.attrs.get('nodata')!r}" + ) + else: + assert da.attrs.get("nodata") == spec.expected_nodata, ( + f"{label}: attrs['nodata'] {da.attrs.get('nodata')!r} != " + f"expected {spec.expected_nodata!r}" + ) + + # Masking state. ``attrs['masked_nodata']`` is the post-#1988 attr. + # Until it ships, ``None`` means "do not assert" and the matrix + # tolerates absence. The follow-up PR that wires #1988-aware + # fixtures will tighten this. + if spec.expected_masked is not None: + actual_masked = da.attrs.get("masked_nodata") + assert actual_masked == spec.expected_masked, ( + f"{label}: attrs['masked_nodata'] {actual_masked!r} != " + f"expected {spec.expected_masked!r}" + ) + + # Selected canonical attrs: the reference and the actual agree on + # presence and value. The list is intentionally narrow until issue + # #1984's PR 4 (contract version stamp) lands and we can extend it. + canonical_keys = ("raster_type", "transform", "crs", "crs_wkt") + for key in canonical_keys: + ref_v = ref.attrs.get(key) + actual_v = da.attrs.get(key) + assert ref_v == actual_v, ( + f"{label}: canonical attr {key!r} differs " + f"ref={ref_v!r} actual={actual_v!r}" + ) + + +# --------------------------------------------------------------------------- +# The single matrix test entry point +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("spec", _FIXTURES, ids=lambda s: s.fix_id) +@pytest.mark.parametrize("backend_kwargs", _BACKENDS) +def test_backend_parity_matrix(parity_fixture, spec, backend_kwargs): + """One cell per (fixture, backend). Asserts every parity field. + + A new backend or fixture lights up automatically on the next pytest + run -- no per-cell test function needed. + """ + path = parity_fixture(spec) + da = open_geotiff(str(path), **backend_kwargs) + label = f"fixture={spec.fix_id} backend={backend_kwargs}" + assert_parity(da, spec, path=path, label=label) From a69f03070dd76702debcca8018f0660846896683 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 17 May 2026 10:10:28 -0700 Subject: [PATCH 2/3] geotiff: address review on parity matrix scaffold (#1985) Follow-up to review on PR #2005. - Wire a dask+numpy backend row so the matrix runs one real cross-backend cell from day one. The eager-numpy row stays as the self-check. - Builder signature is now ``(dir_path, target_path)``: the caller owns the cache-key filename and passes it to the builder. Removes the rename round-trip that papered over an underscored builder name vs a dashed cache key. - Sanitise ``/`` in ``fix_id`` so future fixture ids like ``stripped/int16/none`` (the convention in ``test_backend_pixel_parity_matrix_1813.py``) do not create subdirectories under the session tmp dir. - Fix the ``_wrap_2d`` docstring transform example. The actual read-back tuple for a height-H fixture is ``(1.0, 0.0, -0.5, 0.0, -1.0, H - 0.5)`` due to the PixelIsArea half-pixel offset that the writer round-trips. --- .../tests/test_backend_parity_matrix.py | 43 +++++++++++-------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/xrspatial/geotiff/tests/test_backend_parity_matrix.py b/xrspatial/geotiff/tests/test_backend_parity_matrix.py index b0d81b897..0c89b6f1e 100644 --- a/xrspatial/geotiff/tests/test_backend_parity_matrix.py +++ b/xrspatial/geotiff/tests/test_backend_parity_matrix.py @@ -58,6 +58,7 @@ # backend means appending one row, not editing every test. _BACKENDS = [ pytest.param({}, id="numpy"), + pytest.param({"chunks": 16}, id="dask+numpy"), ] @@ -89,9 +90,12 @@ class _FixtureSpec: means "do not assert" -- used for fixtures without nodata or until the masking-state attr is wired up. builder - Callable receiving a directory ``Path`` and returning the full - on-disk path. The builder is responsible for writing the file - and any sidecars (e.g. a ``.vrt`` over auxiliary tiles). + Callable receiving a directory ``Path`` and the resolved target + ``Path`` (cache-key filename). Writes the file at ``target`` and + returns the final on-disk path. Most builders just return + ``target`` unchanged; sidecar-producing builders (e.g. a + ``.vrt`` over auxiliary tiles) may write multiple files and + return the entry path. """ fix_id: str @@ -100,15 +104,17 @@ class _FixtureSpec: expected_crs_epsg: int | None expected_nodata: object expected_masked: bool | None - builder: Callable[[Path], Path] + builder: Callable[[Path, Path], Path] def _wrap_2d(arr: np.ndarray, *, crs: int | None) -> xr.DataArray: """Wrap a 2-D numpy array as a writer-ready DataArray. - The y/x coords are unit pixels, which keeps the read-back transform - tuple bit-exact (``(1.0, 0.0, 0.0, 0.0, -1.0, height)`` for the - descending-y convention used elsewhere in the test suite). + Uses unit-pixel descending-y coords (``y = height-1 .. 0``, + ``x = 0 .. width-1``). The read-back transform tuple for a height-H + fixture is ``(1.0, 0.0, -0.5, 0.0, -1.0, H - 0.5)`` -- the half-pixel + offsets come from the PixelIsArea convention (origin is the pixel + edge, coords are pixel centres) that the writer round-trips. """ height, width = arr.shape da = xr.DataArray( @@ -124,20 +130,20 @@ def _wrap_2d(arr: np.ndarray, *, crs: int | None) -> xr.DataArray: return da -def _build_int16_single_band(dir_path: Path) -> Path: +def _build_int16_single_band(dir_path: Path, target: Path) -> Path: """High-risk fixture: int16 single-band stripped TIFF, EPSG:4326, no nodata. Integer dtype catches sign-extension and predictor bugs that float fixtures hide. Seeded so the byte-equal pixel assertion is reproducible. """ + del dir_path # builder writes directly to the resolved target path rng = np.random.default_rng(seed=19850) arr = rng.integers(-30000, 30000, size=(32, 32), dtype=np.int16) - path = dir_path / "parity_1985_int16_single_band.tif" to_geotiff( - _wrap_2d(arr, crs=4326), str(path), + _wrap_2d(arr, crs=4326), str(target), compression="none", tiled=False, ) - return path + return target _FIXTURES: list[_FixtureSpec] = [ @@ -173,16 +179,15 @@ def parity_fixture(_parity_matrix_dir): dir_path = _parity_matrix_dir def _resolve(spec: _FixtureSpec) -> Path: - path = dir_path / f"parity_1985_{spec.fix_id}.tif" + # ``fix_id`` may contain ``/`` once dtype/compression-keyed + # fixtures land (cf. ``test_backend_pixel_parity_matrix_1813.py`` + # ids like ``stripped/int16/none``). Flatten to a single + # filename so the resolver never creates subdirectories. + safe_id = spec.fix_id.replace("/", "-") + path = dir_path / f"parity_1985_{safe_id}.tif" if path.exists(): return path - built = spec.builder(dir_path) - # Builders that emit non-tiff suffixes (vrt) return their own - # path; rename only when the builder used a different name. - if built != path and built.suffix == ".tif": - built.rename(path) - return path - return built + return spec.builder(dir_path, path) return _resolve From 05e90ad0651efbf88f37ee6d6853d1b715919a12 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 17 May 2026 10:15:56 -0700 Subject: [PATCH 3/3] geotiff: assert spec.dtype in parity matrix harness (#1985) Second review-pr pass on PR #2005 caught that ``_FixtureSpec.dtype`` was declared and documented but never read by ``assert_parity``. The only dtype check was reference-vs-actual inside ``_assert_pixels_equal``, which means a backend bug that uniformly upcasts in both the reference read and the backend read would not be caught. Assert ``actual.dtype == spec.dtype`` against the spec so a silent upcast that the reference also exhibits still fails the cell. Update the ``assert_parity`` docstring to match what the function actually does (dims/coords/transform vs reference; dtype, crs, nodata vs spec). --- .../geotiff/tests/test_backend_parity_matrix.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/xrspatial/geotiff/tests/test_backend_parity_matrix.py b/xrspatial/geotiff/tests/test_backend_parity_matrix.py index 0c89b6f1e..d88b36994 100644 --- a/xrspatial/geotiff/tests/test_backend_parity_matrix.py +++ b/xrspatial/geotiff/tests/test_backend_parity_matrix.py @@ -256,15 +256,23 @@ def assert_parity( on-disk fixture, used to build the eager-numpy reference. The eager-numpy read of the same file is the reference for the pixel - array, coord values, dtype, dims, and transform tuple. Attrs are - asserted against the spec rather than against the reference, so a - bug that drops an attr in *every* backend still fails this cell. + array, coord values, dims, and transform tuple. ``spec.dtype`` and + ``spec.expected_crs_epsg`` / ``spec.expected_nodata`` are asserted + against the actual independently of the reference, so a bug that + silently changes them in *every* backend still fails this cell. """ ref = open_geotiff(str(path)) # Pixel array, dtype, shape. + actual_arr = _materialise(da) _assert_pixels_equal( - _materialise(ref), _materialise(da), label=label, + _materialise(ref), actual_arr, label=label, + ) + + # Dtype against the spec, not just against the reference. Catches a + # silent upcast that the reference would also exhibit. + assert actual_arr.dtype == spec.dtype, ( + f"{label}: dtype {actual_arr.dtype} != spec dtype {spec.dtype}" ) # Dims + order.