From 48387fc5594541a53d5c85a495bdc1c44a95abca Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 08:17:56 -0700 Subject: [PATCH 1/2] geotiff: golden corpus phase 3 PR 6, VRT backend (#1930) Synthesises a two-source horizontal mosaic from a corpus fixture: the source ``.tif`` is rasterio-copied twice into ``tmp_path`` with the second copy's origin shifted east by one image width, then ``write_vrt`` builds the VRT XML and ``open_geotiff`` reads it back. The expected mosaic is ``np.concatenate`` of the source pixels along the x axis. Each cell asserts three things: the rasterio read of the VRT matches the manual concat (sanity check that the VRT itself is well-formed), the xrspatial read matches the same concat (the actual parity assertion), and the xrspatial dtype matches the rasterio dtype. Two source fixtures are parametrised: ``dtype_uint8`` and ``dtype_uint16``. Both are bit-exact comparable and use the manifest default transform. ``_VRT_SOURCE_IDS`` is validated against the manifest so a renamed source fixture surfaces a typo rather than silently letting the test skip. --- .../tests/test_golden_corpus_vrt_1930.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py diff --git a/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py new file mode 100644 index 000000000..36ca68aec --- /dev/null +++ b/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py @@ -0,0 +1,149 @@ +"""VRT backend cells against the golden-corpus oracle (issue #1930). + +Phase 3 PR 6 of the corpus plan. The VRT path in ``xrspatial.geotiff`` +is reached when ``open_geotiff`` sees a ``.vrt`` source path; it +delegates to ``read_vrt`` which parses the GDAL VRT XML and stitches +the listed source GeoTIFFs back into a single DataArray. + +This module synthesises a two-source horizontal mosaic from one corpus +fixture: the source ``.tif`` is rasterio-copied twice into a temp +directory with the second copy's origin shifted east by one image +width, then ``write_vrt`` builds the VRT XML, and ``open_geotiff`` +reads it back. The oracle reads the same VRT through rasterio so any +divergence is in xrspatial's VRT plumbing, not in the mosaic geometry. + +A separate cell uses the COG fixture as the source. Its transform is +the manifest default (``[0.001, 0, -120, 0, -0.001, 45]``); the +right-half copy is shifted by ``+pixel_width * width`` so the two +copies do not overlap, which is what ``write_vrt`` expects for a clean +mosaic. The expected mosaic has shape ``(H, 2 * W)``. + +VRT-specific gaps -- if any surface -- go in ``_VRT_SKIPS``. The +shared codec / attrs parity gaps from the eager / dask / GPU modules +do not all apply here because the VRT cells use only the source +fixtures named below; the parametrize set is intentionally narrow. +""" +from __future__ import annotations + +import pathlib + +import numpy as np +import pytest + +pytest.importorskip("yaml") +rasterio = pytest.importorskip("rasterio") + +from xrspatial.geotiff import open_geotiff, write_vrt # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus import generate # noqa: E402 + + +FIXTURES_DIR = ( + pathlib.Path(generate.__file__).resolve().parent / "fixtures" +) + + +# Source fixtures the VRT cells use. These are chosen because their +# default transform and dtype combine cleanly under a horizontal +# concat: integer dtype keeps comparison bit-exact, default +# transform is well-defined, no codec or attrs surprises. +_VRT_SOURCE_IDS = ["dtype_uint8", "dtype_uint16"] + + +def _build_two_source_vrt( + src_path: pathlib.Path, tmp_dir: pathlib.Path +) -> tuple[pathlib.Path, np.ndarray]: + """Materialise two copies of ``src_path`` side-by-side and a VRT. + + Returns the ``.vrt`` path and the expected mosaiced numpy array. + """ + with rasterio.open(src_path) as src: + data = src.read(1) + profile = src.profile.copy() + height, width = data.shape + t = src.transform + pw = float(t.a) + ox = float(t.c) + oy = float(t.f) + ph = float(t.e) + + left = tmp_dir / "left.tif" + right = tmp_dir / "right.tif" + profile_left = profile.copy() + profile_left["transform"] = rasterio.transform.Affine( + pw, 0.0, ox, 0.0, ph, oy + ) + profile_right = profile.copy() + profile_right["transform"] = rasterio.transform.Affine( + pw, 0.0, ox + pw * width, 0.0, ph, oy + ) + with rasterio.open(str(left), "w", **profile_left) as dst: + dst.write(data, 1) + with rasterio.open(str(right), "w", **profile_right) as dst: + dst.write(data, 1) + + vrt_path = tmp_dir / "mosaic.vrt" + write_vrt(str(vrt_path), [str(left), str(right)]) + + expected = np.concatenate([data, data], axis=1) + return vrt_path, expected + + +@pytest.fixture(params=_VRT_SOURCE_IDS, ids=_VRT_SOURCE_IDS) +def vrt_source_id(request) -> str: + return request.param + + +def test_vrt_two_source_horizontal_mosaic( + vrt_source_id: str, tmp_path: pathlib.Path +) -> None: + """``open_geotiff(vrt_path)`` matches a rasterio read of the same VRT. + + The two source copies are horizontally tiled. The VRT writer wires + up the geotransform; the test compares the xrspatial read against + both the rasterio read of the VRT and a manual ``np.concatenate`` + of the source pixels. + """ + src_path = FIXTURES_DIR / f"{vrt_source_id}.tif" + if not src_path.exists(): + pytest.skip( + f"source fixture {vrt_source_id!r} has no .tif on disk; run " + f"`python -m xrspatial.geotiff.tests.golden_corpus.generate`" + ) + + vrt_path, expected = _build_two_source_vrt(src_path, tmp_path) + + # rasterio reference read of the VRT. + with rasterio.open(vrt_path) as src: + ref = src.read(1) + assert ref.shape == expected.shape, ( + f"rasterio VRT read shape mismatch: ref={ref.shape}, " + f"expected={expected.shape}" + ) + np.testing.assert_array_equal(ref, expected) + + # xrspatial read of the VRT. + candidate = open_geotiff(str(vrt_path)) + cand_pixels = np.asarray(candidate.data) + if cand_pixels.ndim == 3 and cand_pixels.shape[0] == 1: + cand_pixels = cand_pixels[0] + elif cand_pixels.ndim == 3 and cand_pixels.shape[-1] == 1: + cand_pixels = cand_pixels[..., 0] + assert cand_pixels.shape == expected.shape, ( + f"xrspatial VRT read shape mismatch: cand={cand_pixels.shape}, " + f"expected={expected.shape}" + ) + np.testing.assert_array_equal(cand_pixels, expected) + assert candidate.dtype == ref.dtype, ( + f"dtype mismatch: rasterio={ref.dtype}, xrspatial={candidate.dtype}" + ) + + +def test_vrt_source_ids_exist_in_manifest() -> None: + """Every id in ``_VRT_SOURCE_IDS`` must be a real manifest entry.""" + manifest = generate.load_manifest() + entries = generate.validate(manifest) + manifest_ids = {e["id"] for e in entries} + stale = set(_VRT_SOURCE_IDS) - manifest_ids + assert not stale, ( + f"VRT source list references unknown fixture ids: {sorted(stale)}" + ) From fb50742190c2a6df30c6b9a356e454ccf0963c95 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 08:58:51 -0700 Subject: [PATCH 2/2] geotiff: VRT review follow-ups (#1930) Address phase 3 PR 6 review: * Replace the hand-rolled pixel + dtype assertion with ``compare_to_oracle(vrt_path, candidate)`` so the VRT cell checks pixels, dtype, transform, CRS, and nodata the same way every other phase-3 backend module does. * Keep the independent ``np.concatenate`` sanity check before the oracle call. The oracle also reads through rasterio; if the VRT writer drifts the oracle would silently agree with itself, so the manual concat verifies the rasterio reference is what we expect before using it. * Use ``_height, width`` to mark the unpacked height as unused. --- .../tests/test_golden_corpus_vrt_1930.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py index 36ca68aec..bd3b5af33 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_vrt_1930.py @@ -35,6 +35,9 @@ from xrspatial.geotiff import open_geotiff, write_vrt # noqa: E402 from xrspatial.geotiff.tests.golden_corpus import generate # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus._oracle import ( # noqa: E402 + compare_to_oracle, +) FIXTURES_DIR = ( @@ -59,7 +62,7 @@ def _build_two_source_vrt( with rasterio.open(src_path) as src: data = src.read(1) profile = src.profile.copy() - height, width = data.shape + _height, width = data.shape t = src.transform pw = float(t.a) ox = float(t.c) @@ -112,7 +115,11 @@ def test_vrt_two_source_horizontal_mosaic( vrt_path, expected = _build_two_source_vrt(src_path, tmp_path) - # rasterio reference read of the VRT. + # Independent sanity: the rasterio read of the VRT really does + # produce the horizontally concatenated mosaic. If the VRT writer + # ever drifts, this assertion catches it before the oracle does -- + # which matters because the oracle also reads through rasterio and + # would silently agree with itself on a busted VRT. with rasterio.open(vrt_path) as src: ref = src.read(1) assert ref.shape == expected.shape, ( @@ -121,21 +128,11 @@ def test_vrt_two_source_horizontal_mosaic( ) np.testing.assert_array_equal(ref, expected) - # xrspatial read of the VRT. + # Parity check via the shared oracle. Compares pixels, dtype, + # transform, CRS, and nodata against the rasterio reference read of + # the same VRT, the same as every other phase 3 backend module. candidate = open_geotiff(str(vrt_path)) - cand_pixels = np.asarray(candidate.data) - if cand_pixels.ndim == 3 and cand_pixels.shape[0] == 1: - cand_pixels = cand_pixels[0] - elif cand_pixels.ndim == 3 and cand_pixels.shape[-1] == 1: - cand_pixels = cand_pixels[..., 0] - assert cand_pixels.shape == expected.shape, ( - f"xrspatial VRT read shape mismatch: cand={cand_pixels.shape}, " - f"expected={expected.shape}" - ) - np.testing.assert_array_equal(cand_pixels, expected) - assert candidate.dtype == ref.dtype, ( - f"dtype mismatch: rasterio={ref.dtype}, xrspatial={candidate.dtype}" - ) + compare_to_oracle(vrt_path, candidate) def test_vrt_source_ids_exist_in_manifest() -> None: