From 46eefe4f38887363083701956ef49773f6e92889 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 14:04:52 -0700 Subject: [PATCH 1/2] geotiff: add fsspec backend to the golden corpus matrix (#1930) Final corpus-coverage backend per issue #1930's proposal text ("fsspec where applicable"). Pushes each fixture's bytes into fsspec's in-process memory filesystem and reads them back through open_geotiff via _CloudSource, so the cloud read path exercises the same fixtures the eager / dask / GPU / VRT / HTTP modules do without needing real credentials or a localstack-style service. The skip / xfail taxonomy starts empty: every shared codec/attrs gap has been closed at the oracle layer, and the fsspec eager path piggybacks on the eager decode primitives. Only the intentional MinIsWhite skip and the schema-only fixture (no .tif on disk) come up as skips. --- .../tests/test_golden_corpus_fsspec_1930.py | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py diff --git a/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py new file mode 100644 index 000000000..de53dd8c1 --- /dev/null +++ b/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py @@ -0,0 +1,277 @@ +"""fsspec backend cells against the golden-corpus oracle (issue #1930). + +Final corpus-coverage backend per issue #1930's proposal text +("fsspec where applicable"). Mirrors the eager, dask, and GPU parity +layers but reads each fixture through an fsspec ``memory://`` URL: +the fixture's bytes are pushed into the in-process fsspec memory +filesystem, then ``open_geotiff('memory:///corpus/.tif')`` walks +through ``_is_fsspec_uri`` and ``_CloudSource`` the same way an +``s3://`` or ``gs://`` URL would. The ``memory://`` scheme ships +with fsspec and runs without any external credentials or services, +so it works in plain CI while still exercising the cloud read path. + +The cloud-eager read path returns a numpy-backed DataArray (the +``_CloudSource`` constructor downloads the bytes via ``read_all()`` +within the ``max_cloud_bytes`` budget; see #1928). It does NOT chunk +into dask. ``test_fsspec_candidate_is_actually_numpy`` pins that +contract. + +Skip / xfail taxonomy +--------------------- +``_PARITY_GAPS`` lists shared codec / attrs gaps; ``_FSSPEC_SKIPS`` +lists gaps that surface only on the fsspec path; ``_INTENTIONAL_SKIPS`` +covers by-design divergences (MinIsWhite inversion). All three tables +start empty on this PR because every shared gap has been closed at +the oracle / codec layer and the fsspec read path piggybacks on the +eager decode primitives. + +Resolved gaps (no longer xfail): + +* Integer nodata masking -- closed by the oracle's + ``_normalise_for_masked_nodata`` helper. +* RGB band axis order on the JPEG-YCbCr fixture -- closed by the + oracle's ``_normalise_axis_order`` helper. +* ``crs_citation_only`` -- closed by ``_synthesize_user_defined_wkt`` + in ``_geotags.py``. + +Intentional skip (``skip``): + +* ``nodata_miniswhite_uint8`` -- MinIsWhite photometric inversion. + xrspatial inverts pixels per #1797; rasterio leaves them raw. + Covered by ``test_miniswhite_backend_parity_1797.py``. + +Memory filesystem hygiene +------------------------- +fsspec's memory filesystem is a process-global singleton. The +``_memory_fs_clean`` fixture wipes it before and after each test so +fixtures from earlier tests cannot leak into the read path and a +flaky test cannot poison its successors. +""" +from __future__ import annotations + +import pathlib + +import pytest + +pytest.importorskip("yaml") +pytest.importorskip("rasterio") +fsspec = pytest.importorskip("fsspec") + +import numpy as np # noqa: E402 + +from xrspatial.geotiff import open_geotiff # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus import generate # noqa: E402 +from xrspatial.geotiff.tests.golden_corpus._marks import ( # noqa: E402 + fast_slow_marks_for, +) +from xrspatial.geotiff.tests.golden_corpus._oracle import ( # noqa: E402 + compare_to_oracle, +) + + +FIXTURES_DIR = ( + pathlib.Path(generate.__file__).resolve().parent / "fixtures" +) + + +# Shared codec / attrs gaps. Empty: every gap the corpus has surfaced +# has been closed at the oracle / codec layer. +_PARITY_GAPS: dict[str, str] = {} + +# fsspec-only gaps. Empty in the first pass; add an entry only when +# a fixture is fsspec-specific (i.e. eager passes, fsspec does not). +_FSSPEC_SKIPS: dict[str, str] = {} + +# Fixtures whose overview-IFD code path is not yet wired into the +# xrspatial reader. The base-IFD parity check still runs; the +# overview-level loop driven by ``candidate_factory`` is skipped for +# these ids. See the eager module for the rationale. +_OVERVIEW_READER_GAPS: dict[str, str] = { + "overview_external_ovr_uint16": ( + "External .ovr sidecar reader is not implemented in xrspatial. " + "The base IFD still parity-checks; the overview levels live in " + "a sibling .tif.ovr that the reader does not open." + ), +} + +_INTENTIONAL_SKIPS: dict[str, str] = { + "nodata_miniswhite_uint8": ( + "MinIsWhite photometric inversion: xrspatial inverts pixels per " + "#1797; rasterio leaves them raw. Covered by " + "test_miniswhite_backend_parity_1797.py." + ), +} + + +def _resolved_fixtures() -> list[dict]: + """Return manifest entries with defaults merged, sorted by id.""" + manifest = generate.load_manifest() + entries = generate.validate(manifest) + entries.sort(key=lambda e: e["id"]) + return entries + + +def _fixture_path(entry: dict) -> pathlib.Path: + return FIXTURES_DIR / f"{entry['id']}.tif" + + +def _is_lossy(entry: dict) -> bool: + tol = entry.get("tolerance") or {} + return bool(tol.get("lossy", False)) + + +def _build_param(entry: dict) -> pytest.param: + """Wrap a fixture entry in a ``pytest.param`` with the right marks. + + Shared codec / attrs gaps and fsspec-only gaps both become strict + ``xfail`` so the cell flips to a hard failure the day the gap + closes. Intentional skips get a plain skip. Slow fixtures inherit + ``pytest.mark.slow`` from the corpus helper so the fast / slow + split (#2062) cooperates. + """ + fid = entry["id"] + marks = list(fast_slow_marks_for(entry)) + if fid in _PARITY_GAPS: + marks.append( + pytest.mark.xfail(reason=_PARITY_GAPS[fid], strict=True) + ) + elif fid in _FSSPEC_SKIPS: + marks.append( + pytest.mark.xfail(reason=_FSSPEC_SKIPS[fid], strict=True) + ) + elif fid in _INTENTIONAL_SKIPS: + marks.append(pytest.mark.skip(reason=_INTENTIONAL_SKIPS[fid])) + return pytest.param(entry, id=fid, marks=marks) + + +_FIXTURES = _resolved_fixtures() +_PARAMS = [_build_param(e) for e in _FIXTURES] + + +def _serve_via_memory(payload: bytes, fixture_id: str) -> str: + """Push ``payload`` into the fsspec ``memory://`` filesystem. + + Returns the ``memory:///corpus/.tif`` URL the reader + should open. The three-slash form is required: ``memory://`` writes + must use a path beginning with ``/`` or fsspec rejects them. + """ + fs = fsspec.filesystem("memory") + url = f"memory:///corpus/{fixture_id}.tif" + # ``pipe`` is the idiomatic single-shot byte writer for fsspec + # filesystems; ``test_cloud_read_byte_limit_1928.py`` uses the + # same pattern. + fs.pipe(f"/corpus/{fixture_id}.tif", payload) + return url + + +@pytest.fixture +def _memory_fs_clean(): + """Wipe the fsspec memory filesystem around each test. + + The memory filesystem is a process-global singleton, so leftover + state from an earlier test (or the sanity-check test below) could + otherwise show up in the read path. Clears before and after for + belt-and-braces hygiene. + """ + fs = fsspec.filesystem("memory") + fs.store.clear() + try: + yield fs + finally: + fs.store.clear() + + +@pytest.mark.parametrize("manifest_entry", _PARAMS) +def test_fsspec_parity(manifest_entry: dict, _memory_fs_clean) -> None: + """``open_geotiff('memory://...')`` agrees with the rasterio oracle. + + The fixture's bytes are written into the fsspec memory filesystem + and read back through ``_CloudSource``. This is the same read path + a real ``s3://`` / ``gs://`` URL would take, minus the network and + credentials. The oracle reads from the local on-disk fixture path + because rasterio does not know about the memory filesystem; both + sides see the same bytes either way. + """ + fixture_id = manifest_entry["id"] + path = _fixture_path(manifest_entry) + if not path.exists(): + pytest.skip( + f"fixture {fixture_id!r} has no .tif on disk; run " + f"`python -m xrspatial.geotiff.tests.golden_corpus.generate` " + f"to materialise the full corpus" + ) + + with open(path, "rb") as f: + payload = f.read() + url = _serve_via_memory(payload, fixture_id) + + candidate = open_geotiff(url) + # When the fixture carries pyramid overviews, hand the oracle a + # factory it can use to fetch each overview level via the same + # cloud-eager path. The overview-IFD scan also goes through + # ``_CloudSource``, so any divergence there shows up here. + overviews = manifest_entry.get("overviews") or [] + factory = ( + (lambda lvl, u=url: open_geotiff(u, overview_level=lvl)) + if overviews and fixture_id not in _OVERVIEW_READER_GAPS + else None + ) + compare_to_oracle( + path, + candidate, + lossy=_is_lossy(manifest_entry), + candidate_factory=factory, + ) + + +def test_taxonomy_ids_are_in_manifest() -> None: + """Every id in the parity-gap, fsspec-skip, overview-gap, or + intentional-skip tables must exist in the manifest. + + Guards against typos: a stale entry would silently keep a + known-bad fixture marked as expected-to-fail even after it was + renamed or removed. + """ + manifest_ids = {e["id"] for e in _FIXTURES} + tagged = ( + set(_PARITY_GAPS) + | set(_FSSPEC_SKIPS) + | set(_OVERVIEW_READER_GAPS) + | set(_INTENTIONAL_SKIPS) + ) + stale = tagged - manifest_ids + assert not stale, ( + f"taxonomy references unknown fixture ids: {sorted(stale)}" + ) + + +def test_fsspec_candidate_is_actually_numpy(_memory_fs_clean) -> None: + """Sanity check: the fsspec read path returns a numpy-backed array. + + ``open_geotiff(memory_url)`` without ``chunks=`` should go through + the eager ``_CloudSource`` path which downloads the object and + decodes into a numpy array. If a regression accidentally wired the + fsspec URI to a dask-chunked path, this test catches it. + + Picks the first eligible (not skipped, on disk) fixture in sorted + id order so the assertion is stable across corpus growth. + """ + eligible = [ + e for e in _FIXTURES + if e["id"] not in _PARITY_GAPS + and e["id"] not in _FSSPEC_SKIPS + and e["id"] not in _INTENTIONAL_SKIPS + and _fixture_path(e).exists() + ] + if not eligible: + pytest.skip("no eligible fixture on disk") + entry = eligible[0] + path = _fixture_path(entry) + with open(path, "rb") as f: + payload = f.read() + url = _serve_via_memory(payload, entry["id"]) + da = open_geotiff(url) + assert isinstance(da.data, np.ndarray), ( + f"expected a numpy.ndarray for the fsspec eager path on " + f"{entry['id']!r}, got {type(da.data).__name__}" + ) From 646b588dd5d0167117375919ab8285e3d13764b6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 14:12:38 -0700 Subject: [PATCH 2/2] geotiff: tidy fsspec corpus module nits (#1930) Review follow-up on #2076: * Drop the leading underscore on the ``memory_fs_clean`` pytest fixture. Pytest fixtures are public test API; the underscore convention is for module-private helpers, not fixtures. Existing modules in the repo don't use it. * Rename the overview-factory lambda parameter ``lvl`` to ``level`` so the variable matches the kwarg it forwards to and stays consistent with the other backend modules' factories. The third dismissed nit (repeated ``open(path, 'rb')`` reads) is genuinely low-value; left as-is. --- .../geotiff/tests/test_golden_corpus_fsspec_1930.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py index de53dd8c1..5b345b673 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_fsspec_1930.py @@ -43,7 +43,7 @@ Memory filesystem hygiene ------------------------- fsspec's memory filesystem is a process-global singleton. The -``_memory_fs_clean`` fixture wipes it before and after each test so +``memory_fs_clean`` fixture wipes it before and after each test so fixtures from earlier tests cannot leak into the read path and a flaky test cannot poison its successors. """ @@ -165,7 +165,7 @@ def _serve_via_memory(payload: bytes, fixture_id: str) -> str: @pytest.fixture -def _memory_fs_clean(): +def memory_fs_clean(): """Wipe the fsspec memory filesystem around each test. The memory filesystem is a process-global singleton, so leftover @@ -182,7 +182,7 @@ def _memory_fs_clean(): @pytest.mark.parametrize("manifest_entry", _PARAMS) -def test_fsspec_parity(manifest_entry: dict, _memory_fs_clean) -> None: +def test_fsspec_parity(manifest_entry: dict, memory_fs_clean) -> None: """``open_geotiff('memory://...')`` agrees with the rasterio oracle. The fixture's bytes are written into the fsspec memory filesystem @@ -212,7 +212,7 @@ def test_fsspec_parity(manifest_entry: dict, _memory_fs_clean) -> None: # ``_CloudSource``, so any divergence there shows up here. overviews = manifest_entry.get("overviews") or [] factory = ( - (lambda lvl, u=url: open_geotiff(u, overview_level=lvl)) + (lambda level, u=url: open_geotiff(u, overview_level=level)) if overviews and fixture_id not in _OVERVIEW_READER_GAPS else None ) @@ -245,7 +245,7 @@ def test_taxonomy_ids_are_in_manifest() -> None: ) -def test_fsspec_candidate_is_actually_numpy(_memory_fs_clean) -> None: +def test_fsspec_candidate_is_actually_numpy(memory_fs_clean) -> None: """Sanity check: the fsspec read path returns a numpy-backed array. ``open_geotiff(memory_url)`` without ``chunks=`` should go through