|
| 1 | +"""fsspec backend cells against the golden-corpus oracle (issue #1930). |
| 2 | +
|
| 3 | +Final corpus-coverage backend per issue #1930's proposal text |
| 4 | +("fsspec where applicable"). Mirrors the eager, dask, and GPU parity |
| 5 | +layers but reads each fixture through an fsspec ``memory://`` URL: |
| 6 | +the fixture's bytes are pushed into the in-process fsspec memory |
| 7 | +filesystem, then ``open_geotiff('memory:///corpus/<id>.tif')`` walks |
| 8 | +through ``_is_fsspec_uri`` and ``_CloudSource`` the same way an |
| 9 | +``s3://`` or ``gs://`` URL would. The ``memory://`` scheme ships |
| 10 | +with fsspec and runs without any external credentials or services, |
| 11 | +so it works in plain CI while still exercising the cloud read path. |
| 12 | +
|
| 13 | +The cloud-eager read path returns a numpy-backed DataArray (the |
| 14 | +``_CloudSource`` constructor downloads the bytes via ``read_all()`` |
| 15 | +within the ``max_cloud_bytes`` budget; see #1928). It does NOT chunk |
| 16 | +into dask. ``test_fsspec_candidate_is_actually_numpy`` pins that |
| 17 | +contract. |
| 18 | +
|
| 19 | +Skip / xfail taxonomy |
| 20 | +--------------------- |
| 21 | +``_PARITY_GAPS`` lists shared codec / attrs gaps; ``_FSSPEC_SKIPS`` |
| 22 | +lists gaps that surface only on the fsspec path; ``_INTENTIONAL_SKIPS`` |
| 23 | +covers by-design divergences (MinIsWhite inversion). All three tables |
| 24 | +start empty on this PR because every shared gap has been closed at |
| 25 | +the oracle / codec layer and the fsspec read path piggybacks on the |
| 26 | +eager decode primitives. |
| 27 | +
|
| 28 | +Resolved gaps (no longer xfail): |
| 29 | +
|
| 30 | +* Integer nodata masking -- closed by the oracle's |
| 31 | + ``_normalise_for_masked_nodata`` helper. |
| 32 | +* RGB band axis order on the JPEG-YCbCr fixture -- closed by the |
| 33 | + oracle's ``_normalise_axis_order`` helper. |
| 34 | +* ``crs_citation_only`` -- closed by ``_synthesize_user_defined_wkt`` |
| 35 | + in ``_geotags.py``. |
| 36 | +
|
| 37 | +Intentional skip (``skip``): |
| 38 | +
|
| 39 | +* ``nodata_miniswhite_uint8`` -- MinIsWhite photometric inversion. |
| 40 | + xrspatial inverts pixels per #1797; rasterio leaves them raw. |
| 41 | + Covered by ``test_miniswhite_backend_parity_1797.py``. |
| 42 | +
|
| 43 | +Memory filesystem hygiene |
| 44 | +------------------------- |
| 45 | +fsspec's memory filesystem is a process-global singleton. The |
| 46 | +``memory_fs_clean`` fixture wipes it before and after each test so |
| 47 | +fixtures from earlier tests cannot leak into the read path and a |
| 48 | +flaky test cannot poison its successors. |
| 49 | +""" |
| 50 | +from __future__ import annotations |
| 51 | + |
| 52 | +import pathlib |
| 53 | + |
| 54 | +import pytest |
| 55 | + |
| 56 | +pytest.importorskip("yaml") |
| 57 | +pytest.importorskip("rasterio") |
| 58 | +fsspec = pytest.importorskip("fsspec") |
| 59 | + |
| 60 | +import numpy as np # noqa: E402 |
| 61 | + |
| 62 | +from xrspatial.geotiff import open_geotiff # noqa: E402 |
| 63 | +from xrspatial.geotiff.tests.golden_corpus import generate # noqa: E402 |
| 64 | +from xrspatial.geotiff.tests.golden_corpus._marks import ( # noqa: E402 |
| 65 | + fast_slow_marks_for, |
| 66 | +) |
| 67 | +from xrspatial.geotiff.tests.golden_corpus._oracle import ( # noqa: E402 |
| 68 | + compare_to_oracle, |
| 69 | +) |
| 70 | + |
| 71 | + |
| 72 | +FIXTURES_DIR = ( |
| 73 | + pathlib.Path(generate.__file__).resolve().parent / "fixtures" |
| 74 | +) |
| 75 | + |
| 76 | + |
| 77 | +# Shared codec / attrs gaps. Empty: every gap the corpus has surfaced |
| 78 | +# has been closed at the oracle / codec layer. |
| 79 | +_PARITY_GAPS: dict[str, str] = {} |
| 80 | + |
| 81 | +# fsspec-only gaps. Empty in the first pass; add an entry only when |
| 82 | +# a fixture is fsspec-specific (i.e. eager passes, fsspec does not). |
| 83 | +_FSSPEC_SKIPS: dict[str, str] = {} |
| 84 | + |
| 85 | +# Fixtures whose overview-IFD code path is not yet wired into the |
| 86 | +# xrspatial reader. The base-IFD parity check still runs; the |
| 87 | +# overview-level loop driven by ``candidate_factory`` is skipped for |
| 88 | +# these ids. See the eager module for the rationale. |
| 89 | +_OVERVIEW_READER_GAPS: dict[str, str] = { |
| 90 | + "overview_external_ovr_uint16": ( |
| 91 | + "External .ovr sidecar reader is not implemented in xrspatial. " |
| 92 | + "The base IFD still parity-checks; the overview levels live in " |
| 93 | + "a sibling .tif.ovr that the reader does not open." |
| 94 | + ), |
| 95 | +} |
| 96 | + |
| 97 | +_INTENTIONAL_SKIPS: dict[str, str] = { |
| 98 | + "nodata_miniswhite_uint8": ( |
| 99 | + "MinIsWhite photometric inversion: xrspatial inverts pixels per " |
| 100 | + "#1797; rasterio leaves them raw. Covered by " |
| 101 | + "test_miniswhite_backend_parity_1797.py." |
| 102 | + ), |
| 103 | +} |
| 104 | + |
| 105 | + |
| 106 | +def _resolved_fixtures() -> list[dict]: |
| 107 | + """Return manifest entries with defaults merged, sorted by id.""" |
| 108 | + manifest = generate.load_manifest() |
| 109 | + entries = generate.validate(manifest) |
| 110 | + entries.sort(key=lambda e: e["id"]) |
| 111 | + return entries |
| 112 | + |
| 113 | + |
| 114 | +def _fixture_path(entry: dict) -> pathlib.Path: |
| 115 | + return FIXTURES_DIR / f"{entry['id']}.tif" |
| 116 | + |
| 117 | + |
| 118 | +def _is_lossy(entry: dict) -> bool: |
| 119 | + tol = entry.get("tolerance") or {} |
| 120 | + return bool(tol.get("lossy", False)) |
| 121 | + |
| 122 | + |
| 123 | +def _build_param(entry: dict) -> pytest.param: |
| 124 | + """Wrap a fixture entry in a ``pytest.param`` with the right marks. |
| 125 | +
|
| 126 | + Shared codec / attrs gaps and fsspec-only gaps both become strict |
| 127 | + ``xfail`` so the cell flips to a hard failure the day the gap |
| 128 | + closes. Intentional skips get a plain skip. Slow fixtures inherit |
| 129 | + ``pytest.mark.slow`` from the corpus helper so the fast / slow |
| 130 | + split (#2062) cooperates. |
| 131 | + """ |
| 132 | + fid = entry["id"] |
| 133 | + marks = list(fast_slow_marks_for(entry)) |
| 134 | + if fid in _PARITY_GAPS: |
| 135 | + marks.append( |
| 136 | + pytest.mark.xfail(reason=_PARITY_GAPS[fid], strict=True) |
| 137 | + ) |
| 138 | + elif fid in _FSSPEC_SKIPS: |
| 139 | + marks.append( |
| 140 | + pytest.mark.xfail(reason=_FSSPEC_SKIPS[fid], strict=True) |
| 141 | + ) |
| 142 | + elif fid in _INTENTIONAL_SKIPS: |
| 143 | + marks.append(pytest.mark.skip(reason=_INTENTIONAL_SKIPS[fid])) |
| 144 | + return pytest.param(entry, id=fid, marks=marks) |
| 145 | + |
| 146 | + |
| 147 | +_FIXTURES = _resolved_fixtures() |
| 148 | +_PARAMS = [_build_param(e) for e in _FIXTURES] |
| 149 | + |
| 150 | + |
| 151 | +def _serve_via_memory(payload: bytes, fixture_id: str) -> str: |
| 152 | + """Push ``payload`` into the fsspec ``memory://`` filesystem. |
| 153 | +
|
| 154 | + Returns the ``memory:///corpus/<fixture_id>.tif`` URL the reader |
| 155 | + should open. The three-slash form is required: ``memory://`` writes |
| 156 | + must use a path beginning with ``/`` or fsspec rejects them. |
| 157 | + """ |
| 158 | + fs = fsspec.filesystem("memory") |
| 159 | + url = f"memory:///corpus/{fixture_id}.tif" |
| 160 | + # ``pipe`` is the idiomatic single-shot byte writer for fsspec |
| 161 | + # filesystems; ``test_cloud_read_byte_limit_1928.py`` uses the |
| 162 | + # same pattern. |
| 163 | + fs.pipe(f"/corpus/{fixture_id}.tif", payload) |
| 164 | + return url |
| 165 | + |
| 166 | + |
| 167 | +@pytest.fixture |
| 168 | +def memory_fs_clean(): |
| 169 | + """Wipe the fsspec memory filesystem around each test. |
| 170 | +
|
| 171 | + The memory filesystem is a process-global singleton, so leftover |
| 172 | + state from an earlier test (or the sanity-check test below) could |
| 173 | + otherwise show up in the read path. Clears before and after for |
| 174 | + belt-and-braces hygiene. |
| 175 | + """ |
| 176 | + fs = fsspec.filesystem("memory") |
| 177 | + fs.store.clear() |
| 178 | + try: |
| 179 | + yield fs |
| 180 | + finally: |
| 181 | + fs.store.clear() |
| 182 | + |
| 183 | + |
| 184 | +@pytest.mark.parametrize("manifest_entry", _PARAMS) |
| 185 | +def test_fsspec_parity(manifest_entry: dict, memory_fs_clean) -> None: |
| 186 | + """``open_geotiff('memory://...')`` agrees with the rasterio oracle. |
| 187 | +
|
| 188 | + The fixture's bytes are written into the fsspec memory filesystem |
| 189 | + and read back through ``_CloudSource``. This is the same read path |
| 190 | + a real ``s3://`` / ``gs://`` URL would take, minus the network and |
| 191 | + credentials. The oracle reads from the local on-disk fixture path |
| 192 | + because rasterio does not know about the memory filesystem; both |
| 193 | + sides see the same bytes either way. |
| 194 | + """ |
| 195 | + fixture_id = manifest_entry["id"] |
| 196 | + path = _fixture_path(manifest_entry) |
| 197 | + if not path.exists(): |
| 198 | + pytest.skip( |
| 199 | + f"fixture {fixture_id!r} has no .tif on disk; run " |
| 200 | + f"`python -m xrspatial.geotiff.tests.golden_corpus.generate` " |
| 201 | + f"to materialise the full corpus" |
| 202 | + ) |
| 203 | + |
| 204 | + with open(path, "rb") as f: |
| 205 | + payload = f.read() |
| 206 | + url = _serve_via_memory(payload, fixture_id) |
| 207 | + |
| 208 | + candidate = open_geotiff(url) |
| 209 | + # When the fixture carries pyramid overviews, hand the oracle a |
| 210 | + # factory it can use to fetch each overview level via the same |
| 211 | + # cloud-eager path. The overview-IFD scan also goes through |
| 212 | + # ``_CloudSource``, so any divergence there shows up here. |
| 213 | + overviews = manifest_entry.get("overviews") or [] |
| 214 | + factory = ( |
| 215 | + (lambda level, u=url: open_geotiff(u, overview_level=level)) |
| 216 | + if overviews and fixture_id not in _OVERVIEW_READER_GAPS |
| 217 | + else None |
| 218 | + ) |
| 219 | + compare_to_oracle( |
| 220 | + path, |
| 221 | + candidate, |
| 222 | + lossy=_is_lossy(manifest_entry), |
| 223 | + candidate_factory=factory, |
| 224 | + ) |
| 225 | + |
| 226 | + |
| 227 | +def test_taxonomy_ids_are_in_manifest() -> None: |
| 228 | + """Every id in the parity-gap, fsspec-skip, overview-gap, or |
| 229 | + intentional-skip tables must exist in the manifest. |
| 230 | +
|
| 231 | + Guards against typos: a stale entry would silently keep a |
| 232 | + known-bad fixture marked as expected-to-fail even after it was |
| 233 | + renamed or removed. |
| 234 | + """ |
| 235 | + manifest_ids = {e["id"] for e in _FIXTURES} |
| 236 | + tagged = ( |
| 237 | + set(_PARITY_GAPS) |
| 238 | + | set(_FSSPEC_SKIPS) |
| 239 | + | set(_OVERVIEW_READER_GAPS) |
| 240 | + | set(_INTENTIONAL_SKIPS) |
| 241 | + ) |
| 242 | + stale = tagged - manifest_ids |
| 243 | + assert not stale, ( |
| 244 | + f"taxonomy references unknown fixture ids: {sorted(stale)}" |
| 245 | + ) |
| 246 | + |
| 247 | + |
| 248 | +def test_fsspec_candidate_is_actually_numpy(memory_fs_clean) -> None: |
| 249 | + """Sanity check: the fsspec read path returns a numpy-backed array. |
| 250 | +
|
| 251 | + ``open_geotiff(memory_url)`` without ``chunks=`` should go through |
| 252 | + the eager ``_CloudSource`` path which downloads the object and |
| 253 | + decodes into a numpy array. If a regression accidentally wired the |
| 254 | + fsspec URI to a dask-chunked path, this test catches it. |
| 255 | +
|
| 256 | + Picks the first eligible (not skipped, on disk) fixture in sorted |
| 257 | + id order so the assertion is stable across corpus growth. |
| 258 | + """ |
| 259 | + eligible = [ |
| 260 | + e for e in _FIXTURES |
| 261 | + if e["id"] not in _PARITY_GAPS |
| 262 | + and e["id"] not in _FSSPEC_SKIPS |
| 263 | + and e["id"] not in _INTENTIONAL_SKIPS |
| 264 | + and _fixture_path(e).exists() |
| 265 | + ] |
| 266 | + if not eligible: |
| 267 | + pytest.skip("no eligible fixture on disk") |
| 268 | + entry = eligible[0] |
| 269 | + path = _fixture_path(entry) |
| 270 | + with open(path, "rb") as f: |
| 271 | + payload = f.read() |
| 272 | + url = _serve_via_memory(payload, entry["id"]) |
| 273 | + da = open_geotiff(url) |
| 274 | + assert isinstance(da.data, np.ndarray), ( |
| 275 | + f"expected a numpy.ndarray for the fsspec eager path on " |
| 276 | + f"{entry['id']!r}, got {type(da.data).__name__}" |
| 277 | + ) |
0 commit comments