|
| 1 | +"""Consolidated VRT ``missing_sources`` policy matrix (#2367, work item of #2342). |
| 2 | +
|
| 3 | +This file complements ``test_vrt_missing_sources_policy_1799.py`` and |
| 4 | +``test_vrt_chunked_missing_sources_1799.py`` by covering the full |
| 5 | +release contract in one place: every policy value (default, |
| 6 | +``'raise'``, ``'warn'``, invalid) is exercised against both read paths |
| 7 | +(eager ``read_vrt`` and dask ``open_geotiff(..., chunks=...)``), with |
| 8 | +assertions on the exception or warning type, the message text, and the |
| 9 | +actual output array values where applicable. |
| 10 | +
|
| 11 | +The existing 1799 / 1843 / 2265 tests pin individual cases. This file |
| 12 | +keeps the four-by-two matrix together so a future kwarg refactor that |
| 13 | +silently drops parity between the eager and chunked paths regresses a |
| 14 | +single, focused test file. |
| 15 | +
|
| 16 | +Release contract (see ``_backends/vrt.py:206`` docstring): |
| 17 | +
|
| 18 | +* ``'raise'`` is the default since #1860. |
| 19 | +* ``'raise'`` fails fast with ``FileNotFoundError`` naming the missing |
| 20 | + source path. The chunked path raises at build time (#2265) so a |
| 21 | + partial mosaic never surfaces silently from a delayed compute. |
| 22 | +* ``'warn'`` is the explicit opt-in. It emits |
| 23 | + ``GeoTIFFFallbackWarning`` naming the missing source and returns the |
| 24 | + mosaic with NaN (or the band's nodata sentinel) in the corresponding |
| 25 | + region. ``attrs['vrt_holes']`` records the affected source(s). |
| 26 | +* Any other value raises ``ValueError`` naming the bad kwarg. |
| 27 | +""" |
| 28 | +from __future__ import annotations |
| 29 | + |
| 30 | +import os |
| 31 | +import warnings |
| 32 | + |
| 33 | +import numpy as np |
| 34 | +import pytest |
| 35 | +import xarray as xr |
| 36 | + |
| 37 | +from xrspatial.geotiff import ( |
| 38 | + GeoTIFFFallbackWarning, |
| 39 | + open_geotiff, |
| 40 | + read_vrt, |
| 41 | + to_geotiff, |
| 42 | +) |
| 43 | + |
| 44 | + |
# Value written into every pixel of the on-disk source tile; the warn-policy
# tests compare the present half of the mosaic against it.
PRESENT_FILL: float = 7.0
| 46 | + |
| 47 | + |
def _build_partial_vrt(tmp_path) -> tuple[str, str, str]:
    """Write a two-source VRT whose right-hand source does not exist.

    A 4x4 float32 GeoTIFF filled with ``PRESENT_FILL`` is written under
    ``tmp_path`` and mapped onto columns 0-3 of an 8x4 VRT. Columns 4-7
    reference a second file name that is never created.

    Returns ``(vrt_path, present_src_path, missing_path)``. All three
    file names carry the ``2367`` issue tag.
    """
    present_path = os.path.join(tmp_path, "src_2367_present.tif")
    missing_path = os.path.join(tmp_path, "missing_2367.tif")
    vrt_path = os.path.join(tmp_path, "partial_2367.vrt")

    # Only the left-hand tile is materialised on disk.
    tile = xr.DataArray(
        np.full((4, 4), PRESENT_FILL, dtype=np.float32),
        dims=("y", "x"),
        attrs={"transform": (1.0, 0.0, 0.0, 0.0, -1.0, 0.0)},
    )
    to_geotiff(tile, present_path)

    xml_lines = [
        '<VRTDataset rasterXSize="8" rasterYSize="4">',
        '<GeoTransform>0.0, 1.0, 0.0, 0.0, 0.0, -1.0</GeoTransform>',
        '<VRTRasterBand dataType="Float32" band="1">',
    ]
    # One <SimpleSource> per tile; they differ only in the file name and
    # in the destination column offset.
    for filename, dst_x_off in ((present_path, 0), (missing_path, 4)):
        xml_lines.extend([
            '<SimpleSource>',
            f'<SourceFilename relativeToVRT="0">{filename}</SourceFilename>',
            '<SourceBand>1</SourceBand>',
            '<SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>',
            f'<DstRect xOff="{dst_x_off}" yOff="0" xSize="4" ySize="4"/>',
            '</SimpleSource>',
        ])
    xml_lines.extend(['</VRTRasterBand>', '</VRTDataset>'])

    with open(vrt_path, "w") as handle:
        handle.write("".join(line + "\n" for line in xml_lines))
    return vrt_path, present_path, missing_path
| 87 | + |
| 88 | + |
| 89 | +# --------------------------------------------------------------------------- |
| 90 | +# Reader-path fixtures. Each "reader" callable accepts ``(source, |
| 91 | +# **kwargs)`` and returns a DataArray. The eager reader returns a numpy- |
| 92 | +# backed array; the dask reader returns a chunked DataArray that still |
| 93 | +# needs ``.compute()`` to materialise values. |
| 94 | +# --------------------------------------------------------------------------- |
| 95 | + |
def _eager_reader(source, **kwargs):
    """Read *source* via ``read_vrt``, forwarding every keyword unchanged."""
    result = read_vrt(source, **kwargs)
    return result
| 98 | + |
| 99 | + |
def _dask_reader(source, **kwargs):
    """Read *source* via ``open_geotiff`` with ``chunks=4``.

    ``open_geotiff`` routes ``.vrt`` to ``read_vrt`` and forwards
    ``chunks=`` / ``missing_sources=`` unchanged. The small chunk size
    keeps the partial mosaic split across multiple tasks so the lazy
    path is genuinely exercised.
    """
    lazy = open_geotiff(source, chunks=4, **kwargs)
    return lazy
| 106 | + |
| 107 | + |
# Parametrisation table shared by the policy tests: one entry per read
# path, each with a stable, human-readable test id.
READERS = [
    pytest.param(reader_fn, id=reader_id)
    for reader_fn, reader_id in (
        (_eager_reader, "eager_read_vrt"),
        (_dask_reader, "dask_open_geotiff_chunks"),
    )
]
| 112 | + |
| 113 | + |
| 114 | +# --------------------------------------------------------------------------- |
| 115 | +# Default policy: no kwarg -> raises. |
| 116 | +# --------------------------------------------------------------------------- |
| 117 | + |
class TestDefaultPolicyRaises:
    """No ``missing_sources`` kwarg -> ``FileNotFoundError`` naming the
    missing source. This is the public default since #1860 and the
    release matrix in #2342 calls it out as a hard contract."""

    @pytest.mark.parametrize("reader", READERS)
    def test_default_raises_filenotfound_naming_source(
        self, reader, tmp_path,
    ):
        """Calling ``reader`` with no policy kwarg must raise
        ``FileNotFoundError`` whose text contains the missing file's
        basename."""
        # Only the VRT path is needed; the two source paths returned by
        # the helper are discarded rather than bound to unused locals.
        vrt_path, _, _ = _build_partial_vrt(str(tmp_path))
        with pytest.raises(FileNotFoundError) as excinfo:
            reader(vrt_path)
        # The basename of the missing source must appear in the
        # message. The chunked path quotes the full path; the eager
        # path may quote just the source filename or the resolved
        # absolute path depending on which guard fires first. Match on
        # the basename to stay portable across both.
        assert "missing_2367.tif" in str(excinfo.value), (
            "Default policy raise must name the missing source. "
            f"Got: {excinfo.value!r}"
        )
| 139 | + |
| 140 | + |
| 141 | +# --------------------------------------------------------------------------- |
| 142 | +# Explicit raise: same shape as default. |
| 143 | +# --------------------------------------------------------------------------- |
| 144 | + |
class TestExplicitRaisePolicy:
    """Passing ``missing_sources='raise'`` explicitly must behave exactly
    like omitting the kwarg, so an explicit value cannot silently take a
    different code branch from the default."""

    @pytest.mark.parametrize("reader", READERS)
    def test_explicit_raise_matches_default(self, reader, tmp_path):
        """Both read paths raise ``FileNotFoundError`` naming the
        missing source when ``'raise'`` is spelled out."""
        vrt_file = _build_partial_vrt(str(tmp_path))[0]
        with pytest.raises(FileNotFoundError) as raised:
            reader(vrt_file, missing_sources="raise")
        message = str(raised.value)
        assert "missing_2367.tif" in message
| 156 | + |
| 157 | + |
| 158 | +# --------------------------------------------------------------------------- |
| 159 | +# Warn opt-in: warning class, message, and output values all pinned. |
| 160 | +# --------------------------------------------------------------------------- |
| 161 | + |
class TestWarnPolicyEmitsWarningAndFillsNodata:
    """``missing_sources='warn'`` is the lenient opt-in.

    Three things to lock in:

    1. The warning class is ``GeoTIFFFallbackWarning`` (not a bare
       ``UserWarning``) and the message names the missing source.
    2. ``attrs['vrt_holes']`` records the affected source.
    3. The returned array shows ``PRESENT_FILL`` on the present half
       and NaN on the missing half. The eager path materialises this
       immediately; the chunked path needs ``.compute()`` and emits the
       warning at compute time rather than build time, but the
       resulting array values must match.
    """

    def test_eager_warn_emits_and_fills(self, tmp_path):
        """Eager ``read_vrt`` warns, records the hole, and NaN-fills."""
        vrt_path, _, _ = _build_partial_vrt(str(tmp_path))
        # ``match=`` is applied as a regular expression, so the dot in
        # the file name is escaped to require a literal ``.`` rather
        # than any character.
        with pytest.warns(
            GeoTIFFFallbackWarning, match=r"missing_2367\.tif",
        ):
            da = read_vrt(vrt_path, missing_sources="warn")

        # vrt_holes attr is populated and points at the missing file.
        assert "vrt_holes" in da.attrs
        sources = [h["source"] for h in da.attrs["vrt_holes"]]
        assert any(s.endswith("missing_2367.tif") for s in sources)

        # Output values: present half == 7.0, missing half == NaN.
        out = np.asarray(da)
        np.testing.assert_array_equal(
            out[:, :4], np.full((4, 4), PRESENT_FILL, dtype=np.float32),
        )
        assert np.all(np.isnan(out[:, 4:])), (
            "Lenient policy must leave the missing region as NaN on "
            "float bands."
        )

    def test_dask_warn_emits_at_compute_and_fills(self, tmp_path):
        """Chunked ``open_geotiff`` records the hole at build time and
        warns / NaN-fills when the array is computed."""
        vrt_path, _, _ = _build_partial_vrt(str(tmp_path))
        # Build the lazy DataArray. The parse-time sweep populates
        # ``vrt_holes`` here without forcing a decode.
        da = open_geotiff(
            vrt_path, chunks=4, missing_sources="warn",
        )
        assert "vrt_holes" in da.attrs, (
            "Chunked warn path must populate vrt_holes at build so "
            "callers can branch on partial mosaics without computing."
        )

        # Record every warning raised during compute; "always" stops
        # the once-per-location filter from hiding a repeat emission.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            computed = da.compute()

        msgs = [
            str(w.message) for w in caught
            if isinstance(w.message, GeoTIFFFallbackWarning)
        ]
        assert any("missing_2367.tif" in m for m in msgs), (
            f"Chunked warn path must emit GeoTIFFFallbackWarning at "
            f"compute naming the missing source; got: {msgs!r}"
        )

        out = np.asarray(computed)
        np.testing.assert_array_equal(
            out[:, :4], np.full((4, 4), PRESENT_FILL, dtype=np.float32),
        )
        assert np.all(np.isnan(out[:, 4:]))
| 231 | + |
| 232 | + |
| 233 | +# --------------------------------------------------------------------------- |
| 234 | +# Invalid policy strings. |
| 235 | +# --------------------------------------------------------------------------- |
| 236 | + |
class TestInvalidPolicyRejected:
    """Unrecognised ``missing_sources`` values raise ``ValueError`` at
    the public-API boundary, and the message names the bad value so
    typos like ``'raises'`` surface clearly.

    The chunked path is covered as well: the same value-validation
    block runs before ``_read_vrt_chunked`` dispatches, so the eager
    and chunked invocations both reject identically."""

    @pytest.mark.parametrize("reader", READERS)
    @pytest.mark.parametrize(
        "bad_value",
        [
            "ignore",
            "RAISE",
            "raises",
            "",
            "warn ",
            "1",
        ],
    )
    def test_invalid_policy_raises_value_error_naming_value(
        self, reader, bad_value, tmp_path,
    ):
        """Each bad value is rejected with a ``ValueError`` that names
        both the kwarg and the offending value."""
        vrt_file = _build_partial_vrt(str(tmp_path))[0]
        with pytest.raises(ValueError) as raised:
            reader(vrt_file, missing_sources=bad_value)

        msg = str(raised.value)
        names_kwarg = "missing_sources" in msg
        assert names_kwarg, (
            f"ValueError must name the kwarg; got {msg!r}"
        )

        # The current implementation quotes the bad value via repr().
        # Comparing against repr() keeps the check robust across the
        # few acceptable formats (single quotes, double quotes, empty
        # string repr).
        quoted_value = repr(bad_value)
        assert quoted_value in msg, (
            f"ValueError must echo the bad value back to the caller; "
            f"got {msg!r}"
        )
0 commit comments