From 3777dd526e13eb16653f0a2e5d1f29d3c0335fd2 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 20 May 2026 08:40:24 -0700 Subject: [PATCH 1/2] geotiff: add read-finalization helpers _finalize_eager_read / _finalize_lazy_read_attrs (#2177) Wave 1 of #2162. Add two private helpers in xrspatial/geotiff/_attrs.py that capture the read-finalization pipelines duplicated across backends. The helpers are dead code until waves 2 (#2178 dask, #2179 eager) and 3 (#2180 VRT, GPU) consume them. _finalize_eager_read: validates geo_info, populates attrs, applies the sentinel mask, casts dtype, sets nodata attrs (with pixels_present as a bool), returns an xarray.DataArray. mask_sentinel is a parameter because the three GPU eager sites derive it three different ways (MinIsWhite inversion, CPU fallback, raw nodata). _finalize_lazy_read_attrs: validates geo_info, populates attrs, sets nodata attrs with pixels_present=None per the documented dask contract from #2135 (a strict per-chunk reduction would force eager .compute()). Returns the attrs dict only; the caller assembles the dask graph and builds the DataArray itself. _validate_read_geo_info runs first in both helpers so partial attrs do not leak on validation failure. No public API change. No call sites migrated. Helper signatures are frozen so wave 2 and 3 can depend on them. --- xrspatial/geotiff/_attrs.py | 282 +++++++++ .../tests/test_finalization_helpers_2162.py | 584 ++++++++++++++++++ 2 files changed, 866 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_finalization_helpers_2162.py diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 6a9d3f63d..d8b4baa58 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -161,6 +161,7 @@ import numpy as np from ._coords import ( + coords_from_geo_info as _coords_from_geo_info, transform_tuple_from_pixel_geometry as _transform_tuple_from_pixel_geometry, ) from ._geotags import ( @@ -1128,3 +1129,284 @@ def _extract_rich_tags(attrs: dict) -> dict: 'y_resolution': attrs.get('y_resolution'), 'resolution_unit': res_unit, } + + +def _apply_eager_nodata_mask(arr, *, mask_sentinel, mask_nodata): + """Apply the nodata-to-NaN mask on an eager (host-side) numpy buffer. + + Mirrors the inline block in ``open_geotiff`` so the eager helper can + share one implementation. Returns ``(arr, nodata_pixels_present)`` + where ``arr`` may have been promoted from an integer dtype to float64 + when the sentinel matched at least one pixel, and + ``nodata_pixels_present`` is the bool used to populate + ``attrs['nodata_pixels_present']``. ``None`` means "no scan was + appropriate for this dtype / sentinel combination." + + The sentinel is taken as the ``mask_sentinel`` parameter rather than + being read from ``geo_info``. Three GPU eager sites derive it three + different ways (``_mw_mask_nodata`` local, the CPU-fallback + ``_cpu_fallback_geo._mask_nodata``, raw ``nodata``), so the helper + accepts the sentinel value directly. + """ + nodata_pixels_present: bool | None = None + if mask_sentinel is None: + return arr, nodata_pixels_present + if mask_nodata: + if arr.dtype.kind == 'f': + if not np.isnan(mask_sentinel): + mask_f = arr == arr.dtype.type(mask_sentinel) + nodata_pixels_present = bool(mask_f.any()) + if nodata_pixels_present: + arr[mask_f] = np.nan + else: + # NaN-only sentinel on a float buffer: ``mask_nodata`` is + # a no-op, but downstream may want to know if any NaN + # pixels already exist in the source so the attr stays + # informative. + nodata_pixels_present = bool(np.isnan(arr).any()) + elif arr.dtype.kind in ('u', 'i'): + # Integer arrays: convert to float to represent NaN. Gate on + # finite + integer + in-range so a sentinel that cannot match + # an integer pixel resolves to ``False`` rather than crashing + # in the equality cast (mirrors the eager block in + # ``open_geotiff`` for #1774 / #1564 / #1616). + if (np.isfinite(mask_sentinel) + and float(mask_sentinel).is_integer()): + nodata_int = int(mask_sentinel) + info = np.iinfo(arr.dtype) + if info.min <= nodata_int <= info.max: + mask = arr == arr.dtype.type(nodata_int) + nodata_pixels_present = bool(mask.any()) + if nodata_pixels_present: + arr = arr.astype(np.float64) + arr[mask] = np.nan + else: + nodata_pixels_present = False + else: + nodata_pixels_present = False + else: + # ``mask_nodata=False``: do not rewrite pixels, but still surface + # ``attrs['nodata_pixels_present']`` so callers know whether + # literal sentinel pixels survive in the buffer (issue #2135). + if arr.dtype.kind == 'f': + if np.isnan(mask_sentinel): + nodata_pixels_present = bool(np.isnan(arr).any()) + else: + nodata_pixels_present = bool( + (arr == arr.dtype.type(mask_sentinel)).any() + ) + elif arr.dtype.kind in ('u', 'i'): + if (np.isfinite(mask_sentinel) + and float(mask_sentinel).is_integer()): + nodata_int = int(mask_sentinel) + info = np.iinfo(arr.dtype) + if info.min <= nodata_int <= info.max: + nodata_pixels_present = bool( + (arr == arr.dtype.type(nodata_int)).any() + ) + else: + nodata_pixels_present = False + else: + nodata_pixels_present = False + return arr, nodata_pixels_present + + +def _finalize_eager_read( + arr, + *, + geo_info, + nodata, + mask_sentinel, + mask_nodata, + dtype, + window, + name, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, + attrs_in: dict | None = None, +): + """Validate, populate attrs, mask, cast, and build an eager DataArray. + + Wave 1 of #2162 -- ties together the four steps every eager read path + runs after the bytes land in a host (or cupy) buffer: + + 1. :func:`_validate_read_geo_info` -- runs first so a rejected file + does not leak a partially-populated attrs dict. + 2. :func:`_populate_attrs_from_geo_info` -- writes the canonical attrs + (transform / crs / georef_status / etc.) onto a fresh dict. + 3. Mask nodata pixels to NaN using ``mask_sentinel`` when + ``mask_nodata=True`` and the source declared one. Records the + ``nodata_pixels_present`` bool either way. + 4. Cast to ``dtype`` when explicit; record ``nodata_dtype_cast``. + 5. :func:`_set_nodata_attrs` -- stamps the nodata lifecycle attrs. + 6. Build an :class:`xarray.DataArray` with coords from + :func:`_coords_from_geo_info`. + + The ``mask_sentinel`` parameter is intentionally separate from + ``geo_info.nodata`` because the three GPU eager sites derive it three + different ways (``_mw_mask_nodata`` local on the stripped path, the + CPU-fallback ``_cpu_fallback_geo._mask_nodata`` on the tiled path, + raw ``nodata`` on the CPU-decode-then-upload path for URL / fsspec + sources). Read paths that don't need MinIsWhite inversion can pass + ``mask_sentinel=nodata``. + + Numpy buffers only; the GPU eager call sites pass cupy ndarrays and + rely on cupy's numpy-equivalent operators (``arr.dtype.kind``, + ``arr.astype``, boolean fancy indexing, ``arr == scalar``) so the + same helper covers both backends. The GPU eager path also passes a + masking function that runs on the device; this helper handles the + host-side path. (Wave 2 will migrate eager numpy / VRT eager; wave 3 + will migrate the three GPU eager sites with the mask step factored + out.) + + Returns a :class:`xarray.DataArray` ready for the caller to return + from the read function. The caller assembles the dask graph + separately when a lazy backend is in play; this helper is eager-only. + """ + # Step 1: validate first so partial attrs never leak. + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) + + # Step 2: populate attrs from geo_info onto a fresh dict (or onto a + # caller-supplied seed dict, which lets the GPU/VRT migration carry + # backend-specific keys through without bypassing the helper). + attrs: dict = dict(attrs_in) if attrs_in else {} + _populate_attrs_from_geo_info(attrs, geo_info, window=window) + + # Step 3: apply the nodata-to-NaN mask (or compute pixels_present + # without rewriting if ``mask_nodata=False``). Skipped entirely when + # the source declared no sentinel. + nodata_pixels_present: bool | None = None + if nodata is not None: + arr, nodata_pixels_present = _apply_eager_nodata_mask( + arr, mask_sentinel=mask_sentinel, mask_nodata=mask_nodata, + ) + + # Step 4: caller-requested dtype cast (post-mask so the integer + # promotion above runs first). ``_validate_dtype_cast`` lives in + # ``_validation``; local import keeps ``_attrs`` free of a top-level + # validation dependency for parity with ``_validate_read_geo_info``. + dtype_cast_attr: str | None = None + if dtype is not None: + from ._validation import _validate_dtype_cast + target = np.dtype(dtype) + _validate_dtype_cast(np.dtype(str(arr.dtype)), target) + arr = arr.astype(target) + dtype_cast_attr = target.name + + # Step 5: stamp the nodata lifecycle attrs. ``masked`` is True iff + # the caller opted into masking AND the final buffer dtype is float, + # mirroring the existing call sites (the integer promotion above + # only runs when the sentinel matched at least one pixel, so an + # ``int`` buffer + ``mask_nodata=True`` here means "no pixels were + # masked" rather than "masking was disabled"). + _set_nodata_attrs( + attrs, nodata, + masked=(mask_nodata and np.dtype(str(arr.dtype)).kind == 'f'), + pixels_present=nodata_pixels_present, + dtype_cast=dtype_cast_attr, + ) + + # Step 6: build the DataArray. ``_coords_from_geo_info`` honours the + # windowed-read contract (origin shifted to the window's top-left). + height, width = arr.shape[:2] + coords = _coords_from_geo_info( + geo_info, height, width, window=window, + ) + if arr.ndim == 3: + dims = ['y', 'x', 'band'] + coords['band'] = np.arange(arr.shape[2]) + else: + dims = ['y', 'x'] + + # Local import to avoid a top-level ``import xarray`` -- ``_attrs`` + # is imported from ``__init__`` and the lazy import keeps the module + # load graph the same shape it already has. + import xarray as xr + return xr.DataArray(arr, dims=dims, coords=coords, name=name, attrs=attrs) + + +def _finalize_lazy_read_attrs( + *, + geo_info, + nodata, + mask_nodata, + dtype, + window, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, + attrs_in: dict | None = None, +): + """Validate and populate attrs for dask-style lazy reads. + + Wave 1 of #2162 -- the lazy counterpart of + :func:`_finalize_eager_read`. The dask + dask-GPU backends cannot + fold the nodata mask into a single eager step because masking runs + per-chunk inside the graph; they only need the attrs side of the + pipeline. This helper: + + 1. :func:`_validate_read_geo_info` -- runs first so partial attrs + never leak on validation failure. + 2. :func:`_populate_attrs_from_geo_info` -- writes the canonical + attrs onto a fresh dict. + 3. :func:`_set_nodata_attrs` -- ``masked`` is True iff the caller + opted into masking AND the graph dtype is float. ``dtype_cast`` + is recorded when the caller passed an explicit ``dtype=`` kwarg. + ``pixels_present=None`` is the documented dask contract from + issue #2135: a strict per-chunk reduction would force an eager + ``.compute()`` and break the lazy contract, so the attr is left + absent on lazy outputs. + + Returns the attrs ``dict`` only; the caller assembles the dask graph + and builds the :class:`xarray.DataArray` itself, so this helper + deliberately does not touch arrays or coords. + + The ``dtype`` parameter accepts a numpy dtype, a string ('float64'), + or ``None`` (no explicit cast). When ``None`` the masked-flag still + has to know the graph dtype; the caller passes whatever ``dtype=`` + they resolved onto the graph (e.g. ``target_dtype`` in the dask + backend) so the helper can derive ``masked``. The integer-to-float + promotion that an eager path applies when sentinel pixels are + present is handled by the dask backend before it calls this helper; + ``dtype`` here is the resolved graph dtype, not the file dtype. + """ + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) + + attrs: dict = dict(attrs_in) if attrs_in else {} + _populate_attrs_from_geo_info(attrs, geo_info, window=window) + + # ``masked`` mirrors the eager helper's rule and the existing dask + # call site contract: the graph applies masking per-chunk only when + # ``mask_nodata=True`` AND the graph dtype is float, so an int graph + # with ``mask_nodata=True`` still carries literal sentinel values. + # ``dtype`` here is the resolved graph dtype; the dask backend + # promotes int -> float64 before calling this helper when the + # caller wants masking on an int source. + if dtype is None: + masked = False + else: + masked = bool(mask_nodata and np.dtype(dtype).kind == 'f') + + # ``dtype_cast`` records the caller-supplied ``dtype=`` kwarg so + # consumers can tell float-because-masked from float-because-cast. + # The dask backend resolves ``dtype`` for the graph internally; the + # helper exposes it via ``attrs['nodata_dtype_cast']`` when set. + dtype_cast_attr = ( + np.dtype(dtype).name if dtype is not None else None + ) + + _set_nodata_attrs( + attrs, nodata, + masked=masked, + pixels_present=None, + dtype_cast=dtype_cast_attr, + ) + + return attrs diff --git a/xrspatial/geotiff/tests/test_finalization_helpers_2162.py b/xrspatial/geotiff/tests/test_finalization_helpers_2162.py new file mode 100644 index 000000000..61221f07d --- /dev/null +++ b/xrspatial/geotiff/tests/test_finalization_helpers_2162.py @@ -0,0 +1,584 @@ +"""Unit tests for the read-finalization helpers in ``_attrs.py``. + +Issue #2177 (PR B of #2162). + +The two helpers ``_finalize_eager_read`` and ``_finalize_lazy_read_attrs`` +capture the post-decode finalization steps duplicated across the four +read backends. This test file pins their behaviour in isolation: it +synthesises ``GeoInfo`` fixtures and exercises the helpers directly, +without going through the read backends (those are migrated in waves 2 +and 3). + +The four invariants covered here: + +1. Eager helper populates the nodata lifecycle attrs (``nodata``, + ``nodata_pixels_present``, ``nodata_dtype_cast``, ``masked_nodata``) + and the ``georef_status`` attr across float and integer input dtypes. +2. ``mask_nodata=False`` leaves the buffer alone and surfaces + ``nodata_pixels_present`` without rewriting. +3. Lazy helper produces the same attrs minus ``nodata_pixels_present`` + (the dask contract from #2135). +4. Both helpers validate ``geo_info`` first so a rejected file does not + leak partial attrs onto the caller's dict. +5. Both helpers route ``mask_sentinel != nodata`` through the masking + step (the GPU MinIsWhite inversion case). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff._attrs import ( + _finalize_eager_read, + _finalize_lazy_read_attrs, +) +from xrspatial.geotiff._errors import UnparseableCRSError + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +class _FakeTransform: + """Stand-in for ``GeoInfo.transform``. Axis-aligned by default.""" + + def __init__(self, origin_x=0.0, origin_y=10.0, + pixel_width=1.0, pixel_height=-1.0, + rotated_affine=None): + self.origin_x = origin_x + self.origin_y = origin_y + self.pixel_width = pixel_width + self.pixel_height = pixel_height + self.rotated_affine = rotated_affine + + +class _FakeGeoInfo: + """Minimal ``GeoInfo`` stand-in covering the fields the helpers read. + + Mirrors the shape used in ``test_geotiff_metadata_2139.py`` so the + two test files stay in sync if the ``GeoInfo`` field set ever grows. + """ + + def __init__( + self, + *, + transform=None, + crs_epsg=None, + crs_wkt=None, + raster_type=1, # RASTER_PIXEL_IS_AREA + has_georef=True, + nodata=None, + extra_tags=None, + image_description=None, + extra_samples=None, + gdal_metadata=None, + gdal_metadata_xml=None, + x_resolution=None, + y_resolution=None, + resolution_unit=None, + ): + self.transform = transform if transform is not None else _FakeTransform() + self.crs_epsg = crs_epsg + self.crs_wkt = crs_wkt + self.raster_type = raster_type + self.has_georef = has_georef + self.nodata = nodata + self.extra_tags = extra_tags + self.image_description = image_description + self.extra_samples = extra_samples + self.gdal_metadata = gdal_metadata + self.gdal_metadata_xml = gdal_metadata_xml + self.x_resolution = x_resolution + self.y_resolution = y_resolution + self.resolution_unit = resolution_unit + + +def _default_geo_info(**overrides): + """A minimal georeferenced ``_FakeGeoInfo`` for happy-path tests.""" + base = dict(crs_epsg=4326, crs_wkt='EPSG:4326', nodata=-9999) + base.update(overrides) + return _FakeGeoInfo(**base) + + +# --------------------------------------------------------------------------- +# Eager helper: attrs surface +# --------------------------------------------------------------------------- + + +def test_eager_float_input_sets_nodata_lifecycle_attrs(): + arr = np.array([[1.0, -9999.0], [2.0, 3.0]], dtype=np.float32) + gi = _default_geo_info() + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=True, + dtype=None, + window=None, + name='t', + ) + + assert da.attrs['nodata'] == -9999 + assert da.attrs['masked_nodata'] is True + assert da.attrs['nodata_pixels_present'] is True + # No explicit dtype= cast was requested. + assert 'nodata_dtype_cast' not in da.attrs + # georef_status comes from _populate_attrs_from_geo_info. + assert da.attrs['georef_status'] == 'full' + # The sentinel pixel is now NaN. + assert np.isnan(np.asarray(da.values)[0, 1]) + + +def test_eager_int_input_promotes_and_sets_attrs(): + arr = np.array([[1, 0], [0, -1]], dtype=np.int16) + gi = _default_geo_info(nodata=0) + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=0, + mask_sentinel=0, + mask_nodata=True, + dtype=None, + window=None, + name='t', + ) + + assert da.dtype.kind == 'f' + assert da.attrs['nodata'] == 0 + assert da.attrs['masked_nodata'] is True + assert da.attrs['nodata_pixels_present'] is True + assert da.attrs['georef_status'] == 'full' + + +def test_eager_explicit_dtype_records_dtype_cast(): + arr = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + gi = _default_geo_info() + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=True, + dtype='float64', + window=None, + name='t', + ) + + assert da.dtype == np.float64 + assert da.attrs['nodata_dtype_cast'] == 'float64' + # No sentinel matched, so pixels_present is False rather than absent. + assert da.attrs['nodata_pixels_present'] is False + + +def test_eager_no_sentinel_pixels_present_is_false(): + arr = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + gi = _default_geo_info() + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=True, + dtype=None, + window=None, + name='t', + ) + + assert da.attrs['nodata_pixels_present'] is False + assert da.attrs['masked_nodata'] is True + + +# --------------------------------------------------------------------------- +# Eager helper: mask_nodata=False opt-out (issue #2052) +# --------------------------------------------------------------------------- + + +def test_eager_mask_nodata_false_skips_mask_keeps_attr_surface(): + arr = np.array([[1.0, -9999.0], [2.0, 3.0]], dtype=np.float32) + gi = _default_geo_info() + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=False, + dtype=None, + window=None, + name='t', + ) + + # The sentinel pixel is preserved literally. + assert da.values[0, 1] == -9999.0 + # ``masked_nodata=False`` per the #2092 contract. + assert da.attrs['masked_nodata'] is False + # ``nodata_pixels_present`` still surfaces so callers know a sentinel + # pixel exists in the buffer (#2135). + assert da.attrs['nodata_pixels_present'] is True + + +def test_eager_mask_nodata_false_no_sentinel_present(): + arr = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + gi = _default_geo_info() + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=False, + dtype=None, + window=None, + name='t', + ) + + assert da.attrs['nodata_pixels_present'] is False + assert da.attrs['masked_nodata'] is False + + +# --------------------------------------------------------------------------- +# Eager helper: no declared sentinel +# --------------------------------------------------------------------------- + + +def test_eager_no_nodata_omits_nodata_attrs(): + arr = np.array([[1.0, 2.0]], dtype=np.float32) + gi = _default_geo_info(nodata=None) + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=None, + mask_sentinel=None, + mask_nodata=True, + dtype=None, + window=None, + name='t', + ) + + assert 'nodata' not in da.attrs + assert 'masked_nodata' not in da.attrs + assert 'nodata_pixels_present' not in da.attrs + assert 'nodata_dtype_cast' not in da.attrs + + +# --------------------------------------------------------------------------- +# Eager helper: mask_sentinel != nodata (GPU MinIsWhite inversion, #1809) +# --------------------------------------------------------------------------- + + +def test_eager_mask_sentinel_differs_from_nodata(): + # MinIsWhite inverts the sentinel: nodata=0 on disk but the in-memory + # buffer has been inverted, so the masking value is 255 (8-bit) not 0. + arr = np.array([[100.0, 255.0], [50.0, 0.0]], dtype=np.float32) + gi = _default_geo_info(nodata=0) + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=0, # canonical attrs['nodata'] keeps the on-disk sentinel + mask_sentinel=255, # actual value to match in the in-memory buffer + mask_nodata=True, + dtype=None, + window=None, + name='t', + ) + + # attrs['nodata'] keeps the file sentinel so a round-trip write + # rebuilds the original GDAL_NODATA tag. + assert da.attrs['nodata'] == 0 + # The pixel matching mask_sentinel=255 became NaN; the literal-0 + # pixel was left alone because mask_sentinel != 0. + arr_out = np.asarray(da.values) + assert np.isnan(arr_out[0, 1]) + assert arr_out[1, 1] == 0.0 + assert da.attrs['nodata_pixels_present'] is True + + +# --------------------------------------------------------------------------- +# Eager helper: validate-first ordering (no partial attrs leak) +# --------------------------------------------------------------------------- + + +def test_eager_raises_on_unparseable_crs_without_partial_attrs(): + # Try to bypass pyproj entirely: if pyproj is missing, the check + # short-circuits and the test would be a no-op. Skip in that case. + pytest.importorskip('pyproj') + + arr = np.array([[1.0, 2.0]], dtype=np.float32) + # An obviously-bad WKT string. The unparseable-CRS check raises + # before any attrs population step runs. + gi = _default_geo_info(crs_wkt='NOT-A-VALID-WKT-STRING') + + # ``attrs_in`` is mutated only via ``dict(attrs_in)`` copy, but the + # important contract is "if validation raises, no DataArray is built + # and the caller's seed dict is untouched." Pass a seed dict and + # confirm it is unchanged afterwards. + seed = {'sentinel_marker': True} + with pytest.raises(UnparseableCRSError): + _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=True, + dtype=None, + window=None, + name='t', + attrs_in=seed, + ) + # Seed dict was never written to; the validation failure raised + # before any ``attrs[...]`` step ran. + assert seed == {'sentinel_marker': True} + + +def test_eager_allow_unparseable_crs_bypasses_check(): + pytest.importorskip('pyproj') + + arr = np.array([[1.0, 2.0]], dtype=np.float32) + gi = _default_geo_info(crs_wkt='NOT-A-VALID-WKT-STRING') + + # Opt-in bypass: the unparseable WKT lands on the attrs unchanged. + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=True, + dtype=None, + window=None, + name='t', + allow_unparseable_crs=True, + ) + assert da.attrs['crs_wkt'] == 'NOT-A-VALID-WKT-STRING' + + +# --------------------------------------------------------------------------- +# Eager helper: attrs_in seed forwarded onto the DataArray +# --------------------------------------------------------------------------- + + +def test_eager_attrs_in_seed_is_copied_onto_dataarray(): + arr = np.array([[1.0]], dtype=np.float32) + gi = _default_geo_info() + + seed = {'extra_user_attr': 'kept'} + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=None, + mask_sentinel=None, + mask_nodata=True, + dtype=None, + window=None, + name='t', + attrs_in=seed, + ) + + assert da.attrs['extra_user_attr'] == 'kept' + # The seed dict was not mutated -- the helper copies before writing. + assert seed == {'extra_user_attr': 'kept'} + + +# --------------------------------------------------------------------------- +# Lazy helper: attrs surface (pixels_present is None per #2135 dask contract) +# --------------------------------------------------------------------------- + + +def test_lazy_float_dtype_sets_masked_true_no_pixels_present_attr(): + gi = _default_geo_info() + + attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=-9999, + mask_nodata=True, + dtype='float64', + window=None, + ) + + assert attrs['nodata'] == -9999 + assert attrs['masked_nodata'] is True + assert attrs['nodata_dtype_cast'] == 'float64' + # pixels_present stays absent on the lazy path (#2135 dask contract). + assert 'nodata_pixels_present' not in attrs + assert attrs['georef_status'] == 'full' + + +def test_lazy_int_graph_dtype_keeps_masked_false(): + gi = _default_geo_info(nodata=0) + + attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=0, + mask_nodata=True, + dtype='int16', + window=None, + ) + + # Integer graph -> the per-chunk mask cannot have run, so masked=False + # mirrors the #2092 contract even though the caller asked for masking. + assert attrs['masked_nodata'] is False + assert attrs['nodata_dtype_cast'] == 'int16' + + +def test_lazy_mask_nodata_false_sets_masked_false(): + gi = _default_geo_info() + + attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=-9999, + mask_nodata=False, + dtype='float64', + window=None, + ) + + assert attrs['masked_nodata'] is False + assert 'nodata_pixels_present' not in attrs + + +def test_lazy_no_dtype_resolves_to_masked_false(): + # ``dtype=None`` means the caller didn't resolve a graph dtype to + # compare against (matches the pre-#2135 dask paths in tests). + gi = _default_geo_info() + + attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=-9999, + mask_nodata=True, + dtype=None, + window=None, + ) + + assert attrs['masked_nodata'] is False + assert 'nodata_dtype_cast' not in attrs + + +def test_lazy_no_nodata_omits_nodata_attrs(): + gi = _default_geo_info(nodata=None) + + attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=None, + mask_nodata=True, + dtype='float64', + window=None, + ) + + assert 'nodata' not in attrs + assert 'masked_nodata' not in attrs + assert 'nodata_pixels_present' not in attrs + assert 'nodata_dtype_cast' not in attrs + + +# --------------------------------------------------------------------------- +# Lazy helper: validate-first ordering +# --------------------------------------------------------------------------- + + +def test_lazy_raises_on_unparseable_crs_without_partial_attrs(): + pytest.importorskip('pyproj') + + gi = _default_geo_info(crs_wkt='NOT-A-VALID-WKT-STRING') + seed = {'sentinel_marker': True} + + with pytest.raises(UnparseableCRSError): + _finalize_lazy_read_attrs( + geo_info=gi, + nodata=-9999, + mask_nodata=True, + dtype='float64', + window=None, + attrs_in=seed, + ) + assert seed == {'sentinel_marker': True} + + +def test_lazy_allow_unparseable_crs_bypasses_check(): + pytest.importorskip('pyproj') + + gi = _default_geo_info(crs_wkt='NOT-A-VALID-WKT-STRING') + + attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=-9999, + mask_nodata=True, + dtype='float64', + window=None, + allow_unparseable_crs=True, + ) + assert attrs['crs_wkt'] == 'NOT-A-VALID-WKT-STRING' + + +# --------------------------------------------------------------------------- +# Both helpers: parity on the non-mask attrs surface +# --------------------------------------------------------------------------- + + +def test_lazy_and_eager_produce_same_georef_and_nodata_attrs(): + # Same input on both paths should produce the same attrs dict apart + # from ``nodata_pixels_present`` (eager-only by design). + arr = np.array([[1.0, 2.0]], dtype=np.float32) + gi = _default_geo_info() + + da = _finalize_eager_read( + arr, + geo_info=gi, + nodata=-9999, + mask_sentinel=-9999, + mask_nodata=True, + dtype='float64', + window=None, + name='t', + ) + lazy_attrs = _finalize_lazy_read_attrs( + geo_info=gi, + nodata=-9999, + mask_nodata=True, + dtype='float64', + window=None, + ) + + keys_to_check = { + 'crs', 'crs_wkt', 'transform', 'georef_status', + 'nodata', 'masked_nodata', 'nodata_dtype_cast', + '_xrspatial_geotiff_contract', + } + for key in keys_to_check: + assert key in da.attrs, key + assert key in lazy_attrs, key + assert da.attrs[key] == lazy_attrs[key], key + # And the lazy path does not carry the eager-only attr. + assert 'nodata_pixels_present' in da.attrs + assert 'nodata_pixels_present' not in lazy_attrs + + +# --------------------------------------------------------------------------- +# Lazy helper: mask_sentinel != nodata is a no-op (helper takes attrs only) +# --------------------------------------------------------------------------- + + +def test_lazy_helper_signature_omits_mask_sentinel(): + # The lazy helper deliberately does not accept ``mask_sentinel`` -- + # the dask graph applies masking per-chunk, and the per-chunk task + # closes over the sentinel value separately. Pin the signature here + # so a future refactor that adds ``mask_sentinel`` triggers a review. + import inspect + + sig = inspect.signature(_finalize_lazy_read_attrs) + assert 'mask_sentinel' not in sig.parameters + + +def test_eager_helper_signature_includes_mask_sentinel(): + # Mirror of the lazy check: the eager helper does take + # ``mask_sentinel`` because the three GPU eager sites derive it three + # different ways. + import inspect + + sig = inspect.signature(_finalize_eager_read) + assert 'mask_sentinel' in sig.parameters From b5e84ea0f3fae7450ea24045fdc57076bcbe15ec Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 20 May 2026 08:46:55 -0700 Subject: [PATCH 2/2] Address review feedback (#2177) - Move ``import xarray as xr`` to module scope. xarray is already a hard dependency, so the per-call local import was unnecessary indirection. - Document the shallow-copy semantics of ``attrs_in`` on both helpers so wave 2 / wave 3 migrators know nested values are shared with the caller's seed dict. - Pin the int-dtype ``nodata_dtype_cast`` value in the lazy helper test so wave 2 catches any drift from the conflated-dtype semantics. - Strengthen the seed-dict-untouched assertions with a ``len(seed) == 1`` check so a future partial-leak that adds new keys is caught even if the original key still matches. --- xrspatial/geotiff/_attrs.py | 62 +++++++++++++------ .../tests/test_finalization_helpers_2162.py | 12 +++- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index d8b4baa58..6618aaac5 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -160,6 +160,8 @@ import numpy as np +import xarray as xr + from ._coords import ( coords_from_geo_info as _coords_from_geo_info, transform_tuple_from_pixel_geometry as _transform_tuple_from_pixel_geometry, @@ -1250,18 +1252,30 @@ def _finalize_eager_read( sources). Read paths that don't need MinIsWhite inversion can pass ``mask_sentinel=nodata``. - Numpy buffers only; the GPU eager call sites pass cupy ndarrays and - rely on cupy's numpy-equivalent operators (``arr.dtype.kind``, - ``arr.astype``, boolean fancy indexing, ``arr == scalar``) so the - same helper covers both backends. The GPU eager path also passes a - masking function that runs on the device; this helper handles the - host-side path. (Wave 2 will migrate eager numpy / VRT eager; wave 3 - will migrate the three GPU eager sites with the mask step factored - out.) + Wave migration plan: + + * Wave 2 (#2178 dask, #2179 eager numpy) migrates the eager numpy + paths. The mask block inside this helper matches the inline block + in ``open_geotiff`` field-for-field; the migration is a straight + swap. + * Wave 3 (#2180 VRT, GPU) migrates the VRT eager + three GPU eager + sites. The VRT eager path is host-side and works with the helper + as-is. The GPU sites apply masking through a CUDA kernel + (``_apply_nodata_mask_gpu_with_presence``); they can either + pre-mask and call the helper with ``nodata=None`` to skip the + helper's host-side mask block, or wave 3 can extend this + helper's signature with a ``mask_fn`` injection. Either choice + lives in #2180; the wave 1 contract here is the host-side path. Returns a :class:`xarray.DataArray` ready for the caller to return from the read function. The caller assembles the dask graph separately when a lazy backend is in play; this helper is eager-only. + + ``attrs_in`` is shallow-copied via ``dict(attrs_in)``. Nested values + (e.g. ``extra_tags`` list, ``gdal_metadata`` dict) are shared between + the caller's seed dict and the returned DataArray's attrs; mutating + a nested value after the call propagates both ways. Callers that + care about isolation can ``copy.deepcopy(attrs_in)`` first. """ # Step 1: validate first so partial attrs never leak. _validate_read_geo_info( @@ -1322,10 +1336,6 @@ def _finalize_eager_read( else: dims = ['y', 'x'] - # Local import to avoid a top-level ``import xarray`` -- ``_attrs`` - # is imported from ``__init__`` and the lazy import keeps the module - # load graph the same shape it already has. - import xarray as xr return xr.DataArray(arr, dims=dims, coords=coords, name=name, attrs=attrs) @@ -1365,13 +1375,27 @@ def _finalize_lazy_read_attrs( deliberately does not touch arrays or coords. The ``dtype`` parameter accepts a numpy dtype, a string ('float64'), - or ``None`` (no explicit cast). When ``None`` the masked-flag still - has to know the graph dtype; the caller passes whatever ``dtype=`` - they resolved onto the graph (e.g. ``target_dtype`` in the dask - backend) so the helper can derive ``masked``. The integer-to-float - promotion that an eager path applies when sentinel pixels are - present is handled by the dask backend before it calls this helper; - ``dtype`` here is the resolved graph dtype, not the file dtype. + or ``None``. It is the **resolved graph dtype** the dask backend + settled on (e.g. ``target_dtype`` after the int->float64 promotion + that ``mask_nodata=True`` triggers on int files): the helper uses + it to derive ``masked`` and writes it as ``nodata_dtype_cast`` when + non-None. + + Wave 2 migration note: the current pre-helper dask backend + distinguishes "caller explicitly passed ``dtype=``" from + "graph dtype was auto-promoted by masking" so that + ``nodata_dtype_cast`` surfaces only on the explicit-cast case. + This helper conflates the two -- whatever ``dtype`` value the + caller passes here becomes the ``nodata_dtype_cast`` attr. The + migration PR (#2178) can either accept that change, or split the + helper's ``dtype`` into two parameters at that point. Frozen + signature here per #2177 means we ship the one-``dtype`` shape + and leave the split for wave 2 if it turns out to matter. + + ``attrs_in`` is shallow-copied via ``dict(attrs_in)``. Nested values + are shared between the caller's seed dict and the returned attrs; + callers that care about isolation can ``copy.deepcopy(attrs_in)`` + first. """ _validate_read_geo_info( geo_info, window=window, diff --git a/xrspatial/geotiff/tests/test_finalization_helpers_2162.py b/xrspatial/geotiff/tests/test_finalization_helpers_2162.py index 61221f07d..bbde2b710 100644 --- a/xrspatial/geotiff/tests/test_finalization_helpers_2162.py +++ b/xrspatial/geotiff/tests/test_finalization_helpers_2162.py @@ -334,8 +334,11 @@ def test_eager_raises_on_unparseable_crs_without_partial_attrs(): attrs_in=seed, ) # Seed dict was never written to; the validation failure raised - # before any ``attrs[...]`` step ran. + # before any ``attrs[...]`` step ran. Check both the exact contents + # AND the length so a future partial-leak that adds new keys is + # caught even if the existing key still matches. assert seed == {'sentinel_marker': True} + assert len(seed) == 1 def test_eager_allow_unparseable_crs_bypasses_check(): @@ -424,6 +427,12 @@ def test_lazy_int_graph_dtype_keeps_masked_false(): # Integer graph -> the per-chunk mask cannot have run, so masked=False # mirrors the #2092 contract even though the caller asked for masking. assert attrs['masked_nodata'] is False + # Wave 1 contract: whatever ``dtype`` the caller passes lands as + # ``nodata_dtype_cast``. The lazy helper does not distinguish "caller + # explicitly passed dtype=int16" from "graph dtype is int16 by + # default"; that split is wave 2's call to make per the helper + # docstring. Pinning the int16 value here locks in the conflated + # semantics so wave 2 catches any drift. assert attrs['nodata_dtype_cast'] == 'int16' @@ -497,6 +506,7 @@ def test_lazy_raises_on_unparseable_crs_without_partial_attrs(): attrs_in=seed, ) assert seed == {'sentinel_marker': True} + assert len(seed) == 1 def test_lazy_allow_unparseable_crs_bypasses_check():