diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index 7abec7c04..7ad7a401c 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,3 +1,4 @@ module,last_inspected,issue,severity_max,categories_found,notes geotiff,2026-05-15,1909,HIGH,4;5,"Re-audit 2026-05-15 (agent-a55b69cec1ef2a092 worktree, branch deep-sweep-metadata-geotiff-2026-05-15). 4-backend (numpy/cupy/dask+numpy/dask+cupy) parity reverified after the #1813 modular refactor: full reads, windowed reads, multi-band, band=N selection, no-georef integer pixel coords, crs/crs_wkt/transform/nodata/x_resolution/y_resolution/resolution_unit/image_description/gdal_metadata all agree across backends. DataArray .name and dims agree (y, x for 2D; y, x, band for 3D). NEW HIGH finding #1909: GDS chunked GPU path (_read_geotiff_gpu_chunked_gds) declared the dask graph dtype as float64 when source had an in-range integer nodata sentinel, matching the CPU dask path's #1597 contract, but the per-chunk _chunk_task did not cast its returned cupy array to declared_dtype -- chunks with no sentinel hit returned the raw uint16/int16 source dtype, producing a silent declared/actual dtype mismatch. Fix mirrors the #1597 + #1624 CPU dask pattern: compute declared_dtype before defining _chunk_task, cast inside the task only when arr.dtype != declared_dtype to skip the no-op astype(copy=True). 6 regression tests added in test_chunked_gpu_declared_dtype_1909.py covering declared vs computed parity, CPU/GPU dask declared-dtype agreement, eager paths preserve source dtype, no-nodata round-trip, explicit dtype= kwarg, and sentinel-hit float64 promotion. Pre-existing test failures in test_predictor2_big_endian_gpu_1517.py and test_size_param_validation_gpu_vrt_1776.py exist on main (read_to_array AttributeError after #1813 refactor, tile_size=4 rejected by stricter _validate_tile_size_arg) and are unrelated to this audit." +rasterize,2026-05-17,2018,HIGH,1;2;4,"rasterize() drops like.attrs, rebuilds like.coords via linspace (not bit-identical), and never emits _FillValue/nodatavals even when fill is non-NaN. Cat 1 HIGH: chained pipelines like slope(rasterize(gdf, like=elevation)) silently lose crs/res/transform. Cat 2 MEDIUM: linspace round-trip from re-derived bounds breaks xr.align with like. Cat 4 MEDIUM: rasterize(..., fill=-9999, dtype=int32) emits no _FillValue. All 4 backends share the same final return so the fix is one place. Fixed in deep-sweep-metadata-rasterize-2026-05-17-01 (worktree agent-ab7a9aee97c1e4cdf): _extract_grid_from_like now returns coords/attrs; rasterize() reuses like.coords directly when grid matches, copies like.attrs, and emits _FillValue + nodatavals when fill is not NaN. 9 new tests in TestMetadataPropagation cover attrs propagation, bit-identical coord reuse, fill-value emission, isolation from template attrs, and parity across numpy/cupy/dask+numpy/dask+cupy backends. Full test suite (193 passing) clean." reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py index d668003b4..30a79a6d4 100644 --- a/xrspatial/rasterize.py +++ b/xrspatial/rasterize.py @@ -1928,7 +1928,13 @@ def _parse_input(geometries, column=None, columns=None): def _extract_grid_from_like(like): - """Extract width, height, bounds, dtype from a template DataArray.""" + """Extract width, height, bounds, dtype from a template DataArray. + + Also returns the input coords and attrs so the caller can propagate + them onto the output raster. Reusing ``like.coords`` directly (rather + than rebuilding with ``linspace``) keeps the output bit-identical to + the template and so preserves ``xr.align`` compatibility. + """ if not isinstance(like, xr.DataArray): raise TypeError("'like' must be an xr.DataArray") if like.ndim != 2 or 'y' not in like.dims or 'x' not in like.dims: @@ -1956,7 +1962,17 @@ def _extract_grid_from_like(like): ymin = float(np.min(y)) - py / 2 ymax = float(np.max(y)) + py / 2 - return width, height, (xmin, ymin, xmax, ymax), dt + # Carry through any non-dim coords (e.g. rioxarray's ``spatial_ref`` + # CRS coord). The y/x dim coords are returned separately because the + # caller decides whether to reuse them (bit-identical grid) or build + # fresh ones (resized grid). + extra_coords = { + k: v for k, v in like.coords.items() if k not in ('x', 'y') + } + + return (width, height, (xmin, ymin, xmax, ymax), dt, + like.coords['x'], like.coords['y'], + extra_coords, dict(like.attrs)) # --------------------------------------------------------------------------- @@ -2119,8 +2135,13 @@ def rasterize( # Extract defaults from template raster like_width = like_height = like_bounds = like_dtype = None + like_x_coord = like_y_coord = None + like_extra_coords = {} + like_attrs = None + bounds_explicit = bounds is not None if like is not None: - like_width, like_height, like_bounds, like_dtype = \ + (like_width, like_height, like_bounds, like_dtype, + like_x_coord, like_y_coord, like_extra_coords, like_attrs) = \ _extract_grid_from_like(like) # Parse input geometries @@ -2218,15 +2239,61 @@ def rasterize( final_height, final_width, fill, final_dtype, all_touched, merge_fn) - # Build coordinates - px = (xmax - xmin) / final_width - py = (ymax - ymin) / final_height - x_coords = np.linspace(xmin + px / 2, xmax - px / 2, final_width) - y_coords = np.linspace(ymax - py / 2, ymin + py / 2, final_height) + # Build coordinates. When the caller didn't override the grid (no + # explicit width/height/bounds/resolution that resizes the output), + # reuse like.coords directly so the output is bit-identical to the + # template and xr.align keeps working. Float-equal bound comparison + # would be fragile, so key the reuse on size + "bounds weren't + # overridden" instead. + reuse_like_coords = ( + like_x_coord is not None + and like_x_coord.sizes['x'] == final_width + and like_y_coord.sizes['y'] == final_height + and not bounds_explicit + and resolution is None + ) + if reuse_like_coords: + x_coords = like_x_coord + y_coords = like_y_coord + else: + px = (xmax - xmin) / final_width + py = (ymax - ymin) / final_height + x_coords = np.linspace(xmin + px / 2, xmax - px / 2, final_width) + y_coords = np.linspace(ymax - py / 2, ymin + py / 2, final_height) + + # Build attrs. Start from like.attrs if given so chained spatial + # pipelines (slope(rasterize(gdf, like=elevation))) see the same res, + # crs, transform, etc. as the template. Strip any inherited nodata + # keys first -- the template's old fill value almost certainly + # disagrees with this call's fill, and the geotiff writer's + # _resolve_nodata_attr would otherwise tag pixels with a stale + # sentinel. Then re-emit a consistent triplet keyed off the actual + # fill: nodata (xrspatial's primary key), _FillValue (CF), and + # nodatavals (rioxarray's per-band tuple). + out_attrs = like_attrs if like_attrs is not None else {} + for k in ('nodata', '_FillValue', 'nodatavals'): + out_attrs.pop(k, None) + try: + fill_as_float = float(fill) + fill_is_nan = np.isnan(fill_as_float) + except (TypeError, ValueError): + fill_is_nan = False + if not fill_is_nan: + out_attrs['nodata'] = fill + out_attrs['_FillValue'] = fill + out_attrs['nodatavals'] = (fill,) + + # Combine y/x dim coords with any non-dim coords carried from the + # template (e.g. rioxarray's spatial_ref CRS coord). + out_coords = {'y': y_coords, 'x': x_coords} + for k, v in like_extra_coords.items(): + if k not in out_coords: + out_coords[k] = v return xr.DataArray( out, name=name, dims=['y', 'x'], - coords={'y': y_coords, 'x': x_coords}, + coords=out_coords, + attrs=out_attrs, ) diff --git a/xrspatial/tests/test_rasterize.py b/xrspatial/tests/test_rasterize.py index c27437fe1..df89328a8 100644 --- a/xrspatial/tests/test_rasterize.py +++ b/xrspatial/tests/test_rasterize.py @@ -1592,3 +1592,222 @@ def test_rasterize_public_path_still_works_after_int64_change(self): ) # The polygon covers a 6x6 inner block; just check it burned in. assert np.any(result.values == 3.0) + + +# --------------------------------------------------------------------------- +# Metadata propagation (attrs from `like`, coord reuse, _FillValue) +# --------------------------------------------------------------------------- + +def _make_like(width=10, height=10, attrs=None, dtype=np.float64): + """Build a 2D template DataArray with georeferenced coords and attrs.""" + x = np.linspace(0.5, width - 0.5, width) + y = np.linspace(height - 0.5, 0.5, height) + data = np.zeros((height, width), dtype=dtype) + return xr.DataArray( + data, dims=['y', 'x'], + coords={'y': y, 'x': x}, + attrs=dict(attrs or {}), + ) + + +class TestMetadataPropagation: + """Verify `like.attrs` propagates, `like.coords` are reused, and the + fill value lands in `_FillValue` / `nodatavals`. + """ + + def test_like_propagates_attrs(self): + attrs = { + 'crs': 'EPSG:32610', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + 'res': (1.0, 1.0), + } + like = _make_like(attrs=attrs) + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, + ) + assert result.attrs.get('crs') == 'EPSG:32610' + assert result.attrs.get('transform') == \ + (1.0, 0.0, 0.0, 0.0, -1.0, 10.0) + assert result.attrs.get('res') == (1.0, 1.0) + + def test_like_preserves_coords_bit_identical(self): + like = _make_like() + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, + ) + # Output reuses like.coords exactly so xr.align keeps working. + np.testing.assert_array_equal( + result.coords['x'].values, like.coords['x'].values) + np.testing.assert_array_equal( + result.coords['y'].values, like.coords['y'].values) + + def test_like_attrs_isolated_from_template(self): + """Mutating output attrs must not mutate the template's attrs.""" + attrs = {'crs': 'EPSG:32610'} + like = _make_like(attrs=attrs) + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, + ) + result.attrs['crs'] = 'EPSG:4326' + assert like.attrs['crs'] == 'EPSG:32610' + + def test_fill_value_recorded_when_not_nan(self): + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], + width=10, height=10, bounds=(0, 0, 10, 10), + fill=-9999, dtype=np.int32, + ) + # Emit the full triplet: xrspatial's primary (nodata), the CF + # alias (_FillValue), and rioxarray's per-band tuple (nodatavals). + # All three must agree with fill so downstream consumers all + # resolve to the same sentinel regardless of which key they read. + assert result.attrs.get('nodata') == -9999 + assert result.attrs.get('_FillValue') == -9999 + assert result.attrs.get('nodatavals') == (-9999,) + # Sentinel round-trips cleanly when the array is cast to its + # declared dtype -- pins #1973-style dtype mismatch regressions. + assert np.array(-9999, dtype=result.dtype) == \ + np.array(result.attrs['_FillValue'], dtype=result.dtype) + + def test_fill_value_omitted_for_nan(self): + """Default fill=NaN should not pollute attrs with nodata keys.""" + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], + width=10, height=10, bounds=(0, 0, 10, 10), + ) + assert 'nodata' not in result.attrs + assert '_FillValue' not in result.attrs + assert 'nodatavals' not in result.attrs + + def test_no_like_no_attrs_pollution(self): + """Without `like` and with NaN fill, attrs must stay empty.""" + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], + width=10, height=10, bounds=(0, 0, 10, 10), + ) + assert result.attrs == {} + + @skip_no_dask + def test_like_attrs_propagated_dask(self): + like = _make_like(attrs={'crs': 'EPSG:32610'}) + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, chunks=5, + ) + # The dask backend routes through the same final xr.DataArray + # constructor, so attrs / coords / _FillValue behave identically. + assert result.attrs.get('crs') == 'EPSG:32610' + assert result.attrs.get('_FillValue') == 0 + np.testing.assert_array_equal( + result.coords['x'].values, like.coords['x'].values) + + @skip_no_cuda + def test_like_attrs_propagated_cupy(self): + like = _make_like(attrs={'crs': 'EPSG:32610'}) + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, use_cuda=True, + ) + assert result.attrs.get('crs') == 'EPSG:32610' + assert result.attrs.get('_FillValue') == 0 + + @skip_no_cuda + @skip_no_dask + def test_like_attrs_propagated_dask_cupy(self): + like = _make_like(attrs={'crs': 'EPSG:32610'}) + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], + like=like, fill=0, use_cuda=True, chunks=5, + ) + assert result.attrs.get('crs') == 'EPSG:32610' + assert result.attrs.get('_FillValue') == 0 + + def test_like_stale_nodata_keys_replaced_with_fill(self): + """Inherited nodata keys from like must not outlive the new fill. + + The geotiff writer's _resolve_nodata_attr checks ``nodata`` → + ``nodatavals`` → ``_FillValue``. If a previous round-trip left + any of them on the template, they have to be replaced (or + cleared) so the writer doesn't tag pixels with a stale sentinel + that disagrees with the actual fill in the new array. + """ + like = _make_like(attrs={ + 'nodata': -9999, + '_FillValue': -9999, + 'nodatavals': (-9999,), + 'crs': 'EPSG:32610', + }) + # Case 1: explicit fill replaces all three keys. + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, + ) + assert result.attrs.get('nodata') == 0 + assert result.attrs.get('_FillValue') == 0 + assert result.attrs.get('nodatavals') == (0,) + assert result.attrs.get('crs') == 'EPSG:32610' + # Case 2: NaN fill strips inherited nodata keys outright -- the + # actual unwritten pixels are NaN, not -9999, so advertising + # -9999 would lie to downstream tools. + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, + ) + assert 'nodata' not in result.attrs + assert '_FillValue' not in result.attrs + assert 'nodatavals' not in result.attrs + assert result.attrs.get('crs') == 'EPSG:32610' + + def test_numpy_float_nan_fill_treated_as_nan(self): + """Numpy scalar NaN must be detected as NaN, not emitted as fill. + + ``isinstance(np.float32(np.nan), float)`` is False, so a naive + ``isinstance(fill, float) and np.isnan(fill)`` check would let a + numpy-typed NaN slip through and land in ``_FillValue``. + """ + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], + width=10, height=10, bounds=(0, 0, 10, 10), + fill=np.float32(np.nan), + ) + assert 'nodata' not in result.attrs + assert '_FillValue' not in result.attrs + assert 'nodatavals' not in result.attrs + + def test_like_non_dim_coords_propagated(self): + """Non-dim coords on like (e.g. rioxarray's spatial_ref) carry over.""" + like = _make_like() + # rioxarray attaches the CRS as a scalar non-dim coord + # called ``spatial_ref``. rasterize must propagate it. + like = like.assign_coords(spatial_ref=0) + like['spatial_ref'].attrs['crs_wkt'] = ( + 'GEOGCS["WGS 84",DATUM["WGS_1984"]]' + ) + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], like=like, fill=0, + ) + assert 'spatial_ref' in result.coords + assert (result.coords['spatial_ref'].attrs.get('crs_wkt') + == 'GEOGCS["WGS 84",DATUM["WGS_1984"]]') + + def test_geotiff_round_trip_preserves_fill(self, tmp_path): + """rasterize → to_geotiff → read → nodata matches. + + The user-visible motivation for #2018: a rasterized output + written to GeoTIFF must round-trip the fill sentinel so masks + and downstream readers identify the right nodata pixels. + """ + pytest.importorskip('tifffile') + from xrspatial.geotiff import to_geotiff, open_geotiff + + result = rasterize( + [(box(2, 2, 8, 8), 1.0)], + width=10, height=10, bounds=(0, 0, 10, 10), + fill=-9999.0, dtype=np.float32, + ) + path = str(tmp_path / 'rasterize_fill_round_trip.tif') + to_geotiff(result, path) + read_back = open_geotiff(path) + # GeoTIFF stores nodata as a scalar; either ``nodata`` or + # ``_FillValue`` should land on the read-back DataArray and + # match the fill we supplied. + nodata = (read_back.attrs.get('nodata') + or read_back.attrs.get('_FillValue')) + assert nodata is not None + assert float(nodata) == -9999.0