Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .claude/sweep-metadata-state.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
module,last_inspected,issue,severity_max,categories_found,notes
geotiff,2026-05-15,1909,HIGH,4;5,"Re-audit 2026-05-15 (agent-a55b69cec1ef2a092 worktree, branch deep-sweep-metadata-geotiff-2026-05-15). 4-backend (numpy/cupy/dask+numpy/dask+cupy) parity reverified after the #1813 modular refactor: full reads, windowed reads, multi-band, band=N selection, no-georef integer pixel coords, crs/crs_wkt/transform/nodata/x_resolution/y_resolution/resolution_unit/image_description/gdal_metadata all agree across backends. DataArray .name and dims agree (y, x for 2D; y, x, band for 3D). NEW HIGH finding #1909: GDS chunked GPU path (_read_geotiff_gpu_chunked_gds) declared the dask graph dtype as float64 when source had an in-range integer nodata sentinel, matching the CPU dask path's #1597 contract, but the per-chunk _chunk_task did not cast its returned cupy array to declared_dtype -- chunks with no sentinel hit returned the raw uint16/int16 source dtype, producing a silent declared/actual dtype mismatch. Fix mirrors the #1597 + #1624 CPU dask pattern: compute declared_dtype before defining _chunk_task, cast inside the task only when arr.dtype != declared_dtype to skip the no-op astype(copy=True). 6 regression tests added in test_chunked_gpu_declared_dtype_1909.py covering declared vs computed parity, CPU/GPU dask declared-dtype agreement, eager paths preserve source dtype, no-nodata round-trip, explicit dtype= kwarg, and sentinel-hit float64 promotion. Pre-existing test failures in test_predictor2_big_endian_gpu_1517.py and test_size_param_validation_gpu_vrt_1776.py exist on main (read_to_array AttributeError after #1813 refactor, tile_size=4 rejected by stricter _validate_tile_size_arg) and are unrelated to this audit."
rasterize,2026-05-17,2018,HIGH,1;2;4,"rasterize() drops like.attrs, rebuilds like.coords via linspace (not bit-identical), and never emits _FillValue/nodatavals even when fill is non-NaN. Cat 1 HIGH: chained pipelines like slope(rasterize(gdf, like=elevation)) silently lose crs/res/transform. Cat 2 MEDIUM: linspace round-trip from re-derived bounds breaks xr.align with like. Cat 4 MEDIUM: rasterize(..., fill=-9999, dtype=int32) emits no _FillValue. All 4 backends share the same final return so the fix is one place. Fixed in deep-sweep-metadata-rasterize-2026-05-17-01 (worktree agent-ab7a9aee97c1e4cdf): _extract_grid_from_like now returns coords/attrs; rasterize() reuses like.coords directly when grid matches, copies like.attrs, and emits _FillValue + nodatavals when fill is not NaN. 9 new tests in TestMetadataPropagation cover attrs propagation, bit-identical coord reuse, fill-value emission, isolation from template attrs, and parity across numpy/cupy/dask+numpy/dask+cupy backends. Full test suite (193 passing) clean."
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.
85 changes: 76 additions & 9 deletions xrspatial/rasterize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1928,7 +1928,13 @@ def _parse_input(geometries, column=None, columns=None):


def _extract_grid_from_like(like):
"""Extract width, height, bounds, dtype from a template DataArray."""
"""Extract width, height, bounds, dtype from a template DataArray.

Also returns the input coords and attrs so the caller can propagate
them onto the output raster. Reusing ``like.coords`` directly (rather
than rebuilding with ``linspace``) keeps the output bit-identical to
the template and so preserves ``xr.align`` compatibility.
"""
if not isinstance(like, xr.DataArray):
raise TypeError("'like' must be an xr.DataArray")
if like.ndim != 2 or 'y' not in like.dims or 'x' not in like.dims:
Expand Down Expand Up @@ -1956,7 +1962,17 @@ def _extract_grid_from_like(like):
ymin = float(np.min(y)) - py / 2
ymax = float(np.max(y)) + py / 2

return width, height, (xmin, ymin, xmax, ymax), dt
# Carry through any non-dim coords (e.g. rioxarray's ``spatial_ref``
# CRS coord). The y/x dim coords are returned separately because the
# caller decides whether to reuse them (bit-identical grid) or build
# fresh ones (resized grid).
extra_coords = {
k: v for k, v in like.coords.items() if k not in ('x', 'y')
}

return (width, height, (xmin, ymin, xmax, ymax), dt,
like.coords['x'], like.coords['y'],
extra_coords, dict(like.attrs))


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -2119,8 +2135,13 @@ def rasterize(

# Extract defaults from template raster
like_width = like_height = like_bounds = like_dtype = None
like_x_coord = like_y_coord = None
like_extra_coords = {}
like_attrs = None
bounds_explicit = bounds is not None
if like is not None:
like_width, like_height, like_bounds, like_dtype = \
(like_width, like_height, like_bounds, like_dtype,
like_x_coord, like_y_coord, like_extra_coords, like_attrs) = \
_extract_grid_from_like(like)

# Parse input geometries
Expand Down Expand Up @@ -2218,15 +2239,61 @@ def rasterize(
final_height, final_width, fill, final_dtype,
all_touched, merge_fn)

# Build coordinates
px = (xmax - xmin) / final_width
py = (ymax - ymin) / final_height
x_coords = np.linspace(xmin + px / 2, xmax - px / 2, final_width)
y_coords = np.linspace(ymax - py / 2, ymin + py / 2, final_height)
# Build coordinates. When the caller didn't override the grid (no
# explicit width/height/bounds/resolution that resizes the output),
# reuse like.coords directly so the output is bit-identical to the
# template and xr.align keeps working. Float-equal bound comparison
# would be fragile, so key the reuse on size + "bounds weren't
# overridden" instead.
reuse_like_coords = (
like_x_coord is not None
and like_x_coord.sizes['x'] == final_width
and like_y_coord.sizes['y'] == final_height
and not bounds_explicit
and resolution is None
)
if reuse_like_coords:
x_coords = like_x_coord
y_coords = like_y_coord
else:
px = (xmax - xmin) / final_width
py = (ymax - ymin) / final_height
x_coords = np.linspace(xmin + px / 2, xmax - px / 2, final_width)
y_coords = np.linspace(ymax - py / 2, ymin + py / 2, final_height)

# Build attrs. Start from like.attrs if given so chained spatial
# pipelines (slope(rasterize(gdf, like=elevation))) see the same res,
# crs, transform, etc. as the template. Strip any inherited nodata
# keys first -- the template's old fill value almost certainly
# disagrees with this call's fill, and the geotiff writer's
# _resolve_nodata_attr would otherwise tag pixels with a stale
# sentinel. Then re-emit a consistent triplet keyed off the actual
# fill: nodata (xrspatial's primary key), _FillValue (CF), and
# nodatavals (rioxarray's per-band tuple).
out_attrs = like_attrs if like_attrs is not None else {}
for k in ('nodata', '_FillValue', 'nodatavals'):
out_attrs.pop(k, None)
try:
fill_as_float = float(fill)
fill_is_nan = np.isnan(fill_as_float)
except (TypeError, ValueError):
fill_is_nan = False
if not fill_is_nan:
out_attrs['nodata'] = fill
out_attrs['_FillValue'] = fill
out_attrs['nodatavals'] = (fill,)

# Combine y/x dim coords with any non-dim coords carried from the
# template (e.g. rioxarray's spatial_ref CRS coord).
out_coords = {'y': y_coords, 'x': x_coords}
for k, v in like_extra_coords.items():
if k not in out_coords:
out_coords[k] = v

return xr.DataArray(
out,
name=name,
dims=['y', 'x'],
coords={'y': y_coords, 'x': x_coords},
coords=out_coords,
attrs=out_attrs,
)
219 changes: 219 additions & 0 deletions xrspatial/tests/test_rasterize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1592,3 +1592,222 @@ def test_rasterize_public_path_still_works_after_int64_change(self):
)
# The polygon covers a 6x6 inner block; just check it burned in.
assert np.any(result.values == 3.0)


# ---------------------------------------------------------------------------
# Metadata propagation (attrs from `like`, coord reuse, _FillValue)
# ---------------------------------------------------------------------------

def _make_like(width=10, height=10, attrs=None, dtype=np.float64):
"""Build a 2D template DataArray with georeferenced coords and attrs."""
x = np.linspace(0.5, width - 0.5, width)
y = np.linspace(height - 0.5, 0.5, height)
data = np.zeros((height, width), dtype=dtype)
return xr.DataArray(
data, dims=['y', 'x'],
coords={'y': y, 'x': x},
attrs=dict(attrs or {}),
)


class TestMetadataPropagation:
"""Verify `like.attrs` propagates, `like.coords` are reused, and the
fill value lands in `_FillValue` / `nodatavals`.
"""

def test_like_propagates_attrs(self):
attrs = {
'crs': 'EPSG:32610',
'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0),
'res': (1.0, 1.0),
}
like = _make_like(attrs=attrs)
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0,
)
assert result.attrs.get('crs') == 'EPSG:32610'
assert result.attrs.get('transform') == \
(1.0, 0.0, 0.0, 0.0, -1.0, 10.0)
assert result.attrs.get('res') == (1.0, 1.0)

def test_like_preserves_coords_bit_identical(self):
like = _make_like()
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0,
)
# Output reuses like.coords exactly so xr.align keeps working.
np.testing.assert_array_equal(
result.coords['x'].values, like.coords['x'].values)
np.testing.assert_array_equal(
result.coords['y'].values, like.coords['y'].values)

def test_like_attrs_isolated_from_template(self):
"""Mutating output attrs must not mutate the template's attrs."""
attrs = {'crs': 'EPSG:32610'}
like = _make_like(attrs=attrs)
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0,
)
result.attrs['crs'] = 'EPSG:4326'
assert like.attrs['crs'] == 'EPSG:32610'

def test_fill_value_recorded_when_not_nan(self):
result = rasterize(
[(box(2, 2, 8, 8), 1.0)],
width=10, height=10, bounds=(0, 0, 10, 10),
fill=-9999, dtype=np.int32,
)
# Emit the full triplet: xrspatial's primary (nodata), the CF
# alias (_FillValue), and rioxarray's per-band tuple (nodatavals).
# All three must agree with fill so downstream consumers all
# resolve to the same sentinel regardless of which key they read.
assert result.attrs.get('nodata') == -9999
assert result.attrs.get('_FillValue') == -9999
assert result.attrs.get('nodatavals') == (-9999,)
# Sentinel round-trips cleanly when the array is cast to its
# declared dtype -- pins #1973-style dtype mismatch regressions.
assert np.array(-9999, dtype=result.dtype) == \
np.array(result.attrs['_FillValue'], dtype=result.dtype)

def test_fill_value_omitted_for_nan(self):
"""Default fill=NaN should not pollute attrs with nodata keys."""
result = rasterize(
[(box(2, 2, 8, 8), 1.0)],
width=10, height=10, bounds=(0, 0, 10, 10),
)
assert 'nodata' not in result.attrs
assert '_FillValue' not in result.attrs
assert 'nodatavals' not in result.attrs

def test_no_like_no_attrs_pollution(self):
"""Without `like` and with NaN fill, attrs must stay empty."""
result = rasterize(
[(box(2, 2, 8, 8), 1.0)],
width=10, height=10, bounds=(0, 0, 10, 10),
)
assert result.attrs == {}

@skip_no_dask
def test_like_attrs_propagated_dask(self):
like = _make_like(attrs={'crs': 'EPSG:32610'})
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0, chunks=5,
)
# The dask backend routes through the same final xr.DataArray
# constructor, so attrs / coords / _FillValue behave identically.
assert result.attrs.get('crs') == 'EPSG:32610'
assert result.attrs.get('_FillValue') == 0
np.testing.assert_array_equal(
result.coords['x'].values, like.coords['x'].values)

@skip_no_cuda
def test_like_attrs_propagated_cupy(self):
like = _make_like(attrs={'crs': 'EPSG:32610'})
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0, use_cuda=True,
)
assert result.attrs.get('crs') == 'EPSG:32610'
assert result.attrs.get('_FillValue') == 0

@skip_no_cuda
@skip_no_dask
def test_like_attrs_propagated_dask_cupy(self):
like = _make_like(attrs={'crs': 'EPSG:32610'})
result = rasterize(
[(box(2, 2, 8, 8), 1.0)],
like=like, fill=0, use_cuda=True, chunks=5,
)
assert result.attrs.get('crs') == 'EPSG:32610'
assert result.attrs.get('_FillValue') == 0

def test_like_stale_nodata_keys_replaced_with_fill(self):
"""Inherited nodata keys from like must not outlive the new fill.

The geotiff writer's _resolve_nodata_attr checks ``nodata`` →
``nodatavals`` → ``_FillValue``. If a previous round-trip left
any of them on the template, they have to be replaced (or
cleared) so the writer doesn't tag pixels with a stale sentinel
that disagrees with the actual fill in the new array.
"""
like = _make_like(attrs={
'nodata': -9999,
'_FillValue': -9999,
'nodatavals': (-9999,),
'crs': 'EPSG:32610',
})
# Case 1: explicit fill replaces all three keys.
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0,
)
assert result.attrs.get('nodata') == 0
assert result.attrs.get('_FillValue') == 0
assert result.attrs.get('nodatavals') == (0,)
assert result.attrs.get('crs') == 'EPSG:32610'
# Case 2: NaN fill strips inherited nodata keys outright -- the
# actual unwritten pixels are NaN, not -9999, so advertising
# -9999 would lie to downstream tools.
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like,
)
assert 'nodata' not in result.attrs
assert '_FillValue' not in result.attrs
assert 'nodatavals' not in result.attrs
assert result.attrs.get('crs') == 'EPSG:32610'

def test_numpy_float_nan_fill_treated_as_nan(self):
"""Numpy scalar NaN must be detected as NaN, not emitted as fill.

``isinstance(np.float32(np.nan), float)`` is False, so a naive
``isinstance(fill, float) and np.isnan(fill)`` check would let a
numpy-typed NaN slip through and land in ``_FillValue``.
"""
result = rasterize(
[(box(2, 2, 8, 8), 1.0)],
width=10, height=10, bounds=(0, 0, 10, 10),
fill=np.float32(np.nan),
)
assert 'nodata' not in result.attrs
assert '_FillValue' not in result.attrs
assert 'nodatavals' not in result.attrs

def test_like_non_dim_coords_propagated(self):
"""Non-dim coords on like (e.g. rioxarray's spatial_ref) carry over."""
like = _make_like()
# rioxarray attaches the CRS as a scalar non-dim coord
# called ``spatial_ref``. rasterize must propagate it.
like = like.assign_coords(spatial_ref=0)
like['spatial_ref'].attrs['crs_wkt'] = (
'GEOGCS["WGS 84",DATUM["WGS_1984"]]'
)
result = rasterize(
[(box(2, 2, 8, 8), 1.0)], like=like, fill=0,
)
assert 'spatial_ref' in result.coords
assert (result.coords['spatial_ref'].attrs.get('crs_wkt')
== 'GEOGCS["WGS 84",DATUM["WGS_1984"]]')

def test_geotiff_round_trip_preserves_fill(self, tmp_path):
"""rasterize → to_geotiff → read → nodata matches.

The user-visible motivation for #2018: a rasterized output
written to GeoTIFF must round-trip the fill sentinel so masks
and downstream readers identify the right nodata pixels.
"""
pytest.importorskip('tifffile')
from xrspatial.geotiff import to_geotiff, open_geotiff

result = rasterize(
[(box(2, 2, 8, 8), 1.0)],
width=10, height=10, bounds=(0, 0, 10, 10),
fill=-9999.0, dtype=np.float32,
)
path = str(tmp_path / 'rasterize_fill_round_trip.tif')
to_geotiff(result, path)
read_back = open_geotiff(path)
# GeoTIFF stores nodata as a scalar; either ``nodata`` or
# ``_FillValue`` should land on the read-back DataArray and
# match the fill we supplied.
nodata = (read_back.attrs.get('nodata')
or read_back.attrs.get('_FillValue'))
assert nodata is not None
assert float(nodata) == -9999.0
Loading