diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py index 2c06d71b4..46686eeb6 100644 --- a/xrspatial/rasterize.py +++ b/xrspatial/rasterize.py @@ -1222,6 +1222,25 @@ def _should_write_last_gpu(is_first, new_idx, cur_idx): _gpu_kernel_cache = {} +# Must stay in lock-step with the cuda.local.array(2048, ...) allocations in +# _scanline_fill_gpu. Raise both together if this is ever tuned. +_GPU_MAX_ISECT = 2048 + + +def _check_gpu_edge_cap(row_ptr): + """Raise if any raster row needs more active edges than the GPU kernel + can hold in its fixed-size local array. Without this guard the CUDA + scanline kernel would silently truncate and emit wrong pixels. + """ + max_edges_per_row = int(np.diff(row_ptr).max()) + if max_edges_per_row > _GPU_MAX_ISECT: + raise ValueError( + f"rasterize CUDA kernel: {max_edges_per_row} active edges in a " + f"single row exceeds the limit of {_GPU_MAX_ISECT}. The kernel " + f"would silently drop excess edges and produce wrong output. " + f"Use the numpy/dask+numpy backend, or reduce polygon density." + ) + def _ensure_gpu_kernels(merge_fn, should_write): """Compile CUDA kernels for the given merge + predicate pair and cache. @@ -1480,17 +1499,10 @@ def _run_cupy(geometries, props_array, bounds, height, width, fill, dtype, row_ptr, col_idx = _build_row_csr_numba( edge_y_min, edge_y_max, height) - # Check for rows exceeding the GPU kernel's local array limit - _GPU_MAX_ISECT = 2048 - max_edges_per_row = int(np.diff(row_ptr).max()) - if max_edges_per_row > _GPU_MAX_ISECT: - warnings.warn( - f"rasterize CUDA kernel: {max_edges_per_row} active edges " - f"in a single row exceeds the limit of {_GPU_MAX_ISECT}. " - f"Excess edges will be silently dropped, causing incorrect " - f"results. Reduce polygon density or use the numpy backend.", - stacklevel=3, - ) + # The CUDA scanline kernel allocates a fixed-size local array; rows + # exceeding _GPU_MAX_ISECT would have edges silently dropped and + # produce wrong output. Fail fast instead. + _check_gpu_edge_cap(row_ptr) d_y_min = cupy.asarray(edge_y_min) d_x_at_ymin = cupy.asarray(edge_x_at_ymin) @@ -1924,6 +1936,9 @@ def _rasterize_tile_cupy(poly_wkb, poly_props_2d, poly_global_2d, tile_bounds, edge_geom_id = edge_arrays[1:] row_ptr, col_idx = _build_row_csr_numba( edge_y_min, edge_y_max, tile_h) + # Same fail-fast as eager CuPy: per-tile truncation would be + # silent without this check. + _check_gpu_edge_cap(row_ptr) tpb = 256 blocks = (tile_h + tpb - 1) // tpb kernels['scanline_fill'][blocks, tpb]( diff --git a/xrspatial/tests/test_rasterize.py b/xrspatial/tests/test_rasterize.py index 3f6c27819..5ad1814c8 100644 --- a/xrspatial/tests/test_rasterize.py +++ b/xrspatial/tests/test_rasterize.py @@ -364,6 +364,32 @@ def test_invalid_chunks(self, bad): width=10, height=10, bounds=(0, 0, 1, 1), chunks=bad) + def test_gpu_edge_cap_check_raises(self): + # Issue #2067: the CUDA scanline kernel allocates a fixed 2048-entry + # local array. Without a guard, exceeding it silently truncates + # active edges and produces wrong output. The validator runs on + # the host, so this is testable without a GPU. + from xrspatial.rasterize import _check_gpu_edge_cap, _GPU_MAX_ISECT + # row_ptr where row 0 has _GPU_MAX_ISECT + 1 edges + row_ptr = np.array([0, _GPU_MAX_ISECT + 1, _GPU_MAX_ISECT + 1], + dtype=np.int64) + with pytest.raises(ValueError, match="active edges"): + _check_gpu_edge_cap(row_ptr) + + def test_gpu_edge_cap_check_passes_at_limit(self): + from xrspatial.rasterize import _check_gpu_edge_cap, _GPU_MAX_ISECT + row_ptr = np.array([0, _GPU_MAX_ISECT, _GPU_MAX_ISECT], + dtype=np.int64) + # exactly at the cap is permitted + _check_gpu_edge_cap(row_ptr) + + def test_gpu_edge_cap_check_passes_below_limit(self): + from xrspatial.rasterize import _check_gpu_edge_cap, _GPU_MAX_ISECT + row_ptr = np.array([0, _GPU_MAX_ISECT - 1, _GPU_MAX_ISECT - 1], + dtype=np.int64) + # just under the cap, the common case + _check_gpu_edge_cap(row_ptr) + def test_empty_geometry_does_not_poison_inferred_bounds(self): # Issue #2065: an empty geometry returns nan from .bounds; the # caller used .min()/.max() unfiltered, so a single empty geom