From 1353b493fede57d5e363a67f96ff0b59338bb91a Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 13:55:40 -0700 Subject: [PATCH 1/2] rasterize: fail fast on GPU scanline edge cap (#2067) The CUDA scanline kernel allocates a fixed-size cuda.local.array(2048) for active edges per row. Rows exceeding that cap silently truncated and produced wrong output. The eager CuPy path issued a RuntimeWarning but still returned the corrupted raster. The Dask+CuPy tile path had no check at all, so per-tile truncation was silent. Both paths now share _check_gpu_edge_cap(row_ptr) which raises a ValueError pointing the caller at the numpy backend. Closes #2067 --- xrspatial/rasterize.py | 37 ++++++++++++++++++++++--------- xrspatial/tests/test_rasterize.py | 19 ++++++++++++++++ 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py index 25af4cd63..a1a715b34 100644 --- a/xrspatial/rasterize.py +++ b/xrspatial/rasterize.py @@ -1222,6 +1222,25 @@ def _should_write_last_gpu(is_first, new_idx, cur_idx): _gpu_kernel_cache = {} +# Must stay in lock-step with the cuda.local.array(2048, ...) allocations in +# _scanline_fill_gpu. Raise both together if this is ever tuned. +_GPU_MAX_ISECT = 2048 + + +def _check_gpu_edge_cap(row_ptr): + """Raise if any raster row needs more active edges than the GPU kernel + can hold in its fixed-size local array. Without this guard the CUDA + scanline kernel would silently truncate and emit wrong pixels. + """ + max_edges_per_row = int(np.diff(row_ptr).max()) + if max_edges_per_row > _GPU_MAX_ISECT: + raise ValueError( + f"rasterize CUDA kernel: {max_edges_per_row} active edges in a " + f"single row exceeds the limit of {_GPU_MAX_ISECT}. The kernel " + f"would silently drop excess edges and produce wrong output. " + f"Use the numpy/dask+numpy backend, or reduce polygon density." + ) + def _ensure_gpu_kernels(merge_fn, should_write): """Compile CUDA kernels for the given merge + predicate pair and cache. @@ -1480,17 +1499,10 @@ def _run_cupy(geometries, props_array, bounds, height, width, fill, dtype, row_ptr, col_idx = _build_row_csr_numba( edge_y_min, edge_y_max, height) - # Check for rows exceeding the GPU kernel's local array limit - _GPU_MAX_ISECT = 2048 - max_edges_per_row = int(np.diff(row_ptr).max()) - if max_edges_per_row > _GPU_MAX_ISECT: - warnings.warn( - f"rasterize CUDA kernel: {max_edges_per_row} active edges " - f"in a single row exceeds the limit of {_GPU_MAX_ISECT}. " - f"Excess edges will be silently dropped, causing incorrect " - f"results. Reduce polygon density or use the numpy backend.", - stacklevel=3, - ) + # The CUDA scanline kernel allocates a fixed-size local array; rows + # exceeding _GPU_MAX_ISECT would have edges silently dropped and + # produce wrong output. Fail fast instead. + _check_gpu_edge_cap(row_ptr) d_y_min = cupy.asarray(edge_y_min) d_x_at_ymin = cupy.asarray(edge_x_at_ymin) @@ -1924,6 +1936,9 @@ def _rasterize_tile_cupy(poly_wkb, poly_props_2d, poly_global_2d, tile_bounds, edge_geom_id = edge_arrays[1:] row_ptr, col_idx = _build_row_csr_numba( edge_y_min, edge_y_max, tile_h) + # Same fail-fast as eager CuPy: per-tile truncation would be + # silent without this check. + _check_gpu_edge_cap(row_ptr) tpb = 256 blocks = (tile_h + tpb - 1) // tpb kernels['scanline_fill'][blocks, tpb]( diff --git a/xrspatial/tests/test_rasterize.py b/xrspatial/tests/test_rasterize.py index 01ad85f0f..3c8e7af65 100644 --- a/xrspatial/tests/test_rasterize.py +++ b/xrspatial/tests/test_rasterize.py @@ -364,6 +364,25 @@ def test_invalid_chunks(self, bad): width=10, height=10, bounds=(0, 0, 1, 1), chunks=bad) + def test_gpu_edge_cap_check_raises(self): + # Issue #2067: the CUDA scanline kernel allocates a fixed 2048-entry + # local array. Without a guard, exceeding it silently truncates + # active edges and produces wrong output. The validator runs on + # the host, so this is testable without a GPU. + from xrspatial.rasterize import _check_gpu_edge_cap, _GPU_MAX_ISECT + # row_ptr where row 0 has _GPU_MAX_ISECT + 1 edges + row_ptr = np.array([0, _GPU_MAX_ISECT + 1, _GPU_MAX_ISECT + 1], + dtype=np.int64) + with pytest.raises(ValueError, match="active edges"): + _check_gpu_edge_cap(row_ptr) + + def test_gpu_edge_cap_check_passes_at_limit(self): + from xrspatial.rasterize import _check_gpu_edge_cap, _GPU_MAX_ISECT + row_ptr = np.array([0, _GPU_MAX_ISECT, _GPU_MAX_ISECT], + dtype=np.int64) + # exactly at the cap is permitted + _check_gpu_edge_cap(row_ptr) + # --------------------------------------------------------------------------- # all_touched mode From 450b0c42c3730f1c6af916424affdddc472e58f8 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 14:13:56 -0700 Subject: [PATCH 2/2] rasterize: add below-cap GPU edge check test (#2067) --- xrspatial/tests/test_rasterize.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xrspatial/tests/test_rasterize.py b/xrspatial/tests/test_rasterize.py index 3c8e7af65..35e6fd86c 100644 --- a/xrspatial/tests/test_rasterize.py +++ b/xrspatial/tests/test_rasterize.py @@ -383,6 +383,13 @@ def test_gpu_edge_cap_check_passes_at_limit(self): # exactly at the cap is permitted _check_gpu_edge_cap(row_ptr) + def test_gpu_edge_cap_check_passes_below_limit(self): + from xrspatial.rasterize import _check_gpu_edge_cap, _GPU_MAX_ISECT + row_ptr = np.array([0, _GPU_MAX_ISECT - 1, _GPU_MAX_ISECT - 1], + dtype=np.int64) + # just under the cap, the common case + _check_gpu_edge_cap(row_ptr) + # --------------------------------------------------------------------------- # all_touched mode