From 8f849af4a6783f38672ff62c6976f1ae630d73ae Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 20 May 2026 08:34:56 -0700
Subject: [PATCH 1/3] Use CUDA atomics in GPU rasterize kernels (#2167)

The GPU point and line burn kernels did non-atomic read-modify-write
on per-pixel state, so overlapping geometries produced nondeterministic
results that disagreed with the numpy backend.

Switch the per-pixel write strategy to CUDA atomics for the six
built-in aggregators:

- sum, count: cuda.atomic.add on the output buffer.
- min, max: cuda.atomic.min / cuda.atomic.max on the output buffer.
- first, last: two-pass. Pass 1 resolves the per-pixel winning
  input index with cuda.atomic.min / cuda.atomic.max on an index
  buffer; pass 2 stamps the winner's value into the output.

The per-merge buffers are initialised up front (zero / +inf / -inf /
fill) and a host-side post-pass blends fill back into untouched
pixels.

User-supplied merge callables keep the previous non-atomic closure
path, since the public merge_fn signature has no atomic equivalent.

The numpy and dask+numpy backends are unchanged.
---
 xrspatial/rasterize.py | 561 +++++++++++++++++++++++++++++++++--------
 1 file changed, 451 insertions(+), 110 deletions(-)

diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py
index d7e3f68c6..debe564be 100644
--- a/xrspatial/rasterize.py
+++ b/xrspatial/rasterize.py
@@ -1022,28 +1022,103 @@ def _check_gpu_edge_cap(row_ptr):
         )
 
 
-def _ensure_gpu_kernels(merge_fn, should_write):
+#: Built-in merges whose GPU path uses CUDA atomics so overlapping
+#: writes are deterministic.  User-supplied merge callables fall back to
+#: the non-atomic closure path (the public ``merge_fn`` signature has no
+#: slot for atomicity, and we cannot synthesise atomics for an arbitrary
+#: Python function).
+_GPU_ATOMIC_MERGES = ('sum', 'count', 'min', 'max', 'first', 'last')
+
+
+def _ensure_gpu_kernels(merge_fn, should_write, merge_name=None):
     """Compile CUDA kernels for the given merge + predicate pair and cache.
 
     Each unique pair produces a separate set of kernels because CUDA kernels
     cannot accept function arguments; both functions are captured by closure
     at compile time.
+
+    When ``merge_name`` is one of the built-in merges (``sum``, ``count``,
+    ``min``, ``max``, ``first``, ``last``), the returned kernel set uses
+    CUDA atomics on ``out`` / ``order`` so that overlapping point writes,
+    crossing line segments, and shared polygon-boundary pixels produce
+    deterministic results matching the CPU backend.  Otherwise (callable
+    or unknown merge) the kernels keep the legacy non-atomic
+    read-modify-write path, which is racy on overlap but preserved for
+    user callables whose signature has no atomic equivalent.
     """
-    key = (id(merge_fn), id(should_write))
+    key = (id(merge_fn), id(should_write), merge_name)
     if key in _gpu_kernel_cache:
         return _gpu_kernel_cache[key]
 
     from numba import cuda
 
+    atomic_mode = merge_name if merge_name in _GPU_ATOMIC_MERGES else None
+
+    # The per-pixel write strategy depends on ``atomic_mode``.  Numba
+    # treats free Python variables in a ``cuda.jit`` closure as compile-
+    # time constants, so the if/elif ladder below is folded away and
+    # each compiled kernel contains only the relevant atomic call.
+    #
+    # Initialisation contract (launcher fills ``out`` / ``order`` before
+    # the kernel runs):
+    #   sum, count    -> out = 0.0           , order = MIN_INT64
+    #   min           -> out = +inf          , order = MIN_INT64
+    #   max           -> out = -inf          , order = MIN_INT64
+    #   first         -> out = fill          , order = MAX_INT64
+    #   last          -> out = fill          , order = MIN_INT64
+    #   None (user)   -> out = fill          , order = -1 (legacy)
+    #
+    # For sum/count/min/max, ``order`` is repurposed as a touched mask:
+    # any non-sentinel value means the pixel was written by at least
+    # one geometry.  A host-side post-pass blends in ``fill`` where
+    # the pixel is still untouched, and substitutes ``fill`` for the
+    # numerical sentinels used by min/max init.
+    #
+    # For first/last, the kernel is two-pass: pass 1 resolves the
+    # winning input index atomically into ``order``; pass 2 stamps the
+    # winner's ``props[0]`` value into ``out``.  ``pass_id`` selects
+    # which branch executes (kernels are compiled once per pass).
+
     @cuda.jit(device=True)
     def _apply_merge_gpu(out, written, order, r, c, props, new_idx):
-        is_first = np.int64(written[r, c] == 0)
-        cur_idx = order[r, c]
-        if not should_write(is_first, new_idx, cur_idx):
-            return
-        out[r, c] = merge_fn(out[r, c], props, is_first)
-        written[r, c] = 1
-        order[r, c] = new_idx
+        if atomic_mode == 'sum':
+            cuda.atomic.add(out, (r, c), props[0])
+            cuda.atomic.max(order, (r, c), new_idx)
+        elif atomic_mode == 'count':
+            cuda.atomic.add(out, (r, c), 1.0)
+            cuda.atomic.max(order, (r, c), new_idx)
+        elif atomic_mode == 'min':
+            cuda.atomic.min(out, (r, c), props[0])
+            cuda.atomic.max(order, (r, c), new_idx)
+        elif atomic_mode == 'max':
+            cuda.atomic.max(out, (r, c), props[0])
+            cuda.atomic.max(order, (r, c), new_idx)
+        elif atomic_mode == 'first':
+            # Pass 1: resolve winning input index per pixel.
+            cuda.atomic.min(order, (r, c), new_idx)
+        elif atomic_mode == 'last':
+            cuda.atomic.max(order, (r, c), new_idx)
+        else:
+            # Legacy non-atomic path for user callables.
+            is_first = np.int64(written[r, c] == 0)
+            cur_idx = order[r, c]
+            if not should_write(is_first, new_idx, cur_idx):
+                return
+            out[r, c] = merge_fn(out[r, c], props, is_first)
+            written[r, c] = 1
+            order[r, c] = new_idx
+
+    # Pass-2 device function for first/last: stamp the winner's value
+    # into ``out``.  Only the geometry whose index won pass 1 writes.
+    # Multiple threads from the same geometry may write the same pixel,
+    # but they all write ``props[0]`` so the outcome is deterministic.
+    @cuda.jit(device=True)
+    def _apply_merge_gpu_pass2(out, written, order, r, c, props, new_idx):
+        if atomic_mode == 'first' or atomic_mode == 'last':
+            if order[r, c] == new_idx:
+                out[r, c] = props[0]
+        # else: pass 2 is a no-op for all other modes; the launcher
+        # simply skips it.
 
     @cuda.jit
     def _scanline_fill_gpu(out, written, order, edge_y_min, edge_x_at_ymin,
@@ -1187,10 +1262,143 @@ def _burn_lines_gpu(out, written, order, r0_arr, c0_arr, r1_arr, c1_arr,
                 c += sc
                 err += dr
 
+    # ---- Pass-2 kernels (only meaningful for first / last) -----------
+    # These mirror the pass-1 kernels but call ``_apply_merge_gpu_pass2``
+    # to stamp the winner's value once ``order`` holds the resolved
+    # input index.  Compiled unconditionally so the kernels dict shape
+    # stays uniform; for non-first/last modes the launcher does not
+    # invoke them.
+
+    @cuda.jit
+    def _scanline_fill_gpu_pass2(out, written, order, edge_y_min,
+                                 edge_x_at_ymin, edge_inv_slope,
+                                 edge_geom_id, geom_props, geom_global_idx,
+                                 row_ptr, col_idx, width):
+        row = cuda.grid(1)
+        if row >= out.shape[0]:
+            return
+        start = row_ptr[row]
+        end = row_ptr[row + 1]
+        if end - start == 0:
+            return
+        MAX_ISECT = 2048
+        xs = cuda.local.array(2048, dtype=np.float64)
+        gs = cuda.local.array(2048, dtype=np.int32)
+        actual = 0
+        for k in range(start, end):
+            if actual >= MAX_ISECT:
+                break
+            e = col_idx[k]
+            xs[actual] = (edge_x_at_ymin[e]
+                          + (row - edge_y_min[e]) * edge_inv_slope[e])
+            gs[actual] = edge_geom_id[e]
+            actual += 1
+        gap = actual >> 1
+        while gap > 0:
+            for i in range(gap, actual):
+                kx = xs[i]
+                kg = gs[i]
+                j = i - gap
+                while j >= 0 and (gs[j] > kg or (gs[j] == kg and xs[j] > kx)):
+                    xs[j + gap] = xs[j]
+                    gs[j + gap] = gs[j]
+                    j -= gap
+                xs[j + gap] = kx
+                gs[j + gap] = kg
+            gap >>= 1
+        i = 0
+        while i < actual - 1:
+            gid = gs[i]
+            new_idx = geom_global_idx[gid]
+            j = i
+            while j < actual and gs[j] == gid:
+                j += 1
+            k = i
+            while k + 1 < j:
+                x_start = xs[k]
+                x_end = xs[k + 1]
+                ix = int(x_start)
+                col_start = ix + 1 if x_start > ix else ix
+                if col_start < 0:
+                    col_start = 0
+                ix_end = int(x_end)
+                col_end = ix_end if x_end > ix_end else ix_end - 1
+                if col_end >= width:
+                    col_end = width - 1
+                for c in range(col_start, col_end + 1):
+                    _apply_merge_gpu_pass2(out, written, order, row, c,
+                                           geom_props[gid], new_idx)
+                k += 2
+            i = j
+
+    @cuda.jit
+    def _burn_points_gpu_pass2(out, written, order, rows, cols, geom_idx,
+                               geom_props, geom_global_idx, n_points):
+        i = cuda.grid(1)
+        if i >= n_points:
+            return
+        r = rows[i]
+        c = cols[i]
+        if 0 <= r < out.shape[0] and 0 <= c < out.shape[1]:
+            gi = geom_idx[i]
+            _apply_merge_gpu_pass2(out, written, order, r, c,
+                                   geom_props[gi], geom_global_idx[gi])
+
+    @cuda.jit
+    def _burn_lines_gpu_pass2(out, written, order, r0_arr, c0_arr, r1_arr,
+                              c1_arr, geom_idx, geom_props,
+                              geom_global_idx, n_segs, height, width):
+        i = cuda.grid(1)
+        if i >= n_segs:
+            return
+        r0 = r0_arr[i]
+        c0 = c0_arr[i]
+        r1 = r1_arr[i]
+        c1 = c1_arr[i]
+        gi = geom_idx[i]
+        props = geom_props[gi]
+        new_idx = geom_global_idx[gi]
+        dr = r1 - r0
+        dc = c1 - c0
+        sr = 1 if dr >= 0 else -1
+        sc = 1 if dc >= 0 else -1
+        if dr < 0:
+            dr = -dr
+        if dc < 0:
+            dc = -dc
+        if dr >= dc:
+            err = dc - dr
+            r, c = r0, c0
+            for _ in range(dr + 1):
+                if 0 <= r < height and 0 <= c < width:
+                    _apply_merge_gpu_pass2(out, written, order, r, c,
+                                           props, new_idx)
+                if err >= 0:
+                    c += sc
+                    err -= dr
+                r += sr
+                err += dc
+        else:
+            err = dr - dc
+            r, c = r0, c0
+            for _ in range(dc + 1):
+                if 0 <= r < height and 0 <= c < width:
+                    _apply_merge_gpu_pass2(out, written, order, r, c,
+                                           props, new_idx)
+                if err >= 0:
+                    r += sr
+                    err -= dc
+                c += sc
+                err += dr
+
     kernels = {
         'scanline_fill': _scanline_fill_gpu,
         'burn_points': _burn_points_gpu,
         'burn_lines': _burn_lines_gpu,
+        'scanline_fill_pass2': _scanline_fill_gpu_pass2,
+        'burn_points_pass2': _burn_points_gpu_pass2,
+        'burn_lines_pass2': _burn_lines_gpu_pass2,
+        'atomic_mode': atomic_mode,
     }
     _gpu_kernel_cache[key] = kernels
     return kernels
@@ -1253,14 +1461,84 @@ def _build_row_csr_numba(edge_y_min, edge_y_max, height):
     return row_ptr, col_idx
 
 
+#: Sentinels used to mark "untouched" pixels in the GPU ``order`` array.
+#: Numba's ``cuda.atomic.{min,max}`` on int64 accepts standard limits.
+_GPU_ORDER_SENTINEL_MIN = np.int64(np.iinfo(np.int64).min)
+_GPU_ORDER_SENTINEL_MAX = np.int64(np.iinfo(np.int64).max)
+
+
+def _gpu_init_buffers(height, width, fill, merge_name):
+    """Allocate ``out`` / ``written`` / ``order`` with the right initial
+    values for the selected atomic merge.  Returns the trio plus the
+    sentinel value used in ``order`` so the caller can blend ``fill``
+    back in afterwards.
+    """
+    written = cupy.zeros((height, width), dtype=cupy.int8)
+    if merge_name in ('sum', 'count'):
+        out = cupy.zeros((height, width), dtype=cupy.float64)
+        order = cupy.full((height, width), _GPU_ORDER_SENTINEL_MIN,
+                          dtype=cupy.int64)
+        order_sentinel = _GPU_ORDER_SENTINEL_MIN
+    elif merge_name == 'min':
+        out = cupy.full((height, width), cupy.inf, dtype=cupy.float64)
+        order = cupy.full((height, width), _GPU_ORDER_SENTINEL_MIN,
+                          dtype=cupy.int64)
+        order_sentinel = _GPU_ORDER_SENTINEL_MIN
+    elif merge_name == 'max':
+        out = cupy.full((height, width), -cupy.inf, dtype=cupy.float64)
+        order = cupy.full((height, width), _GPU_ORDER_SENTINEL_MIN,
+                          dtype=cupy.int64)
+        order_sentinel = _GPU_ORDER_SENTINEL_MIN
+    elif merge_name == 'first':
+        out = cupy.full((height, width), fill, dtype=cupy.float64)
+        order = cupy.full((height, width), _GPU_ORDER_SENTINEL_MAX,
+                          dtype=cupy.int64)
+        order_sentinel = _GPU_ORDER_SENTINEL_MAX
+    elif merge_name == 'last':
+        out = cupy.full((height, width), fill, dtype=cupy.float64)
+        order = cupy.full((height, width), _GPU_ORDER_SENTINEL_MIN,
+                          dtype=cupy.int64)
+        order_sentinel = _GPU_ORDER_SENTINEL_MIN
+    else:
+        # User callable path: legacy initialisation, non-atomic semantics.
+        out = cupy.full((height, width), fill, dtype=cupy.float64)
+        order = cupy.full((height, width), -1, dtype=cupy.int64)
+        order_sentinel = None
+    return out, written, order, order_sentinel
+
+
+def _gpu_finalize_buffers(out, order, order_sentinel, fill, merge_name):
+    """Apply the post-pass blend that restores ``fill`` for untouched
+    pixels (sum/count/min/max/first/last) and replaces the +/- inf
+    initialisers used by min/max.
+
+    Returns ``out`` (modified in-place where convenient).
+    """
+    if merge_name is None or order_sentinel is None:
+        return out
+    touched = order != order_sentinel
+    if merge_name in ('sum', 'count'):
+        # Pixels with no contribution should show ``fill``, not 0.
+        out = cupy.where(touched, out, cupy.float64(fill))
+    elif merge_name == 'min':
+        # +inf initialiser leaks into untouched pixels.
+        out = cupy.where(touched, out, cupy.float64(fill))
+    elif merge_name == 'max':
+        out = cupy.where(touched, out, cupy.float64(fill))
+    # first/last already initialise ``out`` to ``fill``; pass-2 only
+    # writes for touched pixels, so no blend is needed.
+    return out
+
+
 def _run_cupy(geometries, props_array, bounds, height, width, fill, dtype,
-              all_touched, merge_fn, should_write):
+              all_touched, merge_fn, should_write, merge_name=None):
     """CuPy backend for rasterize."""
-    kernels = _ensure_gpu_kernels(merge_fn, should_write)
+    kernels = _ensure_gpu_kernels(merge_fn, should_write, merge_name)
+    atomic_mode = kernels['atomic_mode']
+    two_pass = atomic_mode in ('first', 'last')
 
-    out = cupy.full((height, width), fill, dtype=cupy.float64)
-    written = cupy.zeros((height, width), dtype=cupy.int8)
-    order = cupy.full((height, width), -1, dtype=cupy.int64)
+    out, written, order, order_sentinel = _gpu_init_buffers(
+        height, width, fill, atomic_mode)
 
     (poly_geoms, poly_props, poly_ids, poly_global), \
         (line_geoms, line_props, line_global), \
@@ -1274,77 +1552,100 @@ def _run_cupy(geometries, props_array, bounds, height, width, fill, dtype,
     edge_y_min, edge_y_max, edge_x_at_ymin, edge_inv_slope, \
         edge_geom_id = edge_arrays
 
+    # Stage the per-launch inputs once; first/last reuses these tensors
+    # for the pass-2 stamp launches.
+    poly_launch = None
     if len(edge_y_min) > 0:
-        # Build CSR structure on CPU, then transfer to GPU
         row_ptr, col_idx = _build_row_csr_numba(
             edge_y_min, edge_y_max, height)
-
         # The CUDA scanline kernel allocates a fixed-size local array; rows
         # exceeding _GPU_MAX_ISECT would have edges silently dropped and
         # produce wrong output.  Fail fast instead.
         _check_gpu_edge_cap(row_ptr)
+        poly_launch = (
+            cupy.asarray(edge_y_min), cupy.asarray(edge_x_at_ymin),
+            cupy.asarray(edge_inv_slope), cupy.asarray(edge_geom_id),
+            cupy.asarray(poly_props), cupy.asarray(poly_global),
+            cupy.asarray(row_ptr), cupy.asarray(col_idx))
 
-        d_y_min = cupy.asarray(edge_y_min)
-        d_x_at_ymin = cupy.asarray(edge_x_at_ymin)
-        d_inv_slope = cupy.asarray(edge_inv_slope)
-        d_geom_id = cupy.asarray(edge_geom_id)
-        d_geom_props = cupy.asarray(poly_props)
-        d_poly_global = cupy.asarray(poly_global)
-        d_row_ptr = cupy.asarray(row_ptr)
-        d_col_idx = cupy.asarray(col_idx)
-
-        tpb = 256
-        blocks = (height + tpb - 1) // tpb
-        kernels['scanline_fill'][blocks, tpb](
-            out, written, order, d_y_min, d_x_at_ymin, d_inv_slope,
-            d_geom_id, d_geom_props, d_poly_global,
-            d_row_ptr, d_col_idx, width)
-
-    # 1b. all_touched: draw polygon boundaries via Bresenham (GPU)
+    boundary_launch = None
     if all_touched and poly_geoms:
         br0, bc0, br1, bc1, bidx = _extract_polygon_boundary_segments(
             poly_geoms, poly_ids, bounds, height, width)
         if len(br0) > 0:
-            n_bsegs = len(br0)
-            tpb = 256
-            bpg = (n_bsegs + tpb - 1) // tpb
-            kernels['burn_lines'][bpg, tpb](
-                out, written, order,
+            boundary_launch = (
                 cupy.asarray(br0), cupy.asarray(bc0),
                 cupy.asarray(br1), cupy.asarray(bc1),
                 cupy.asarray(bidx), cupy.asarray(poly_props),
-                cupy.asarray(poly_global),
-                n_bsegs, height, width)
+                cupy.asarray(poly_global), len(br0))
 
-    # 2. Lines
     r0, c0, r1, c1, line_idx = _extract_line_segments(
         line_geoms, bounds, height, width)
+    line_launch = None
     if len(r0) > 0:
-        n_segs = len(r0)
-        tpb = 256
-        bpg = (n_segs + tpb - 1) // tpb
-        kernels['burn_lines'][bpg, tpb](
-            out, written, order,
+        line_launch = (
             cupy.asarray(r0), cupy.asarray(c0),
             cupy.asarray(r1), cupy.asarray(c1),
             cupy.asarray(line_idx), cupy.asarray(line_props),
-            cupy.asarray(line_global),
-            n_segs, height, width)
+            cupy.asarray(line_global), len(r0))
 
-    # 3. Points
     prows, pcols, pt_idx = _extract_points(
         point_geoms, bounds, height, width)
+    point_launch = None
     if len(prows) > 0:
-        n_pts = len(prows)
-        tpb = 256
-        bpg = (n_pts + tpb - 1) // tpb
-        kernels['burn_points'][bpg, tpb](
-            out, written, order,
+        point_launch = (
             cupy.asarray(prows), cupy.asarray(pcols),
             cupy.asarray(pt_idx), cupy.asarray(point_props),
-            cupy.asarray(point_global), n_pts)
+            cupy.asarray(point_global), len(prows))
 
-    return out.astype(dtype)
+    def _launch_phase(k_scan, k_lines, k_points):
+        tpb = 256
+        if poly_launch is not None:
+            blocks = (height + tpb - 1) // tpb
+            k_scan[blocks, tpb](out, written, order,
+                                poly_launch[0], poly_launch[1],
+                                poly_launch[2], poly_launch[3],
+                                poly_launch[4], poly_launch[5],
+                                poly_launch[6], poly_launch[7], width)
+        if boundary_launch is not None:
+            n_bsegs = boundary_launch[7]
+            bpg = (n_bsegs + tpb - 1) // tpb
+            k_lines[bpg, tpb](out, written, order,
+                              boundary_launch[0], boundary_launch[1],
+                              boundary_launch[2], boundary_launch[3],
+                              boundary_launch[4], boundary_launch[5],
+                              boundary_launch[6], n_bsegs, height, width)
+        if line_launch is not None:
+            n_segs = line_launch[7]
+            bpg = (n_segs + tpb - 1) // tpb
+            k_lines[bpg, tpb](out, written, order,
+                              line_launch[0], line_launch[1],
+                              line_launch[2], line_launch[3],
+                              line_launch[4], line_launch[5],
+                              line_launch[6], n_segs, height, width)
+        if point_launch is not None:
+            n_pts = point_launch[5]
+            bpg = (n_pts + tpb - 1) // tpb
+            k_points[bpg, tpb](out, written, order,
+                               point_launch[0], point_launch[1],
+                               point_launch[2], point_launch[3],
+                               point_launch[4], n_pts)
+
+    # Pass 1: write values (atomics) or, for first/last, resolve the
+    # per-pixel winning input index into ``order``.
+    _launch_phase(kernels['scanline_fill'],
+                  kernels['burn_lines'],
+                  kernels['burn_points'])
+
+    # Pass 2 (first / last only): stamp the winner's value into ``out``.
+    if two_pass:
+        _launch_phase(kernels['scanline_fill_pass2'],
+                      kernels['burn_lines_pass2'],
+                      kernels['burn_points_pass2'])
+
+    final_out = _gpu_finalize_buffers(out, order, order_sentinel, fill,
+                                      atomic_mode)
+    return final_out.astype(dtype)
 
 
 # ---------------------------------------------------------------------------
@@ -1687,86 +1988,119 @@ def _rasterize_tile_cupy(poly_wkb, poly_props_2d, poly_global_2d, tile_bounds,
                          seg_r0, seg_c0, seg_r1, seg_c1, seg_geom_idx,
                          line_props, line_global,
                          pt_rows, pt_cols, pt_geom_idx,
-                         point_props, point_global):
-    """GPU tile rasterization: polygons as WKB, lines/points as segments."""
-    kernels = _ensure_gpu_kernels(merge_fn, should_write)
+                         point_props, point_global, merge_name=None):
+    """GPU tile rasterization: polygons as WKB, lines/points as segments.
+
+    When ``merge_name`` selects an atomic built-in merge, the per-tile
+    kernels use CUDA atomics so overlapping writes inside the tile are
+    deterministic.  Cross-tile pixels do not overlap (the tiling is a
+    partition of the output grid), so per-tile atomicity is sufficient.
+    """
+    kernels = _ensure_gpu_kernels(merge_fn, should_write, merge_name)
+    atomic_mode = kernels['atomic_mode']
+    two_pass = atomic_mode in ('first', 'last')
 
-    out = cupy.full((tile_h, tile_w), fill, dtype=cupy.float64)
-    written = cupy.zeros((tile_h, tile_w), dtype=cupy.int8)
-    order = cupy.full((tile_h, tile_w), -1, dtype=cupy.int64)
+    out, written, order, order_sentinel = _gpu_init_buffers(
+        tile_h, tile_w, fill, atomic_mode)
 
-    # 1. Polygons (deserialize WKB, then scanline fill on GPU)
+    # Stage launches (same pattern as _run_cupy).  Stage once so the
+    # first/last pass-2 sweep reuses the device buffers.
+    poly_launch = None
     if poly_wkb:
         poly_geoms = _polys_from_wkb(poly_wkb)
         poly_ids = np.arange(len(poly_geoms), dtype=np.int32)
         edge_arrays = _extract_edges(
-            poly_geoms, poly_ids, tile_bounds,
-            tile_h, tile_w)
+            poly_geoms, poly_ids, tile_bounds, tile_h, tile_w)
         edge_arrays = _sort_edges(edge_arrays)
         edge_y_min = edge_arrays[0]
         if len(edge_y_min) > 0:
-            edge_y_max, edge_x_at_ymin, edge_inv_slope, \
-                edge_geom_id = edge_arrays[1:]
+            edge_y_max, edge_x_at_ymin, edge_inv_slope, edge_geom_id = \
+                edge_arrays[1:]
             row_ptr, col_idx = _build_row_csr_numba(
                 edge_y_min, edge_y_max, tile_h)
-            # Same fail-fast as eager CuPy: per-tile truncation would be
-            # silent without this check.
             _check_gpu_edge_cap(row_ptr)
-            tpb = 256
-            blocks = (tile_h + tpb - 1) // tpb
-            kernels['scanline_fill'][blocks, tpb](
-                out, written, order,
+            poly_launch = (
                 cupy.asarray(edge_y_min), cupy.asarray(edge_x_at_ymin),
                 cupy.asarray(edge_inv_slope), cupy.asarray(edge_geom_id),
                 cupy.asarray(poly_props_2d), cupy.asarray(poly_global_2d),
-                cupy.asarray(row_ptr), cupy.asarray(col_idx), tile_w)
+                cupy.asarray(row_ptr), cupy.asarray(col_idx))
 
-        # 1b. all_touched: draw polygon boundaries via Bresenham (GPU)
-        if all_touched:
-            br0, bc0, br1, bc1, bidx = _extract_polygon_boundary_segments(
-                poly_geoms, poly_ids, tile_bounds, tile_h, tile_w)
-            if len(br0) > 0:
-                n_bsegs = len(br0)
-                tpb_b = 256
-                bpg_b = (n_bsegs + tpb_b - 1) // tpb_b
-                kernels['burn_lines'][bpg_b, tpb_b](
-                    out, written, order,
-                    cupy.asarray(br0), cupy.asarray(bc0),
-                    cupy.asarray(br1), cupy.asarray(bc1),
-                    cupy.asarray(bidx), cupy.asarray(poly_props_2d),
-                    cupy.asarray(poly_global_2d),
-                    n_bsegs, tile_h, tile_w)
-
-    # 2. Lines (tile-local segments, GPU Bresenham)
+    boundary_launch = None
+    if poly_wkb and all_touched:
+        br0, bc0, br1, bc1, bidx = _extract_polygon_boundary_segments(
+            poly_geoms, poly_ids, tile_bounds, tile_h, tile_w)
+        if len(br0) > 0:
+            boundary_launch = (
+                cupy.asarray(br0), cupy.asarray(bc0),
+                cupy.asarray(br1), cupy.asarray(bc1),
+                cupy.asarray(bidx), cupy.asarray(poly_props_2d),
+                cupy.asarray(poly_global_2d), len(br0))
+
+    line_launch = None
     if len(seg_r0) > 0:
-        n_segs = len(seg_r0)
-        tpb = 256
-        bpg = (n_segs + tpb - 1) // tpb
-        kernels['burn_lines'][bpg, tpb](
-            out, written, order,
+        line_launch = (
             cupy.asarray(seg_r0), cupy.asarray(seg_c0),
             cupy.asarray(seg_r1), cupy.asarray(seg_c1),
             cupy.asarray(seg_geom_idx), _ensure_cupy(line_props),
-            _ensure_cupy(line_global),
-            n_segs, tile_h, tile_w)
+            _ensure_cupy(line_global), len(seg_r0))
 
-    # 3. Points (tile-local)
+    point_launch = None
     if len(pt_rows) > 0:
-        n_pts = len(pt_rows)
-        tpb = 256
-        bpg = (n_pts + tpb - 1) // tpb
-        kernels['burn_points'][bpg, tpb](
-            out, written, order,
+        point_launch = (
             cupy.asarray(pt_rows), cupy.asarray(pt_cols),
             cupy.asarray(pt_geom_idx), _ensure_cupy(point_props),
-            _ensure_cupy(point_global), n_pts)
+            _ensure_cupy(point_global), len(pt_rows))
 
-    return out.astype(dtype)
+    def _launch_phase(k_scan, k_lines, k_points):
+        tpb = 256
+        if poly_launch is not None:
+            blocks = (tile_h + tpb - 1) // tpb
+            k_scan[blocks, tpb](out, written, order,
+                                poly_launch[0], poly_launch[1],
+                                poly_launch[2], poly_launch[3],
+                                poly_launch[4], poly_launch[5],
+                                poly_launch[6], poly_launch[7], tile_w)
+        if boundary_launch is not None:
+            n_bsegs = boundary_launch[7]
+            bpg = (n_bsegs + tpb - 1) // tpb
+            k_lines[bpg, tpb](out, written, order,
+                              boundary_launch[0], boundary_launch[1],
+                              boundary_launch[2], boundary_launch[3],
+                              boundary_launch[4], boundary_launch[5],
+                              boundary_launch[6],
+                              n_bsegs, tile_h, tile_w)
+        if line_launch is not None:
+            n_segs = line_launch[7]
+            bpg = (n_segs + tpb - 1) // tpb
+            k_lines[bpg, tpb](out, written, order,
+                              line_launch[0], line_launch[1],
+                              line_launch[2], line_launch[3],
+                              line_launch[4], line_launch[5],
+                              line_launch[6], n_segs, tile_h, tile_w)
+        if point_launch is not None:
+            n_pts = point_launch[5]
+            bpg = (n_pts + tpb - 1) // tpb
+            k_points[bpg, tpb](out, written, order,
+                               point_launch[0], point_launch[1],
+                               point_launch[2], point_launch[3],
+                               point_launch[4], n_pts)
+
+    _launch_phase(kernels['scanline_fill'],
+                  kernels['burn_lines'],
+                  kernels['burn_points'])
+    if two_pass:
+        _launch_phase(kernels['scanline_fill_pass2'],
+                      kernels['burn_lines_pass2'],
+                      kernels['burn_points_pass2'])
+
+    final_out = _gpu_finalize_buffers(out, order, order_sentinel, fill,
+                                      atomic_mode)
+    return final_out.astype(dtype)
 
 
 def _run_dask_cupy(geometries, props_array, bounds, height, width, fill,
                    dtype, all_touched, merge_fn, should_write,
-                   row_chunks, col_chunks):
+                   row_chunks, col_chunks, merge_name=None):
     """Dask + CuPy backend: tile-based parallel GPU rasterization."""
     import dask
     import dask.array as da
@@ -1845,7 +2179,8 @@ def _run_dask_cupy(geometries, props_array, bounds, height, width, fill,
                 tile_h, tile_w, fill, dtype, all_touched,
                 merge_fn, should_write,
                 *ts, tile_line_props, tile_line_global,
-                *tp, tile_point_props, tile_point_global)
+                *tp, tile_point_props, tile_point_global,
+                merge_name)
             blocks[i][j] = da.from_delayed(
                 delayed_tile, shape=(tile_h, tile_w), dtype=dtype,
                 meta=cupy.empty(()))
@@ -2229,6 +2564,10 @@ def rasterize(
     # For GPU paths, resolve string merge names to GPU (merge_fn,
     # should_write_fn) pairs.  User callables are paired with the
     # always-write predicate (matches the public 3-arg signature).
+    # ``gpu_merge_name`` selects the atomic kernel path in
+    # ``_ensure_gpu_kernels``; ``None`` falls back to the non-atomic
+    # closure path used for user callables.
+    gpu_merge_name = None
     if use_cuda:
         if cupy is None:
             raise ImportError(
@@ -2236,6 +2575,7 @@ def rasterize(
         gpu_fns = _get_gpu_merge_fns()
         if isinstance(_merge_fn_gpu, str):
             gpu_merge_fn, should_write_gpu = gpu_fns[_merge_fn_gpu]
+            gpu_merge_name = _merge_fn_gpu
         else:
             gpu_merge_fn = _merge_fn_gpu
             # Need an always-true predicate compiled for cuda; reuse any
@@ -2251,7 +2591,7 @@ def rasterize(
                 geom_list, props_array, final_bounds,
                 final_height, final_width, fill, final_dtype,
                 all_touched, gpu_merge_fn, should_write_gpu,
-                row_chunks, col_chunks)
+                row_chunks, col_chunks, gpu_merge_name)
         else:
             out = _run_dask_numpy(
                 geom_list, props_array, final_bounds,
@@ -2261,7 +2601,8 @@ def rasterize(
     elif use_cuda:
         out = _run_cupy(geom_list, props_array, final_bounds,
                         final_height, final_width, fill, final_dtype,
-                        all_touched, gpu_merge_fn, should_write_gpu)
+                        all_touched, gpu_merge_fn, should_write_gpu,
+                        gpu_merge_name)
     else:
         out = _run_numpy(geom_list, props_array, final_bounds,
                          final_height, final_width, fill, final_dtype,

From ab958851051478e15f8f3a961c6d717b0f46a8bc Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 20 May 2026 08:35:03 -0700
Subject: [PATCH 2/3] Add GPU rasterize race regression tests (#2167)

Cover the six built-in aggregators (last, first, sum, count, min,
max) across three overlap scenarios: coincident points, crossing
line segments, and duplicate segments. Assert that the cupy output
matches the numpy output exactly and that repeated cupy runs
produce identical arrays.
---
 .../tests/test_rasterize_gpu_race_2167.py     | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 xrspatial/tests/test_rasterize_gpu_race_2167.py

diff --git a/xrspatial/tests/test_rasterize_gpu_race_2167.py b/xrspatial/tests/test_rasterize_gpu_race_2167.py
new file mode 100644
index 000000000..1b1af2470
--- /dev/null
+++ b/xrspatial/tests/test_rasterize_gpu_race_2167.py
@@ -0,0 +1,184 @@
+"""Regression tests for issue #2167.
+
+GPU point and line rasterize kernels previously did non-atomic
+read-modify-write on the per-pixel state.  When multiple threads
+landed on the same pixel (overlapping points, crossing line
+segments, duplicate segments, shared polygon boundaries with
+``all_touched=True``) the writes raced and the cupy backend could
+return a value that disagreed with the numpy backend, and varied
+from run to run.
+
+These tests pin down deterministic, numpy-matching behaviour for
+the six built-in aggregators (``last``, ``first``, ``sum``,
+``count``, ``min``, ``max``) on overlapping geometries.
+
+When cupy / CUDA are not available the whole module skips.
+"""
+import numpy as np
+import pytest
+
+try:
+    from shapely.geometry import LineString, Point
+    has_shapely = True
+except ImportError:
+    has_shapely = False
+
+if has_shapely:
+    from xrspatial.rasterize import rasterize
+
+try:
+    import cupy  # noqa: F401
+    has_cupy = True
+except ImportError:
+    has_cupy = False
+
+try:
+    from numba import cuda
+    has_cuda = has_cupy and cuda.is_available()
+except Exception:
+    has_cuda = False
+
+
+pytestmark = [
+    pytest.mark.skipif(not has_shapely, reason="shapely not installed"),
+    pytest.mark.skipif(not has_cuda, reason="CUDA / CuPy not available"),
+]
+
+
+BOUNDS = (0.0, 0.0, 10.0, 10.0)
+HEIGHT = 10
+WIDTH = 10
+MERGES = ('last', 'first', 'sum', 'count', 'min', 'max')
+
+
+def _as_numpy(arr):
+    """Return a plain numpy ndarray regardless of backend."""
+    data = arr.data
+    if hasattr(data, 'get'):
+        return data.get()
+    return np.asarray(data)
+
+
+def _run(geom_list, merge, use_cuda):
+    return rasterize(
+        geom_list, width=WIDTH, height=HEIGHT, bounds=BOUNDS,
+        merge=merge, use_cuda=use_cuda, fill=np.nan,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Scenario builders
+# ---------------------------------------------------------------------------
+
+def _coincident_points():
+    """Three points that land on the same pixel, plus one offset point."""
+    # Pixel grid spans 0-10 in 10 steps -> 1 unit per pixel.  All three
+    # of these land in the pixel centred at (5.5, 5.5).  The fourth one
+    # is well away so we can confirm non-overlap is unaffected.
+    return [
+        (Point(5.5, 5.5), 1.0),
+        (Point(5.7, 5.3), 2.0),
+        (Point(5.2, 5.8), 4.0),
+        (Point(1.5, 1.5), 9.0),
+    ]
+
+
+def _crossing_lines():
+    """Two line segments that cross at one or more pixels."""
+    return [
+        (LineString([(0.5, 0.5), (9.5, 9.5)]), 1.0),
+        (LineString([(0.5, 9.5), (9.5, 0.5)]), 2.0),
+        (LineString([(2.5, 5.5), (7.5, 5.5)]), 4.0),
+    ]
+
+
+def _duplicate_segments():
+    """Two identical line segments -- every pixel they cover overlaps."""
+    coords = [(1.5, 1.5), (8.5, 8.5)]
+    return [
+        (LineString(coords), 1.0),
+        (LineString(coords), 8.0),
+    ]
+
+
+SCENARIOS = {
+    'coincident_points': _coincident_points,
+    'crossing_lines': _crossing_lines,
+    'duplicate_segments': _duplicate_segments,
+}
+
+
+# ---------------------------------------------------------------------------
+# Cross-backend parity
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize('scenario', list(SCENARIOS))
+@pytest.mark.parametrize('merge', MERGES)
+def test_cupy_matches_numpy_on_overlap(scenario, merge):
+    geoms = SCENARIOS[scenario]()
+    expected = _as_numpy(_run(geoms, merge, use_cuda=False))
+    actual = _as_numpy(_run(geoms, merge, use_cuda=True))
+
+    # Pixels touched by no geometry should be ``fill`` (NaN here) on
+    # both backends.  np.testing.assert_array_equal treats NaN==NaN
+    # when equal_nan=True is requested via assert_array_equal in
+    # recent NumPy; use the explicit form for portability.
+    np.testing.assert_allclose(
+        actual, expected, rtol=0, atol=0, equal_nan=True,
+        err_msg=(
+            f"cupy backend disagrees with numpy for scenario "
+            f"{scenario!r} and merge {merge!r}.\n"
+            f"cupy:\n{actual}\nnumpy:\n{expected}"
+        ),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Determinism: same input must give identical output across runs.
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize('scenario', list(SCENARIOS))
+@pytest.mark.parametrize('merge', MERGES)
+def test_cupy_is_deterministic_across_runs(scenario, merge):
+    geoms = SCENARIOS[scenario]()
+    first = _as_numpy(_run(geoms, merge, use_cuda=True))
+    # Six repeats is enough to surface a thread-interleaving race
+    # without making the test slow.
+    for _ in range(5):
+        again = _as_numpy(_run(geoms, merge, use_cuda=True))
+        np.testing.assert_allclose(
+            again, first, rtol=0, atol=0, equal_nan=True,
+            err_msg=(
+                f"cupy backend produced different results across runs "
+                f"for scenario {scenario!r} and merge {merge!r}.\n"
+                f"first:\n{first}\nagain:\n{again}"
+            ),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Spot-check that sum on coincident points actually accumulates.  Without
+# the atomic-add fix, this test would either return one of the input
+# values or NaN, never the true sum.
+# ---------------------------------------------------------------------------
+
+def test_sum_of_coincident_points_equals_total():
+    geoms = _coincident_points()
+    # The first three points all land in the same pixel; sum should be 7.
+    result = _as_numpy(_run(geoms, 'sum', use_cuda=True))
+    # Find the pixel with the largest accumulated value.
+    finite = result[np.isfinite(result)]
+    assert finite.size > 0
+    # Three coincident points (1 + 2 + 4) plus an isolated point of 9.
+    assert 7.0 in finite, (
+        f"expected the three-point pixel to sum to 7.0; got {finite}"
+    )
+
+
+def test_count_of_coincident_points_equals_three():
+    geoms = _coincident_points()
+    result = _as_numpy(_run(geoms, 'count', use_cuda=True))
+    finite = result[np.isfinite(result)]
+    assert 3.0 in finite, (
+        f"expected the three-point pixel to count 3; got {finite}"
+    )

From 2842709fad924866203c55e9014aa51578463400 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 20 May 2026 08:40:23 -0700
Subject: [PATCH 3/3] Address review nits: tighten GPU rasterize atomics
 (#2167)

- Skip the (H, W) ``written`` allocation when the atomic kernel path
  is selected. Atomic modes never read or write ``written``; a (1, 1)
  placeholder satisfies the kernel signature without spending H*W
  bytes on dead storage.
- Nest the all_touched boundary launch inside the ``if poly_wkb:``
  block in ``_rasterize_tile_cupy`` so the dependency on
  ``poly_geoms`` / ``poly_ids`` is local. Add a parallel comment to
  ``_run_cupy`` explaining why the same shape is safe there
  (``poly_geoms`` comes from ``_classify_geometries`` unconditionally).
- Fix the stale ``pass_id`` comment in ``_ensure_gpu_kernels``; pass 1
  and pass 2 are separate compiled kernels, not branches selected by a
  runtime ``pass_id``.
- Add a shared-boundary polygon scenario (cross-backend parity plus
  per-run determinism) to the regression tests. This is the polygon
  analogue of the line-overlap case and exercises the all_touched
  scanline-plus-boundary write pattern.
---
 xrspatial/rasterize.py                        | 45 +++++++++++-----
 .../tests/test_rasterize_gpu_race_2167.py     | 51 ++++++++++++++++++-
 2 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/xrspatial/rasterize.py b/xrspatial/rasterize.py
index debe564be..32a39b8bd 100644
--- a/xrspatial/rasterize.py
+++ b/xrspatial/rasterize.py
@@ -1076,8 +1076,10 @@ def _ensure_gpu_kernels(merge_fn, should_write, merge_name=None):
     #
     # For first/last, the kernel is two-pass: pass 1 resolves the
     # winning input index atomically into ``order``; pass 2 stamps the
-    # winner's ``props[0]`` value into ``out``.  ``pass_id`` selects
-    # which branch executes (kernels are compiled once per pass).
+    # winner's ``props[0]`` value into ``out``.  Pass 1 and pass 2 are
+    # separate compiled kernels (``_..._gpu`` and ``_..._gpu_pass2``);
+    # ``_run_cupy`` / ``_rasterize_tile_cupy`` skip the pass-2 launch
+    # for all other modes.
 
     @cuda.jit(device=True)
     def _apply_merge_gpu(out, written, order, r, c, props, new_idx):
@@ -1472,8 +1474,16 @@ def _gpu_init_buffers(height, width, fill, merge_name):
     values for the selected atomic merge.  Returns the trio plus the
     sentinel value used in ``order`` so the caller can blend ``fill``
     back in afterwards.
+
+    The atomic kernels never consult ``written`` (the legacy
+    user-callable path is the only consumer), so atomic modes get a
+    placeholder ``(1, 1)`` buffer to satisfy the kernel signature
+    without spending ``H*W`` bytes on dead storage.
     """
-    written = cupy.zeros((height, width), dtype=cupy.int8)
+    if merge_name in _GPU_ATOMIC_MERGES:
+        written = cupy.zeros((1, 1), dtype=cupy.int8)
+    else:
+        written = cupy.zeros((height, width), dtype=cupy.int8)
     if merge_name in ('sum', 'count'):
         out = cupy.zeros((height, width), dtype=cupy.float64)
         order = cupy.full((height, width), _GPU_ORDER_SENTINEL_MIN,
@@ -1568,6 +1578,10 @@ def _run_cupy(geometries, props_array, bounds, height, width, fill, dtype,
             cupy.asarray(poly_props), cupy.asarray(poly_global),
             cupy.asarray(row_ptr), cupy.asarray(col_idx))
 
+    # all_touched boundaries.  ``poly_geoms`` and ``poly_ids`` come
+    # from ``_classify_geometries`` above and are always defined here,
+    # so this block does not depend on the polygon scanline path
+    # having allocated ``poly_launch``.
     boundary_launch = None
     if all_touched and poly_geoms:
         br0, bc0, br1, bc1, bidx = _extract_polygon_boundary_segments(
@@ -2004,8 +2018,12 @@ def _rasterize_tile_cupy(poly_wkb, poly_props_2d, poly_global_2d, tile_bounds,
         tile_h, tile_w, fill, atomic_mode)
 
     # Stage launches (same pattern as _run_cupy).  Stage once so the
-    # first/last pass-2 sweep reuses the device buffers.
+    # first/last pass-2 sweep reuses the device buffers.  ``poly_geoms``
+    # and ``poly_ids`` are bound inside the ``if poly_wkb:`` block, so
+    # ``boundary_launch`` is staged inside the same block to keep the
+    # dependency local.
     poly_launch = None
+    boundary_launch = None
     if poly_wkb:
         poly_geoms = _polys_from_wkb(poly_wkb)
         poly_ids = np.arange(len(poly_geoms), dtype=np.int32)
@@ -2025,16 +2043,15 @@ def _rasterize_tile_cupy(poly_wkb, poly_props_2d, poly_global_2d, tile_bounds,
                 cupy.asarray(poly_props_2d), cupy.asarray(poly_global_2d),
                 cupy.asarray(row_ptr), cupy.asarray(col_idx))
 
-    boundary_launch = None
-    if poly_wkb and all_touched:
-        br0, bc0, br1, bc1, bidx = _extract_polygon_boundary_segments(
-            poly_geoms, poly_ids, tile_bounds, tile_h, tile_w)
-        if len(br0) > 0:
-            boundary_launch = (
-                cupy.asarray(br0), cupy.asarray(bc0),
-                cupy.asarray(br1), cupy.asarray(bc1),
-                cupy.asarray(bidx), cupy.asarray(poly_props_2d),
-                cupy.asarray(poly_global_2d), len(br0))
+        if all_touched:
+            br0, bc0, br1, bc1, bidx = _extract_polygon_boundary_segments(
+                poly_geoms, poly_ids, tile_bounds, tile_h, tile_w)
+            if len(br0) > 0:
+                boundary_launch = (
+                    cupy.asarray(br0), cupy.asarray(bc0),
+                    cupy.asarray(br1), cupy.asarray(bc1),
+                    cupy.asarray(bidx), cupy.asarray(poly_props_2d),
+                    cupy.asarray(poly_global_2d), len(br0))
 
     line_launch = None
     if len(seg_r0) > 0:
diff --git a/xrspatial/tests/test_rasterize_gpu_race_2167.py b/xrspatial/tests/test_rasterize_gpu_race_2167.py
index 1b1af2470..f1f6e9557 100644
--- a/xrspatial/tests/test_rasterize_gpu_race_2167.py
+++ b/xrspatial/tests/test_rasterize_gpu_race_2167.py
@@ -18,7 +18,7 @@
 import pytest
 
 try:
-    from shapely.geometry import LineString, Point
+    from shapely.geometry import LineString, Point, box
     has_shapely = True
 except ImportError:
     has_shapely = False
@@ -59,10 +59,11 @@ def _as_numpy(arr):
     return np.asarray(data)
 
 
-def _run(geom_list, merge, use_cuda):
+def _run(geom_list, merge, use_cuda, all_touched=False):
     return rasterize(
         geom_list, width=WIDTH, height=HEIGHT, bounds=BOUNDS,
         merge=merge, use_cuda=use_cuda, fill=np.nan,
+        all_touched=all_touched,
     )
 
 
@@ -182,3 +183,49 @@ def test_count_of_coincident_points_equals_three():
     assert 3.0 in finite, (
         f"expected the three-point pixel to count 3; got {finite}"
     )
+
+
+# ---------------------------------------------------------------------------
+# all_touched=True polygon boundaries are the polygon analogue of the
+# line-overlap case: two polygons sharing a boundary write the same
+# pixels twice via the Bresenham boundary pass, on top of the scanline
+# fill.  Confirm cupy still matches numpy across all six aggregators.
+# ---------------------------------------------------------------------------
+
+def _shared_boundary_polygons():
+    """Two rectangles that share an edge."""
+    return [
+        (box(1.0, 1.0, 5.0, 5.0), 1.0),
+        (box(5.0, 1.0, 9.0, 5.0), 2.0),
+    ]
+
+
+@pytest.mark.parametrize('merge', MERGES)
+def test_cupy_matches_numpy_all_touched_shared_boundary(merge):
+    geoms = _shared_boundary_polygons()
+    expected = _as_numpy(_run(geoms, merge, use_cuda=False, all_touched=True))
+    actual = _as_numpy(_run(geoms, merge, use_cuda=True, all_touched=True))
+    np.testing.assert_allclose(
+        actual, expected, rtol=0, atol=0, equal_nan=True,
+        err_msg=(
+            f"cupy backend disagrees with numpy for shared-boundary "
+            f"polygons with all_touched=True and merge {merge!r}.\n"
+            f"cupy:\n{actual}\nnumpy:\n{expected}"
+        ),
+    )
+
+
+@pytest.mark.parametrize('merge', MERGES)
+def test_cupy_deterministic_all_touched_shared_boundary(merge):
+    geoms = _shared_boundary_polygons()
+    first = _as_numpy(_run(geoms, merge, use_cuda=True, all_touched=True))
+    for _ in range(5):
+        again = _as_numpy(_run(geoms, merge, use_cuda=True, all_touched=True))
+        np.testing.assert_allclose(
+            again, first, rtol=0, atol=0, equal_nan=True,
+            err_msg=(
+                f"cupy backend produced different results across runs "
+                f"for shared-boundary polygons with all_touched=True and "
+                f"merge {merge!r}."
+            ),
+        )