From e761f16225fb5ba75ec4424c8fa5b7203893ea1b Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 15 May 2026 07:34:03 -0700
Subject: [PATCH 1/3] geotiff: in-place nodata mask on GPU (#1934)

`_apply_nodata_mask_gpu` ran `cupy.where(arr == sentinel, nan, arr)` for
both the float and the post-`astype(float64)` integer paths, which
allocates a fresh output buffer the same shape as the input. Every call
site passes a freshly decoded GPU buffer that no caller-visible state
aliases, so writing NaN into the existing buffer with `cupy.putmask`
drops one chunk-sized device allocation per call.

Adds `test_apply_nodata_mask_gpu_inplace_1934.py` covering the float
correctness path, the in-place pointer guarantee, a pool `used_bytes`
ceiling for both the float and integer paths, the NaN-sentinel no-op,
and the `nodata=None` passthrough.
---
 xrspatial/geotiff/_backends/_gpu_helpers.py   |  15 +-
 ...test_apply_nodata_mask_gpu_inplace_1934.py | 208 ++++++++++++++++++
 2 files changed, 220 insertions(+), 3 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py

diff --git a/xrspatial/geotiff/_backends/_gpu_helpers.py b/xrspatial/geotiff/_backends/_gpu_helpers.py
index 49399c40e..723a477e9 100644
--- a/xrspatial/geotiff/_backends/_gpu_helpers.py
+++ b/xrspatial/geotiff/_backends/_gpu_helpers.py
@@ -54,6 +54,12 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata):
     Returns the (possibly promoted, possibly nodata-masked) CuPy array.
     The caller is responsible for setting ``attrs['nodata']`` so the
     sentinel is still discoverable downstream.
+
+    The sentinel-replacement step writes NaN into the existing buffer with
+    ``cupy.putmask`` rather than allocating a fresh output via
+    ``cupy.where``; every call site passes a freshly decoded GPU buffer
+    that no caller-visible state aliases, so the mutation is safe and
+    drops one chunk-sized device allocation per call (#1934).
     """
     import cupy
 
@@ -63,8 +69,8 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata):
     if arr_dtype.kind == 'f':
         if not np.isnan(nodata):
             sentinel = arr_dtype.type(nodata)
-            arr_gpu = cupy.where(arr_gpu == sentinel,
-                                 arr_dtype.type('nan'), arr_gpu)
+            cupy.putmask(arr_gpu, arr_gpu == sentinel,
+                         arr_dtype.type('nan'))
         return arr_gpu
     if arr_dtype.kind in ('u', 'i'):
         # Out-of-range sentinels (e.g. uint16 + GDAL_NODATA="-9999") cannot
@@ -89,8 +95,11 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata):
         sentinel = arr_dtype.type(nodata_int)
         mask = arr_gpu == sentinel
         if bool(mask.any().item()):
+            # ``astype`` allocates the float64 buffer; write NaN into it
+            # in place instead of running it through another ``cupy.where``
+            # that would allocate again (#1934).
             arr_gpu = arr_gpu.astype(cupy.float64)
-            arr_gpu = cupy.where(mask, cupy.float64('nan'), arr_gpu)
+            cupy.putmask(arr_gpu, mask, cupy.float64('nan'))
         return arr_gpu
     return arr_gpu
 
diff --git a/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py
new file mode 100644
index 000000000..3f982fd8f
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py
@@ -0,0 +1,208 @@
+"""Regression tests for issue #1934.
+
+``_apply_nodata_mask_gpu`` used to replace the sentinel pixels via
+``cupy.where(arr_gpu == sentinel, nan, arr_gpu)``, which allocates a fresh
+output buffer the same shape as the input. Every call site passes a
+freshly decoded GPU buffer that no caller-visible state aliases, so the
+fix writes NaN into the existing buffer with ``cupy.putmask`` and drops
+one chunk-sized device allocation per call.
+
+Two guards here:
+
+1. Correctness -- float and integer paths match the pre-fix behaviour on
+   a representative input (sentinel masked to NaN, non-sentinel pixels
+   preserved, integer dtype promoted to float64).
+2. In-place mutation -- on the float path the output array shares the
+   same device pointer as the input, confirming no fresh allocation. The
+   integer path still allocates via ``astype(float64)``; the test checks
+   the post-astype buffer is then mutated in place rather than copied
+   again by ``cupy.where``.
+"""
+from __future__ import annotations
+
+import importlib.util
+
+import numpy as np
+import pytest
+
+
+def _gpu_available() -> bool:
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+_HAS_GPU = _gpu_available()
+_gpu_only = pytest.mark.skipif(
+    not _HAS_GPU,
+    reason="cupy + CUDA required",
+)
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_float_masks_sentinel_to_nan_1934():
+    """Float path masks the sentinel to NaN and leaves other pixels alone."""
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    arr_gpu = cupy.asarray(
+        np.array([[1.0, 2.0], [-9999.0, 4.0]], dtype=np.float32)
+    )
+    out = _apply_nodata_mask_gpu(arr_gpu, -9999.0)
+    assert out.dtype == cupy.float32
+    host = out.get()
+    assert np.isnan(host[1, 0])
+    assert host[0, 0] == 1.0
+    assert host[0, 1] == 2.0
+    assert host[1, 1] == 4.0
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_float_in_place_no_copy_1934():
+    """Float path mutates the input buffer in place.
+
+    Before the fix, ``cupy.where`` returned a fresh array, so ``out`` had
+    a different device pointer than ``arr_gpu``. After the fix the input
+    buffer is reused and the pointers match.
+    """
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    arr_gpu = cupy.asarray(
+        np.array([[1.0, 2.0], [-9999.0, 4.0]], dtype=np.float32)
+    )
+    input_ptr = arr_gpu.data.ptr
+    out = _apply_nodata_mask_gpu(arr_gpu, -9999.0)
+    assert out.data.ptr == input_ptr
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_float_alloc_count_unchanged_1934():
+    """Float path does not pull a fresh chunk-sized buffer from the pool.
+
+    The default memory pool exposes ``n_free_blocks`` / ``used_bytes`` and
+    ``total_bytes``. The pre-fix ``cupy.where`` path allocated one extra
+    chunk-sized buffer; the in-place fix should not grow ``used_bytes``
+    once the input is already on device.
+    """
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    # Large enough that an extra allocation would be visible.
+    arr_gpu = cupy.full((512, 512), -9999.0, dtype=cupy.float32)
+    # Plant a non-sentinel pixel so the mask is not empty.
+    arr_gpu[0, 0] = 1.0
+
+    pool = cupy.get_default_memory_pool()
+    cupy.cuda.Stream.null.synchronize()
+    used_before = pool.used_bytes()
+
+    out = _apply_nodata_mask_gpu(arr_gpu, -9999.0)
+    cupy.cuda.Stream.null.synchronize()
+    used_after = pool.used_bytes()
+
+    # The mask is a transient bool array (1/4 the byte count of float32),
+    # so used_bytes can rise by the mask size but must not rise by the
+    # array's full byte count. Pre-fix would add at least one float32
+    # buffer of the same shape (512*512*4 = 1 MiB).
+    array_bytes = arr_gpu.nbytes
+    growth = used_after - used_before
+    assert growth < array_bytes, (
+        f"unexpected allocation growth {growth} bytes >= "
+        f"array_bytes {array_bytes}; in-place mutation regressed"
+    )
+    # And the returned buffer is the same one we passed in.
+    assert out.data.ptr == arr_gpu.data.ptr
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_int_promotes_and_masks_1934():
+    """Integer path still promotes to float64 and masks the sentinel."""
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    arr_gpu = cupy.asarray(
+        np.array([[1, 2], [3, 4]], dtype=np.uint16)
+    )
+    out = _apply_nodata_mask_gpu(arr_gpu, 3)
+    assert out.dtype == cupy.float64
+    host = out.get()
+    assert np.isnan(host[1, 0])
+    assert host[0, 0] == 1.0
+    assert host[0, 1] == 2.0
+    assert host[1, 1] == 4.0
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_int_no_extra_buffer_after_astype_1934():
+    """Integer path: only the ``astype(float64)`` buffer is allocated.
+
+    Before the fix the trailing ``cupy.where`` allocated a second
+    chunk-sized float64 buffer. After the fix the ``astype`` buffer is
+    mutated in place.
+    """
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    arr_gpu = cupy.full((512, 512), 3, dtype=cupy.uint16)
+    arr_gpu[0, 0] = 1  # ensure non-sentinel pixel exists
+
+    pool = cupy.get_default_memory_pool()
+    cupy.cuda.Stream.null.synchronize()
+    used_before = pool.used_bytes()
+
+    out = _apply_nodata_mask_gpu(arr_gpu, 3)
+    cupy.cuda.Stream.null.synchronize()
+    used_after = pool.used_bytes()
+
+    # Required: one float64 buffer (512*512*8 = 2 MiB) from astype.
+    # Pre-fix would have allocated a second float64 buffer for cupy.where
+    # (another 2 MiB) on top of that.
+    float64_bytes = out.nbytes
+    growth = used_after - used_before
+    # Allow some slack for the bool mask + .any() scalar (well under
+    # one float64 buffer of slack).
+    assert growth < 2 * float64_bytes, (
+        f"unexpected allocation growth {growth} bytes >= "
+        f"2 * float64_bytes {2 * float64_bytes}; pre-fix double-alloc"
+    )
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_float_nan_sentinel_noop_1934():
+    """NaN nodata on a float array stays a no-op."""
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    arr_gpu = cupy.asarray(
+        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+    )
+    input_ptr = arr_gpu.data.ptr
+    out = _apply_nodata_mask_gpu(arr_gpu, float('nan'))
+    # Same buffer back, untouched.
+    assert out.data.ptr == input_ptr
+    np.testing.assert_array_equal(out.get(), [[1.0, 2.0], [3.0, 4.0]])
+
+
+@_gpu_only
+def test_apply_nodata_mask_gpu_none_nodata_passthrough_1934():
+    """``nodata is None`` returns the input array untouched."""
+    import cupy
+
+    from xrspatial.geotiff import _apply_nodata_mask_gpu
+
+    arr_gpu = cupy.asarray(np.array([[1, 2], [3, 4]], dtype=np.int32))
+    input_ptr = arr_gpu.data.ptr
+    out = _apply_nodata_mask_gpu(arr_gpu, None)
+    assert out.data.ptr == input_ptr
+    assert out.dtype == cupy.int32

From 511f0855b8622aa8185855098e998783b37a8e8c Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 15 May 2026 07:34:07 -0700
Subject: [PATCH 2/3] geotiff: record rockout 2026-05-15 LOW fix in
 sweep-performance state

Note that #1934 (`_apply_nodata_mask_gpu` in-place mutation) was filed
and fixed during the 2026-05-15 rockout pass on geotiff.
---
 .claude/sweep-performance-state.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
index a759ddcd4..9c43540ef 100644
--- a/.claude/sweep-performance-state.csv
+++ b/.claude/sweep-performance-state.csv
@@ -18,7 +18,7 @@ fire,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 flood,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 focal,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 geodesic,2026-03-31T18:00:00Z,N/A,compute-bound,0,,
-geotiff,2026-05-12,SAFE,IO-bound,0,1756,"Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures."
+geotiff,2026-05-15,SAFE,IO-bound,0,1756,"Rockout 2026-05-15: LOW filed #1934 -- _apply_nodata_mask_gpu used cupy.where (allocating); switched to cupy.putmask on the already-owned buffer (float path) and on the post-astype float64 buffer (int path). Saves one chunk-sized device allocation per call. 7 new tests in test_apply_nodata_mask_gpu_inplace_1934.py; 52 related nodata tests pass. | Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures."
 glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk."
 hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings."
 hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE."

From 09412da1f15a6e1df22eda305144d534bdcfcc40 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 15 May 2026 07:40:38 -0700
Subject: [PATCH 3/3] geotiff: address copilot review on #1937 test module

- Reuse the shared ``requires_gpu`` marker from
  ``xrspatial/geotiff/tests/conftest.py`` instead of redefining a
  local ``_HAS_GPU`` import-and-runtime check. The conftest helper
  already validates both ``import cupy`` and
  ``cupy.cuda.is_available()``.
- Run the two memory-pool allocation tests under an isolated
  ``MemoryPool`` allocator and switch the measurement from
  ``used_bytes`` to ``total_bytes`` (called after
  ``free_all_blocks``) so the assertion cannot be masked by the
  input buffer being refcount-freed back to the pool before
  ``used_after`` is sampled.
---
 ...test_apply_nodata_mask_gpu_inplace_1934.py | 137 +++++++++---------
 1 file changed, 67 insertions(+), 70 deletions(-)

diff --git a/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py
index 3f982fd8f..9b176eff0 100644
--- a/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py
+++ b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py
@@ -20,27 +20,9 @@
 """
 from __future__ import annotations
 
-import importlib.util
-
 import numpy as np
-import pytest
-
-
-def _gpu_available() -> bool:
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
 
-
-_HAS_GPU = _gpu_available()
-_gpu_only = pytest.mark.skipif(
-    not _HAS_GPU,
-    reason="cupy + CUDA required",
-)
+from xrspatial.geotiff.tests.conftest import requires_gpu as _gpu_only
 
 
 @_gpu_only
@@ -86,40 +68,47 @@ def test_apply_nodata_mask_gpu_float_in_place_no_copy_1934():
 def test_apply_nodata_mask_gpu_float_alloc_count_unchanged_1934():
     """Float path does not pull a fresh chunk-sized buffer from the pool.
 
-    The default memory pool exposes ``n_free_blocks`` / ``used_bytes`` and
-    ``total_bytes``. The pre-fix ``cupy.where`` path allocated one extra
-    chunk-sized buffer; the in-place fix should not grow ``used_bytes``
-    once the input is already on device.
+    Uses an isolated ``MemoryPool`` and measures ``total_bytes`` (which
+    counts free blocks too) after a ``free_all_blocks`` so the pre-fix
+    ``cupy.where`` allocation cannot be masked by the input buffer being
+    refcount-freed back to the pool before the assertion.
     """
     import cupy
 
     from xrspatial.geotiff import _apply_nodata_mask_gpu
 
-    # Large enough that an extra allocation would be visible.
-    arr_gpu = cupy.full((512, 512), -9999.0, dtype=cupy.float32)
-    # Plant a non-sentinel pixel so the mask is not empty.
-    arr_gpu[0, 0] = 1.0
-
-    pool = cupy.get_default_memory_pool()
-    cupy.cuda.Stream.null.synchronize()
-    used_before = pool.used_bytes()
-
-    out = _apply_nodata_mask_gpu(arr_gpu, -9999.0)
-    cupy.cuda.Stream.null.synchronize()
-    used_after = pool.used_bytes()
-
-    # The mask is a transient bool array (1/4 the byte count of float32),
-    # so used_bytes can rise by the mask size but must not rise by the
-    # array's full byte count. Pre-fix would add at least one float32
-    # buffer of the same shape (512*512*4 = 1 MiB).
-    array_bytes = arr_gpu.nbytes
-    growth = used_after - used_before
-    assert growth < array_bytes, (
-        f"unexpected allocation growth {growth} bytes >= "
-        f"array_bytes {array_bytes}; in-place mutation regressed"
-    )
-    # And the returned buffer is the same one we passed in.
-    assert out.data.ptr == arr_gpu.data.ptr
+    isolated_pool = cupy.cuda.MemoryPool()
+    prev_allocator = cupy.cuda.get_allocator()
+    cupy.cuda.set_allocator(isolated_pool.malloc)
+    try:
+        # Large enough that an extra allocation would be visible.
+        arr_gpu = cupy.full((512, 512), -9999.0, dtype=cupy.float32)
+        arr_gpu[0, 0] = 1.0  # plant a non-sentinel pixel
+
+        cupy.cuda.Stream.null.synchronize()
+        isolated_pool.free_all_blocks()
+        total_before = isolated_pool.total_bytes()
+
+        out = _apply_nodata_mask_gpu(arr_gpu, -9999.0)
+        cupy.cuda.Stream.null.synchronize()
+        isolated_pool.free_all_blocks()
+        total_after = isolated_pool.total_bytes()
+
+        # The mask is a transient bool array (1/4 the byte count of float32),
+        # so total_bytes can rise by the mask size but must not rise by the
+        # array's full byte count. Pre-fix would add at least one float32
+        # buffer of the same shape (512*512*4 = 1 MiB).
+        array_bytes = arr_gpu.nbytes
+        growth = total_after - total_before
+        assert growth < array_bytes, (
+            f"unexpected allocation growth {growth} bytes >= "
+            f"array_bytes {array_bytes}; in-place mutation regressed"
+        )
+        # And the returned buffer is the same one we passed in.
+        assert out.data.ptr == arr_gpu.data.ptr
+    finally:
+        cupy.cuda.set_allocator(prev_allocator)
+        isolated_pool.free_all_blocks()
 
 
 @_gpu_only
@@ -153,28 +142,36 @@ def test_apply_nodata_mask_gpu_int_no_extra_buffer_after_astype_1934():
 
     from xrspatial.geotiff import _apply_nodata_mask_gpu
 
-    arr_gpu = cupy.full((512, 512), 3, dtype=cupy.uint16)
-    arr_gpu[0, 0] = 1  # ensure non-sentinel pixel exists
-
-    pool = cupy.get_default_memory_pool()
-    cupy.cuda.Stream.null.synchronize()
-    used_before = pool.used_bytes()
-
-    out = _apply_nodata_mask_gpu(arr_gpu, 3)
-    cupy.cuda.Stream.null.synchronize()
-    used_after = pool.used_bytes()
-
-    # Required: one float64 buffer (512*512*8 = 2 MiB) from astype.
-    # Pre-fix would have allocated a second float64 buffer for cupy.where
-    # (another 2 MiB) on top of that.
-    float64_bytes = out.nbytes
-    growth = used_after - used_before
-    # Allow some slack for the bool mask + .any() scalar (well under
-    # one float64 buffer of slack).
-    assert growth < 2 * float64_bytes, (
-        f"unexpected allocation growth {growth} bytes >= "
-        f"2 * float64_bytes {2 * float64_bytes}; pre-fix double-alloc"
-    )
+    isolated_pool = cupy.cuda.MemoryPool()
+    prev_allocator = cupy.cuda.get_allocator()
+    cupy.cuda.set_allocator(isolated_pool.malloc)
+    try:
+        arr_gpu = cupy.full((512, 512), 3, dtype=cupy.uint16)
+        arr_gpu[0, 0] = 1  # ensure non-sentinel pixel exists
+
+        cupy.cuda.Stream.null.synchronize()
+        isolated_pool.free_all_blocks()
+        total_before = isolated_pool.total_bytes()
+
+        out = _apply_nodata_mask_gpu(arr_gpu, 3)
+        cupy.cuda.Stream.null.synchronize()
+        isolated_pool.free_all_blocks()
+        total_after = isolated_pool.total_bytes()
+
+        # Required: one float64 buffer (512*512*8 = 2 MiB) from astype.
+        # Pre-fix would have allocated a second float64 buffer for
+        # cupy.where (another 2 MiB) on top of that.
+        float64_bytes = out.nbytes
+        growth = total_after - total_before
+        # Allow some slack for the bool mask + .any() scalar (well under
+        # one float64 buffer of slack).
+        assert growth < 2 * float64_bytes, (
+            f"unexpected allocation growth {growth} bytes >= "
+            f"2 * float64_bytes {2 * float64_bytes}; pre-fix double-alloc"
+        )
+    finally:
+        cupy.cuda.set_allocator(prev_allocator)
+        isolated_pool.free_all_blocks()
 
 
 @_gpu_only