From e761f16225fb5ba75ec4424c8fa5b7203893ea1b Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 07:34:03 -0700 Subject: [PATCH 1/3] geotiff: in-place nodata mask on GPU (#1934) `_apply_nodata_mask_gpu` ran `cupy.where(arr == sentinel, nan, arr)` for both the float and the post-`astype(float64)` integer paths, which allocates a fresh output buffer the same shape as the input. Every call site passes a freshly decoded GPU buffer that no caller-visible state aliases, so writing NaN into the existing buffer with `cupy.putmask` drops one chunk-sized device allocation per call. Adds `test_apply_nodata_mask_gpu_inplace_1934.py` covering the float correctness path, the in-place pointer guarantee, a pool `used_bytes` ceiling for both the float and integer paths, the NaN-sentinel no-op, and the `nodata=None` passthrough. --- xrspatial/geotiff/_backends/_gpu_helpers.py | 15 +- ...test_apply_nodata_mask_gpu_inplace_1934.py | 208 ++++++++++++++++++ 2 files changed, 220 insertions(+), 3 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py diff --git a/xrspatial/geotiff/_backends/_gpu_helpers.py b/xrspatial/geotiff/_backends/_gpu_helpers.py index 49399c40e..723a477e9 100644 --- a/xrspatial/geotiff/_backends/_gpu_helpers.py +++ b/xrspatial/geotiff/_backends/_gpu_helpers.py @@ -54,6 +54,12 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata): Returns the (possibly promoted, possibly nodata-masked) CuPy array. The caller is responsible for setting ``attrs['nodata']`` so the sentinel is still discoverable downstream. + + The sentinel-replacement step writes NaN into the existing buffer with + ``cupy.putmask`` rather than allocating a fresh output via + ``cupy.where``; every call site passes a freshly decoded GPU buffer + that no caller-visible state aliases, so the mutation is safe and + drops one chunk-sized device allocation per call (#1934). """ import cupy @@ -63,8 +69,8 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata): if arr_dtype.kind == 'f': if not np.isnan(nodata): sentinel = arr_dtype.type(nodata) - arr_gpu = cupy.where(arr_gpu == sentinel, - arr_dtype.type('nan'), arr_gpu) + cupy.putmask(arr_gpu, arr_gpu == sentinel, + arr_dtype.type('nan')) return arr_gpu if arr_dtype.kind in ('u', 'i'): # Out-of-range sentinels (e.g. uint16 + GDAL_NODATA="-9999") cannot @@ -89,8 +95,11 @@ def _apply_nodata_mask_gpu(arr_gpu, nodata): sentinel = arr_dtype.type(nodata_int) mask = arr_gpu == sentinel if bool(mask.any().item()): + # ``astype`` allocates the float64 buffer; write NaN into it + # in place instead of running it through another ``cupy.where`` + # that would allocate again (#1934). arr_gpu = arr_gpu.astype(cupy.float64) - arr_gpu = cupy.where(mask, cupy.float64('nan'), arr_gpu) + cupy.putmask(arr_gpu, mask, cupy.float64('nan')) return arr_gpu return arr_gpu diff --git a/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py new file mode 100644 index 000000000..3f982fd8f --- /dev/null +++ b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py @@ -0,0 +1,208 @@ +"""Regression tests for issue #1934. + +``_apply_nodata_mask_gpu`` used to replace the sentinel pixels via +``cupy.where(arr_gpu == sentinel, nan, arr_gpu)``, which allocates a fresh +output buffer the same shape as the input. Every call site passes a +freshly decoded GPU buffer that no caller-visible state aliases, so the +fix writes NaN into the existing buffer with ``cupy.putmask`` and drops +one chunk-sized device allocation per call. + +Two guards here: + +1. Correctness -- float and integer paths match the pre-fix behaviour on + a representative input (sentinel masked to NaN, non-sentinel pixels + preserved, integer dtype promoted to float64). +2. In-place mutation -- on the float path the output array shares the + same device pointer as the input, confirming no fresh allocation. The + integer path still allocates via ``astype(float64)``; the test checks + the post-astype buffer is then mutated in place rather than copied + again by ``cupy.where``. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif( + not _HAS_GPU, + reason="cupy + CUDA required", +) + + +@_gpu_only +def test_apply_nodata_mask_gpu_float_masks_sentinel_to_nan_1934(): + """Float path masks the sentinel to NaN and leaves other pixels alone.""" + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + arr_gpu = cupy.asarray( + np.array([[1.0, 2.0], [-9999.0, 4.0]], dtype=np.float32) + ) + out = _apply_nodata_mask_gpu(arr_gpu, -9999.0) + assert out.dtype == cupy.float32 + host = out.get() + assert np.isnan(host[1, 0]) + assert host[0, 0] == 1.0 + assert host[0, 1] == 2.0 + assert host[1, 1] == 4.0 + + +@_gpu_only +def test_apply_nodata_mask_gpu_float_in_place_no_copy_1934(): + """Float path mutates the input buffer in place. + + Before the fix, ``cupy.where`` returned a fresh array, so ``out`` had + a different device pointer than ``arr_gpu``. After the fix the input + buffer is reused and the pointers match. + """ + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + arr_gpu = cupy.asarray( + np.array([[1.0, 2.0], [-9999.0, 4.0]], dtype=np.float32) + ) + input_ptr = arr_gpu.data.ptr + out = _apply_nodata_mask_gpu(arr_gpu, -9999.0) + assert out.data.ptr == input_ptr + + +@_gpu_only +def test_apply_nodata_mask_gpu_float_alloc_count_unchanged_1934(): + """Float path does not pull a fresh chunk-sized buffer from the pool. + + The default memory pool exposes ``n_free_blocks`` / ``used_bytes`` and + ``total_bytes``. The pre-fix ``cupy.where`` path allocated one extra + chunk-sized buffer; the in-place fix should not grow ``used_bytes`` + once the input is already on device. + """ + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + # Large enough that an extra allocation would be visible. + arr_gpu = cupy.full((512, 512), -9999.0, dtype=cupy.float32) + # Plant a non-sentinel pixel so the mask is not empty. + arr_gpu[0, 0] = 1.0 + + pool = cupy.get_default_memory_pool() + cupy.cuda.Stream.null.synchronize() + used_before = pool.used_bytes() + + out = _apply_nodata_mask_gpu(arr_gpu, -9999.0) + cupy.cuda.Stream.null.synchronize() + used_after = pool.used_bytes() + + # The mask is a transient bool array (1/4 the byte count of float32), + # so used_bytes can rise by the mask size but must not rise by the + # array's full byte count. Pre-fix would add at least one float32 + # buffer of the same shape (512*512*4 = 1 MiB). + array_bytes = arr_gpu.nbytes + growth = used_after - used_before + assert growth < array_bytes, ( + f"unexpected allocation growth {growth} bytes >= " + f"array_bytes {array_bytes}; in-place mutation regressed" + ) + # And the returned buffer is the same one we passed in. + assert out.data.ptr == arr_gpu.data.ptr + + +@_gpu_only +def test_apply_nodata_mask_gpu_int_promotes_and_masks_1934(): + """Integer path still promotes to float64 and masks the sentinel.""" + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + arr_gpu = cupy.asarray( + np.array([[1, 2], [3, 4]], dtype=np.uint16) + ) + out = _apply_nodata_mask_gpu(arr_gpu, 3) + assert out.dtype == cupy.float64 + host = out.get() + assert np.isnan(host[1, 0]) + assert host[0, 0] == 1.0 + assert host[0, 1] == 2.0 + assert host[1, 1] == 4.0 + + +@_gpu_only +def test_apply_nodata_mask_gpu_int_no_extra_buffer_after_astype_1934(): + """Integer path: only the ``astype(float64)`` buffer is allocated. + + Before the fix the trailing ``cupy.where`` allocated a second + chunk-sized float64 buffer. After the fix the ``astype`` buffer is + mutated in place. + """ + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + arr_gpu = cupy.full((512, 512), 3, dtype=cupy.uint16) + arr_gpu[0, 0] = 1 # ensure non-sentinel pixel exists + + pool = cupy.get_default_memory_pool() + cupy.cuda.Stream.null.synchronize() + used_before = pool.used_bytes() + + out = _apply_nodata_mask_gpu(arr_gpu, 3) + cupy.cuda.Stream.null.synchronize() + used_after = pool.used_bytes() + + # Required: one float64 buffer (512*512*8 = 2 MiB) from astype. + # Pre-fix would have allocated a second float64 buffer for cupy.where + # (another 2 MiB) on top of that. + float64_bytes = out.nbytes + growth = used_after - used_before + # Allow some slack for the bool mask + .any() scalar (well under + # one float64 buffer of slack). + assert growth < 2 * float64_bytes, ( + f"unexpected allocation growth {growth} bytes >= " + f"2 * float64_bytes {2 * float64_bytes}; pre-fix double-alloc" + ) + + +@_gpu_only +def test_apply_nodata_mask_gpu_float_nan_sentinel_noop_1934(): + """NaN nodata on a float array stays a no-op.""" + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + arr_gpu = cupy.asarray( + np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + ) + input_ptr = arr_gpu.data.ptr + out = _apply_nodata_mask_gpu(arr_gpu, float('nan')) + # Same buffer back, untouched. + assert out.data.ptr == input_ptr + np.testing.assert_array_equal(out.get(), [[1.0, 2.0], [3.0, 4.0]]) + + +@_gpu_only +def test_apply_nodata_mask_gpu_none_nodata_passthrough_1934(): + """``nodata is None`` returns the input array untouched.""" + import cupy + + from xrspatial.geotiff import _apply_nodata_mask_gpu + + arr_gpu = cupy.asarray(np.array([[1, 2], [3, 4]], dtype=np.int32)) + input_ptr = arr_gpu.data.ptr + out = _apply_nodata_mask_gpu(arr_gpu, None) + assert out.data.ptr == input_ptr + assert out.dtype == cupy.int32 From 511f0855b8622aa8185855098e998783b37a8e8c Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 07:34:07 -0700 Subject: [PATCH 2/3] geotiff: record rockout 2026-05-15 LOW fix in sweep-performance state Note that #1934 (`_apply_nodata_mask_gpu` in-place mutation) was filed and fixed during the 2026-05-15 rockout pass on geotiff. --- .claude/sweep-performance-state.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index a759ddcd4..9c43540ef 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -18,7 +18,7 @@ fire,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, flood,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, focal,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, geodesic,2026-03-31T18:00:00Z,N/A,compute-bound,0,, -geotiff,2026-05-12,SAFE,IO-bound,0,1756,"Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures." +geotiff,2026-05-15,SAFE,IO-bound,0,1756,"Rockout 2026-05-15: LOW filed #1934 -- _apply_nodata_mask_gpu used cupy.where (allocating); switched to cupy.putmask on the already-owned buffer (float path) and on the post-astype float64 buffer (int path). Saves one chunk-sized device allocation per call. 7 new tests in test_apply_nodata_mask_gpu_inplace_1934.py; 52 related nodata tests pass. | Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures." glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk." hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings." hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE." From 09412da1f15a6e1df22eda305144d534bdcfcc40 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 07:40:38 -0700 Subject: [PATCH 3/3] geotiff: address copilot review on #1937 test module - Reuse the shared ``requires_gpu`` marker from ``xrspatial/geotiff/tests/conftest.py`` instead of redefining a local ``_HAS_GPU`` import-and-runtime check. The conftest helper already validates both ``import cupy`` and ``cupy.cuda.is_available()``. - Run the two memory-pool allocation tests under an isolated ``MemoryPool`` allocator and switch the measurement from ``used_bytes`` to ``total_bytes`` (called after ``free_all_blocks``) so the assertion cannot be masked by the input buffer being refcount-freed back to the pool before ``used_after`` is sampled. --- ...test_apply_nodata_mask_gpu_inplace_1934.py | 137 +++++++++--------- 1 file changed, 67 insertions(+), 70 deletions(-) diff --git a/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py index 3f982fd8f..9b176eff0 100644 --- a/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py +++ b/xrspatial/geotiff/tests/test_apply_nodata_mask_gpu_inplace_1934.py @@ -20,27 +20,9 @@ """ from __future__ import annotations -import importlib.util - import numpy as np -import pytest - - -def _gpu_available() -> bool: - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - -_HAS_GPU = _gpu_available() -_gpu_only = pytest.mark.skipif( - not _HAS_GPU, - reason="cupy + CUDA required", -) +from xrspatial.geotiff.tests.conftest import requires_gpu as _gpu_only @_gpu_only @@ -86,40 +68,47 @@ def test_apply_nodata_mask_gpu_float_in_place_no_copy_1934(): def test_apply_nodata_mask_gpu_float_alloc_count_unchanged_1934(): """Float path does not pull a fresh chunk-sized buffer from the pool. - The default memory pool exposes ``n_free_blocks`` / ``used_bytes`` and - ``total_bytes``. The pre-fix ``cupy.where`` path allocated one extra - chunk-sized buffer; the in-place fix should not grow ``used_bytes`` - once the input is already on device. + Uses an isolated ``MemoryPool`` and measures ``total_bytes`` (which + counts free blocks too) after a ``free_all_blocks`` so the pre-fix + ``cupy.where`` allocation cannot be masked by the input buffer being + refcount-freed back to the pool before the assertion. """ import cupy from xrspatial.geotiff import _apply_nodata_mask_gpu - # Large enough that an extra allocation would be visible. - arr_gpu = cupy.full((512, 512), -9999.0, dtype=cupy.float32) - # Plant a non-sentinel pixel so the mask is not empty. - arr_gpu[0, 0] = 1.0 - - pool = cupy.get_default_memory_pool() - cupy.cuda.Stream.null.synchronize() - used_before = pool.used_bytes() - - out = _apply_nodata_mask_gpu(arr_gpu, -9999.0) - cupy.cuda.Stream.null.synchronize() - used_after = pool.used_bytes() - - # The mask is a transient bool array (1/4 the byte count of float32), - # so used_bytes can rise by the mask size but must not rise by the - # array's full byte count. Pre-fix would add at least one float32 - # buffer of the same shape (512*512*4 = 1 MiB). - array_bytes = arr_gpu.nbytes - growth = used_after - used_before - assert growth < array_bytes, ( - f"unexpected allocation growth {growth} bytes >= " - f"array_bytes {array_bytes}; in-place mutation regressed" - ) - # And the returned buffer is the same one we passed in. - assert out.data.ptr == arr_gpu.data.ptr + isolated_pool = cupy.cuda.MemoryPool() + prev_allocator = cupy.cuda.get_allocator() + cupy.cuda.set_allocator(isolated_pool.malloc) + try: + # Large enough that an extra allocation would be visible. + arr_gpu = cupy.full((512, 512), -9999.0, dtype=cupy.float32) + arr_gpu[0, 0] = 1.0 # plant a non-sentinel pixel + + cupy.cuda.Stream.null.synchronize() + isolated_pool.free_all_blocks() + total_before = isolated_pool.total_bytes() + + out = _apply_nodata_mask_gpu(arr_gpu, -9999.0) + cupy.cuda.Stream.null.synchronize() + isolated_pool.free_all_blocks() + total_after = isolated_pool.total_bytes() + + # The mask is a transient bool array (1/4 the byte count of float32), + # so total_bytes can rise by the mask size but must not rise by the + # array's full byte count. Pre-fix would add at least one float32 + # buffer of the same shape (512*512*4 = 1 MiB). + array_bytes = arr_gpu.nbytes + growth = total_after - total_before + assert growth < array_bytes, ( + f"unexpected allocation growth {growth} bytes >= " + f"array_bytes {array_bytes}; in-place mutation regressed" + ) + # And the returned buffer is the same one we passed in. + assert out.data.ptr == arr_gpu.data.ptr + finally: + cupy.cuda.set_allocator(prev_allocator) + isolated_pool.free_all_blocks() @_gpu_only @@ -153,28 +142,36 @@ def test_apply_nodata_mask_gpu_int_no_extra_buffer_after_astype_1934(): from xrspatial.geotiff import _apply_nodata_mask_gpu - arr_gpu = cupy.full((512, 512), 3, dtype=cupy.uint16) - arr_gpu[0, 0] = 1 # ensure non-sentinel pixel exists - - pool = cupy.get_default_memory_pool() - cupy.cuda.Stream.null.synchronize() - used_before = pool.used_bytes() - - out = _apply_nodata_mask_gpu(arr_gpu, 3) - cupy.cuda.Stream.null.synchronize() - used_after = pool.used_bytes() - - # Required: one float64 buffer (512*512*8 = 2 MiB) from astype. - # Pre-fix would have allocated a second float64 buffer for cupy.where - # (another 2 MiB) on top of that. - float64_bytes = out.nbytes - growth = used_after - used_before - # Allow some slack for the bool mask + .any() scalar (well under - # one float64 buffer of slack). - assert growth < 2 * float64_bytes, ( - f"unexpected allocation growth {growth} bytes >= " - f"2 * float64_bytes {2 * float64_bytes}; pre-fix double-alloc" - ) + isolated_pool = cupy.cuda.MemoryPool() + prev_allocator = cupy.cuda.get_allocator() + cupy.cuda.set_allocator(isolated_pool.malloc) + try: + arr_gpu = cupy.full((512, 512), 3, dtype=cupy.uint16) + arr_gpu[0, 0] = 1 # ensure non-sentinel pixel exists + + cupy.cuda.Stream.null.synchronize() + isolated_pool.free_all_blocks() + total_before = isolated_pool.total_bytes() + + out = _apply_nodata_mask_gpu(arr_gpu, 3) + cupy.cuda.Stream.null.synchronize() + isolated_pool.free_all_blocks() + total_after = isolated_pool.total_bytes() + + # Required: one float64 buffer (512*512*8 = 2 MiB) from astype. + # Pre-fix would have allocated a second float64 buffer for + # cupy.where (another 2 MiB) on top of that. + float64_bytes = out.nbytes + growth = total_after - total_before + # Allow some slack for the bool mask + .any() scalar (well under + # one float64 buffer of slack). + assert growth < 2 * float64_bytes, ( + f"unexpected allocation growth {growth} bytes >= " + f"2 * float64_bytes {2 * float64_bytes}; pre-fix double-alloc" + ) + finally: + cupy.cuda.set_allocator(prev_allocator) + isolated_pool.free_all_blocks() @_gpu_only