From a0a71b303d8dd5f3e43d03fe3887ba44d9bc5404 Mon Sep 17 00:00:00 2001 From: haixuanTao Date: Mon, 8 Jun 2026 14:30:08 +0200 Subject: [PATCH 1/2] fix(cuda): push element count, not byte length, for slice kernel args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cuda-oxide lowers a `&[T]` kernel parameter to a `(ptr, len)` pair where `len` is an ELEMENT count, but the khal CUDA backend was pushing the buffer's BYTE length. So a kernel calling `slice.len()` got `byte_len` (4x too large for `&[u32]`) and read out of bounds — fatal in the batched broad-phase radix sort (`gpu_init_sort_dispatch` derives the workgroup grid from `num_keys_arr.len()`, producing a garbage indirect dispatch and an illegal memory access) and in the LBVH traversal (`*_len.len()`). Fix: in the three CUDA `write_arg` arms (`GpuBuffer`/`GpuBufferSlice`/ `GpuBufferSliceMut`, all generic over `T`), push `byte_len / size_of::<T>()`. Sized arrays (`&[T; N]`), scalars (`&T`) and uniforms reconstruct via the pointer and ignore this value, so they are unaffected; only true `&[T]` slices change — to correct. Latent in vortx (its kernels bound loops by `Shape` uniforms, never `slice.len()`) and in single-env physics; surfaced once N>1 batched physics ran on native CUDA. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/khal/src/backend/any_backend.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/khal/src/backend/any_backend.rs b/crates/khal/src/backend/any_backend.rs index 7ecc166..d7a0761 100644 --- a/crates/khal/src/backend/any_backend.rs +++ b/crates/khal/src/backend/any_backend.rs @@ -1304,7 +1304,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBuffer<T> { } #[cfg(feature = "cuda")] (GpuBuffer::Cuda(buffer), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len()); + // cuda-oxide `&[T]` slice ABI wants an ELEMENT count, but byte_len is bytes; + // push element count so kernel `slice.len()` is correct (off-by-size_of + // otherwise -> OOB reads, e.g. gpu_init_sort_dispatch / lbvh). Arrays, + // scalars and uniforms ignore this value, so they are unaffected. + dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len() / std::mem::size_of::<T>() as u64); Ok(()) } #[cfg(feature = "metal")] @@ -1336,7 +1340,7 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSlice<'_, T> { } #[cfg(feature = "cuda")] (GpuBufferSlice::Cuda(slice), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len); + dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64); Ok(()) } #[cfg(feature = "metal")] @@ -1373,7 +1377,7 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSliceMut<'_, T> { } #[cfg(feature = "cuda")] (GpuBufferSliceMut::Cuda(slice), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len); + dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64); Ok(()) } #[cfg(feature = "metal")] From 2552c994d58475cdd67ee2e20cd606313a0a5518 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 17:56:56 +0000 Subject: [PATCH 2/2] style: fix rustfmt formatting on 
slice element-count fix https://claude.ai/code/session_01PKvqHpCVv3JN7wHCjUG6HP --- crates/khal/src/backend/any_backend.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/crates/khal/src/backend/any_backend.rs b/crates/khal/src/backend/any_backend.rs index d7a0761..69c5fe1 100644 --- a/crates/khal/src/backend/any_backend.rs +++ b/crates/khal/src/backend/any_backend.rs @@ -1308,7 +1308,8 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBuffer<T> { // push element count so kernel `slice.len()` is correct (off-by-size_of // otherwise -> OOB reads, e.g. gpu_init_sort_dispatch / lbvh). Arrays, // scalars and uniforms ignore this value, so they are unaffected. - dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len() / std::mem::size_of::<T>() as u64); + let elem_count = buffer.byte_len() / std::mem::size_of::<T>() as u64; + dispatch.set_arg(binding, buffer.device_ptr_raw(), elem_count); Ok(()) } #[cfg(feature = "metal")] @@ -1340,7 +1341,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSlice<'_, T> { } #[cfg(feature = "cuda")] (GpuBufferSlice::Cuda(slice), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64); + dispatch.set_arg( + binding, + slice.offset_ptr(), + slice.byte_len / std::mem::size_of::<T>() as u64, + ); Ok(()) } #[cfg(feature = "metal")] @@ -1377,7 +1382,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSliceMut<'_, T> { } #[cfg(feature = "cuda")] (GpuBufferSliceMut::Cuda(slice), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64); + dispatch.set_arg( + binding, + slice.offset_ptr(), + slice.byte_len / std::mem::size_of::<T>() as u64, + ); Ok(()) } #[cfg(feature = "metal")]