From a0a71b303d8dd5f3e43d03fe3887ba44d9bc5404 Mon Sep 17 00:00:00 2001
From: haixuanTao <shavtao@gmail.com>
Date: Mon, 8 Jun 2026 14:30:08 +0200
Subject: [PATCH 1/2] fix(cuda): push element count, not byte length, for slice
 kernel args
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cuda-oxide lowers a `&[T]` kernel parameter to a `(ptr, len)` pair where
`len` is an ELEMENT count, but the khal CUDA backend was pushing the
buffer's BYTE length. So a kernel calling `slice.len()` got `byte_len`
(4x too large for `&[u32]`) and read out of bounds — fatal in the batched
broad-phase radix sort (`gpu_init_sort_dispatch` derives the workgroup
grid from `num_keys_arr.len()`, producing a garbage indirect dispatch and
an illegal memory access) and in the LBVH traversal (`*_len.len()`).

Fix: in the three CUDA `write_arg` arms (`GpuBuffer`/`GpuBufferSlice`/
`GpuBufferSliceMut`, all generic over `T`), push `byte_len / size_of::<T>()`.
Sized arrays (`&[T; N]`), scalars (`&T`) and uniforms reconstruct via the
pointer and ignore this value, so they are unaffected; only true `&[T]`
slices change, and they now receive the correct length. The bug was
latent in vortx (its kernels bound loops by `Shape` uniforms, never
`slice.len()`) and in single-env physics; it surfaced once N>1 batched
physics ran on native CUDA.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/khal/src/backend/any_backend.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/crates/khal/src/backend/any_backend.rs b/crates/khal/src/backend/any_backend.rs
index 7ecc166..d7a0761 100644
--- a/crates/khal/src/backend/any_backend.rs
+++ b/crates/khal/src/backend/any_backend.rs
@@ -1304,7 +1304,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBuffer<T> {
             }
             #[cfg(feature = "cuda")]
             (GpuBuffer::Cuda(buffer), GpuDispatch::Cuda(dispatch)) => {
-                dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len());
+                // cuda-oxide `&[T]` slice ABI wants an ELEMENT count, but byte_len is bytes;
+                // push element count so kernel `slice.len()` is correct (off-by-size_of
+                // otherwise -> OOB reads, e.g. gpu_init_sort_dispatch / lbvh). Arrays,
+                // scalars and uniforms ignore this value, so they are unaffected.
+                dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len() / std::mem::size_of::<T>() as u64);
                 Ok(())
             }
             #[cfg(feature = "metal")]
@@ -1336,7 +1340,7 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSlice<'_, T> {
             }
             #[cfg(feature = "cuda")]
             (GpuBufferSlice::Cuda(slice), GpuDispatch::Cuda(dispatch)) => {
-                dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len);
+                dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64);
                 Ok(())
             }
             #[cfg(feature = "metal")]
@@ -1373,7 +1377,7 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSliceMut<'_, T> {
             }
             #[cfg(feature = "cuda")]
             (GpuBufferSliceMut::Cuda(slice), GpuDispatch::Cuda(dispatch)) => {
-                dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len);
+                dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64);
                 Ok(())
             }
             #[cfg(feature = "metal")]

From 2552c994d58475cdd67ee2e20cd606313a0a5518 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 16 Jun 2026 17:56:56 +0000
Subject: [PATCH 2/2] style: fix rustfmt formatting on slice element-count fix

https://claude.ai/code/session_01PKvqHpCVv3JN7wHCjUG6HP
---
 crates/khal/src/backend/any_backend.rs | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/crates/khal/src/backend/any_backend.rs b/crates/khal/src/backend/any_backend.rs
index d7a0761..69c5fe1 100644
--- a/crates/khal/src/backend/any_backend.rs
+++ b/crates/khal/src/backend/any_backend.rs
@@ -1308,7 +1308,8 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBuffer<T> {
                 // push element count so kernel `slice.len()` is correct (off-by-size_of
                 // otherwise -> OOB reads, e.g. gpu_init_sort_dispatch / lbvh). Arrays,
                 // scalars and uniforms ignore this value, so they are unaffected.
-                dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len() / std::mem::size_of::<T>() as u64);
+                let elem_count = buffer.byte_len() / std::mem::size_of::<T>() as u64;
+                dispatch.set_arg(binding, buffer.device_ptr_raw(), elem_count);
                 Ok(())
             }
             #[cfg(feature = "metal")]
@@ -1340,7 +1341,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSlice<'_, T> {
             }
             #[cfg(feature = "cuda")]
             (GpuBufferSlice::Cuda(slice), GpuDispatch::Cuda(dispatch)) => {
-                dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64);
+                dispatch.set_arg(
+                    binding,
+                    slice.offset_ptr(),
+                    slice.byte_len / std::mem::size_of::<T>() as u64,
+                );
                 Ok(())
             }
             #[cfg(feature = "metal")]
@@ -1377,7 +1382,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSliceMut<'_, T> {
             }
             #[cfg(feature = "cuda")]
             (GpuBufferSliceMut::Cuda(slice), GpuDispatch::Cuda(dispatch)) => {
-                dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len / std::mem::size_of::<T>() as u64);
+                dispatch.set_arg(
+                    binding,
+                    slice.offset_ptr(),
+                    slice.byte_len / std::mem::size_of::<T>() as u64,
+                );
                 Ok(())
             }
             #[cfg(feature = "metal")]