diff --git a/crates/khal/src/backend/any_backend.rs b/crates/khal/src/backend/any_backend.rs index 7ecc166..69c5fe1 100644 --- a/crates/khal/src/backend/any_backend.rs +++ b/crates/khal/src/backend/any_backend.rs @@ -1304,7 +1304,12 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBuffer<T> { } #[cfg(feature = "cuda")] (GpuBuffer::Cuda(buffer), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, buffer.device_ptr_raw(), buffer.byte_len()); + // cuda-oxide `&[T]` slice ABI wants an ELEMENT count, but byte_len is bytes; + // push element count so kernel `slice.len()` is correct (off-by-size_of + // otherwise -> OOB reads, e.g. gpu_init_sort_dispatch / lbvh). Arrays, + // scalars and uniforms ignore this value, so they are unaffected. + let elem_count = buffer.byte_len() / std::mem::size_of::<T>() as u64; + dispatch.set_arg(binding, buffer.device_ptr_raw(), elem_count); Ok(()) } #[cfg(feature = "metal")] @@ -1336,7 +1341,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSlice<'_, T> { } #[cfg(feature = "cuda")] (GpuBufferSlice::Cuda(slice), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len); + dispatch.set_arg( + binding, + slice.offset_ptr(), + slice.byte_len / std::mem::size_of::<T>() as u64, + ); Ok(()) } #[cfg(feature = "metal")] @@ -1373,7 +1382,11 @@ impl<'b, T: DeviceValue> crate::ShaderArgs<'b> for GpuBufferSliceMut<'_, T> { } #[cfg(feature = "cuda")] (GpuBufferSliceMut::Cuda(slice), GpuDispatch::Cuda(dispatch)) => { - dispatch.set_arg(binding, slice.offset_ptr(), slice.byte_len); + dispatch.set_arg( + binding, + slice.offset_ptr(), + slice.byte_len / std::mem::size_of::<T>() as u64, + ); Ok(()) } #[cfg(feature = "metal")]