diff --git a/CHANGELOG.md b/CHANGELOG.md index df357ed..8ea2906 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# Unreleased +- Fix a GPU validation error / panic on simulations with more than ~4.19M particles, caused by + compute kernels dispatching more than 65535 workgroups along a single dimension. The affected + kernels now clamp the dispatch and grid-stride over the particles. + # v0.2.0 (27 Oct. 2025) - Add support for dynamic particle insertion. -- Add support for specializing the particle update logic using slang’s link-time specializaiton feature. +- Add support for specializing the particle update logic using slang’s link-time specialization feature. diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang index 24c5f84..8b9e52b 100644 --- a/shaders/slosh/grid/sort.slang +++ b/shaders/slosh/grid/sort.slang @@ -13,8 +13,9 @@ func touch_particle_blocks( StructuredBuffer particles_pos, ConstantBuffer particles_len, ) { - let id = invocation_id.x; - if (id < particles_len) { + let _gs_n = particles_len; + let _gs_total = min((_gs_n + (GRID_WORKGROUP_SIZE - 1u)) / GRID_WORKGROUP_SIZE, 65535u) * GRID_WORKGROUP_SIZE; + for (var id = invocation_id.x; id < _gs_n; id += _gs_total) { let cell_width = grid[0].cell_width; let particle = particles_pos[id]; var blocks = blocks_associated_to_point(cell_width, particle.pt); @@ -101,8 +102,9 @@ func update_block_particle_count( ConstantBuffer particles_len, RWStructuredBuffer active_blocks, ) { - let id = invocation_id.x; - if (id < particles_len) { + let _gs_n = particles_len; + let _gs_total = min((_gs_n + (GRID_WORKGROUP_SIZE - 1u)) / GRID_WORKGROUP_SIZE, 65535u) * GRID_WORKGROUP_SIZE; + for (var id = invocation_id.x; id < _gs_n; id += _gs_total) { let cell_width = grid[0].cell_width; let particle = particles_pos[id]; @@ -168,8 +170,9 @@ func finalize_particles_sort( RWStructuredBuffer active_blocks, ) { - let id = invocation_id.x; - if (id < particles_len) { + let _gs_n = particles_len; + let _gs_total = min((_gs_n + (GRID_WORKGROUP_SIZE - 1u)) / GRID_WORKGROUP_SIZE, 65535u) * 
GRID_WORKGROUP_SIZE; + for (var id = invocation_id.x; id < _gs_n; id += _gs_total) { let cell_width = grid[0].cell_width; let particle = particles_pos[id]; diff --git a/shaders/slosh/solver/particle_update.slang b/shaders/slosh/solver/particle_update.slang index 89331a0..2e10f5d 100644 --- a/shaders/slosh/solver/particle_update.slang +++ b/shaders/slosh/solver/particle_update.slang @@ -43,11 +43,9 @@ func particle_update( RWStructuredBuffer particles_props, ConstantBuffer particles_len, ) { - let particle_id = invocation_id.x; - - if (particle_id >= particles_len) { - return; - } + let _gs_n = particles_len; + let _gs_total = min((_gs_n + 63u) / 64u, 65535u) * 64u; + for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) { let model = ParticleModel(); let flags = model.model_flags(particles_model, particle_id); @@ -161,4 +159,5 @@ func particle_update( particles_kin[particle_id] = kin; particles_def_grad[particle_id] = def_grad; + } } diff --git a/shaders/slosh/solver/timestep_bound.slang b/shaders/slosh/solver/timestep_bound.slang index 0debbd3..9121933 100644 --- a/shaders/slosh/solver/timestep_bound.slang +++ b/shaders/slosh/solver/timestep_bound.slang @@ -46,14 +46,12 @@ func estimate_timestep_bound( ConstantBuffer particles_len, RWStructuredBuffer result, ) { - let particle_id = invocation_id.x; - - if (particle_id >= particles_len) { - return; - } + let _gs_n = particles_len; + let _gs_total = min((_gs_n + (WORKGROUP_SIZE - 1u)) / WORKGROUP_SIZE, 65535u) * WORKGROUP_SIZE; + for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) { if (particles_kin[particle_id].enabled == 0) { - return; + continue; } let cell_width = grid[0].cell_width; @@ -91,4 +89,5 @@ func estimate_timestep_bound( let candidate = GpuTimestepBounds::secs_to_int(dt); result[0].computed_max_dt_as_uint.min(candidate); + } } diff --git a/src/grid/grid.rs b/src/grid/grid.rs index e1a271b..637457a 100644 --- a/src/grid/grid.rs +++ 
b/src/grid/grid.rs @@ -108,11 +108,11 @@ impl WgGrid { self.reset_hmap .launch(backend, pass, &args, [grid.cpu_meta.hmap_capacity, 1, 1])?; - sort_module.touch_particle_blocks.launch( + sort_module.touch_particle_blocks.launch_capped( backend, pass, &args, - [particles.len() as u32, 1, 1], + particles.len() as u32, )?; // // Ensure blocks exist wherever we have rigid particles that might affect @@ -151,11 +151,11 @@ impl WgGrid { self.init_indirect_workgroups .launch_grid(backend, pass, &args, [1, 1, 1])?; - sort_module.update_block_particle_count.launch( + sort_module.update_block_particle_count.launch_capped( backend, pass, &args, - [particles.len() as u32, 1, 1], + particles.len() as u32, )?; sort_module @@ -175,11 +175,11 @@ impl WgGrid { &args, grid.indirect_n_g2p_p2g_groups.buffer(), )?; - sort_module.finalize_particles_sort.launch( + sort_module.finalize_particles_sort.launch_capped( backend, pass, &args, - [particles.len() as u32, 1, 1], + particles.len() as u32, )?; Ok(()) diff --git a/src/solver/particle_update.rs b/src/solver/particle_update.rs index b79d1f8..84b9ad7 100644 --- a/src/solver/particle_update.rs +++ b/src/solver/particle_update.rs @@ -67,6 +67,6 @@ impl WgParticleUpdate { particles_len: particles.gpu_len(), }; self.particle_update - .launch(backend, pass, &args, [particles.len() as u32, 1, 1]) + .launch_capped(backend, pass, &args, particles.len() as u32) } } diff --git a/src/solver/timestep_bound.rs b/src/solver/timestep_bound.rs index 566d1ac..db6c5c2 100644 --- a/src/solver/timestep_bound.rs +++ b/src/solver/timestep_bound.rs @@ -97,6 +97,6 @@ impl WgTimestepBounds { self.reset_timestep_bound .launch(backend, pass, &args, [1; 3])?; self.estimate_timestep_bound - .launch(backend, pass, &args, [particles.len() as u32, 1, 1]) + .launch_capped(backend, pass, &args, particles.len() as u32) } }