diff --git a/CHANGELOG.md b/CHANGELOG.md
index df357ed..8ea2906 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+# Unreleased
+- Fix a GPU validation error / panic on simulations with more than ~4.19M particles, caused by
+  compute kernels dispatching more than 65535 workgroups along a single dimension. The affected
+  kernels now clamp the dispatch size and iterate over the particles with a grid-stride loop.
+
 # v0.2.0 (27 Oct. 2025)
 - Add support for dynamic particle insertion.
 - Add support for specializing the particle update logic using slang’s link-time specializaiton feature.
diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang
index 24c5f84..8b9e52b 100644
--- a/shaders/slosh/grid/sort.slang
+++ b/shaders/slosh/grid/sort.slang
@@ -13,8 +13,9 @@ func touch_particle_blocks(
     StructuredBuffer<Position> particles_pos,
     ConstantBuffer<uint> particles_len,
 ) {
-    let id = invocation_id.x;
-    if (id < particles_len) {
+    let _gs_n = particles_len;
+    let _gs_total = min((_gs_n + (GRID_WORKGROUP_SIZE - 1u)) / GRID_WORKGROUP_SIZE, 65535u) * GRID_WORKGROUP_SIZE;
+    for (var id = invocation_id.x; id < _gs_n; id += _gs_total) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
         var blocks = blocks_associated_to_point(cell_width, particle.pt);
@@ -101,8 +102,9 @@ func update_block_particle_count(
     ConstantBuffer<uint> particles_len,
     RWStructuredBuffer<AtomicActiveBlockHeader> active_blocks,
 ) {
-    let id = invocation_id.x;
-    if (id < particles_len) {
+    let _gs_n = particles_len;
+    let _gs_total = min((_gs_n + (GRID_WORKGROUP_SIZE - 1u)) / GRID_WORKGROUP_SIZE, 65535u) * GRID_WORKGROUP_SIZE;
+    for (var id = invocation_id.x; id < _gs_n; id += _gs_total) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
 
@@ -168,8 +170,9 @@ func finalize_particles_sort(
     RWStructuredBuffer<AtomicActiveBlockHeader> active_blocks,
 
 ) {
-    let id = invocation_id.x;
-    if (id < particles_len) {
+    let _gs_n = particles_len;
+    let _gs_total = min((_gs_n + (GRID_WORKGROUP_SIZE - 1u)) / GRID_WORKGROUP_SIZE, 65535u) * GRID_WORKGROUP_SIZE;
+    for (var id = invocation_id.x; id < _gs_n; id += _gs_total) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
 
diff --git a/shaders/slosh/solver/particle_update.slang b/shaders/slosh/solver/particle_update.slang
index 89331a0..2e10f5d 100644
--- a/shaders/slosh/solver/particle_update.slang
+++ b/shaders/slosh/solver/particle_update.slang
@@ -43,11 +43,9 @@ func particle_update(
     RWStructuredBuffer<ParticleProperties> particles_props,
     ConstantBuffer<uint> particles_len,
 ) {
-    let particle_id = invocation_id.x;
-
-    if (particle_id >= particles_len) {
-        return;
-    }
+    let _gs_n = particles_len;
+    let _gs_total = min((_gs_n + 63u) / 64u, 65535u) * 64u;
+    for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) {
 
     let model = ParticleModel();
     let flags = model.model_flags(particles_model, particle_id);
@@ -161,4 +159,5 @@ func particle_update(
 
     particles_kin[particle_id] = kin;
     particles_def_grad[particle_id] = def_grad;
+    }
 }
diff --git a/shaders/slosh/solver/timestep_bound.slang b/shaders/slosh/solver/timestep_bound.slang
index 0debbd3..9121933 100644
--- a/shaders/slosh/solver/timestep_bound.slang
+++ b/shaders/slosh/solver/timestep_bound.slang
@@ -46,14 +46,12 @@ func estimate_timestep_bound(
     ConstantBuffer<uint> particles_len,
     RWStructuredBuffer<GpuTimestepBounds> result,
 ) {
-    let particle_id = invocation_id.x;
-
-    if (particle_id >= particles_len) {
-        return;
-    }
+    let _gs_n = particles_len;
+    let _gs_total = min((_gs_n + (WORKGROUP_SIZE - 1u)) / WORKGROUP_SIZE, 65535u) * WORKGROUP_SIZE;
+    for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) {
 
     if (particles_kin[particle_id].enabled == 0) {
-        return;
+        continue;
     }
 
     let cell_width = grid[0].cell_width;
@@ -91,4 +89,5 @@ func estimate_timestep_bound(
 
     let candidate = GpuTimestepBounds::secs_to_int(dt);
     result[0].computed_max_dt_as_uint.min(candidate);
+    }
 }
diff --git a/src/grid/grid.rs b/src/grid/grid.rs
index e1a271b..637457a 100644
--- a/src/grid/grid.rs
+++ b/src/grid/grid.rs
@@ -108,11 +108,11 @@ impl<B: Backend> WgGrid<B> {
             self.reset_hmap
                 .launch(backend, pass, &args, [grid.cpu_meta.hmap_capacity, 1, 1])?;
 
-            sort_module.touch_particle_blocks.launch(
+            sort_module.touch_particle_blocks.launch_capped(
                 backend,
                 pass,
                 &args,
-                [particles.len() as u32, 1, 1],
+                particles.len() as u32,
             )?;
 
             // // Ensure blocks exist wherever we have rigid particles that might affect
@@ -151,11 +151,11 @@ impl<B: Backend> WgGrid<B> {
         self.init_indirect_workgroups
             .launch_grid(backend, pass, &args, [1, 1, 1])?;
 
-        sort_module.update_block_particle_count.launch(
+        sort_module.update_block_particle_count.launch_capped(
             backend,
             pass,
             &args,
-            [particles.len() as u32, 1, 1],
+            particles.len() as u32,
         )?;
 
         sort_module
@@ -175,11 +175,11 @@ impl<B: Backend> WgGrid<B> {
             &args,
             grid.indirect_n_g2p_p2g_groups.buffer(),
         )?;
-        sort_module.finalize_particles_sort.launch(
+        sort_module.finalize_particles_sort.launch_capped(
             backend,
             pass,
             &args,
-            [particles.len() as u32, 1, 1],
+            particles.len() as u32,
         )?;
 
         Ok(())
diff --git a/src/solver/particle_update.rs b/src/solver/particle_update.rs
index b79d1f8..84b9ad7 100644
--- a/src/solver/particle_update.rs
+++ b/src/solver/particle_update.rs
@@ -67,6 +67,6 @@ impl<B: Backend> WgParticleUpdate<B> {
             particles_len: particles.gpu_len(),
         };
         self.particle_update
-            .launch(backend, pass, &args, [particles.len() as u32, 1, 1])
+            .launch_capped(backend, pass, &args, particles.len() as u32)
     }
 }
diff --git a/src/solver/timestep_bound.rs b/src/solver/timestep_bound.rs
index 566d1ac..db6c5c2 100644
--- a/src/solver/timestep_bound.rs
+++ b/src/solver/timestep_bound.rs
@@ -97,6 +97,6 @@ impl<B: Backend> WgTimestepBounds<B> {
         self.reset_timestep_bound
             .launch(backend, pass, &args, [1; 3])?;
         self.estimate_timestep_bound
-            .launch(backend, pass, &args, [particles.len() as u32, 1, 1])
+            .launch_capped(backend, pass, &args, particles.len() as u32)
     }
 }