dimforge · sebcrozet · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/crates/slosh2d/Cargo.toml b/crates/slosh2d/Cargo.toml
@@ -3,7 +3,7 @@ name = "slosh2d"
 authors = ["Sébastien Crozet <sebcrozet@dimforge.com>"]
 description = "Cross-platform GPU 2D Material Point Method implementation."
 repository = "https://github.com/dimforge/slosh"
-version = "0.6.0"
+version = "0.6.1"
 edition = "2024"
 license = "Apache-2.0"
 

diff --git a/crates/slosh3d/Cargo.toml b/crates/slosh3d/Cargo.toml
@@ -3,7 +3,7 @@ name = "slosh3d"
 authors = ["Sébastien Crozet <sebcrozet@dimforge.com>"]
 description = "Cross-platform GPU 3D Material Point Method implementation."
 repository = "https://github.com/dimforge/slosh"
-version = "0.6.0"
+version = "0.6.1"
 edition = "2024"
 license = "Apache-2.0"
 

diff --git a/shaders/slosh/grid/grid.slang b/shaders/slosh/grid/grid.slang
@@ -216,7 +216,8 @@ public static const int OFF_BY_ONE = 1;
 public struct ActiveBlockHeaderGeneric<MaybeAtomicUint> {
     public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block.
     public uint first_particle;
-    public MaybeAtomicUint num_particles;
+    public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block.
+    public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively.
 }
 
 public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric<uint>;
@@ -339,6 +340,7 @@ public func mark_block_as_active(
         let block_header_id = grid[0].num_active_blocks.add(1u);
         active_blocks[block_header_id].virtual_id = block;
         active_blocks[block_header_id].first_particle = 0u;
+        active_blocks[block_header_id].num_particles_with_extras = 0u;
         active_blocks[block_header_id].num_particles = 0u;
         hmap_entries[slot].value = BlockHeaderId(block_header_id);
     }

diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang
@@ -105,9 +105,22 @@ func update_block_particle_count(
     if (id < particles_len) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
-        let block_id = block_associated_to_point(cell_width, particle.pt);
-        let active_block_id = find_block_header_id(grid, hmap_entries, block_id);
-        active_blocks[active_block_id.id].num_particles.add(1u);
+
+        var blocks = blocks_associated_to_point(cell_width, particle.pt);
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        active_blocks[active_block_id_0.id].num_particles.add(1u);
+        active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u);
+
+        let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
+        let mask = uint3(assoc >= 2);
+
+        for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
+            let bshift = blocks[i].id - blocks[0].id;
+            if (all((bshift * mask) == bshift)) {
+                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
+                active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
+            }
+        }
     }
 }
 
@@ -121,13 +134,13 @@ func copy_particles_len_to_scan_value(
 ) {
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
-        scan_values[id] = active_blocks[id].num_particles;
+        scan_values[id] = active_blocks[id].num_particles_with_extras;
     }
 }
 
 [shader("compute")]
 [numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
-func copy_scan_values_to_first_particles(
+func copy_scan_values_to_first_particles_and_prepare_for_finalize(
     uint3 invocation_id: SV_DispatchThreadID,
     StructuredBuffer<Grid> grid,
     StructuredBuffer<uint> scan_values,
@@ -136,6 +149,8 @@ func copy_scan_values_to_first_particles(
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
         active_blocks[id].first_particle = scan_values[id];
+        active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles;
+        active_blocks[id].num_particles = 0u;
     }
 }
 
@@ -147,26 +162,40 @@ func finalize_particles_sort(
     StructuredBuffer<GridHashMapEntry> hmap_entries,
     StructuredBuffer<Position> particles_pos,
     ConstantBuffer<uint> particles_len,
-    RWStructuredBuffer<Atomic<uint>> scan_values,
     RWStructuredBuffer<AtomicNodeLinkedList> nodes_linked_lists,
     RWStructuredBuffer<uint> particle_node_linked_lists,
     RWStructuredBuffer<uint> sorted_particle_ids,
+    RWStructuredBuffer<AtomicActiveBlockHeader> active_blocks,
 
 ) {
     let id = invocation_id.x;
     if (id < particles_len) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
-        let block_id = block_associated_to_point(cell_width, particle.pt);
 
         // Place the particle to its sorted place.
-        let active_block_id = find_block_header_id(grid, hmap_entries, block_id);
-        let target_index = scan_values[active_block_id.id].add(1u);
+        var blocks = blocks_associated_to_point(cell_width, particle.pt);
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        let target_index = active_blocks[active_block_id_0.id].first_particle +
+            active_blocks[active_block_id_0.id].num_particles.add(1u);
         sorted_particle_ids[target_index] = id;
 
+        let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
+        let mask = uint3(assoc >= 2);
+
+        for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
+            let bshift = blocks[i].id - blocks[0].id;
+            if (all((bshift * mask) == bshift)) {
+                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
+                let target_index = active_blocks[active_block_id_i.id].first_particle +
+                    active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
+                sorted_particle_ids[target_index] = id;
+            }
+        }
+
         // Setup the per-node particle linked-list.
         let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id), node_local_id);
+        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id);
         let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id);
         nodes_linked_lists[node_global_id.id].len.add(1u);
         particle_node_linked_lists[id] = prev_head;

diff --git a/shaders/slosh/solver/p2g_scatter_style.slang b/shaders/slosh/solver/p2g_scatter_style.slang
@@ -0,0 +1,117 @@
+module p2g;
+
+import slosh.solver.params;
+import slosh.solver.particle;
+import slosh.solver.boundary_condition;
+import slosh.grid.kernel;
+import slosh.grid.grid;
+import slosh.solver.rigid_impulses;
+import slosh.rbd.dynamics.body;
+import slosh.aliases;
+
+#if DIM == 2
+static const uint WORKGROUP_SIZE_X = 8;
+static const uint WORKGROUP_SIZE_Y = 8;
+static const uint WORKGROUP_SIZE_Z = 1;
+#else
+static const uint WORKGROUP_SIZE_X = 4;
+static const uint WORKGROUP_SIZE_Y = 4;
+static const uint WORKGROUP_SIZE_Z = 4;
+#endif
+static const uint WORKGROUP_SIZE = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y * WORKGROUP_SIZE_Z;
+
+// Staging buffers for one workgroup-sized chunk of particles. The chunk is loaded
+// cooperatively (one particle per thread) and then read by every cell-thread, so
+// each particle is fetched from global memory exactly once per block.
+#if DIM == 2
+groupshared float2 shared_pos[WORKGROUP_SIZE];
+groupshared float2 shared_momentum[WORKGROUP_SIZE];
+groupshared float2x2 shared_affine[WORKGROUP_SIZE];
+#else
+groupshared float3 shared_pos[WORKGROUP_SIZE];
+groupshared float3 shared_momentum[WORKGROUP_SIZE];
+groupshared float3x3 shared_affine[WORKGROUP_SIZE];
+#endif
+groupshared float shared_mass[WORKGROUP_SIZE];
+
+[shader("compute")]
+[numthreads(WORKGROUP_SIZE, 1, 1)]
+func p2g_scatter_style(
+    uint3 block_id: SV_GroupID,
+    uint tid: SV_GroupIndex,
+    StructuredBuffer<Grid> grid,
+    StructuredBuffer<ActiveBlockHeader> active_blocks,
+    StructuredBuffer<Position> particles_pos,
+    StructuredBuffer<Kinematics> particles_kin,
+    RWStructuredBuffer<Node> nodes,
+    StructuredBuffer<uint> sorted_particle_ids,
+) {
+    let cell_width = grid[0].cell_width;
+    let inv_cell_width = 1.0 / cell_width;
+    let bid = block_id.x;
+
+    let first_particle = active_blocks[bid].first_particle;
+    let num_particles = active_blocks[bid].num_particles_with_extras;
+    let last_particle = first_particle + num_particles;
+    let block_vid = active_blocks[bid].virtual_id.id;
+
+    // Each thread owns one cell (grid node) of this block and accumulates the
+    // contribution of every particle assigned to the block into a register.
+    // This avoids both global atomics and the per-cell workgroup reduction that
+    // the previous implementation used (which serialized the whole workgroup with
+    // ~7 barriers per cell, i.e. hundreds of barriers per particle batch).
+#if DIM == 2
+    let local_cell = int2(int(tid % 8u), int(tid / 8u));
+    let cell_pos = float2(block_vid * 8 + local_cell) * cell_width;
+    var acc = float3(0.0);
+#else
+    let local_cell = int3(int(tid % 4u), int((tid / 4u) % 4u), int(tid / 16u));
+    let cell_pos = float3(block_vid * 4 + local_cell) * cell_width;
+    var acc = float4(0.0);
+#endif
+
+    // Stream the block's particles through shared memory one chunk at a time.
+    for (var chunk_base = first_particle; chunk_base < last_particle; chunk_base += WORKGROUP_SIZE) {
+        // Wait for the previous chunk's readers before overwriting shared memory.
+        GroupMemoryBarrierWithGroupSync();
+
+        let load_idx = chunk_base + tid;
+        if (load_idx < last_particle) {
+            let pid = sorted_particle_ids[load_idx];
+            let kin = particles_kin[pid];
+            shared_pos[tid] = particles_pos[pid].pt;
+            shared_mass[tid] = kin.mass;
+            shared_affine[tid] = kin.affine;
+            shared_momentum[tid] = kin.velocity * kin.mass + kin.force_dt;
+        }
+
+        GroupMemoryBarrierWithGroupSync();
+
+        // `chunk_len` is uniform across the workgroup, so the barriers above stay in
+        // uniform control flow regardless of the total particle count.
+        let chunk_len = min(WORKGROUP_SIZE, last_particle - chunk_base);
+        for (var p = 0u; p < chunk_len; p += 1u) {
+            let dpt = cell_pos - shared_pos[p];
+#if DIM == 2
+            let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
+                       * QuadraticKernel::eval(dpt.y * inv_cell_width);
+#else
+            let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
+                       * QuadraticKernel::eval(dpt.y * inv_cell_width)
+                       * QuadraticKernel::eval(dpt.z * inv_cell_width);
+#endif
+            // The quadratic kernel is exactly zero outside the 3-node support, which
+            // is the common case for the dense cell x particle cross product. Skipping
+            // the affine matrix-vector product there is the bulk of the saved work.
+            if (weight != 0.0) {
+                let momentum = mul(dpt, shared_affine[p]) + shared_momentum[p];
+                acc += vector<float, DIM + 1>(momentum, shared_mass[p]) * weight;
+            }
+        }
+    }
+
+    // Write the accumulated node state to global memory. Every cell is written once
+    // (no atomics, no inter-block races), zeroing cells that received no contribution.
+    let global_chunk_id = block_header_id_to_physical_id(BlockHeaderId(bid));
+    nodes[global_chunk_id.id + tid].momentum_velocity_mass = acc;
+}
diff --git a/src/grid/grid.rs b/src/grid/grid.rs
@@ -164,7 +164,7 @@ impl<B: Backend> WgGrid<B> {
         prefix_sum_module.launch(backend, pass, prefix_sum, &grid.scan_values)?;
 
         sort_module
-            .copy_scan_values_to_first_particles
+            .copy_scan_values_to_first_particles_and_prepare_for_finalize
             .launch_indirect(backend, pass, &args, grid.indirect_n_blocks_groups.buffer())?;
 
         // Reset here so the linked list heads get reset before `finalize_particles_sort` which
@@ -263,6 +263,7 @@ impl Default for GpuGridHashMapEntry {
 pub struct GpuActiveBlockHeader {
     virtual_id: BlockVirtualId,
     first_particle: u32,
+    num_particles_with_extras: u32,
     num_particles: u32,
 }
 

diff --git a/src/grid/sort.rs b/src/grid/sort.rs
@@ -16,11 +16,14 @@ use stensor::tensor::GpuScalar;
 #[shader(module = "slosh::grid::sort")]
 pub struct WgSort<B: Backend> {
     pub(crate) touch_particle_blocks: GpuFunction<B>,
+    // Bound to GPU kernels; currently only used by commented-out rigid-particle code paths.
+    #[allow(dead_code)]
     pub(crate) touch_rigid_particle_blocks: GpuFunction<B>,
+    #[allow(dead_code)]
     pub(crate) mark_rigid_particles_needing_block: GpuFunction<B>,
     pub(crate) update_block_particle_count: GpuFunction<B>,
     pub(crate) copy_particles_len_to_scan_value: GpuFunction<B>,
-    pub(crate) copy_scan_values_to_first_particles: GpuFunction<B>,
+    pub(crate) copy_scan_values_to_first_particles_and_prepare_for_finalize: GpuFunction<B>,
     pub(crate) finalize_particles_sort: GpuFunction<B>,
     pub(crate) sort_rigid_particles: GpuFunction<B>,
 }

diff --git a/src/pipeline.rs b/src/pipeline.rs
@@ -12,8 +12,8 @@ use crate::rbd::dynamics::body::{BodyCoupling, BodyCouplingEntry};
 use crate::solver::{
     GpuBoundaryCondition, GpuImpulses, GpuMaterials, GpuParticleModelData, GpuParticles,
     GpuRigidParticles, GpuSimulationParams, GpuTimestepBounds, Particle, SimulationParams, WgG2P,
-    WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgParticleUpdate, WgRigidImpulses,
-    WgRigidParticleUpdate, WgTimestepBounds,
+    WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgP2GScatterStyle, WgParticleUpdate,
+    WgRigidImpulses, WgRigidParticleUpdate, WgTimestepBounds,
 };
 use rapier::dynamics::RigidBodySet;
 use rapier::geometry::{ColliderHandle, ColliderSet};
@@ -43,13 +43,20 @@ pub struct MpmPipeline<B: Backend, GpuModel: GpuParticleModelData> {
     grid: WgGrid<B>,
     prefix_sum: WgPrefixSum<B>,
     sort: WgSort<B>,
+    // Kept for the alternative/CDF code paths that are currently commented out in `step`.
+    #[allow(dead_code)]
     p2g: WgP2G<B>,
+    p2g_scatter_style: WgP2GScatterStyle<B>,
+    #[allow(dead_code)]
     p2g_cdf: WgP2GCdf<B>,
+    #[allow(dead_code)]
     grid_update_cdf: WgGridUpdateCdf<B>,
     grid_update: WgGridUpdate<B>,
     particles_update: WgParticleUpdate<B>,
     g2p: WgG2P<B>,
+    #[allow(dead_code)]
     g2p_cdf: WgG2PCdf<B>,
+    #[allow(dead_code)]
     rigid_particles_update: WgRigidParticleUpdate<B>,
     /// Maximum timestep bound calculation.
     pub timestep_bounds: WgTimestepBounds<B>,
@@ -342,6 +349,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
             prefix_sum: WgPrefixSum::from_backend(backend, compiler)?,
             sort: WgSort::from_backend(backend, compiler)?,
             p2g: WgP2G::from_backend(backend, compiler)?,
+            p2g_scatter_style: WgP2GScatterStyle::from_backend(backend, compiler)?,
             p2g_cdf: WgP2GCdf::from_backend(backend, compiler)?,
             grid_update: WgGridUpdate::from_backend(backend, compiler)?,
             grid_update_cdf: WgGridUpdateCdf::from_backend(backend, compiler)?,
@@ -479,7 +487,16 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
 
         {
             let mut pass = encoder.begin_pass("p2g", timestamps.as_deref_mut());
-            self.p2g.launch(
+            // self.p2g.launch(
+            //     backend,
+            //     &mut pass,
+            //     &data.grid,
+            //     &data.particles,
+            //     &data.impulses,
+            //     &data.bodies,
+            //     &data.body_materials,
+            // )?;
+            self.p2g_scatter_style.launch(
                 backend,
                 &mut pass,
                 &data.grid,
@@ -560,7 +577,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
         )?;
 
         {
-            let mut pass = encoder.begin_pass("integrate_bodies", timestamps.as_deref_mut());
+            let mut pass = encoder.begin_pass("integrate_bodies", timestamps);
             // TODO: should this be in a separate pipeline? Within impulse probably?
             self.impulses.launch(
                 backend,

diff --git a/src/rbd/mod.rs b/src/rbd/mod.rs
@@ -1,8 +1,3 @@
-use slang_hal::re_exports::include_dir;
-
-#[cfg(feature = "runtime")]
-use slang_hal::re_exports::minislang::SlangCompiler;
-
 /// GPU-accelerated rigid body dynamics simulation.
 ///
 /// This module provides structures and methods for managing physics bodies

diff --git a/src/solver/mod.rs b/src/solver/mod.rs
@@ -35,7 +35,7 @@
 
 pub use g2p::WgG2P;
 pub use g2p_cdf::WgG2PCdf;
-pub use p2g::WgP2G;
+pub use p2g::{WgP2G, WgP2GScatterStyle};
 pub use p2g_cdf::WgP2GCdf;
 pub use params::{GpuSimulationParams, SimulationParams};
 pub use particle::*;