From 37e7e8083689eee91a59aec91cde189f29d3956c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= <sebcrozet@dimforge.com>
Date: Wed, 17 Jun 2026 11:43:16 +0200
Subject: [PATCH 1/3] feat: p2g optimizations by making it independent from
 in-chunk particle density variations

---
 shaders/slosh/grid/grid.slang                |   4 +-
 shaders/slosh/grid/sort.slang                |  49 ++++++--
 shaders/slosh/solver/p2g_scatter_style.slang | 117 +++++++++++++++++++
 src/grid/grid.rs                             |   3 +-
 src/grid/sort.rs                             |   2 +-
 src/pipeline.rs                              |  15 ++-
 src/solver/mod.rs                            |   2 +-
 src/solver/p2g.rs                            |  44 +++++++
 src/solver/particle.rs                       |   2 +-
 9 files changed, 221 insertions(+), 17 deletions(-)
 create mode 100644 shaders/slosh/solver/p2g_scatter_style.slang
diff --git a/shaders/slosh/grid/grid.slang b/shaders/slosh/grid/grid.slang
index 3cbc51c..1790642 100644
--- a/shaders/slosh/grid/grid.slang
+++ b/shaders/slosh/grid/grid.slang
@@ -216,7 +216,8 @@ public static const int OFF_BY_ONE = 1;
 public struct ActiveBlockHeaderGeneric<MaybeAtomicUint> {
     public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block.
     public uint first_particle;
-    public MaybeAtomicUint num_particles;
+    public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block.
+    public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively.
 }
 
 public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric<uint>;
@@ -339,6 +340,7 @@ public func mark_block_as_active(
         let block_header_id = grid[0].num_active_blocks.add(1u);
         active_blocks[block_header_id].virtual_id = block;
         active_blocks[block_header_id].first_particle = 0u;
+        active_blocks[block_header_id].num_particles_with_extras = 0u;
         active_blocks[block_header_id].num_particles = 0u;
         hmap_entries[slot].value = BlockHeaderId(block_header_id);
     }
diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang
index b0383b5..24c5f84 100644
--- a/shaders/slosh/grid/sort.slang
+++ b/shaders/slosh/grid/sort.slang
@@ -105,9 +105,22 @@ func update_block_particle_count(
     if (id < particles_len) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
-        let block_id = block_associated_to_point(cell_width, particle.pt);
-        let active_block_id = find_block_header_id(grid, hmap_entries, block_id);
-        active_blocks[active_block_id.id].num_particles.add(1u);
+
+        var blocks = blocks_associated_to_point(cell_width, particle.pt);
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        active_blocks[active_block_id_0.id].num_particles.add(1u);
+        active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u);
+
+        let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
+        let mask = uint3(assoc >= 2);
+
+        for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
+            let bshift = blocks[i].id - blocks[0].id;
+            if (all((bshift * mask) == bshift)) {
+                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
+                active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
+            }
+        }
     }
 }
 
@@ -121,13 +134,13 @@ func copy_particles_len_to_scan_value(
 ) {
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
-        scan_values[id] = active_blocks[id].num_particles;
+        scan_values[id] = active_blocks[id].num_particles_with_extras;
     }
 }
 
 [shader("compute")]
 [numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
-func copy_scan_values_to_first_particles(
+func copy_scan_values_to_first_particles_and_prepare_for_finalize(
     uint3 invocation_id: SV_DispatchThreadID,
     StructuredBuffer<Grid> grid,
     StructuredBuffer<uint> scan_values,
@@ -136,6 +149,8 @@ func copy_scan_values_to_first_particles(
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
         active_blocks[id].first_particle = scan_values[id];
+        active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles;
+        active_blocks[id].num_particles = 0u;
     }
 }
 
@@ -147,26 +162,40 @@ func finalize_particles_sort(
     StructuredBuffer<GridHashMapEntry> hmap_entries,
     StructuredBuffer<Position> particles_pos,
     ConstantBuffer<uint> particles_len,
-    RWStructuredBuffer<Atomic<uint>> scan_values,
     RWStructuredBuffer<AtomicNodeLinkedList> nodes_linked_lists,
     RWStructuredBuffer<uint> particle_node_linked_lists,
     RWStructuredBuffer<uint> sorted_particle_ids,
+    RWStructuredBuffer<AtomicActiveBlockHeader> active_blocks,
 
 ) {
     let id = invocation_id.x;
     if (id < particles_len) {
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
-        let block_id = block_associated_to_point(cell_width, particle.pt);
 
         // Place the particle to its sorted place.
-        let active_block_id = find_block_header_id(grid, hmap_entries, block_id);
-        let target_index = scan_values[active_block_id.id].add(1u);
+        var blocks = blocks_associated_to_point(cell_width, particle.pt);
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        let target_index = active_blocks[active_block_id_0.id].first_particle +
+            active_blocks[active_block_id_0.id].num_particles.add(1u);
         sorted_particle_ids[target_index] = id;
 
+        let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
+        let mask = uint3(assoc >= 2);
+
+        for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
+            let bshift = blocks[i].id - blocks[0].id;
+            if (all((bshift * mask) == bshift)) {
+                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
+                let target_index = active_blocks[active_block_id_i.id].first_particle +
+                    active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
+                sorted_particle_ids[target_index] = id;
+            }
+        }
+
         // Setup the per-node particle linked-list.
         let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id), node_local_id);
+        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id);
         let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id);
         nodes_linked_lists[node_global_id.id].len.add(1u);
         particle_node_linked_lists[id] = prev_head;
diff --git a/shaders/slosh/solver/p2g_scatter_style.slang b/shaders/slosh/solver/p2g_scatter_style.slang
new file mode 100644
index 0000000..b94000e
--- /dev/null
+++ b/shaders/slosh/solver/p2g_scatter_style.slang
@@ -0,0 +1,117 @@
+module p2g;
+
+import slosh.solver.params;
+import slosh.solver.particle;
+import slosh.solver.boundary_condition;
+import slosh.grid.kernel;
+import slosh.grid.grid;
+import slosh.solver.rigid_impulses;
+import slosh.rbd.dynamics.body;
+import slosh.aliases;
+
+#if DIM == 2
+static const uint WORKGROUP_SIZE_X = 8;
+static const uint WORKGROUP_SIZE_Y = 8;
+static const uint WORKGROUP_SIZE_Z = 1;
+#else
+static const uint WORKGROUP_SIZE_X = 4;
+static const uint WORKGROUP_SIZE_Y = 4;
+static const uint WORKGROUP_SIZE_Z = 4;
+#endif
+static const uint WORKGROUP_SIZE = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y * WORKGROUP_SIZE_Z;
+
+// Staging buffers for one workgroup-sized chunk of particles. The chunk is loaded
+// cooperatively (one particle per thread) and then read by every cell-thread, so
+// each particle is fetched from global memory exactly once per block.
+#if DIM == 2
+groupshared float2 shared_pos[WORKGROUP_SIZE];
+groupshared float2 shared_momentum[WORKGROUP_SIZE];
+groupshared float2x2 shared_affine[WORKGROUP_SIZE];
+#else
+groupshared float3 shared_pos[WORKGROUP_SIZE];
+groupshared float3 shared_momentum[WORKGROUP_SIZE];
+groupshared float3x3 shared_affine[WORKGROUP_SIZE];
+#endif
+groupshared float shared_mass[WORKGROUP_SIZE];
+
+[shader("compute")]
+[numthreads(WORKGROUP_SIZE, 1, 1)]
+func p2g_scatter_style(
+    uint3 block_id: SV_GroupID,
+    uint tid: SV_GroupIndex,
+    StructuredBuffer<Grid> grid,
+    StructuredBuffer<ActiveBlockHeader> active_blocks,
+    StructuredBuffer<Position> particles_pos,
+    StructuredBuffer<Kinematics> particles_kin,
+    RWStructuredBuffer<Node> nodes,
+    StructuredBuffer<uint> sorted_particle_ids,
+) {
+    let cell_width = grid[0].cell_width;
+    let inv_cell_width = 1.0 / cell_width;
+    let bid = block_id.x;
+
+    let first_particle = active_blocks[bid].first_particle;
+    let num_particles = active_blocks[bid].num_particles_with_extras;
+    let last_particle = first_particle + num_particles;
+    let block_vid = active_blocks[bid].virtual_id.id;
+
+    // Each thread owns one cell (grid node) of this block and accumulates into a register the
+    // contribution of every particle listed for the block (the `num_particles_with_extras` range).
+    // This avoids both global atomics and the per-cell workgroup reduction that
+    // the previous implementation used (which serialized the whole workgroup with
+    // ~7 barriers per cell, i.e. hundreds of barriers per particle batch).
+#if DIM == 2
+    let local_cell = int2(int(tid % 8u), int(tid / 8u));
+    let cell_pos = float2(block_vid * 8 + local_cell) * cell_width;
+    var acc = float3(0.0);
+#else
+    let local_cell = int3(int(tid % 4u), int((tid / 4u) % 4u), int(tid / 16u));
+    let cell_pos = float3(block_vid * 4 + local_cell) * cell_width;
+    var acc = float4(0.0);
+#endif
+
+    // Stream the block's particles through shared memory one chunk at a time.
+    for (var chunk_base = first_particle; chunk_base < last_particle; chunk_base += WORKGROUP_SIZE) {
+        // Wait for the previous chunk's readers before overwriting shared memory.
+        GroupMemoryBarrierWithGroupSync();
+
+        let load_idx = chunk_base + tid;
+        if (load_idx < last_particle) {
+            let pid = sorted_particle_ids[load_idx];
+            let kin = particles_kin[pid];
+            shared_pos[tid] = particles_pos[pid].pt;
+            shared_mass[tid] = kin.mass;
+            shared_affine[tid] = kin.affine;
+            shared_momentum[tid] = kin.velocity * kin.mass + kin.force_dt;
+        }
+
+        GroupMemoryBarrierWithGroupSync();
+
+        // The chunk loop bounds depend only on per-block values (uniform across the workgroup),
+        // so the barriers above stay in uniform control flow; `chunk_len` clamps the last chunk.
+        let chunk_len = min(WORKGROUP_SIZE, last_particle - chunk_base);
+        for (var p = 0u; p < chunk_len; p += 1u) {
+            let dpt = cell_pos - shared_pos[p];
+#if DIM == 2
+            let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
+                       * QuadraticKernel::eval(dpt.y * inv_cell_width);
+#else
+            let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
+                       * QuadraticKernel::eval(dpt.y * inv_cell_width)
+                       * QuadraticKernel::eval(dpt.z * inv_cell_width);
+#endif
+            // The quadratic kernel is exactly zero outside the 3-node support, which
+            // is the common case for the dense cell x particle cross product. Skipping
+            // the affine matrix-vector product there is the bulk of the saved work.
+            if (weight != 0.0) {
+                let momentum = mul(dpt, shared_affine[p]) + shared_momentum[p];
+                acc += vector<float, DIM + 1>(momentum, shared_mass[p]) * weight;
+            }
+        }
+    }
+
+    // Write the accumulated node state to global memory. Every cell is written once
+    // (no atomics, no inter-block races), zeroing cells that received no contribution.
+    let global_chunk_id = block_header_id_to_physical_id(BlockHeaderId(bid));
+    nodes[global_chunk_id.id + tid].momentum_velocity_mass = acc;
+}
diff --git a/src/grid/grid.rs b/src/grid/grid.rs
index 7bb1838..e1a271b 100644
--- a/src/grid/grid.rs
+++ b/src/grid/grid.rs
@@ -164,7 +164,7 @@ impl<B: Backend> WgGrid<B> {
         prefix_sum_module.launch(backend, pass, prefix_sum, &grid.scan_values)?;
 
         sort_module
-            .copy_scan_values_to_first_particles
+            .copy_scan_values_to_first_particles_and_prepare_for_finalize
             .launch_indirect(backend, pass, &args, grid.indirect_n_blocks_groups.buffer())?;
 
         // Reset here so the linked list heads get reset before `finalize_particles_sort` which
@@ -263,6 +263,7 @@ impl Default for GpuGridHashMapEntry {
 pub struct GpuActiveBlockHeader {
     virtual_id: BlockVirtualId,
     first_particle: u32,
+    num_particles_with_extras: u32,
     num_particles: u32,
 }
 
diff --git a/src/grid/sort.rs b/src/grid/sort.rs
index 031018d..4ba5d69 100644
--- a/src/grid/sort.rs
+++ b/src/grid/sort.rs
@@ -20,7 +20,7 @@ pub struct WgSort<B: Backend> {
     pub(crate) mark_rigid_particles_needing_block: GpuFunction<B>,
     pub(crate) update_block_particle_count: GpuFunction<B>,
     pub(crate) copy_particles_len_to_scan_value: GpuFunction<B>,
-    pub(crate) copy_scan_values_to_first_particles: GpuFunction<B>,
+    pub(crate) copy_scan_values_to_first_particles_and_prepare_for_finalize: GpuFunction<B>,
     pub(crate) finalize_particles_sort: GpuFunction<B>,
     pub(crate) sort_rigid_particles: GpuFunction<B>,
 }
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 63ed4d9..f58f951 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -13,7 +13,7 @@ use crate::solver::{
     GpuBoundaryCondition, GpuImpulses, GpuMaterials, GpuParticleModelData, GpuParticles,
     GpuRigidParticles, GpuSimulationParams, GpuTimestepBounds, Particle, SimulationParams, WgG2P,
     WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgParticleUpdate, WgRigidImpulses,
-    WgRigidParticleUpdate, WgTimestepBounds,
+    WgRigidParticleUpdate, WgTimestepBounds, WgP2GScatterStyle
 };
 use rapier::dynamics::RigidBodySet;
 use rapier::geometry::{ColliderHandle, ColliderSet};
@@ -44,6 +44,7 @@ pub struct MpmPipeline<B: Backend, GpuModel: GpuParticleModelData> {
     prefix_sum: WgPrefixSum<B>,
     sort: WgSort<B>,
     p2g: WgP2G<B>,
+    p2g_scatter_style: WgP2GScatterStyle<B>,
     p2g_cdf: WgP2GCdf<B>,
     grid_update_cdf: WgGridUpdateCdf<B>,
     grid_update: WgGridUpdate<B>,
@@ -342,6 +343,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
             prefix_sum: WgPrefixSum::from_backend(backend, compiler)?,
             sort: WgSort::from_backend(backend, compiler)?,
             p2g: WgP2G::from_backend(backend, compiler)?,
+            p2g_scatter_style: WgP2GScatterStyle::from_backend(backend, compiler)?,
             p2g_cdf: WgP2GCdf::from_backend(backend, compiler)?,
             grid_update: WgGridUpdate::from_backend(backend, compiler)?,
             grid_update_cdf: WgGridUpdateCdf::from_backend(backend, compiler)?,
@@ -479,7 +481,16 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
 
         {
             let mut pass = encoder.begin_pass("p2g", timestamps.as_deref_mut());
-            self.p2g.launch(
+            // self.p2g.launch(
+            //     backend,
+            //     &mut pass,
+            //     &data.grid,
+            //     &data.particles,
+            //     &data.impulses,
+            //     &data.bodies,
+            //     &data.body_materials,
+            // )?;
+            self.p2g_scatter_style.launch(
                 backend,
                 &mut pass,
                 &data.grid,
diff --git a/src/solver/mod.rs b/src/solver/mod.rs
index b335205..41e99ef 100644
--- a/src/solver/mod.rs
+++ b/src/solver/mod.rs
@@ -35,7 +35,7 @@
 
 pub use g2p::WgG2P;
 pub use g2p_cdf::WgG2PCdf;
-pub use p2g::WgP2G;
+pub use p2g::{WgP2G, WgP2GScatterStyle};
 pub use p2g_cdf::WgP2GCdf;
 pub use params::{GpuSimulationParams, SimulationParams};
 pub use particle::*;
diff --git a/src/solver/p2g.rs b/src/solver/p2g.rs
index 90a50ee..8332963 100644
--- a/src/solver/p2g.rs
+++ b/src/solver/p2g.rs
@@ -27,6 +27,13 @@ pub struct WgP2G<B: Backend> {
     pub p2g: GpuFunction<B>,
 }
 
+#[derive(Shader)]
+#[shader(module = "slosh::solver::p2g_scatter_style")]
+pub struct WgP2GScatterStyle<B: Backend> {
+    /// Compiled scatter-style P2G compute shader (per-block accumulation, no grid atomics).
+    pub p2g_scatter_style: GpuFunction<B>,
+}
+
 #[derive(ShaderArgs)]
 struct P2GArgs<'a, B: Backend> {
     grid: &'a GpuScalar<GpuGridMetadata, B>,
@@ -37,6 +44,7 @@ struct P2GArgs<'a, B: Backend> {
     particles_pos: &'a GpuVector<ParticlePosition, B>,
     particles_kin: &'a GpuVector<Kinematics, B>,
     nodes: &'a GpuVector<GpuGridNode, B>,
+    sorted_particle_ids: &'a GpuVector<u32, B>,
     body_vels: &'a GpuVector<GpuVelocity, B>,
     body_impulses: &'a GpuVector<RigidImpulse, B>,
     body_materials: &'a GpuVector<GpuBoundaryCondition, B>,
@@ -60,6 +68,7 @@ impl<B: Backend> WgP2G<B> {
             active_blocks: &grid.active_blocks,
             nodes: &grid.nodes,
             nodes_linked_lists: &grid.nodes_linked_lists,
+            sorted_particle_ids: particles.sorted_ids(),
             particles_pos: particles.positions(),
             particles_kin: &particles.kinematics,
             particle_node_linked_lists: particles.node_linked_lists(),
@@ -75,3 +84,38 @@ impl<B: Backend> WgP2G<B> {
         )
     }
 }
+
+impl<B: Backend> WgP2GScatterStyle<B> {
+    /// Launches the scatter-style P2G kernel, where each workgroup accumulates one active block.
+    pub fn launch<GpuModel: GpuParticleModelData>(
+        &self,
+        backend: &B,
+        pass: &mut B::Pass,
+        grid: &GpuGrid<B>,
+        particles: &GpuParticles<B, GpuModel>,
+        impulses: &GpuImpulses<B>,
+        bodies: &GpuBodySet<B>,
+        body_materials: &GpuMaterials<B>,
+    ) -> Result<(), B::Error> {
+        let args = P2GArgs {
+            grid: &grid.meta,
+            hmap_entries: &grid.hmap_entries,
+            active_blocks: &grid.active_blocks,
+            nodes: &grid.nodes,
+            nodes_linked_lists: &grid.nodes_linked_lists,
+            sorted_particle_ids: particles.sorted_ids(),
+            particles_pos: particles.positions(),
+            particles_kin: &particles.kinematics,
+            particle_node_linked_lists: particles.node_linked_lists(),
+            body_vels: bodies.vels(),
+            body_impulses: &impulses.incremental_impulses,
+            body_materials: &body_materials.materials,
+        };
+        self.p2g_scatter_style.launch_indirect(
+            backend,
+            pass,
+            &args,
+            grid.indirect_n_g2p_p2g_groups.buffer(),
+        )
+    }
+}
diff --git a/src/solver/particle.rs b/src/solver/particle.rs
index 2dfe944..e71965f 100644
--- a/src/solver/particle.rs
+++ b/src/solver/particle.rs
@@ -493,7 +493,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> GpuParticles<B, GpuModel> {
             def_grad: GpuTensor::vector_encased(backend, &data.def_grad, resizeable)?,
             properties: GpuTensor::vector_encased(backend, &data.properties, resizeable)?,
             models: GpuTensor::vector(backend, &data.models, resizeable)?,
-            sorted_ids: GpuTensor::vector_uninit(backend, particles.len() as u32, resizeable)?,
+            sorted_ids: GpuTensor::vector_uninit(backend, particles.len() as u32 * 8, resizeable)?,
             node_linked_lists: GpuTensor::vector_uninit(
                 backend,
                 particles.len() as u32,

From 1f5a560435d8b3e2e846b359fd6348c2550889c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= <sebcrozet@dimforge.com>
Date: Wed, 17 Jun 2026 11:43:28 +0200
Subject: [PATCH 2/3] Release v0.6.1

---
 crates/slosh2d/Cargo.toml | 2 +-
 crates/slosh3d/Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/slosh2d/Cargo.toml b/crates/slosh2d/Cargo.toml
index e89fa9e..d2aaf61 100644
--- a/crates/slosh2d/Cargo.toml
+++ b/crates/slosh2d/Cargo.toml
@@ -3,7 +3,7 @@ name = "slosh2d"
 authors = ["Sébastien Crozet <sebcrozet@dimforge.com>"]
 description = "Cross-platform GPU 2D Material Point Method implementation."
 repository = "https://github.com/dimforge/slosh"
-version = "0.6.0"
+version = "0.6.1"
 edition = "2024"
 license = "Apache-2.0"
 
diff --git a/crates/slosh3d/Cargo.toml b/crates/slosh3d/Cargo.toml
index 4e33719..7135efb 100644
--- a/crates/slosh3d/Cargo.toml
+++ b/crates/slosh3d/Cargo.toml
@@ -3,7 +3,7 @@ name = "slosh3d"
 authors = ["Sébastien Crozet <sebcrozet@dimforge.com>"]
 description = "Cross-platform GPU 3D Material Point Method implementation."
 repository = "https://github.com/dimforge/slosh"
-version = "0.6.0"
+version = "0.6.1"
 edition = "2024"
 license = "Apache-2.0"
 

From ed8c498c37787b3a51d67a1677204d99778cd97b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= <sebcrozet@dimforge.com>
Date: Wed, 17 Jun 2026 11:52:48 +0200
Subject: [PATCH 3/3] clippy fixes

---
 src/grid/sort.rs       |  3 +++
 src/pipeline.rs        | 12 +++++++++---
 src/rbd/mod.rs         |  5 -----
 src/solver/particle.rs |  4 ++--
 src_testbed/step.rs    |  2 +-
 5 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/grid/sort.rs b/src/grid/sort.rs
index 4ba5d69..eb9f955 100644
--- a/src/grid/sort.rs
+++ b/src/grid/sort.rs
@@ -16,7 +16,10 @@ use stensor::tensor::GpuScalar;
 #[shader(module = "slosh::grid::sort")]
 pub struct WgSort<B: Backend> {
     pub(crate) touch_particle_blocks: GpuFunction<B>,
+    // Bound to GPU kernels; currently only used by commented-out rigid-particle code paths.
+    #[allow(dead_code)]
     pub(crate) touch_rigid_particle_blocks: GpuFunction<B>,
+    #[allow(dead_code)]
     pub(crate) mark_rigid_particles_needing_block: GpuFunction<B>,
     pub(crate) update_block_particle_count: GpuFunction<B>,
     pub(crate) copy_particles_len_to_scan_value: GpuFunction<B>,
diff --git a/src/pipeline.rs b/src/pipeline.rs
index f58f951..c6c34b7 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -12,8 +12,8 @@ use crate::rbd::dynamics::body::{BodyCoupling, BodyCouplingEntry};
 use crate::solver::{
     GpuBoundaryCondition, GpuImpulses, GpuMaterials, GpuParticleModelData, GpuParticles,
     GpuRigidParticles, GpuSimulationParams, GpuTimestepBounds, Particle, SimulationParams, WgG2P,
-    WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgParticleUpdate, WgRigidImpulses,
-    WgRigidParticleUpdate, WgTimestepBounds, WgP2GScatterStyle
+    WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgP2GScatterStyle, WgParticleUpdate,
+    WgRigidImpulses, WgRigidParticleUpdate, WgTimestepBounds,
 };
 use rapier::dynamics::RigidBodySet;
 use rapier::geometry::{ColliderHandle, ColliderSet};
@@ -43,14 +43,20 @@ pub struct MpmPipeline<B: Backend, GpuModel: GpuParticleModelData> {
     grid: WgGrid<B>,
     prefix_sum: WgPrefixSum<B>,
     sort: WgSort<B>,
+    // Kept for the alternative/CDF code paths that are currently commented out in `step`.
+    #[allow(dead_code)]
     p2g: WgP2G<B>,
     p2g_scatter_style: WgP2GScatterStyle<B>,
+    #[allow(dead_code)]
     p2g_cdf: WgP2GCdf<B>,
+    #[allow(dead_code)]
     grid_update_cdf: WgGridUpdateCdf<B>,
     grid_update: WgGridUpdate<B>,
     particles_update: WgParticleUpdate<B>,
     g2p: WgG2P<B>,
+    #[allow(dead_code)]
     g2p_cdf: WgG2PCdf<B>,
+    #[allow(dead_code)]
     rigid_particles_update: WgRigidParticleUpdate<B>,
     /// Maximum timestep bound calculation.
     pub timestep_bounds: WgTimestepBounds<B>,
@@ -571,7 +577,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
         )?;
 
         {
-            let mut pass = encoder.begin_pass("integrate_bodies", timestamps.as_deref_mut());
+            let mut pass = encoder.begin_pass("integrate_bodies", timestamps);
             // TODO: should this be in a separate pipeline? Within impulse probably?
             self.impulses.launch(
                 backend,
diff --git a/src/rbd/mod.rs b/src/rbd/mod.rs
index 7f22ded..07c250c 100644
--- a/src/rbd/mod.rs
+++ b/src/rbd/mod.rs
@@ -1,8 +1,3 @@
-use slang_hal::re_exports::include_dir;
-
-#[cfg(feature = "runtime")]
-use slang_hal::re_exports::minislang::SlangCompiler;
-
 /// GPU-accelerated rigid body dynamics simulation.
 ///
 /// This module provides structures and methods for managing physics bodies
diff --git a/src/solver/particle.rs b/src/solver/particle.rs
index e71965f..81d8860 100644
--- a/src/solver/particle.rs
+++ b/src/solver/particle.rs
@@ -101,7 +101,7 @@ impl ParticleDynamics {
     }
 
     /// Extracts the kinematic state for GPU upload.
-    fn to_kinematics(&self) -> Kinematics {
+    fn to_kinematics(self) -> Kinematics {
         Kinematics {
             affine: self.affine,
             velocity: self.velocity,
@@ -113,7 +113,7 @@ impl ParticleDynamics {
     }
 
     /// Extracts the static properties for GPU upload.
-    fn to_properties(&self) -> ParticleProperties {
+    fn to_properties(self) -> ParticleProperties {
         ParticleProperties {
             init_volume: self.init_volume,
             init_radius: self.init_radius,
diff --git a/src_testbed/step.rs b/src_testbed/step.rs
index 9324e13..9f263f6 100644
--- a/src_testbed/step.rs
+++ b/src_testbed/step.rs
@@ -38,7 +38,7 @@ pub struct SimulationStepResult {
     /// Can be cast to the concrete model type using `bytemuck::cast_slice`.
     pub model_data_raw: Vec<u32>,
     /// Raw deformation gradient data read back from GPU, stored as f32 values.
-    /// Stride per particle is [`GPU_DEF_GRAD_STRIDE_F32`]: 4 in 2D (a
+    /// Stride per particle is `GPU_DEF_GRAD_STRIDE_F32`: 4 in 2D (a
     /// `mat2x2<f32>`), 12 in 3D (a `mat3x3<f32>` with `vec4`-aligned columns).
     /// In 3D only the first three entries of each column are meaningful; the
     /// fourth entry of each column is slang padding.