From 5c903ae1e89027a0e2f614794c5b1ca404aa6ee8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= <sebcrozet@dimforge.com>
Date: Wed, 24 Jun 2026 10:49:02 +0200
Subject: [PATCH 1/3] feat: further optimize p2g and particle sort through
 intra-blocks particles sorting

---
 shaders/slosh/grid/grid.slang                |  55 ++++++
 shaders/slosh/grid/sort.slang                | 195 ++++++++++++++++---
 shaders/slosh/solver/p2g_scatter_style.slang |  42 +++-
 src/grid/grid.rs                             |  67 ++++++-
 src/grid/sort.rs                             |   6 +
 5 files changed, 333 insertions(+), 32 deletions(-)
diff --git a/shaders/slosh/grid/grid.slang b/shaders/slosh/grid/grid.slang
index 1790642..5d9aa3f 100644
--- a/shaders/slosh/grid/grid.slang
+++ b/shaders/slosh/grid/grid.slang
@@ -211,13 +211,50 @@ public static const uint NUM_ASSOC_BLOCKS = 4;
 #else
 public static const uint NUM_ASSOC_BLOCKS = 8;
 #endif
+// Number of +1 neighbour blocks of a primary block (all associated blocks but the primary).
+public static const uint NUM_NBH_BLOCKS = NUM_ASSOC_BLOCKS - 1;
 public static const int OFF_BY_ONE = 1;
 
+// Number of "slab" buckets for the within-block counting sort of regular particles whose
+// primary block is the block being sorted. One bucket per associated-cell slab along the
+// slowest-varying node axis (y in 2D, z in 3D), so the sorted order lets P2G derive tight
+// per-chunk slab bounds for culling.
+#if DIM == 2
+public static const uint NUM_PRIMARY_SORT_BUCKETS = 8;
+// Number of slab buckets for "extras" (particles spilling in from a neighbour block). Their
+// associated slab relative to this block lies in [-2, block_width - 1], clamped at -2 (slabs
+// below -2 cannot influence any node of the block anyway): [-2, 7] -> 10 buckets in 2D.
+public static const uint NUM_EXTRA_SORT_BUCKETS = 10;
+#else
+public static const uint NUM_PRIMARY_SORT_BUCKETS = 4;
+// [-2, 3] -> 6 buckets in 3D.
+public static const uint NUM_EXTRA_SORT_BUCKETS = 6;
+#endif
+// Total number of within-block sort buckets. Primary buckets come first so primaries end up
+// contiguous in [first_particle, first_particle + num_particles), which G2P relies on.
+public static const uint NUM_SORT_BUCKETS = NUM_PRIMARY_SORT_BUCKETS + NUM_EXTRA_SORT_BUCKETS;
+
+// Minimum local base-cell index along an axis for a particle's quadratic stencil to spill into
+// the +1 neighbour block along that axis.
+#if DIM == 2
+public static const uint EXTRA_PARTICLE_MIN_SHIFT = 6;
+#else
+public static const uint EXTRA_PARTICLE_MIN_SHIFT = 2;
+#endif
+
 public struct ActiveBlockHeaderGeneric<MaybeAtomicUint> {
     public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block.
     public uint first_particle;
     public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block.
     public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively.
+    // Per-slab-bucket cursors for the within-block counting sort. The count pass accumulates
+    // per-bucket counts here; the prepare pass converts them in place to absolute insertion
+    // cursors (first_particle baked in); the finalize pass increments them while inserting.
+    // The resulting segment is ordered by slab key, which P2G uses for per-chunk culling.
+    public MaybeAtomicUint sort_bucket_cursors[NUM_SORT_BUCKETS];
+    // Header IDs of the +1 neighbour blocks, precomputed once per block to avoid repeated
+    // hashmap lookups in the count/finalize passes. Inactive neighbours are stored as NONE.
+    public BlockHeaderId nbh_block_ids[NUM_NBH_BLOCKS];
 }
 
 public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric<uint>;
@@ -342,6 +379,9 @@ public func mark_block_as_active(
         active_blocks[block_header_id].first_particle = 0u;
         active_blocks[block_header_id].num_particles_with_extras = 0u;
         active_blocks[block_header_id].num_particles = 0u;
+        for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
+            active_blocks[block_header_id].sort_bucket_cursors[k] = 0u;
+        }
         hmap_entries[slot].value = BlockHeaderId(block_header_id);
     }
 }
@@ -364,6 +404,21 @@ func div_ceil(x: uint, y: uint) -> uint {
     return (x + y - 1) / y;
 }
 
+// Snapshots the current `num_active_blocks` into a single-element buffer.
+//
+// Used by the two-pass block activation: after `touch_primary_blocks` has activated all
+// particle base blocks, this captures their count so that `touch_neighbor_blocks` only
+// iterates over base blocks (and not over the neighbour blocks it appends).
+[shader("compute")]
+[numthreads(1, 1, 1)]
+func capture_num_active_blocks(
+    uint3 invocation_id: SV_DispatchThreadID,
+    StructuredBuffer<Grid> grid,
+    RWStructuredBuffer<uint> num_base_blocks,
+) {
+    num_base_blocks[0] = grid[0].num_active_blocks;
+}
+
 [shader("compute")]
 [numthreads(1, 1, 1)]
 func init_indirect_workgroups(
diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang
index 8b9e52b..37f9664 100644
--- a/shaders/slosh/grid/sort.slang
+++ b/shaders/slosh/grid/sort.slang
@@ -3,6 +3,113 @@ module sort;
 import slosh.grid.grid;
 import slosh.solver.particle;
 
+// Returns the within-block sort bucket for a particle counted/inserted into its primary
+// block: one bucket per associated-cell slab along the slowest-varying node axis (y in 2D,
+// z in 3D).
+func primary_sort_bucket(assoc: vector<uint, DIM>) -> uint {
+#if DIM == 2
+    return assoc.y;
+#else
+    return assoc.z;
+#endif
+}
+
+// Returns the within-block sort bucket for a particle counted/inserted as an "extra" into the
+// neighbour block shifted by `bshift` from its primary block. The associated slab relative to
+// the neighbour block can be negative; slabs below -2 cannot influence any node of that block
+// (the quadratic stencil only covers slabs [assoc, assoc + 2]), so they are clamped at -2.
+func extra_sort_bucket(assoc: vector<uint, DIM>, bshift: vector<int, DIM>) -> uint {
+#if DIM == 2
+    let local = int(assoc.y) - bshift.y * 8;
+#else
+    let local = int(assoc.z) - bshift.z * 4;
+#endif
+    return NUM_PRIMARY_SORT_BUCKETS + uint(max(local, -2) + 2);
+}
+
+// True if a particle whose local base-cell index is `assoc` spills its quadratic stencil into
+// the +1 neighbour block reached by `bshift` (component 0 or 1 per axis).
+func extra_spills(assoc: vector<uint, DIM>, bshift: vector<int, DIM>) -> bool {
+    let mask = vector<uint, DIM>(assoc >= EXTRA_PARTICLE_MIN_SHIFT);
+    let bshift_u = vector<uint, DIM>(bshift);
+    return all((bshift_u * mask) == bshift_u);
+}
+
+// Marks only each particle's primary (base) block as active.
+//
+// First half of the two-pass block activation that replaces `touch_particle_blocks`: instead
+// of every particle redundantly inserting all NUM_ASSOC_BLOCKS of its stencil (~NUM_ASSOC_BLOCKS×
+// redundancy), each particle inserts only its base block here, and `touch_neighbor_blocks` then
+// inserts the +1 neighbour blocks once per active base block.
+[shader("compute")]
+[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
+func touch_primary_blocks(
+    uint3 invocation_id: SV_DispatchThreadID,
+    RWStructuredBuffer<AtomicGrid> grid,
+    RWStructuredBuffer<AtomicGridHashMapEntry> hmap_entries,
+    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
+    StructuredBuffer<Position> particles_pos,
+    ConstantBuffer<uint> particles_len,
+) {
+    let id = invocation_id.x;
+    if (id < particles_len) {
+        let cell_width = grid[0].cell_width;
+        let particle = particles_pos[id];
+        let block = block_associated_to_point(cell_width, particle.pt);
+        mark_block_as_active(grid, hmap_entries, active_blocks, block);
+    }
+}
+
+// Marks the +1 neighbour blocks of every already-active base block as active.
+//
+// Second half of the two-pass block activation. One thread per base block: reads the block's
+// virtual id and inserts its NUM_NBH_BLOCKS forward neighbours into the hashmap. `num_base_blocks`
+// is a snapshot of `num_active_blocks` taken before this pass runs, so the neighbour blocks
+// appended during the pass (at indices >= num_base_blocks) are not themselves processed.
+[shader("compute")]
+[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
+func touch_neighbor_blocks(
+    uint3 invocation_id: SV_DispatchThreadID,
+    RWStructuredBuffer<AtomicGrid> grid,
+    RWStructuredBuffer<AtomicGridHashMapEntry> hmap_entries,
+    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
+    StructuredBuffer<uint> num_base_blocks,
+) {
+    let id = invocation_id.x;
+    if (id < num_base_blocks[0]) {
+        let vid = active_blocks[id].virtual_id;
+        var blocks = blocks_associated_to_block(vid);
+        for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
+            mark_block_as_active(grid, hmap_entries, active_blocks, blocks[i]);
+        }
+    }
+}
+
+// Precomputes, for each active block, the header IDs of its +1 neighbour blocks.
+//
+// Each particle contributes "extras" to the +1 neighbour blocks of its primary block. Rather
+// than re-querying the hashmap for those neighbours once per particle in the count/finalize
+// passes, this resolves them once per active block and caches them in `nbh_block_ids`. Inactive
+// neighbours are stored as NONE. Must run after all blocks have been touched.
+[shader("compute")]
+[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
+func update_nbh_block_ids(
+    uint3 invocation_id: SV_DispatchThreadID,
+    StructuredBuffer<Grid> grid,
+    StructuredBuffer<GridHashMapEntry> hmap_entries,
+    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
+) {
+    let id = invocation_id.x;
+    if (id < grid[0].num_active_blocks) {
+        let vid = active_blocks[id].virtual_id;
+        var assoc = blocks_associated_to_block(vid);
+        for (var nbh = 0u; nbh < NUM_NBH_BLOCKS; nbh += 1u) {
+            let nbh_hid = find_block_header_id(grid, hmap_entries, assoc[nbh + 1]);
+            active_blocks[id].nbh_block_ids[nbh] = nbh_hid;
+        }
+    }
+}
+
 [shader("compute")]
 [numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
 func touch_particle_blocks(
@@ -109,18 +216,24 @@ func update_block_particle_count(
         let particle = particles_pos[id];
 
         var blocks = blocks_associated_to_point(cell_width, particle.pt);
-        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
-        active_blocks[active_block_id_0.id].num_particles.add(1u);
-        active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u);
-
         let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let mask = uint3(assoc >= 2);
 
+        // The particle's primary (base) block gets it as a regular particle. Only the per-slab
+        // bucket counter is incremented: `num_particles` (sum of primary buckets) and
+        // `num_particles_with_extras` (sum of all buckets) are derived per block in the copy pass,
+        // so we avoid two extra — and heavily contended — atomics per particle here.
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u);
+
+        // Each +1 neighbour block also receives the particle as an "extra" if its quadratic
+        // stencil actually spills into it. The neighbour header IDs were precomputed by
+        // `update_nbh_block_ids`, so we read them from the primary block instead of doing a
+        // hashmap lookup per particle.
         for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
             let bshift = blocks[i].id - blocks[0].id;
-            if (all((bshift * mask) == bshift)) {
-                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
-                active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
+            if (extra_spills(assoc, bshift)) {
+                let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1];
+                active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u);
             }
         }
     }
@@ -136,7 +249,14 @@ func copy_particles_len_to_scan_value(
 ) {
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
-        scan_values[id] = active_blocks[id].num_particles_with_extras;
+        // The sorted array reserves room for every particle a block touches, extras included.
+        // `num_particles_with_extras` is the sum of all slab buckets (the count pass no longer
+        // maintains it as a separate atomic).
+        var total = 0u;
+        for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
+            total += active_blocks[id].sort_bucket_cursors[k];
+        }
+        scan_values[id] = total;
     }
 }
 
@@ -150,9 +270,28 @@ func copy_scan_values_to_first_particles_and_prepare_for_finalize(
 ) {
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
-        active_blocks[id].first_particle = scan_values[id];
-        active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles;
-        active_blocks[id].num_particles = 0u;
+        let first = scan_values[id];
+        active_blocks[id].first_particle = first;
+        // Convert the per-bucket counts accumulated by the count pass into running insertion
+        // cursors. The cursors are *absolute* offsets into the sorted array (i.e. `first_particle`
+        // is baked in), so the finalize pass can scatter each particle with a single atomic.
+        // Primary buckets come first, so primaries land in [first_particle, first_particle +
+        // num_particles) as the scatter P2G expects, with the extras after them; both segments
+        // end up ordered by slab key.
+        //
+        // The running total advanced past the primary buckets is `num_particles`; the grand total
+        // is `num_particles_with_extras`. Both are derived here rather than maintained as
+        // per-particle atomics in the count pass.
+        var running = first;
+        for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
+            if (k == NUM_PRIMARY_SORT_BUCKETS) {
+                active_blocks[id].num_particles = running - first;
+            }
+            let count = active_blocks[id].sort_bucket_cursors[k];
+            active_blocks[id].sort_bucket_cursors[k] = running;
+            running += count;
+        }
+        active_blocks[id].num_particles_with_extras = running - first;
     }
 }
 
@@ -176,29 +315,31 @@ func finalize_particles_sort(
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
 
-        // Place the particle to its sorted place.
         var blocks = blocks_associated_to_point(cell_width, particle.pt);
-        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
-        let target_index = active_blocks[active_block_id_0.id].first_particle +
-            active_blocks[active_block_id_0.id].num_particles.add(1u);
-        sorted_particle_ids[target_index] = id;
-
         let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let mask = uint3(assoc >= 2);
 
+        // Place the particle in its primary block's range. The prepare pass turned the bucket
+        // counts into absolute insertion cursors (first_particle baked in), so the atomically
+        // claimed slot is already the final sorted index.
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        let slot0 = active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u);
+        sorted_particle_ids[slot0] = id;
+
+        // Place the particle as an "extra" into each +1 neighbour block whose stencil it spills
+        // into, using the extra slab bucket cursors (extras land after the primaries because
+        // their buckets come last). Neighbour header IDs are reused from `update_nbh_block_ids`.
         for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
             let bshift = blocks[i].id - blocks[0].id;
-            if (all((bshift * mask) == bshift)) {
-                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
-                let target_index = active_blocks[active_block_id_i.id].first_particle +
-                    active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
-                sorted_particle_ids[target_index] = id;
+            if (extra_spills(assoc, bshift)) {
+                let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1];
+                let slot_i = active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u);
+                sorted_particle_ids[slot_i] = id;
             }
         }
 
-        // Setup the per-node particle linked-list.
-        let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id);
+        // Setup the per-node particle linked-list (still consumed by the gather P2G and the
+        // rigid/CDF paths).
+        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), assoc);
         let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id);
         nodes_linked_lists[node_global_id.id].len.add(1u);
         particle_node_linked_lists[id] = prev_head;
diff --git a/shaders/slosh/solver/p2g_scatter_style.slang b/shaders/slosh/solver/p2g_scatter_style.slang
index b94000e..d11f4db 100644
--- a/shaders/slosh/solver/p2g_scatter_style.slang
+++ b/shaders/slosh/solver/p2g_scatter_style.slang
@@ -33,6 +33,10 @@ groupshared float3 shared_momentum[WORKGROUP_SIZE];
 groupshared float3x3 shared_affine[WORKGROUP_SIZE];
 #endif
 groupshared float shared_mass[WORKGROUP_SIZE];
+// Per-particle slab key (associated-cell coordinate along the slowest node axis, relative to
+// the block), used to derive the per-chunk culling bounds. Matches the bucket key the sort used,
+// so the keys are ascending within the primaries and within the extras of each chunk.
+groupshared int shared_zkey[WORKGROUP_SIZE];
 
 [shader("compute")]
 [numthreads(WORKGROUP_SIZE, 1, 1)]
@@ -53,6 +57,9 @@ func p2g_scatter_style(
     let first_particle = active_blocks[bid].first_particle;
     let num_particles = active_blocks[bid].num_particles_with_extras;
     let last_particle = first_particle + num_particles;
+    // End of the primaries segment: the sorted slab keys are ascending within the primaries and
+    // within the extras, so the per-chunk slab bounds need this boundary.
+    let primaries_end = first_particle + active_blocks[bid].num_particles;
     let block_vid = active_blocks[bid].virtual_id.id;
 
     // Each thread owns one cell (grid node) of this block and accumulates the
@@ -64,10 +71,13 @@ func p2g_scatter_style(
     let local_cell = int2(int(tid % 8u), int(tid / 8u));
     let cell_pos = float2(block_vid * 8 + local_cell) * cell_width;
     var acc = float3(0.0);
+    // This thread's node slab along the sort axis (the slowest-varying node axis).
+    let node_slab = local_cell.y;
 #else
     let local_cell = int3(int(tid % 4u), int((tid / 4u) % 4u), int(tid / 16u));
     let cell_pos = float3(block_vid * 4 + local_cell) * cell_width;
     var acc = float4(0.0);
+    let node_slab = local_cell.z;
 #endif
 
     // Stream the block's particles through shared memory one chunk at a time.
@@ -79,10 +89,21 @@ func p2g_scatter_style(
         if (load_idx < last_particle) {
             let pid = sorted_particle_ids[load_idx];
             let kin = particles_kin[pid];
-            shared_pos[tid] = particles_pos[pid].pt;
+            let pos = particles_pos[pid].pt;
+            shared_pos[tid] = pos;
             shared_mass[tid] = kin.mass;
             shared_affine[tid] = kin.affine;
             shared_momentum[tid] = kin.velocity * kin.mass + kin.force_dt;
+
+            // Slab key along the sort axis, relative to this block; must match the bucket key
+            // used by the sort (same associated-cell rounding, same clamp at -2) so the shared
+            // keys stay ascending within each segment.
+            let assoc_cell = round(pos / cell_width) - 1.0;
+#if DIM == 2
+            shared_zkey[tid] = max(int(assoc_cell.y) - block_vid.y * 8, -2);
+#else
+            shared_zkey[tid] = max(int(assoc_cell.z) - block_vid.z * 4, -2);
+#endif
         }
 
         GroupMemoryBarrierWithGroupSync();
@@ -90,7 +111,24 @@ func p2g_scatter_style(
         // `chunk_len` is uniform across the workgroup, so the barriers above stay in
         // uniform control flow regardless of the total particle count.
         let chunk_len = min(WORKGROUP_SIZE, last_particle - chunk_base);
-        for (var p = 0u; p < chunk_len; p += 1u) {
+
+        // Per-chunk slab bounds, exact thanks to the within-block sort: the keys are ascending
+        // within the primaries and within the extras, so the range is given by the chunk's
+        // first/last key, plus the two values around the primaries/extras boundary if the chunk
+        // straddles it.
+        var zmin = shared_zkey[0];
+        var zmax = shared_zkey[chunk_len - 1u];
+        if (primaries_end > chunk_base && primaries_end < chunk_base + chunk_len) {
+            let b = primaries_end - chunk_base;
+            zmin = min(zmin, shared_zkey[b]);
+            zmax = max(zmax, shared_zkey[b - 1u]);
+        }
+
+        // A particle with slab key `a` only influences nodes in slabs [a, a + 2]: skip the whole
+        // chunk if this thread's node slab is outside the chunk's dilated slab range. When that
+        // holds for every thread of a warp, the warp skips the chunk entirely.
+        let culled_len = (node_slab >= zmin && node_slab <= zmax + 2) ? chunk_len : 0u;
+        for (var p = 0u; p < culled_len; p += 1u) {
             let dpt = cell_pos - shared_pos[p];
 #if DIM == 2
             let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
diff --git a/src/grid/grid.rs b/src/grid/grid.rs
index 637457a..78bf651 100644
--- a/src/grid/grid.rs
+++ b/src/grid/grid.rs
@@ -21,6 +21,7 @@ pub struct WgGrid<B: Backend> {
     reset_hmap: GpuFunction<B>,
     reset: GpuFunction<B>,
     init_indirect_workgroups: GpuFunction<B>,
+    capture_num_active_blocks: GpuFunction<B>,
 }
 
 // TODO: should we have all the kernel launches just use
@@ -36,6 +37,9 @@ struct GridArgs<'a, B: Backend> {
     nodes_linked_lists: &'a GpuVector<[u32; 2], B>,
     rigid_nodes_linked_lists: &'a GpuVector<[u32; 2], B>,
     scan_values: &'a GpuVector<u32, B>,
+    // Snapshot of `num_active_blocks` after the primary-block touch pass (the base-block count),
+    // consumed by `touch_neighbor_blocks`.
+    num_base_blocks: &'a GpuVector<u32, B>,
     // From particles
     particles_pos: &'a GpuVector<ParticlePosition, B>,
     particles_len: &'a GpuScalar<u32, B>,
@@ -86,6 +90,7 @@ impl<B: Backend> WgGrid<B> {
             nodes_linked_lists: &grid.nodes_linked_lists,
             rigid_nodes_linked_lists: &grid.rigid_nodes_linked_lists,
             scan_values: &grid.scan_values,
+            num_base_blocks: &grid.active_blocks_snapshot,
             particles_pos: particles.positions(),
             particles_len: particles.gpu_len(),
             active_blocks: &grid.active_blocks,
@@ -108,13 +113,34 @@ impl<B: Backend> WgGrid<B> {
             self.reset_hmap
                 .launch(backend, pass, &args, [grid.cpu_meta.hmap_capacity, 1, 1])?;
 
-            sort_module.touch_particle_blocks.launch_capped(
+            // Block activation in two passes (cheaper than every particle inserting all
+            // NUM_ASSOC_BLOCKS of its stencil):
+            // 1. Each particle activates only its primary (base) block.
+            // 2. Each base block activates its +1 neighbour blocks (once per block, not once
+            //    per particle). `active_blocks_snapshot` records the base-block count so pass 2
+            //    doesn't reprocess the neighbours it appends.
+            sort_module.touch_primary_blocks.launch_capped(
                 backend,
                 pass,
                 &args,
                 particles.len() as u32,
             )?;
 
+            self.capture_num_active_blocks
+                .launch_grid(backend, pass, &args, [1, 1, 1])?;
+
+            // Indirect dispatch sized for the base blocks (the neighbour pass runs one thread
+            // per base block).
+            self.init_indirect_workgroups
+                .launch_grid(backend, pass, &args, [1, 1, 1])?;
+
+            sort_module.touch_neighbor_blocks.launch_indirect(
+                backend,
+                pass,
+                &args,
+                grid.indirect_n_blocks_groups.buffer(),
+            )?;
+
             // // Ensure blocks exist wherever we have rigid particles that might affect
             // // other blocks. This is done in two passes:
             // // 1. Mark all rigid particles that need to ensure it’s associated block exists
@@ -147,10 +173,19 @@ impl<B: Backend> WgGrid<B> {
         // - Launch write_blocks_multiplicity_to_scan_value
         // - Launch cumulated sum
 
-        // Prepare workgroups for indirect dispatches based on the number of active blocks.
+        // Prepare workgroups for indirect dispatches based on the (final) number of active blocks.
         self.init_indirect_workgroups
             .launch_grid(backend, pass, &args, [1, 1, 1])?;
 
+        // Cache each active block's +1 neighbour header IDs so the count/finalize passes don't
+        // re-query the hashmap per particle per neighbour.
+        sort_module.update_nbh_block_ids.launch_indirect(
+            backend,
+            pass,
+            &args,
+            grid.indirect_n_blocks_groups.buffer(),
+        )?;
+
         sort_module.update_block_particle_count.launch_capped(
             backend,
             pass,
@@ -255,9 +290,26 @@ impl Default for GpuGridHashMapEntry {
     }
 }
 
+/// Number of within-block slab sort buckets (primary + extra). Must match `NUM_SORT_BUCKETS`
+/// in `shaders/slosh/grid/grid.slang`.
+#[cfg(feature = "dim2")]
+const NUM_SORT_BUCKETS: usize = 18; // 8 primary + 10 extra
+#[cfg(feature = "dim3")]
+const NUM_SORT_BUCKETS: usize = 10; // 4 primary + 6 extra
+
+/// Number of +1 neighbour blocks of a primary block. Must match `NUM_NBH_BLOCKS` in
+/// `shaders/slosh/grid/grid.slang` (`NUM_ASSOC_BLOCKS - 1`).
+#[cfg(feature = "dim2")]
+const NUM_NBH_BLOCKS: usize = 3;
+#[cfg(feature = "dim3")]
+const NUM_NBH_BLOCKS: usize = 7;
+
 /// Header for an active grid block containing particles.
 ///
-/// Tracks which particles belong to this block for efficient iteration.
+/// Tracks which particles belong to this block for efficient iteration. This buffer is
+/// GPU-private (allocated uninitialized, written and read only on the GPU), so the field
+/// layout only needs to be at least as large as the shader-side `ActiveBlockHeader` for the
+/// allocation to be correctly sized; the field order is kept in sync for clarity.
 #[derive(Copy, Clone, PartialEq, Pod, Zeroable)]
 #[repr(C)]
 pub struct GpuActiveBlockHeader {
@@ -265,6 +317,10 @@ pub struct GpuActiveBlockHeader {
     first_particle: u32,
     num_particles_with_extras: u32,
     num_particles: u32,
+    /// Per-slab-bucket insertion cursors for the within-block counting sort.
+    sort_bucket_cursors: [u32; NUM_SORT_BUCKETS],
+    /// Cached header IDs of the +1 neighbour blocks (resolved by `update_nbh_block_ids`).
+    nbh_block_ids: [u32; NUM_NBH_BLOCKS],
 }
 
 /// Cached collision information for a grid node.
@@ -325,6 +381,9 @@ pub struct GpuGrid<B: Backend> {
     pub prev_node_collisions: GpuVector<GpuNodeCollision, B>,
     /// Active block headers tracking particle ranges.
     pub active_blocks: GpuVector<GpuActiveBlockHeader, B>,
+    /// Single-element snapshot of `num_active_blocks` after the primary-block touch pass,
+    /// used by the two-pass block activation.
+    pub active_blocks_snapshot: GpuVector<u32, B>,
     /// Workspace for prefix sum operations.
     pub scan_values: GpuVector<u32, B>,
     /// Per-node linked lists for MPM particles.
@@ -393,6 +452,7 @@ impl<B: Backend> GpuGrid<B> {
         let rigid_nodes_linked_lists =
             GpuVector::vector_uninit(backend, capacity * NODES_PER_BLOCK, BufferUsages::STORAGE)?;
         let active_blocks = GpuVector::vector_uninit(backend, capacity, BufferUsages::STORAGE)?;
+        let active_blocks_snapshot = GpuVector::vector_uninit(backend, 1, BufferUsages::STORAGE)?;
         let scan_values = GpuVector::vector_uninit(backend, capacity, BufferUsages::STORAGE)?;
         let indirect_n_blocks_groups = Arc::new(GpuVector::scalar_uninit(
             backend,
@@ -414,6 +474,7 @@ impl<B: Backend> GpuGrid<B> {
             node_collisions,
             prev_node_collisions,
             active_blocks,
+            active_blocks_snapshot,
             scan_values,
             indirect_n_blocks_groups,
             indirect_n_g2p_p2g_groups,
diff --git a/src/grid/sort.rs b/src/grid/sort.rs
index eb9f955..92020f6 100644
--- a/src/grid/sort.rs
+++ b/src/grid/sort.rs
@@ -15,7 +15,13 @@ use stensor::tensor::GpuScalar;
 #[derive(Shader)]
 #[shader(module = "slosh::grid::sort")]
 pub struct WgSort<B: Backend> {
+    // Legacy single-pass block activation, superseded by the two-pass
+    // touch_primary_blocks / touch_neighbor_blocks. Kept bound for reference/fallback.
+    #[allow(dead_code)]
     pub(crate) touch_particle_blocks: GpuFunction<B>,
+    pub(crate) touch_primary_blocks: GpuFunction<B>,
+    pub(crate) touch_neighbor_blocks: GpuFunction<B>,
+    pub(crate) update_nbh_block_ids: GpuFunction<B>,
     // Bound to GPU kernels; currently only used by commented-out rigid-particle code paths.
     #[allow(dead_code)]
     pub(crate) touch_rigid_particle_blocks: GpuFunction<B>,

From 5d418905417db12bb3b17afbaf6865d0328035f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= <sebcrozet@dimforge.com>
Date: Wed, 24 Jun 2026 10:49:20 +0200
Subject: [PATCH 2/3] feat: make p2g and g2p overwrittable

---
 src/pipeline.rs | 61 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/src/pipeline.rs b/src/pipeline.rs
index c6c34b7..0f38cfa 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -79,6 +79,38 @@ pub trait MpmPipelineHooks<B: Backend, GpuModel: GpuParticleModelData> {
         Ok(())
     }
 
+    /// Replace the built-in Particle-To-Grid transfer.
+    ///
+    /// Return `Ok(true)` if the transfer was handled by this hook; the pipeline then
+    /// skips its built-in P2G. Return `Ok(false)` (the default) to keep slosh's P2G.
+    /// The hook is responsible for beginning its own compute pass.
+    fn run_p2g(
+        &mut self,
+        _backend: &B,
+        _encoder: &mut B::Encoder,
+        _data: &mut MpmData<B, GpuModel>,
+        _state: &mut dyn Any,
+        _timestamps: Option<&mut GpuTimestamps>,
+    ) -> Result<bool, B::Error> {
+        Ok(false)
+    }
+
+    /// Replace the built-in Grid-To-Particle transfer.
+    ///
+    /// Return `Ok(true)` if the transfer was handled by this hook; the pipeline then
+    /// skips its built-in G2P. Return `Ok(false)` (the default) to keep slosh's G2P.
+    /// The hook is responsible for beginning its own compute pass.
+    fn run_g2p(
+        &mut self,
+        _backend: &B,
+        _encoder: &mut B::Encoder,
+        _data: &mut MpmData<B, GpuModel>,
+        _state: &mut dyn Any,
+        _timestamps: Option<&mut GpuTimestamps>,
+    ) -> Result<bool, B::Error> {
+        Ok(false)
+    }
+
     /// Custom operation run after the main Particle-To-Grid transfer.
     fn after_p2g(
         &mut self,
@@ -485,17 +517,16 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
         //     )?;
         // }
 
-        {
+        // Let a hook replace the built-in P2G transfer (e.g. to fuse extra fields into the
+        // same sweep). When no hook handles it, run slosh's default scatter-style P2G.
+        if !hooks.run_p2g(
+            backend,
+            encoder,
+            data,
+            hooks_state,
+            timestamps.as_deref_mut(),
+        )? {
             let mut pass = encoder.begin_pass("p2g", timestamps.as_deref_mut());
-            // self.p2g.launch(
-            //     backend,
-            //     &mut pass,
-            //     &data.grid,
-            //     &data.particles,
-            //     &data.impulses,
-            //     &data.bodies,
-            //     &data.body_materials,
-            // )?;
             self.p2g_scatter_style.launch(
                 backend,
                 &mut pass,
@@ -535,7 +566,15 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
             timestamps.as_deref_mut(),
         )?;
 
-        {
+        // Let a hook replace the built-in G2P transfer (e.g. to fuse extra fields into the
+        // same sweep). When no hook handles it, run slosh's default G2P.
+        if !hooks.run_g2p(
+            backend,
+            encoder,
+            data,
+            hooks_state,
+            timestamps.as_deref_mut(),
+        )? {
             let mut pass = encoder.begin_pass("g2p", timestamps.as_deref_mut());
             self.g2p.launch(
                 backend,

From 817c66c125a7b7b8a783fec1498bce8a89a2cbf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= <sebcrozet@dimforge.com>
Date: Wed, 24 Jun 2026 11:04:23 +0200
Subject: [PATCH 3/3] chore: formatting fixes

---
 shaders/slosh/solver/particle_update.slang | 219 ++++++++++-----------
 shaders/slosh/solver/timestep_bound.slang  |  67 ++++---
 2 files changed, 141 insertions(+), 145 deletions(-)

diff --git a/shaders/slosh/solver/particle_update.slang b/shaders/slosh/solver/particle_update.slang
index 2e10f5d..bbbc0f3 100644
--- a/shaders/slosh/solver/particle_update.slang
+++ b/shaders/slosh/solver/particle_update.slang
@@ -47,117 +47,114 @@ func particle_update(
     let _gs_total = min((_gs_n + 63u) / 64u, 65535u) * 64u;
     for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) {
 
-    let model = ParticleModel();
-    let flags = model.model_flags(particles_model, particle_id);
-    let dt = params.dt;
-    let cell_width = grid[0].cell_width;
-    var kin = particles_kin[particle_id];
-//    let cdf = particles_cdf[particle_id];
-    var def_grad = particles_def_grad[particle_id];
-    let props = particles_props[particle_id];
-    let particle_pos = particles_pos[particle_id].pt;
-
-
-    /*
-     * Advection.
-     */
-     // TODO: double check that we never need the reprojection below.
-     //       This reprojection isn't part of the original MPM-MLS/CPIC paper but we added
-     //       it at some point as it appeared that we'd still get some penetrating particles.
-     //       However, that might have been caused by other bugs so it is unsure if we need to
-     //       keep it now.
-//     if (cdf.signed_distance < -0.05 * cell_width) {
-//         let slip = BoundaryCondition(BoundaryConditionType::Slip, 0.0);
-//         kin.velocity = cdf.rigid_vel + slip.project_velocity((kin.velocity - cdf.rigid_vel), cdf.normal);
-//     }
-
-    // Clamp the max velocity a particle can get.
-    // TODO: clamp the grid velocities instead?
-    if (length(kin.velocity) > cell_width / dt) {
-        kin.velocity = kin.velocity / length(kin.velocity) * cell_width / dt;
-    }
-
-    // Apply Rayleigh mass-proportional damping (implicit integration for stability).
-    // v_new = v / (1 + damping * dt)
-    kin.velocity = kin.velocity / (1.0 + props.damping * dt);
-
-
-    // If the particle is fixed, clear its velocity and its affine matrix.
-    // This isn't ideal (this should typically be handled on the grid) but we sometimes
-    // need sub-grid-sized fixed particles.
-    if (props.fixed != 0) {
-        kin.velocity = Vect<float>(0.0);
-    }
-
-    let new_particle_pos = particle_pos + kin.velocity * dt;
-
-//    /*
-//     * Penalty impulse.
-//     */
-//     // TODO: apply the penalty impulse as an extra force on the grid instead of on
-//     //       changing the particle velocity directly?
-//     static const float PENALTY_COEFF = 1.0e3;
-//     if (cdf.signed_distance < -0.05 * cell_width) { // && cdf.signed_distance > -0.3 * cell_width {
-//         let corrected_dist = max(cdf.signed_distance, -0.3 * cell_width);
-//         let impulse = (dt * -corrected_dist * PENALTY_COEFF) * cdf.normal;
-//         kin.velocity += impulse; // / curr_particle_vol.mass;
-//     }
-
-    /*
-     * Deformation gradient update.
-     */
-    if ((flags & ModelFlags::FLUID) == 0) {
+        let model = ParticleModel();
+        let flags = model.model_flags(particles_model, particle_id);
+        let dt = params.dt;
+        let cell_width = grid[0].cell_width;
+        var kin = particles_kin[particle_id];
+    //    let cdf = particles_cdf[particle_id];
+        var def_grad = particles_def_grad[particle_id];
+        let props = particles_props[particle_id];
+        let particle_pos = particles_pos[particle_id].pt;
+
+        /*
+         * Advection.
+         */
+         // TODO: double check that we never need the reprojection below.
+         //       This reprojection isn't part of the original MPM-MLS/CPIC paper but we added
+         //       it at some point as it appeared that we'd still get some penetrating particles.
+         //       However, that might have been caused by other bugs so it is unsure if we need to
+         //       keep it now.
+    //     if (cdf.signed_distance < -0.05 * cell_width) {
+    //         let slip = BoundaryCondition(BoundaryConditionType::Slip, 0.0);
+    //         kin.velocity = cdf.rigid_vel + slip.project_velocity((kin.velocity - cdf.rigid_vel), cdf.normal);
+    //     }
+
+        // Clamp the max velocity a particle can get.
+        // TODO: clamp the grid velocities instead?
+        if (length(kin.velocity) > cell_width / dt) {
+            kin.velocity = kin.velocity / length(kin.velocity) * cell_width / dt;
+        }
+
+        // Apply Rayleigh mass-proportional damping (implicit integration for stability).
+        // v_new = v / (1 + damping * dt)
+        kin.velocity = kin.velocity / (1.0 + props.damping * dt);
+
+        // If the particle is fixed, clear its velocity and its affine matrix.
+        // This isn't ideal (this should typically be handled on the grid) but we sometimes
+        // need sub-grid-sized fixed particles.
+        if (props.fixed != 0) {
+            kin.velocity = Vect<float>(0.0);
+        }
+
+        let new_particle_pos = particle_pos + kin.velocity * dt;
+
+    //    /*
+    //     * Penalty impulse.
+    //     */
+    //     // TODO: apply the penalty impulse as an extra force on the grid instead of on
+    //     //       changing the particle velocity directly?
+    //     static const float PENALTY_COEFF = 1.0e3;
+    //     if (cdf.signed_distance < -0.05 * cell_width) { // && cdf.signed_distance > -0.3 * cell_width {
+    //         let corrected_dist = max(cdf.signed_distance, -0.3 * cell_width);
+    //         let impulse = (dt * -corrected_dist * PENALTY_COEFF) * cdf.normal;
+    //         kin.velocity += impulse; // / curr_particle_vol.mass;
+    //     }
+
+        /*
+         * Deformation gradient update.
+         */
+        if ((flags & ModelFlags::FLUID) == 0) {
+            // NOTE: the velocity gradient was stored in the affine buffer.
+            def_grad += mul(def_grad, kin.affine * dt);
+        } else {
+            let def_grad0 = def_grad[0][0];
+            var new_def_grad_diag_elt = def_grad0 + def_grad0 * kin.vel_grad_det * dt;
+            def_grad = diag(Vect<float>(new_def_grad_diag_elt));
+        }
+
+        // NOTE: we only reset the velocity gradient to zero for fixed particles **after** the deformation
+        //       gradient update so we keep its elastic response. It acts as a cube of elastic material which
+        //       center doesn’t move but can deform.
+        if (props.fixed != 0) {
+            kin.affine = Mat<float>(0.0);
+        }
+
+        /*
+         * Constitutive model.
+         */
+        let update_data = ParticleUpdateData(dt, cell_width, particle_id, kin.affine, props.init_volume, props.phase);
+        let update_result = model.update(particles_model, update_data, def_grad);
+
+        // Write back init_volume if the model modified it (e.g., swell).
+        let effective_init_volume = select(update_result.init_volume >= 0.0, update_result.init_volume, props.init_volume);
+        if (update_result.init_volume >= 0.0) {
+            particles_props[particle_id].init_volume = update_result.init_volume;
+        }
+
+        /*
+         * Affine matrix for APIC transfer.
+         */
+        let inv_d = QuadraticKernel::inv_d(cell_width);
         // NOTE: the velocity gradient was stored in the affine buffer.
-        def_grad += mul(def_grad, kin.affine * dt);
-    } else {
-        let def_grad0 = def_grad[0][0];
-        var new_def_grad_diag_elt = def_grad0 + def_grad0 * kin.vel_grad_det * dt;
-        def_grad = diag(Vect<float>(new_def_grad_diag_elt));
-    }
-
-    // NOTE: we only reset the velocity gradient to zero for fixed particles **after** the deformation
-    //       gradient update so we keep its elastic response. It acts as a cube of elastic material which
-    //       center doesn’t move but can deform.
-    if (props.fixed != 0) {
-        kin.affine = Mat<float>(0.0);
-    }
-
-    /*
-     * Constitutive model.
-     */
-
-    let update_data = ParticleUpdateData(dt, cell_width, particle_id, kin.affine, props.init_volume, props.phase);
-    let update_result = model.update(particles_model, update_data, def_grad);
-
-    // Write back init_volume if the model modified it (e.g., swell).
-    let effective_init_volume = select(update_result.init_volume >= 0.0, update_result.init_volume, props.init_volume);
-    if (update_result.init_volume >= 0.0) {
-        particles_props[particle_id].init_volume = update_result.init_volume;
-    }
-
-    /*
-     * Affine matrix for APIC transfer.
-     */
-    let inv_d = QuadraticKernel::inv_d(cell_width);
-    // NOTE: the velocity gradient was stored in the affine buffer.
-    let affine = kin.affine * kin.mass - update_result.kirchoff_stress * (effective_init_volume * inv_d * dt);
-
-    /*
-     * Write back the new particle properties.
-     */
-    if (all(new_particle_pos == new_particle_pos) && determinant(def_grad) > 0.0) { // Avoid NaNs and invalid deformation gradients
-        particles_pos[particle_id].pt = new_particle_pos;
-        kin.affine = affine;
-    } else {
-        kin.enabled = 0; // This particles diverged, disable it.
-        kin.velocity = Vect<float>(0.0);
-        def_grad = diag(Vect<float>(1.0));
-        kin.affine = Mat<float>(0.0);
-        kin.mass = 0.0;
-    }
-    kin.force_dt = Vect<float>(0.0);
-
-    particles_kin[particle_id] = kin;
-    particles_def_grad[particle_id] = def_grad;
+        let affine = kin.affine * kin.mass - update_result.kirchoff_stress * (effective_init_volume * inv_d * dt);
+
+        /*
+         * Write back the new particle properties.
+         */
+        if (all(new_particle_pos == new_particle_pos) && determinant(def_grad) > 0.0) { // Avoid NaNs and invalid deformation gradients
+            particles_pos[particle_id].pt = new_particle_pos;
+            kin.affine = affine;
+        } else {
+            kin.enabled = 0; // This particles diverged, disable it.
+            kin.velocity = Vect<float>(0.0);
+            def_grad = diag(Vect<float>(1.0));
+            kin.affine = Mat<float>(0.0);
+            kin.mass = 0.0;
+        }
+        kin.force_dt = Vect<float>(0.0);
+
+        particles_kin[particle_id] = kin;
+        particles_def_grad[particle_id] = def_grad;
     }
 }
diff --git a/shaders/slosh/solver/timestep_bound.slang b/shaders/slosh/solver/timestep_bound.slang
index 9121933..18809ce 100644
--- a/shaders/slosh/solver/timestep_bound.slang
+++ b/shaders/slosh/solver/timestep_bound.slang
@@ -49,45 +49,44 @@ func estimate_timestep_bound(
     let _gs_n = particles_len;
     let _gs_total = min((_gs_n + (WORKGROUP_SIZE - 1u)) / WORKGROUP_SIZE, 65535u) * WORKGROUP_SIZE;
     for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) {
+        if (particles_kin[particle_id].enabled == 0) {
+            continue;
+        }
 
-    if (particles_kin[particle_id].enabled == 0) {
-        continue;
-    }
-
-    let cell_width = grid[0].cell_width;
+        let cell_width = grid[0].cell_width;
 
-    /*
-     * Model-specific restrictions.
-     * Will usually be based on sounds speed for the material (section 4.1).
-     */
-    let density0 = particles_kin[particle_id].mass / particles_props[particle_id].init_volume;
-    let def_grad = particles_def_grad[particle_id];
-    let velocity = particles_kin[particle_id].velocity;
-    let affine = particles_kin[particle_id].affine;
-    let mass = particles_kin[particle_id].mass;
+        /*
+         * Model-specific restrictions.
+         * Will usually be based on sounds speed for the material (section 4.1).
+         */
+        let density0 = particles_kin[particle_id].mass / particles_props[particle_id].init_volume;
+        let def_grad = particles_def_grad[particle_id];
+        let velocity = particles_kin[particle_id].velocity;
+        let affine = particles_kin[particle_id].affine;
+        let mass = particles_kin[particle_id].mass;
 
-    let model = ParticleModel();
-    var dt = model.timestep_bound(particles_model, particle_id, density0, def_grad, velocity, cell_width);
+        let model = ParticleModel();
+        var dt = model.timestep_bound(particles_model, particle_id, density0, def_grad, velocity, cell_width);
 
-    /*
-     * Velocity-based restrictions (section 4.2)
-     */
-   var norm_affine_squared = 0.0;
-   [[ForceUnroll]]
-   for (var i = 0; i < DIM; i++) {
-      [[ForceUnroll]]
-      for (var j = 0; j < DIM; j++) {
-        norm_affine_squared += affine[i][j] * affine[i][j];
-      }
-   }
+        /*
+         * Velocity-based restrictions (section 4.2)
+         */
+       var norm_affine_squared = 0.0;
+       [[ForceUnroll]]
+       for (var i = 0; i < DIM; i++) {
+          [[ForceUnroll]]
+          for (var j = 0; j < DIM; j++) {
+            norm_affine_squared += affine[i][j] * affine[i][j];
+          }
+       }
 
-   let d = (cell_width * cell_width) / 4.0;
-   let norm_b = d * sqrt(norm_affine_squared) / mass;
-   let apic_v = norm_b * 6.0 * sqrt(DIM) / cell_width;
-    let v = length(velocity); // + apic_v;
-    dt = min(dt, cell_width / v);
+       let d = (cell_width * cell_width) / 4.0;
+       let norm_b = d * sqrt(norm_affine_squared) / mass;
+       let apic_v = norm_b * 6.0 * sqrt(DIM) / cell_width;
+        let v = length(velocity); // + apic_v;
+        dt = min(dt, cell_width / v);
 
-    let candidate = GpuTimestepBounds::secs_to_int(dt);
-    result[0].computed_max_dt_as_uint.min(candidate);
+        let candidate = GpuTimestepBounds::secs_to_int(dt);
+        result[0].computed_max_dt_as_uint.min(candidate);
     }
 }