dimforge · sebcrozet · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/shaders/slosh/grid/grid.slang b/shaders/slosh/grid/grid.slang
@@ -211,13 +211,50 @@ public static const uint NUM_ASSOC_BLOCKS = 4;
 #else
 public static const uint NUM_ASSOC_BLOCKS = 8;
 #endif
+// Number of +1 neighbour blocks of a primary block, i.e. every associated block except the primary.
+public static const uint NUM_NBH_BLOCKS = NUM_ASSOC_BLOCKS - 1; // Also the length of `nbh_block_ids`.
 public static const int OFF_BY_ONE = 1;
 
+// Bucket count for the within-block counting sort of a block's regular (primary) particles,
+// i.e. particles whose primary block is the block being sorted. There is one bucket per
+// associated-cell slab along the slowest-varying node axis (y in 2D, z in 3D); the resulting
+// order is what lets P2G derive tight per-chunk slab bounds for culling.
+#if DIM == 2
+public static const uint NUM_PRIMARY_SORT_BUCKETS = 8;
+// Bucket count for "extras", i.e. particles spilling in from a neighbour block. Relative to
+// this block their associated slab lies in [-2, block_width - 1]; slabs below -2 cannot
+// influence any node of the block and are clamped to -2. In 2D: [-2, 7] -> 10 buckets.
+public static const uint NUM_EXTRA_SORT_BUCKETS = 10;
+#else
+public static const uint NUM_PRIMARY_SORT_BUCKETS = 4;
+// Same reasoning in 3D: [-2, 3] -> 6 buckets.
+public static const uint NUM_EXTRA_SORT_BUCKETS = 6;
+#endif
+// Total number of within-block sort buckets (18 in 2D, 10 in 3D). Primary buckets are laid out
+// first so primaries stay contiguous in [first_particle, first_particle + num_particles), which G2P relies on.
+public static const uint NUM_SORT_BUCKETS = NUM_PRIMARY_SORT_BUCKETS + NUM_EXTRA_SORT_BUCKETS;
+
+// Smallest local base-cell index along an axis at which a particle's quadratic stencil spills
+// into the +1 neighbour block along that axis (compared per axis by `extra_spills`).
+#if DIM == 2
+public static const uint EXTRA_PARTICLE_MIN_SHIFT = 6; // Presumably block width (8) minus 2.
+#else
+public static const uint EXTRA_PARTICLE_MIN_SHIFT = 2; // Presumably block width (4) minus 2.
+#endif
+
 public struct ActiveBlockHeaderGeneric<MaybeAtomicUint> {
     public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block.
     public uint first_particle;
     public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block.
     public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively.
+    // Per-slab-bucket state of the within-block counting sort, reused across three passes: the
+    // count pass accumulates per-bucket particle counts here; the prepare pass rewrites them in
+    // place as absolute insertion cursors (first_particle baked in); the finalize pass bumps a
+    // cursor for every particle it inserts. The segment ends up ordered by slab key for P2G culling.
+    public MaybeAtomicUint sort_bucket_cursors[NUM_SORT_BUCKETS]; // Primary buckets first, then extras.
+    // Header IDs of the +1 neighbour blocks, resolved once per block (`update_nbh_block_ids`) so the
+    // count/finalize passes skip per-particle hashmap lookups. Inactive neighbours are stored as NONE.
+    public BlockHeaderId nbh_block_ids[NUM_NBH_BLOCKS]; // Entry k refers to associated block k + 1.
 }
 
 public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric<uint>;
@@ -342,6 +379,9 @@ public func mark_block_as_active(
         active_blocks[block_header_id].first_particle = 0u;
         active_blocks[block_header_id].num_particles_with_extras = 0u;
         active_blocks[block_header_id].num_particles = 0u;
+        for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
+            active_blocks[block_header_id].sort_bucket_cursors[k] = 0u;
+        }
         hmap_entries[slot].value = BlockHeaderId(block_header_id);
     }
 }
@@ -364,6 +404,21 @@ func div_ceil(x: uint, y: uint) -> uint {
     return (x + y - 1) / y;
 }
 
+// Copies the grid's current `num_active_blocks` into element 0 of `num_base_blocks`.
+//
+// Part of the two-pass block activation: run after `touch_primary_blocks`, it records how many
+// base blocks exist so `touch_neighbor_blocks` skips the neighbour blocks it appends itself.
+[shader("compute")]
+[numthreads(1, 1, 1)]
+func capture_num_active_blocks(
+    uint3 invocation_id: SV_DispatchThreadID,
+    StructuredBuffer<Grid> grid,
+    RWStructuredBuffer<uint> num_base_blocks,
+) {
+    let active_count = grid[0].num_active_blocks;
+    num_base_blocks[0] = active_count;
+}
+
 [shader("compute")]
 [numthreads(1, 1, 1)]
 func init_indirect_workgroups(

diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang
@@ -3,6 +3,113 @@ module sort;
 import slosh.grid.grid;
 import slosh.solver.particle;
 
+// Sort bucket of a particle inside its own (primary) block: the slab index of its associated
+// cell along the slowest-varying node axis (component y in 2D, component z in 3D).
+func primary_sort_bucket(assoc: vector<uint, DIM>) -> uint {
+#if DIM == 2
+    let slab = assoc.y;
+#else
+    let slab = assoc.z;
+#endif
+    return slab;
+}
+
+// Sort bucket of a particle stored as an "extra" in the neighbour block `bshift` blocks away
+// from its primary block. Its slab, re-expressed relative to that neighbour, may be negative;
+// it is clamped at -2 since the quadratic stencil only covers slabs [assoc, assoc + 2].
+func extra_sort_bucket(assoc: vector<uint, DIM>, bshift: vector<int, DIM>) -> uint {
+#if DIM == 2
+    let slab = int(assoc.y) - 8 * bshift.y;
+#else
+    let slab = int(assoc.z) - 4 * bshift.z;
+#endif
+    let shifted = max(slab, -2) + 2; // Non-negative: the clamped minimum -2 maps to 0.
+    return NUM_PRIMARY_SORT_BUCKETS + uint(shifted);
+}
+
+// Tells whether the quadratic stencil of a particle with local base-cell index `assoc` reaches
+// the +1 neighbour block at offset `bshift` (each component being 0 or 1).
+func extra_spills(assoc: vector<uint, DIM>, bshift: vector<int, DIM>) -> bool {
+    let shift = vector<uint, DIM>(bshift);
+    let reached = shift * vector<uint, DIM>(assoc >= EXTRA_PARTICLE_MIN_SHIFT);
+    return all(reached == shift);
+}
+
+// Activates the primary (base) block of every particle, and nothing else.
+//
+// Pass 1 of the two-pass block activation superseding `touch_particle_blocks`. Having each
+// particle insert all NUM_ASSOC_BLOCKS blocks of its stencil is ~NUM_ASSOC_BLOCKS× redundant,
+// so only the base block is inserted here; `touch_neighbor_blocks` later inserts the +1
+// neighbour blocks once per active base block.
+[shader("compute")]
+[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
+func touch_primary_blocks(
+    uint3 invocation_id: SV_DispatchThreadID,
+    RWStructuredBuffer<AtomicGrid> grid,
+    RWStructuredBuffer<AtomicGridHashMapEntry> hmap_entries,
+    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
+    StructuredBuffer<Position> particles_pos,
+    ConstantBuffer<uint> particles_len,
+) {
+    let particle_id = invocation_id.x;
+    if (particle_id >= particles_len) {
+        return;
+    }
+    let pt = particles_pos[particle_id].pt;
+    let base_block = block_associated_to_point(grid[0].cell_width, pt);
+    mark_block_as_active(grid, hmap_entries, active_blocks, base_block);
+}
+
+// Activates the +1 neighbour blocks of all base blocks that are already active.
+//
+// Pass 2 of the two-pass block activation, with one thread per base block: the thread loads its
+// block's virtual id and inserts the NUM_NBH_BLOCKS forward neighbours into the hashmap. Since
+// `num_base_blocks` snapshots `num_active_blocks` from before this pass, neighbour blocks
+// appended meanwhile (indices >= num_base_blocks) are never processed themselves.
+[shader("compute")]
+[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
+func touch_neighbor_blocks(
+    uint3 invocation_id: SV_DispatchThreadID,
+    RWStructuredBuffer<AtomicGrid> grid,
+    RWStructuredBuffer<AtomicGridHashMapEntry> hmap_entries,
+    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
+    StructuredBuffer<uint> num_base_blocks,
+) {
+    let base_id = invocation_id.x;
+    if (base_id >= num_base_blocks[0]) {
+        return;
+    }
+    var stencil = blocks_associated_to_block(active_blocks[base_id].virtual_id);
+    for (var k = 1u; k < NUM_ASSOC_BLOCKS; k += 1u) {
+        mark_block_as_active(grid, hmap_entries, active_blocks, stencil[k]);
+    }
+}
+
+// Caches, in every active block, the header IDs of that block's +1 neighbour blocks.
+//
+// Particles contribute "extras" to the +1 neighbour blocks of their primary block. Resolving
+// those neighbours here, once per active block, spares the count/finalize passes a hashmap query
+// per particle and neighbour; results go to `nbh_block_ids`, with NONE for inactive neighbours.
+// Must run after all blocks have been touched.
+[shader("compute")]
+[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
+func update_nbh_block_ids(
+    uint3 invocation_id: SV_DispatchThreadID,
+    StructuredBuffer<Grid> grid,
+    StructuredBuffer<GridHashMapEntry> hmap_entries,
+    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
+) {
+    let block_id = invocation_id.x;
+    if (block_id >= grid[0].num_active_blocks) {
+        return;
+    }
+    var stencil = blocks_associated_to_block(active_blocks[block_id].virtual_id);
+    for (var k = 1u; k < NUM_ASSOC_BLOCKS; k += 1u) {
+        // Slot k - 1 caches the header ID of associated block k.
+        active_blocks[block_id].nbh_block_ids[k - 1u] = find_block_header_id(grid, hmap_entries, stencil[k]);
+    }
+}
+
 [shader("compute")]
 [numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
 func touch_particle_blocks(
@@ -109,18 +216,24 @@ func update_block_particle_count(
         let particle = particles_pos[id];
 
         var blocks = blocks_associated_to_point(cell_width, particle.pt);
-        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
-        active_blocks[active_block_id_0.id].num_particles.add(1u);
-        active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u);
-
         let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let mask = uint3(assoc >= 2);
 
+        // The particle's primary (base) block gets it as a regular particle. Only the per-slab
+        // bucket counter is incremented: `num_particles` (sum of primary buckets) and
+        // `num_particles_with_extras` (sum of all buckets) are derived per block in the copy pass,
+        // so we avoid two extra — and heavily contended — atomics per particle here.
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u);
+
+        // Each +1 neighbour block also receives the particle as an "extra" if its quadratic
+        // stencil actually spills into it. The neighbour header IDs were precomputed by
+        // `update_nbh_block_ids`, so we read them from the primary block instead of doing a
+        // hashmap lookup per particle.
         for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
             let bshift = blocks[i].id - blocks[0].id;
-            if (all((bshift * mask) == bshift)) {
-                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
-                active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
+            if (extra_spills(assoc, bshift)) {
+                let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1];
+                active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u);
             }
         }
     }
@@ -136,7 +249,14 @@ func copy_particles_len_to_scan_value(
 ) {
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
-        scan_values[id] = active_blocks[id].num_particles_with_extras;
+        // The sorted array reserves room for every particle a block touches, extras included.
+        // `num_particles_with_extras` is the sum of all slab buckets (the count pass no longer
+        // maintains it as a separate atomic).
+        var total = 0u;
+        for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
+            total += active_blocks[id].sort_bucket_cursors[k];
+        }
+        scan_values[id] = total;
     }
 }
 
@@ -150,9 +270,28 @@ func copy_scan_values_to_first_particles_and_prepare_for_finalize(
 ) {
     let id = invocation_id.x;
     if (id < grid[0].num_active_blocks) {
-        active_blocks[id].first_particle = scan_values[id];
-        active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles;
-        active_blocks[id].num_particles = 0u;
+        let first = scan_values[id];
+        active_blocks[id].first_particle = first;
+        // Convert the per-bucket counts accumulated by the count pass into running insertion
+        // cursors. The cursors are *absolute* offsets into the sorted array (i.e. `first_particle`
+        // is baked in), so the finalize pass can scatter each particle with a single atomic.
+        // Primary buckets come first, so primaries land in [first_particle, first_particle +
+        // num_particles) as the scatter P2G expects, with the extras after them; both segments
+        // end up ordered by slab key.
+        //
+        // The running total advanced past the primary buckets is `num_particles`; the grand total
+        // is `num_particles_with_extras`. Both are derived here rather than maintained as
+        // per-particle atomics in the count pass.
+        var running = first;
+        for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
+            if (k == NUM_PRIMARY_SORT_BUCKETS) {
+                active_blocks[id].num_particles = running - first;
+            }
+            let count = active_blocks[id].sort_bucket_cursors[k];
+            active_blocks[id].sort_bucket_cursors[k] = running;
+            running += count;
+        }
+        active_blocks[id].num_particles_with_extras = running - first;
     }
 }
 
@@ -176,29 +315,31 @@ func finalize_particles_sort(
         let cell_width = grid[0].cell_width;
         let particle = particles_pos[id];
 
-        // Place the particle to its sorted place.
         var blocks = blocks_associated_to_point(cell_width, particle.pt);
-        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
-        let target_index = active_blocks[active_block_id_0.id].first_particle +
-            active_blocks[active_block_id_0.id].num_particles.add(1u);
-        sorted_particle_ids[target_index] = id;
-
         let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let mask = uint3(assoc >= 2);
 
+        // Place the particle in its primary block's range. The prepare pass turned the bucket
+        // counts into absolute insertion cursors (first_particle baked in), so the atomically
+        // claimed slot is already the final sorted index.
+        let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
+        let slot0 = active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u);
+        sorted_particle_ids[slot0] = id;
+
+        // Place the particle as an "extra" into each +1 neighbour block whose stencil it spills
+        // into, using the extra slab bucket cursors (extras land after the primaries because
+        // their buckets come last). Neighbour header IDs are reused from `update_nbh_block_ids`.
         for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
             let bshift = blocks[i].id - blocks[0].id;
-            if (all((bshift * mask) == bshift)) {
-                let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
-                let target_index = active_blocks[active_block_id_i.id].first_particle +
-                    active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
-                sorted_particle_ids[target_index] = id;
+            if (extra_spills(assoc, bshift)) {
+                let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1];
+                let slot_i = active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u);
+                sorted_particle_ids[slot_i] = id;
             }
         }
 
-        // Setup the per-node particle linked-list.
-        let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width);
-        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id);
+        // Setup the per-node particle linked-list (still consumed by the gather P2G and the
+        // rigid/CDF paths).
+        let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), assoc);
         let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id);
         nodes_linked_lists[node_global_id.id].len.add(1u);
         particle_node_linked_lists[id] = prev_head;