From 5c903ae1e89027a0e2f614794c5b1ca404aa6ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 24 Jun 2026 10:49:02 +0200 Subject: [PATCH 1/3] feat: further optimize p2g and particle sort through intra-blocks particles sorting --- shaders/slosh/grid/grid.slang | 55 ++++++ shaders/slosh/grid/sort.slang | 195 ++++++++++++++++--- shaders/slosh/solver/p2g_scatter_style.slang | 42 +++- src/grid/grid.rs | 67 ++++++- src/grid/sort.rs | 6 + 5 files changed, 333 insertions(+), 32 deletions(-) diff --git a/shaders/slosh/grid/grid.slang b/shaders/slosh/grid/grid.slang index 1790642..5d9aa3f 100644 --- a/shaders/slosh/grid/grid.slang +++ b/shaders/slosh/grid/grid.slang @@ -211,13 +211,50 @@ public static const uint NUM_ASSOC_BLOCKS = 4; #else public static const uint NUM_ASSOC_BLOCKS = 8; #endif +// Number of +1 neighbour blocks of a primary block (all associated blocks but the primary). +public static const uint NUM_NBH_BLOCKS = NUM_ASSOC_BLOCKS - 1; public static const int OFF_BY_ONE = 1; +// Number of "slab" buckets for the within-block counting sort of regular particles whose +// primary block is the block being sorted. One bucket per associated-cell slab along the +// slowest-varying node axis (y in 2D, z in 3D), so the sorted order lets P2G derive tight +// per-chunk slab bounds for culling. +#if DIM == 2 +public static const uint NUM_PRIMARY_SORT_BUCKETS = 8; +// Number of slab buckets for "extras" (particles spilling in from a neighbour block). Their +// associated slab relative to this block lies in [-2, block_width - 1], clamped at -2 (slabs +// below -2 cannot influence any node of the block anyway): [-2, 7] -> 10 buckets in 2D. +public static const uint NUM_EXTRA_SORT_BUCKETS = 10; +#else +public static const uint NUM_PRIMARY_SORT_BUCKETS = 4; +// [-2, 3] -> 6 buckets in 3D. +public static const uint NUM_EXTRA_SORT_BUCKETS = 6; +#endif +// Total number of within-block sort buckets. Primary buckets come first so primaries end up +// contiguous in [first_particle, first_particle + num_particles), which G2P relies on. +public static const uint NUM_SORT_BUCKETS = NUM_PRIMARY_SORT_BUCKETS + NUM_EXTRA_SORT_BUCKETS; + +// Minimum local base-cell index along an axis for a particle's quadratic stencil to spill into +// the +1 neighbour block along that axis. +#if DIM == 2 +public static const uint EXTRA_PARTICLE_MIN_SHIFT = 6; +#else +public static const uint EXTRA_PARTICLE_MIN_SHIFT = 2; +#endif + public struct ActiveBlockHeaderGeneric { public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block. public uint first_particle; public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block. public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively. + // Per-slab-bucket cursors for the within-block counting sort. The count pass accumulates + // per-bucket counts here; the prepare pass converts them in place to absolute insertion + // cursors (first_particle baked in); the finalize pass increments them while inserting. + // The resulting segment is ordered by slab key, which P2G uses for per-chunk culling. + public MaybeAtomicUint sort_bucket_cursors[NUM_SORT_BUCKETS]; + // Header IDs of the +1 neighbour blocks, precomputed once per block to avoid repeated + // hashmap lookups in the count/finalize passes. Inactive neighbours are stored as NONE. + public BlockHeaderId nbh_block_ids[NUM_NBH_BLOCKS]; } public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric; @@ -342,6 +379,9 @@ public func mark_block_as_active( active_blocks[block_header_id].first_particle = 0u; active_blocks[block_header_id].num_particles_with_extras = 0u; active_blocks[block_header_id].num_particles = 0u; + for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) { + active_blocks[block_header_id].sort_bucket_cursors[k] = 0u; + } hmap_entries[slot].value = BlockHeaderId(block_header_id); } } @@ -364,6 +404,21 @@ func div_ceil(x: uint, y: uint) -> uint { return (x + y - 1) / y; } +// Snapshots the current `num_active_blocks` into a single-element buffer. +// +// Used by the two-pass block activation: after `touch_primary_blocks` has activated all +// particle base blocks, this captures their count so that `touch_neighbor_blocks` only +// iterates over base blocks (and not over the neighbour blocks it appends). +[shader("compute")] +[numthreads(1, 1, 1)] +func capture_num_active_blocks( + uint3 invocation_id: SV_DispatchThreadID, + StructuredBuffer grid, + RWStructuredBuffer num_base_blocks, +) { + num_base_blocks[0] = grid[0].num_active_blocks; +} + [shader("compute")] [numthreads(1, 1, 1)] func init_indirect_workgroups( diff --git a/shaders/slosh/grid/sort.slang b/shaders/slosh/grid/sort.slang index 8b9e52b..37f9664 100644 --- a/shaders/slosh/grid/sort.slang +++ b/shaders/slosh/grid/sort.slang @@ -3,6 +3,113 @@ module sort; import slosh.grid.grid; import slosh.solver.particle; +// Returns the within-block sort bucket for a particle counted/inserted into its primary +// block: one bucket per associated-cell slab along the slowest-varying node axis (y in 2D, +// z in 3D). +func primary_sort_bucket(assoc: vector) -> uint { +#if DIM == 2 + return assoc.y; +#else + return assoc.z; +#endif +} + +// Returns the within-block sort bucket for a particle counted/inserted as an "extra" into the +// neighbour block shifted by `bshift` from its primary block. The associated slab relative to +// the neighbour block can be negative; slabs below -2 cannot influence any node of that block +// (the quadratic stencil only covers slabs [assoc, assoc + 2]), so they are clamped at -2. +func extra_sort_bucket(assoc: vector, bshift: vector) -> uint { +#if DIM == 2 + let local = int(assoc.y) - bshift.y * 8; +#else + let local = int(assoc.z) - bshift.z * 4; +#endif + return NUM_PRIMARY_SORT_BUCKETS + uint(max(local, -2) + 2); +} + +// True if a particle whose local base-cell index is `assoc` spills its quadratic stencil into +// the +1 neighbour block reached by `bshift` (component 0 or 1 per axis). +func extra_spills(assoc: vector, bshift: vector) -> bool { + let mask = vector(assoc >= EXTRA_PARTICLE_MIN_SHIFT); + let bshift_u = vector(bshift); + return all((bshift_u * mask) == bshift_u); +} + +// Marks only each particle's primary (base) block as active. +// +// First half of the two-pass block activation that replaces `touch_particle_blocks`: instead +// of every particle redundantly inserting all NUM_ASSOC_BLOCKS of its stencil (~NUM_ASSOC_BLOCKS× +// redundancy), each particle inserts only its base block here, and `touch_neighbor_blocks` then +// inserts the +1 neighbour blocks once per active base block. +[shader("compute")] +[numthreads(GRID_WORKGROUP_SIZE, 1, 1)] +func touch_primary_blocks( + uint3 invocation_id: SV_DispatchThreadID, + RWStructuredBuffer grid, + RWStructuredBuffer hmap_entries, + RWStructuredBuffer active_blocks, + StructuredBuffer particles_pos, + ConstantBuffer particles_len, +) { + let id = invocation_id.x; + if (id < particles_len) { + let cell_width = grid[0].cell_width; + let particle = particles_pos[id]; + let block = block_associated_to_point(cell_width, particle.pt); + mark_block_as_active(grid, hmap_entries, active_blocks, block); + } +} + +// Marks the +1 neighbour blocks of every already-active base block as active. +// +// Second half of the two-pass block activation. One thread per base block: reads the block's +// virtual id and inserts its NUM_NBH_BLOCKS forward neighbours into the hashmap. `num_base_blocks` +// is a snapshot of `num_active_blocks` taken before this pass runs, so the neighbour blocks +// appended during the pass (at indices >= num_base_blocks) are not themselves processed. +[shader("compute")] +[numthreads(GRID_WORKGROUP_SIZE, 1, 1)] +func touch_neighbor_blocks( + uint3 invocation_id: SV_DispatchThreadID, + RWStructuredBuffer grid, + RWStructuredBuffer hmap_entries, + RWStructuredBuffer active_blocks, + StructuredBuffer num_base_blocks, +) { + let id = invocation_id.x; + if (id < num_base_blocks[0]) { + let vid = active_blocks[id].virtual_id; + var blocks = blocks_associated_to_block(vid); + for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) { + mark_block_as_active(grid, hmap_entries, active_blocks, blocks[i]); + } + } +} + +// Precomputes, for each active block, the header IDs of its +1 neighbour blocks. +// +// Each particle contributes "extras" to the +1 neighbour blocks of its primary block. Rather +// than re-querying the hashmap for those neighbours once per particle in the count/finalize +// passes, this resolves them once per active block and caches them in `nbh_block_ids`. Inactive +// neighbours are stored as NONE. Must run after all blocks have been touched. +[shader("compute")] +[numthreads(GRID_WORKGROUP_SIZE, 1, 1)] +func update_nbh_block_ids( + uint3 invocation_id: SV_DispatchThreadID, + StructuredBuffer grid, + StructuredBuffer hmap_entries, + RWStructuredBuffer active_blocks, +) { + let id = invocation_id.x; + if (id < grid[0].num_active_blocks) { + let vid = active_blocks[id].virtual_id; + var assoc = blocks_associated_to_block(vid); + for (var nbh = 0u; nbh < NUM_NBH_BLOCKS; nbh += 1u) { + let nbh_hid = find_block_header_id(grid, hmap_entries, assoc[nbh + 1]); + active_blocks[id].nbh_block_ids[nbh] = nbh_hid; + } + } +} + [shader("compute")] [numthreads(GRID_WORKGROUP_SIZE, 1, 1)] func touch_particle_blocks( @@ -109,18 +216,24 @@ func update_block_particle_count( let particle = particles_pos[id]; var blocks = blocks_associated_to_point(cell_width, particle.pt); - let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]); - active_blocks[active_block_id_0.id].num_particles.add(1u); - active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u); - let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width); - let mask = uint3(assoc >= 2); + // The particle's primary (base) block gets it as a regular particle. Only the per-slab + // bucket counter is incremented: `num_particles` (sum of primary buckets) and + // `num_particles_with_extras` (sum of all buckets) are derived per block in the copy pass, + // so we avoid two extra — and heavily contended — atomics per particle here. + let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]); + active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u); + + // Each +1 neighbour block also receives the particle as an "extra" if its quadratic + // stencil actually spills into it. The neighbour header IDs were precomputed by + // `update_nbh_block_ids`, so we read them from the primary block instead of doing a + // hashmap lookup per particle. for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) { let bshift = blocks[i].id - blocks[0].id; - if (all((bshift * mask) == bshift)) { - let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]); - active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u); + if (extra_spills(assoc, bshift)) { + let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1]; + active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u); } } } @@ -136,7 +249,14 @@ func copy_particles_len_to_scan_value( ) { let id = invocation_id.x; if (id < grid[0].num_active_blocks) { - scan_values[id] = active_blocks[id].num_particles_with_extras; + // The sorted array reserves room for every particle a block touches, extras included. + // `num_particles_with_extras` is the sum of all slab buckets (the count pass no longer + // maintains it as a separate atomic). + var total = 0u; + for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) { + total += active_blocks[id].sort_bucket_cursors[k]; + } + scan_values[id] = total; } } @@ -150,9 +270,28 @@ func copy_scan_values_to_first_particles_and_prepare_for_finalize( ) { let id = invocation_id.x; if (id < grid[0].num_active_blocks) { - active_blocks[id].first_particle = scan_values[id]; - active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles; - active_blocks[id].num_particles = 0u; + let first = scan_values[id]; + active_blocks[id].first_particle = first; + // Convert the per-bucket counts accumulated by the count pass into running insertion + // cursors. The cursors are *absolute* offsets into the sorted array (i.e. `first_particle` + // is baked in), so the finalize pass can scatter each particle with a single atomic. + // Primary buckets come first, so primaries land in [first_particle, first_particle + + // num_particles) as the scatter P2G expects, with the extras after them; both segments + // end up ordered by slab key. + // + // The running total advanced past the primary buckets is `num_particles`; the grand total + // is `num_particles_with_extras`. Both are derived here rather than maintained as + // per-particle atomics in the count pass. + var running = first; + for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) { + if (k == NUM_PRIMARY_SORT_BUCKETS) { + active_blocks[id].num_particles = running - first; + } + let count = active_blocks[id].sort_bucket_cursors[k]; + active_blocks[id].sort_bucket_cursors[k] = running; + running += count; + } + active_blocks[id].num_particles_with_extras = running - first; } } @@ -176,29 +315,31 @@ func finalize_particles_sort( let cell_width = grid[0].cell_width; let particle = particles_pos[id]; - // Place the particle to its sorted place. var blocks = blocks_associated_to_point(cell_width, particle.pt); - let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]); - let target_index = active_blocks[active_block_id_0.id].first_particle + - active_blocks[active_block_id_0.id].num_particles.add(1u); - sorted_particle_ids[target_index] = id; - let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width); - let mask = uint3(assoc >= 2); + // Place the particle in its primary block's range. The prepare pass turned the bucket + // counts into absolute insertion cursors (first_particle baked in), so the atomically + // claimed slot is already the final sorted index. + let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]); + let slot0 = active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u); + sorted_particle_ids[slot0] = id; + + // Place the particle as an "extra" into each +1 neighbour block whose stencil it spills + // into, using the extra slab bucket cursors (extras land after the primaries because + // their buckets come last). Neighbour header IDs are reused from `update_nbh_block_ids`. for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) { let bshift = blocks[i].id - blocks[0].id; - if (all((bshift * mask) == bshift)) { - let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]); - let target_index = active_blocks[active_block_id_i.id].first_particle + - active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u); - sorted_particle_ids[target_index] = id; + if (extra_spills(assoc, bshift)) { + let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1]; + let slot_i = active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u); + sorted_particle_ids[slot_i] = id; } } - // Setup the per-node particle linked-list. - let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width); - let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id); + // Setup the per-node particle linked-list (still consumed by the gather P2G and the + // rigid/CDF paths). + let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), assoc); let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id); nodes_linked_lists[node_global_id.id].len.add(1u); particle_node_linked_lists[id] = prev_head; diff --git a/shaders/slosh/solver/p2g_scatter_style.slang b/shaders/slosh/solver/p2g_scatter_style.slang index b94000e..d11f4db 100644 --- a/shaders/slosh/solver/p2g_scatter_style.slang +++ b/shaders/slosh/solver/p2g_scatter_style.slang @@ -33,6 +33,10 @@ groupshared float3 shared_momentum[WORKGROUP_SIZE]; groupshared float3x3 shared_affine[WORKGROUP_SIZE]; #endif groupshared float shared_mass[WORKGROUP_SIZE]; +// Per-particle slab key (associated-cell coordinate along the slowest node axis, relative to +// the block), used to derive the per-chunk culling bounds. Matches the bucket key the sort used, +// so the keys are ascending within the primaries and within the extras of each chunk. +groupshared int shared_zkey[WORKGROUP_SIZE]; [shader("compute")] [numthreads(WORKGROUP_SIZE, 1, 1)] @@ -53,6 +57,9 @@ func p2g_scatter_style( let first_particle = active_blocks[bid].first_particle; let num_particles = active_blocks[bid].num_particles_with_extras; let last_particle = first_particle + num_particles; + // End of the primaries segment: the sorted slab keys are ascending within the primaries and + // within the extras, so the per-chunk slab bounds need this boundary. + let primaries_end = first_particle + active_blocks[bid].num_particles; let block_vid = active_blocks[bid].virtual_id.id; // Each thread owns one cell (grid node) of this block and accumulates the @@ -64,10 +71,13 @@ func p2g_scatter_style( let local_cell = int2(int(tid % 8u), int(tid / 8u)); let cell_pos = float2(block_vid * 8 + local_cell) * cell_width; var acc = float3(0.0); + // This thread's node slab along the sort axis (the slowest-varying node axis). + let node_slab = local_cell.y; #else let local_cell = int3(int(tid % 4u), int((tid / 4u) % 4u), int(tid / 16u)); let cell_pos = float3(block_vid * 4 + local_cell) * cell_width; var acc = float4(0.0); + let node_slab = local_cell.z; #endif // Stream the block's particles through shared memory one chunk at a time. @@ -79,10 +89,21 @@ func p2g_scatter_style( if (load_idx < last_particle) { let pid = sorted_particle_ids[load_idx]; let kin = particles_kin[pid]; - shared_pos[tid] = particles_pos[pid].pt; + let pos = particles_pos[pid].pt; + shared_pos[tid] = pos; shared_mass[tid] = kin.mass; shared_affine[tid] = kin.affine; shared_momentum[tid] = kin.velocity * kin.mass + kin.force_dt; + + // Slab key along the sort axis, relative to this block; must match the bucket key + // used by the sort (same associated-cell rounding, same clamp at -2) so the shared + // keys stay ascending within each segment. + let assoc_cell = round(pos / cell_width) - 1.0; +#if DIM == 2 + shared_zkey[tid] = max(int(assoc_cell.y) - block_vid.y * 8, -2); +#else + shared_zkey[tid] = max(int(assoc_cell.z) - block_vid.z * 4, -2); +#endif } GroupMemoryBarrierWithGroupSync(); @@ -90,7 +111,24 @@ func p2g_scatter_style( // `chunk_len` is uniform across the workgroup, so the barriers above stay in // uniform control flow regardless of the total particle count. let chunk_len = min(WORKGROUP_SIZE, last_particle - chunk_base); - for (var p = 0u; p < chunk_len; p += 1u) { + + // Per-chunk slab bounds, exact thanks to the within-block sort: the keys are ascending + // within the primaries and within the extras, so the range is given by the chunk's + // first/last key, plus the two values around the primaries/extras boundary if the chunk + // straddles it. + var zmin = shared_zkey[0]; + var zmax = shared_zkey[chunk_len - 1u]; + if (primaries_end > chunk_base && primaries_end < chunk_base + chunk_len) { + let b = primaries_end - chunk_base; + zmin = min(zmin, shared_zkey[b]); + zmax = max(zmax, shared_zkey[b - 1u]); + } + + // A particle with slab key `a` only influences nodes in slabs [a, a + 2]: skip the whole + // chunk if this thread's node slab is outside the chunk's dilated slab range. When that + // holds for every thread of a warp, the warp skips the chunk entirely. + let culled_len = (node_slab >= zmin && node_slab <= zmax + 2) ? chunk_len : 0u; + for (var p = 0u; p < culled_len; p += 1u) { let dpt = cell_pos - shared_pos[p]; #if DIM == 2 let weight = QuadraticKernel::eval(dpt.x * inv_cell_width) diff --git a/src/grid/grid.rs b/src/grid/grid.rs index 637457a..78bf651 100644 --- a/src/grid/grid.rs +++ b/src/grid/grid.rs @@ -21,6 +21,7 @@ pub struct WgGrid { reset_hmap: GpuFunction, reset: GpuFunction, init_indirect_workgroups: GpuFunction, + capture_num_active_blocks: GpuFunction, } // TODO: should we have all the kernel launches just use @@ -36,6 +37,9 @@ struct GridArgs<'a, B: Backend> { nodes_linked_lists: &'a GpuVector<[u32; 2], B>, rigid_nodes_linked_lists: &'a GpuVector<[u32; 2], B>, scan_values: &'a GpuVector, + // Snapshot of `num_active_blocks` after the primary-block touch pass (the base-block count), + // consumed by `touch_neighbor_blocks`. + num_base_blocks: &'a GpuVector, // From particles particles_pos: &'a GpuVector, particles_len: &'a GpuScalar, @@ -86,6 +90,7 @@ impl WgGrid { nodes_linked_lists: &grid.nodes_linked_lists, rigid_nodes_linked_lists: &grid.rigid_nodes_linked_lists, scan_values: &grid.scan_values, + num_base_blocks: &grid.active_blocks_snapshot, particles_pos: particles.positions(), particles_len: particles.gpu_len(), active_blocks: &grid.active_blocks, @@ -108,13 +113,34 @@ impl WgGrid { self.reset_hmap .launch(backend, pass, &args, [grid.cpu_meta.hmap_capacity, 1, 1])?; - sort_module.touch_particle_blocks.launch_capped( + // Block activation in two passes (cheaper than every particle inserting all + // NUM_ASSOC_BLOCKS of its stencil): + // 1. Each particle activates only its primary (base) block. + // 2. Each base block activates its +1 neighbour blocks (once per block, not once + // per particle). `active_blocks_snapshot` records the base-block count so pass 2 + // doesn't reprocess the neighbours it appends. + sort_module.touch_primary_blocks.launch_capped( backend, pass, &args, particles.len() as u32, )?; + self.capture_num_active_blocks + .launch_grid(backend, pass, &args, [1, 1, 1])?; + + // Indirect dispatch sized for the base blocks (the neighbour pass runs one thread + // per base block). + self.init_indirect_workgroups + .launch_grid(backend, pass, &args, [1, 1, 1])?; + + sort_module.touch_neighbor_blocks.launch_indirect( + backend, + pass, + &args, + grid.indirect_n_blocks_groups.buffer(), + )?; + // // Ensure blocks exist wherever we have rigid particles that might affect // // other blocks. This is done in two passes: // // 1. Mark all rigid particles that need to ensure it’s associated block exists @@ -147,10 +173,19 @@ impl WgGrid { // - Launch write_blocks_multiplicity_to_scan_value // - Launch cumulated sum - // Prepare workgroups for indirect dispatches based on the number of active blocks. + // Prepare workgroups for indirect dispatches based on the (final) number of active blocks. self.init_indirect_workgroups .launch_grid(backend, pass, &args, [1, 1, 1])?; + // Cache each active block's +1 neighbour header IDs so the count/finalize passes don't + // re-query the hashmap per particle per neighbour. + sort_module.update_nbh_block_ids.launch_indirect( + backend, + pass, + &args, + grid.indirect_n_blocks_groups.buffer(), + )?; + sort_module.update_block_particle_count.launch_capped( backend, pass, @@ -255,9 +290,26 @@ impl Default for GpuGridHashMapEntry { } } +/// Number of within-block slab sort buckets (primary + extra). Must match `NUM_SORT_BUCKETS` +/// in `shaders/slosh/grid/grid.slang`. +#[cfg(feature = "dim2")] +const NUM_SORT_BUCKETS: usize = 18; // 8 primary + 10 extra +#[cfg(feature = "dim3")] +const NUM_SORT_BUCKETS: usize = 10; // 4 primary + 6 extra + +/// Number of +1 neighbour blocks of a primary block. Must match `NUM_NBH_BLOCKS` in +/// `shaders/slosh/grid/grid.slang` (`NUM_ASSOC_BLOCKS - 1`). +#[cfg(feature = "dim2")] +const NUM_NBH_BLOCKS: usize = 3; +#[cfg(feature = "dim3")] +const NUM_NBH_BLOCKS: usize = 7; + /// Header for an active grid block containing particles. /// -/// Tracks which particles belong to this block for efficient iteration. +/// Tracks which particles belong to this block for efficient iteration. This buffer is +/// GPU-private (allocated uninitialized, written and read only on the GPU), so the field +/// layout only needs to be at least as large as the shader-side `ActiveBlockHeader` for the +/// allocation to be correctly sized; the field order is kept in sync for clarity. #[derive(Copy, Clone, PartialEq, Pod, Zeroable)] #[repr(C)] pub struct GpuActiveBlockHeader { @@ -265,6 +317,10 @@ pub struct GpuActiveBlockHeader { first_particle: u32, num_particles_with_extras: u32, num_particles: u32, + /// Per-slab-bucket insertion cursors for the within-block counting sort. + sort_bucket_cursors: [u32; NUM_SORT_BUCKETS], + /// Cached header IDs of the +1 neighbour blocks (resolved by `update_nbh_block_ids`). + nbh_block_ids: [u32; NUM_NBH_BLOCKS], } /// Cached collision information for a grid node. @@ -325,6 +381,9 @@ pub struct GpuGrid { pub prev_node_collisions: GpuVector, /// Active block headers tracking particle ranges. pub active_blocks: GpuVector, + /// Single-element snapshot of `num_active_blocks` after the primary-block touch pass, + /// used by the two-pass block activation. + pub active_blocks_snapshot: GpuVector, /// Workspace for prefix sum operations. pub scan_values: GpuVector, /// Per-node linked lists for MPM particles. @@ -393,6 +452,7 @@ impl GpuGrid { let rigid_nodes_linked_lists = GpuVector::vector_uninit(backend, capacity * NODES_PER_BLOCK, BufferUsages::STORAGE)?; let active_blocks = GpuVector::vector_uninit(backend, capacity, BufferUsages::STORAGE)?; + let active_blocks_snapshot = GpuVector::vector_uninit(backend, 1, BufferUsages::STORAGE)?; let scan_values = GpuVector::vector_uninit(backend, capacity, BufferUsages::STORAGE)?; let indirect_n_blocks_groups = Arc::new(GpuVector::scalar_uninit( backend, @@ -414,6 +474,7 @@ impl GpuGrid { node_collisions, prev_node_collisions, active_blocks, + active_blocks_snapshot, scan_values, indirect_n_blocks_groups, indirect_n_g2p_p2g_groups, diff --git a/src/grid/sort.rs b/src/grid/sort.rs index eb9f955..92020f6 100644 --- a/src/grid/sort.rs +++ b/src/grid/sort.rs @@ -15,7 +15,13 @@ use stensor::tensor::GpuScalar; #[derive(Shader)] #[shader(module = "slosh::grid::sort")] pub struct WgSort { + // Legacy single-pass block activation, superseded by the two-pass + // touch_primary_blocks / touch_neighbor_blocks. Kept bound for reference/fallback. + #[allow(dead_code)] pub(crate) touch_particle_blocks: GpuFunction, + pub(crate) touch_primary_blocks: GpuFunction, + pub(crate) touch_neighbor_blocks: GpuFunction, + pub(crate) update_nbh_block_ids: GpuFunction, // Bound to GPU kernels; currently only used by commented-out rigid-particle code paths. #[allow(dead_code)] pub(crate) touch_rigid_particle_blocks: GpuFunction, From 5d418905417db12bb3b17afbaf6865d0328035f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 24 Jun 2026 10:49:20 +0200 Subject: [PATCH 2/3] feat: make p2g and g2p overwrittable --- src/pipeline.rs | 61 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/src/pipeline.rs b/src/pipeline.rs index c6c34b7..0f38cfa 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -79,6 +79,38 @@ pub trait MpmPipelineHooks { Ok(()) } + /// Replace the built-in Particle-To-Grid transfer. + /// + /// Return `Ok(true)` if the transfer was handled by this hook; the pipeline then + /// skips its built-in P2G. Return `Ok(false)` (the default) to keep slosh's P2G. + /// The hook is responsible for beginning its own compute pass. + fn run_p2g( + &mut self, + _backend: &B, + _encoder: &mut B::Encoder, + _data: &mut MpmData, + _state: &mut dyn Any, + _timestamps: Option<&mut GpuTimestamps>, + ) -> Result { + Ok(false) + } + + /// Replace the built-in Grid-To-Particle transfer. + /// + /// Return `Ok(true)` if the transfer was handled by this hook; the pipeline then + /// skips its built-in G2P. Return `Ok(false)` (the default) to keep slosh's G2P. + /// The hook is responsible for beginning its own compute pass. + fn run_g2p( + &mut self, + _backend: &B, + _encoder: &mut B::Encoder, + _data: &mut MpmData, + _state: &mut dyn Any, + _timestamps: Option<&mut GpuTimestamps>, + ) -> Result { + Ok(false) + } + /// Custom operation run after the main Particle-To-Grid transfer. fn after_p2g( &mut self, @@ -485,17 +517,16 @@ impl MpmPipeline { // )?; // } - { + // Let a hook replace the built-in P2G transfer (e.g. to fuse extra fields into the + // same sweep). When no hook handles it, run slosh's default scatter-style P2G. + if !hooks.run_p2g( + backend, + encoder, + data, + hooks_state, + timestamps.as_deref_mut(), + )? { let mut pass = encoder.begin_pass("p2g", timestamps.as_deref_mut()); - // self.p2g.launch( - // backend, - // &mut pass, - // &data.grid, - // &data.particles, - // &data.impulses, - // &data.bodies, - // &data.body_materials, - // )?; self.p2g_scatter_style.launch( backend, &mut pass, @@ -535,7 +566,15 @@ impl MpmPipeline { timestamps.as_deref_mut(), )?; - { + // Let a hook replace the built-in G2P transfer (e.g. to fuse extra fields into the + // same sweep). When no hook handles it, run slosh's default G2P. + if !hooks.run_g2p( + backend, + encoder, + data, + hooks_state, + timestamps.as_deref_mut(), + )? { let mut pass = encoder.begin_pass("g2p", timestamps.as_deref_mut()); self.g2p.launch( backend, From 817c66c125a7b7b8a783fec1498bce8a89a2cbf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 24 Jun 2026 11:04:23 +0200 Subject: [PATCH 3/3] chore: formatting fixes --- shaders/slosh/solver/particle_update.slang | 219 ++++++++++----------- shaders/slosh/solver/timestep_bound.slang | 67 ++++--- 2 files changed, 141 insertions(+), 145 deletions(-) diff --git a/shaders/slosh/solver/particle_update.slang b/shaders/slosh/solver/particle_update.slang index 2e10f5d..bbbc0f3 100644 --- a/shaders/slosh/solver/particle_update.slang +++ b/shaders/slosh/solver/particle_update.slang @@ -47,117 +47,114 @@ func particle_update( let _gs_total = min((_gs_n + 63u) / 64u, 65535u) * 64u; for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) { - let model = ParticleModel(); - let flags = model.model_flags(particles_model, particle_id); - let dt = params.dt; - let cell_width = grid[0].cell_width; - var kin = particles_kin[particle_id]; -// let cdf = particles_cdf[particle_id]; - var def_grad = particles_def_grad[particle_id]; - let props = particles_props[particle_id]; - let particle_pos = particles_pos[particle_id].pt; - - - /* - * Advection. - */ - // TODO: double check that we never need the reprojection below. - // This reprojection isn't part of the original MPM-MLS/CPIC paper but we added - // it at some point as it appeared that we'd still get some penetrating particles. - // However, that might have been caused by other bugs so it is unsure if we need to - // keep it now. -// if (cdf.signed_distance < -0.05 * cell_width) { -// let slip = BoundaryCondition(BoundaryConditionType::Slip, 0.0); -// kin.velocity = cdf.rigid_vel + slip.project_velocity((kin.velocity - cdf.rigid_vel), cdf.normal); -// } - - // Clamp the max velocity a particle can get. - // TODO: clamp the grid velocities instead? - if (length(kin.velocity) > cell_width / dt) { - kin.velocity = kin.velocity / length(kin.velocity) * cell_width / dt; - } - - // Apply Rayleigh mass-proportional damping (implicit integration for stability). - // v_new = v / (1 + damping * dt) - kin.velocity = kin.velocity / (1.0 + props.damping * dt); - - - // If the particle is fixed, clear its velocity and its affine matrix. - // This isn't ideal (this should typically be handled on the grid) but we sometimes - // need sub-grid-sized fixed particles. - if (props.fixed != 0) { - kin.velocity = Vect(0.0); - } - - let new_particle_pos = particle_pos + kin.velocity * dt; - -// /* -// * Penalty impulse. -// */ -// // TODO: apply the penalty impulse as an extra force on the grid instead of on -// // changing the particle velocity directly? -// static const float PENALTY_COEFF = 1.0e3; -// if (cdf.signed_distance < -0.05 * cell_width) { // && cdf.signed_distance > -0.3 * cell_width { -// let corrected_dist = max(cdf.signed_distance, -0.3 * cell_width); -// let impulse = (dt * -corrected_dist * PENALTY_COEFF) * cdf.normal; -// kin.velocity += impulse; // / curr_particle_vol.mass; -// } - - /* - * Deformation gradient update. - */ - if ((flags & ModelFlags::FLUID) == 0) { + let model = ParticleModel(); + let flags = model.model_flags(particles_model, particle_id); + let dt = params.dt; + let cell_width = grid[0].cell_width; + var kin = particles_kin[particle_id]; + // let cdf = particles_cdf[particle_id]; + var def_grad = particles_def_grad[particle_id]; + let props = particles_props[particle_id]; + let particle_pos = particles_pos[particle_id].pt; + + /* + * Advection. + */ + // TODO: double check that we never need the reprojection below. + // This reprojection isn't part of the original MPM-MLS/CPIC paper but we added + // it at some point as it appeared that we'd still get some penetrating particles. + // However, that might have been caused by other bugs so it is unsure if we need to + // keep it now. + // if (cdf.signed_distance < -0.05 * cell_width) { + // let slip = BoundaryCondition(BoundaryConditionType::Slip, 0.0); + // kin.velocity = cdf.rigid_vel + slip.project_velocity((kin.velocity - cdf.rigid_vel), cdf.normal); + // } + + // Clamp the max velocity a particle can get. + // TODO: clamp the grid velocities instead? + if (length(kin.velocity) > cell_width / dt) { + kin.velocity = kin.velocity / length(kin.velocity) * cell_width / dt; + } + + // Apply Rayleigh mass-proportional damping (implicit integration for stability). + // v_new = v / (1 + damping * dt) + kin.velocity = kin.velocity / (1.0 + props.damping * dt); + + // If the particle is fixed, clear its velocity and its affine matrix. + // This isn't ideal (this should typically be handled on the grid) but we sometimes + // need sub-grid-sized fixed particles. + if (props.fixed != 0) { + kin.velocity = Vect(0.0); + } + + let new_particle_pos = particle_pos + kin.velocity * dt; + + // /* + // * Penalty impulse. + // */ + // // TODO: apply the penalty impulse as an extra force on the grid instead of on + // // changing the particle velocity directly? + // static const float PENALTY_COEFF = 1.0e3; + // if (cdf.signed_distance < -0.05 * cell_width) { // && cdf.signed_distance > -0.3 * cell_width { + // let corrected_dist = max(cdf.signed_distance, -0.3 * cell_width); + // let impulse = (dt * -corrected_dist * PENALTY_COEFF) * cdf.normal; + // kin.velocity += impulse; // / curr_particle_vol.mass; + // } + + /* + * Deformation gradient update. + */ + if ((flags & ModelFlags::FLUID) == 0) { + // NOTE: the velocity gradient was stored in the affine buffer. + def_grad += mul(def_grad, kin.affine * dt); + } else { + let def_grad0 = def_grad[0][0]; + var new_def_grad_diag_elt = def_grad0 + def_grad0 * kin.vel_grad_det * dt; + def_grad = diag(Vect(new_def_grad_diag_elt)); + } + + // NOTE: we only reset the velocity gradient to zero for fixed particles **after** the deformation + // gradient update so we keep its elastic response. It acts as a cube of elastic material which + // center doesn’t move but can deform. + if (props.fixed != 0) { + kin.affine = Mat(0.0); + } + + /* + * Constitutive model. + */ + let update_data = ParticleUpdateData(dt, cell_width, particle_id, kin.affine, props.init_volume, props.phase); + let update_result = model.update(particles_model, update_data, def_grad); + + // Write back init_volume if the model modified it (e.g., swell). + let effective_init_volume = select(update_result.init_volume >= 0.0, update_result.init_volume, props.init_volume); + if (update_result.init_volume >= 0.0) { + particles_props[particle_id].init_volume = update_result.init_volume; + } + + /* + * Affine matrix for APIC transfer. + */ + let inv_d = QuadraticKernel::inv_d(cell_width); // NOTE: the velocity gradient was stored in the affine buffer. - def_grad += mul(def_grad, kin.affine * dt); - } else { - let def_grad0 = def_grad[0][0]; - var new_def_grad_diag_elt = def_grad0 + def_grad0 * kin.vel_grad_det * dt; - def_grad = diag(Vect(new_def_grad_diag_elt)); - } - - // NOTE: we only reset the velocity gradient to zero for fixed particles **after** the deformation - // gradient update so we keep its elastic response. It acts as a cube of elastic material which - // center doesn’t move but can deform. - if (props.fixed != 0) { - kin.affine = Mat(0.0); - } - - /* - * Constitutive model. - */ - - let update_data = ParticleUpdateData(dt, cell_width, particle_id, kin.affine, props.init_volume, props.phase); - let update_result = model.update(particles_model, update_data, def_grad); - - // Write back init_volume if the model modified it (e.g., swell). - let effective_init_volume = select(update_result.init_volume >= 0.0, update_result.init_volume, props.init_volume); - if (update_result.init_volume >= 0.0) { - particles_props[particle_id].init_volume = update_result.init_volume; - } - - /* - * Affine matrix for APIC transfer. - */ - let inv_d = QuadraticKernel::inv_d(cell_width); - // NOTE: the velocity gradient was stored in the affine buffer. - let affine = kin.affine * kin.mass - update_result.kirchoff_stress * (effective_init_volume * inv_d * dt); - - /* - * Write back the new particle properties. - */ - if (all(new_particle_pos == new_particle_pos) && determinant(def_grad) > 0.0) { // Avoid NaNs and invalid deformation gradients - particles_pos[particle_id].pt = new_particle_pos; - kin.affine = affine; - } else { - kin.enabled = 0; // This particles diverged, disable it. - kin.velocity = Vect(0.0); - def_grad = diag(Vect(1.0)); - kin.affine = Mat(0.0); - kin.mass = 0.0; - } - kin.force_dt = Vect(0.0); - - particles_kin[particle_id] = kin; - particles_def_grad[particle_id] = def_grad; + let affine = kin.affine * kin.mass - update_result.kirchoff_stress * (effective_init_volume * inv_d * dt); + + /* + * Write back the new particle properties. + */ + if (all(new_particle_pos == new_particle_pos) && determinant(def_grad) > 0.0) { // Avoid NaNs and invalid deformation gradients + particles_pos[particle_id].pt = new_particle_pos; + kin.affine = affine; + } else { + kin.enabled = 0; // This particles diverged, disable it. + kin.velocity = Vect(0.0); + def_grad = diag(Vect(1.0)); + kin.affine = Mat(0.0); + kin.mass = 0.0; + } + kin.force_dt = Vect(0.0); + + particles_kin[particle_id] = kin; + particles_def_grad[particle_id] = def_grad; } } diff --git a/shaders/slosh/solver/timestep_bound.slang b/shaders/slosh/solver/timestep_bound.slang index 9121933..18809ce 100644 --- a/shaders/slosh/solver/timestep_bound.slang +++ b/shaders/slosh/solver/timestep_bound.slang @@ -49,45 +49,44 @@ func estimate_timestep_bound( let _gs_n = particles_len; let _gs_total = min((_gs_n + (WORKGROUP_SIZE - 1u)) / WORKGROUP_SIZE, 65535u) * WORKGROUP_SIZE; for (var particle_id = invocation_id.x; particle_id < _gs_n; particle_id += _gs_total) { + if (particles_kin[particle_id].enabled == 0) { + continue; + } - if (particles_kin[particle_id].enabled == 0) { - continue; - } - - let cell_width = grid[0].cell_width; + let cell_width = grid[0].cell_width; - /* - * Model-specific restrictions. - * Will usually be based on sounds speed for the material (section 4.1). - */ - let density0 = particles_kin[particle_id].mass / particles_props[particle_id].init_volume; - let def_grad = particles_def_grad[particle_id]; - let velocity = particles_kin[particle_id].velocity; - let affine = particles_kin[particle_id].affine; - let mass = particles_kin[particle_id].mass; + /* + * Model-specific restrictions. + * Will usually be based on sounds speed for the material (section 4.1). + */ + let density0 = particles_kin[particle_id].mass / particles_props[particle_id].init_volume; + let def_grad = particles_def_grad[particle_id]; + let velocity = particles_kin[particle_id].velocity; + let affine = particles_kin[particle_id].affine; + let mass = particles_kin[particle_id].mass; - let model = ParticleModel(); - var dt = model.timestep_bound(particles_model, particle_id, density0, def_grad, velocity, cell_width); + let model = ParticleModel(); + var dt = model.timestep_bound(particles_model, particle_id, density0, def_grad, velocity, cell_width); - /* - * Velocity-based restrictions (section 4.2) - */ - var norm_affine_squared = 0.0; - [[ForceUnroll]] - for (var i = 0; i < DIM; i++) { - [[ForceUnroll]] - for (var j = 0; j < DIM; j++) { - norm_affine_squared += affine[i][j] * affine[i][j]; - } - } + /* + * Velocity-based restrictions (section 4.2) + */ + var norm_affine_squared = 0.0; + [[ForceUnroll]] + for (var i = 0; i < DIM; i++) { + [[ForceUnroll]] + for (var j = 0; j < DIM; j++) { + norm_affine_squared += affine[i][j] * affine[i][j]; + } + } - let d = (cell_width * cell_width) / 4.0; - let norm_b = d * sqrt(norm_affine_squared) / mass; - let apic_v = norm_b * 6.0 * sqrt(DIM) / cell_width; - let v = length(velocity); // + apic_v; - dt = min(dt, cell_width / v); + let d = (cell_width * cell_width) / 4.0; + let norm_b = d * sqrt(norm_affine_squared) / mass; + let apic_v = norm_b * 6.0 * sqrt(DIM) / cell_width; + let v = length(velocity); // + apic_v; + dt = min(dt, cell_width / v); - let candidate = GpuTimestepBounds::secs_to_int(dt); - result[0].computed_max_dt_as_uint.min(candidate); + let candidate = GpuTimestepBounds::secs_to_int(dt); + result[0].computed_max_dt_as_uint.min(candidate); } }