Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/slosh2d/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "slosh2d"
authors = ["Sébastien Crozet <sebcrozet@dimforge.com>"]
description = "Cross-platform GPU 2D Material Point Method implementation."
repository = "https://github.com/dimforge/slosh"
version = "0.6.0"
version = "0.6.1"
edition = "2024"
license = "Apache-2.0"

Expand Down
2 changes: 1 addition & 1 deletion crates/slosh3d/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "slosh3d"
authors = ["Sébastien Crozet <sebcrozet@dimforge.com>"]
description = "Cross-platform GPU 3D Material Point Method implementation."
repository = "https://github.com/dimforge/slosh"
version = "0.6.0"
version = "0.6.1"
edition = "2024"
license = "Apache-2.0"

Expand Down
4 changes: 3 additions & 1 deletion shaders/slosh/grid/grid.slang
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ public static const int OFF_BY_ONE = 1;
public struct ActiveBlockHeaderGeneric<MaybeAtomicUint> {
public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block.
public uint first_particle;
public MaybeAtomicUint num_particles;
public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block.
public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively.
}

public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric<uint>;
Expand Down Expand Up @@ -339,6 +340,7 @@ public func mark_block_as_active(
let block_header_id = grid[0].num_active_blocks.add(1u);
active_blocks[block_header_id].virtual_id = block;
active_blocks[block_header_id].first_particle = 0u;
active_blocks[block_header_id].num_particles_with_extras = 0u;
active_blocks[block_header_id].num_particles = 0u;
hmap_entries[slot].value = BlockHeaderId(block_header_id);
}
Expand Down
49 changes: 39 additions & 10 deletions shaders/slosh/grid/sort.slang
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,22 @@ func update_block_particle_count(
if (id < particles_len) {
let cell_width = grid[0].cell_width;
let particle = particles_pos[id];
let block_id = block_associated_to_point(cell_width, particle.pt);
let active_block_id = find_block_header_id(grid, hmap_entries, block_id);
active_blocks[active_block_id.id].num_particles.add(1u);

var blocks = blocks_associated_to_point(cell_width, particle.pt);
let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
active_blocks[active_block_id_0.id].num_particles.add(1u);
active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u);

let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
let mask = uint3(assoc >= 2);

for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
let bshift = blocks[i].id - blocks[0].id;
if (all((bshift * mask) == bshift)) {
let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
}
}
}
}

Expand All @@ -121,13 +134,13 @@ func copy_particles_len_to_scan_value(
) {
let id = invocation_id.x;
if (id < grid[0].num_active_blocks) {
scan_values[id] = active_blocks[id].num_particles;
scan_values[id] = active_blocks[id].num_particles_with_extras;
}
}

[shader("compute")]
[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
func copy_scan_values_to_first_particles(
func copy_scan_values_to_first_particles_and_prepare_for_finalize(
uint3 invocation_id: SV_DispatchThreadID,
StructuredBuffer<Grid> grid,
StructuredBuffer<uint> scan_values,
Expand All @@ -136,6 +149,8 @@ func copy_scan_values_to_first_particles(
let id = invocation_id.x;
if (id < grid[0].num_active_blocks) {
active_blocks[id].first_particle = scan_values[id];
active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles;
active_blocks[id].num_particles = 0u;
}
}

Expand All @@ -147,26 +162,40 @@ func finalize_particles_sort(
StructuredBuffer<GridHashMapEntry> hmap_entries,
StructuredBuffer<Position> particles_pos,
ConstantBuffer<uint> particles_len,
RWStructuredBuffer<Atomic<uint>> scan_values,
RWStructuredBuffer<AtomicNodeLinkedList> nodes_linked_lists,
RWStructuredBuffer<uint> particle_node_linked_lists,
RWStructuredBuffer<uint> sorted_particle_ids,
RWStructuredBuffer<AtomicActiveBlockHeader> active_blocks,

) {
let id = invocation_id.x;
if (id < particles_len) {
let cell_width = grid[0].cell_width;
let particle = particles_pos[id];
let block_id = block_associated_to_point(cell_width, particle.pt);

// Place the particle to its sorted place.
let active_block_id = find_block_header_id(grid, hmap_entries, block_id);
let target_index = scan_values[active_block_id.id].add(1u);
var blocks = blocks_associated_to_point(cell_width, particle.pt);
let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
let target_index = active_blocks[active_block_id_0.id].first_particle +
active_blocks[active_block_id_0.id].num_particles.add(1u);
sorted_particle_ids[target_index] = id;

let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
let mask = uint3(assoc >= 2);

for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
let bshift = blocks[i].id - blocks[0].id;
if (all((bshift * mask) == bshift)) {
let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
let target_index = active_blocks[active_block_id_i.id].first_particle +
active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
sorted_particle_ids[target_index] = id;
}
}

// Setup the per-node particle linked-list.
let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width);
let node_global_id = node_id(block_header_id_to_physical_id(active_block_id), node_local_id);
let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id);
let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id);
nodes_linked_lists[node_global_id.id].len.add(1u);
particle_node_linked_lists[id] = prev_head;
Expand Down
117 changes: 117 additions & 0 deletions shaders/slosh/solver/p2g_scatter_style.slang
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
module p2g;

import slosh.solver.params;
import slosh.solver.particle;
import slosh.solver.boundary_condition;
import slosh.grid.kernel;
import slosh.grid.grid;
import slosh.solver.rigid_impulses;
import slosh.rbd.dynamics.body;
import slosh.aliases;

#if DIM == 2
static const uint WORKGROUP_SIZE_X = 8;
static const uint WORKGROUP_SIZE_Y = 8;
static const uint WORKGROUP_SIZE_Z = 1;
#else
static const uint WORKGROUP_SIZE_X = 4;
static const uint WORKGROUP_SIZE_Y = 4;
static const uint WORKGROUP_SIZE_Z = 4;
#endif
static const uint WORKGROUP_SIZE = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y * WORKGROUP_SIZE_Z;

// Staging buffers for one workgroup-sized chunk of particles. The chunk is loaded
// cooperatively (one particle per thread) and then read by every cell-thread, so
// each particle is fetched from global memory exactly once per block.
#if DIM == 2
groupshared float2 shared_pos[WORKGROUP_SIZE];
groupshared float2 shared_momentum[WORKGROUP_SIZE];
groupshared float2x2 shared_affine[WORKGROUP_SIZE];
#else
groupshared float3 shared_pos[WORKGROUP_SIZE];
groupshared float3 shared_momentum[WORKGROUP_SIZE];
groupshared float3x3 shared_affine[WORKGROUP_SIZE];
#endif
groupshared float shared_mass[WORKGROUP_SIZE];

[shader("compute")]
[numthreads(WORKGROUP_SIZE, 1, 1)]
func p2g_scatter_style(
uint3 block_id: SV_GroupID,
uint tid: SV_GroupIndex,
StructuredBuffer<Grid> grid,
StructuredBuffer<ActiveBlockHeader> active_blocks,
StructuredBuffer<Position> particles_pos,
StructuredBuffer<Kinematics> particles_kin,
RWStructuredBuffer<Node> nodes,
StructuredBuffer<uint> sorted_particle_ids,
) {
let cell_width = grid[0].cell_width;
let inv_cell_width = 1.0 / cell_width;
let bid = block_id.x;

let first_particle = active_blocks[bid].first_particle;
let num_particles = active_blocks[bid].num_particles_with_extras;
let last_particle = first_particle + num_particles;
let block_vid = active_blocks[bid].virtual_id.id;

// Each thread owns one cell (grid node) of this block and accumulates the
// contribution of every particle assigned to the block into a register.
// This avoids both global atomics and the per-cell workgroup reduction that
// the previous implementation used (which serialized the whole workgroup with
// ~7 barriers per cell, i.e. hundreds of barriers per particle batch).
#if DIM == 2
let local_cell = int2(int(tid % 8u), int(tid / 8u));
let cell_pos = float2(block_vid * 8 + local_cell) * cell_width;
var acc = float3(0.0);
#else
let local_cell = int3(int(tid % 4u), int((tid / 4u) % 4u), int(tid / 16u));
let cell_pos = float3(block_vid * 4 + local_cell) * cell_width;
var acc = float4(0.0);
#endif

// Stream the block's particles through shared memory one chunk at a time.
for (var chunk_base = first_particle; chunk_base < last_particle; chunk_base += WORKGROUP_SIZE) {
// Wait for the previous chunk's readers before overwriting shared memory.
GroupMemoryBarrierWithGroupSync();

let load_idx = chunk_base + tid;
if (load_idx < last_particle) {
let pid = sorted_particle_ids[load_idx];
let kin = particles_kin[pid];
shared_pos[tid] = particles_pos[pid].pt;
shared_mass[tid] = kin.mass;
shared_affine[tid] = kin.affine;
shared_momentum[tid] = kin.velocity * kin.mass + kin.force_dt;
}

GroupMemoryBarrierWithGroupSync();

// `chunk_len` is uniform across the workgroup, so the barriers above stay in
// uniform control flow regardless of the total particle count.
let chunk_len = min(WORKGROUP_SIZE, last_particle - chunk_base);
for (var p = 0u; p < chunk_len; p += 1u) {
let dpt = cell_pos - shared_pos[p];
#if DIM == 2
let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
* QuadraticKernel::eval(dpt.y * inv_cell_width);
#else
let weight = QuadraticKernel::eval(dpt.x * inv_cell_width)
* QuadraticKernel::eval(dpt.y * inv_cell_width)
* QuadraticKernel::eval(dpt.z * inv_cell_width);
#endif
// The quadratic kernel is exactly zero outside the 3-node support, which
// is the common case for the dense cell x particle cross product. Skipping
// the affine matrix-vector product there is the bulk of the saved work.
if (weight != 0.0) {
let momentum = mul(dpt, shared_affine[p]) + shared_momentum[p];
acc += vector<float, DIM + 1>(momentum, shared_mass[p]) * weight;
}
}
}

// Write the accumulated node state to global memory. Every cell is written once
// (no atomics, no inter-block races), zeroing cells that received no contribution.
let global_chunk_id = block_header_id_to_physical_id(BlockHeaderId(bid));
nodes[global_chunk_id.id + tid].momentum_velocity_mass = acc;
}
3 changes: 2 additions & 1 deletion src/grid/grid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ impl<B: Backend> WgGrid<B> {
prefix_sum_module.launch(backend, pass, prefix_sum, &grid.scan_values)?;

sort_module
.copy_scan_values_to_first_particles
.copy_scan_values_to_first_particles_and_prepare_for_finalize
.launch_indirect(backend, pass, &args, grid.indirect_n_blocks_groups.buffer())?;

// Reset here so the linked list heads get reset before `finalize_particles_sort` which
Expand Down Expand Up @@ -263,6 +263,7 @@ impl Default for GpuGridHashMapEntry {
pub struct GpuActiveBlockHeader {
virtual_id: BlockVirtualId,
first_particle: u32,
num_particles_with_extras: u32,
num_particles: u32,
}

Expand Down
5 changes: 4 additions & 1 deletion src/grid/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@ use stensor::tensor::GpuScalar;
#[shader(module = "slosh::grid::sort")]
pub struct WgSort<B: Backend> {
pub(crate) touch_particle_blocks: GpuFunction<B>,
// Bound to GPU kernels; currently only used by commented-out rigid-particle code paths.
#[allow(dead_code)]
pub(crate) touch_rigid_particle_blocks: GpuFunction<B>,
#[allow(dead_code)]
pub(crate) mark_rigid_particles_needing_block: GpuFunction<B>,
pub(crate) update_block_particle_count: GpuFunction<B>,
pub(crate) copy_particles_len_to_scan_value: GpuFunction<B>,
pub(crate) copy_scan_values_to_first_particles: GpuFunction<B>,
pub(crate) copy_scan_values_to_first_particles_and_prepare_for_finalize: GpuFunction<B>,
pub(crate) finalize_particles_sort: GpuFunction<B>,
pub(crate) sort_rigid_particles: GpuFunction<B>,
}
Expand Down
25 changes: 21 additions & 4 deletions src/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ use crate::rbd::dynamics::body::{BodyCoupling, BodyCouplingEntry};
use crate::solver::{
GpuBoundaryCondition, GpuImpulses, GpuMaterials, GpuParticleModelData, GpuParticles,
GpuRigidParticles, GpuSimulationParams, GpuTimestepBounds, Particle, SimulationParams, WgG2P,
WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgParticleUpdate, WgRigidImpulses,
WgRigidParticleUpdate, WgTimestepBounds,
WgG2PCdf, WgGridUpdate, WgGridUpdateCdf, WgP2G, WgP2GCdf, WgP2GScatterStyle, WgParticleUpdate,
WgRigidImpulses, WgRigidParticleUpdate, WgTimestepBounds,
};
use rapier::dynamics::RigidBodySet;
use rapier::geometry::{ColliderHandle, ColliderSet};
Expand Down Expand Up @@ -43,13 +43,20 @@ pub struct MpmPipeline<B: Backend, GpuModel: GpuParticleModelData> {
grid: WgGrid<B>,
prefix_sum: WgPrefixSum<B>,
sort: WgSort<B>,
// Kept for the alternative/CDF code paths that are currently commented out in `step`.
#[allow(dead_code)]
p2g: WgP2G<B>,
p2g_scatter_style: WgP2GScatterStyle<B>,
#[allow(dead_code)]
p2g_cdf: WgP2GCdf<B>,
#[allow(dead_code)]
grid_update_cdf: WgGridUpdateCdf<B>,
grid_update: WgGridUpdate<B>,
particles_update: WgParticleUpdate<B>,
g2p: WgG2P<B>,
#[allow(dead_code)]
g2p_cdf: WgG2PCdf<B>,
#[allow(dead_code)]
rigid_particles_update: WgRigidParticleUpdate<B>,
/// Maximum timestep bound calculation.
pub timestep_bounds: WgTimestepBounds<B>,
Expand Down Expand Up @@ -342,6 +349,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
prefix_sum: WgPrefixSum::from_backend(backend, compiler)?,
sort: WgSort::from_backend(backend, compiler)?,
p2g: WgP2G::from_backend(backend, compiler)?,
p2g_scatter_style: WgP2GScatterStyle::from_backend(backend, compiler)?,
p2g_cdf: WgP2GCdf::from_backend(backend, compiler)?,
grid_update: WgGridUpdate::from_backend(backend, compiler)?,
grid_update_cdf: WgGridUpdateCdf::from_backend(backend, compiler)?,
Expand Down Expand Up @@ -479,7 +487,16 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {

{
let mut pass = encoder.begin_pass("p2g", timestamps.as_deref_mut());
self.p2g.launch(
// self.p2g.launch(
// backend,
// &mut pass,
// &data.grid,
// &data.particles,
// &data.impulses,
// &data.bodies,
// &data.body_materials,
// )?;
self.p2g_scatter_style.launch(
backend,
&mut pass,
&data.grid,
Expand Down Expand Up @@ -560,7 +577,7 @@ impl<B: Backend, GpuModel: GpuParticleModelData> MpmPipeline<B, GpuModel> {
)?;

{
let mut pass = encoder.begin_pass("integrate_bodies", timestamps.as_deref_mut());
let mut pass = encoder.begin_pass("integrate_bodies", timestamps);
// TODO: should this be in a separate pipeline? Within impulse probably?
self.impulses.launch(
backend,
Expand Down
5 changes: 0 additions & 5 deletions src/rbd/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
use slang_hal::re_exports::include_dir;

#[cfg(feature = "runtime")]
use slang_hal::re_exports::minislang::SlangCompiler;

/// GPU-accelerated rigid body dynamics simulation.
///
/// This module provides structures and methods for managing physics bodies
Expand Down
2 changes: 1 addition & 1 deletion src/solver/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

pub use g2p::WgG2P;
pub use g2p_cdf::WgG2PCdf;
pub use p2g::WgP2G;
pub use p2g::{WgP2G, WgP2GScatterStyle};
pub use p2g_cdf::WgP2GCdf;
pub use params::{GpuSimulationParams, SimulationParams};
pub use particle::*;
Expand Down
Loading
Loading