Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions shaders/slosh/grid/grid.slang
Original file line number Diff line number Diff line change
Expand Up @@ -211,13 +211,50 @@ public static const uint NUM_ASSOC_BLOCKS = 4;
#else
public static const uint NUM_ASSOC_BLOCKS = 8;
#endif
// Count of the +1 neighbour blocks of a primary block, i.e. every associated block except the
// primary itself.
public static const uint NUM_NBH_BLOCKS = NUM_ASSOC_BLOCKS - 1;
public static const int OFF_BY_ONE = 1;

// Dimension-dependent layout of the within-block counting sort.
//
// NUM_PRIMARY_SORT_BUCKETS
//   Number of "slab" buckets for regular particles, i.e. particles whose primary block is the
//   block being sorted. There is one bucket per associated-cell slab along the slowest-varying
//   node axis (y in 2D, z in 3D), so the sorted order lets P2G derive tight per-chunk slab
//   bounds for culling.
//
// NUM_EXTRA_SORT_BUCKETS
//   Number of slab buckets for "extras" (particles spilling in from a neighbour block). Their
//   associated slab relative to this block lies in [-2, block_width - 1], clamped at -2 because
//   slabs below -2 cannot influence any node of the block anyway. That is [-2, 7] -> 10 buckets
//   in 2D and [-2, 3] -> 6 buckets in 3D.
//
// EXTRA_PARTICLE_MIN_SHIFT
//   Smallest local base-cell index along an axis at which a particle's quadratic stencil spills
//   into the +1 neighbour block along that axis.
#if DIM == 2
public static const uint NUM_PRIMARY_SORT_BUCKETS = 8;
public static const uint NUM_EXTRA_SORT_BUCKETS = 10;
public static const uint EXTRA_PARTICLE_MIN_SHIFT = 6;
#else
public static const uint NUM_PRIMARY_SORT_BUCKETS = 4;
public static const uint NUM_EXTRA_SORT_BUCKETS = 6;
public static const uint EXTRA_PARTICLE_MIN_SHIFT = 2;
#endif

// Total number of within-block sort buckets. The primary buckets are laid out first so that
// primaries end up contiguous in [first_particle, first_particle + num_particles), which G2P
// relies on.
public static const uint NUM_SORT_BUCKETS = NUM_PRIMARY_SORT_BUCKETS + NUM_EXTRA_SORT_BUCKETS;

// Per-active-block bookkeeping record, generic over the type used for its counters.
// `ActiveBlockHeader` instantiates it with a plain `uint`. The count/finalize passes call
// `.add(...)` on the counters, so presumably an atomic instantiation is used there
// (NOTE(review): confirm which alias those passes bind).
public struct ActiveBlockHeaderGeneric<MaybeAtomicUint> {
public BlockVirtualId virtual_id; // Needed to compute the world-space position of a block.
// Start of this block's segment in the sorted particle array. Primaries occupy
// [first_particle, first_particle + num_particles).
public uint first_particle;
public MaybeAtomicUint num_particles_with_extras; // Total count of particles contributing to this block.
public MaybeAtomicUint num_particles; // Count of particles assigned to this block exclusively.
// Per-slab-bucket cursors for the within-block counting sort. The count pass accumulates
// per-bucket counts here; the prepare pass converts them in place to absolute insertion
// cursors (first_particle baked in); the finalize pass increments them while inserting.
// The resulting segment is ordered by slab key, which P2G uses for per-chunk culling.
// Indices [0, NUM_PRIMARY_SORT_BUCKETS) are the primary buckets; the extra buckets follow.
public MaybeAtomicUint sort_bucket_cursors[NUM_SORT_BUCKETS];
// Header IDs of the +1 neighbour blocks, precomputed once per block to avoid repeated
// hashmap lookups in the count/finalize passes. Inactive neighbours are stored as NONE.
// Entry `k` corresponds to associated block `k + 1` (associated block 0 is the block itself).
public BlockHeaderId nbh_block_ids[NUM_NBH_BLOCKS];
}

public typealias ActiveBlockHeader = ActiveBlockHeaderGeneric<uint>;
Expand Down Expand Up @@ -342,6 +379,9 @@ public func mark_block_as_active(
active_blocks[block_header_id].first_particle = 0u;
active_blocks[block_header_id].num_particles_with_extras = 0u;
active_blocks[block_header_id].num_particles = 0u;
for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
active_blocks[block_header_id].sort_bucket_cursors[k] = 0u;
}
hmap_entries[slot].value = BlockHeaderId(block_header_id);
}
}
Expand All @@ -364,6 +404,21 @@ func div_ceil(x: uint, y: uint) -> uint {
return (x + y - 1) / y;
}

// Copies `grid[0].num_active_blocks` into the single-element buffer `num_base_blocks`.
//
// This is the bridge between the two passes of block activation: it runs once
// `touch_primary_blocks` has activated every particle's base block, so the stored value is the
// number of base blocks. `touch_neighbor_blocks` bounds its iteration with that value and
// therefore never visits the neighbour blocks it appends itself.
[shader("compute")]
[numthreads(1, 1, 1)]
func capture_num_active_blocks(
    uint3 invocation_id: SV_DispatchThreadID,
    StructuredBuffer<Grid> grid,
    RWStructuredBuffer<uint> num_base_blocks,
) {
    // Single-invocation kernel: `invocation_id` is not consulted.
    let base_block_count = grid[0].num_active_blocks;
    num_base_blocks[0] = base_block_count;
}

[shader("compute")]
[numthreads(1, 1, 1)]
func init_indirect_workgroups(
Expand Down
195 changes: 168 additions & 27 deletions shaders/slosh/grid/sort.slang
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,113 @@ module sort;
import slosh.grid.grid;
import slosh.solver.particle;

// Sort bucket of a particle that is counted/inserted into its own primary block.
//
// The bucket is the particle's associated-cell slab along the slowest-varying node axis, i.e.
// the y component of `assoc` in 2D and the z component in 3D.
func primary_sort_bucket(assoc: vector<uint, DIM>) -> uint {
#if DIM == 2
    let slab = assoc.y;
#else
    let slab = assoc.z;
#endif
    return slab;
}

// Sort bucket of a particle that is counted/inserted as an "extra" into the neighbour block
// located `bshift` blocks away from its primary block.
//
// `assoc` is expressed in the primary block's frame, so the slab is first re-expressed in the
// neighbour block's frame, where it may be negative. The quadratic stencil only covers slabs
// [assoc, assoc + 2], so slabs below -2 cannot influence any node of that block and are clamped
// to -2. The clamped slab is then offset past the primary buckets.
func extra_sort_bucket(assoc: vector<uint, DIM>, bshift: vector<int, DIM>) -> uint {
    // Slab index relative to the neighbour block (8 cells per block in 2D, 4 in 3D).
#if DIM == 2
    let slab_in_nbh = int(assoc.y) - bshift.y * 8;
#else
    let slab_in_nbh = int(assoc.z) - bshift.z * 4;
#endif
    // Clamp at -2, then shift so that slab -2 maps to the first extra bucket.
    let clamped_slab = max(slab_in_nbh, -2);
    return NUM_PRIMARY_SORT_BUCKETS + uint(clamped_slab + 2);
}

// Tells whether the quadratic stencil of a particle with local base-cell index `assoc` reaches
// into the +1 neighbour block selected by `bshift` (each component is 0 or 1).
//
// Along every axis where `bshift` is 1 the particle must sit at or beyond
// EXTRA_PARTICLE_MIN_SHIFT; axes where `bshift` is 0 impose no constraint.
func extra_spills(assoc: vector<uint, DIM>, bshift: vector<int, DIM>) -> bool {
    let shift = vector<uint, DIM>(bshift);
    // 1 on the axes where the stencil crosses into the +1 block, 0 elsewhere.
    let crosses = vector<uint, DIM>(assoc >= EXTRA_PARTICLE_MIN_SHIFT);
    // Masking the shift leaves it unchanged only if every shifted axis is also a crossing axis.
    let masked_shift = shift * crosses;
    return all(masked_shift == shift);
}

// Activates the primary (base) block of every particle, and nothing else.
//
// This is the first of the two block-activation passes that replace `touch_particle_blocks`.
// Having every particle insert all NUM_ASSOC_BLOCKS blocks of its stencil is roughly
// NUM_ASSOC_BLOCKS× redundant; here each particle inserts only its base block, and
// `touch_neighbor_blocks` later inserts the +1 neighbour blocks once per active base block.
//
// One invocation per particle; invocations past `particles_len` do nothing.
[shader("compute")]
[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
func touch_primary_blocks(
    uint3 invocation_id: SV_DispatchThreadID,
    RWStructuredBuffer<AtomicGrid> grid,
    RWStructuredBuffer<AtomicGridHashMapEntry> hmap_entries,
    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
    StructuredBuffer<Position> particles_pos,
    ConstantBuffer<uint> particles_len,
) {
    let particle_id = invocation_id.x;
    if (particle_id >= particles_len) {
        return;
    }

    // Locate the block containing the particle and make sure it is registered as active.
    let base_block = block_associated_to_point(grid[0].cell_width, particles_pos[particle_id].pt);
    mark_block_as_active(grid, hmap_entries, active_blocks, base_block);
}

// Activates the +1 neighbour blocks of every base block that is already active.
//
// This is the second of the two block-activation passes. One invocation per base block: it
// reads the block's virtual id and inserts its NUM_NBH_BLOCKS forward neighbours into the
// hashmap. `num_base_blocks[0]` is a snapshot of `num_active_blocks` taken before this pass,
// so blocks appended while the pass runs (indices >= num_base_blocks[0]) are never processed
// themselves.
[shader("compute")]
[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
func touch_neighbor_blocks(
    uint3 invocation_id: SV_DispatchThreadID,
    RWStructuredBuffer<AtomicGrid> grid,
    RWStructuredBuffer<AtomicGridHashMapEntry> hmap_entries,
    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
    StructuredBuffer<uint> num_base_blocks,
) {
    let base_block_index = invocation_id.x;
    if (base_block_index >= num_base_blocks[0]) {
        return;
    }

    var assoc_blocks = blocks_associated_to_block(active_blocks[base_block_index].virtual_id);
    // Entry 0 is the base block itself (already active); the neighbours follow it.
    for (var nbh = 0u; nbh < NUM_NBH_BLOCKS; nbh += 1u) {
        mark_block_as_active(grid, hmap_entries, active_blocks, assoc_blocks[nbh + 1u]);
    }
}

// Caches, in every active block header, the header IDs of that block's +1 neighbour blocks.
//
// Particles contribute "extras" to the +1 neighbours of their primary block. Resolving those
// neighbours through the hashmap once per particle in the count/finalize passes is wasteful,
// so this pass resolves them once per active block and stores the result in `nbh_block_ids`.
// Inactive neighbours are stored as NONE. Must run after all blocks have been touched.
//
// One invocation per active block; invocations past `num_active_blocks` do nothing.
[shader("compute")]
[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
func update_nbh_block_ids(
    uint3 invocation_id: SV_DispatchThreadID,
    StructuredBuffer<Grid> grid,
    StructuredBuffer<GridHashMapEntry> hmap_entries,
    RWStructuredBuffer<ActiveBlockHeader> active_blocks,
) {
    let block_index = invocation_id.x;
    if (block_index >= grid[0].num_active_blocks) {
        return;
    }

    var assoc_blocks = blocks_associated_to_block(active_blocks[block_index].virtual_id);
    // Associated block 0 is the block itself, so neighbour slot `i - 1` holds associated block `i`.
    for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
        let header_id = find_block_header_id(grid, hmap_entries, assoc_blocks[i]);
        active_blocks[block_index].nbh_block_ids[i - 1u] = header_id;
    }
}

[shader("compute")]
[numthreads(GRID_WORKGROUP_SIZE, 1, 1)]
func touch_particle_blocks(
Expand Down Expand Up @@ -109,18 +216,24 @@ func update_block_particle_count(
let particle = particles_pos[id];

var blocks = blocks_associated_to_point(cell_width, particle.pt);
let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
active_blocks[active_block_id_0.id].num_particles.add(1u);
active_blocks[active_block_id_0.id].num_particles_with_extras.add(1u);

let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
let mask = uint3(assoc >= 2);

// The particle's primary (base) block gets it as a regular particle. Only the per-slab
// bucket counter is incremented: `num_particles` (sum of primary buckets) and
// `num_particles_with_extras` (sum of all buckets) are derived per block in the copy pass,
// so we avoid two extra — and heavily contended — atomics per particle here.
let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u);

// Each +1 neighbour block also receives the particle as an "extra" if its quadratic
// stencil actually spills into it. The neighbour header IDs were precomputed by
// `update_nbh_block_ids`, so we read them from the primary block instead of doing a
// hashmap lookup per particle.
for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
let bshift = blocks[i].id - blocks[0].id;
if (all((bshift * mask) == bshift)) {
let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
if (extra_spills(assoc, bshift)) {
let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1];
active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u);
}
}
}
Expand All @@ -136,7 +249,14 @@ func copy_particles_len_to_scan_value(
) {
let id = invocation_id.x;
if (id < grid[0].num_active_blocks) {
scan_values[id] = active_blocks[id].num_particles_with_extras;
// The sorted array reserves room for every particle a block touches, extras included.
// `num_particles_with_extras` is the sum of all slab buckets (the count pass no longer
// maintains it as a separate atomic).
var total = 0u;
for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
total += active_blocks[id].sort_bucket_cursors[k];
}
scan_values[id] = total;
}
}

Expand All @@ -150,9 +270,28 @@ func copy_scan_values_to_first_particles_and_prepare_for_finalize(
) {
let id = invocation_id.x;
if (id < grid[0].num_active_blocks) {
active_blocks[id].first_particle = scan_values[id];
active_blocks[id].num_particles_with_extras = active_blocks[id].num_particles;
active_blocks[id].num_particles = 0u;
let first = scan_values[id];
active_blocks[id].first_particle = first;
// Convert the per-bucket counts accumulated by the count pass into running insertion
// cursors. The cursors are *absolute* offsets into the sorted array (i.e. `first_particle`
// is baked in), so the finalize pass can scatter each particle with a single atomic.
// Primary buckets come first, so primaries land in [first_particle, first_particle +
// num_particles) as the scatter P2G expects, with the extras after them; both segments
// end up ordered by slab key.
//
// The running total advanced past the primary buckets is `num_particles`; the grand total
// is `num_particles_with_extras`. Both are derived here rather than maintained as
// per-particle atomics in the count pass.
var running = first;
for (var k = 0u; k < NUM_SORT_BUCKETS; k += 1u) {
if (k == NUM_PRIMARY_SORT_BUCKETS) {
active_blocks[id].num_particles = running - first;
}
let count = active_blocks[id].sort_bucket_cursors[k];
active_blocks[id].sort_bucket_cursors[k] = running;
running += count;
}
active_blocks[id].num_particles_with_extras = running - first;
}
}

Expand All @@ -176,29 +315,31 @@ func finalize_particles_sort(
let cell_width = grid[0].cell_width;
let particle = particles_pos[id];

// Place the particle to its sorted place.
var blocks = blocks_associated_to_point(cell_width, particle.pt);
let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
let target_index = active_blocks[active_block_id_0.id].first_particle +
active_blocks[active_block_id_0.id].num_particles.add(1u);
sorted_particle_ids[target_index] = id;

let assoc = associated_cell_index_in_block_off_by_one(particle, cell_width);
let mask = uint3(assoc >= 2);

// Place the particle in its primary block's range. The prepare pass turned the bucket
// counts into absolute insertion cursors (first_particle baked in), so the atomically
// claimed slot is already the final sorted index.
let active_block_id_0 = find_block_header_id(grid, hmap_entries, blocks[0]);
let slot0 = active_blocks[active_block_id_0.id].sort_bucket_cursors[primary_sort_bucket(assoc)].add(1u);
sorted_particle_ids[slot0] = id;

// Place the particle as an "extra" into each +1 neighbour block whose stencil it spills
// into, using the extra slab bucket cursors (extras land after the primaries because
// their buckets come last). Neighbour header IDs are reused from `update_nbh_block_ids`.
for (var i = 1u; i < NUM_ASSOC_BLOCKS; i += 1u) {
let bshift = blocks[i].id - blocks[0].id;
if (all((bshift * mask) == bshift)) {
let active_block_id_i = find_block_header_id(grid, hmap_entries, blocks[i]);
let target_index = active_blocks[active_block_id_i.id].first_particle +
active_blocks[active_block_id_i.id].num_particles_with_extras.add(1u);
sorted_particle_ids[target_index] = id;
if (extra_spills(assoc, bshift)) {
let block_i = active_blocks[active_block_id_0.id].nbh_block_ids[i - 1];
let slot_i = active_blocks[block_i.id].sort_bucket_cursors[extra_sort_bucket(assoc, bshift)].add(1u);
sorted_particle_ids[slot_i] = id;
}
}

// Setup the per-node particle linked-list.
let node_local_id = associated_cell_index_in_block_off_by_one(particle, cell_width);
let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), node_local_id);
// Setup the per-node particle linked-list (still consumed by the gather P2G and the
// rigid/CDF paths).
let node_global_id = node_id(block_header_id_to_physical_id(active_block_id_0), assoc);
let prev_head = nodes_linked_lists[node_global_id.id].head.exchange(id);
nodes_linked_lists[node_global_id.id].len.add(1u);
particle_node_linked_lists[id] = prev_head;
Expand Down
Loading
Loading