Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Extra compiler flags for builds that pick up this Cargo configuration.
[build]
# `-C target-cpu=native` lets rustc emit instructions for the CPU of the machine doing the build;
# the resulting binaries may not run on CPUs that lack those instruction-set extensions.
rustflags = ["-C", "target-cpu=native"]
27 changes: 27 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ publish = false

[dependencies]
capt = { path = "../capt", features = ["simd"] }
wide = { version = "1.1.1", default-features = false }
morton_filter = { path = "../morton_filter" }
kiddo = { version = "5.2.2", features = ["simd"], default-features = false }
rand = { version = "0.9.1", default-features = false }
Expand Down
17 changes: 7 additions & 10 deletions bench/src/bin/correctness.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#![feature(portable_simd)]

use std::simd::Simd;

use bench::{dist, kdt::PkdTree, parse_pointcloud_csv, parse_trace_csv, trace_r_range};
use wide::f32x8;
use capt::Capt;
use kiddo::SquaredEuclidean;
use rand::{seq::SliceRandom, Rng, SeedableRng};
Expand Down Expand Up @@ -65,19 +62,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let exact_dist = dist(kdt.get_point(kdt.query1_exact(*center)), *center);
assert_eq!(exact_dist, exact_kiddo_dist);

let simd_center: [Simd<f32, 8>; 3] = [
Simd::splat(center[0]),
Simd::splat(center[1]),
Simd::splat(center[2]),
let simd_center: [f32x8; 3] = [
f32x8::splat(center[0]),
f32x8::splat(center[1]),
f32x8::splat(center[2]),
];
if exact_dist <= *r {
println!("iter {i}: {:?} (collides)", (center, r));
assert!(aff_tree.collides(center, *r));
assert!(aff_tree.collides_simd(&simd_center, Simd::splat(*r)))
assert!(aff_tree.collides_simd(&simd_center, f32x8::splat(*r)))
} else {
println!("iter {i}: {:?} (no collides)", (center, r));
assert!(!aff_tree.collides(center, *r));
assert!(!aff_tree.collides_simd(&simd_center, Simd::splat(*r)))
assert!(!aff_tree.collides_simd(&simd_center, f32x8::splat(*r)))
}
}

Expand Down
9 changes: 3 additions & 6 deletions bench/src/bin/error.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
#![feature(portable_simd)]

use bench::{dist, fuzz_pointcloud, get_points, kdt::PkdTree, make_needles};
use kiddo::SquaredEuclidean;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha20Rng;

const N: usize = 1 << 16;
const L: usize = 16;
const D: usize = 3;

fn main() {
let mut rng = ChaCha20Rng::seed_from_u64(2707);
let mut starting_points = get_points(N);
fuzz_pointcloud(&mut starting_points, 0.001, &mut rng);
measure_error::<D, L>(&starting_points, &mut rng, 1 << 16)
measure_error::<D>(&starting_points, &mut rng, 1 << 16)
}

pub fn measure_error<const D: usize, const L: usize>(
pub fn measure_error<const D: usize>(
points: &[[f32; D]],
rng: &mut impl Rng,
n_trials: usize,
Expand All @@ -27,7 +24,7 @@ pub fn measure_error<const D: usize, const L: usize>(
kiddo_kdt.add(pt, 0);
}

let (seq_needles, _) = make_needles::<D, L>(rng, n_trials);
let (seq_needles, _) = make_needles::<D>(rng, n_trials);

for seq_needle in seq_needles {
let exact_kiddo_dist = kiddo_kdt
Expand Down
2 changes: 1 addition & 1 deletion bench/src/bin/forest_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ fn err_forest<const T: usize>(points: &[[f32; 3]], rng: &mut impl Rng) {
kiddo_kdt.add(pt, 0);
}

let (seq_needles, _) = make_needles::<3, 1>(rng, 10_000);
let (seq_needles, _) = make_needles::<3>(rng, 10_000);

let mut total_err = 0.0;
for &needle in &seq_needles {
Expand Down
21 changes: 8 additions & 13 deletions bench/src/bin/perf_plots.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
#![feature(portable_simd)]

use std::{
cmp::min, env::args, error::Error, fs::File, hint::black_box, io::Write, simd::f32x8,
time::Duration,
};
use std::{cmp::min, env::args, error::Error, fs::File, hint::black_box, io::Write, time::Duration};

use bench::{
forest::PkdForest, fuzz_pointcloud, kdt::PkdTree, parse_pointcloud_csv, parse_trace_csv,
simd_trace_new, stopwatch, SimdTrace, Trace,
};
use wide::f32x8;
use capt::Capt;
#[allow(unused_imports)]
use kiddo::SquaredEuclidean;
Expand All @@ -17,13 +13,12 @@ use rand::{seq::SliceRandom, Rng};
use rand_chacha::{rand_core::SeedableRng, ChaCha20Rng};

const N_TRIALS: usize = 100_000;
const L: usize = 8;

const QUERY_RADIUS: f32 = 0.05;

struct Benchmark<'a> {
seq: &'a Trace,
simd: &'a SimdTrace<L>,
simd: &'a SimdTrace,
f_query: File,
}

Expand Down Expand Up @@ -89,7 +84,7 @@ fn main() -> Result<(), Box<dyn Error>> {
println!("number of tests: {}", all_trace.len());
println!("radius range: {r_range:?}");

let captree = Capt::<3>::new(&points, r_range, L);
let captree = Capt::<3>::new(&points, r_range, 8);

let collide_trace: Box<Trace> = all_trace
.iter()
Expand Down Expand Up @@ -156,7 +151,7 @@ fn do_row(

let (pkdt, pkdt_time) = stopwatch(|| PkdTree::new(points));

let (captree, captree_time) = stopwatch(|| Capt::<3, f32, u32>::new(points, r_range, L));
let (captree, captree_time) = stopwatch(|| Capt::<3, f32, u32>::new(points, r_range, 8));

let (f1, f1_time) = stopwatch(|| PkdForest::<3, 1>::new(points));
let (f2, f2_time) = stopwatch(|| PkdForest::<3, 2>::new(points));
Expand Down Expand Up @@ -224,7 +219,7 @@ fn do_row(
});
let (_, pkdt_total_simd_q_time) = stopwatch(|| {
for (centers, radii) in simd_trace.iter() {
black_box(pkdt.might_collide_simd(centers, radii * radii));
black_box(pkdt.might_collide_simd(centers, *radii * *radii));
}
});
let (_, captree_total_seq_q_time) = stopwatch(|| {
Expand All @@ -234,7 +229,7 @@ fn do_row(
});
let (_, captree_total_simd_q_time) = stopwatch(|| {
for (centers, radii) in simd_trace.iter() {
black_box(captree.collides_simd(centers, radii * radii));
black_box(captree.collides_simd(centers, *radii * *radii));
}
});

Expand Down Expand Up @@ -280,7 +275,7 @@ fn bench_forest<const T: usize>(
) -> Duration {
stopwatch(|| {
for (centers, radii) in simd_trace {
black_box(forest.might_collide_simd(centers, radii * radii));
black_box(forest.might_collide_simd(centers, *radii * *radii));
}
})
.1
Expand Down
69 changes: 27 additions & 42 deletions bench/src/forest.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
//! Power-of-two k-d forests.

use std::simd::{cmp::SimdPartialOrd, ptr::SimdConstPtr, Mask, Simd};

use crate::{distsq, median_partition};
use wide::{f32x8, i32x8, CmpGe};

#[derive(Clone, Debug)]
struct RandomizedTree<const K: usize> {
Expand Down Expand Up @@ -53,26 +52,25 @@ impl<const K: usize, const T: usize> PkdForest<K, T> {
}

#[must_use]
pub fn might_collide_simd<const L: usize>(
&self,
needles: &[Simd<f32, L>; K],
radii_squared: Simd<f32, L>,
) -> bool {
let mut not_yet_collided = Mask::splat(true);
#[allow(clippy::cast_sign_loss)]
pub fn might_collide_simd(&self, needles: &[f32x8; K], radii_squared: f32x8) -> bool {
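// Start with every lane marked "not yet collided": a mask with all bits set in each lane.
// SAFETY: `transmute` only compiles for equally sized types, and a lane of all-one bits is a
// (NaN) `f32` bit pattern, so this is a plain bit reinterpretation of `i32x8::splat(-1)`.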
let all_true: f32x8 =
unsafe { core::mem::transmute::<i32x8, f32x8>(i32x8::splat(-1_i32)) };
let mut not_yet_collided = all_true;

for tree in &self.test_seqs {
let indices = tree.mask_query(needles, not_yet_collided);
let mut dists_sq = Simd::splat(0.0);
let mut ptrs = Simd::splat(tree.points.as_ptr().cast()).wrapping_offset(indices);
for needle_set in needles {
let diffs =
unsafe { Simd::gather_select_ptr(ptrs, not_yet_collided, Simd::splat(0.0)) }
- needle_set;
dists_sq += diffs * diffs;
ptrs = ptrs.wrapping_add(Simd::splat(1));
let indices = tree.forward_pass_wide(needles);
let idx_arr = indices.to_array();
let mut dists_sq = f32x8::ZERO;
for (k, needle_values) in needles.iter().enumerate() {
let vals = f32x8::new(idx_arr.map(|i| tree.points[i as usize][k]));
let diffs = vals - needle_values;
dists_sq = dists_sq + diffs * diffs;
}

not_yet_collided &= radii_squared.simd_lt(dists_sq).cast();
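// lanes where dists_sq >= radii_squared have not (yet) collided
// NOTE(review): the test this replaces, `radii_squared.simd_lt(dists_sq)`, kept a lane only when
// radii_squared < dists_sq, so a lane with dists_sq == radii_squared used to count as collided
// and no longer does - confirm the boundary change is intended.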
not_yet_collided = not_yet_collided & dists_sq.simd_ge(radii_squared);

if !not_yet_collided.all() {
// at least one has collided - can return quickly
Expand Down Expand Up @@ -145,37 +143,24 @@ impl<const K: usize> RandomizedTree<K> {
test_idx - self.tests.len()
}

#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
/// Perform a masked SIMD query of this tree, only determining the location of the nearest
/// neighbors for points in `mask`.
fn mask_query<const L: usize>(
&self,
needles: &[Simd<f32, L>; K],
mask: Mask<isize, L>,
) -> Simd<isize, L> {
let mut test_idxs: Simd<isize, L> = Simd::splat(0);
#[allow(clippy::cast_sign_loss)]
fn forward_pass_wide(&self, needles: &[f32x8; K]) -> i32x8 {
let mut test_idxs = i32x8::splat(0_i32);
let mut state = self.seed;

// Advance the tests forward
for _ in 0..self.tests.len().trailing_ones() {
let relevant_tests: Simd<f32, L> = unsafe {
Simd::gather_select_ptr(
Simd::splat(self.tests.as_ptr().cast()).wrapping_offset(test_idxs),
mask,
Simd::splat(f32::NAN),
)
};
let idx_arr = test_idxs.to_array();
let relevant_tests =
f32x8::new(idx_arr.map(|i| unsafe { *self.tests.get_unchecked(i as usize) }));
let d = state as usize % K;
let cmp_results: Mask<isize, L> = (needles[d].simd_ge(relevant_tests)).into();

// TODO is there a faster way than using a conditional select?
test_idxs <<= Simd::splat(1);
test_idxs += Simd::splat(1);
test_idxs += cmp_results.to_simd() & Simd::splat(1);
let cmp_f = needles[d].simd_ge(relevant_tests);
let cmp_bit: i32x8 =
unsafe { core::mem::transmute::<f32x8, i32x8>(cmp_f) } & i32x8::splat(1);
test_idxs = (test_idxs << 1_i32) + 1_i32 + cmp_bit;
state = xorshift(state);
}

test_idxs - Simd::splat(self.tests.len() as isize)
test_idxs - i32x8::splat(self.tests.len() as i32)
}
}

Expand Down
Loading
Loading