Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 115 additions & 12 deletions benches/simd_math/inverse_trig.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
use criterion::Criterion;
use simdeez::math::SimdMathF32InverseTrig;
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
use simdeez::math::{SimdMathF32InverseTrig, SimdMathF64InverseTrig};
use simdeez::scalar::Scalar;
use simdeez::{prelude::*, simd_unsafe_generate_all};

use crate::shared::{self, BenchTargets, INPUT_LEN};
use crate::shared::{self, BenchTargets, BenchTargetsF64, INPUT_LEN};

#[inline(never)]
fn scalar_asin_sum(input: &[f32]) -> f32 {
Expand All @@ -21,6 +20,21 @@ fn scalar_atan_sum(input: &[f32]) -> f32 {
input.iter().copied().map(f32::atan).sum()
}

/// Baseline for the f64 asin benchmarks: adds up the standard-library
/// arcsine of every element in `input`.
#[inline(never)]
fn scalar_asin_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|&x| x.asin()).sum()
}

/// Baseline for the f64 acos benchmarks: adds up the standard-library
/// arccosine of every element in `input`.
#[inline(never)]
fn scalar_acos_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|&x| x.acos()).sum()
}

/// Baseline for the f64 atan benchmarks: adds up the standard-library
/// arctangent of every element in `input`.
#[inline(never)]
fn scalar_atan_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|&x| x.atan()).sum()
}

simd_unsafe_generate_all!(
fn simdeez_asin_sum(input: &[f32]) -> f32 {
shared::simdeez_sum_impl::<S>(input, |v| v.asin_u35())
Expand All @@ -39,27 +53,59 @@ simd_unsafe_generate_all!(
}
);

#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
// f64 asin benchmark kernel: feeds each loaded vector through `asin_u35` and
// lets `shared::simdeez_sum_impl_f64` accumulate the per-vector horizontal sums.
// NOTE(review): `simd_unsafe_generate_all!` presumably emits the runtime-dispatched
// `simdeez_asin_sum_f64` plus the `_sse2` / `_sse41` / `_avx2` / `_avx512` variants
// that `register` references — confirm against the macro definition.
simd_unsafe_generate_all!(
fn simdeez_asin_sum_f64(input: &[f64]) -> f64 {
shared::simdeez_sum_impl_f64::<S>(input, |v| v.asin_u35())
}
);

// f64 acos benchmark kernel: feeds each loaded vector through `acos_u35` and
// lets `shared::simdeez_sum_impl_f64` accumulate the per-vector horizontal sums.
// NOTE(review): `simd_unsafe_generate_all!` presumably emits the runtime-dispatched
// `simdeez_acos_sum_f64` plus the `_sse2` / `_sse41` / `_avx2` / `_avx512` variants
// that `register` references — confirm against the macro definition.
simd_unsafe_generate_all!(
fn simdeez_acos_sum_f64(input: &[f64]) -> f64 {
shared::simdeez_sum_impl_f64::<S>(input, |v| v.acos_u35())
}
);

// f64 atan benchmark kernel: feeds each loaded vector through `atan_u35` and
// lets `shared::simdeez_sum_impl_f64` accumulate the per-vector horizontal sums.
// NOTE(review): `simd_unsafe_generate_all!` presumably emits the runtime-dispatched
// `simdeez_atan_sum_f64` plus the `_sse2` / `_sse41` / `_avx2` / `_avx512` variants
// that `register` references — confirm against the macro definition.
simd_unsafe_generate_all!(
fn simdeez_atan_sum_f64(input: &[f64]) -> f64 {
shared::simdeez_sum_impl_f64::<S>(input, |v| v.atan_u35())
}
);

#[inline(never)]
fn simdeez_asin_sum_scalar(input: &[f32]) -> f32 {
fn forced_scalar_asin_sum(input: &[f32]) -> f32 {
shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.asin_u35())
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
#[inline(never)]
fn simdeez_acos_sum_scalar(input: &[f32]) -> f32 {
fn forced_scalar_acos_sum(input: &[f32]) -> f32 {
shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.acos_u35())
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
#[inline(never)]
fn simdeez_atan_sum_scalar(input: &[f32]) -> f32 {
fn forced_scalar_atan_sum(input: &[f32]) -> f32 {
shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.atan_u35())
}

/// Sums `asin_u35` over `input` using the `Scalar` engine's `Vf64` type,
/// delegating the accumulation loop to `shared::force_scalar_sum_f64`.
///
/// Passed to `BenchTargetsF64` as the `simdeez_scalar` target for the
/// `simd_math/f64/asin_u35` group.
#[inline(never)]
fn forced_scalar_asin_sum_f64(input: &[f64]) -> f64 {
shared::force_scalar_sum_f64(input, |v: <Scalar as Simd>::Vf64| v.asin_u35())
}

/// Sums `acos_u35` over `input` using the `Scalar` engine's `Vf64` type,
/// delegating the accumulation loop to `shared::force_scalar_sum_f64`.
///
/// Passed to `BenchTargetsF64` as the `simdeez_scalar` target for the
/// `simd_math/f64/acos_u35` group.
#[inline(never)]
fn forced_scalar_acos_sum_f64(input: &[f64]) -> f64 {
shared::force_scalar_sum_f64(input, |v: <Scalar as Simd>::Vf64| v.acos_u35())
}

/// Sums `atan_u35` over `input` using the `Scalar` engine's `Vf64` type,
/// delegating the accumulation loop to `shared::force_scalar_sum_f64`.
///
/// Passed to `BenchTargetsF64` as the `simdeez_scalar` target for the
/// `simd_math/f64/atan_u35` group.
#[inline(never)]
fn forced_scalar_atan_sum_f64(input: &[f64]) -> f64 {
shared::force_scalar_sum_f64(input, |v: <Scalar as Simd>::Vf64| v.atan_u35())
}

pub fn register(c: &mut Criterion) {
let inverse_inputs = shared::make_inverse_trig_inputs(INPUT_LEN, 0xA11C_E101);
let atan_inputs = shared::make_atan_inputs(INPUT_LEN, 0xA11C_E102);
let inverse_inputs_f64 = shared::make_inverse_trig_inputs_f64(INPUT_LEN, 0xA11C_E201);
let atan_inputs_f64 = shared::make_atan_inputs_f64(INPUT_LEN, 0xA11C_E202);

shared::bench_variants(
c,
Expand All @@ -68,7 +114,7 @@ pub fn register(c: &mut Criterion) {
BenchTargets {
scalar_native: scalar_asin_sum,
simdeez_runtime: simdeez_asin_sum,
simdeez_scalar: simdeez_asin_sum_scalar,
simdeez_scalar: forced_scalar_asin_sum,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse2: simdeez_asin_sum_sse2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Expand All @@ -87,7 +133,7 @@ pub fn register(c: &mut Criterion) {
BenchTargets {
scalar_native: scalar_acos_sum,
simdeez_runtime: simdeez_acos_sum,
simdeez_scalar: simdeez_acos_sum_scalar,
simdeez_scalar: forced_scalar_acos_sum,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse2: simdeez_acos_sum_sse2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Expand All @@ -106,7 +152,7 @@ pub fn register(c: &mut Criterion) {
BenchTargets {
scalar_native: scalar_atan_sum,
simdeez_runtime: simdeez_atan_sum,
simdeez_scalar: simdeez_atan_sum_scalar,
simdeez_scalar: forced_scalar_atan_sum,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse2: simdeez_atan_sum_sse2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Expand All @@ -117,4 +163,61 @@ pub fn register(c: &mut Criterion) {
simdeez_avx512: simdeez_atan_sum_avx512,
},
);

shared::bench_variants_f64(
c,
"simd_math/f64/asin_u35",
&inverse_inputs_f64,
BenchTargetsF64 {
scalar_native: scalar_asin_sum_f64,
simdeez_runtime: simdeez_asin_sum_f64,
simdeez_scalar: forced_scalar_asin_sum_f64,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse2: simdeez_asin_sum_f64_sse2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse41: simdeez_asin_sum_f64_sse41,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_avx2: simdeez_asin_sum_f64_avx2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_avx512: simdeez_asin_sum_f64_avx512,
},
);

shared::bench_variants_f64(
c,
"simd_math/f64/acos_u35",
&inverse_inputs_f64,
BenchTargetsF64 {
scalar_native: scalar_acos_sum_f64,
simdeez_runtime: simdeez_acos_sum_f64,
simdeez_scalar: forced_scalar_acos_sum_f64,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse2: simdeez_acos_sum_f64_sse2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse41: simdeez_acos_sum_f64_sse41,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_avx2: simdeez_acos_sum_f64_avx2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_avx512: simdeez_acos_sum_f64_avx512,
},
);

shared::bench_variants_f64(
c,
"simd_math/f64/atan_u35",
&atan_inputs_f64,
BenchTargetsF64 {
scalar_native: scalar_atan_sum_f64,
simdeez_runtime: simdeez_atan_sum_f64,
simdeez_scalar: forced_scalar_atan_sum_f64,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse2: simdeez_atan_sum_f64_sse2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_sse41: simdeez_atan_sum_f64_sse41,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_avx2: simdeez_atan_sum_f64_avx2,
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
simdeez_avx512: simdeez_atan_sum_f64_avx512,
},
);
}
111 changes: 108 additions & 3 deletions benches/simd_math/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use criterion::{Criterion, Throughput};
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
use simdeez::prelude::*;
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
use simdeez::scalar::Scalar;
use std::hint::black_box;

Expand Down Expand Up @@ -48,11 +47,21 @@ pub fn make_inverse_trig_inputs(len: usize, seed: u64) -> Vec<f32> {
(0..len).map(|_| rng.gen_range(-1.0f32..1.0f32)).collect()
}

/// Builds `len` reproducible f64 samples drawn from the half-open range
/// `-1.0..1.0`, using a `ChaCha8Rng` seeded with `seed`.
pub fn make_inverse_trig_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
    let mut generator = ChaCha8Rng::seed_from_u64(seed);
    std::iter::repeat_with(|| generator.gen_range(-1.0f64..1.0f64))
        .take(len)
        .collect()
}

/// Builds `len` reproducible f32 samples drawn from the half-open range
/// `-64.0..64.0`, using a `ChaCha8Rng` seeded with `seed`.
pub fn make_atan_inputs(len: usize, seed: u64) -> Vec<f32> {
    let mut generator = ChaCha8Rng::seed_from_u64(seed);
    std::iter::repeat_with(|| generator.gen_range(-64.0f32..64.0f32))
        .take(len)
        .collect()
}

/// Builds `len` reproducible f64 samples drawn from the half-open range
/// `-64.0..64.0`, using a `ChaCha8Rng` seeded with `seed`.
pub fn make_atan_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
    let mut generator = ChaCha8Rng::seed_from_u64(seed);
    std::iter::repeat_with(|| generator.gen_range(-64.0f64..64.0f64))
        .take(len)
        .collect()
}

pub fn make_tan_inputs(len: usize, seed: u64) -> Vec<f32> {
let mut rng = ChaCha8Rng::seed_from_u64(seed);
let half_pi = core::f32::consts::FRAC_PI_2;
Expand Down Expand Up @@ -87,6 +96,20 @@ pub struct BenchTargets {
pub simdeez_avx512: unsafe fn(&[f32]) -> f32,
}

/// Set of f64 slice-reduction functions that `bench_variants_f64` times
/// against each other within a single Criterion group.
///
/// The x86-only fields are `unsafe fn` pointers; `bench_variants_f64` calls
/// each one only after the matching CPU feature check succeeds at runtime.
pub struct BenchTargetsF64 {
/// Benchmarked under the `scalar-native` label.
pub scalar_native: fn(&[f64]) -> f64,
/// Benchmarked under the `simdeez-runtime` label.
pub simdeez_runtime: fn(&[f64]) -> f64,
/// Benchmarked under the `simdeez-forced-scalar` label.
pub simdeez_scalar: fn(&[f64]) -> f64,
/// Benchmarked under `simdeez-forced-sse2` when `sse2` is detected.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub simdeez_sse2: unsafe fn(&[f64]) -> f64,
/// Benchmarked under `simdeez-forced-sse41` when `sse4.1` is detected.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub simdeez_sse41: unsafe fn(&[f64]) -> f64,
/// Benchmarked under `simdeez-forced-avx2` when both `avx2` and `fma` are detected.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub simdeez_avx2: unsafe fn(&[f64]) -> f64,
/// Benchmarked under `simdeez-forced-avx512` when `avx512f`, `avx512bw`
/// and `avx512dq` are all detected.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub simdeez_avx512: unsafe fn(&[f64]) -> f64,
}

pub fn bench_variants(c: &mut Criterion, group_name: &str, input: &[f32], targets: BenchTargets) {
let mut group = c.benchmark_group(group_name);
group.throughput(Throughput::Elements(input.len() as u64));
Expand Down Expand Up @@ -146,15 +169,83 @@ pub fn bench_variants(c: &mut Criterion, group_name: &str, input: &[f32], target
group.finish();
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
pub fn bench_variants_f64(
c: &mut Criterion,
group_name: &str,
input: &[f64],
targets: BenchTargetsF64,
) {
let mut group = c.benchmark_group(group_name);
group.throughput(Throughput::Elements(input.len() as u64));

group.bench_function("scalar-native", |b| {
b.iter(|| black_box((targets.scalar_native)(black_box(input))))
});

group.bench_function("simdeez-runtime", |b| {
b.iter(|| black_box((targets.simdeez_runtime)(black_box(input))))
});

group.bench_function("simdeez-forced-scalar", |b| {
b.iter(|| black_box((targets.simdeez_scalar)(black_box(input))))
});

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if std::is_x86_feature_detected!("sse2") {
group.bench_function("simdeez-forced-sse2", |b| {
b.iter(|| unsafe { black_box((targets.simdeez_sse2)(black_box(input))) })
});
} else {
eprintln!("[bench] skipped simdeez-forced-sse2 for {group_name}: CPU lacks sse2");
}

if std::is_x86_feature_detected!("sse4.1") {
group.bench_function("simdeez-forced-sse41", |b| {
b.iter(|| unsafe { black_box((targets.simdeez_sse41)(black_box(input))) })
});
} else {
eprintln!("[bench] skipped simdeez-forced-sse41 for {group_name}: CPU lacks sse4.1");
}

if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
group.bench_function("simdeez-forced-avx2", |b| {
b.iter(|| unsafe { black_box((targets.simdeez_avx2)(black_box(input))) })
});
} else {
eprintln!("[bench] skipped simdeez-forced-avx2 for {group_name}: CPU lacks avx2/fma");
}

if std::is_x86_feature_detected!("avx512f")
&& std::is_x86_feature_detected!("avx512bw")
&& std::is_x86_feature_detected!("avx512dq")
{
group.bench_function("simdeez-forced-avx512", |b| {
b.iter(|| unsafe { black_box((targets.simdeez_avx512)(black_box(input))) })
});
} else {
eprintln!(
"[bench] skipped simdeez-forced-avx512 for {group_name}: CPU lacks avx512f+bw+dq"
);
}
}

group.finish();
}

type ScalarVf32 = <Scalar as Simd>::Vf32;
type ScalarVf64 = <Scalar as Simd>::Vf64;

#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
/// Runs `simdeez_sum_impl` over `input` with the engine fixed to `Scalar`,
/// applying `op` to each `ScalarVf32` value.
// NOTE(review): `#[inline(never)]` presumably keeps this a distinct call in
// the benchmark binary — confirm intent.
#[inline(never)]
pub fn force_scalar_sum(input: &[f32], op: impl Fn(ScalarVf32) -> ScalarVf32) -> f32 {
simdeez_sum_impl::<Scalar>(input, op)
}

/// Runs `simdeez_sum_impl_f64` over `input` with the engine fixed to `Scalar`,
/// applying `op` to each `ScalarVf64` value.
// NOTE(review): `#[inline(never)]` presumably keeps this a distinct call in
// the benchmark binary — confirm intent.
#[inline(never)]
pub fn force_scalar_sum_f64(input: &[f64], op: impl Fn(ScalarVf64) -> ScalarVf64) -> f64 {
simdeez_sum_impl_f64::<Scalar>(input, op)
}

#[inline(always)]
pub fn simdeez_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32) -> f32 {
let mut sum = 0.0f32;
Expand All @@ -168,3 +259,17 @@ pub fn simdeez_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32)

sum
}

/// Applies `op` to each full-width `S::Vf64` vector loaded from `input` and
/// returns the running total of the per-vector horizontal sums.
///
/// Only whole vectors are processed: trailing elements that do not fill a
/// complete `S::Vf64::WIDTH` lane group are not included in the result.
#[inline(always)]
pub fn simdeez_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
    let width = S::Vf64::WIDTH;
    let full_vectors = input.len() / width;

    let mut total = 0.0f64;
    for vector_index in 0..full_vectors {
        // Hand over the remaining tail of the slice, as the loader expects.
        let lanes = S::Vf64::load_from_slice(&input[vector_index * width..]);
        total += op(lanes).horizontal_add();
    }

    total
}
8 changes: 5 additions & 3 deletions src/math/contracts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ pub const SIN_U35_F64_MAX_ULP: u64 = 1;
pub const COS_U35_F64_MAX_ULP: u64 = 1;
pub const TAN_U35_F64_MAX_ULP: u64 = 1;

pub const ASIN_U35_F64_MAX_ULP: u64 = 1;
pub const ACOS_U35_F64_MAX_ULP: u64 = 1;
pub const ATAN_U35_F64_MAX_ULP: u64 = 1;
// Portable f64 inverse-trig kernels now use the family-local SIMD implementation
// rather than scalar lane mapping, so their bound is the real u35 contract
// (35 ULP) instead of the 1 ULP previously declared for the scalar-mapped path.
pub const ASIN_U35_F64_MAX_ULP: u64 = 35;
pub const ACOS_U35_F64_MAX_ULP: u64 = 35;
pub const ATAN_U35_F64_MAX_ULP: u64 = 35;
pub const ATAN2_U35_F64_MAX_ULP: u64 = 1;
pub const SINH_U35_F64_MAX_ULP: u64 = 1;
pub const COSH_U35_F64_MAX_ULP: u64 = 1;
Expand Down
13 changes: 8 additions & 5 deletions src/math/f64/inverse_trig.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,29 @@
use crate::math::{map, scalar};
use crate::SimdFloat64;
use crate::math::families::inverse_trig::portable_f64;
use crate::{Simd, SimdFloat64};

#[inline(always)]
pub(crate) fn asin_u35<V>(input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::asin_u35_f64)
portable_f64::asin_u35(input)
}

#[inline(always)]
pub(crate) fn acos_u35<V>(input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::acos_u35_f64)
portable_f64::acos_u35(input)
}

#[inline(always)]
pub(crate) fn atan_u35<V>(input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::atan_u35_f64)
portable_f64::atan_u35(input)
}
Loading
Loading