Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions benches/simd_math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ use std::time::Duration;

#[path = "simd_math/hyperbolic.rs"]
mod hyperbolic;
#[path = "simd_math/inverse_trig.rs"]
mod inverse_trig;
#[path = "simd_math/log_exp.rs"]
mod log_exp;
#[path = "simd_math/shared.rs"]
Expand All @@ -13,6 +15,7 @@ mod trig;
/// Criterion entry point for the SIMD math benches.
///
/// Hands the shared `Criterion` handle to each benchmark family's `register`
/// function, one after another, in a fixed order.
fn criterion_benchmark(c: &mut Criterion) {
    // Order is significant only in that it fixes the order of registration.
    let registrars: [fn(&mut Criterion); 4] = [
        log_exp::register,
        hyperbolic::register,
        inverse_trig::register,
        trig::register,
    ];

    for register_family in registrars {
        register_family(c);
    }
}

Expand Down
120 changes: 120 additions & 0 deletions benches/simd_math/inverse_trig.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
use criterion::Criterion;
use simdeez::math::SimdMathF32InverseTrig;
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
use simdeez::scalar::Scalar;
use simdeez::{prelude::*, simd_unsafe_generate_all};

use crate::shared::{self, BenchTargets, INPUT_LEN};

/// Baseline for the asin benchmark: applies the standard library's
/// `f32::asin` to every element of `input` and returns the sum.
///
/// Kept out of line so the call is not inlined into the measuring code.
#[inline(never)]
fn scalar_asin_sum(input: &[f32]) -> f32 {
    input.iter().map(|&x| x.asin()).sum()
}

/// Baseline for the acos benchmark: applies the standard library's
/// `f32::acos` to every element of `input` and returns the sum.
///
/// Kept out of line so the call is not inlined into the measuring code.
#[inline(never)]
fn scalar_acos_sum(input: &[f32]) -> f32 {
    input.iter().map(|&x| x.acos()).sum()
}

/// Baseline for the atan benchmark: applies the standard library's
/// `f32::atan` to every element of `input` and returns the sum.
///
/// Kept out of line so the call is not inlined into the measuring code.
#[inline(never)]
fn scalar_atan_sum(input: &[f32]) -> f32 {
    input.iter().map(|&x| x.atan()).sum()
}

// simdeez asin benchmark kernel: forwards `input` to
// `shared::simdeez_sum_impl`, applying `asin_u35` to each vector it is handed.
// NOTE(review): `S` is not declared here; it is presumably the SIMD engine type
// supplied by `simd_unsafe_generate_all!`, which also appears to emit the
// per-instruction-set variants (`_sse2`, `_sse41`, `_avx2`, `_avx512`, ...)
// that `register` refers to — confirm against the simdeez macro docs.
simd_unsafe_generate_all!(
fn simdeez_asin_sum(input: &[f32]) -> f32 {
shared::simdeez_sum_impl::<S>(input, |v| v.asin_u35())
}
);

// simdeez acos benchmark kernel: forwards `input` to
// `shared::simdeez_sum_impl`, applying `acos_u35` to each vector it is handed.
// NOTE(review): `S` is not declared here; it is presumably the SIMD engine type
// supplied by `simd_unsafe_generate_all!`, which also appears to emit the
// per-instruction-set variants (`_sse2`, `_sse41`, `_avx2`, `_avx512`, ...)
// that `register` refers to — confirm against the simdeez macro docs.
simd_unsafe_generate_all!(
fn simdeez_acos_sum(input: &[f32]) -> f32 {
shared::simdeez_sum_impl::<S>(input, |v| v.acos_u35())
}
);

// simdeez atan benchmark kernel: forwards `input` to
// `shared::simdeez_sum_impl`, applying `atan_u35` to each vector it is handed.
// NOTE(review): `S` is not declared here; it is presumably the SIMD engine type
// supplied by `simd_unsafe_generate_all!`, which also appears to emit the
// per-instruction-set variants (`_sse2`, `_sse41`, `_avx2`, `_avx512`, ...)
// that `register` refers to — confirm against the simdeez macro docs.
simd_unsafe_generate_all!(
fn simdeez_atan_sum(input: &[f32]) -> f32 {
shared::simdeez_sum_impl::<S>(input, |v| v.atan_u35())
}
);

/// Scalar-engine asin benchmark target for targets other than x86/x86_64.
///
/// Delegates to `shared::force_scalar_sum`, handing it a closure that calls
/// `asin_u35` on the `Scalar` engine's `Vf32` type.
// NOTE(review): on x86/x86_64 the same symbol name is presumably provided by
// `simd_unsafe_generate_all!`, hence the cfg gate — confirm.
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
#[inline(never)]
fn simdeez_asin_sum_scalar(input: &[f32]) -> f32 {
    // The explicit parameter type pins the closure to the scalar backend.
    let asin_lane = |lane: <Scalar as Simd>::Vf32| lane.asin_u35();
    shared::force_scalar_sum(input, asin_lane)
}

/// Scalar-engine acos benchmark target for targets other than x86/x86_64.
///
/// Delegates to `shared::force_scalar_sum`, handing it a closure that calls
/// `acos_u35` on the `Scalar` engine's `Vf32` type.
// NOTE(review): on x86/x86_64 the same symbol name is presumably provided by
// `simd_unsafe_generate_all!`, hence the cfg gate — confirm.
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
#[inline(never)]
fn simdeez_acos_sum_scalar(input: &[f32]) -> f32 {
    // The explicit parameter type pins the closure to the scalar backend.
    let acos_lane = |lane: <Scalar as Simd>::Vf32| lane.acos_u35();
    shared::force_scalar_sum(input, acos_lane)
}

/// Scalar-engine atan benchmark target for targets other than x86/x86_64.
///
/// Delegates to `shared::force_scalar_sum`, handing it a closure that calls
/// `atan_u35` on the `Scalar` engine's `Vf32` type.
// NOTE(review): on x86/x86_64 the same symbol name is presumably provided by
// `simd_unsafe_generate_all!`, hence the cfg gate — confirm.
#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
#[inline(never)]
fn simdeez_atan_sum_scalar(input: &[f32]) -> f32 {
    // The explicit parameter type pins the closure to the scalar backend.
    let atan_lane = |lane: <Scalar as Simd>::Vf32| lane.atan_u35();
    shared::force_scalar_sum(input, atan_lane)
}

/// Registers the f32 inverse-trig benchmark groups (`asin_u35`, `acos_u35`,
/// `atan_u35`) with Criterion.
///
/// Each group is handed to `shared::bench_variants` together with its input
/// buffer and a `BenchTargets` table naming the std baseline, the simdeez
/// runtime-dispatched kernel, the scalar-engine kernel and, on x86/x86_64
/// only, the SSE2 / SSE4.1 / AVX2 / AVX-512 kernels.
pub fn register(c: &mut Criterion) {
    // asin and acos share one input buffer; atan uses its own generator and seed.
    let asin_acos_samples = shared::make_inverse_trig_inputs(INPUT_LEN, 0xA11C_E101);
    let atan_samples = shared::make_atan_inputs(INPUT_LEN, 0xA11C_E102);

    let asin_targets = BenchTargets {
        scalar_native: scalar_asin_sum,
        simdeez_runtime: simdeez_asin_sum,
        simdeez_scalar: simdeez_asin_sum_scalar,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_sse2: simdeez_asin_sum_sse2,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_sse41: simdeez_asin_sum_sse41,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_avx2: simdeez_asin_sum_avx2,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_avx512: simdeez_asin_sum_avx512,
    };
    shared::bench_variants(
        c,
        "simd_math/f32/asin_u35",
        &asin_acos_samples,
        asin_targets,
    );

    let acos_targets = BenchTargets {
        scalar_native: scalar_acos_sum,
        simdeez_runtime: simdeez_acos_sum,
        simdeez_scalar: simdeez_acos_sum_scalar,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_sse2: simdeez_acos_sum_sse2,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_sse41: simdeez_acos_sum_sse41,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_avx2: simdeez_acos_sum_avx2,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_avx512: simdeez_acos_sum_avx512,
    };
    shared::bench_variants(
        c,
        "simd_math/f32/acos_u35",
        &asin_acos_samples,
        acos_targets,
    );

    let atan_targets = BenchTargets {
        scalar_native: scalar_atan_sum,
        simdeez_runtime: simdeez_atan_sum,
        simdeez_scalar: simdeez_atan_sum_scalar,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_sse2: simdeez_atan_sum_sse2,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_sse41: simdeez_atan_sum_sse41,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_avx2: simdeez_atan_sum_avx2,
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        simdeez_avx512: simdeez_atan_sum_avx512,
    };
    shared::bench_variants(c, "simd_math/f32/atan_u35", &atan_samples, atan_targets);
}
10 changes: 10 additions & 0 deletions benches/simd_math/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ pub fn make_trig_inputs(len: usize, seed: u64) -> Vec<f32> {
.collect()
}

/// Builds the input buffer shared by the asin and acos benchmarks.
///
/// Returns `len` values, each drawn with `gen_range` from the half-open range
/// `-1.0..1.0`, using a `ChaCha8Rng` seeded from `seed`.
pub fn make_inverse_trig_inputs(len: usize, seed: u64) -> Vec<f32> {
    let mut rng = ChaCha8Rng::seed_from_u64(seed);
    // One RNG draw per produced element, in order.
    std::iter::repeat_with(|| rng.gen_range(-1.0f32..1.0f32))
        .take(len)
        .collect()
}

/// Builds the input buffer for the atan benchmark.
///
/// Returns `len` values, each drawn with `gen_range` from the half-open range
/// `-64.0..64.0`, using a `ChaCha8Rng` seeded from `seed`.
pub fn make_atan_inputs(len: usize, seed: u64) -> Vec<f32> {
    let mut rng = ChaCha8Rng::seed_from_u64(seed);
    // One RNG draw per produced element, in order.
    std::iter::repeat_with(|| rng.gen_range(-64.0f32..64.0f32))
        .take(len)
        .collect()
}

pub fn make_tan_inputs(len: usize, seed: u64) -> Vec<f32> {
let mut rng = ChaCha8Rng::seed_from_u64(seed);
let half_pi = core::f32::consts::FRAC_PI_2;
Expand Down
9 changes: 5 additions & 4 deletions src/math/contracts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ pub const SIN_U35_F32_MAX_ULP: u32 = 35;
pub const COS_U35_F32_MAX_ULP: u32 = 35;
pub const TAN_U35_F32_MAX_ULP: u32 = 35;

// Portable inverse-trig kernels target the SLEEF-style u35 contract on f32.
pub const ASIN_U35_F32_MAX_ULP: u32 = 35;
pub const ACOS_U35_F32_MAX_ULP: u32 = 35;
pub const ATAN_U35_F32_MAX_ULP: u32 = 35;
pub const ATAN2_U35_F32_MAX_ULP: u32 = 1;
// Portable f32 hyperbolic kernels are now honest u35 implementations:
// bounded SIMD fast paths plus scalar-lane patching for exceptional inputs.
pub const ASIN_U35_F32_MAX_ULP: u32 = 1;
pub const ACOS_U35_F32_MAX_ULP: u32 = 1;
pub const ATAN_U35_F32_MAX_ULP: u32 = 1;
pub const ATAN2_U35_F32_MAX_ULP: u32 = 1;
pub const SINH_U35_F32_MAX_ULP: u32 = 35;
pub const COSH_U35_F32_MAX_ULP: u32 = 35;
pub const TANH_U35_F32_MAX_ULP: u32 = 35;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,31 @@
mod portable_f32;

use crate::math::{map, scalar};
use crate::{SimdFloat32, SimdFloat64};
use crate::{Simd, SimdFloat32, SimdFloat64};

pub trait SimdMathF32InverseTrig: SimdFloat32 {
#[inline(always)]
fn asin_u35(self) -> Self {
map::unary_f32(self, scalar::asin_u35_f32)
fn asin_u35(self) -> Self
where
Self::Engine: Simd<Vf32 = Self>,
{
portable_f32::asin_u35(self)
}

#[inline(always)]
fn acos_u35(self) -> Self {
map::unary_f32(self, scalar::acos_u35_f32)
fn acos_u35(self) -> Self
where
Self::Engine: Simd<Vf32 = Self>,
{
portable_f32::acos_u35(self)
}

#[inline(always)]
fn atan_u35(self) -> Self {
map::unary_f32(self, scalar::atan_u35_f32)
fn atan_u35(self) -> Self
where
Self::Engine: Simd<Vf32 = Self>,
{
portable_f32::atan_u35(self)
}
}

Expand Down
Loading
Loading