Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion benches/simd_math_remaining_baseline/inverse_hyperbolic.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::Criterion;
use simdeez::math::SimdMathF32InverseHyperbolic;
use simdeez::math::{SimdMathF32InverseHyperbolic, SimdMathF64InverseHyperbolic};
use simdeez::{prelude::*, simd_unsafe_generate_all};

use crate::shared::{self, INPUT_LEN};
Expand All @@ -19,6 +19,21 @@ fn scalar_atanh_sum(input: &[f32]) -> f32 {
input.iter().copied().map(f32::atanh).sum()
}

/// Scalar reference kernel: sums `f64::asinh(x)` over every element of `input`.
///
/// The function is never inlined (`#[inline(never)]`), presumably so the
/// benchmark measures it as a real out-of-line call.
#[inline(never)]
fn scalar_asinh_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|&x| x.asinh()).sum()
}

/// Scalar reference kernel: sums `f64::acosh(x)` over every element of `input`.
///
/// The function is never inlined (`#[inline(never)]`), presumably so the
/// benchmark measures it as a real out-of-line call.
#[inline(never)]
fn scalar_acosh_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|&x| x.acosh()).sum()
}

/// Scalar reference kernel: sums `f64::atanh(x)` over every element of `input`.
///
/// The function is never inlined (`#[inline(never)]`), presumably so the
/// benchmark measures it as a real out-of-line call.
#[inline(never)]
fn scalar_atanh_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|&x| x.atanh()).sum()
}

simd_unsafe_generate_all!(
fn simdeez_asinh_sum(input: &[f32]) -> f32 {
shared::simdeez_unary_sum_impl::<S>(input, |v| v.asinh_u35())
Expand All @@ -37,11 +52,35 @@ simd_unsafe_generate_all!(
}
);

// f64 asinh benchmark kernel. The body applies `asinh_u35` to each vector
// handed out by `shared::simdeez_unary_sum_impl_f64` and returns the summed
// result. `S` is not declared here; it is presumably supplied by the
// `simd_unsafe_generate_all!` expansion.
simd_unsafe_generate_all!(
    fn simdeez_asinh_sum_f64(input: &[f64]) -> f64 {
        shared::simdeez_unary_sum_impl_f64::<S>(input, |lanes| lanes.asinh_u35())
    }
);

// f64 acosh benchmark kernel. The body applies `acosh_u35` to each vector
// handed out by `shared::simdeez_unary_sum_impl_f64` and returns the summed
// result. `S` is not declared here; it is presumably supplied by the
// `simd_unsafe_generate_all!` expansion.
simd_unsafe_generate_all!(
    fn simdeez_acosh_sum_f64(input: &[f64]) -> f64 {
        shared::simdeez_unary_sum_impl_f64::<S>(input, |lanes| lanes.acosh_u35())
    }
);

// f64 atanh benchmark kernel. The body applies `atanh_u35` to each vector
// handed out by `shared::simdeez_unary_sum_impl_f64` and returns the summed
// result. `S` is not declared here; it is presumably supplied by the
// `simd_unsafe_generate_all!` expansion.
simd_unsafe_generate_all!(
    fn simdeez_atanh_sum_f64(input: &[f64]) -> f64 {
        shared::simdeez_unary_sum_impl_f64::<S>(input, |lanes| lanes.atanh_u35())
    }
);

pub fn register(c: &mut Criterion) {
let asinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB001, -16_384.0..16_384.0);
let acosh_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB002, 1.0, 16_384.0);
let atanh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB003, -0.999_999..0.999_999);

let asinh_inputs_f64 =
shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB101, -16_384.0..16_384.0);
let acosh_inputs_f64 = shared::make_positive_inputs_f64(INPUT_LEN, 0xDEADB102, 1.0, 16_384.0);
let atanh_inputs_f64 =
shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB103, -0.999_999_999_999..0.999_999_999_999);

shared::bench_unary(
c,
"simd_math_baseline/f32/asinh_u35",
Expand All @@ -65,4 +104,28 @@ pub fn register(c: &mut Criterion) {
scalar_atanh_sum,
simdeez_atanh_sum,
);

shared::bench_unary_f64(
c,
"simd_math_baseline/f64/asinh_u35",
&asinh_inputs_f64,
scalar_asinh_sum_f64,
simdeez_asinh_sum_f64,
);

shared::bench_unary_f64(
c,
"simd_math_baseline/f64/acosh_u35",
&acosh_inputs_f64,
scalar_acosh_sum_f64,
simdeez_acosh_sum_f64,
);

shared::bench_unary_f64(
c,
"simd_math_baseline/f64/atanh_u35",
&atanh_inputs_f64,
scalar_atanh_sum_f64,
simdeez_atanh_sum_f64,
);
}
40 changes: 40 additions & 0 deletions benches/simd_math_remaining_baseline/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ pub fn make_positive_inputs(len: usize, seed: u64, min: f32, max: f32) -> Vec<f3
(0..len).map(|_| rng.gen_range(min..max)).collect()
}

/// Builds `len` pseudo-random `f64` samples for a unary benchmark.
///
/// A `ChaCha8Rng` is seeded from `seed`, and each sample is drawn with
/// `gen_range` over a fresh clone of `range`.
pub fn make_unary_inputs_f64(len: usize, seed: u64, range: core::ops::Range<f64>) -> Vec<f64> {
    let mut generator = ChaCha8Rng::seed_from_u64(seed);
    let mut samples = Vec::with_capacity(len);
    for _ in 0..len {
        samples.push(generator.gen_range(range.clone()));
    }
    samples
}

/// Builds `len` pseudo-random `f64` samples drawn with `gen_range(min..max)`.
///
/// A `ChaCha8Rng` is seeded from `seed`. Despite the name, nothing here checks
/// that `min` is positive; the caller chooses the bounds.
pub fn make_positive_inputs_f64(len: usize, seed: u64, min: f64, max: f64) -> Vec<f64> {
    let mut generator = ChaCha8Rng::seed_from_u64(seed);
    let mut samples = Vec::with_capacity(len);
    for _ in 0..len {
        samples.push(generator.gen_range(min..max));
    }
    samples
}

pub fn make_binary_inputs(
len: usize,
seed: u64,
Expand All @@ -40,6 +50,18 @@ pub fn simdeez_unary_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S:
sum
}

/// Applies `op` to `input` one full `S::Vf64` vector at a time and returns the
/// running total of each result's `horizontal_add()`.
///
/// Only whole vectors are processed: when `input.len()` is not a multiple of
/// `S::Vf64::WIDTH`, the trailing elements are ignored. Vectors are accumulated
/// in slice order starting from `0.0`.
#[inline(always)]
pub fn simdeez_unary_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
    input
        .chunks_exact(S::Vf64::WIDTH)
        .fold(0.0f64, |total, chunk| {
            let lanes = S::Vf64::load_from_slice(chunk);
            total + op(lanes).horizontal_add()
        })
}

#[inline(always)]
pub fn simdeez_binary_sum_impl<S: Simd>(
a: &[f32],
Expand Down Expand Up @@ -75,6 +97,24 @@ pub fn bench_unary(
group.finish();
}

/// Registers a Criterion benchmark group called `name` that runs two kernels
/// over the same `input`.
///
/// The group's throughput is reported in elements (`input.len()`). `scalar` is
/// registered first under the id `"scalar-native"`, then `simd` under
/// `"simdeez-runtime"`. Both the input and the result pass through `black_box`.
pub fn bench_unary_f64(
    c: &mut Criterion,
    name: &str,
    input: &[f64],
    scalar: fn(&[f64]) -> f64,
    simd: fn(&[f64]) -> f64,
) {
    let element_count = input.len() as u64;
    let mut bench_group = c.benchmark_group(name);
    bench_group.throughput(Throughput::Elements(element_count));

    // Registration order matches the listing order: scalar first, then SIMD.
    let kernels: [(&str, fn(&[f64]) -> f64); 2] =
        [("scalar-native", scalar), ("simdeez-runtime", simd)];
    for (label, kernel) in kernels {
        bench_group.bench_function(label, |bencher| {
            bencher.iter(|| black_box(kernel(black_box(input))))
        });
    }

    bench_group.finish();
}

pub fn bench_binary(
c: &mut Criterion,
name: &str,
Expand Down
93 changes: 88 additions & 5 deletions src/math/f64/inverse_hyperbolic.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,109 @@
use crate::math::{map, scalar};
use crate::SimdFloat64;
use crate::math::{f64, scalar};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};

type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

/// Returns `true` when at least one of the first `V::WIDTH` lanes of `mask`
/// holds a non-zero value, and `false` otherwise.
#[inline(always)]
fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
where
    V: SimdFloat64,
{
    // NOTE(review): `as_array` is called inside `unsafe` as in the original;
    // its safety contract is not visible from this file.
    let lane_values = unsafe { mask.as_array() };

    (0..V::WIDTH).any(|lane| lane_values[lane] != 0)
}

/// Replaces the lanes of `output` flagged in `exceptional_mask` with
/// `scalar_fallback` applied to the matching lane of `input`.
///
/// When no lane is flagged, `output` is returned unchanged and no scalar work
/// is done. Otherwise the vectors are spilled to arrays, every lane whose mask
/// value is non-zero is recomputed by the scalar routine, and the patched array
/// is reloaded as a vector.
#[inline(always)]
fn patch_exceptional_lanes<V>(
    input: V,
    output: V,
    exceptional_mask: SimdI64<V>,
    scalar_fallback: fn(f64) -> f64,
) -> V
where
    V: SimdFloat64,
{
    if any_lane_nonzero::<V>(exceptional_mask) {
        // NOTE(review): the safety contracts of `as_array` and
        // `load_from_ptr_unaligned` are not visible here. The pointer cast
        // assumes the array representation is laid out as contiguous `f64`s.
        unsafe {
            let source_lanes = input.as_array();
            let flag_lanes = exceptional_mask.as_array();
            let mut patched_lanes = output.as_array();

            for lane in (0..V::WIDTH).filter(|&lane| flag_lanes[lane] != 0) {
                patched_lanes[lane] = scalar_fallback(source_lanes[lane]);
            }

            V::load_from_ptr_unaligned(&patched_lanes as *const V::ArrayRepresentation as *const f64)
        }
    } else {
        output
    }
}

/// Vectorised `asinh` for `f64` lanes.
///
/// The fast path evaluates `ln(|x| + sqrt(x^2 + 1))` and gives the result the
/// sign of `x`. Lanes flagged as exceptional are recomputed by
/// `scalar::asinh_u35_f64` through `patch_exceptional_lanes`. A lane is
/// exceptional when it is NaN, when `|x| < 1`, when `|x| > 1e150`, or when it
/// is zero.
#[inline(always)]
pub(crate) fn asinh_u35<V>(input: V) -> V
where
    V: SimdFloat64,
{
    // `x == x` fails only for NaN, so despite its name this mask still accepts
    // infinities; those are caught by `large_mask` below.
    let finite_mask = input.cmp_eq(input).bitcast_i64();
    let abs_x = input.abs();
    let tiny_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64();
    // Presumably keeps `abs_x * abs_x` far away from f64 overflow.
    let large_mask = abs_x.cmp_gt(V::set1(1.0e150)).bitcast_i64();
    // Zero lanes already satisfy `|x| < 1`; the mask is kept as written.
    let zero_mask = input.cmp_eq(V::zeroes()).bitcast_i64();
    let exceptional_mask =
        finite_mask.cmp_eq(SimdI64::<V>::zeroes()) | tiny_mask | large_mask | zero_mask;

    let radicand = (abs_x * abs_x) + V::set1(1.0);
    let magnitude = f64::ln_u35(abs_x + radicand.sqrt());
    // asinh is odd: compute on |x|, then negate the lanes where x < 0.
    let negative_mask = input.cmp_lt(V::zeroes());
    let fast = negative_mask.blendv(magnitude, -magnitude);

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asinh_u35_f64)
}

/// Vectorised `acosh` for `f64` lanes.
///
/// The fast path evaluates `ln(x + sqrt(x - 1) * sqrt(x + 1))` for lanes that
/// are not NaN and satisfy `x >= 1`. Every other lane is recomputed by
/// `scalar::acosh_u35_f64` through `patch_exceptional_lanes`.
#[inline(always)]
pub(crate) fn acosh_u35<V>(input: V) -> V
where
    V: SimdFloat64,
{
    // `x == x` fails only for NaN; +infinity passes this mask and the domain
    // check, so it stays on the fast path.
    // NOTE(review): confirm `ln_u35` handles an infinite argument as intended.
    let finite_mask = input.cmp_eq(input).bitcast_i64();
    let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64();
    let fast_mask = finite_mask & in_domain_mask;
    // Lanes whose fast mask is all zero are the ones needing the scalar routine.
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    // The product of two roots is used where `sqrt(x^2 - 1)` would square `x`.
    let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt());
    let fast = f64::ln_u35(input + root_term);

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64)
}

/// Vectorised `atanh` for `f64` lanes.
///
/// The fast path evaluates `0.5 * ln((1 + x) / (1 - x))`. A lane stays on the
/// fast path only when every mask below holds; all other lanes are recomputed
/// by `scalar::atanh_u35_f64` through `patch_exceptional_lanes`.
///
/// NOTE(review): the masks combine to `0.9 <= |x| <= 0.99`, so inputs outside
/// that band always take the scalar fallback. Confirm this narrow range is
/// intended.
#[inline(always)]
pub(crate) fn atanh_u35<V>(input: V) -> V
where
    V: SimdFloat64,
{
    // `x == x` fails only for NaN.
    let finite_mask = input.cmp_eq(input).bitcast_i64();
    let abs_x = input.abs();
    let strict_domain_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64();
    let non_zero_mask = input.cmp_neq(V::zeroes()).bitcast_i64();
    let stable_range_mask = abs_x.cmp_lte(V::set1(0.99)).bitcast_i64();
    let away_from_zero_mask = abs_x.cmp_gte(V::set1(0.9)).bitcast_i64();
    let fast_mask =
        finite_mask & strict_domain_mask & non_zero_mask & stable_range_mask & away_from_zero_mask;
    // Lanes whose fast mask is all zero are the ones needing the scalar routine.
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    let one = V::set1(1.0);
    let ratio = (one + input) / (one - input);
    let fast = f64::ln_u35(ratio) * V::set1(0.5);

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atanh_u35_f64)
}
Loading
Loading