diff --git a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs index c8d3734..2ea9d50 100644 --- a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs +++ b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs @@ -1,5 +1,5 @@ use criterion::Criterion; -use simdeez::math::SimdMathF32InverseHyperbolic; +use simdeez::math::{SimdMathF32InverseHyperbolic, SimdMathF64InverseHyperbolic}; use simdeez::{prelude::*, simd_unsafe_generate_all}; use crate::shared::{self, INPUT_LEN}; @@ -19,6 +19,21 @@ fn scalar_atanh_sum(input: &[f32]) -> f32 { input.iter().copied().map(f32::atanh).sum() } +#[inline(never)] +fn scalar_asinh_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::asinh).sum() +} + +#[inline(never)] +fn scalar_acosh_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::acosh).sum() +} + +#[inline(never)] +fn scalar_atanh_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::atanh).sum() +} + simd_unsafe_generate_all!( fn simdeez_asinh_sum(input: &[f32]) -> f32 { shared::simdeez_unary_sum_impl::(input, |v| v.asinh_u35()) @@ -37,11 +52,35 @@ simd_unsafe_generate_all!( } ); +simd_unsafe_generate_all!( + fn simdeez_asinh_sum_f64(input: &[f64]) -> f64 { + shared::simdeez_unary_sum_impl_f64::(input, |v| v.asinh_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_acosh_sum_f64(input: &[f64]) -> f64 { + shared::simdeez_unary_sum_impl_f64::(input, |v| v.acosh_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_atanh_sum_f64(input: &[f64]) -> f64 { + shared::simdeez_unary_sum_impl_f64::(input, |v| v.atanh_u35()) + } +); + pub fn register(c: &mut Criterion) { let asinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB001, -16_384.0..16_384.0); let acosh_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB002, 1.0, 16_384.0); let atanh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB003, -0.999_999..0.999_999); + let asinh_inputs_f64 = + shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB101, -16_384.0..16_384.0); + let acosh_inputs_f64 = shared::make_positive_inputs_f64(INPUT_LEN, 0xDEADB102, 1.0, 16_384.0); + let atanh_inputs_f64 = + shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB103, -0.999_999_999_999..0.999_999_999_999); + shared::bench_unary( c, "simd_math_baseline/f32/asinh_u35", @@ -65,4 +104,28 @@ pub fn register(c: &mut Criterion) { scalar_atanh_sum, simdeez_atanh_sum, ); + + shared::bench_unary_f64( + c, + "simd_math_baseline/f64/asinh_u35", + &asinh_inputs_f64, + scalar_asinh_sum_f64, + simdeez_asinh_sum_f64, + ); + + shared::bench_unary_f64( + c, + "simd_math_baseline/f64/acosh_u35", + &acosh_inputs_f64, + scalar_acosh_sum_f64, + simdeez_acosh_sum_f64, + ); + + shared::bench_unary_f64( + c, + "simd_math_baseline/f64/atanh_u35", + &atanh_inputs_f64, + scalar_atanh_sum_f64, + simdeez_atanh_sum_f64, + ); } diff --git a/benches/simd_math_remaining_baseline/shared.rs b/benches/simd_math_remaining_baseline/shared.rs index de3e87e..14ef80a 100644 --- a/benches/simd_math_remaining_baseline/shared.rs +++ b/benches/simd_math_remaining_baseline/shared.rs @@ -16,6 +16,16 @@ pub fn make_positive_inputs(len: usize, seed: u64, min: f32, max: f32) -> Vec) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.gen_range(range.clone())).collect() +} + +pub fn make_positive_inputs_f64(len: usize, seed: u64, min: f64, max: f64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.gen_range(min..max)).collect() +} + pub fn make_binary_inputs( len: usize, seed: u64, @@ -40,6 +50,18 @@ pub fn simdeez_unary_sum_impl(input: &[f32], op: impl Fn(S::Vf32) -> S: sum } +#[inline(always)] +pub fn simdeez_unary_sum_impl_f64(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 { + let mut sum = 0.0f64; + let mut i = 0; + while i + S::Vf64::WIDTH <= input.len() { + let v = S::Vf64::load_from_slice(&input[i..]); + sum += op(v).horizontal_add(); + i += S::Vf64::WIDTH; + } + sum +} + #[inline(always)] pub fn simdeez_binary_sum_impl( a: &[f32], @@ -75,6 +97,24 @@ pub fn bench_unary( group.finish(); } +pub fn bench_unary_f64( + c: &mut Criterion, + name: &str, + input: &[f64], + scalar: fn(&[f64]) -> f64, + simd: fn(&[f64]) -> f64, +) { + let mut group = c.benchmark_group(name); + group.throughput(Throughput::Elements(input.len() as u64)); + group.bench_function("scalar-native", |b| { + b.iter(|| black_box(scalar(black_box(input)))) + }); + group.bench_function("simdeez-runtime", |b| { + b.iter(|| black_box(simd(black_box(input)))) + }); + group.finish(); +} + pub fn bench_binary( c: &mut Criterion, name: &str, diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs index 503988c..1ddde48 100644 --- a/src/math/f64/inverse_hyperbolic.rs +++ b/src/math/f64/inverse_hyperbolic.rs @@ -1,12 +1,73 @@ -use crate::math::{map, scalar}; -use crate::SimdFloat64; +use crate::math::{f64, scalar}; +use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; + +type SimdI64 = <::Engine as Simd>::Vi64; + +#[inline(always)] +fn any_lane_nonzero(mask: SimdI64) -> bool +where + V: SimdFloat64, +{ + unsafe { + let lanes = mask.as_array(); + for lane in 0..V::WIDTH { + if lanes[lane] != 0 { + return true; + } + } + } + + false +} + +#[inline(always)] +fn patch_exceptional_lanes( + input: V, + output: V, + exceptional_mask: SimdI64, + scalar_fallback: fn(f64) -> f64, +) -> V +where + V: SimdFloat64, +{ + if !any_lane_nonzero::(exceptional_mask) { + return output; + } + + unsafe { + let input_lanes = input.as_array(); + let mask_lanes = exceptional_mask.as_array(); + let mut output_lanes = output.as_array(); + + for lane in 0..V::WIDTH { + if mask_lanes[lane] != 0 { + output_lanes[lane] = scalar_fallback(input_lanes[lane]); + } + } + + V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) + } +} #[inline(always)] pub(crate) fn asinh_u35(input: V) -> V where V: SimdFloat64, { - map::unary_f64(input, scalar::asinh_u35_f64) + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let abs_x = input.abs(); + let tiny_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64(); + let large_mask = abs_x.cmp_gt(V::set1(1.0e150)).bitcast_i64(); + let zero_mask = input.cmp_eq(V::zeroes()).bitcast_i64(); + let exceptional_mask = + finite_mask.cmp_eq(SimdI64::::zeroes()) | tiny_mask | large_mask | zero_mask; + + let radicand = (abs_x * abs_x) + V::set1(1.0); + let magnitude = f64::ln_u35(abs_x + radicand.sqrt()); + let negative_mask = input.cmp_lt(V::zeroes()); + let fast = negative_mask.blendv(magnitude, -magnitude); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asinh_u35_f64) } #[inline(always)] @@ -14,7 +75,15 @@ pub(crate) fn acosh_u35(input: V) -> V where V: SimdFloat64, { - map::unary_f64(input, scalar::acosh_u35_f64) + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64(); + let fast_mask = finite_mask & in_domain_mask; + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + + let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt()); + let fast = f64::ln_u35(input + root_term); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64) } #[inline(always)] @@ -22,5 +91,19 @@ pub(crate) fn atanh_u35(input: V) -> V where V: SimdFloat64, { - map::unary_f64(input, scalar::atanh_u35_f64) + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let abs_x = input.abs(); + let strict_domain_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64(); + let non_zero_mask = input.cmp_neq(V::zeroes()).bitcast_i64(); + let stable_range_mask = abs_x.cmp_lte(V::set1(0.99)).bitcast_i64(); + let away_from_zero_mask = abs_x.cmp_gte(V::set1(0.9)).bitcast_i64(); + let fast_mask = + finite_mask & strict_domain_mask & non_zero_mask & stable_range_mask & away_from_zero_mask; + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + + let one = V::set1(1.0); + let ratio = (one + input) / (one - input); + let fast = f64::ln_u35(ratio) * V::set1(0.5); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atanh_u35_f64) } diff --git a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs index b5390c4..c07c793 100644 --- a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs +++ b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs @@ -1,5 +1,5 @@ use super::*; -use crate::math::SimdMathF32InverseHyperbolic; +use crate::math::{SimdMathF32InverseHyperbolic, SimdMathF64InverseHyperbolic}; fn run_f32_inverse_hyperbolic_domain_edges() { let acosh_inputs = [ @@ -135,3 +135,138 @@ simd_math_targeted_all_backends!( f32_inverse_hyperbolic_mixed_lanes, run_f32_inverse_hyperbolic_mixed_lanes ); + +fn run_f64_inverse_hyperbolic_domain_edges() { + let acosh_inputs = [ + 0.0f64, + 0.5, + 1.0 - f64::EPSILON, + 1.0, + 1.0 + f64::EPSILON, + 1.000_000_000_001, + 2.0, + 10.0, + f64::INFINITY, + f64::NAN, + ]; + check_targeted_unary_f64::( + "acosh_u35", + &acosh_inputs, + contracts::ACOSH_U35_F64_MAX_ULP, + |v| v.acosh_u35(), + f64::acosh, + ); + + let atanh_inputs = [ + -1.0f64, + -1.0 + f64::EPSILON, + -0.999_999_999_999, + -0.5, + -0.0, + 0.0, + 0.5, + 0.999_999_999_999, + 1.0 - f64::EPSILON, + 1.0, + 1.0 + f64::EPSILON, + f64::NAN, + ]; + check_targeted_unary_f64::( + "atanh_u35", + &atanh_inputs, + contracts::ATANH_U35_F64_MAX_ULP, + |v| v.atanh_u35(), + f64::atanh, + ); + + let asinh_inputs = [ + f64::NEG_INFINITY, + -1.0e300, + -1000.0, + -1.0, + -f64::MIN_POSITIVE, + -0.0, + 0.0, + f64::MIN_POSITIVE, + 1.0, + 1000.0, + 1.0e300, + f64::INFINITY, + f64::NAN, + ]; + check_targeted_unary_f64::( + "asinh_u35", + &asinh_inputs, + contracts::ASINH_U35_F64_MAX_ULP, + |v| v.asinh_u35(), + f64::asinh, + ); +} + +fn run_f64_inverse_hyperbolic_mixed_lanes() { + let mut lanes = vec![0.0f64; S::Vf64::WIDTH]; + let pattern = [ + -0.75f64, + -0.0, + 0.0, + 0.75, + 0.999_999_999_999, + -0.999_999_999_999, + 1.0, + -1.0, + 1.0 + f64::EPSILON, + f64::NAN, + f64::INFINITY, + f64::NEG_INFINITY, + 1.0, + 2.0, + 10.0, + 0.5, + ]; + for (i, lane) in lanes.iter_mut().enumerate() { + *lane = pattern[i % pattern.len()]; + } + + let input = S::Vf64::load_from_slice(&lanes); + let asinh = input.asinh_u35(); + let acosh = input.acosh_u35(); + let atanh = input.atanh_u35(); + + for (lane, &x) in lanes.iter().enumerate() { + assert_f64_contract( + "asinh_u35", + x, + asinh[lane], + x.asinh(), + contracts::ASINH_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("lane {lane}: {e}")); + + assert_f64_contract( + "acosh_u35", + x, + acosh[lane], + x.acosh(), + contracts::ACOSH_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("lane {lane}: {e}")); + + assert_f64_contract( + "atanh_u35", + x, + atanh[lane], + x.atanh(), + contracts::ATANH_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("lane {lane}: {e}")); + } +} + +simd_math_targeted_all_backends!( + f64_inverse_hyperbolic_domain_edges, + run_f64_inverse_hyperbolic_domain_edges +); +simd_math_targeted_all_backends!( + f64_inverse_hyperbolic_mixed_lanes, + run_f64_inverse_hyperbolic_mixed_lanes +); diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs index babe6b3..fe2f727 100644 --- a/src/tests/simd_math_targeted_edges/mod.rs +++ b/src/tests/simd_math_targeted_edges/mod.rs @@ -13,7 +13,7 @@ use crate::engines::wasm32::Wasm; use crate::engines::{avx2::Avx2, sse2::Sse2, sse41::Sse41}; use crate::math::SimdMathF32Core; -use crate::math::{contracts, SimdMathF32}; +use crate::math::{contracts, SimdMathF32, SimdMathF64}; use crate::{Simd, SimdBaseIo, SimdConsts}; fn assert_f32_contract( @@ -89,6 +89,79 @@ fn check_targeted_unary_f32( } } +fn assert_f64_contract( + fn_name: &str, + input: f64, + actual: f64, + expected: f64, + max_ulp: u64, +) -> Result<(), String> { + if expected.is_nan() { + if actual.is_nan() { + return Ok(()); + } + return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}")); + } + + if expected.is_infinite() { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected {:?}, got {:?}", + expected, actual + )); + } + + if expected == 0.0 { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}", + expected.to_bits(), + actual.to_bits() + )); + } + + if actual.is_nan() || actual.is_infinite() { + return Err(format!( + "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}" + )); + } + + let ulp = ulp_distance_f64(actual, expected) + .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?; + if ulp > max_ulp { + return Err(format!( + "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})" + )); + } + + Ok(()) +} + +fn check_targeted_unary_f64( + fn_name: &str, + inputs: &[f64], + max_ulp: u64, + simd_fn: impl Fn(S::Vf64) -> S::Vf64, + scalar_ref: impl Fn(f64) -> f64, +) { + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let input = S::Vf64::load_from_slice(chunk); + let output = simd_fn(input); + + for (lane, &x) in chunk.iter().enumerate() { + let actual = output[lane]; + let expected = scalar_ref(x); + if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) { + panic!("{err}"); + } + } + } +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) { assert_eq!(input.len(), output.len());