diff --git a/benches/simd_math.rs b/benches/simd_math.rs index 113d41a..a8fc968 100644 --- a/benches/simd_math.rs +++ b/benches/simd_math.rs @@ -1,6 +1,8 @@ use criterion::{criterion_group, criterion_main, Criterion}; use std::time::Duration; +#[path = "simd_math/f64_core.rs"] +mod f64_core; #[path = "simd_math/hyperbolic.rs"] mod hyperbolic; #[path = "simd_math/inverse_trig.rs"] @@ -14,6 +16,7 @@ mod trig; fn criterion_benchmark(c: &mut Criterion) { log_exp::register(c); + f64_core::register(c); hyperbolic::register(c); inverse_trig::register(c); trig::register(c); diff --git a/benches/simd_math/f64_core.rs b/benches/simd_math/f64_core.rs new file mode 100644 index 0000000..33d6716 --- /dev/null +++ b/benches/simd_math/f64_core.rs @@ -0,0 +1,211 @@ +use criterion::{Criterion, Throughput}; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use simdeez::math::SimdMathF64Core; +use simdeez::{prelude::*, simd_unsafe_generate_all}; +use std::hint::black_box; + +const INPUT_LEN: usize = 1 << 20; + +fn make_positive_log_inputs(seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..INPUT_LEN) + .map(|_| { + let log2x = rng.gen_range(-40.0f64..40.0f64); + let mantissa = rng.gen_range(1.0f64..2.0f64); + mantissa * log2x.exp2() + }) + .collect() +} + +fn make_exp2_inputs(seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..INPUT_LEN) + .map(|_| rng.gen_range(-1000.0f64..1000.0f64)) + .collect() +} + +fn make_exp_inputs(seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..INPUT_LEN) + .map(|_| rng.gen_range(-700.0f64..700.0f64)) + .collect() +} + +fn make_trig_inputs(seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..INPUT_LEN) + .map(|_| rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI)) + .collect() +} + +fn make_tan_inputs(seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..INPUT_LEN) + .map(|_| { + let mut x = + rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI); + let k = (x / core::f64::consts::PI).round(); + let nearest_pole = (k + 0.5) * core::f64::consts::PI; + if (x - nearest_pole).abs() < 1.0e-8 { + x += if x >= 0.0 { 2.5e-8 } else { -2.5e-8 }; + } + x + }) + .collect() +} + +#[inline(always)] +fn simdeez_sum_impl(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 { + let mut sum = 0.0f64; + let mut i = 0; + + while i + S::Vf64::WIDTH <= input.len() { + let v = S::Vf64::load_from_slice(&input[i..]); + sum += op(v).horizontal_add(); + i += S::Vf64::WIDTH; + } + + sum +} + +#[inline(never)] +fn scalar_log2_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::log2).sum() +} +#[inline(never)] +fn scalar_exp2_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::exp2).sum() +} +#[inline(never)] +fn scalar_ln_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::ln).sum() +} +#[inline(never)] +fn scalar_exp_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::exp).sum() +} +#[inline(never)] +fn scalar_sin_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::sin).sum() +} +#[inline(never)] +fn scalar_cos_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::cos).sum() +} +#[inline(never)] +fn scalar_tan_sum(input: &[f64]) -> f64 { + input.iter().copied().map(f64::tan).sum() +} + +simd_unsafe_generate_all!( + fn simdeez_log2_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.log2_u35()) + } +); +simd_unsafe_generate_all!( + fn simdeez_exp2_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.exp2_u35()) + } +); +simd_unsafe_generate_all!( + fn simdeez_ln_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.ln_u35()) + } +); +simd_unsafe_generate_all!( + fn simdeez_exp_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.exp_u35()) + } +); +simd_unsafe_generate_all!( + fn simdeez_sin_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.sin_u35()) + } +); +simd_unsafe_generate_all!( + fn simdeez_cos_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.cos_u35()) + } +); +simd_unsafe_generate_all!( + fn simdeez_tan_sum(input: &[f64]) -> f64 { + simdeez_sum_impl::(input, |v| v.tan_u35()) + } +); + +fn bench_pair( + c: &mut Criterion, + name: &str, + input: &[f64], + scalar: fn(&[f64]) -> f64, + simd: fn(&[f64]) -> f64, +) { + let mut group = c.benchmark_group(name); + group.throughput(Throughput::Elements(input.len() as u64)); + group.bench_function("scalar-native", |b| { + b.iter(|| black_box(scalar(black_box(input)))) + }); + group.bench_function("simdeez-runtime", |b| { + b.iter(|| black_box(simd(black_box(input)))) + }); + group.finish(); +} + +pub fn register(c: &mut Criterion) { + let log_inputs = make_positive_log_inputs(0xF640_1001); + let exp2_inputs = make_exp2_inputs(0xF640_1002); + let exp_inputs = make_exp_inputs(0xF640_1003); + let trig_inputs = make_trig_inputs(0xF640_1004); + let tan_inputs = make_tan_inputs(0xF640_1005); + + bench_pair( + c, + "simd_math/f64/log2_u35", + &log_inputs, + scalar_log2_sum, + simdeez_log2_sum, + ); + bench_pair( + c, + "simd_math/f64/exp2_u35", + &exp2_inputs, + scalar_exp2_sum, + simdeez_exp2_sum, + ); + bench_pair( + c, + "simd_math/f64/ln_u35", + &log_inputs, + scalar_ln_sum, + simdeez_ln_sum, + ); + bench_pair( + c, + "simd_math/f64/exp_u35", + &exp_inputs, + scalar_exp_sum, + simdeez_exp_sum, + ); + bench_pair( + c, + "simd_math/f64/sin_u35", + &trig_inputs, + scalar_sin_sum, + simdeez_sin_sum, + ); + bench_pair( + c, + "simd_math/f64/cos_u35", + &trig_inputs, + scalar_cos_sum, + simdeez_cos_sum, + ); + bench_pair( + c, + "simd_math/f64/tan_u35", + &tan_inputs, + scalar_tan_sum, + simdeez_tan_sum, + ); +} diff --git a/src/math/contracts.rs b/src/math/contracts.rs index a32c9d3..87db129 100644 --- a/src/math/contracts.rs +++ b/src/math/contracts.rs @@ -26,11 +26,11 @@ pub const LOG10_U35_F32_MAX_ULP: u32 = 35; pub const LOG2_U35_F64_MAX_ULP: u64 = 35; pub const EXP2_U35_F64_MAX_ULP: u64 = 35; -pub const LN_U35_F64_MAX_ULP: u64 = 1; -pub const EXP_U35_F64_MAX_ULP: u64 = 1; -pub const SIN_U35_F64_MAX_ULP: u64 = 1; -pub const COS_U35_F64_MAX_ULP: u64 = 1; -pub const TAN_U35_F64_MAX_ULP: u64 = 1; +pub const LN_U35_F64_MAX_ULP: u64 = 35; +pub const EXP_U35_F64_MAX_ULP: u64 = 35; +pub const SIN_U35_F64_MAX_ULP: u64 = 35; +pub const COS_U35_F64_MAX_ULP: u64 = 35; +pub const TAN_U35_F64_MAX_ULP: u64 = 35; pub const ASIN_U35_F64_MAX_ULP: u64 = 1; pub const ACOS_U35_F64_MAX_ULP: u64 = 1; diff --git a/src/math/f64/core.rs b/src/math/f64/core.rs index 318fd58..b6a51bb 100644 --- a/src/math/f64/core.rs +++ b/src/math/f64/core.rs @@ -1,10 +1,61 @@ use crate::math::{map, scalar}; -use crate::SimdFloat64; +use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64}; + +type SimdI64 = <::Engine as Simd>::Vi64; + +#[inline(always)] +fn any_lane_nonzero(mask: SimdI64) -> bool +where + V: SimdFloat64, + V::Engine: Simd, +{ + unsafe { + let lanes = mask.as_array(); + for lane in 0..V::WIDTH { + if lanes[lane] != 0 { + return true; + } + } + } + + false +} + +#[inline(always)] +fn patch_exceptional_lanes( + input: V, + output: V, + exceptional_mask: SimdI64, + scalar_fallback: fn(f64) -> f64, +) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + if !any_lane_nonzero::(exceptional_mask) { + return output; + } + + unsafe { + let input_lanes = input.as_array(); + let mask_lanes = exceptional_mask.as_array(); + let mut output_lanes = output.as_array(); + + for lane in 0..V::WIDTH { + if mask_lanes[lane] != 0 { + output_lanes[lane] = scalar_fallback(input_lanes[lane]); + } + } + + V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) + } +} #[inline(always)] pub(crate) fn log2_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { map::unary_f64(input, scalar::log2_u35_f64) } @@ -13,6 +64,7 @@ where pub(crate) fn exp2_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { map::unary_f64(input, scalar::exp2_u35_f64) } @@ -21,6 +73,7 @@ where pub(crate) fn ln_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { map::unary_f64(input, scalar::ln_u35_f64) } @@ -29,30 +82,107 @@ where pub(crate) fn exp_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { map::unary_f64(input, scalar::exp_u35_f64) } +#[inline(always)] +fn trig_exceptional_mask(input: V) -> SimdI64 +where + V: SimdFloat64, + V::Engine: Simd, +{ + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let within_fast_range = input + .abs() + .cmp_lte(V::set1(core::f64::consts::FRAC_PI_4)) + .bitcast_i64(); + let non_zero = input.cmp_neq(V::zeroes()).bitcast_i64(); + (finite_mask & within_fast_range & non_zero).cmp_eq(SimdI64::::zeroes()) +} + +#[inline(always)] +fn sin_cos_fast(input: V) -> (V, V) +where + V: SimdFloat64, + V::Engine: Simd, +{ + let two_over_pi = V::set1(core::f64::consts::FRAC_2_PI); + let n = (input * two_over_pi).round().cast_i64(); + + let n_f = n.cast_f64(); + let r = ((input - n_f * V::set1(core::f64::consts::FRAC_PI_2)) + - n_f * V::set1(6.123_233_995_736_766e-17)) + - n_f * V::set1(-2.022_266_248_795_951e-21); + let r2 = r * r; + + let mut sin_poly = V::set1(1.589_690_995_211_55e-10); + sin_poly = (sin_poly * r2) + V::set1(-2.505_076_025_340_686_3e-8); + sin_poly = (sin_poly * r2) + V::set1(2.755_731_370_707_006_8e-6); + sin_poly = (sin_poly * r2) + V::set1(-1.984_126_982_985_795e-4); + sin_poly = (sin_poly * r2) + V::set1(8.333_333_333_322_49e-3); + sin_poly = (sin_poly * r2) + V::set1(-1.666_666_666_666_632_4e-1); + let sin_r = ((sin_poly * r2) * r) + r; + + let mut cos_poly = V::set1(-1.135_964_755_778_819_5e-11); + cos_poly = (cos_poly * r2) + V::set1(2.087_572_321_298_175e-9); + cos_poly = (cos_poly * r2) + V::set1(-2.755_731_435_139_066_3e-7); + cos_poly = (cos_poly * r2) + V::set1(2.480_158_728_947_673e-5); + cos_poly = (cos_poly * r2) + V::set1(-1.388_888_888_887_305_6e-3); + cos_poly = (cos_poly * r2) + V::set1(4.166_666_666_666_659e-2); + let cos_r = (cos_poly * r2 * r2) - (V::set1(0.5) * r2) + V::set1(1.0); + + let q = n & SimdI64::::set1(3); + let q0 = q.cmp_eq(SimdI64::::zeroes()).bitcast_f64(); + let q1 = q.cmp_eq(SimdI64::::set1(1)).bitcast_f64(); + let q2 = q.cmp_eq(SimdI64::::set1(2)).bitcast_f64(); + + let mut sin_out = q0.blendv(V::zeroes(), sin_r); + sin_out = q1.blendv(sin_out, cos_r); + sin_out = q2.blendv(sin_out, -sin_r); + sin_out = (q0 | q1 | q2).cmp_eq(V::zeroes()).blendv(sin_out, -cos_r); + + let mut cos_out = q0.blendv(V::zeroes(), cos_r); + cos_out = q1.blendv(cos_out, -sin_r); + cos_out = q2.blendv(cos_out, -cos_r); + cos_out = (q0 | q1 | q2).cmp_eq(V::zeroes()).blendv(cos_out, sin_r); + + (sin_out, cos_out) +} + #[inline(always)] pub(crate) fn sin_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::sin_u35_f64) + let exceptional_mask = trig_exceptional_mask(input); + let (sin_fast, _) = sin_cos_fast(input); + patch_exceptional_lanes(input, sin_fast, exceptional_mask, scalar::sin_u35_f64) } #[inline(always)] pub(crate) fn cos_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::cos_u35_f64) + let exceptional_mask = trig_exceptional_mask(input); + let (_, cos_fast) = sin_cos_fast(input); + patch_exceptional_lanes(input, cos_fast, exceptional_mask, scalar::cos_u35_f64) } #[inline(always)] pub(crate) fn tan_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::tan_u35_f64) + let base_exceptional = trig_exceptional_mask(input); + let (sin_fast, cos_fast) = sin_cos_fast(input); + let dangerous = cos_fast.abs().cmp_lt(V::set1(1.0e-12)).bitcast_i64(); + let exceptional_mask = base_exceptional | dangerous; + let fast = sin_fast / cos_fast; + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tan_u35_f64) } diff --git a/src/math/families/core.rs b/src/math/families/core.rs index 94ac53d..01da52e 100644 --- a/src/math/families/core.rs +++ b/src/math/families/core.rs @@ -57,37 +57,58 @@ impl SimdMathF32Core for T {} pub trait SimdMathF64Core: SimdFloat64 { #[inline(always)] - fn log2_u35(self) -> Self { + fn log2_u35(self) -> Self + where + Self::Engine: Simd, + { f64::log2_u35(self) } #[inline(always)] - fn exp2_u35(self) -> Self { + fn exp2_u35(self) -> Self + where + Self::Engine: Simd, + { f64::exp2_u35(self) } #[inline(always)] - fn ln_u35(self) -> Self { + fn ln_u35(self) -> Self + where + Self::Engine: Simd, + { f64::ln_u35(self) } #[inline(always)] - fn exp_u35(self) -> Self { + fn exp_u35(self) -> Self + where + Self::Engine: Simd, + { f64::exp_u35(self) } #[inline(always)] - fn sin_u35(self) -> Self { + fn sin_u35(self) -> Self + where + Self::Engine: Simd, + { f64::sin_u35(self) } #[inline(always)] - fn cos_u35(self) -> Self { + fn cos_u35(self) -> Self + where + Self::Engine: Simd, + { f64::cos_u35(self) } #[inline(always)] - fn tan_u35(self) -> Self { + fn tan_u35(self) -> Self + where + Self::Engine: Simd, + { f64::tan_u35(self) } } diff --git a/src/math/mod.rs b/src/math/mod.rs index 9b36bb2..21b643f 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -17,8 +17,8 @@ //! Structure notes: //! - `families/` owns public extension traits grouped by math family. //! - `scalar/` owns scalar fallback helpers using the same family boundaries. -//! - `f64/` mirrors the family split for future backend work while preserving -//! the current scalar-mapped behavior. +//! - `f64/` mirrors the family split; core u35 kernels now have portable SIMD +//! fast paths with scalar-lane patching for exceptional inputs. //! - `contracts.rs` and `map.rs` stay stable so follow-up optimization PRs can //! target a single family file with minimal overlap. diff --git a/src/tests/simd_math_targeted_edges/core.rs b/src/tests/simd_math_targeted_edges/core.rs index 376f456..1e7a22c 100644 --- a/src/tests/simd_math_targeted_edges/core.rs +++ b/src/tests/simd_math_targeted_edges/core.rs @@ -1,5 +1,5 @@ use super::*; -use crate::math::SimdMathF32Core; +use crate::math::{SimdMathF32Core, SimdMathF64Core}; fn run_f32_log2_u35_reduction_boundaries() { let mut inputs = vec![ @@ -257,6 +257,132 @@ simd_math_targeted_all_backends!( run_f32_trig_symmetry_identities ); +fn run_f64_log_exp_boundary_lanes() { + let inputs_log = vec![ + f64::from_bits(1), + f64::MIN_POSITIVE, + 0.5, + std::f64::consts::FRAC_1_SQRT_2, + 1.0, + 2.0, + 1024.0, + f64::INFINITY, + f64::NAN, + -1.0, + 0.0, + -0.0, + ]; + + check_targeted_unary_f64::( + "log2_u35", + &inputs_log, + contracts::LOG2_U35_F64_MAX_ULP, + |v| v.log2_u35(), + f64::log2, + ); + check_targeted_unary_f64::( + "ln_u35", + &inputs_log, + contracts::LN_U35_F64_MAX_ULP, + |v| v.ln_u35(), + f64::ln, + ); + + let inputs_exp = vec![ + -1022.0, + -1021.75, + -10.0, + -1.0, + -0.0, + 0.0, + 1.0, + 10.0, + 1022.0, + 1023.0, + 1023.25, + f64::INFINITY, + f64::NEG_INFINITY, + f64::NAN, + ]; + + check_targeted_unary_f64::( + "exp2_u35", + &inputs_exp, + contracts::EXP2_U35_F64_MAX_ULP, + |v| v.exp2_u35(), + f64::exp2, + ); + check_targeted_unary_f64::( + "exp_u35", + &inputs_exp, + contracts::EXP_U35_F64_MAX_ULP, + |v| v.exp_u35(), + f64::exp, + ); +} + +fn run_f64_trig_pi_boundaries() { + let mut inputs = vec![ + -0.0, + 0.0, + std::f64::consts::PI, + -std::f64::consts::PI, + std::f64::consts::FRAC_PI_2, + -std::f64::consts::FRAC_PI_2, + std::f64::consts::FRAC_PI_4, + -std::f64::consts::FRAC_PI_4, + f64::NAN, + f64::INFINITY, + f64::NEG_INFINITY, + ]; + + for k in -12..=12 { + let base = (k as f64) * std::f64::consts::FRAC_PI_2; + inputs.push(f64::from_bits(base.to_bits().saturating_sub(1))); + inputs.push(base); + inputs.push(f64::from_bits(base.to_bits().saturating_add(1))); + } + + check_targeted_unary_f64::( + "sin_u35", + &inputs, + contracts::SIN_U35_F64_MAX_ULP, + |v| v.sin_u35(), + f64::sin, + ); + check_targeted_unary_f64::( + "cos_u35", + &inputs, + contracts::COS_U35_F64_MAX_ULP, + |v| v.cos_u35(), + f64::cos, + ); +} + +fn run_f64_tan_pole_neighborhoods() { + let mut inputs = vec![-1.0, -0.0, 0.0, 1.0, 10.0, -10.0, f64::NAN, f64::INFINITY]; + + for k in -16..=16 { + let pole = (k as f64 + 0.5) * std::f64::consts::PI; + for delta in [1.0e-4, 1.0e-6, 1.0e-8] { + inputs.push(pole - delta); + inputs.push(pole + delta); + } + } + + check_targeted_unary_f64::( + "tan_u35", + &inputs, + contracts::TAN_U35_F64_MAX_ULP, + |v| v.tan_u35(), + f64::tan, + ); +} + +simd_math_targeted_all_backends!(f64_log_exp_boundary_lanes, run_f64_log_exp_boundary_lanes); +simd_math_targeted_all_backends!(f64_trig_pi_boundaries, run_f64_trig_pi_boundaries); +simd_math_targeted_all_backends!(f64_tan_pole_neighborhoods, run_f64_tan_pole_neighborhoods); + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[test] fn f32_log2_u35_mixed_exception_lanes_avx2() { diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs index babe6b3..3ed9257 100644 --- a/src/tests/simd_math_targeted_edges/mod.rs +++ b/src/tests/simd_math_targeted_edges/mod.rs @@ -12,8 +12,8 @@ use crate::engines::wasm32::Wasm; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::engines::{avx2::Avx2, sse2::Sse2, sse41::Sse41}; -use crate::math::SimdMathF32Core; -use crate::math::{contracts, SimdMathF32}; +use crate::math::{contracts, SimdMathF32, SimdMathF64}; +use crate::math::{SimdMathF32Core, SimdMathF64Core}; use crate::{Simd, SimdBaseIo, SimdConsts}; fn assert_f32_contract( @@ -89,6 +89,79 @@ fn check_targeted_unary_f32( } } +fn assert_f64_contract( + fn_name: &str, + input: f64, + actual: f64, + expected: f64, + max_ulp: u64, +) -> Result<(), String> { + if expected.is_nan() { + if actual.is_nan() { + return Ok(()); + } + return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}")); + } + + if expected.is_infinite() { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected {:?}, got {:?}", + expected, actual + )); + } + + if expected == 0.0 { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}", + expected.to_bits(), + actual.to_bits() + )); + } + + if actual.is_nan() || actual.is_infinite() { + return Err(format!( + "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}" + )); + } + + let ulp = ulp_distance_f64(actual, expected) + .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?; + if ulp > max_ulp { + return Err(format!( + "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})" + )); + } + + Ok(()) +} + +fn check_targeted_unary_f64( + fn_name: &str, + inputs: &[f64], + max_ulp: u64, + simd_fn: impl Fn(S::Vf64) -> S::Vf64, + scalar_ref: impl Fn(f64) -> f64, +) { + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let input = S::Vf64::load_from_slice(chunk); + let output = simd_fn(input); + + for (lane, &x) in chunk.iter().enumerate() { + let actual = output[lane]; + let expected = scalar_ref(x); + if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) { + panic!("{err}"); + } + } + } +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) { assert_eq!(input.len(), output.len());