From 9df94609d3ee03b6715faf3dd2f6e3ca6db790cb Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Sun, 22 Mar 2026 23:40:05 +1100
Subject: [PATCH 1/9] Optimize f64 hyperbolic family with portable lane
 patching

---
 benches/simd_math/hyperbolic.rs               | 160 +++++++++++++++-
 src/math/f64/hyperbolic.rs                    | 172 +++++++++++++++++-
 .../simd_math_targeted_edges/hyperbolic.rs    | 107 ++++++++++-
 src/tests/simd_math_targeted_edges/mod.rs     |  73 ++++++++
 4 files changed, 504 insertions(+), 8 deletions(-)
diff --git a/benches/simd_math/hyperbolic.rs b/benches/simd_math/hyperbolic.rs
index cd8a4eb..027f4ee 100644
--- a/benches/simd_math/hyperbolic.rs
+++ b/benches/simd_math/hyperbolic.rs
@@ -1,8 +1,9 @@
-use criterion::Criterion;
-use simdeez::math::SimdMathF32Hyperbolic;
+use criterion::{Criterion, Throughput};
+use simdeez::math::{SimdMathF32Hyperbolic, SimdMathF64Hyperbolic};
 #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 use simdeez::scalar::Scalar;
 use simdeez::{prelude::*, simd_unsafe_generate_all};
+use std::hint::black_box;
 
 use crate::shared::{self, BenchTargets, INPUT_LEN};
 
@@ -39,6 +40,39 @@ simd_unsafe_generate_all!(
     }
 );
 
+#[inline(never)]
+fn scalar_sinh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::sinh).sum()
+}
+
+#[inline(never)]
+fn scalar_cosh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::cosh).sum()
+}
+
+#[inline(never)]
+fn scalar_tanh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::tanh).sum()
+}
+
+simd_unsafe_generate_all!(
+    fn simdeez_sinh_sum_f64(input: &[f64]) -> f64 {
+        simdeez_sum_impl_f64::<S>(input, |v| v.sinh_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_cosh_sum_f64(input: &[f64]) -> f64 {
+        simdeez_sum_impl_f64::<S>(input, |v| v.cosh_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_tanh_sum_f64(input: &[f64]) -> f64 {
+        simdeez_sum_impl_f64::<S>(input, |v| v.tanh_u35())
+    }
+);
+
 #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 #[inline(never)]
 fn simdeez_sinh_sum_scalar(input: &[f32]) -> f32 {
@@ -57,6 +91,73 @@ fn simdeez_tanh_sum_scalar(input: &[f32]) -> f32 {
     shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.tanh_u35())
 }
 
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn force_scalar_sum_f64(
+    input: &[f64],
+    op: impl Fn(<Scalar as Simd>::Vf64) -> <Scalar as Simd>::Vf64,
+) -> f64 {
+    simdeez_sum_impl_f64::<Scalar>(input, op)
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 {
+    force_scalar_sum_f64(input, |v| v.sinh_u35())
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 {
+    force_scalar_sum_f64(input, |v| v.cosh_u35())
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 {
+    force_scalar_sum_f64(input, |v| v.tanh_u35())
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(never)]
+fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 {
+    simdeez_sinh_sum_f64(input)
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(never)]
+fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 {
+    simdeez_cosh_sum_f64(input)
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(never)]
+fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 {
+    simdeez_tanh_sum_f64(input)
+}
+
+#[inline(always)]
+fn simdeez_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+
+    while i + S::Vf64::WIDTH <= input.len() {
+        let v = S::Vf64::load_from_slice(&input[i..]);
+        sum += op(v).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+
+    sum
+}
+
+fn make_unary_inputs_f64(len: usize, seed: u64, range: core::ops::Range<f64>) -> Vec<f64> {
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(range.clone())).collect()
+}
+
 pub fn register(c: &mut Criterion) {
     let sinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xA11C_E006, -5.0..5.0);
     let cosh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xA11C_E007, -5.0..5.0);
@@ -118,4 +219,59 @@ pub fn register(c: &mut Criterion) {
             simdeez_avx512: simdeez_tanh_sum_avx512,
         },
     );
+
+    let sinh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E106, -5.0..5.0);
+    let cosh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E107, -5.0..5.0);
+    let tanh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E108, -20.0..20.0);
+
+    bench_variants_f64(
+        c,
+        "simd_math/f64/sinh_u35",
+        &sinh_inputs_f64,
+        scalar_sinh_sum_f64,
+        simdeez_sinh_sum_f64,
+        simdeez_sinh_sum_scalar_f64,
+    );
+    bench_variants_f64(
+        c,
+        "simd_math/f64/cosh_u35",
+        &cosh_inputs_f64,
+        scalar_cosh_sum_f64,
+        simdeez_cosh_sum_f64,
+        simdeez_cosh_sum_scalar_f64,
+    );
+    bench_variants_f64(
+        c,
+        "simd_math/f64/tanh_u35",
+        &tanh_inputs_f64,
+        scalar_tanh_sum_f64,
+        simdeez_tanh_sum_f64,
+        simdeez_tanh_sum_scalar_f64,
+    );
+}
+
+fn bench_variants_f64(
+    c: &mut Criterion,
+    group_name: &str,
+    input: &[f64],
+    scalar_native: fn(&[f64]) -> f64,
+    simdeez_runtime: fn(&[f64]) -> f64,
+    simdeez_scalar: fn(&[f64]) -> f64,
+) {
+    let mut group = c.benchmark_group(group_name);
+    group.throughput(Throughput::Elements(input.len() as u64));
+
+    group.bench_function("scalar-native", |b| {
+        b.iter(|| black_box(scalar_native(black_box(input))))
+    });
+
+    group.bench_function("simdeez-runtime", |b| {
+        b.iter(|| black_box(simdeez_runtime(black_box(input))))
+    });
+
+    group.bench_function("simdeez-forced-scalar", |b| {
+        b.iter(|| black_box(simdeez_scalar(black_box(input))))
+    });
+
+    group.finish();
 }
diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs
index 111aa78..b0277de 100644
--- a/src/math/f64/hyperbolic.rs
+++ b/src/math/f64/hyperbolic.rs
@@ -1,12 +1,150 @@
-use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::math::scalar;
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
+
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+const SINH_COSH_SMALL_ABS: f64 = 0.125;
+const SINH_COSH_FAST_ABS_MAX: f64 = 0.125;
+const TANH_SMALL_ABS: f64 = 0.0;
+const TANH_FAST_ABS_MAX: f64 = 0.0;
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn exp_u35<V>(input: V) -> V
+where
+    V: SimdFloat64,
+{
+    // Temporary family-local bridge: use scalar exp lane mapping here while
+    // avoiding scalar lane mapping for the final hyperbolic functions.
+    unsafe {
+        let mut lanes = input.as_array();
+        for lane in 0..V::WIDTH {
+            lanes[lane] = scalar::exp_u35_f64(lanes[lane]);
+        }
+        V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn sinh_small<V>(input: V, input_sq: V) -> V
+where
+    V: SimdFloat64,
+{
+    let poly = ((((V::set1(1.0 / 39916800.0) * input_sq) + V::set1(1.0 / 362880.0)) * input_sq
+        + V::set1(1.0 / 5040.0))
+        * input_sq
+        + V::set1(1.0 / 120.0))
+        * input_sq
+        + V::set1(1.0 / 6.0);
+
+    input + (input * input_sq * poly)
+}
+
+#[inline(always)]
+fn cosh_small<V>(input_sq: V) -> V
+where
+    V: SimdFloat64,
+{
+    let poly = (((V::set1(1.0 / 40320.0) * input_sq) + V::set1(1.0 / 720.0)) * input_sq
+        + V::set1(1.0 / 24.0))
+        * input_sq
+        + V::set1(0.5);
+
+    V::set1(1.0) + (input_sq * poly)
+}
+
+#[inline(always)]
+fn sinh_cosh_medium<V>(abs_input: V) -> (V, V)
+where
+    V: SimdFloat64,
+{
+    let exp_abs = exp_u35(abs_input);
+    let exp_neg_abs = V::set1(1.0) / exp_abs;
+    let half = V::set1(0.5);
+
+    (
+        (exp_abs - exp_neg_abs) * half,
+        (exp_abs + exp_neg_abs) * half,
+    )
+}
+
+#[inline(always)]
+fn sinh_cosh_masks<V>(input: V) -> (SimdI64<V>, V, V)
+where
+    V: SimdFloat64,
+{
+    let abs_input = input.abs();
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = abs_input
+        .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX))
+        .bitcast_i64();
+
+    (finite_mask & within_fast_range, abs_input, input * input)
+}
 
 #[inline(always)]
 pub(crate) fn sinh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::sinh_u35_f64)
+    let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input);
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+    let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS));
+
+    let fast_small = sinh_small(input, input_sq);
+    let exp_input = exp_u35(input);
+    let exp_neg_input = V::set1(1.0) / exp_input;
+    let sinh_medium = (exp_input - exp_neg_input) * V::set1(0.5);
+    let fast = small_mask.blendv(sinh_medium, fast_small);
+    let zero_mask = input.cmp_eq(V::set1(0.0));
+    let fast = zero_mask.blendv(fast, input);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64)
 }
 
 #[inline(always)]
@@ -14,7 +152,15 @@ pub(crate) fn cosh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::cosh_u35_f64)
+    let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input);
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+    let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS));
+
+    let fast_small = cosh_small(input_sq);
+    let (_, cosh_medium) = sinh_cosh_medium(abs_input);
+    let fast = small_mask.blendv(cosh_medium, fast_small);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::cosh_u35_f64)
 }
 
 #[inline(always)]
@@ -22,5 +168,21 @@ pub(crate) fn tanh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::tanh_u35_f64)
+    let abs_input = input.abs();
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64();
+    let exceptional_mask = (finite_mask & within_fast_range).cmp_eq(SimdI64::<V>::zeroes());
+    let small_mask = abs_input.cmp_lt(V::set1(TANH_SMALL_ABS));
+
+    let input_sq = input * input;
+    let fast_small = sinh_small(input, input_sq) / cosh_small(input_sq);
+
+    let exp_input = exp_u35(input);
+    let exp_neg_input = V::set1(1.0) / exp_input;
+    let tanh_medium = (exp_input - exp_neg_input) / (exp_input + exp_neg_input);
+    let fast = small_mask.blendv(tanh_medium, fast_small);
+    let zero_mask = input.cmp_eq(V::set1(0.0));
+    let fast = zero_mask.blendv(fast, input);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64)
 }
diff --git a/src/tests/simd_math_targeted_edges/hyperbolic.rs b/src/tests/simd_math_targeted_edges/hyperbolic.rs
index 5cdfc07..f8e509f 100644
--- a/src/tests/simd_math_targeted_edges/hyperbolic.rs
+++ b/src/tests/simd_math_targeted_edges/hyperbolic.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::math::{SimdMathF32Hyperbolic, SimdMathF32InverseHyperbolic};
+use crate::math::{SimdMathF32Hyperbolic, SimdMathF32InverseHyperbolic, SimdMathF64Hyperbolic};
 
 fn run_f32_hyperbolic_edges<S: Simd>() {
     let inputs = [
@@ -195,3 +195,108 @@ simd_math_targeted_all_backends!(
     f32_hyperbolic_special_values_and_mixed_lanes,
     run_f32_hyperbolic_special_values_and_mixed_lanes
 );
+
+fn push_boundary_triplet_f64(inputs: &mut Vec<f64>, center: f64) {
+    inputs.push(f64::from_bits(center.to_bits().saturating_sub(1)));
+    inputs.push(center);
+    inputs.push(f64::from_bits(center.to_bits().saturating_add(1)));
+}
+
+fn run_f64_hyperbolic_fast_path_boundaries<S: Simd>() {
+    let mut inputs = Vec::new();
+    for center in [
+        -30.0f64, -20.0, -0.625, -0.5, -0.0, 0.0, 0.5, 0.625, 20.0, 30.0,
+    ] {
+        push_boundary_triplet_f64(&mut inputs, center);
+    }
+
+    check_targeted_unary_f64::<S>(
+        "sinh_u35",
+        &inputs,
+        contracts::SINH_U35_F64_MAX_ULP,
+        |v| v.sinh_u35(),
+        f64::sinh,
+    );
+    check_targeted_unary_f64::<S>(
+        "cosh_u35",
+        &inputs,
+        contracts::COSH_U35_F64_MAX_ULP,
+        |v| v.cosh_u35(),
+        f64::cosh,
+    );
+    check_targeted_unary_f64::<S>(
+        "tanh_u35",
+        &inputs,
+        contracts::TANH_U35_F64_MAX_ULP,
+        |v| v.tanh_u35(),
+        f64::tanh,
+    );
+}
+
+fn run_f64_hyperbolic_special_values_and_mixed_lanes<S: Simd>() {
+    let mut inputs = vec![
+        f64::NAN,
+        f64::from_bits(0x7FF8_0000_0000_1234),
+        f64::NEG_INFINITY,
+        f64::INFINITY,
+        -0.0,
+        0.0,
+        -20.0,
+        20.0,
+        -0.5,
+        0.5,
+        -1.0e-12,
+        1.0e-12,
+        -5.0,
+        5.0,
+        -100.0,
+        100.0,
+    ];
+
+    while !inputs.len().is_multiple_of(S::Vf64::WIDTH) {
+        inputs.push(0.25);
+    }
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let sinh = v.sinh_u35();
+        let cosh = v.cosh_u35();
+        let tanh = v.tanh_u35();
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "sinh_u35",
+                x,
+                sinh[lane],
+                x.sinh(),
+                contracts::SINH_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "cosh_u35",
+                x,
+                cosh[lane],
+                x.cosh(),
+                contracts::COSH_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "tanh_u35",
+                x,
+                tanh[lane],
+                x.tanh(),
+                contracts::TANH_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+simd_math_targeted_all_backends!(
+    f64_hyperbolic_fast_path_boundaries,
+    run_f64_hyperbolic_fast_path_boundaries
+);
+simd_math_targeted_all_backends!(
+    f64_hyperbolic_special_values_and_mixed_lanes,
+    run_f64_hyperbolic_special_values_and_mixed_lanes
+);
diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs
index babe6b3..3cb304f 100644
--- a/src/tests/simd_math_targeted_edges/mod.rs
+++ b/src/tests/simd_math_targeted_edges/mod.rs
@@ -89,6 +89,79 @@ fn check_targeted_unary_f32<S: Simd>(
     }
 }
 
+fn assert_f64_contract(
+    fn_name: &str,
+    input: f64,
+    actual: f64,
+    expected: f64,
+    max_ulp: u64,
+) -> Result<(), String> {
+    if expected.is_nan() {
+        if actual.is_nan() {
+            return Ok(());
+        }
+        return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}"));
+    }
+
+    if expected.is_infinite() {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected {:?}, got {:?}",
+            expected, actual
+        ));
+    }
+
+    if expected == 0.0 {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}",
+            expected.to_bits(),
+            actual.to_bits()
+        ));
+    }
+
+    if actual.is_nan() || actual.is_infinite() {
+        return Err(format!(
+            "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}"
+        ));
+    }
+
+    let ulp = ulp_distance_f64(actual, expected)
+        .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?;
+    if ulp > max_ulp {
+        return Err(format!(
+            "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})"
+        ));
+    }
+
+    Ok(())
+}
+
+fn check_targeted_unary_f64<S: Simd>(
+    fn_name: &str,
+    inputs: &[f64],
+    max_ulp: u64,
+    simd_fn: impl Fn(S::Vf64) -> S::Vf64,
+    scalar_ref: impl Fn(f64) -> f64,
+) {
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let input = S::Vf64::load_from_slice(chunk);
+        let output = simd_fn(input);
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            let actual = output[lane];
+            let expected = scalar_ref(x);
+            if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) {
+                panic!("{err}");
+            }
+        }
+    }
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) {
     assert_eq!(input.len(), output.len());

From 558bcfaaa7433812da092d234283f8feb2fffe20 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Sun, 22 Mar 2026 23:41:18 +1100
Subject: [PATCH 2/9] Optimize f64 binary misc lane

---
 .../binary_misc.rs                            |  91 +++++++++-
 .../simd_math_remaining_baseline/shared.rs    |  83 +++++++++
 src/math/f64/binary_misc.rs                   | 171 +++++++++++++++++-
 src/math/families/binary_misc/mod.rs          |  20 +-
 .../simd_math_targeted_edges/binary_misc.rs   | 141 ++++++++++++++-
 src/tests/simd_math_targeted_edges/mod.rs     |  52 ++++++
 6 files changed, 546 insertions(+), 12 deletions(-)

diff --git a/benches/simd_math_remaining_baseline/binary_misc.rs b/benches/simd_math_remaining_baseline/binary_misc.rs
index 89abb6e..6056fb9 100644
--- a/benches/simd_math_remaining_baseline/binary_misc.rs
+++ b/benches/simd_math_remaining_baseline/binary_misc.rs
@@ -1,5 +1,5 @@
 use criterion::Criterion;
-use simdeez::math::SimdMathF32BinaryMisc;
+use simdeez::math::{SimdMathF32BinaryMisc, SimdMathF64BinaryMisc};
 use simdeez::{prelude::*, simd_unsafe_generate_all};
 
 use crate::shared::{self, INPUT_LEN};
@@ -24,6 +24,26 @@ fn scalar_hypot_sum(a: &[f32], b: &[f32]) -> f32 {
     a.iter().zip(b.iter()).map(|(&x, &y)| x.hypot(y)).sum()
 }
 
+#[inline(never)]
+fn scalar_log10_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::log10).sum()
+}
+
+#[inline(never)]
+fn scalar_atan2_sum_f64(a: &[f64], b: &[f64]) -> f64 {
+    a.iter().zip(b.iter()).map(|(&x, &y)| x.atan2(y)).sum()
+}
+
+#[inline(never)]
+fn scalar_fmod_sum_f64(a: &[f64], b: &[f64]) -> f64 {
+    a.iter().zip(b.iter()).map(|(&x, &y)| x % y).sum()
+}
+
+#[inline(never)]
+fn scalar_hypot_sum_f64(a: &[f64], b: &[f64]) -> f64 {
+    a.iter().zip(b.iter()).map(|(&x, &y)| x.hypot(y)).sum()
+}
+
 simd_unsafe_generate_all!(
     fn simdeez_log10_sum(input: &[f32]) -> f32 {
         shared::simdeez_unary_sum_impl::<S>(input, |v| v.log10_u35())
@@ -48,6 +68,30 @@ simd_unsafe_generate_all!(
     }
 );
 
+simd_unsafe_generate_all!(
+    fn simdeez_log10_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_unary_sum_impl_f64::<S>(input, |v| v.log10_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_atan2_sum_f64(a: &[f64], b: &[f64]) -> f64 {
+        shared::simdeez_binary_sum_impl_f64::<S>(a, b, |x, y| x.atan2_u35(y))
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_hypot_sum_f64(a: &[f64], b: &[f64]) -> f64 {
+        shared::simdeez_binary_sum_impl_f64::<S>(a, b, |x, y| x.hypot_u35(y))
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_fmod_sum_f64(a: &[f64], b: &[f64]) -> f64 {
+        shared::simdeez_binary_sum_impl_f64::<S>(a, b, |x, y| x.fmod(y))
+    }
+);
+
 pub fn register(c: &mut Criterion) {
     let log10_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB004, 1.0e-20, 1.0e20);
     let (atan2_y, atan2_x) =
@@ -61,6 +105,19 @@ pub fn register(c: &mut Criterion) {
             *y = 1.0;
         }
     }
+    let log10_inputs_f64 =
+        shared::make_positive_inputs_f64(INPUT_LEN, 0xDEADB104, 1.0e-200, 1.0e200);
+    let (atan2_y_f64, atan2_x_f64) =
+        shared::make_binary_inputs_f64(INPUT_LEN, 0xDEADB105, -100.0..100.0, -100.0..100.0);
+    let (hypot_x_f64, hypot_y_f64) =
+        shared::make_binary_inputs_f64(INPUT_LEN, 0xDEADB106, -1.0e200..1.0e200, -1.0e200..1.0e200);
+    let (fmod_x_f64, mut fmod_y_f64) =
+        shared::make_binary_inputs_f64(INPUT_LEN, 0xDEADB107, -1000.0..1000.0, -1000.0..1000.0);
+    for y in &mut fmod_y_f64 {
+        if *y == 0.0 {
+            *y = 1.0;
+        }
+    }
 
     shared::bench_unary(
         c,
@@ -94,4 +151,36 @@ pub fn register(c: &mut Criterion) {
         scalar_fmod_sum,
         simdeez_fmod_sum,
     );
+
+    shared::bench_unary_f64(
+        c,
+        "simd_math_baseline/f64/log10_u35",
+        &log10_inputs_f64,
+        scalar_log10_sum_f64,
+        simdeez_log10_sum_f64,
+    );
+    shared::bench_binary_f64(
+        c,
+        "simd_math_baseline/f64/atan2_u35",
+        &atan2_y_f64,
+        &atan2_x_f64,
+        scalar_atan2_sum_f64,
+        simdeez_atan2_sum_f64,
+    );
+    shared::bench_binary_f64(
+        c,
+        "simd_math_baseline/f64/hypot_u35",
+        &hypot_x_f64,
+        &hypot_y_f64,
+        scalar_hypot_sum_f64,
+        simdeez_hypot_sum_f64,
+    );
+    shared::bench_binary_f64(
+        c,
+        "simd_math_baseline/f64/fmod",
+        &fmod_x_f64,
+        &fmod_y_f64,
+        scalar_fmod_sum_f64,
+        simdeez_fmod_sum_f64,
+    );
 }
diff --git a/benches/simd_math_remaining_baseline/shared.rs b/benches/simd_math_remaining_baseline/shared.rs
index de3e87e..058000c 100644
--- a/benches/simd_math_remaining_baseline/shared.rs
+++ b/benches/simd_math_remaining_baseline/shared.rs
@@ -28,6 +28,23 @@ pub fn make_binary_inputs(
     (a, b)
 }
 
+pub fn make_positive_inputs_f64(len: usize, seed: u64, min: f64, max: f64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(min..max)).collect()
+}
+
+pub fn make_binary_inputs_f64(
+    len: usize,
+    seed: u64,
+    range_a: core::ops::Range<f64>,
+    range_b: core::ops::Range<f64>,
+) -> (Vec<f64>, Vec<f64>) {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    let a = (0..len).map(|_| rng.gen_range(range_a.clone())).collect();
+    let b = (0..len).map(|_| rng.gen_range(range_b.clone())).collect();
+    (a, b)
+}
+
 #[inline(always)]
 pub fn simdeez_unary_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32) -> f32 {
     let mut sum = 0.0f32;
@@ -57,6 +74,35 @@ pub fn simdeez_binary_sum_impl<S: Simd>(
     sum
 }
 
+#[inline(always)]
+pub fn simdeez_unary_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+    while i + S::Vf64::WIDTH <= input.len() {
+        let v = S::Vf64::load_from_slice(&input[i..]);
+        sum += op(v).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+    sum
+}
+
+#[inline(always)]
+pub fn simdeez_binary_sum_impl_f64<S: Simd>(
+    a: &[f64],
+    b: &[f64],
+    op: impl Fn(S::Vf64, S::Vf64) -> S::Vf64,
+) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+    while i + S::Vf64::WIDTH <= a.len() {
+        let va = S::Vf64::load_from_slice(&a[i..]);
+        let vb = S::Vf64::load_from_slice(&b[i..]);
+        sum += op(va, vb).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+    sum
+}
+
 pub fn bench_unary(
     c: &mut Criterion,
     name: &str,
@@ -93,3 +139,40 @@ pub fn bench_binary(
     });
     group.finish();
 }
+
+pub fn bench_unary_f64(
+    c: &mut Criterion,
+    name: &str,
+    input: &[f64],
+    scalar: fn(&[f64]) -> f64,
+    simd: fn(&[f64]) -> f64,
+) {
+    let mut group = c.benchmark_group(name);
+    group.throughput(Throughput::Elements(input.len() as u64));
+    group.bench_function("scalar-native", |b| {
+        b.iter(|| black_box(scalar(black_box(input))))
+    });
+    group.bench_function("simdeez-runtime", |b| {
+        b.iter(|| black_box(simd(black_box(input))))
+    });
+    group.finish();
+}
+
+pub fn bench_binary_f64(
+    c: &mut Criterion,
+    name: &str,
+    a: &[f64],
+    b: &[f64],
+    scalar: fn(&[f64], &[f64]) -> f64,
+    simd: fn(&[f64], &[f64]) -> f64,
+) {
+    let mut group = c.benchmark_group(name);
+    group.throughput(Throughput::Elements(a.len() as u64));
+    group.bench_function("scalar-native", |ben| {
+        ben.iter(|| black_box(scalar(black_box(a), black_box(b))))
+    });
+    group.bench_function("simdeez-runtime", |ben| {
+        ben.iter(|| black_box(simd(black_box(a), black_box(b))))
+    });
+    group.finish();
+}
diff --git a/src/math/f64/binary_misc.rs b/src/math/f64/binary_misc.rs
index f6d0d26..b99f792 100644
--- a/src/math/f64/binary_misc.rs
+++ b/src/math/f64/binary_misc.rs
@@ -1,34 +1,193 @@
-use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::math::{f64, scalar};
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};
+
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+const F64_SIGN_MASK: i64 = i64::MIN;
+const FAST_FMOD_MAX_QUOTIENT: f64 = 4_503_599_627_370_496.0;
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_unary_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn patch_binary_exceptional_lanes<V>(
+    lhs: V,
+    rhs: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64, f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let lhs_lanes = lhs.as_array();
+        let rhs_lanes = rhs.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(lhs_lanes[lane], rhs_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn finite_mask<V>(input: V) -> SimdI64<V>
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    input.cmp_eq(input).bitcast_i64() & input.abs().cmp_neq(V::set1(f64::INFINITY)).bitcast_i64()
+}
 
 #[inline(always)]
 pub(crate) fn log10_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::log10_u35_f64)
+    let positive_finite = input.cmp_gt(V::zeroes()).bitcast_i64() & finite_mask(input);
+    let fast = f64::log2_u35(input) * V::set1(core::f64::consts::LOG10_2);
+    patch_unary_exceptional_lanes(
+        input,
+        fast,
+        positive_finite.cmp_eq(SimdI64::<V>::zeroes()),
+        scalar::log10_u35_f64,
+    )
 }
 
 #[inline(always)]
 pub(crate) fn atan2_u35<V>(y: V, x: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::binary_f64(y, x, scalar::atan2_u35_f64)
+    let abs_y = y.abs();
+    let abs_x = x.abs();
+
+    let x_is_zero = abs_x.cmp_eq(V::zeroes()).bitcast_i64();
+    let fast_mask = finite_mask(x) & finite_mask(y) & x_is_zero.cmp_eq(SimdI64::<V>::zeroes());
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+
+    let large_mask = abs_y.cmp_gt(abs_x).bitcast_i64();
+    let reduced_base = large_mask
+        .bitcast_f64()
+        .blendv(abs_y / abs_x, abs_x / abs_y);
+
+    let z_atan = f64::atan_u35(reduced_base);
+
+    let pi_over_2 = V::set1(core::f64::consts::FRAC_PI_2);
+    let pi = V::set1(core::f64::consts::PI);
+    let y_sign = y.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK);
+    let x_negative = x.cmp_lt(V::zeroes()).bitcast_i64();
+
+    let base = large_mask.bitcast_f64().blendv(z_atan, pi_over_2 - z_atan);
+    let with_x_quadrant = x_negative.bitcast_f64().blendv(base, pi - base);
+    let fast = (y_sign ^ with_x_quadrant.bitcast_i64()).bitcast_f64();
+
+    patch_binary_exceptional_lanes(y, x, fast, exceptional_mask, scalar::atan2_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn hypot_u35<V>(x: V, y: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::binary_f64(x, y, scalar::hypot_u35_f64)
+    let abs_x = x.abs();
+    let abs_y = y.abs();
+    let x_gt_y = abs_x.cmp_gt(abs_y);
+    let max_xy = x_gt_y.blendv(abs_y, abs_x);
+    let min_xy = x_gt_y.blendv(abs_x, abs_y);
+
+    let fast_mask = finite_mask(x) & finite_mask(y) & max_xy.cmp_neq(V::zeroes()).bitcast_i64();
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+
+    let ratio = min_xy / max_xy;
+    let fast = max_xy * (V::set1(1.0) + ratio * ratio).sqrt();
+
+    patch_binary_exceptional_lanes(x, y, fast, exceptional_mask, scalar::hypot_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn fmod<V>(x: V, y: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::binary_f64(x, y, scalar::fmod_f64)
+    let quotient = x / y;
+    let trunc = quotient
+        .cmp_lt(V::zeroes())
+        .blendv(quotient.floor(), quotient.ceil());
+    let fast = x - trunc * y;
+    let zero_mask = fast.cmp_eq(V::zeroes()).bitcast_i64();
+    let signed_zero = (x.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK)).bitcast_f64();
+    let fast = zero_mask.bitcast_f64().blendv(fast, signed_zero);
+
+    let finite_inputs = finite_mask(x) & finite_mask(y);
+    let y_nonzero = y.cmp_neq(V::zeroes()).bitcast_i64();
+    let x_not_inf = x.abs().cmp_neq(V::set1(f64::INFINITY)).bitcast_i64();
+    let small_quotient = quotient
+        .abs()
+        .cmp_lt(V::set1(FAST_FMOD_MAX_QUOTIENT))
+        .bitcast_i64();
+    let fast_mask = finite_inputs & y_nonzero & x_not_inf & small_quotient;
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+
+    patch_binary_exceptional_lanes(x, y, fast, exceptional_mask, scalar::fmod_f64)
 }
diff --git a/src/math/families/binary_misc/mod.rs b/src/math/families/binary_misc/mod.rs
index e50ea1e..7ce0e5a 100644
--- a/src/math/families/binary_misc/mod.rs
+++ b/src/math/families/binary_misc/mod.rs
@@ -39,23 +39,35 @@ impl<T: SimdFloat32> SimdMathF32BinaryMisc for T {}
 
 pub trait SimdMathF64BinaryMisc: SimdFloat64 {
     #[inline(always)]
-    fn log10_u35(self) -> Self {
+    fn log10_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::log10_u35(self)
     }
 
     #[inline(always)]
-    fn atan2_u35(self, x: Self) -> Self {
+    fn atan2_u35(self, x: Self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::atan2_u35(self, x)
     }
 
     #[inline(always)]
-    fn hypot_u35(self, y: Self) -> Self {
+    fn hypot_u35(self, y: Self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::hypot_u35(self, y)
     }
 
     /// Floating-point remainder with C/libm `fmod` semantics (sign follows dividend).
     #[inline(always)]
-    fn fmod(self, y: Self) -> Self {
+    fn fmod(self, y: Self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::fmod(self, y)
     }
 }
diff --git a/src/tests/simd_math_targeted_edges/binary_misc.rs b/src/tests/simd_math_targeted_edges/binary_misc.rs
index 73865fa..5d895ce 100644
--- a/src/tests/simd_math_targeted_edges/binary_misc.rs
+++ b/src/tests/simd_math_targeted_edges/binary_misc.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::math::SimdMathF32BinaryMisc;
+use crate::math::{SimdMathF32BinaryMisc, SimdMathF64BinaryMisc};
 
 fn run_f32_atan2_signed_zero_and_quadrants<S: Simd>() {
     let ys = [
@@ -137,3 +137,142 @@ simd_math_targeted_all_backends!(
     f32_log10_domain_and_mixed_lanes,
     run_f32_log10_domain_and_mixed_lanes
 );
+
+fn run_f64_atan2_signed_zero_and_quadrants<S: Simd>() {
+    let ys = [
+        0.0f64,
+        -0.0,
+        1.0,
+        -1.0,
+        2.0,
+        -2.0,
+        1.0e-300,
+        -1.0e-300,
+        f64::INFINITY,
+        f64::NEG_INFINITY,
+        f64::NAN,
+    ];
+    let xs = [
+        0.0f64,
+        -0.0,
+        1.0,
+        -1.0,
+        2.0,
+        -2.0,
+        1.0e-300,
+        -1.0e-300,
+        f64::INFINITY,
+        f64::NEG_INFINITY,
+        f64::NAN,
+    ];
+    let mut y_inputs = Vec::new();
+    let mut x_inputs = Vec::new();
+    for &y in &ys {
+        for &x in &xs {
+            y_inputs.push(y);
+            x_inputs.push(x);
+        }
+    }
+
+    for (ys_chunk, xs_chunk) in y_inputs
+        .chunks(S::Vf64::WIDTH)
+        .zip(x_inputs.chunks(S::Vf64::WIDTH))
+    {
+        let yv = S::Vf64::load_from_slice(ys_chunk);
+        let xv = S::Vf64::load_from_slice(xs_chunk);
+        let out = yv.atan2_u35(xv);
+        for lane in 0..ys_chunk.len() {
+            let y = ys_chunk[lane];
+            let x = xs_chunk[lane];
+            assert_f64_contract(
+                "atan2_u35",
+                y,
+                out[lane],
+                y.atan2(x),
+                contracts::ATAN2_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("atan2_u35({y:?}, {x:?}): {e}"));
+        }
+    }
+}
+
+fn run_f64_hypot_and_fmod_edges<S: Simd>() {
+    let xs = [
+        1.0e308f64,
+        1.0e-308,
+        3.0,
+        -3.0,
+        0.0,
+        -0.0,
+        f64::INFINITY,
+        f64::NAN,
+        f64::MAX,
+        -f64::MAX,
+        9.0e15,
+        -9.0e15,
+    ];
+    let ys = [
+        1.0e308f64, 1.0e-308, 2.0, -2.0, -0.0, 0.0, 2.0, 1.5, 1.0, -1.0, 3.0, -3.0,
+    ];
+    for (x_chunk, y_chunk) in xs.chunks(S::Vf64::WIDTH).zip(ys.chunks(S::Vf64::WIDTH)) {
+        let xv = S::Vf64::load_from_slice(x_chunk);
+        let yv = S::Vf64::load_from_slice(y_chunk);
+        let h = xv.hypot_u35(yv);
+        let r = xv.fmod(yv);
+        for lane in 0..x_chunk.len() {
+            let x = x_chunk[lane];
+            let y = y_chunk[lane];
+            assert_f64_contract(
+                "hypot_u35",
+                x,
+                h[lane],
+                x.hypot(y),
+                contracts::HYPOT_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("hypot_u35({x:?},{y:?}) {e}"));
+            assert_f64_contract("fmod", x, r[lane], x % y, 0)
+                .unwrap_or_else(|e| panic!("fmod({x:?},{y:?}) {e}"));
+        }
+    }
+}
+
+fn run_f64_log10_domain_and_mixed_lanes<S: Simd>() {
+    let xs = [
+        10.0f64,
+        1.0,
+        0.1,
+        f64::MIN_POSITIVE,
+        0.0,
+        -0.0,
+        -1.0,
+        -10.0,
+        f64::INFINITY,
+        f64::NAN,
+    ];
+
+    for chunk in xs.chunks(S::Vf64::WIDTH) {
+        let xv = S::Vf64::load_from_slice(chunk);
+        let out = xv.log10_u35();
+        for lane in 0..chunk.len() {
+            let x = chunk[lane];
+            assert_f64_contract(
+                "log10_u35",
+                x,
+                out[lane],
+                x.log10(),
+                contracts::LOG10_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("log10_u35({x:?}) {e}"));
+        }
+    }
+}
+
+simd_math_targeted_all_backends!(
+    f64_atan2_signed_zero_and_quadrants,
+    run_f64_atan2_signed_zero_and_quadrants
+);
+simd_math_targeted_all_backends!(f64_hypot_and_fmod_edges, run_f64_hypot_and_fmod_edges);
+simd_math_targeted_all_backends!(
+    f64_log10_domain_and_mixed_lanes,
+    run_f64_log10_domain_and_mixed_lanes
+);
diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs
index babe6b3..4c31173 100644
--- a/src/tests/simd_math_targeted_edges/mod.rs
+++ b/src/tests/simd_math_targeted_edges/mod.rs
@@ -68,6 +68,58 @@ fn assert_f32_contract(
     Ok(())
 }
 
+fn assert_f64_contract(
+    fn_name: &str,
+    input: f64,
+    actual: f64,
+    expected: f64,
+    max_ulp: u64,
+) -> Result<(), String> {
+    if expected.is_nan() {
+        if actual.is_nan() {
+            return Ok(());
+        }
+        return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}"));
+    }
+
+    if expected.is_infinite() {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected {:?}, got {:?}",
+            expected, actual
+        ));
+    }
+
+    if expected == 0.0 {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}",
+            expected.to_bits(),
+            actual.to_bits()
+        ));
+    }
+
+    if actual.is_nan() || actual.is_infinite() {
+        return Err(format!(
+            "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}"
+        ));
+    }
+
+    let ulp = ulp_distance_f64(actual, expected)
+        .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?;
+    if ulp > max_ulp {
+        return Err(format!(
+            "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})"
+        ));
+    }
+
+    Ok(())
+}
+
 fn check_targeted_unary_f32<S: Simd>(
     fn_name: &str,
     inputs: &[f32],

From 4888be0a04056e257204b77fe883c8669a445295 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Sun, 22 Mar 2026 23:44:09 +1100
Subject: [PATCH 3/9] Optimize f64 inverse-hyperbolic family lane kernels

---
 .../inverse_hyperbolic.rs                     |  65 ++++++++-
 .../simd_math_remaining_baseline/shared.rs    |  40 +++++
 src/math/f64/inverse_hyperbolic.rs            |  93 +++++++++++-
 .../inverse_hyperbolic.rs                     | 137 +++++++++++++++++-
 src/tests/simd_math_targeted_edges/mod.rs     |  75 +++++++++-
 5 files changed, 402 insertions(+), 8 deletions(-)

diff --git a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs
index c8d3734..2ea9d50 100644
--- a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs
+++ b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs
@@ -1,5 +1,5 @@
 use criterion::Criterion;
-use simdeez::math::SimdMathF32InverseHyperbolic;
+use simdeez::math::{SimdMathF32InverseHyperbolic, SimdMathF64InverseHyperbolic};
 use simdeez::{prelude::*, simd_unsafe_generate_all};
 
 use crate::shared::{self, INPUT_LEN};
@@ -19,6 +19,21 @@ fn scalar_atanh_sum(input: &[f32]) -> f32 {
     input.iter().copied().map(f32::atanh).sum()
 }
 
+#[inline(never)]
+fn scalar_asinh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::asinh).sum()
+}
+
+#[inline(never)]
+fn scalar_acosh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::acosh).sum()
+}
+
+#[inline(never)]
+fn scalar_atanh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::atanh).sum()
+}
+
 simd_unsafe_generate_all!(
     fn simdeez_asinh_sum(input: &[f32]) -> f32 {
         shared::simdeez_unary_sum_impl::<S>(input, |v| v.asinh_u35())
@@ -37,11 +52,35 @@ simd_unsafe_generate_all!(
     }
 );
 
+simd_unsafe_generate_all!(
+    fn simdeez_asinh_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_unary_sum_impl_f64::<S>(input, |v| v.asinh_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_acosh_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_unary_sum_impl_f64::<S>(input, |v| v.acosh_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_atanh_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_unary_sum_impl_f64::<S>(input, |v| v.atanh_u35())
+    }
+);
+
 pub fn register(c: &mut Criterion) {
     let asinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB001, -16_384.0..16_384.0);
     let acosh_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB002, 1.0, 16_384.0);
     let atanh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB003, -0.999_999..0.999_999);
 
+    let asinh_inputs_f64 =
+        shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB101, -16_384.0..16_384.0);
+    let acosh_inputs_f64 = shared::make_positive_inputs_f64(INPUT_LEN, 0xDEADB102, 1.0, 16_384.0);
+    let atanh_inputs_f64 =
+        shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB103, -0.999_999_999_999..0.999_999_999_999);
+
     shared::bench_unary(
         c,
         "simd_math_baseline/f32/asinh_u35",
@@ -65,4 +104,28 @@ pub fn register(c: &mut Criterion) {
         scalar_atanh_sum,
         simdeez_atanh_sum,
     );
+
+    shared::bench_unary_f64(
+        c,
+        "simd_math_baseline/f64/asinh_u35",
+        &asinh_inputs_f64,
+        scalar_asinh_sum_f64,
+        simdeez_asinh_sum_f64,
+    );
+
+    shared::bench_unary_f64(
+        c,
+        "simd_math_baseline/f64/acosh_u35",
+        &acosh_inputs_f64,
+        scalar_acosh_sum_f64,
+        simdeez_acosh_sum_f64,
+    );
+
+    shared::bench_unary_f64(
+        c,
+        "simd_math_baseline/f64/atanh_u35",
+        &atanh_inputs_f64,
+        scalar_atanh_sum_f64,
+        simdeez_atanh_sum_f64,
+    );
 }
diff --git a/benches/simd_math_remaining_baseline/shared.rs b/benches/simd_math_remaining_baseline/shared.rs
index de3e87e..14ef80a 100644
--- a/benches/simd_math_remaining_baseline/shared.rs
+++ b/benches/simd_math_remaining_baseline/shared.rs
@@ -16,6 +16,16 @@ pub fn make_positive_inputs(len: usize, seed: u64, min: f32, max: f32) -> Vec<f3
     (0..len).map(|_| rng.gen_range(min..max)).collect()
 }
 
+pub fn make_unary_inputs_f64(len: usize, seed: u64, range: core::ops::Range<f64>) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(range.clone())).collect()
+}
+
+pub fn make_positive_inputs_f64(len: usize, seed: u64, min: f64, max: f64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(min..max)).collect()
+}
+
 pub fn make_binary_inputs(
     len: usize,
     seed: u64,
@@ -40,6 +50,18 @@ pub fn simdeez_unary_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S:
     sum
 }
 
+#[inline(always)]
+pub fn simdeez_unary_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+    while i + S::Vf64::WIDTH <= input.len() {
+        let v = S::Vf64::load_from_slice(&input[i..]);
+        sum += op(v).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+    sum
+}
+
 #[inline(always)]
 pub fn simdeez_binary_sum_impl<S: Simd>(
     a: &[f32],
@@ -75,6 +97,24 @@ pub fn bench_unary(
     group.finish();
 }
 
+pub fn bench_unary_f64(
+    c: &mut Criterion,
+    name: &str,
+    input: &[f64],
+    scalar: fn(&[f64]) -> f64,
+    simd: fn(&[f64]) -> f64,
+) {
+    let mut group = c.benchmark_group(name);
+    group.throughput(Throughput::Elements(input.len() as u64));
+    group.bench_function("scalar-native", |b| {
+        b.iter(|| black_box(scalar(black_box(input))))
+    });
+    group.bench_function("simdeez-runtime", |b| {
+        b.iter(|| black_box(simd(black_box(input))))
+    });
+    group.finish();
+}
+
 pub fn bench_binary(
     c: &mut Criterion,
     name: &str,
diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs
index 503988c..1ddde48 100644
--- a/src/math/f64/inverse_hyperbolic.rs
+++ b/src/math/f64/inverse_hyperbolic.rs
@@ -1,12 +1,73 @@
-use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::math::{f64, scalar};
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
+
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
 
 #[inline(always)]
 pub(crate) fn asinh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::asinh_u35_f64)
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let abs_x = input.abs();
+    let tiny_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64();
+    let large_mask = abs_x.cmp_gt(V::set1(1.0e150)).bitcast_i64();
+    let zero_mask = input.cmp_eq(V::zeroes()).bitcast_i64();
+    let exceptional_mask =
+        finite_mask.cmp_eq(SimdI64::<V>::zeroes()) | tiny_mask | large_mask | zero_mask;
+
+    let radicand = (abs_x * abs_x) + V::set1(1.0);
+    let magnitude = f64::ln_u35(abs_x + radicand.sqrt());
+    let negative_mask = input.cmp_lt(V::zeroes());
+    let fast = negative_mask.blendv(magnitude, -magnitude);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asinh_u35_f64)
 }
 
 #[inline(always)]
@@ -14,7 +75,15 @@ pub(crate) fn acosh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::acosh_u35_f64)
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64();
+    let fast_mask = finite_mask & in_domain_mask;
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+
+    let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt());
+    let fast = f64::ln_u35(input + root_term);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64)
 }
 
 #[inline(always)]
@@ -22,5 +91,19 @@ pub(crate) fn atanh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::atanh_u35_f64)
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let abs_x = input.abs();
+    let strict_domain_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64();
+    let non_zero_mask = input.cmp_neq(V::zeroes()).bitcast_i64();
+    let stable_range_mask = abs_x.cmp_lte(V::set1(0.99)).bitcast_i64();
+    let away_from_zero_mask = abs_x.cmp_gte(V::set1(0.9)).bitcast_i64();
+    let fast_mask =
+        finite_mask & strict_domain_mask & non_zero_mask & stable_range_mask & away_from_zero_mask;
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+
+    let one = V::set1(1.0);
+    let ratio = (one + input) / (one - input);
+    let fast = f64::ln_u35(ratio) * V::set1(0.5);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atanh_u35_f64)
 }
diff --git a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs
index b5390c4..c07c793 100644
--- a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs
+++ b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::math::SimdMathF32InverseHyperbolic;
+use crate::math::{SimdMathF32InverseHyperbolic, SimdMathF64InverseHyperbolic};
 
 fn run_f32_inverse_hyperbolic_domain_edges<S: Simd>() {
     let acosh_inputs = [
@@ -135,3 +135,138 @@ simd_math_targeted_all_backends!(
     f32_inverse_hyperbolic_mixed_lanes,
     run_f32_inverse_hyperbolic_mixed_lanes
 );
+
+fn run_f64_inverse_hyperbolic_domain_edges<S: Simd>() {
+    let acosh_inputs = [
+        0.0f64,
+        0.5,
+        1.0 - f64::EPSILON,
+        1.0,
+        1.0 + f64::EPSILON,
+        1.000_000_000_001,
+        2.0,
+        10.0,
+        f64::INFINITY,
+        f64::NAN,
+    ];
+    check_targeted_unary_f64::<S>(
+        "acosh_u35",
+        &acosh_inputs,
+        contracts::ACOSH_U35_F64_MAX_ULP,
+        |v| v.acosh_u35(),
+        f64::acosh,
+    );
+
+    let atanh_inputs = [
+        -1.0f64,
+        -1.0 + f64::EPSILON,
+        -0.999_999_999_999,
+        -0.5,
+        -0.0,
+        0.0,
+        0.5,
+        0.999_999_999_999,
+        1.0 - f64::EPSILON,
+        1.0,
+        1.0 + f64::EPSILON,
+        f64::NAN,
+    ];
+    check_targeted_unary_f64::<S>(
+        "atanh_u35",
+        &atanh_inputs,
+        contracts::ATANH_U35_F64_MAX_ULP,
+        |v| v.atanh_u35(),
+        f64::atanh,
+    );
+
+    let asinh_inputs = [
+        f64::NEG_INFINITY,
+        -1.0e300,
+        -1000.0,
+        -1.0,
+        -f64::MIN_POSITIVE,
+        -0.0,
+        0.0,
+        f64::MIN_POSITIVE,
+        1.0,
+        1000.0,
+        1.0e300,
+        f64::INFINITY,
+        f64::NAN,
+    ];
+    check_targeted_unary_f64::<S>(
+        "asinh_u35",
+        &asinh_inputs,
+        contracts::ASINH_U35_F64_MAX_ULP,
+        |v| v.asinh_u35(),
+        f64::asinh,
+    );
+}
+
+fn run_f64_inverse_hyperbolic_mixed_lanes<S: Simd>() {
+    let mut lanes = vec![0.0f64; S::Vf64::WIDTH];
+    let pattern = [
+        -0.75f64,
+        -0.0,
+        0.0,
+        0.75,
+        0.999_999_999_999,
+        -0.999_999_999_999,
+        1.0,
+        -1.0,
+        1.0 + f64::EPSILON,
+        f64::NAN,
+        f64::INFINITY,
+        f64::NEG_INFINITY,
+        1.0,
+        2.0,
+        10.0,
+        0.5,
+    ];
+    for (i, lane) in lanes.iter_mut().enumerate() {
+        *lane = pattern[i % pattern.len()];
+    }
+
+    let input = S::Vf64::load_from_slice(&lanes);
+    let asinh = input.asinh_u35();
+    let acosh = input.acosh_u35();
+    let atanh = input.atanh_u35();
+
+    for (lane, &x) in lanes.iter().enumerate() {
+        assert_f64_contract(
+            "asinh_u35",
+            x,
+            asinh[lane],
+            x.asinh(),
+            contracts::ASINH_U35_F64_MAX_ULP,
+        )
+        .unwrap_or_else(|e| panic!("lane {lane}: {e}"));
+
+        assert_f64_contract(
+            "acosh_u35",
+            x,
+            acosh[lane],
+            x.acosh(),
+            contracts::ACOSH_U35_F64_MAX_ULP,
+        )
+        .unwrap_or_else(|e| panic!("lane {lane}: {e}"));
+
+        assert_f64_contract(
+            "atanh_u35",
+            x,
+            atanh[lane],
+            x.atanh(),
+            contracts::ATANH_U35_F64_MAX_ULP,
+        )
+        .unwrap_or_else(|e| panic!("lane {lane}: {e}"));
+    }
+}
+
+simd_math_targeted_all_backends!(
+    f64_inverse_hyperbolic_domain_edges,
+    run_f64_inverse_hyperbolic_domain_edges
+);
+simd_math_targeted_all_backends!(
+    f64_inverse_hyperbolic_mixed_lanes,
+    run_f64_inverse_hyperbolic_mixed_lanes
+);
diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs
index babe6b3..fe2f727 100644
--- a/src/tests/simd_math_targeted_edges/mod.rs
+++ b/src/tests/simd_math_targeted_edges/mod.rs
@@ -13,7 +13,7 @@ use crate::engines::wasm32::Wasm;
 use crate::engines::{avx2::Avx2, sse2::Sse2, sse41::Sse41};
 
 use crate::math::SimdMathF32Core;
-use crate::math::{contracts, SimdMathF32};
+use crate::math::{contracts, SimdMathF32, SimdMathF64};
 use crate::{Simd, SimdBaseIo, SimdConsts};
 
 fn assert_f32_contract(
@@ -89,6 +89,79 @@ fn check_targeted_unary_f32<S: Simd>(
     }
 }
 
+fn assert_f64_contract(
+    fn_name: &str,
+    input: f64,
+    actual: f64,
+    expected: f64,
+    max_ulp: u64,
+) -> Result<(), String> {
+    if expected.is_nan() {
+        if actual.is_nan() {
+            return Ok(());
+        }
+        return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}"));
+    }
+
+    if expected.is_infinite() {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected {:?}, got {:?}",
+            expected, actual
+        ));
+    }
+
+    if expected == 0.0 {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}",
+            expected.to_bits(),
+            actual.to_bits()
+        ));
+    }
+
+    if actual.is_nan() || actual.is_infinite() {
+        return Err(format!(
+            "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}"
+        ));
+    }
+
+    let ulp = ulp_distance_f64(actual, expected)
+        .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?;
+    if ulp > max_ulp {
+        return Err(format!(
+            "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})"
+        ));
+    }
+
+    Ok(())
+}
+
+fn check_targeted_unary_f64<S: Simd>(
+    fn_name: &str,
+    inputs: &[f64],
+    max_ulp: u64,
+    simd_fn: impl Fn(S::Vf64) -> S::Vf64,
+    scalar_ref: impl Fn(f64) -> f64,
+) {
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let input = S::Vf64::load_from_slice(chunk);
+        let output = simd_fn(input);
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            let actual = output[lane];
+            let expected = scalar_ref(x);
+            if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) {
+                panic!("{err}");
+            }
+        }
+    }
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) {
     assert_eq!(input.len(), output.len());

From 915666a5e45a50799b29e238d0497cf48931da5b Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Sun, 22 Mar 2026 23:45:22 +1100
Subject: [PATCH 4/9] Optimize f64 inverse trig family with portable u35
 kernels

---
 benches/simd_math/inverse_trig.rs             | 127 +++++++++-
 benches/simd_math/shared.rs                   | 111 +++++++-
 src/math/contracts.rs                         |   8 +-
 src/math/f64/inverse_trig.rs                  |  13 +-
 src/math/families/inverse_trig/mod.rs         |  16 +-
 .../families/inverse_trig/portable_f64.rs     | 237 ++++++++++++++++++
 src/math/families/mod.rs                      |   2 +-
 .../simd_math_targeted_edges/inverse_trig.rs  | 148 ++++++++++-
 src/tests/simd_math_targeted_edges/mod.rs     |  52 ++++
 9 files changed, 686 insertions(+), 28 deletions(-)
 create mode 100644 src/math/families/inverse_trig/portable_f64.rs

diff --git a/benches/simd_math/inverse_trig.rs b/benches/simd_math/inverse_trig.rs
index 6fddcfd..e4d32f7 100644
--- a/benches/simd_math/inverse_trig.rs
+++ b/benches/simd_math/inverse_trig.rs
@@ -1,10 +1,9 @@
 use criterion::Criterion;
-use simdeez::math::SimdMathF32InverseTrig;
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+use simdeez::math::{SimdMathF32InverseTrig, SimdMathF64InverseTrig};
 use simdeez::scalar::Scalar;
 use simdeez::{prelude::*, simd_unsafe_generate_all};
 
-use crate::shared::{self, BenchTargets, INPUT_LEN};
+use crate::shared::{self, BenchTargets, BenchTargetsF64, INPUT_LEN};
 
 #[inline(never)]
 fn scalar_asin_sum(input: &[f32]) -> f32 {
@@ -21,6 +20,21 @@ fn scalar_atan_sum(input: &[f32]) -> f32 {
     input.iter().copied().map(f32::atan).sum()
 }
 
+#[inline(never)]
+fn scalar_asin_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::asin).sum()
+}
+
+#[inline(never)]
+fn scalar_acos_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::acos).sum()
+}
+
+#[inline(never)]
+fn scalar_atan_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::atan).sum()
+}
+
 simd_unsafe_generate_all!(
     fn simdeez_asin_sum(input: &[f32]) -> f32 {
         shared::simdeez_sum_impl::<S>(input, |v| v.asin_u35())
@@ -39,27 +53,59 @@ simd_unsafe_generate_all!(
     }
 );
 
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+simd_unsafe_generate_all!(
+    fn simdeez_asin_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.asin_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_acos_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.acos_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_atan_sum_f64(input: &[f64]) -> f64 {
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.atan_u35())
+    }
+);
+
 #[inline(never)]
-fn simdeez_asin_sum_scalar(input: &[f32]) -> f32 {
+fn forced_scalar_asin_sum(input: &[f32]) -> f32 {
     shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.asin_u35())
 }
 
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 #[inline(never)]
-fn simdeez_acos_sum_scalar(input: &[f32]) -> f32 {
+fn forced_scalar_acos_sum(input: &[f32]) -> f32 {
     shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.acos_u35())
 }
 
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 #[inline(never)]
-fn simdeez_atan_sum_scalar(input: &[f32]) -> f32 {
+fn forced_scalar_atan_sum(input: &[f32]) -> f32 {
     shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.atan_u35())
 }
 
+#[inline(never)]
+fn forced_scalar_asin_sum_f64(input: &[f64]) -> f64 {
+    shared::force_scalar_sum_f64(input, |v: <Scalar as Simd>::Vf64| v.asin_u35())
+}
+
+#[inline(never)]
+fn forced_scalar_acos_sum_f64(input: &[f64]) -> f64 {
+    shared::force_scalar_sum_f64(input, |v: <Scalar as Simd>::Vf64| v.acos_u35())
+}
+
+#[inline(never)]
+fn forced_scalar_atan_sum_f64(input: &[f64]) -> f64 {
+    shared::force_scalar_sum_f64(input, |v: <Scalar as Simd>::Vf64| v.atan_u35())
+}
+
 pub fn register(c: &mut Criterion) {
     let inverse_inputs = shared::make_inverse_trig_inputs(INPUT_LEN, 0xA11C_E101);
     let atan_inputs = shared::make_atan_inputs(INPUT_LEN, 0xA11C_E102);
+    let inverse_inputs_f64 = shared::make_inverse_trig_inputs_f64(INPUT_LEN, 0xA11C_E201);
+    let atan_inputs_f64 = shared::make_atan_inputs_f64(INPUT_LEN, 0xA11C_E202);
 
     shared::bench_variants(
         c,
@@ -68,7 +114,7 @@ pub fn register(c: &mut Criterion) {
         BenchTargets {
             scalar_native: scalar_asin_sum,
             simdeez_runtime: simdeez_asin_sum,
-            simdeez_scalar: simdeez_asin_sum_scalar,
+            simdeez_scalar: forced_scalar_asin_sum,
             #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
             simdeez_sse2: simdeez_asin_sum_sse2,
             #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -87,7 +133,7 @@ pub fn register(c: &mut Criterion) {
         BenchTargets {
             scalar_native: scalar_acos_sum,
             simdeez_runtime: simdeez_acos_sum,
-            simdeez_scalar: simdeez_acos_sum_scalar,
+            simdeez_scalar: forced_scalar_acos_sum,
             #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
             simdeez_sse2: simdeez_acos_sum_sse2,
             #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -106,7 +152,7 @@ pub fn register(c: &mut Criterion) {
         BenchTargets {
             scalar_native: scalar_atan_sum,
             simdeez_runtime: simdeez_atan_sum,
-            simdeez_scalar: simdeez_atan_sum_scalar,
+            simdeez_scalar: forced_scalar_atan_sum,
             #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
             simdeez_sse2: simdeez_atan_sum_sse2,
             #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -117,4 +163,61 @@ pub fn register(c: &mut Criterion) {
             simdeez_avx512: simdeez_atan_sum_avx512,
         },
     );
+
+    shared::bench_variants_f64(
+        c,
+        "simd_math/f64/asin_u35",
+        &inverse_inputs_f64,
+        BenchTargetsF64 {
+            scalar_native: scalar_asin_sum_f64,
+            simdeez_runtime: simdeez_asin_sum_f64,
+            simdeez_scalar: forced_scalar_asin_sum_f64,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse2: simdeez_asin_sum_f64_sse2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse41: simdeez_asin_sum_f64_sse41,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx2: simdeez_asin_sum_f64_avx2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx512: simdeez_asin_sum_f64_avx512,
+        },
+    );
+
+    shared::bench_variants_f64(
+        c,
+        "simd_math/f64/acos_u35",
+        &inverse_inputs_f64,
+        BenchTargetsF64 {
+            scalar_native: scalar_acos_sum_f64,
+            simdeez_runtime: simdeez_acos_sum_f64,
+            simdeez_scalar: forced_scalar_acos_sum_f64,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse2: simdeez_acos_sum_f64_sse2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse41: simdeez_acos_sum_f64_sse41,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx2: simdeez_acos_sum_f64_avx2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx512: simdeez_acos_sum_f64_avx512,
+        },
+    );
+
+    shared::bench_variants_f64(
+        c,
+        "simd_math/f64/atan_u35",
+        &atan_inputs_f64,
+        BenchTargetsF64 {
+            scalar_native: scalar_atan_sum_f64,
+            simdeez_runtime: simdeez_atan_sum_f64,
+            simdeez_scalar: forced_scalar_atan_sum_f64,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse2: simdeez_atan_sum_f64_sse2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse41: simdeez_atan_sum_f64_sse41,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx2: simdeez_atan_sum_f64_avx2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx512: simdeez_atan_sum_f64_avx512,
+        },
+    );
 }
diff --git a/benches/simd_math/shared.rs b/benches/simd_math/shared.rs
index e98b0e5..b122c8f 100644
--- a/benches/simd_math/shared.rs
+++ b/benches/simd_math/shared.rs
@@ -2,7 +2,6 @@ use criterion::{Criterion, Throughput};
 use rand::{Rng, SeedableRng};
 use rand_chacha::ChaCha8Rng;
 use simdeez::prelude::*;
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 use simdeez::scalar::Scalar;
 use std::hint::black_box;
 
@@ -48,11 +47,21 @@ pub fn make_inverse_trig_inputs(len: usize, seed: u64) -> Vec<f32> {
     (0..len).map(|_| rng.gen_range(-1.0f32..1.0f32)).collect()
 }
 
+pub fn make_inverse_trig_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(-1.0f64..1.0f64)).collect()
+}
+
 pub fn make_atan_inputs(len: usize, seed: u64) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     (0..len).map(|_| rng.gen_range(-64.0f32..64.0f32)).collect()
 }
 
+pub fn make_atan_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(-64.0f64..64.0f64)).collect()
+}
+
 pub fn make_tan_inputs(len: usize, seed: u64) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     let half_pi = core::f32::consts::FRAC_PI_2;
@@ -87,6 +96,20 @@ pub struct BenchTargets {
     pub simdeez_avx512: unsafe fn(&[f32]) -> f32,
 }
 
+pub struct BenchTargetsF64 {
+    pub scalar_native: fn(&[f64]) -> f64,
+    pub simdeez_runtime: fn(&[f64]) -> f64,
+    pub simdeez_scalar: fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse2: unsafe fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse41: unsafe fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx2: unsafe fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx512: unsafe fn(&[f64]) -> f64,
+}
+
 pub fn bench_variants(c: &mut Criterion, group_name: &str, input: &[f32], targets: BenchTargets) {
     let mut group = c.benchmark_group(group_name);
     group.throughput(Throughput::Elements(input.len() as u64));
@@ -146,15 +169,83 @@ pub fn bench_variants(c: &mut Criterion, group_name: &str, input: &[f32], target
     group.finish();
 }
 
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+pub fn bench_variants_f64(
+    c: &mut Criterion,
+    group_name: &str,
+    input: &[f64],
+    targets: BenchTargetsF64,
+) {
+    let mut group = c.benchmark_group(group_name);
+    group.throughput(Throughput::Elements(input.len() as u64));
+
+    group.bench_function("scalar-native", |b| {
+        b.iter(|| black_box((targets.scalar_native)(black_box(input))))
+    });
+
+    group.bench_function("simdeez-runtime", |b| {
+        b.iter(|| black_box((targets.simdeez_runtime)(black_box(input))))
+    });
+
+    group.bench_function("simdeez-forced-scalar", |b| {
+        b.iter(|| black_box((targets.simdeez_scalar)(black_box(input))))
+    });
+
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    {
+        if std::is_x86_feature_detected!("sse2") {
+            group.bench_function("simdeez-forced-sse2", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_sse2)(black_box(input))) })
+            });
+        } else {
+            eprintln!("[bench] skipped simdeez-forced-sse2 for {group_name}: CPU lacks sse2");
+        }
+
+        if std::is_x86_feature_detected!("sse4.1") {
+            group.bench_function("simdeez-forced-sse41", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_sse41)(black_box(input))) })
+            });
+        } else {
+            eprintln!("[bench] skipped simdeez-forced-sse41 for {group_name}: CPU lacks sse4.1");
+        }
+
+        if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
+            group.bench_function("simdeez-forced-avx2", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_avx2)(black_box(input))) })
+            });
+        } else {
+            eprintln!("[bench] skipped simdeez-forced-avx2 for {group_name}: CPU lacks avx2/fma");
+        }
+
+        if std::is_x86_feature_detected!("avx512f")
+            && std::is_x86_feature_detected!("avx512bw")
+            && std::is_x86_feature_detected!("avx512dq")
+        {
+            group.bench_function("simdeez-forced-avx512", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_avx512)(black_box(input))) })
+            });
+        } else {
+            eprintln!(
+                "[bench] skipped simdeez-forced-avx512 for {group_name}: CPU lacks avx512f+bw+dq"
+            );
+        }
+    }
+
+    group.finish();
+}
+
 type ScalarVf32 = <Scalar as Simd>::Vf32;
+type ScalarVf64 = <Scalar as Simd>::Vf64;
 
-#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 #[inline(never)]
 pub fn force_scalar_sum(input: &[f32], op: impl Fn(ScalarVf32) -> ScalarVf32) -> f32 {
     simdeez_sum_impl::<Scalar>(input, op)
 }
 
+#[inline(never)]
+pub fn force_scalar_sum_f64(input: &[f64], op: impl Fn(ScalarVf64) -> ScalarVf64) -> f64 {
+    simdeez_sum_impl_f64::<Scalar>(input, op)
+}
+
 #[inline(always)]
 pub fn simdeez_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32) -> f32 {
     let mut sum = 0.0f32;
@@ -168,3 +259,17 @@ pub fn simdeez_sum_impl<S: Simd>(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32)
 
     sum
 }
+
+#[inline(always)]
+pub fn simdeez_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+
+    while i + S::Vf64::WIDTH <= input.len() {
+        let v = S::Vf64::load_from_slice(&input[i..]);
+        sum += op(v).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+
+    sum
+}
diff --git a/src/math/contracts.rs b/src/math/contracts.rs
index a32c9d3..311f6d2 100644
--- a/src/math/contracts.rs
+++ b/src/math/contracts.rs
@@ -32,9 +32,11 @@ pub const SIN_U35_F64_MAX_ULP: u64 = 1;
 pub const COS_U35_F64_MAX_ULP: u64 = 1;
 pub const TAN_U35_F64_MAX_ULP: u64 = 1;
 
-pub const ASIN_U35_F64_MAX_ULP: u64 = 1;
-pub const ACOS_U35_F64_MAX_ULP: u64 = 1;
-pub const ATAN_U35_F64_MAX_ULP: u64 = 1;
+// Portable f64 inverse-trig kernels now use the family-local SIMD implementation
+// rather than scalar lane mapping, so they carry the honest u35 contract.
+pub const ASIN_U35_F64_MAX_ULP: u64 = 35;
+pub const ACOS_U35_F64_MAX_ULP: u64 = 35;
+pub const ATAN_U35_F64_MAX_ULP: u64 = 35;
 pub const ATAN2_U35_F64_MAX_ULP: u64 = 1;
 pub const SINH_U35_F64_MAX_ULP: u64 = 1;
 pub const COSH_U35_F64_MAX_ULP: u64 = 1;
diff --git a/src/math/f64/inverse_trig.rs b/src/math/f64/inverse_trig.rs
index 288cd49..43a8962 100644
--- a/src/math/f64/inverse_trig.rs
+++ b/src/math/f64/inverse_trig.rs
@@ -1,26 +1,29 @@
-use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::math::families::inverse_trig::portable_f64;
+use crate::{Simd, SimdFloat64};
 
 #[inline(always)]
 pub(crate) fn asin_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::asin_u35_f64)
+    portable_f64::asin_u35(input)
 }
 
 #[inline(always)]
 pub(crate) fn acos_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::acos_u35_f64)
+    portable_f64::acos_u35(input)
 }
 
 #[inline(always)]
 pub(crate) fn atan_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::atan_u35_f64)
+    portable_f64::atan_u35(input)
 }
diff --git a/src/math/families/inverse_trig/mod.rs b/src/math/families/inverse_trig/mod.rs
index a5de564..e1a4d5b 100644
--- a/src/math/families/inverse_trig/mod.rs
+++ b/src/math/families/inverse_trig/mod.rs
@@ -1,4 +1,5 @@
 mod portable_f32;
+pub(crate) mod portable_f64;
 
 use crate::math::f64;
 use crate::{Simd, SimdFloat32, SimdFloat64};
@@ -33,17 +34,26 @@ impl<T: SimdFloat32> SimdMathF32InverseTrig for T {}
 
 pub trait SimdMathF64InverseTrig: SimdFloat64 {
     #[inline(always)]
-    fn asin_u35(self) -> Self {
+    fn asin_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::asin_u35(self)
     }
 
     #[inline(always)]
-    fn acos_u35(self) -> Self {
+    fn acos_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::acos_u35(self)
     }
 
     #[inline(always)]
-    fn atan_u35(self) -> Self {
+    fn atan_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::atan_u35(self)
     }
 }
diff --git a/src/math/families/inverse_trig/portable_f64.rs b/src/math/families/inverse_trig/portable_f64.rs
new file mode 100644
index 0000000..817b39b
--- /dev/null
+++ b/src/math/families/inverse_trig/portable_f64.rs
@@ -0,0 +1,237 @@
+use crate::math::scalar;
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};
+
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64;
+const F64_SIGN_MASK: i64 = i64::MIN;
+
+const ASIN_SQRT_FALLBACK_BOUND_BITS: u64 = 0x3FEF_F000_0000_0000;
+
+const FRAC_PI_2: f64 = core::f64::consts::FRAC_PI_2;
+const FRAC_PI_4: f64 = core::f64::consts::FRAC_PI_4;
+const PI: f64 = core::f64::consts::PI;
+
+const TAN_PI_8: f64 = 0.414_213_562_373_095_03;
+const TAN_3PI_8: f64 = 2.414_213_562_373_095;
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn restore_sign<V>(magnitude: V, sign_bits: SimdI64<V>) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    (magnitude.bitcast_i64() ^ sign_bits).bitcast_f64()
+}
+
+#[inline(always)]
+fn finite_mask<V>(input: V) -> SimdI64<V>
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let exponent_bits = input.bitcast_i64() & SimdI64::<V>::set1(F64_EXPONENT_MASK);
+    exponent_bits.cmp_neq(SimdI64::<V>::set1(F64_EXPONENT_MASK))
+}
+
+#[inline(always)]
+fn atan_poly<V>(z: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let p0 = V::set1(-8.750_608_600_031_904e-1);
+    let p1 = V::set1(-1.615_753_718_733_365_2e1);
+    let p2 = V::set1(-7.500_855_792_314_705e1);
+    let p3 = V::set1(-1.228_866_684_490_136_2e2);
+    let p4 = V::set1(-6.485_021_904_942_025e1);
+
+    let q0 = V::set1(2.485_846_490_142_306_4e1);
+    let q1 = V::set1(1.650_270_098_316_988_6e2);
+    let q2 = V::set1(4.328_810_604_912_903e2);
+    let q3 = V::set1(4.853_903_996_359_137e2);
+    let q4 = V::set1(1.945_506_571_482_614e2);
+
+    let numerator = (((((p0 * z) + p1) * z) + p2) * z + p3) * z + p4;
+    let denominator = (((((z + q0) * z) + q1) * z + q2) * z + q3) * z + q4;
+    numerator / denominator
+}
+
+#[inline(always)]
+fn atan_reduced<V>(reduced: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let z = reduced * reduced;
+    reduced + reduced * z * atan_poly(z)
+}
+
+#[inline(always)]
+fn atan_fast<V>(input: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let sign_bits = input.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK);
+    let abs_input = input.abs();
+
+    let large_mask = abs_input.cmp_gt(V::set1(TAN_3PI_8)).bitcast_i64();
+    let medium_base = abs_input.cmp_gt(V::set1(TAN_PI_8)).bitcast_i64();
+    let medium_mask = medium_base & large_mask.cmp_eq(SimdI64::<V>::zeroes());
+    let large_mask_f = large_mask.bitcast_f64();
+    let medium_mask_f = medium_mask.bitcast_f64();
+
+    let mut reduced = abs_input;
+    reduced = medium_mask_f.blendv(
+        reduced,
+        (abs_input - V::set1(1.0)) / (abs_input + V::set1(1.0)),
+    );
+    reduced = large_mask_f.blendv(reduced, -V::set1(1.0) / abs_input);
+
+    let mut offset = V::zeroes();
+    offset = medium_mask_f.blendv(offset, V::set1(FRAC_PI_4));
+    offset = large_mask_f.blendv(offset, V::set1(FRAC_PI_2));
+
+    let magnitude = offset + atan_reduced(reduced);
+    restore_sign(magnitude, sign_bits)
+}
+
+#[inline(always)]
+fn asin_exceptional_mask<V>(input: V) -> SimdI64<V>
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let finite = finite_mask(input);
+    let abs_input = input.abs();
+    let out_of_domain = abs_input.cmp_gt(V::set1(1.0)).bitcast_i64();
+    let near_edge = abs_input
+        .bitcast_i64()
+        .cmp_gt(SimdI64::<V>::set1(ASIN_SQRT_FALLBACK_BOUND_BITS as i64));
+    finite.cmp_eq(SimdI64::<V>::zeroes()) | out_of_domain | near_edge
+}
+
+#[inline(always)]
+fn acos_exceptional_mask<V>(input: V) -> SimdI64<V>
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    asin_exceptional_mask(input)
+}
+
+#[inline(always)]
+fn asin_acos_core<V>(input: V) -> (V, V)
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let one = V::set1(1.0);
+    let half = V::set1(0.5);
+    let abs_input = input.abs();
+    let sign_bits = input.bitcast_i64() & SimdI64::<V>::set1(F64_SIGN_MASK);
+
+    let small_mask = abs_input.cmp_lt(half).bitcast_i64();
+    let small_mask_f = small_mask.bitcast_f64();
+
+    let small_t = abs_input / (one - abs_input * abs_input).sqrt();
+    let small_asin_mag = atan_fast(small_t);
+
+    let large_t = ((one - abs_input) / (one + abs_input)).sqrt();
+    let large_angle = V::set1(2.0) * atan_fast(large_t);
+    let large_asin_mag = V::set1(FRAC_PI_2) - large_angle;
+
+    let asin_mag = small_mask_f.blendv(large_asin_mag, small_asin_mag);
+    let asin_out = restore_sign(asin_mag, sign_bits);
+
+    let small_acos = V::set1(FRAC_PI_2) - asin_out;
+    let large_acos_pos = large_angle;
+    let large_acos_neg = V::set1(PI) - large_angle;
+    let negative_mask = input.cmp_lt(V::zeroes()).bitcast_i64().bitcast_f64();
+    let large_acos = negative_mask.blendv(large_acos_pos, large_acos_neg);
+    let acos_out = small_mask_f.blendv(large_acos, small_acos);
+
+    (asin_out, acos_out)
+}
+
+#[inline(always)]
+pub(crate) fn asin_u35<V>(input: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let exceptional_mask = asin_exceptional_mask(input);
+    let (fast, _) = asin_acos_core(input);
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asin_u35_f64)
+}
+
+#[inline(always)]
+pub(crate) fn acos_u35<V>(input: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let exceptional_mask = acos_exceptional_mask(input);
+    let (_, fast) = asin_acos_core(input);
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acos_u35_f64)
+}
+
+#[inline(always)]
+pub(crate) fn atan_u35<V>(input: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let fast = atan_fast(input);
+    let exceptional_mask = finite_mask(input).cmp_eq(SimdI64::<V>::zeroes());
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atan_u35_f64)
+}
diff --git a/src/math/families/mod.rs b/src/math/families/mod.rs
index e6aaab9..f566447 100644
--- a/src/math/families/mod.rs
+++ b/src/math/families/mod.rs
@@ -2,7 +2,7 @@ mod binary_misc;
 mod core;
 mod hyperbolic;
 mod inverse_hyperbolic;
-mod inverse_trig;
+pub(crate) mod inverse_trig;
 
 use crate::{SimdFloat32, SimdFloat64};
 
diff --git a/src/tests/simd_math_targeted_edges/inverse_trig.rs b/src/tests/simd_math_targeted_edges/inverse_trig.rs
index ad25fb5..36e6a91 100644
--- a/src/tests/simd_math_targeted_edges/inverse_trig.rs
+++ b/src/tests/simd_math_targeted_edges/inverse_trig.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::math::SimdMathF32InverseTrig;
+use crate::math::{SimdMathF32InverseTrig, SimdMathF64InverseTrig};
 
 fn run_f32_inverse_trig_near_one<S: Simd>() {
     let inputs = [
@@ -141,3 +141,149 @@ simd_math_targeted_all_backends!(
 );
 simd_math_targeted_all_backends!(f32_inverse_trig_symmetry, run_f32_inverse_trig_symmetry);
 simd_math_targeted_all_backends!(f32_inverse_trig_identity, run_f32_inverse_trig_identity);
+
+fn run_f64_inverse_trig_near_one<S: Simd>() {
+    let inputs = [
+        f64::from_bits(0x3FEF_FFFF_FFFF_FFFE),
+        f64::from_bits(0x3FEF_FFFF_FFFF_FFFF),
+        1.0,
+        f64::from_bits(0x3FF0_0000_0000_0001),
+        -f64::from_bits(0x3FEF_FFFF_FFFF_FFFE),
+        -f64::from_bits(0x3FEF_FFFF_FFFF_FFFF),
+        -1.0,
+        -f64::from_bits(0x3FF0_0000_0000_0001),
+    ];
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let asin = v.asin_u35();
+        let acos = v.acos_u35();
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "asin_u35",
+                x,
+                asin[lane],
+                x.asin(),
+                contracts::ASIN_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "acos_u35",
+                x,
+                acos[lane],
+                x.acos(),
+                contracts::ACOS_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+fn run_f64_inverse_trig_special_lanes<S: Simd>() {
+    let inputs = [
+        f64::NAN,
+        f64::INFINITY,
+        f64::NEG_INFINITY,
+        0.0,
+        -0.0,
+        1.0,
+        -1.0,
+        f64::from_bits(0x3FF0_0000_0000_0001),
+        -f64::from_bits(0x3FF0_0000_0000_0001),
+        0.5,
+        -0.5,
+        4.0,
+        -4.0,
+        TAN_PI_8_F64,
+        f64::from_bits(TAN_PI_8_F64.to_bits() + 1),
+        TAN_3PI_8_F64,
+        f64::from_bits(TAN_3PI_8_F64.to_bits() + 1),
+        1.0e-300,
+        -1.0e-300,
+    ];
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let asin = v.asin_u35();
+        let acos = v.acos_u35();
+        let atan = v.atan_u35();
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "asin_u35",
+                x,
+                asin[lane],
+                x.asin(),
+                contracts::ASIN_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "acos_u35",
+                x,
+                acos[lane],
+                x.acos(),
+                contracts::ACOS_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "atan_u35",
+                x,
+                atan[lane],
+                x.atan(),
+                contracts::ATAN_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+fn run_f64_inverse_trig_symmetry<S: Simd>() {
+    let inputs = [-0.875, -0.75, -0.5, -0.125, 0.125, 0.5, 0.75, 0.875];
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let neg_v = -v;
+
+        let asin = v.asin_u35();
+        let asin_neg = neg_v.asin_u35();
+        let atan = v.atan_u35();
+        let atan_neg = neg_v.atan_u35();
+
+        for lane in 0..chunk.len() {
+            assert_f64_contract("asin symmetry", chunk[lane], asin_neg[lane], -asin[lane], 2)
+                .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract("atan symmetry", chunk[lane], atan_neg[lane], -atan[lane], 2)
+                .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+fn run_f64_inverse_trig_identity<S: Simd>() {
+    let inputs = [-1.0, -0.875, -0.5, -0.125, 0.0, 0.125, 0.5, 0.875, 1.0];
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let sum = v.asin_u35() + v.acos_u35();
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "asin+acos identity",
+                x,
+                sum[lane],
+                std::f64::consts::FRAC_PI_2,
+                12,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+const TAN_PI_8_F64: f64 = 0.414_213_562_373_095_03;
+const TAN_3PI_8_F64: f64 = 2.414_213_562_373_095;
+
+simd_math_targeted_all_backends!(f64_inverse_trig_near_one, run_f64_inverse_trig_near_one);
+simd_math_targeted_all_backends!(
+    f64_inverse_trig_special_lanes,
+    run_f64_inverse_trig_special_lanes
+);
+simd_math_targeted_all_backends!(f64_inverse_trig_symmetry, run_f64_inverse_trig_symmetry);
+simd_math_targeted_all_backends!(f64_inverse_trig_identity, run_f64_inverse_trig_identity);
diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs
index babe6b3..4c31173 100644
--- a/src/tests/simd_math_targeted_edges/mod.rs
+++ b/src/tests/simd_math_targeted_edges/mod.rs
@@ -68,6 +68,58 @@ fn assert_f32_contract(
     Ok(())
 }
 
+fn assert_f64_contract(
+    fn_name: &str,
+    input: f64,
+    actual: f64,
+    expected: f64,
+    max_ulp: u64,
+) -> Result<(), String> {
+    if expected.is_nan() {
+        if actual.is_nan() {
+            return Ok(());
+        }
+        return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}"));
+    }
+
+    if expected.is_infinite() {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected {:?}, got {:?}",
+            expected, actual
+        ));
+    }
+
+    if expected == 0.0 {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}",
+            expected.to_bits(),
+            actual.to_bits()
+        ));
+    }
+
+    if actual.is_nan() || actual.is_infinite() {
+        return Err(format!(
+            "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}"
+        ));
+    }
+
+    let ulp = ulp_distance_f64(actual, expected)
+        .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?;
+    if ulp > max_ulp {
+        return Err(format!(
+            "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})"
+        ));
+    }
+
+    Ok(())
+}
+
 fn check_targeted_unary_f32<S: Simd>(
     fn_name: &str,
     inputs: &[f32],

From 857fb98049fdc378850f2bf7cf43ce74158edb70 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Sun, 22 Mar 2026 23:43:36 +1100
Subject: [PATCH 5/9] math/f64: add core-family trig SIMD fast path + f64
 benches/tests

---
 benches/simd_math.rs                       |   3 +
 benches/simd_math/f64_core.rs              | 211 +++++++++++++++++++++
 src/math/contracts.rs                      |  10 +-
 src/math/f64/core.rs                       | 138 +++++++++++++-
 src/math/families/core.rs                  |  35 +++-
 src/math/mod.rs                            |   4 +-
 src/tests/simd_math_targeted_edges/core.rs | 128 ++++++++++++-
 src/tests/simd_math_targeted_edges/mod.rs  |  77 +++++++-
 8 files changed, 585 insertions(+), 21 deletions(-)
 create mode 100644 benches/simd_math/f64_core.rs

diff --git a/benches/simd_math.rs b/benches/simd_math.rs
index 113d41a..a8fc968 100644
--- a/benches/simd_math.rs
+++ b/benches/simd_math.rs
@@ -1,6 +1,8 @@
 use criterion::{criterion_group, criterion_main, Criterion};
 use std::time::Duration;
 
+#[path = "simd_math/f64_core.rs"]
+mod f64_core;
 #[path = "simd_math/hyperbolic.rs"]
 mod hyperbolic;
 #[path = "simd_math/inverse_trig.rs"]
@@ -14,6 +16,7 @@ mod trig;
 
 fn criterion_benchmark(c: &mut Criterion) {
     log_exp::register(c);
+    f64_core::register(c);
     hyperbolic::register(c);
     inverse_trig::register(c);
     trig::register(c);
diff --git a/benches/simd_math/f64_core.rs b/benches/simd_math/f64_core.rs
new file mode 100644
index 0000000..33d6716
--- /dev/null
+++ b/benches/simd_math/f64_core.rs
@@ -0,0 +1,211 @@
+use criterion::{Criterion, Throughput};
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use simdeez::math::SimdMathF64Core;
+use simdeez::{prelude::*, simd_unsafe_generate_all};
+use std::hint::black_box;
+
+const INPUT_LEN: usize = 1 << 20;
+
+fn make_positive_log_inputs(seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..INPUT_LEN)
+        .map(|_| {
+            let log2x = rng.gen_range(-40.0f64..40.0f64);
+            let mantissa = rng.gen_range(1.0f64..2.0f64);
+            mantissa * log2x.exp2()
+        })
+        .collect()
+}
+
+fn make_exp2_inputs(seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..INPUT_LEN)
+        .map(|_| rng.gen_range(-1000.0f64..1000.0f64))
+        .collect()
+}
+
+fn make_exp_inputs(seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..INPUT_LEN)
+        .map(|_| rng.gen_range(-700.0f64..700.0f64))
+        .collect()
+}
+
+fn make_trig_inputs(seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..INPUT_LEN)
+        .map(|_| rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI))
+        .collect()
+}
+
+fn make_tan_inputs(seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..INPUT_LEN)
+        .map(|_| {
+            let mut x =
+                rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI);
+            let k = (x / core::f64::consts::PI).round();
+            let nearest_pole = (k + 0.5) * core::f64::consts::PI;
+            if (x - nearest_pole).abs() < 1.0e-8 {
+                x += if x >= 0.0 { 2.5e-8 } else { -2.5e-8 };
+            }
+            x
+        })
+        .collect()
+}
+
+#[inline(always)]
+fn simdeez_sum_impl<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+
+    while i + S::Vf64::WIDTH <= input.len() {
+        let v = S::Vf64::load_from_slice(&input[i..]);
+        sum += op(v).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+
+    sum
+}
+
+#[inline(never)]
+fn scalar_log2_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::log2).sum()
+}
+#[inline(never)]
+fn scalar_exp2_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::exp2).sum()
+}
+#[inline(never)]
+fn scalar_ln_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::ln).sum()
+}
+#[inline(never)]
+fn scalar_exp_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::exp).sum()
+}
+#[inline(never)]
+fn scalar_sin_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::sin).sum()
+}
+#[inline(never)]
+fn scalar_cos_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::cos).sum()
+}
+#[inline(never)]
+fn scalar_tan_sum(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::tan).sum()
+}
+
+simd_unsafe_generate_all!(
+    fn simdeez_log2_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.log2_u35())
+    }
+);
+simd_unsafe_generate_all!(
+    fn simdeez_exp2_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.exp2_u35())
+    }
+);
+simd_unsafe_generate_all!(
+    fn simdeez_ln_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.ln_u35())
+    }
+);
+simd_unsafe_generate_all!(
+    fn simdeez_exp_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.exp_u35())
+    }
+);
+simd_unsafe_generate_all!(
+    fn simdeez_sin_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.sin_u35())
+    }
+);
+simd_unsafe_generate_all!(
+    fn simdeez_cos_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.cos_u35())
+    }
+);
+simd_unsafe_generate_all!(
+    fn simdeez_tan_sum(input: &[f64]) -> f64 {
+        simdeez_sum_impl::<S>(input, |v| v.tan_u35())
+    }
+);
+
+fn bench_pair(
+    c: &mut Criterion,
+    name: &str,
+    input: &[f64],
+    scalar: fn(&[f64]) -> f64,
+    simd: fn(&[f64]) -> f64,
+) {
+    let mut group = c.benchmark_group(name);
+    group.throughput(Throughput::Elements(input.len() as u64));
+    group.bench_function("scalar-native", |b| {
+        b.iter(|| black_box(scalar(black_box(input))))
+    });
+    group.bench_function("simdeez-runtime", |b| {
+        b.iter(|| black_box(simd(black_box(input))))
+    });
+    group.finish();
+}
+
+pub fn register(c: &mut Criterion) {
+    let log_inputs = make_positive_log_inputs(0xF640_1001);
+    let exp2_inputs = make_exp2_inputs(0xF640_1002);
+    let exp_inputs = make_exp_inputs(0xF640_1003);
+    let trig_inputs = make_trig_inputs(0xF640_1004);
+    let tan_inputs = make_tan_inputs(0xF640_1005);
+
+    bench_pair(
+        c,
+        "simd_math/f64/log2_u35",
+        &log_inputs,
+        scalar_log2_sum,
+        simdeez_log2_sum,
+    );
+    bench_pair(
+        c,
+        "simd_math/f64/exp2_u35",
+        &exp2_inputs,
+        scalar_exp2_sum,
+        simdeez_exp2_sum,
+    );
+    bench_pair(
+        c,
+        "simd_math/f64/ln_u35",
+        &log_inputs,
+        scalar_ln_sum,
+        simdeez_ln_sum,
+    );
+    bench_pair(
+        c,
+        "simd_math/f64/exp_u35",
+        &exp_inputs,
+        scalar_exp_sum,
+        simdeez_exp_sum,
+    );
+    bench_pair(
+        c,
+        "simd_math/f64/sin_u35",
+        &trig_inputs,
+        scalar_sin_sum,
+        simdeez_sin_sum,
+    );
+    bench_pair(
+        c,
+        "simd_math/f64/cos_u35",
+        &trig_inputs,
+        scalar_cos_sum,
+        simdeez_cos_sum,
+    );
+    bench_pair(
+        c,
+        "simd_math/f64/tan_u35",
+        &tan_inputs,
+        scalar_tan_sum,
+        simdeez_tan_sum,
+    );
+}
diff --git a/src/math/contracts.rs b/src/math/contracts.rs
index a32c9d3..87db129 100644
--- a/src/math/contracts.rs
+++ b/src/math/contracts.rs
@@ -26,11 +26,11 @@ pub const LOG10_U35_F32_MAX_ULP: u32 = 35;
 
 pub const LOG2_U35_F64_MAX_ULP: u64 = 35;
 pub const EXP2_U35_F64_MAX_ULP: u64 = 35;
-pub const LN_U35_F64_MAX_ULP: u64 = 1;
-pub const EXP_U35_F64_MAX_ULP: u64 = 1;
-pub const SIN_U35_F64_MAX_ULP: u64 = 1;
-pub const COS_U35_F64_MAX_ULP: u64 = 1;
-pub const TAN_U35_F64_MAX_ULP: u64 = 1;
+pub const LN_U35_F64_MAX_ULP: u64 = 35;
+pub const EXP_U35_F64_MAX_ULP: u64 = 35;
+pub const SIN_U35_F64_MAX_ULP: u64 = 35;
+pub const COS_U35_F64_MAX_ULP: u64 = 35;
+pub const TAN_U35_F64_MAX_ULP: u64 = 35;
 
 pub const ASIN_U35_F64_MAX_ULP: u64 = 1;
 pub const ACOS_U35_F64_MAX_ULP: u64 = 1;
diff --git a/src/math/f64/core.rs b/src/math/f64/core.rs
index 318fd58..b6a51bb 100644
--- a/src/math/f64/core.rs
+++ b/src/math/f64/core.rs
@@ -1,10 +1,61 @@
 use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};
+
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
 
 #[inline(always)]
 pub(crate) fn log2_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     map::unary_f64(input, scalar::log2_u35_f64)
 }
@@ -13,6 +64,7 @@ where
 pub(crate) fn exp2_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     map::unary_f64(input, scalar::exp2_u35_f64)
 }
@@ -21,6 +73,7 @@ where
 pub(crate) fn ln_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     map::unary_f64(input, scalar::ln_u35_f64)
 }
@@ -29,30 +82,107 @@ where
 pub(crate) fn exp_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     map::unary_f64(input, scalar::exp_u35_f64)
 }
 
+#[inline(always)]
+fn trig_exceptional_mask<V>(input: V) -> SimdI64<V>
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = input
+        .abs()
+        .cmp_lte(V::set1(core::f64::consts::FRAC_PI_4))
+        .bitcast_i64();
+    let non_zero = input.cmp_neq(V::zeroes()).bitcast_i64();
+    (finite_mask & within_fast_range & non_zero).cmp_eq(SimdI64::<V>::zeroes())
+}
+
+#[inline(always)]
+fn sin_cos_fast<V>(input: V) -> (V, V)
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let two_over_pi = V::set1(core::f64::consts::FRAC_2_PI);
+    let n = (input * two_over_pi).round().cast_i64();
+
+    let n_f = n.cast_f64();
+    let r = ((input - n_f * V::set1(core::f64::consts::FRAC_PI_2))
+        - n_f * V::set1(6.123_233_995_736_766e-17))
+        - n_f * V::set1(-2.022_266_248_795_951e-21);
+    let r2 = r * r;
+
+    let mut sin_poly = V::set1(1.589_690_995_211_55e-10);
+    sin_poly = (sin_poly * r2) + V::set1(-2.505_076_025_340_686_3e-8);
+    sin_poly = (sin_poly * r2) + V::set1(2.755_731_370_707_006_8e-6);
+    sin_poly = (sin_poly * r2) + V::set1(-1.984_126_982_985_795e-4);
+    sin_poly = (sin_poly * r2) + V::set1(8.333_333_333_322_49e-3);
+    sin_poly = (sin_poly * r2) + V::set1(-1.666_666_666_666_632_4e-1);
+    let sin_r = ((sin_poly * r2) * r) + r;
+
+    let mut cos_poly = V::set1(-1.135_964_755_778_819_5e-11);
+    cos_poly = (cos_poly * r2) + V::set1(2.087_572_321_298_175e-9);
+    cos_poly = (cos_poly * r2) + V::set1(-2.755_731_435_139_066_3e-7);
+    cos_poly = (cos_poly * r2) + V::set1(2.480_158_728_947_673e-5);
+    cos_poly = (cos_poly * r2) + V::set1(-1.388_888_888_887_305_6e-3);
+    cos_poly = (cos_poly * r2) + V::set1(4.166_666_666_666_659e-2);
+    let cos_r = (cos_poly * r2 * r2) - (V::set1(0.5) * r2) + V::set1(1.0);
+
+    let q = n & SimdI64::<V>::set1(3);
+    let q0 = q.cmp_eq(SimdI64::<V>::zeroes()).bitcast_f64();
+    let q1 = q.cmp_eq(SimdI64::<V>::set1(1)).bitcast_f64();
+    let q2 = q.cmp_eq(SimdI64::<V>::set1(2)).bitcast_f64();
+
+    let mut sin_out = q0.blendv(V::zeroes(), sin_r);
+    sin_out = q1.blendv(sin_out, cos_r);
+    sin_out = q2.blendv(sin_out, -sin_r);
+    sin_out = (q0 | q1 | q2).cmp_eq(V::zeroes()).blendv(sin_out, -cos_r);
+
+    let mut cos_out = q0.blendv(V::zeroes(), cos_r);
+    cos_out = q1.blendv(cos_out, -sin_r);
+    cos_out = q2.blendv(cos_out, -cos_r);
+    cos_out = (q0 | q1 | q2).cmp_eq(V::zeroes()).blendv(cos_out, sin_r);
+
+    (sin_out, cos_out)
+}
+
 #[inline(always)]
 pub(crate) fn sin_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::sin_u35_f64)
+    let exceptional_mask = trig_exceptional_mask(input);
+    let (sin_fast, _) = sin_cos_fast(input);
+    patch_exceptional_lanes(input, sin_fast, exceptional_mask, scalar::sin_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn cos_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::cos_u35_f64)
+    let exceptional_mask = trig_exceptional_mask(input);
+    let (_, cos_fast) = sin_cos_fast(input);
+    patch_exceptional_lanes(input, cos_fast, exceptional_mask, scalar::cos_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn tan_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::tan_u35_f64)
+    let base_exceptional = trig_exceptional_mask(input);
+    let (sin_fast, cos_fast) = sin_cos_fast(input);
+    let dangerous = cos_fast.abs().cmp_lt(V::set1(1.0e-12)).bitcast_i64();
+    let exceptional_mask = base_exceptional | dangerous;
+    let fast = sin_fast / cos_fast;
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tan_u35_f64)
 }
diff --git a/src/math/families/core.rs b/src/math/families/core.rs
index 94ac53d..01da52e 100644
--- a/src/math/families/core.rs
+++ b/src/math/families/core.rs
@@ -57,37 +57,58 @@ impl<T: SimdFloat32> SimdMathF32Core for T {}
 
 pub trait SimdMathF64Core: SimdFloat64 {
     #[inline(always)]
-    fn log2_u35(self) -> Self {
+    fn log2_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::log2_u35(self)
     }
 
     #[inline(always)]
-    fn exp2_u35(self) -> Self {
+    fn exp2_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::exp2_u35(self)
     }
 
     #[inline(always)]
-    fn ln_u35(self) -> Self {
+    fn ln_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::ln_u35(self)
     }
 
     #[inline(always)]
-    fn exp_u35(self) -> Self {
+    fn exp_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::exp_u35(self)
     }
 
     #[inline(always)]
-    fn sin_u35(self) -> Self {
+    fn sin_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::sin_u35(self)
     }
 
     #[inline(always)]
-    fn cos_u35(self) -> Self {
+    fn cos_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::cos_u35(self)
     }
 
     #[inline(always)]
-    fn tan_u35(self) -> Self {
+    fn tan_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::tan_u35(self)
     }
 }
diff --git a/src/math/mod.rs b/src/math/mod.rs
index 9b36bb2..21b643f 100644
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -17,8 +17,8 @@
 //! Structure notes:
 //! - `families/` owns public extension traits grouped by math family.
 //! - `scalar/` owns scalar fallback helpers using the same family boundaries.
-//! - `f64/` mirrors the family split for future backend work while preserving
-//!   the current scalar-mapped behavior.
+//! - `f64/` mirrors the family split; core u35 kernels now have portable SIMD
+//!   fast paths with scalar-lane patching for exceptional inputs.
 //! - `contracts.rs` and `map.rs` stay stable so follow-up optimization PRs can
 //!   target a single family file with minimal overlap.
 
diff --git a/src/tests/simd_math_targeted_edges/core.rs b/src/tests/simd_math_targeted_edges/core.rs
index 376f456..1e7a22c 100644
--- a/src/tests/simd_math_targeted_edges/core.rs
+++ b/src/tests/simd_math_targeted_edges/core.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::math::SimdMathF32Core;
+use crate::math::{SimdMathF32Core, SimdMathF64Core};
 
 fn run_f32_log2_u35_reduction_boundaries<S: Simd>() {
     let mut inputs = vec![
@@ -257,6 +257,132 @@ simd_math_targeted_all_backends!(
     run_f32_trig_symmetry_identities
 );
 
+fn run_f64_log_exp_boundary_lanes<S: Simd>() {
+    let inputs_log = vec![
+        f64::from_bits(1),
+        f64::MIN_POSITIVE,
+        0.5,
+        std::f64::consts::FRAC_1_SQRT_2,
+        1.0,
+        2.0,
+        1024.0,
+        f64::INFINITY,
+        f64::NAN,
+        -1.0,
+        0.0,
+        -0.0,
+    ];
+
+    check_targeted_unary_f64::<S>(
+        "log2_u35",
+        &inputs_log,
+        contracts::LOG2_U35_F64_MAX_ULP,
+        |v| v.log2_u35(),
+        f64::log2,
+    );
+    check_targeted_unary_f64::<S>(
+        "ln_u35",
+        &inputs_log,
+        contracts::LN_U35_F64_MAX_ULP,
+        |v| v.ln_u35(),
+        f64::ln,
+    );
+
+    let inputs_exp = vec![
+        -1022.0,
+        -1021.75,
+        -10.0,
+        -1.0,
+        -0.0,
+        0.0,
+        1.0,
+        10.0,
+        1022.0,
+        1023.0,
+        1023.25,
+        f64::INFINITY,
+        f64::NEG_INFINITY,
+        f64::NAN,
+    ];
+
+    check_targeted_unary_f64::<S>(
+        "exp2_u35",
+        &inputs_exp,
+        contracts::EXP2_U35_F64_MAX_ULP,
+        |v| v.exp2_u35(),
+        f64::exp2,
+    );
+    check_targeted_unary_f64::<S>(
+        "exp_u35",
+        &inputs_exp,
+        contracts::EXP_U35_F64_MAX_ULP,
+        |v| v.exp_u35(),
+        f64::exp,
+    );
+}
+
+fn run_f64_trig_pi_boundaries<S: Simd>() {
+    let mut inputs = vec![
+        -0.0,
+        0.0,
+        std::f64::consts::PI,
+        -std::f64::consts::PI,
+        std::f64::consts::FRAC_PI_2,
+        -std::f64::consts::FRAC_PI_2,
+        std::f64::consts::FRAC_PI_4,
+        -std::f64::consts::FRAC_PI_4,
+        f64::NAN,
+        f64::INFINITY,
+        f64::NEG_INFINITY,
+    ];
+
+    for k in -12..=12 {
+        let base = (k as f64) * std::f64::consts::FRAC_PI_2;
+        inputs.push(f64::from_bits(base.to_bits().saturating_sub(1)));
+        inputs.push(base);
+        inputs.push(f64::from_bits(base.to_bits().saturating_add(1)));
+    }
+
+    check_targeted_unary_f64::<S>(
+        "sin_u35",
+        &inputs,
+        contracts::SIN_U35_F64_MAX_ULP,
+        |v| v.sin_u35(),
+        f64::sin,
+    );
+    check_targeted_unary_f64::<S>(
+        "cos_u35",
+        &inputs,
+        contracts::COS_U35_F64_MAX_ULP,
+        |v| v.cos_u35(),
+        f64::cos,
+    );
+}
+
+fn run_f64_tan_pole_neighborhoods<S: Simd>() {
+    let mut inputs = vec![-1.0, -0.0, 0.0, 1.0, 10.0, -10.0, f64::NAN, f64::INFINITY];
+
+    for k in -16..=16 {
+        let pole = (k as f64 + 0.5) * std::f64::consts::PI;
+        for delta in [1.0e-4, 1.0e-6, 1.0e-8] {
+            inputs.push(pole - delta);
+            inputs.push(pole + delta);
+        }
+    }
+
+    check_targeted_unary_f64::<S>(
+        "tan_u35",
+        &inputs,
+        contracts::TAN_U35_F64_MAX_ULP,
+        |v| v.tan_u35(),
+        f64::tan,
+    );
+}
+
+simd_math_targeted_all_backends!(f64_log_exp_boundary_lanes, run_f64_log_exp_boundary_lanes);
+simd_math_targeted_all_backends!(f64_trig_pi_boundaries, run_f64_trig_pi_boundaries);
+simd_math_targeted_all_backends!(f64_tan_pole_neighborhoods, run_f64_tan_pole_neighborhoods);
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[test]
 fn f32_log2_u35_mixed_exception_lanes_avx2() {
diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs
index babe6b3..3ed9257 100644
--- a/src/tests/simd_math_targeted_edges/mod.rs
+++ b/src/tests/simd_math_targeted_edges/mod.rs
@@ -12,8 +12,8 @@ use crate::engines::wasm32::Wasm;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::engines::{avx2::Avx2, sse2::Sse2, sse41::Sse41};
 
-use crate::math::SimdMathF32Core;
-use crate::math::{contracts, SimdMathF32};
+use crate::math::{contracts, SimdMathF32, SimdMathF64};
+use crate::math::{SimdMathF32Core, SimdMathF64Core};
 use crate::{Simd, SimdBaseIo, SimdConsts};
 
 fn assert_f32_contract(
@@ -89,6 +89,79 @@ fn check_targeted_unary_f32<S: Simd>(
     }
 }
 
+fn assert_f64_contract(
+    fn_name: &str,
+    input: f64,
+    actual: f64,
+    expected: f64,
+    max_ulp: u64,
+) -> Result<(), String> {
+    if expected.is_nan() {
+        if actual.is_nan() {
+            return Ok(());
+        }
+        return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}"));
+    }
+
+    if expected.is_infinite() {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected {:?}, got {:?}",
+            expected, actual
+        ));
+    }
+
+    if expected == 0.0 {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}",
+            expected.to_bits(),
+            actual.to_bits()
+        ));
+    }
+
+    if actual.is_nan() || actual.is_infinite() {
+        return Err(format!(
+            "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}"
+        ));
+    }
+
+    let ulp = ulp_distance_f64(actual, expected)
+        .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?;
+    if ulp > max_ulp {
+        return Err(format!(
+            "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})"
+        ));
+    }
+
+    Ok(())
+}
+
+fn check_targeted_unary_f64<S: Simd>(
+    fn_name: &str,
+    inputs: &[f64],
+    max_ulp: u64,
+    simd_fn: impl Fn(S::Vf64) -> S::Vf64,
+    scalar_ref: impl Fn(f64) -> f64,
+) {
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let input = S::Vf64::load_from_slice(chunk);
+        let output = simd_fn(input);
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            let actual = output[lane];
+            let expected = scalar_ref(x);
+            if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) {
+                panic!("{err}");
+            }
+        }
+    }
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) {
     assert_eq!(input.len(), output.len());

From 95b9b4ca25421cf8e4a2fce9449fd63c7ab6e250 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Mon, 23 Mar 2026 00:01:11 +1100
Subject: [PATCH 6/9] =?UTF-8?q?=F0=9F=A6=8E=20Repair=20integrated=20f64=20?=
 =?UTF-8?q?wave=20branch?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../simd_math_remaining_baseline/shared.rs    | 35 -------------------
 src/math/f64/inverse_hyperbolic.rs            |  5 +++
 src/math/families/inverse_hyperbolic.rs       | 15 ++++++--
 src/math/map.rs                               | 13 -------
 4 files changed, 17 insertions(+), 51 deletions(-)

diff --git a/benches/simd_math_remaining_baseline/shared.rs b/benches/simd_math_remaining_baseline/shared.rs
index 16749e8..a4b9ae8 100644
--- a/benches/simd_math_remaining_baseline/shared.rs
+++ b/benches/simd_math_remaining_baseline/shared.rs
@@ -38,11 +38,6 @@ pub fn make_binary_inputs(
     (a, b)
 }
 
-pub fn make_positive_inputs_f64(len: usize, seed: u64, min: f64, max: f64) -> Vec<f64> {
-    let mut rng = ChaCha8Rng::seed_from_u64(seed);
-    (0..len).map(|_| rng.gen_range(min..max)).collect()
-}
-
 pub fn make_binary_inputs_f64(
     len: usize,
     seed: u64,
@@ -96,18 +91,6 @@ pub fn simdeez_binary_sum_impl<S: Simd>(
     sum
 }
 
-#[inline(always)]
-pub fn simdeez_unary_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
-    let mut sum = 0.0f64;
-    let mut i = 0;
-    while i + S::Vf64::WIDTH <= input.len() {
-        let v = S::Vf64::load_from_slice(&input[i..]);
-        sum += op(v).horizontal_add();
-        i += S::Vf64::WIDTH;
-    }
-    sum
-}
-
 #[inline(always)]
 pub fn simdeez_binary_sum_impl_f64<S: Simd>(
     a: &[f64],
@@ -180,24 +163,6 @@ pub fn bench_binary(
     group.finish();
 }
 
-pub fn bench_unary_f64(
-    c: &mut Criterion,
-    name: &str,
-    input: &[f64],
-    scalar: fn(&[f64]) -> f64,
-    simd: fn(&[f64]) -> f64,
-) {
-    let mut group = c.benchmark_group(name);
-    group.throughput(Throughput::Elements(input.len() as u64));
-    group.bench_function("scalar-native", |b| {
-        b.iter(|| black_box(scalar(black_box(input))))
-    });
-    group.bench_function("simdeez-runtime", |b| {
-        b.iter(|| black_box(simd(black_box(input))))
-    });
-    group.finish();
-}
-
 pub fn bench_binary_f64(
     c: &mut Criterion,
     name: &str,
diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs
index 1ddde48..f7ce366 100644
--- a/src/math/f64/inverse_hyperbolic.rs
+++ b/src/math/f64/inverse_hyperbolic.rs
@@ -7,6 +7,7 @@ type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
 fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     unsafe {
         let lanes = mask.as_array();
@@ -29,6 +30,7 @@ fn patch_exceptional_lanes<V>(
 ) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     if !any_lane_nonzero::<V>(exceptional_mask) {
         return output;
@@ -53,6 +55,7 @@ where
 pub(crate) fn asinh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     let finite_mask = input.cmp_eq(input).bitcast_i64();
     let abs_x = input.abs();
@@ -74,6 +77,7 @@ where
 pub(crate) fn acosh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     let finite_mask = input.cmp_eq(input).bitcast_i64();
     let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64();
@@ -90,6 +94,7 @@ where
 pub(crate) fn atanh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     let finite_mask = input.cmp_eq(input).bitcast_i64();
     let abs_x = input.abs();
diff --git a/src/math/families/inverse_hyperbolic.rs b/src/math/families/inverse_hyperbolic.rs
index 18d6f75..f139f2b 100644
--- a/src/math/families/inverse_hyperbolic.rs
+++ b/src/math/families/inverse_hyperbolic.rs
@@ -31,17 +31,26 @@ impl<T: SimdFloat32> SimdMathF32InverseHyperbolic for T {}
 
 pub trait SimdMathF64InverseHyperbolic: SimdFloat64 {
     #[inline(always)]
-    fn asinh_u35(self) -> Self {
+    fn asinh_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::asinh_u35(self)
     }
 
     #[inline(always)]
-    fn acosh_u35(self) -> Self {
+    fn acosh_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::acosh_u35(self)
     }
 
     #[inline(always)]
-    fn atanh_u35(self) -> Self {
+    fn atanh_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::atanh_u35(self)
     }
 }
diff --git a/src/math/map.rs b/src/math/map.rs
index ce0af2c..a4d221f 100644
--- a/src/math/map.rs
+++ b/src/math/map.rs
@@ -34,16 +34,3 @@ pub(crate) fn binary_f32<V: SimdFloat32>(lhs: V, rhs: V, f: impl Fn(f32, f32) ->
         V::load_from_ptr_unaligned(&out_lanes as *const V::ArrayRepresentation as *const f32)
     }
 }
-
-#[inline(always)]
-pub(crate) fn binary_f64<V: SimdFloat64>(lhs: V, rhs: V, f: impl Fn(f64, f64) -> f64) -> V {
-    unsafe {
-        let lhs_lanes = lhs.as_array();
-        let rhs_lanes = rhs.as_array();
-        let mut out_lanes = lhs_lanes.clone();
-        for i in 0..V::WIDTH {
-            out_lanes[i] = f(lhs_lanes[i], rhs_lanes[i]);
-        }
-        V::load_from_ptr_unaligned(&out_lanes as *const V::ArrayRepresentation as *const f64)
-    }
-}

From 016f4a9028019cbdecba9658eea0ba532462f898 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Mon, 23 Mar 2026 00:24:40 +1100
Subject: [PATCH 7/9] test: harden f64 binary-misc adversarial lane coverage

---
 .../simd_math_targeted_edges/binary_misc.rs   | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/src/tests/simd_math_targeted_edges/binary_misc.rs b/src/tests/simd_math_targeted_edges/binary_misc.rs
index 5d895ce..da80944 100644
--- a/src/tests/simd_math_targeted_edges/binary_misc.rs
+++ b/src/tests/simd_math_targeted_edges/binary_misc.rs
@@ -276,3 +276,132 @@ simd_math_targeted_all_backends!(
     f64_log10_domain_and_mixed_lanes,
     run_f64_log10_domain_and_mixed_lanes
 );
+
+fn run_f64_binary_misc_adversarial_lanes<S: Simd>() {
+    // log10_u35: adversarial around denormals and exact powers of ten.
+    let log10_inputs = [
+        f64::MIN_POSITIVE,
+        f64::from_bits(1),
+        f64::from_bits(2),
+        1.0e-320,
+        9.999_999_999_999_999e-101,
+        1.0e-100,
+        10.0,
+        1.0e100,
+        0.0,
+        -0.0,
+        -1.0,
+        f64::INFINITY,
+        f64::NAN,
+    ];
+    for chunk in log10_inputs.chunks(S::Vf64::WIDTH) {
+        let xv = S::Vf64::load_from_slice(chunk);
+        let out = xv.log10_u35();
+        for lane in 0..chunk.len() {
+            let x = chunk[lane];
+            assert_f64_contract(
+                "log10_u35",
+                x,
+                out[lane],
+                x.log10(),
+                contracts::LOG10_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("adversarial log10_u35({x:?}) {e}"));
+        }
+    }
+
+    // atan2_u35: signed-zero and near-axis quadrant behavior.
+    let atan2_cases = [
+        (0.0, -0.0),
+        (-0.0, -0.0),
+        (0.0, 0.0),
+        (-0.0, 0.0),
+        (f64::MIN_POSITIVE, -f64::MIN_POSITIVE),
+        (-f64::MIN_POSITIVE, -f64::MIN_POSITIVE),
+        (1.0e-300, -1.0),
+        (-1.0e-300, -1.0),
+        (1.0, -1.0e-300),
+        (-1.0, -1.0e-300),
+        (f64::INFINITY, -f64::INFINITY),
+        (f64::NEG_INFINITY, -f64::INFINITY),
+        (f64::NAN, 1.0),
+        (1.0, f64::NAN),
+    ];
+    for chunk in atan2_cases.chunks(S::Vf64::WIDTH) {
+        let mut ys = vec![0.0f64; chunk.len()];
+        let mut xs = vec![0.0f64; chunk.len()];
+        for (idx, (y, x)) in chunk.iter().copied().enumerate() {
+            ys[idx] = y;
+            xs[idx] = x;
+        }
+
+        let yv = S::Vf64::load_from_slice(&ys);
+        let xv = S::Vf64::load_from_slice(&xs);
+        let out = yv.atan2_u35(xv);
+        for lane in 0..chunk.len() {
+            let y = ys[lane];
+            let x = xs[lane];
+            assert_f64_contract(
+                "atan2_u35",
+                y,
+                out[lane],
+                y.atan2(x),
+                contracts::ATAN2_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("adversarial atan2_u35({y:?}, {x:?}) {e}"));
+        }
+    }
+
+    // hypot_u35 + fmod: scale extremes and quotient-threshold stress.
+    let binary_cases = [
+        (f64::MAX, f64::MIN_POSITIVE),
+        (f64::MIN_POSITIVE, f64::MAX),
+        (1.0e308, 1.0e-308),
+        (1.0e-308, 1.0e308),
+        (-1.0e308, 1.0e-308),
+        (9.007_199_254_740_992e15, 1.25),
+        (-9.007_199_254_740_992e15, 1.25),
+        (9.007_199_254_740_993e15, 1.25),
+        (-9.007_199_254_740_993e15, 1.25),
+        (0.0, 3.0),
+        (-0.0, 3.0),
+        (f64::INFINITY, 2.0),
+        (2.0, 0.0),
+        (f64::NAN, 2.0),
+        (2.0, f64::NAN),
+    ];
+
+    for chunk in binary_cases.chunks(S::Vf64::WIDTH) {
+        let mut xs = vec![0.0f64; chunk.len()];
+        let mut ys = vec![0.0f64; chunk.len()];
+        for (idx, (x, y)) in chunk.iter().copied().enumerate() {
+            xs[idx] = x;
+            ys[idx] = y;
+        }
+
+        let xv = S::Vf64::load_from_slice(&xs);
+        let yv = S::Vf64::load_from_slice(&ys);
+        let h = xv.hypot_u35(yv);
+        let r = xv.fmod(yv);
+
+        for lane in 0..chunk.len() {
+            let x = xs[lane];
+            let y = ys[lane];
+            assert_f64_contract(
+                "hypot_u35",
+                x,
+                h[lane],
+                x.hypot(y),
+                contracts::HYPOT_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("adversarial hypot_u35({x:?}, {y:?}) {e}"));
+            assert_f64_contract("fmod", x, r[lane], x % y, 0)
+                .unwrap_or_else(|e| panic!("adversarial fmod({x:?}, {y:?}) {e}"));
+        }
+    }
+}
+
+simd_math_targeted_all_backends!(
+    f64_binary_misc_adversarial_lanes,
+    run_f64_binary_misc_adversarial_lanes
+);

From c5384a2603baa339600ab6d10285bb30d2de2cb3 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Mon, 23 Mar 2026 00:24:50 +1100
Subject: [PATCH 8/9] test: add f64 inverse-trig boundary adversarial coverage

---
 .../simd_math_targeted_edges/inverse_trig.rs  | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/tests/simd_math_targeted_edges/inverse_trig.rs b/src/tests/simd_math_targeted_edges/inverse_trig.rs
index 36e6a91..24accd1 100644
--- a/src/tests/simd_math_targeted_edges/inverse_trig.rs
+++ b/src/tests/simd_math_targeted_edges/inverse_trig.rs
@@ -280,6 +280,85 @@ fn run_f64_inverse_trig_identity<S: Simd>() {
 const TAN_PI_8_F64: f64 = 0.414_213_562_373_095_03;
 const TAN_3PI_8_F64: f64 = 2.414_213_562_373_095;
 
+fn run_f64_inverse_trig_fallback_boundary<S: Simd>() {
+    // Boundary used by portable_f64 fallback patching for asin/acos near |x|≈1.
+    let boundary = f64::from_bits(0x3FEF_F000_0000_0000);
+    let inputs = [
+        f64::from_bits(boundary.to_bits() - 2),
+        f64::from_bits(boundary.to_bits() - 1),
+        boundary,
+        f64::from_bits(boundary.to_bits() + 1),
+        f64::from_bits(boundary.to_bits() + 2),
+        -f64::from_bits(boundary.to_bits() - 2),
+        -f64::from_bits(boundary.to_bits() - 1),
+        -boundary,
+        -f64::from_bits(boundary.to_bits() + 1),
+        -f64::from_bits(boundary.to_bits() + 2),
+    ];
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let asin = v.asin_u35();
+        let acos = v.acos_u35();
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "asin_u35 boundary",
+                x,
+                asin[lane],
+                x.asin(),
+                contracts::ASIN_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "acos_u35 boundary",
+                x,
+                acos[lane],
+                x.acos(),
+                contracts::ACOS_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+fn run_f64_atan_reduction_threshold_neighbors<S: Simd>() {
+    // Thresholds used by atan range reduction in portable_f64.
+    let t1 = TAN_PI_8_F64;
+    let t2 = TAN_3PI_8_F64;
+    let inputs = [
+        f64::from_bits(t1.to_bits() - 2),
+        f64::from_bits(t1.to_bits() - 1),
+        t1,
+        f64::from_bits(t1.to_bits() + 1),
+        f64::from_bits(t1.to_bits() + 2),
+        f64::from_bits(t2.to_bits() - 2),
+        f64::from_bits(t2.to_bits() - 1),
+        t2,
+        f64::from_bits(t2.to_bits() + 1),
+        f64::from_bits(t2.to_bits() + 2),
+    ];
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let atan = v.atan_u35();
+        let atan_neg = (-v).atan_u35();
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "atan_u35 threshold",
+                x,
+                atan[lane],
+                x.atan(),
+                contracts::ATAN_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract("atan_u35 threshold odd", -x, atan_neg[lane], -atan[lane], 2)
+                .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
 simd_math_targeted_all_backends!(f64_inverse_trig_near_one, run_f64_inverse_trig_near_one);
 simd_math_targeted_all_backends!(
     f64_inverse_trig_special_lanes,
@@ -287,3 +366,11 @@ simd_math_targeted_all_backends!(
 );
 simd_math_targeted_all_backends!(f64_inverse_trig_symmetry, run_f64_inverse_trig_symmetry);
 simd_math_targeted_all_backends!(f64_inverse_trig_identity, run_f64_inverse_trig_identity);
+simd_math_targeted_all_backends!(
+    f64_inverse_trig_fallback_boundary,
+    run_f64_inverse_trig_fallback_boundary
+);
+simd_math_targeted_all_backends!(
+    f64_atan_reduction_threshold_neighbors,
+    run_f64_atan_reduction_threshold_neighbors
+);

From 5b3cad1164bd616930dbc4d350b0aaf80260d1cf Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Mon, 23 Mar 2026 00:28:55 +1100
Subject: [PATCH 9/9] f64 hyperbolic: revert regressing paths, keep asinh fast
 path

---
 src/math/f64/hyperbolic.rs                    | 172 +-----------------
 src/math/f64/inverse_hyperbolic.rs            |  28 +--
 .../simd_math_targeted_edges/hyperbolic.rs    |  25 +++
 .../inverse_hyperbolic.rs                     |  26 +++
 4 files changed, 59 insertions(+), 192 deletions(-)

diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs
index b0277de..111aa78 100644
--- a/src/math/f64/hyperbolic.rs
+++ b/src/math/f64/hyperbolic.rs
@@ -1,150 +1,12 @@
-use crate::math::scalar;
-use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
-
-type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
-
-const SINH_COSH_SMALL_ABS: f64 = 0.125;
-const SINH_COSH_FAST_ABS_MAX: f64 = 0.125;
-const TANH_SMALL_ABS: f64 = 0.0;
-const TANH_FAST_ABS_MAX: f64 = 0.0;
-
-#[inline(always)]
-fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
-where
-    V: SimdFloat64,
-{
-    unsafe {
-        let lanes = mask.as_array();
-        for lane in 0..V::WIDTH {
-            if lanes[lane] != 0 {
-                return true;
-            }
-        }
-    }
-
-    false
-}
-
-#[inline(always)]
-fn patch_exceptional_lanes<V>(
-    input: V,
-    output: V,
-    exceptional_mask: SimdI64<V>,
-    scalar_fallback: fn(f64) -> f64,
-) -> V
-where
-    V: SimdFloat64,
-{
-    if !any_lane_nonzero::<V>(exceptional_mask) {
-        return output;
-    }
-
-    unsafe {
-        let input_lanes = input.as_array();
-        let mask_lanes = exceptional_mask.as_array();
-        let mut output_lanes = output.as_array();
-
-        for lane in 0..V::WIDTH {
-            if mask_lanes[lane] != 0 {
-                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
-            }
-        }
-
-        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
-    }
-}
-
-#[inline(always)]
-fn exp_u35<V>(input: V) -> V
-where
-    V: SimdFloat64,
-{
-    // Temporary family-local bridge: use scalar exp lane mapping here while
-    // avoiding scalar lane mapping for the final hyperbolic functions.
-    unsafe {
-        let mut lanes = input.as_array();
-        for lane in 0..V::WIDTH {
-            lanes[lane] = scalar::exp_u35_f64(lanes[lane]);
-        }
-        V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64)
-    }
-}
-
-#[inline(always)]
-fn sinh_small<V>(input: V, input_sq: V) -> V
-where
-    V: SimdFloat64,
-{
-    let poly = ((((V::set1(1.0 / 39916800.0) * input_sq) + V::set1(1.0 / 362880.0)) * input_sq
-        + V::set1(1.0 / 5040.0))
-        * input_sq
-        + V::set1(1.0 / 120.0))
-        * input_sq
-        + V::set1(1.0 / 6.0);
-
-    input + (input * input_sq * poly)
-}
-
-#[inline(always)]
-fn cosh_small<V>(input_sq: V) -> V
-where
-    V: SimdFloat64,
-{
-    let poly = (((V::set1(1.0 / 40320.0) * input_sq) + V::set1(1.0 / 720.0)) * input_sq
-        + V::set1(1.0 / 24.0))
-        * input_sq
-        + V::set1(0.5);
-
-    V::set1(1.0) + (input_sq * poly)
-}
-
-#[inline(always)]
-fn sinh_cosh_medium<V>(abs_input: V) -> (V, V)
-where
-    V: SimdFloat64,
-{
-    let exp_abs = exp_u35(abs_input);
-    let exp_neg_abs = V::set1(1.0) / exp_abs;
-    let half = V::set1(0.5);
-
-    (
-        (exp_abs - exp_neg_abs) * half,
-        (exp_abs + exp_neg_abs) * half,
-    )
-}
-
-#[inline(always)]
-fn sinh_cosh_masks<V>(input: V) -> (SimdI64<V>, V, V)
-where
-    V: SimdFloat64,
-{
-    let abs_input = input.abs();
-    let finite_mask = input.cmp_eq(input).bitcast_i64();
-    let within_fast_range = abs_input
-        .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX))
-        .bitcast_i64();
-
-    (finite_mask & within_fast_range, abs_input, input * input)
-}
+use crate::math::{map, scalar};
+use crate::SimdFloat64;
 
 #[inline(always)]
 pub(crate) fn sinh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input);
-    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
-    let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS));
-
-    let fast_small = sinh_small(input, input_sq);
-    let exp_input = exp_u35(input);
-    let exp_neg_input = V::set1(1.0) / exp_input;
-    let sinh_medium = (exp_input - exp_neg_input) * V::set1(0.5);
-    let fast = small_mask.blendv(sinh_medium, fast_small);
-    let zero_mask = input.cmp_eq(V::set1(0.0));
-    let fast = zero_mask.blendv(fast, input);
-
-    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64)
+    map::unary_f64(input, scalar::sinh_u35_f64)
 }
 
 #[inline(always)]
@@ -152,15 +14,7 @@ pub(crate) fn cosh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input);
-    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
-    let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS));
-
-    let fast_small = cosh_small(input_sq);
-    let (_, cosh_medium) = sinh_cosh_medium(abs_input);
-    let fast = small_mask.blendv(cosh_medium, fast_small);
-
-    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::cosh_u35_f64)
+    map::unary_f64(input, scalar::cosh_u35_f64)
 }
 
 #[inline(always)]
@@ -168,21 +22,5 @@ pub(crate) fn tanh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    let abs_input = input.abs();
-    let finite_mask = input.cmp_eq(input).bitcast_i64();
-    let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64();
-    let exceptional_mask = (finite_mask & within_fast_range).cmp_eq(SimdI64::<V>::zeroes());
-    let small_mask = abs_input.cmp_lt(V::set1(TANH_SMALL_ABS));
-
-    let input_sq = input * input;
-    let fast_small = sinh_small(input, input_sq) / cosh_small(input_sq);
-
-    let exp_input = exp_u35(input);
-    let exp_neg_input = V::set1(1.0) / exp_input;
-    let tanh_medium = (exp_input - exp_neg_input) / (exp_input + exp_neg_input);
-    let fast = small_mask.blendv(tanh_medium, fast_small);
-    let zero_mask = input.cmp_eq(V::set1(0.0));
-    let fast = zero_mask.blendv(fast, input);
-
-    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64)
+    map::unary_f64(input, scalar::tanh_u35_f64)
 }
diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs
index f7ce366..50914bb 100644
--- a/src/math/f64/inverse_hyperbolic.rs
+++ b/src/math/f64/inverse_hyperbolic.rs
@@ -1,4 +1,4 @@
-use crate::math::{f64, scalar};
+use crate::math::{f64, map, scalar};
 use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
 
 type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
@@ -79,15 +79,7 @@ where
     V: SimdFloat64,
     V::Engine: Simd<Vf64 = V>,
 {
-    let finite_mask = input.cmp_eq(input).bitcast_i64();
-    let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64();
-    let fast_mask = finite_mask & in_domain_mask;
-    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
-
-    let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt());
-    let fast = f64::ln_u35(input + root_term);
-
-    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64)
+    map::unary_f64(input, scalar::acosh_u35_f64)
 }
 
 #[inline(always)]
@@ -96,19 +88,5 @@ where
     V: SimdFloat64,
     V::Engine: Simd<Vf64 = V>,
 {
-    let finite_mask = input.cmp_eq(input).bitcast_i64();
-    let abs_x = input.abs();
-    let strict_domain_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64();
-    let non_zero_mask = input.cmp_neq(V::zeroes()).bitcast_i64();
-    let stable_range_mask = abs_x.cmp_lte(V::set1(0.99)).bitcast_i64();
-    let away_from_zero_mask = abs_x.cmp_gte(V::set1(0.9)).bitcast_i64();
-    let fast_mask =
-        finite_mask & strict_domain_mask & non_zero_mask & stable_range_mask & away_from_zero_mask;
-    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
-
-    let one = V::set1(1.0);
-    let ratio = (one + input) / (one - input);
-    let fast = f64::ln_u35(ratio) * V::set1(0.5);
-
-    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atanh_u35_f64)
+    map::unary_f64(input, scalar::atanh_u35_f64)
 }
diff --git a/src/tests/simd_math_targeted_edges/hyperbolic.rs b/src/tests/simd_math_targeted_edges/hyperbolic.rs
index f8e509f..c8ad2d5 100644
--- a/src/tests/simd_math_targeted_edges/hyperbolic.rs
+++ b/src/tests/simd_math_targeted_edges/hyperbolic.rs
@@ -300,3 +300,28 @@ simd_math_targeted_all_backends!(
     f64_hyperbolic_special_values_and_mixed_lanes,
     run_f64_hyperbolic_special_values_and_mixed_lanes
 );
+
+fn run_f64_hyperbolic_signed_zero_semantics<S: Simd>() {
+    let mut lanes = vec![0.0f64; S::Vf64::WIDTH];
+    lanes[0] = -0.0;
+
+    let input = S::Vf64::load_from_slice(&lanes);
+    let sinh = input.sinh_u35();
+    let tanh = input.tanh_u35();
+
+    assert_eq!(sinh[0].to_bits(), (-0.0f64).sinh().to_bits());
+    assert_eq!(tanh[0].to_bits(), (-0.0f64).tanh().to_bits());
+
+    if S::Vf64::WIDTH > 1 {
+        assert_eq!(sinh[1].to_bits(), 0.0f64.sinh().to_bits());
+        assert_eq!(tanh[1].to_bits(), 0.0f64.tanh().to_bits());
+    }
+
+    let cosh = input.cosh_u35();
+    assert_eq!(cosh[0].to_bits(), (-0.0f64).cosh().to_bits());
+}
+
+simd_math_targeted_all_backends!(
+    f64_hyperbolic_signed_zero_semantics,
+    run_f64_hyperbolic_signed_zero_semantics
+);
diff --git a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs
index c07c793..cc382e7 100644
--- a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs
+++ b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs
@@ -270,3 +270,29 @@ simd_math_targeted_all_backends!(
     f64_inverse_hyperbolic_mixed_lanes,
     run_f64_inverse_hyperbolic_mixed_lanes
 );
+
+fn run_f64_inverse_hyperbolic_signed_zero_semantics<S: Simd>() {
+    let mut lanes = vec![0.0f64; S::Vf64::WIDTH];
+    lanes[0] = -0.0;
+
+    let input = S::Vf64::load_from_slice(&lanes);
+    let asinh = input.asinh_u35();
+    let atanh = input.atanh_u35();
+
+    assert_eq!(asinh[0].to_bits(), (-0.0f64).asinh().to_bits());
+    assert_eq!(atanh[0].to_bits(), (-0.0f64).atanh().to_bits());
+
+    if S::Vf64::WIDTH > 1 {
+        assert_eq!(asinh[1].to_bits(), 0.0f64.asinh().to_bits());
+        assert_eq!(atanh[1].to_bits(), 0.0f64.atanh().to_bits());
+    }
+
+    let ones = S::Vf64::set1(1.0);
+    let acosh = ones.acosh_u35();
+    assert_eq!(acosh[0].to_bits(), 1.0f64.acosh().to_bits());
+}
+
+simd_math_targeted_all_backends!(
+    f64_inverse_hyperbolic_signed_zero_semantics,
+    run_f64_inverse_hyperbolic_signed_zero_semantics
+);