Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/math/f64/binary_misc.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use crate::math::{f64, scalar};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};

// DECISION(2026-03-23): KEEP_MIXED
// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 log10_u35
// Why kept:
// - local runtime-selected performance is clearly better than native scalar
// - the current implementation still rides scalar-reference log2_u35 underneath
// - the current implementation now rides the kept portable f64 log2_u35 core
// Revisit when:
// - f64 log2_u35 gets a new keep/revert outcome
// - f64 core log2_u35 changes materially or non-x86 evidence disagrees sharply

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 atan2_u35 / hypot_u35 / fmod
Expand Down
225 changes: 215 additions & 10 deletions src/math/f64/core.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
use crate::math::{map, scalar};
use crate::{Simd, SimdFloat64};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt, SimdInt64};

// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64;
const F64_MANTISSA_MASK: i64 = 0x000F_FFFF_FFFF_FFFFu64 as i64;
const F64_LOG_NORM_MANTISSA: i64 = 0x3FE0_0000_0000_0000u64 as i64;
const F64_EXPONENT_BIAS_ADJUST: i64 = 1022;
const F64_EXP_LN2_HI: f64 = 6.931_471_803_691_238e-1;
const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10;

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 log2_u35 / exp2_u35 / ln_u35 / exp_u35
// Why scalar:
// - local benches keep putting runtime-selected behavior at or below native scalar
// - family structure stays useful, but the current default is still scalar-reference
// Why kept:
// - the revived portable log/exp kernels now beat native scalar in local f64 core benches
// - exceptional lanes still patch back to scalar references without giving up the fast AVX2 path
// Revisit when:
// - a genuinely worthwhile f64 log/exp SIMD kernel exists
// - non-x86 evidence disagrees sharply or a cheaper approximation family appears

// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
// Function(s): f64 sin_u35 / cos_u35 / tan_u35
Expand All @@ -17,13 +26,116 @@ use crate::{Simd, SimdFloat64};
// Revisit when:
// - a stronger range-reduction strategy or cheaper trig kernel appears

#[inline(always)]
/// Returns `true` if any lane of the i64 mask vector is non-zero.
///
/// Used as a cheap pre-check before falling into the per-lane scalar
/// patching loop, so the common all-fast-lanes case stays branch-light.
fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    // SAFETY: `as_array` only reinterprets the vector as its per-lane array
    // representation; every index below is bounded by `V::WIDTH`.
    unsafe {
        let lane_values = mask.as_array();
        (0..V::WIDTH).any(|idx| lane_values[idx] != 0)
    }
}

#[inline(always)]
/// Replaces the lanes flagged in `exceptional_mask` with the result of the
/// strict scalar reference applied to the corresponding `input` lane.
///
/// Lanes with a zero mask keep the fast SIMD `output` value. When no lane is
/// flagged, `output` is returned untouched without any lane traffic.
fn patch_exceptional_lanes<V>(
    input: V,
    output: V,
    exceptional_mask: SimdI64<V>,
    scalar_fallback: fn(f64) -> f64,
) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    // Fast path: nothing flagged, hand the SIMD result straight back.
    if !any_lane_nonzero::<V>(exceptional_mask) {
        return output;
    }

    // SAFETY: the lane arrays come from `as_array`, all indexing is bounded
    // by `V::WIDTH`, and the final load reads exactly one full array
    // representation worth of f64 values.
    unsafe {
        let source_lanes = input.as_array();
        let flag_lanes = exceptional_mask.as_array();
        let mut patched = output.as_array();

        for idx in 0..V::WIDTH {
            if flag_lanes[idx] == 0 {
                continue;
            }
            // Recompute this lane with the scalar reference implementation.
            patched[idx] = scalar_fallback(source_lanes[idx]);
        }

        V::load_from_ptr_unaligned(&patched as *const V::ArrayRepresentation as *const f64)
    }
}

#[inline(always)]
/// Builds the mask of lanes the fast log kernels cannot handle:
/// non-positive values, zeroes/subnormals, and infinities/NaNs.
///
/// Flagged lanes are later recomputed by a scalar reference routine.
fn log2_exceptional_mask<V>(input: V) -> SimdI64<V>
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let raw_bits = input.bitcast_i64();
    let exponent_field = raw_bits & F64_EXPONENT_MASK;

    // Zero exponent field: the lane holds zero or a subnormal.
    let zero_or_subnormal = exponent_field.cmp_eq(SimdI64::<V>::zeroes());
    // All-ones exponent field: the lane holds an infinity or a NaN.
    let non_finite = exponent_field.cmp_eq(SimdI64::<V>::set1(F64_EXPONENT_MASK));
    // NOT(input > 0): negatives and zeroes, plus NaN lanes (any NaN compare
    // is false, so the negated compare flags them too).
    let not_strictly_positive = input
        .cmp_gt(V::zeroes())
        .bitcast_i64()
        .cmp_eq(SimdI64::<V>::zeroes());

    not_strictly_positive | zero_or_subnormal | non_finite
}

#[inline(always)]
/// Portable SIMD base-2 logarithm with the relaxed u35 error budget.
///
/// Fast path: split the f64 bit pattern into exponent and mantissa fields,
/// renormalize the mantissa toward 1 (into [1/sqrt(2), sqrt(2)) via the
/// sqrt-half pivot), and evaluate log2 of the mantissa through the atanh
/// series in t = (m - 1) / (m + 1). Lanes flagged by
/// `log2_exceptional_mask` (non-positive, subnormal/zero, inf/NaN) are
/// patched afterwards with the scalar reference `scalar::log2_u35_f64`.
//
// NOTE: this body is the post-change version only; the pre-change
// `map::unary_f64(input, scalar::log2_u35_f64)` line that the diff paste
// left interleaved here has been removed — keeping both made the span
// invalid Rust.
pub(crate) fn log2_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let bits = input.bitcast_i64();
    let exponent_bits = bits & F64_EXPONENT_MASK;
    let mantissa_bits = bits & F64_MANTISSA_MASK;

    // Compute the slow-lane mask up front so the fast path can run
    // unconditionally and get patched at the end.
    let exceptional_mask = log2_exceptional_mask(input);

    // Unbiased exponent such that the normalized mantissa lands in [0.5, 1).
    let exponent =
        (exponent_bits.shr(52) - SimdI64::<V>::set1(F64_EXPONENT_BIAS_ADJUST)).cast_f64();
    let normalized_mantissa = (mantissa_bits | F64_LOG_NORM_MANTISSA).bitcast_f64();

    let one = V::set1(1.0);
    let sqrt_half = V::set1(core::f64::consts::FRAC_1_SQRT_2);

    // For mantissas below 1/sqrt(2), double the mantissa and drop the
    // exponent by one so the series argument stays centered near 1.
    let adjust_mask = normalized_mantissa.cmp_lt(sqrt_half);
    let exponent = exponent - adjust_mask.blendv(V::zeroes(), one);
    let mantissa = adjust_mask.blendv(
        normalized_mantissa,
        normalized_mantissa + normalized_mantissa,
    );

    // atanh-series argument: ln(m) = 2t * (1 + t^2/3 + t^4/5 + ...).
    let t = (mantissa - one) / (mantissa + one);
    let t2 = t * t;

    // Horner evaluation of the odd-reciprocal coefficients 1/19 .. 1/3.
    let mut poly = V::set1(1.0 / 19.0);
    poly = (poly * t2) + V::set1(1.0 / 17.0);
    poly = (poly * t2) + V::set1(1.0 / 15.0);
    poly = (poly * t2) + V::set1(1.0 / 13.0);
    poly = (poly * t2) + V::set1(1.0 / 11.0);
    poly = (poly * t2) + V::set1(1.0 / 9.0);
    poly = (poly * t2) + V::set1(1.0 / 7.0);
    poly = (poly * t2) + V::set1(1.0 / 5.0);
    poly = (poly * t2) + V::set1(1.0 / 3.0);

    // Convert ln(m) to log2(m) by folding in log2(e).
    let log2_mantissa = V::set1(2.0 * core::f64::consts::LOG2_E) * t * ((poly * t2) + one);
    let fast = exponent + log2_mantissa;

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::log2_u35_f64)
}

#[inline(always)]
Expand All @@ -32,7 +144,34 @@ where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::exp2_u35_f64)
let finite_mask = input.cmp_eq(input).bitcast_i64();
let in_lower_bound = input.cmp_gte(V::set1(-1022.0)).bitcast_i64();
let in_upper_bound = input.cmp_lte(V::set1(1023.0)).bitcast_i64();
let fast_mask = finite_mask & in_lower_bound & in_upper_bound;
let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

let integral = input.round().cast_i64();
let fractional = input - integral.cast_f64();
let reduced = fractional * V::set1(core::f64::consts::LN_2);

let mut poly = V::set1(1.0 / 479_001_600.0);
poly = (poly * reduced) + V::set1(1.0 / 39_916_800.0);
poly = (poly * reduced) + V::set1(1.0 / 3_628_800.0);
poly = (poly * reduced) + V::set1(1.0 / 362_880.0);
poly = (poly * reduced) + V::set1(1.0 / 40_320.0);
poly = (poly * reduced) + V::set1(1.0 / 5_040.0);
poly = (poly * reduced) + V::set1(1.0 / 720.0);
poly = (poly * reduced) + V::set1(1.0 / 120.0);
poly = (poly * reduced) + V::set1(1.0 / 24.0);
poly = (poly * reduced) + V::set1(1.0 / 6.0);
poly = (poly * reduced) + V::set1(0.5);

let exp_reduced = (poly * reduced * reduced) + reduced + V::set1(1.0);
let scale_bits = (integral + SimdI64::<V>::set1(1023)).shl(52);
let scale = scale_bits.bitcast_f64();
let fast = exp_reduced * scale;

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::exp2_u35_f64)
}

#[inline(always)]
Expand All @@ -41,15 +180,81 @@ where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::ln_u35_f64)
let bits = input.bitcast_i64();
let exponent_bits = bits & F64_EXPONENT_MASK;
let mantissa_bits = bits & F64_MANTISSA_MASK;

let exceptional_mask = log2_exceptional_mask(input);

let exponent =
(exponent_bits.shr(52) - SimdI64::<V>::set1(F64_EXPONENT_BIAS_ADJUST)).cast_f64();
let normalized_mantissa = (mantissa_bits | F64_LOG_NORM_MANTISSA).bitcast_f64();

let one = V::set1(1.0);
let sqrt_half = V::set1(core::f64::consts::FRAC_1_SQRT_2);

let adjust_mask = normalized_mantissa.cmp_lt(sqrt_half);
let exponent = exponent - adjust_mask.blendv(V::zeroes(), one);
let mantissa = adjust_mask.blendv(
normalized_mantissa,
normalized_mantissa + normalized_mantissa,
);

let t = (mantissa - one) / (mantissa + one);
let t2 = t * t;

let mut poly = V::set1(1.0 / 19.0);
poly = (poly * t2) + V::set1(1.0 / 17.0);
poly = (poly * t2) + V::set1(1.0 / 15.0);
poly = (poly * t2) + V::set1(1.0 / 13.0);
poly = (poly * t2) + V::set1(1.0 / 11.0);
poly = (poly * t2) + V::set1(1.0 / 9.0);
poly = (poly * t2) + V::set1(1.0 / 7.0);
poly = (poly * t2) + V::set1(1.0 / 5.0);
poly = (poly * t2) + V::set1(1.0 / 3.0);

let ln_mantissa = V::set1(2.0) * t * ((poly * t2) + one);
let fast = exponent * V::set1(core::f64::consts::LN_2) + ln_mantissa;

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::ln_u35_f64)
}

#[inline(always)]
/// Portable SIMD natural exponential with the relaxed u35 error budget.
///
/// Fast path (finite inputs in [-708, 709], which keeps 2^n representable):
/// Cody–Waite range reduction x = n*ln2 + r using the split constants
/// `F64_EXP_LN2_HI` / `F64_EXP_LN2_LO`, a degree-12 Taylor polynomial for
/// e^r, then scaling by 2^n built directly from exponent bits. Lanes outside
/// the fast range (including NaN, flagged via the self-inequality compare)
/// are patched with the scalar reference `scalar::exp_u35_f64`.
//
// NOTE: this body is the post-change version only; the pre-change
// `map::unary_f64(input, scalar::exp_u35_f64)` line that the diff paste
// left interleaved here has been removed — keeping both made the span
// invalid Rust.
pub(crate) fn exp_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    // NaN lanes fail `input == input`, so `finite_mask` drops them.
    let finite_mask = input.cmp_eq(input).bitcast_i64();
    let in_lower_bound = input.cmp_gte(V::set1(-708.0)).bitcast_i64();
    let in_upper_bound = input.cmp_lte(V::set1(709.0)).bitcast_i64();
    let fast_mask = finite_mask & in_lower_bound & in_upper_bound;
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    // n = round(x * log2(e)); r = x - n*ln2 with the two-part constant so
    // the subtraction keeps extra bits of precision.
    let scaled = input * V::set1(core::f64::consts::LOG2_E);
    let integral = scaled.round().cast_i64();
    let integral_f = integral.cast_f64();
    let reduced =
        (input - integral_f * V::set1(F64_EXP_LN2_HI)) - integral_f * V::set1(F64_EXP_LN2_LO);

    // Horner evaluation of 1/12! .. 1/2! for the Taylor tail of e^r.
    let mut poly = V::set1(1.0 / 479_001_600.0);
    poly = (poly * reduced) + V::set1(1.0 / 39_916_800.0);
    poly = (poly * reduced) + V::set1(1.0 / 3_628_800.0);
    poly = (poly * reduced) + V::set1(1.0 / 362_880.0);
    poly = (poly * reduced) + V::set1(1.0 / 40_320.0);
    poly = (poly * reduced) + V::set1(1.0 / 5_040.0);
    poly = (poly * reduced) + V::set1(1.0 / 720.0);
    poly = (poly * reduced) + V::set1(1.0 / 120.0);
    poly = (poly * reduced) + V::set1(1.0 / 24.0);
    poly = (poly * reduced) + V::set1(1.0 / 6.0);
    poly = (poly * reduced) + V::set1(0.5);

    // e^r = 1 + r + r^2 * (1/2! + r/3! + ...).
    let exp_reduced = (poly * reduced * reduced) + reduced + V::set1(1.0);
    // 2^n assembled from the biased exponent field; n stays in a range where
    // the biased value is a normal, non-zero exponent given the input bounds.
    let scale_bits = (integral + SimdI64::<V>::set1(1023)).shl(52);
    let scale = scale_bits.bitcast_f64();
    let fast = exp_reduced * scale;

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::exp_u35_f64)
}

#[inline(always)]
Expand Down
6 changes: 3 additions & 3 deletions src/math/f64/inverse_hyperbolic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
// Function(s): f64 asinh_u35
// Why kept:
// - local benches show the current hybrid path materially ahead of native scalar
// - the fast path still depends on scalar-reference ln_u35, so this is not a full SIMD keep
// - the fast path still uses an explicit scalar-lane ln step to preserve the stricter 1-ULP contract
// Revisit when:
// - f64 ln_u35 stops being scalar-reference or asinh gets its own cheaper core
// - asinh gets its own cheaper core or can safely absorb the relaxed portable ln_u35 error budget

// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
// Function(s): f64 acosh_u35 / atanh_u35
Expand Down Expand Up @@ -82,7 +82,7 @@ where
finite_mask.cmp_eq(SimdI64::<V>::zeroes()) | tiny_mask | large_mask | zero_mask;

let radicand = (abs_x * abs_x) + V::set1(1.0);
let magnitude = f64::ln_u35(abs_x + radicand.sqrt());
let magnitude = map::unary_f64(abs_x + radicand.sqrt(), scalar::ln_u35_f64);
let negative_mask = input.cmp_lt(V::zeroes());
let fast = negative_mask.blendv(magnitude, -magnitude);

Expand Down
6 changes: 3 additions & 3 deletions src/math/f64/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
//! f64 SIMD math dispatch layering:
//! - family-local modules own the public internal routing points for each math family.
//! - current decisions are intentionally mixed:
//! scalar-reference for core/trig and hyperbolic defaults,
//! portable SIMD for inverse trig and several binary-misc kernels,
//! and hybrid paths where scalar sub-ops still underpin the fast path.
//! portable SIMD for the revived core log/exp family, inverse trig, and binary misc,
//! scalar-reference for trig and the losing hyperbolic defaults,
//! and hybrid paths where a stricter scalar sub-op still underpins the fast path.
//! - follow-up optimization work can still replace one family module at a time.

mod binary_misc;
Expand Down
4 changes: 2 additions & 2 deletions src/math/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
//! `sinh_u35` / `cosh_u35` / `tanh_u35` now use family-local portable SIMD
//! kernels with centralized scalar patching for exceptional lanes.
//! The stabilized `f64` map is intentionally mixed:
//! scalar-reference for the current losing core/trig and hyperbolic families,
//! portable SIMD for inverse trig and several binary-misc kernels,
//! portable SIMD for the revived core log/exp family, inverse trig, and several binary-misc kernels,
//! scalar-reference for the current losing trig and hyperbolic families,
//! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops.
//!
//! Structure notes:
Expand Down
26 changes: 24 additions & 2 deletions src/tests/simd_math_targeted_edges/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ simd_math_targeted_all_backends!(
);

fn run_f64_log_exp_boundary_lanes<S: Simd>() {
let inputs_log = vec![
let mut inputs_log = vec![
f64::from_bits(1),
f64::MIN_POSITIVE,
0.5,
Expand All @@ -273,6 +273,13 @@ fn run_f64_log_exp_boundary_lanes<S: Simd>() {
-0.0,
];

for &scale in &[0.5f64, 1.0, 2.0, 8.0, 1024.0] {
let pivot = std::f64::consts::FRAC_1_SQRT_2 * scale;
inputs_log.push(f64::from_bits(pivot.to_bits() - 1));
inputs_log.push(pivot);
inputs_log.push(f64::from_bits(pivot.to_bits() + 1));
}

check_targeted_unary_f64::<S>(
"log2_u35",
&inputs_log,
Expand All @@ -288,7 +295,7 @@ fn run_f64_log_exp_boundary_lanes<S: Simd>() {
f64::ln,
);

let inputs_exp = vec![
let mut inputs_exp = vec![
-1022.0,
-1021.75,
-10.0,
Expand All @@ -305,6 +312,21 @@ fn run_f64_log_exp_boundary_lanes<S: Simd>() {
f64::NAN,
];

for k in -4..=4 {
let center = k as f64;
inputs_exp.push(center - 1.0 / 4096.0);
inputs_exp.push(center);
inputs_exp.push(center + 1.0 / 4096.0);
}

for &center in &[
-1022.0f64, -1021.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1022.0, 1023.0,
] {
inputs_exp.push(f64::from_bits(center.to_bits().saturating_sub(1)));
inputs_exp.push(center);
inputs_exp.push(f64::from_bits(center.to_bits().saturating_add(1)));
}

check_targeted_unary_f64::<S>(
"exp2_u35",
&inputs_exp,
Expand Down
Loading