Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/math/f64/binary_misc.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use crate::math::{f64, scalar};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};

// DECISION(2026-03-23): KEEP_MIXED
// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 log10_u35
// Why kept:
// - local runtime-selected performance is clearly better than native scalar
// - the current implementation still rides scalar-reference log2_u35 underneath
// - the current implementation now rides the kept portable f64 log2_u35 core
// Revisit when:
// - f64 log2_u35 gets a new keep/revert outcome
// - f64 core log2_u35 changes materially or non-x86 evidence disagrees sharply

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 atan2_u35 / hypot_u35 / fmod
Expand Down
225 changes: 215 additions & 10 deletions src/math/f64/core.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
use crate::math::{map, scalar};
use crate::{Simd, SimdFloat64};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt, SimdInt64};

// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64;
const F64_MANTISSA_MASK: i64 = 0x000F_FFFF_FFFF_FFFFu64 as i64;
const F64_LOG_NORM_MANTISSA: i64 = 0x3FE0_0000_0000_0000u64 as i64;
const F64_EXPONENT_BIAS_ADJUST: i64 = 1022;
const F64_EXP_LN2_HI: f64 = 6.931_471_803_691_238e-1;
const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10;

// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
// Function(s): f64 log2_u35 / exp2_u35 / ln_u35 / exp_u35
// Why scalar:
// - local benches keep putting runtime-selected behavior at or below native scalar
// - family structure stays useful, but the current default is still scalar-reference
// Why kept:
// - the revived portable log/exp kernels now beat native scalar in local f64 core benches
// - exceptional lanes still patch back to scalar references without giving up the fast AVX2 path
// Revisit when:
// - a genuinely worthwhile f64 log/exp SIMD kernel exists
// - non-x86 evidence disagrees sharply or a cheaper approximation family appears

// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
// Function(s): f64 sin_u35 / cos_u35 / tan_u35
Expand All @@ -17,13 +26,116 @@ use crate::{Simd, SimdFloat64};
// Revisit when:
// - a stronger range-reduction strategy or cheaper trig kernel appears

#[inline(always)]
/// Returns `true` if any lane of the i64 mask vector is non-zero.
///
/// Used as a cheap pre-check before falling into the per-lane scalar
/// patching loop, so the common all-fast-lanes case stays branch-light.
fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    // SAFETY: `as_array` only reinterprets the vector as its per-lane array
    // representation; every index below is bounded by `V::WIDTH`.
    unsafe {
        let lane_values = mask.as_array();
        (0..V::WIDTH).any(|idx| lane_values[idx] != 0)
    }
}

#[inline(always)]
/// Replaces the lanes flagged in `exceptional_mask` with the result of the
/// strict scalar reference applied to the corresponding `input` lane.
///
/// Lanes with a zero mask keep the fast SIMD `output` value. When no lane is
/// flagged, `output` is returned untouched without any lane traffic.
fn patch_exceptional_lanes<V>(
    input: V,
    output: V,
    exceptional_mask: SimdI64<V>,
    scalar_fallback: fn(f64) -> f64,
) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    // Fast path: nothing flagged, hand the SIMD result straight back.
    if !any_lane_nonzero::<V>(exceptional_mask) {
        return output;
    }

    // SAFETY: the lane arrays come from `as_array`, all indexing is bounded
    // by `V::WIDTH`, and the final load reads exactly one full array
    // representation worth of f64 values.
    unsafe {
        let source_lanes = input.as_array();
        let flag_lanes = exceptional_mask.as_array();
        let mut patched = output.as_array();

        for idx in 0..V::WIDTH {
            if flag_lanes[idx] == 0 {
                continue;
            }
            // Recompute this lane with the scalar reference implementation.
            patched[idx] = scalar_fallback(source_lanes[idx]);
        }

        V::load_from_ptr_unaligned(&patched as *const V::ArrayRepresentation as *const f64)
    }
}

#[inline(always)]
/// Builds the mask of lanes the fast log kernels cannot handle:
/// non-positive values, zeroes/subnormals, and infinities/NaNs.
///
/// Flagged lanes are later recomputed by a scalar reference routine.
fn log2_exceptional_mask<V>(input: V) -> SimdI64<V>
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let raw_bits = input.bitcast_i64();
    let exponent_field = raw_bits & F64_EXPONENT_MASK;

    // Zero exponent field: the lane holds zero or a subnormal.
    let zero_or_subnormal = exponent_field.cmp_eq(SimdI64::<V>::zeroes());
    // All-ones exponent field: the lane holds an infinity or a NaN.
    let non_finite = exponent_field.cmp_eq(SimdI64::<V>::set1(F64_EXPONENT_MASK));
    // NOT(input > 0): negatives and zeroes, plus NaN lanes (any NaN compare
    // is false, so the negated compare flags them too).
    let not_strictly_positive = input
        .cmp_gt(V::zeroes())
        .bitcast_i64()
        .cmp_eq(SimdI64::<V>::zeroes());

    not_strictly_positive | zero_or_subnormal | non_finite
}

#[inline(always)]
/// Portable SIMD base-2 logarithm with the relaxed u35 error budget.
///
/// Fast path: split the f64 bit pattern into exponent and mantissa fields,
/// renormalize the mantissa toward 1 (into [1/sqrt(2), sqrt(2)) via the
/// sqrt-half pivot), and evaluate log2 of the mantissa through the atanh
/// series in t = (m - 1) / (m + 1). Lanes flagged by
/// `log2_exceptional_mask` (non-positive, subnormal/zero, inf/NaN) are
/// patched afterwards with the scalar reference `scalar::log2_u35_f64`.
//
// NOTE: this body is the post-change version only; the pre-change
// `map::unary_f64(input, scalar::log2_u35_f64)` line that the diff paste
// left interleaved here has been removed — keeping both made the span
// invalid Rust.
pub(crate) fn log2_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    let bits = input.bitcast_i64();
    let exponent_bits = bits & F64_EXPONENT_MASK;
    let mantissa_bits = bits & F64_MANTISSA_MASK;

    // Compute the slow-lane mask up front so the fast path can run
    // unconditionally and get patched at the end.
    let exceptional_mask = log2_exceptional_mask(input);

    // Unbiased exponent such that the normalized mantissa lands in [0.5, 1).
    let exponent =
        (exponent_bits.shr(52) - SimdI64::<V>::set1(F64_EXPONENT_BIAS_ADJUST)).cast_f64();
    let normalized_mantissa = (mantissa_bits | F64_LOG_NORM_MANTISSA).bitcast_f64();

    let one = V::set1(1.0);
    let sqrt_half = V::set1(core::f64::consts::FRAC_1_SQRT_2);

    // For mantissas below 1/sqrt(2), double the mantissa and drop the
    // exponent by one so the series argument stays centered near 1.
    let adjust_mask = normalized_mantissa.cmp_lt(sqrt_half);
    let exponent = exponent - adjust_mask.blendv(V::zeroes(), one);
    let mantissa = adjust_mask.blendv(
        normalized_mantissa,
        normalized_mantissa + normalized_mantissa,
    );

    // atanh-series argument: ln(m) = 2t * (1 + t^2/3 + t^4/5 + ...).
    let t = (mantissa - one) / (mantissa + one);
    let t2 = t * t;

    // Horner evaluation of the odd-reciprocal coefficients 1/19 .. 1/3.
    let mut poly = V::set1(1.0 / 19.0);
    poly = (poly * t2) + V::set1(1.0 / 17.0);
    poly = (poly * t2) + V::set1(1.0 / 15.0);
    poly = (poly * t2) + V::set1(1.0 / 13.0);
    poly = (poly * t2) + V::set1(1.0 / 11.0);
    poly = (poly * t2) + V::set1(1.0 / 9.0);
    poly = (poly * t2) + V::set1(1.0 / 7.0);
    poly = (poly * t2) + V::set1(1.0 / 5.0);
    poly = (poly * t2) + V::set1(1.0 / 3.0);

    // Convert ln(m) to log2(m) by folding in log2(e).
    let log2_mantissa = V::set1(2.0 * core::f64::consts::LOG2_E) * t * ((poly * t2) + one);
    let fast = exponent + log2_mantissa;

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::log2_u35_f64)
}

#[inline(always)]
Expand All @@ -32,7 +144,34 @@ where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::exp2_u35_f64)
let finite_mask = input.cmp_eq(input).bitcast_i64();
let in_lower_bound = input.cmp_gte(V::set1(-1022.0)).bitcast_i64();
let in_upper_bound = input.cmp_lte(V::set1(1023.0)).bitcast_i64();
let fast_mask = finite_mask & in_lower_bound & in_upper_bound;
let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

let integral = input.round().cast_i64();
let fractional = input - integral.cast_f64();
let reduced = fractional * V::set1(core::f64::consts::LN_2);

let mut poly = V::set1(1.0 / 479_001_600.0);
poly = (poly * reduced) + V::set1(1.0 / 39_916_800.0);
poly = (poly * reduced) + V::set1(1.0 / 3_628_800.0);
poly = (poly * reduced) + V::set1(1.0 / 362_880.0);
poly = (poly * reduced) + V::set1(1.0 / 40_320.0);
poly = (poly * reduced) + V::set1(1.0 / 5_040.0);
poly = (poly * reduced) + V::set1(1.0 / 720.0);
poly = (poly * reduced) + V::set1(1.0 / 120.0);
poly = (poly * reduced) + V::set1(1.0 / 24.0);
poly = (poly * reduced) + V::set1(1.0 / 6.0);
poly = (poly * reduced) + V::set1(0.5);

let exp_reduced = (poly * reduced * reduced) + reduced + V::set1(1.0);
let scale_bits = (integral + SimdI64::<V>::set1(1023)).shl(52);
let scale = scale_bits.bitcast_f64();
let fast = exp_reduced * scale;

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::exp2_u35_f64)
}

#[inline(always)]
Expand All @@ -41,15 +180,81 @@ where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::ln_u35_f64)
let bits = input.bitcast_i64();
let exponent_bits = bits & F64_EXPONENT_MASK;
let mantissa_bits = bits & F64_MANTISSA_MASK;

let exceptional_mask = log2_exceptional_mask(input);

let exponent =
(exponent_bits.shr(52) - SimdI64::<V>::set1(F64_EXPONENT_BIAS_ADJUST)).cast_f64();
let normalized_mantissa = (mantissa_bits | F64_LOG_NORM_MANTISSA).bitcast_f64();

let one = V::set1(1.0);
let sqrt_half = V::set1(core::f64::consts::FRAC_1_SQRT_2);

let adjust_mask = normalized_mantissa.cmp_lt(sqrt_half);
let exponent = exponent - adjust_mask.blendv(V::zeroes(), one);
let mantissa = adjust_mask.blendv(
normalized_mantissa,
normalized_mantissa + normalized_mantissa,
);

let t = (mantissa - one) / (mantissa + one);
let t2 = t * t;

let mut poly = V::set1(1.0 / 19.0);
poly = (poly * t2) + V::set1(1.0 / 17.0);
poly = (poly * t2) + V::set1(1.0 / 15.0);
poly = (poly * t2) + V::set1(1.0 / 13.0);
poly = (poly * t2) + V::set1(1.0 / 11.0);
poly = (poly * t2) + V::set1(1.0 / 9.0);
poly = (poly * t2) + V::set1(1.0 / 7.0);
poly = (poly * t2) + V::set1(1.0 / 5.0);
poly = (poly * t2) + V::set1(1.0 / 3.0);

let ln_mantissa = V::set1(2.0) * t * ((poly * t2) + one);
let fast = exponent * V::set1(core::f64::consts::LN_2) + ln_mantissa;

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::ln_u35_f64)
}

#[inline(always)]
/// Portable SIMD natural exponential with the relaxed u35 error budget.
///
/// Fast path (finite inputs in [-708, 709], which keeps 2^n representable):
/// Cody–Waite range reduction x = n*ln2 + r using the split constants
/// `F64_EXP_LN2_HI` / `F64_EXP_LN2_LO`, a degree-12 Taylor polynomial for
/// e^r, then scaling by 2^n built directly from exponent bits. Lanes outside
/// the fast range (including NaN, flagged via the self-inequality compare)
/// are patched with the scalar reference `scalar::exp_u35_f64`.
//
// NOTE: this body is the post-change version only; the pre-change
// `map::unary_f64(input, scalar::exp_u35_f64)` line that the diff paste
// left interleaved here has been removed — keeping both made the span
// invalid Rust.
pub(crate) fn exp_u35<V>(input: V) -> V
where
    V: SimdFloat64,
    V::Engine: Simd<Vf64 = V>,
{
    // NaN lanes fail `input == input`, so `finite_mask` drops them.
    let finite_mask = input.cmp_eq(input).bitcast_i64();
    let in_lower_bound = input.cmp_gte(V::set1(-708.0)).bitcast_i64();
    let in_upper_bound = input.cmp_lte(V::set1(709.0)).bitcast_i64();
    let fast_mask = finite_mask & in_lower_bound & in_upper_bound;
    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

    // n = round(x * log2(e)); r = x - n*ln2 with the two-part constant so
    // the subtraction keeps extra bits of precision.
    let scaled = input * V::set1(core::f64::consts::LOG2_E);
    let integral = scaled.round().cast_i64();
    let integral_f = integral.cast_f64();
    let reduced =
        (input - integral_f * V::set1(F64_EXP_LN2_HI)) - integral_f * V::set1(F64_EXP_LN2_LO);

    // Horner evaluation of 1/12! .. 1/2! for the Taylor tail of e^r.
    let mut poly = V::set1(1.0 / 479_001_600.0);
    poly = (poly * reduced) + V::set1(1.0 / 39_916_800.0);
    poly = (poly * reduced) + V::set1(1.0 / 3_628_800.0);
    poly = (poly * reduced) + V::set1(1.0 / 362_880.0);
    poly = (poly * reduced) + V::set1(1.0 / 40_320.0);
    poly = (poly * reduced) + V::set1(1.0 / 5_040.0);
    poly = (poly * reduced) + V::set1(1.0 / 720.0);
    poly = (poly * reduced) + V::set1(1.0 / 120.0);
    poly = (poly * reduced) + V::set1(1.0 / 24.0);
    poly = (poly * reduced) + V::set1(1.0 / 6.0);
    poly = (poly * reduced) + V::set1(0.5);

    // e^r = 1 + r + r^2 * (1/2! + r/3! + ...).
    let exp_reduced = (poly * reduced * reduced) + reduced + V::set1(1.0);
    // 2^n assembled from the biased exponent field; n stays in a range where
    // the biased value is a normal, non-zero exponent given the input bounds.
    let scale_bits = (integral + SimdI64::<V>::set1(1023)).shl(52);
    let scale = scale_bits.bitcast_f64();
    let fast = exp_reduced * scale;

    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::exp_u35_f64)
}

#[inline(always)]
Expand Down
6 changes: 3 additions & 3 deletions src/math/f64/inverse_hyperbolic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
// Function(s): f64 asinh_u35
// Why kept:
// - local benches show the current hybrid path materially ahead of native scalar
// - the fast path still depends on scalar-reference ln_u35, so this is not a full SIMD keep
// - the fast path still uses an explicit scalar-lane ln step to preserve the stricter 1-ULP contract
// Revisit when:
// - f64 ln_u35 stops being scalar-reference or asinh gets its own cheaper core
// - asinh gets its own cheaper core or can safely absorb the relaxed portable ln_u35 error budget

// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
// Function(s): f64 acosh_u35 / atanh_u35
Expand Down Expand Up @@ -82,7 +82,7 @@ where
finite_mask.cmp_eq(SimdI64::<V>::zeroes()) | tiny_mask | large_mask | zero_mask;

let radicand = (abs_x * abs_x) + V::set1(1.0);
let magnitude = f64::ln_u35(abs_x + radicand.sqrt());
let magnitude = map::unary_f64(abs_x + radicand.sqrt(), scalar::ln_u35_f64);
let negative_mask = input.cmp_lt(V::zeroes());
let fast = negative_mask.blendv(magnitude, -magnitude);

Expand Down
6 changes: 3 additions & 3 deletions src/math/f64/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
//! f64 SIMD math dispatch layering:
//! - family-local modules own the public internal routing points for each math family.
//! - current decisions are intentionally mixed:
//! scalar-reference for core/trig and hyperbolic defaults,
//! portable SIMD for inverse trig and several binary-misc kernels,
//! and hybrid paths where scalar sub-ops still underpin the fast path.
//! portable SIMD for the revived core log/exp family, inverse trig, and binary misc,
//! scalar-reference for trig and the losing hyperbolic defaults,
//! and hybrid paths where a stricter scalar sub-op still underpins the fast path.
//! - follow-up optimization work can still replace one family module at a time.

mod binary_misc;
Expand Down
4 changes: 2 additions & 2 deletions src/math/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
//! `sinh_u35` / `cosh_u35` / `tanh_u35` now use family-local portable SIMD
//! kernels with centralized scalar patching for exceptional lanes.
//! The stabilized `f64` map is intentionally mixed:
//! scalar-reference for the current losing core/trig and hyperbolic families,
//! portable SIMD for inverse trig and several binary-misc kernels,
//! portable SIMD for the revived core log/exp family, inverse trig, and several binary-misc kernels,
//! scalar-reference for the current losing trig and hyperbolic families,
//! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops.
//!
//! Structure notes:
Expand Down
26 changes: 24 additions & 2 deletions src/tests/simd_math_targeted_edges/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ simd_math_targeted_all_backends!(
);

fn run_f64_log_exp_boundary_lanes<S: Simd>() {
let inputs_log = vec![
let mut inputs_log = vec![
f64::from_bits(1),
f64::MIN_POSITIVE,
0.5,
Expand All @@ -273,6 +273,13 @@ fn run_f64_log_exp_boundary_lanes<S: Simd>() {
-0.0,
];

for &scale in &[0.5f64, 1.0, 2.0, 8.0, 1024.0] {
let pivot = std::f64::consts::FRAC_1_SQRT_2 * scale;
inputs_log.push(f64::from_bits(pivot.to_bits() - 1));
inputs_log.push(pivot);
inputs_log.push(f64::from_bits(pivot.to_bits() + 1));
}

check_targeted_unary_f64::<S>(
"log2_u35",
&inputs_log,
Expand All @@ -288,7 +295,7 @@ fn run_f64_log_exp_boundary_lanes<S: Simd>() {
f64::ln,
);

let inputs_exp = vec![
let mut inputs_exp = vec![
-1022.0,
-1021.75,
-10.0,
Expand All @@ -305,6 +312,21 @@ fn run_f64_log_exp_boundary_lanes<S: Simd>() {
f64::NAN,
];

for k in -4..=4 {
let center = k as f64;
inputs_exp.push(center - 1.0 / 4096.0);
inputs_exp.push(center);
inputs_exp.push(center + 1.0 / 4096.0);
}

for &center in &[
-1022.0f64, -1021.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1022.0, 1023.0,
] {
inputs_exp.push(f64::from_bits(center.to_bits().saturating_sub(1)));
inputs_exp.push(center);
inputs_exp.push(f64::from_bits(center.to_bits().saturating_add(1)));
}

check_targeted_unary_f64::<S>(
"exp2_u35",
&inputs_exp,
Expand Down
Loading