From 6dacdd406b7a225ee188a9f7a021c49465a71c54 Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Mon, 16 Mar 2026 12:35:22 -0700 Subject: [PATCH] speedup f32 horizontal_add --- src/ops/f32.rs | 24 +++++++++++++----------- src/ops/f64.rs | 2 ++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/ops/f32.rs b/src/ops/f32.rs index 0546114..cfbc96a 100644 --- a/src/ops/f32.rs +++ b/src/ops/f32.rs @@ -675,19 +675,21 @@ impl_op! { impl_op! { fn horizontal_add { for Avx2(a: __m256) -> f32 { - let a = _mm256_hadd_ps(a, a); - let b = _mm256_hadd_ps(a, a); - - let first = _mm_cvtss_f32(_mm256_extractf128_ps(b, 0)); - let second = _mm_cvtss_f32(_mm256_extractf128_ps(b, 1)); - - first + second + // benches show shuffle + add is ~20% faster than hadd for 8-wide vectors + let hi128 = _mm256_extractf128_ps(a, 1); + let lo128 = _mm256_castps256_ps128(a); + let sum128 = _mm_add_ps(lo128, hi128); + let shuf = _mm_movehdup_ps(sum128); + let sums = _mm_add_ps(sum128, shuf); + let shuf = _mm_movehl_ps(sums, sums); + _mm_cvtss_f32(_mm_add_ss(sums, shuf)) } for Sse41(a: __m128) -> f32 { - let a = _mm_hadd_ps(a, a); - let b = _mm_hadd_ps(a, a); - - _mm_cvtss_f32(b) + // benches show shuffle + add is ~24% faster than hadd for 4-wide vectors + let shuf = _mm_movehdup_ps(a); + let sums = _mm_add_ps(a, shuf); + let shuf = _mm_movehl_ps(sums, sums); + _mm_cvtss_f32(_mm_add_ss(sums, shuf)) } for Sse2(a: __m128) -> f32 { let t1 = _mm_movehl_ps(a, a); diff --git a/src/ops/f64.rs b/src/ops/f64.rs index afcb969..de76b32 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -653,12 +653,14 @@ impl_op! { impl_op! { fn horizontal_add { for Avx2(a: __m256d) -> f64 { + // benches show no benefit to shuffle + add for 4-wide vectors. let a = _mm256_hadd_pd(a, a); let first = _mm_cvtsd_f64(_mm256_extractf128_pd(a, 0)); let second = _mm_cvtsd_f64(_mm256_extractf128_pd(a, 1)); first + second } for Sse41(a: __m128d) -> f64 { + // benches show no benefit to shuffle + add for 2-wide vectors. 
_mm_cvtsd_f64(_mm_hadd_pd(a, a)) } for Sse2(a: __m128d) -> f64 {