From fc78a7dbba9fe820100eab30007fdb37a2770666 Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Mon, 16 Mar 2026 11:12:07 -0700 Subject: [PATCH 1/2] simd floor and ceil for floats --- src/ops/f32.rs | 24 ++++++++---------------- src/ops/f64.rs | 20 ++++++++------------ 2 files changed, 16 insertions(+), 28 deletions(-) diff --git a/src/ops/f32.rs b/src/ops/f32.rs index 0546114..d9265fd 100644 --- a/src/ops/f32.rs +++ b/src/ops/f32.rs @@ -363,14 +363,10 @@ impl_op! { _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128) -> __m128 { - let nums_arr = core::mem::transmute::<__m128, [f32; 4]>(a); - let ceil = [ - nums_arr[0].m_floor(), - nums_arr[1].m_floor(), - nums_arr[2].m_floor(), - nums_arr[3].m_floor(), - ]; - core::mem::transmute::<[f32; 4], __m128>(ceil) + let rounded = Ops::<Sse2, f32>::round(a); + let mask = _mm_cmpgt_ps(rounded, a); + let one = _mm_and_ps(mask, _mm_set1_ps(1.0)); + _mm_sub_ps(rounded, one) } for Scalar(a: f32) -> f32 { a.m_floor() @@ -393,14 +389,10 @@ impl_op! { _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128) -> __m128 { - let nums_arr = core::mem::transmute::<__m128, [f32; 4]>(a); - let ceil = [ - nums_arr[0].m_ceil(), - nums_arr[1].m_ceil(), - nums_arr[2].m_ceil(), - nums_arr[3].m_ceil(), - ]; - core::mem::transmute::<[f32; 4], __m128>(ceil) + let rounded = Ops::<Sse2, f32>::round(a); + let mask = _mm_cmplt_ps(rounded, a); + let one = _mm_and_ps(mask, _mm_set1_ps(1.0)); + _mm_add_ps(rounded, one) } for Scalar(a: f32) -> f32 { a.m_ceil() diff --git a/src/ops/f64.rs b/src/ops/f64.rs index afcb969..b41ad69 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -345,12 +345,10 @@ impl_op! 
{ _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128d) -> __m128d { - let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_floor(), - nums_arr[1].m_floor(), - ]; - core::mem::transmute::<[f64; 2], __m128d>(ceil) + let rounded = Ops::<Sse2, f64>::round(a); + let mask = _mm_cmpgt_pd(rounded, a); + let one = _mm_and_pd(mask, _mm_set1_pd(1.0)); + _mm_sub_pd(rounded, one) } for Scalar(a: f64) -> f64 { a.m_floor() @@ -373,12 +371,10 @@ impl_op! { _mm_round_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128d) -> __m128d { - let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_ceil(), - nums_arr[1].m_ceil(), - ]; - core::mem::transmute::<[f64; 2], __m128d>(ceil) + let rounded = Ops::<Sse2, f64>::round(a); + let mask = _mm_cmplt_pd(rounded, a); + let one = _mm_and_pd(mask, _mm_set1_pd(1.0)); + _mm_add_pd(rounded, one) } for Scalar(a: f64) -> f64 { a.m_ceil() From ae384ba6b044ed24ecee7f6c003720140902b166 Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Tue, 17 Mar 2026 10:39:46 -0700 Subject: [PATCH 2/2] match self call pattern from fast_round --- src/ops/f32.rs | 4 ++-- src/ops/f64.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ops/f32.rs b/src/ops/f32.rs index d9265fd..2c3bdee 100644 --- a/src/ops/f32.rs +++ b/src/ops/f32.rs @@ -363,7 +363,7 @@ impl_op! { _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128) -> __m128 { - let rounded = Ops::<Sse2, f32>::round(a); + let rounded = Self::round(a); let mask = _mm_cmpgt_ps(rounded, a); let one = _mm_and_ps(mask, _mm_set1_ps(1.0)); _mm_sub_ps(rounded, one) @@ -389,7 +389,7 @@ impl_op! 
{ _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128) -> __m128 { - let rounded = Ops::<Sse2, f32>::round(a); + let rounded = Self::round(a); let mask = _mm_cmplt_ps(rounded, a); let one = _mm_and_ps(mask, _mm_set1_ps(1.0)); _mm_add_ps(rounded, one) diff --git a/src/ops/f64.rs b/src/ops/f64.rs index b41ad69..47483fb 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -345,7 +345,7 @@ impl_op! { _mm_round_pd(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128d) -> __m128d { - let rounded = Ops::<Sse2, f64>::round(a); + let rounded = Self::round(a); let mask = _mm_cmpgt_pd(rounded, a); let one = _mm_and_pd(mask, _mm_set1_pd(1.0)); _mm_sub_pd(rounded, one) @@ -371,7 +371,7 @@ impl_op! { _mm_round_pd(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) } for Sse2(a: __m128d) -> __m128d { - let rounded = Ops::<Sse2, f64>::round(a); + let rounded = Self::round(a); let mask = _mm_cmplt_pd(rounded, a); let one = _mm_and_pd(mask, _mm_set1_pd(1.0)); _mm_add_pd(rounded, one)