From b156b48e1899e3f29512cd21e830ba392c8ec5ca Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Mon, 16 Mar 2026 15:05:42 -0700 Subject: [PATCH 1/2] better f64 cast to i64 --- src/ops/f64.rs | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/src/ops/f64.rs b/src/ops/f64.rs index afcb969..2c1db6d 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -683,22 +683,17 @@ impl_op! { impl_op! { fn cast_i64 { for Avx2(a: __m256d) -> __m256i { - let nums_arr = core::mem::transmute::<__m256d, [f64; 4]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - nums_arr[2].m_round() as i64, - nums_arr[3].m_round() as i64, - ]; - core::mem::transmute::<_, __m256i>(ceil) + // Round in SIMD, then extract for i64 conversion (no native cvtpd_epi64 before AVX-512) + let rounded = _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let arr = core::mem::transmute::<__m256d, [f64; 4]>(rounded); + let result = [arr[0] as i64, arr[1] as i64, arr[2] as i64, arr[3] as i64]; + core::mem::transmute::<_, __m256i>(result) } for Sse41(a: __m128d) -> __m128i { - let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - ]; - core::mem::transmute::<_, __m128i>(ceil) + let rounded = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let arr = core::mem::transmute::<__m128d, [f64; 2]>(rounded); + let result = [arr[0] as i64, arr[1] as i64]; + core::mem::transmute::<_, __m128i>(result) } for Sse2(a: __m128d) -> __m128i { let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); @@ -712,20 +707,14 @@ impl_op! { a.m_round() as i64 } for Neon(a: float64x2_t) -> int64x2_t { - let nums_arr = core::mem::transmute::(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - ]; - core::mem::transmute::<_, int64x2_t>(ceil) + let rounded = vrndnq_f64(a); + vcvtq_s64_f64(rounded) } for Wasm(a: v128) -> v128 { - let nums_arr = core::mem::transmute::<_, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - ]; - core::mem::transmute::<_, v128>(ceil) + let rounded = f64x2_nearest(a); + let arr = core::mem::transmute::<_, [f64; 2]>(rounded); + let result = [arr[0] as i64, arr[1] as i64]; + core::mem::transmute::<_, v128>(result) } } } From bfd45750cb0e7998d689f0651749438813c5e2f5 Mon Sep 17 00:00:00 2001 From: Austin Orr Date: Tue, 17 Mar 2026 10:44:36 -0700 Subject: [PATCH 2/2] match self round call pattern --- src/ops/f64.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/f64.rs b/src/ops/f64.rs index 2c1db6d..d0ac433 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -684,13 +684,13 @@ impl_op! { fn cast_i64 { for Avx2(a: __m256d) -> __m256i { // Round in SIMD, then extract for i64 conversion (no native cvtpd_epi64 before AVX-512) - let rounded = _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let rounded = Self::round(a); let arr = core::mem::transmute::<__m256d, [f64; 4]>(rounded); let result = [arr[0] as i64, arr[1] as i64, arr[2] as i64, arr[3] as i64]; core::mem::transmute::<_, __m256i>(result) } for Sse41(a: __m128d) -> __m128i { - let rounded = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let rounded = Self::round(a); let arr = core::mem::transmute::<__m128d, [f64; 2]>(rounded); let result = [arr[0] as i64, arr[1] as i64]; core::mem::transmute::<_, __m128i>(result)