diff --git a/src/ops/f64.rs b/src/ops/f64.rs index afcb969..d0ac433 100644 --- a/src/ops/f64.rs +++ b/src/ops/f64.rs @@ -683,22 +683,17 @@ impl_op! { impl_op! { fn cast_i64 { for Avx2(a: __m256d) -> __m256i { - let nums_arr = core::mem::transmute::<__m256d, [f64; 4]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - nums_arr[2].m_round() as i64, - nums_arr[3].m_round() as i64, - ]; - core::mem::transmute::<_, __m256i>(ceil) + // Round in SIMD, then extract for i64 conversion (no native cvtpd_epi64 before AVX-512) + let rounded = Self::round(a); + let arr = core::mem::transmute::<__m256d, [f64; 4]>(rounded); + let result = [arr[0] as i64, arr[1] as i64, arr[2] as i64, arr[3] as i64]; + core::mem::transmute::<_, __m256i>(result) } for Sse41(a: __m128d) -> __m128i { - let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - ]; - core::mem::transmute::<_, __m128i>(ceil) + let rounded = Self::round(a); + let arr = core::mem::transmute::<__m128d, [f64; 2]>(rounded); + let result = [arr[0] as i64, arr[1] as i64]; + core::mem::transmute::<_, __m128i>(result) } for Sse2(a: __m128d) -> __m128i { let nums_arr = core::mem::transmute::<__m128d, [f64; 2]>(a); @@ -712,20 +707,14 @@ impl_op! 
{ a.m_round() as i64 } for Neon(a: float64x2_t) -> int64x2_t { - let nums_arr = core::mem::transmute::<float64x2_t, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - ]; - core::mem::transmute::<_, int64x2_t>(ceil) + let rounded = vrndnq_f64(a); + vcvtq_s64_f64(rounded) } for Wasm(a: v128) -> v128 { - let nums_arr = core::mem::transmute::<_, [f64; 2]>(a); - let ceil = [ - nums_arr[0].m_round() as i64, - nums_arr[1].m_round() as i64, - ]; - core::mem::transmute::<_, v128>(ceil) + let rounded = f64x2_nearest(a); + let arr = core::mem::transmute::<_, [f64; 2]>(rounded); + let result = [arr[0] as i64, arr[1] as i64]; + core::mem::transmute::<_, v128>(result) } } }