@@ -268,9 +268,9 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
268268 _mm_set1_ps (GGML_CPU_FP16_TO_FP32 (x0 ) * GGML_CPU_FP16_TO_FP32 (y0 )));
269269}
270270
271- static inline __m256 quad_mx_delta_float (const int8_t x0 , const float y0 , const int8_t x1 , const float y1 ) {
272- return _mm256_set_m128 (_mm_set1_ps (GGML_E8M0_TO_FP32_HALF (x1 ) * GGML_CPU_FP16_TO_FP32 (y1 )),
273- _mm_set1_ps (GGML_E8M0_TO_FP32_HALF (x0 ) * GGML_CPU_FP16_TO_FP32 (y0 )));
// Packs the combined scale factors of two blocks into one 256-bit float vector.
//
// For each block i in {0, 1} the scale is computed as
//     GGML_CPU_E8M0_TO_FP32_HALF(xi) * GGML_CPU_FP16_TO_FP32(yi)
// and broadcast across a 128-bit lane with _mm_set1_ps. The x1/y1 product is the
// first argument of _mm256_set_m128 and the x0/y0 product the second, so (per
// the intrinsic's hi/lo argument order) block 1 fills the upper four floats and
// block 0 the lower four.
//
// x0, x1: bytes handed to GGML_CPU_E8M0_TO_FP32_HALF; typed uint8_t, so the
//         macro sees the unsigned byte value (0..255).
//         NOTE(review): the macro is defined elsewhere -- presumably it decodes
//         an E8M0 exponent into half of the fp32 scale; confirm at its definition.
// y0, y1: values handed to GGML_CPU_FP16_TO_FP32.
//         NOTE(review): declared `float` although they feed an FP16->FP32
//         conversion; confirm what callers pass and that the implicit
//         conversions on the way in are intended.
271+ static inline __m256 quad_mx_delta_float (const uint8_t x0 , const float y0 , const uint8_t x1 , const float y1 ) {
272+ return _mm256_set_m128 (_mm_set1_ps (GGML_CPU_E8M0_TO_FP32_HALF (x1 ) * GGML_CPU_FP16_TO_FP32 (y1 )),
273+ _mm_set1_ps (GGML_CPU_E8M0_TO_FP32_HALF (x0 ) * GGML_CPU_FP16_TO_FP32 (y0 )));
274274}
275275#endif
276276#elif defined(__SSSE3__ )
@@ -782,6 +782,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
782782
783783 __m256 accum1 = _mm256_setzero_ps ();
784784 __m256 accum2 = _mm256_setzero_ps ();
785+
785786 for (; ib + 1 < nb ; ib += 2 ) {
786787 const __m128i q4bits_1 = _mm_loadu_si128 ((const __m128i * )x [ib + 0 ].qs );
787788 const __m128i q4bits_2 = _mm_loadu_si128 ((const __m128i * )x [ib + 1 ].qs );
@@ -795,10 +796,10 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
795796 const __m256i p16_2 = mul_add_epi8 (q4b_2 , q8b_2 );
796797 const __m256i p_1 = _mm256_madd_epi16 (p16_1 , mone );
797798 const __m256i p_2 = _mm256_madd_epi16 (p16_2 , mone );
798- accum1 = _mm256_fmadd_ps ( _mm256_set1_ps (GGML_CPU_FP16_TO_FP32 (y [ib + 0 ].d )* GGML_E8M0_TO_FP32_HALF (x [ib + 0 ].e )),
799- _mm256_cvtepi32_ps ( p_1 ), accum1 );
800- accum2 = _mm256_fmadd_ps (_mm256_set1_ps ( GGML_CPU_FP16_TO_FP32 ( y [ ib + 1 ]. d ) * GGML_E8M0_TO_FP32_HALF ( x [ ib + 1 ]. e )),
801- _mm256_cvtepi32_ps (p_2 ), accum2 );
799+ const __m256 scale0 = _mm256_set1_ps (GGML_CPU_FP16_TO_FP32 (y [ib + 0 ].d )* GGML_CPU_E8M0_TO_FP32_HALF (x [ib + 0 ].e ));
800+ const __m256 scale1 = _mm256_set1_ps ( GGML_CPU_FP16_TO_FP32 ( y [ ib + 1 ]. d ) * GGML_CPU_E8M0_TO_FP32_HALF ( x [ ib + 1 ]. e ) );
801+ accum1 = _mm256_fmadd_ps (scale0 , _mm256_cvtepi32_ps ( p_1 ), accum1 );
802+ accum2 = _mm256_fmadd_ps ( scale1 , _mm256_cvtepi32_ps (p_2 ), accum2 );
802803 }
803804
804805 sumf = hsum_float_8 (_mm256_add_ps (accum1 , accum2 ));
@@ -830,7 +831,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
830831
831832#endif
832833 for (; ib < nb ; ++ ib ) {
833- const float d = GGML_CPU_FP16_TO_FP32 (y [ib ].d )* GGML_E8M0_TO_FP32_HALF (x [ib ].e );
834+ const float d = GGML_CPU_FP16_TO_FP32 (y [ib ].d )* GGML_CPU_E8M0_TO_FP32_HALF (x [ib ].e );
834835 int sumi1 = 0 ;
835836 int sumi2 = 0 ;
836837 for (int j = 0 ; j < QK_MXFP4 /2 ; ++ j ) {
@@ -3817,4 +3818,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
38173818 ggml_vec_dot_iq4_xs_q8_K_generic (n , s , bs , vx , bx , vy , by , nrc );
38183819#endif
38193820}
3820-
0 commit comments