Skip to content

Commit 2ceda3f

Browse files
authored
ggml-cpu: use LUT for converting e8->f32 scales on x86 (#19288)
* ggml-cpu: use LUT for converting e8->f32 scales on x86 * add dispatch based on macro
1 parent 44008ce commit 2ceda3f

3 files changed

Lines changed: 28 additions & 9 deletions

File tree

ggml/src/ggml-cpu/arch/x86/quants.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,9 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
268268
_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
269269
}
270270

271-
static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
272-
return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
273-
_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
271+
static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const uint8_t x1, const float y1) {
272+
return _mm256_set_m128(_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
273+
_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
274274
}
275275
#endif
276276
#elif defined(__SSSE3__)
@@ -782,6 +782,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
782782

783783
__m256 accum1 = _mm256_setzero_ps();
784784
__m256 accum2 = _mm256_setzero_ps();
785+
785786
for (; ib + 1 < nb; ib += 2) {
786787
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
787788
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
@@ -795,10 +796,10 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
795796
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
796797
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
797798
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
798-
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
799-
_mm256_cvtepi32_ps(p_1), accum1);
800-
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
801-
_mm256_cvtepi32_ps(p_2), accum2);
799+
const __m256 scale0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 0].e));
800+
const __m256 scale1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 1].e));
801+
accum1 = _mm256_fmadd_ps(scale0, _mm256_cvtepi32_ps(p_1), accum1);
802+
accum2 = _mm256_fmadd_ps(scale1, _mm256_cvtepi32_ps(p_2), accum2);
802803
}
803804

804805
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
@@ -830,7 +831,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
830831

831832
#endif
832833
for (; ib < nb; ++ib) {
833-
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
834+
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib].e);
834835
int sumi1 = 0;
835836
int sumi2 = 0;
836837
for (int j = 0; j < QK_MXFP4/2; ++j) {
@@ -3817,4 +3818,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
38173818
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
38183819
#endif
38193820
}
3820-

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@
7575
// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
7676
float ggml_table_f32_f16[1 << 16];
7777

78+
// precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h)
79+
float ggml_table_f32_e8m0_half[1 << 8];
80+
7881
#if defined(__ARM_ARCH)
7982
struct ggml_arm_arch_features_type {
8083
int sve_cnt;
@@ -3681,6 +3684,11 @@ void ggml_cpu_init(void) {
36813684
ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
36823685
}
36833686

3687+
// initialize E8M0 half table (256 entries)
3688+
for (int i = 0; i < (1 << 8); ++i) {
3689+
ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i);
3690+
}
3691+
36843692
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
36853693

36863694
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,17 @@ extern "C" {
116116
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
117117
extern float ggml_table_f32_f16[1 << 16];
118118

119+
// precomputed f32 table for e8m0 half (1 KB)
120+
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
121+
extern float ggml_table_f32_e8m0_half[1 << 8];
122+
123+
// Use lookup table for E8M0 on x86 (faster than bit manipulation)
124+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
125+
#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
126+
#else
127+
#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
128+
#endif
129+
119130
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
120131
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
121132
// This is also true for POWER9.

0 commit comments

Comments
 (0)