Skip to content

Commit ba0e521

Browse files
committed
fix: Q1_0_g128 x86 CPU kernel - correct output + AVX2/AVX-512 VNNI
The Q1_0_g128 vec_dot kernel for x86 produces garbage output due to a float-to-int truncation bug: `sumi += d1 * sumi_block` accumulates a float product into an int, silently truncating the result to zero for small scale factors. This affects both the generic scalar fallback and the x86 arch-specific implementation. The ARM NEON implementation was correct and unaffected.

Changes:
- Fix generic scalar kernel (quants.c): accumulate `d0 * d1 * sumi` into float, matching the working ARM scalar fallback pattern
- Replace x86 scalar-only kernel with three-tier implementation:
  1. AVX-512 VNNI (BW+VL+VNNI): uses mask registers for single-instruction bit expansion + VPDPBUSD for dot product
  2. AVX2: shuffle-based bit expansion + sign_epi8 multiply
  3. Scalar fallback: corrected accumulation

Benchmarks on AMD EPYC (Zen 4, 12 vCPU shared):
- Before (broken): garbage output at ~0.5 tok/s
- Scalar fix: correct output at ~3 tok/s
- AVX2: correct output at ~28 tok/s
- AVX-512 VNNI: correct output at ~50 tok/s (1.7B model)
1 parent 1179bfc commit ba0e521

File tree

2 files changed

+111
-50
lines changed

2 files changed

+111
-50
lines changed

ggml/src/ggml-cpu/arch/x86/quants.c

Lines changed: 99 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -662,39 +662,113 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
662662
const block_q1_0_g128 * GGML_RESTRICT x = vx;
663663
const block_q8_0 * GGML_RESTRICT y = vy;
664664

665-
float sumf = 0;
665+
float sumf = 0.0f;
666+
667+
#if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__)
668+
// AVX-512 VNNI path: mask registers for bit expansion + VNNI dot product
669+
const __m256i ones_u8 = _mm256_set1_epi8(1);
666670

667-
// Each Q1_0_g128 block has 128 elements
668-
// Each Q8_0 block has 32 elements
669-
// So we need 4 Q8_0 blocks per Q1_0_g128 block
670671
for (int ib = 0; ib < nb; ++ib) {
671672
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
672-
673-
int sumi = 0;
674-
675-
// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
673+
676674
for (int k = 0; k < 4; k++) {
677675
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
678-
679-
int sumi_block = 0;
680-
681-
for (int j = 0; j < QK8_0; j++) {
682-
const int bit_index = k * QK8_0 + j;
683-
const int byte_index = bit_index / 8;
684-
const int bit_offset = bit_index % 8;
685-
686-
// Extract bit: 1 = +1, 0 = -1
687-
const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
688-
const int yi = y[ib*4 + k].qs[j];
689-
690-
sumi_block += xi * yi;
676+
677+
// Load 32 bits of weights directly as a mask register
678+
const __mmask32 bmask = (__mmask32)(*(const uint32_t *)(x[ib].qs + k * 4));
679+
680+
// Load 32 int8 activations
681+
const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
682+
683+
// Sum ALL q8 values using VNNI (groups of 4 int8 -> int32)
684+
const __m256i sum_all = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, q8);
685+
686+
// Zero out q8 where bit=0, keep where bit=1 (single instruction)
687+
const __m256i masked_q8 = _mm256_maskz_mov_epi8(bmask, q8);
688+
689+
// Sum MASKED q8 values using VNNI
690+
const __m256i sum_masked = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, masked_q8);
691+
692+
// dot = 2 * sum_masked - sum_all
693+
// (weight = 2*bit - 1, so dot = sum((2*bit-1)*q8) = 2*sum(q8 where bit=1) - sum(q8))
694+
const __m256i dp = _mm256_sub_epi32(_mm256_slli_epi32(sum_masked, 1), sum_all);
695+
696+
// Horizontal sum of 8 int32 values
697+
const __m128i lo = _mm256_castsi256_si128(dp);
698+
const __m128i hi = _mm256_extracti128_si256(dp, 1);
699+
__m128i r = _mm_add_epi32(lo, hi);
700+
r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
701+
r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
702+
703+
sumf += d0 * d1 * (float)_mm_cvtsi128_si32(r);
704+
}
705+
}
706+
707+
#elif defined(__AVX2__)
708+
// AVX2 path: shuffle-based bit expansion + sign multiply
709+
const __m256i shuf = _mm256_setr_epi8(
710+
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
711+
2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
712+
const __m256i bmask = _mm256_set1_epi64x(0x8040201008040201LL);
713+
const __m256i ones8 = _mm256_set1_epi8(1);
714+
const __m256i neg8 = _mm256_set1_epi8(-1);
715+
const __m256i ones16 = _mm256_set1_epi16(1);
716+
717+
for (int ib = 0; ib < nb; ++ib) {
718+
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
719+
720+
for (int k = 0; k < 4; k++) {
721+
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
722+
723+
// Broadcast 4 bytes of 1-bit weights, expand to per-byte mask
724+
__m256i vb = _mm256_set1_epi32(*(const int32_t *)(x[ib].qs + k * 4));
725+
__m256i ex = _mm256_shuffle_epi8(vb, shuf);
726+
ex = _mm256_cmpeq_epi8(_mm256_and_si256(ex, bmask), bmask);
727+
728+
// Convert mask to +1/-1
729+
const __m256i xi = _mm256_blendv_epi8(neg8, ones8, ex);
730+
731+
// Multiply: sign_epi8(q8, xi) = q8 * sign(xi)
732+
const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
733+
const __m256i prod = _mm256_sign_epi8(q8, xi);
734+
735+
// Horizontal sum of 32 int8 -> int32
736+
const __m256i p16_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(prod));
737+
const __m256i p16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(prod, 1));
738+
const __m256i s32_lo = _mm256_madd_epi16(p16_lo, ones16);
739+
const __m256i s32_hi = _mm256_madd_epi16(p16_hi, ones16);
740+
const __m256i s32 = _mm256_add_epi32(s32_lo, s32_hi);
741+
742+
const __m128i lo = _mm256_castsi256_si128(s32);
743+
const __m128i hi = _mm256_extracti128_si256(s32, 1);
744+
__m128i r = _mm_add_epi32(lo, hi);
745+
r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
746+
r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
747+
748+
sumf += d0 * d1 * (float)_mm_cvtsi128_si32(r);
749+
}
750+
}
751+
752+
#else
753+
// Scalar fallback
754+
for (int ib = 0; ib < nb; ++ib) {
755+
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
756+
757+
for (int k = 0; k < 4; k++) {
758+
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
759+
const uint8_t * bits = x[ib].qs + k * 4;
760+
const int8_t * q8 = y[ib*4 + k].qs;
761+
762+
int sumi = 0;
763+
for (int j = 0; j < 32; j++) {
764+
const int bit = (bits[j >> 3] >> (j & 7)) & 1;
765+
sumi += (2*bit - 1) * q8[j];
691766
}
692-
693-
sumi += d1 * sumi_block;
767+
768+
sumf += d0 * d1 * (float)sumi;
694769
}
695-
696-
sumf += d0 * sumi;
697770
}
771+
#endif
698772

699773
*s = sumf;
700774
}

ggml/src/ggml-cpu/quants.c

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -176,39 +176,26 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
176176
const block_q8_0 * GGML_RESTRICT y = vy;
177177

178178

179-
float sumf = 0.0;
180-
181-
// Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements
182-
// So we need 4 Q8_0 blocks per Q1_0_g128 block
179+
float sumf = 0.0f;
180+
183181
for (int i = 0; i < nb; i++) {
184182
const float d0 = GGML_FP16_TO_FP32(x[i].d);
185-
186-
int sumi = 0;
187-
188-
// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
183+
189184
for (int k = 0; k < 4; k++) {
190185
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
191-
192-
int sumi_block = 0;
193-
186+
const uint8_t * bits = x[i].qs + k * 4;
187+
const int8_t * q8 = y[i*4 + k].qs;
188+
189+
int sumi = 0;
194190
for (int j = 0; j < QK8_0; j++) {
195-
const int bit_index = k * QK8_0 + j;
196-
const int byte_index = bit_index / 8;
197-
const int bit_offset = bit_index % 8;
198-
199-
// Extract bit: 1 = +1, 0 = -1
200-
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
201-
const int yi = y[i*4 + k].qs[j];
202-
203-
sumi_block += xi * yi;
191+
const int bit = (bits[j >> 3] >> (j & 7)) & 1;
192+
sumi += (2*bit - 1) * q8[j];
204193
}
205-
206-
sumi += d1 * sumi_block;
194+
195+
sumf += d0 * d1 * (float)sumi;
207196
}
208-
209-
sumf += d0 * sumi;
210197
}
211-
198+
212199
*s = sumf;
213200
}
214201

0 commit comments

Comments (0)