Skip to content

Commit ba0e521

Browse files
committed
fix: Q1_0_g128 x86 CPU kernel - correct output + AVX2/AVX-512 VNNI
The Q1_0_g128 vec_dot kernel for x86 produces garbage output due to a float-to-int truncation bug: `sumi += d1 * sumi_block` accumulates a float product into an int, silently truncating the result to zero for small scale factors. This affects both the generic scalar fallback and the x86 arch-specific implementation. The ARM NEON implementation was correct and unaffected.

Changes:
- Fix generic scalar kernel (quants.c): accumulate `d0 * d1 * sumi` into float, matching the working ARM scalar fallback pattern
- Replace x86 scalar-only kernel with three-tier implementation:
  1. AVX-512 VNNI (BW+VL+VNNI): uses mask registers for single-instruction bit expansion + VPDPBUSD for dot product
  2. AVX2: shuffle-based bit expansion + sign_epi8 multiply
  3. Scalar fallback: corrected accumulation

Benchmarks on AMD EPYC (Zen 4, 12 vCPU shared):
- Before (broken): garbage output at ~0.5 tok/s
- Scalar fix: correct output at ~3 tok/s
- AVX2: correct output at ~28 tok/s
- AVX-512 VNNI: correct output at ~50 tok/s (1.7B model)
1 parent 1179bfc commit ba0e521

File tree

2 files changed

+111
-50
lines changed

2 files changed

+111
-50
lines changed

ggml/src/ggml-cpu/arch/x86/quants.c

Lines changed: 99 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -662,39 +662,113 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
662662
const block_q1_0_g128 * GGML_RESTRICT x = vx;
663663
const block_q8_0 * GGML_RESTRICT y = vy;
664664

665-
float sumf = 0;
665+
float sumf = 0.0f;
666+
667+
#if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__)
668+
// AVX-512 VNNI path: mask registers for bit expansion + VNNI dot product
669+
const __m256i ones_u8 = _mm256_set1_epi8(1);
666670

667-
// Each Q1_0_g128 block has 128 elements
668-
// Each Q8_0 block has 32 elements
669-
// So we need 4 Q8_0 blocks per Q1_0_g128 block
670671
for (int ib = 0; ib < nb; ++ib) {
671672
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
672-
673-
int sumi = 0;
674-
675-
// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
673+
676674
for (int k = 0; k < 4; k++) {
677675
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
678-
679-
int sumi_block = 0;
680-
681-
for (int j = 0; j < QK8_0; j++) {
682-
const int bit_index = k * QK8_0 + j;
683-
const int byte_index = bit_index / 8;
684-
const int bit_offset = bit_index % 8;
685-
686-
// Extract bit: 1 = +1, 0 = -1
687-
const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
688-
const int yi = y[ib*4 + k].qs[j];
689-
690-
sumi_block += xi * yi;
676+
677+
// Load 32 bits of weights directly as a mask register
678+
const __mmask32 bmask = (__mmask32)(*(const uint32_t *)(x[ib].qs + k * 4));
679+
680+
// Load 32 int8 activations
681+
const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
682+
683+
// Sum ALL q8 values using VNNI (groups of 4 int8 -> int32)
684+
const __m256i sum_all = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, q8);
685+
686+
// Zero out q8 where bit=0, keep where bit=1 (single instruction)
687+
const __m256i masked_q8 = _mm256_maskz_mov_epi8(bmask, q8);
688+
689+
// Sum MASKED q8 values using VNNI
690+
const __m256i sum_masked = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, masked_q8);
691+
692+
// dot = 2 * sum_masked - sum_all
693+
// (weight = 2*bit - 1, so dot = sum((2*bit-1)*q8) = 2*sum(q8 where bit=1) - sum(q8))
694+
const __m256i dp = _mm256_sub_epi32(_mm256_slli_epi32(sum_masked, 1), sum_all);
695+
696+
// Horizontal sum of 8 int32 values
697+
const __m128i lo = _mm256_castsi256_si128(dp);
698+
const __m128i hi = _mm256_extracti128_si256(dp, 1);
699+
__m128i r = _mm_add_epi32(lo, hi);
700+
r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
701+
r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
702+
703+
sumf += d0 * d1 * (float)_mm_cvtsi128_si32(r);
704+
}
705+
}
706+
707+
#elif defined(__AVX2__)
708+
// AVX2 path: shuffle-based bit expansion + sign multiply
709+
const __m256i shuf = _mm256_setr_epi8(
710+
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
711+
2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
712+
const __m256i bmask = _mm256_set1_epi64x(0x8040201008040201LL);
713+
const __m256i ones8 = _mm256_set1_epi8(1);
714+
const __m256i neg8 = _mm256_set1_epi8(-1);
715+
const __m256i ones16 = _mm256_set1_epi16(1);
716+
717+
for (int ib = 0; ib < nb; ++ib) {
718+
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
719+
720+
for (int k = 0; k < 4; k++) {
721+
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
722+
723+
// Broadcast 4 bytes of 1-bit weights, expand to per-byte mask
724+
__m256i vb = _mm256_set1_epi32(*(const int32_t *)(x[ib].qs + k * 4));
725+
__m256i ex = _mm256_shuffle_epi8(vb, shuf);
726+
ex = _mm256_cmpeq_epi8(_mm256_and_si256(ex, bmask), bmask);
727+
728+
// Convert mask to +1/-1
729+
const __m256i xi = _mm256_blendv_epi8(neg8, ones8, ex);
730+
731+
// Multiply: sign_epi8(q8, xi) = q8 * sign(xi)
732+
const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
733+
const __m256i prod = _mm256_sign_epi8(q8, xi);
734+
735+
// Horizontal sum of 32 int8 -> int32
736+
const __m256i p16_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(prod));
737+
const __m256i p16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(prod, 1));
738+
const __m256i s32_lo = _mm256_madd_epi16(p16_lo, ones16);
739+
const __m256i s32_hi = _mm256_madd_epi16(p16_hi, ones16);
740+
const __m256i s32 = _mm256_add_epi32(s32_lo, s32_hi);
741+
742+
const __m128i lo = _mm256_castsi256_si128(s32);
743+
const __m128i hi = _mm256_extracti128_si256(s32, 1);
744+
__m128i r = _mm_add_epi32(lo, hi);
745+
r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
746+
r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
747+
748+
sumf += d0 * d1 * (float)_mm_cvtsi128_si32(r);
749+
}
750+
}
751+
752+
#else
753+
// Scalar fallback
754+
for (int ib = 0; ib < nb; ++ib) {
755+
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
756+
757+
for (int k = 0; k < 4; k++) {
758+
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
759+
const uint8_t * bits = x[ib].qs + k * 4;
760+
const int8_t * q8 = y[ib*4 + k].qs;
761+
762+
int sumi = 0;
763+
for (int j = 0; j < 32; j++) {
764+
const int bit = (bits[j >> 3] >> (j & 7)) & 1;
765+
sumi += (2*bit - 1) * q8[j];
691766
}
692-
693-
sumi += d1 * sumi_block;
767+
768+
sumf += d0 * d1 * (float)sumi;
694769
}
695-
696-
sumf += d0 * sumi;
697770
}
771+
#endif
698772

699773
*s = sumf;
700774
}

ggml/src/ggml-cpu/quants.c

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -176,39 +176,26 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
176176
const block_q8_0 * GGML_RESTRICT y = vy;
177177

178178

179-
float sumf = 0.0;
180-
181-
// Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements
182-
// So we need 4 Q8_0 blocks per Q1_0_g128 block
179+
float sumf = 0.0f;
180+
183181
for (int i = 0; i < nb; i++) {
184182
const float d0 = GGML_FP16_TO_FP32(x[i].d);
185-
186-
int sumi = 0;
187-
188-
// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
183+
189184
for (int k = 0; k < 4; k++) {
190185
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
191-
192-
int sumi_block = 0;
193-
186+
const uint8_t * bits = x[i].qs + k * 4;
187+
const int8_t * q8 = y[i*4 + k].qs;
188+
189+
int sumi = 0;
194190
for (int j = 0; j < QK8_0; j++) {
195-
const int bit_index = k * QK8_0 + j;
196-
const int byte_index = bit_index / 8;
197-
const int bit_offset = bit_index % 8;
198-
199-
// Extract bit: 1 = +1, 0 = -1
200-
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
201-
const int yi = y[i*4 + k].qs[j];
202-
203-
sumi_block += xi * yi;
191+
const int bit = (bits[j >> 3] >> (j & 7)) & 1;
192+
sumi += (2*bit - 1) * q8[j];
204193
}
205-
206-
sumi += d1 * sumi_block;
194+
195+
sumf += d0 * d1 * (float)sumi;
207196
}
208-
209-
sumf += d0 * sumi;
210197
}
211-
198+
212199
*s = sumf;
213200
}
214201

0 commit comments

Comments (0)