@@ -1065,15 +1065,15 @@ void ggml_gemv_q5_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
10651065
10661066 // Accumulation for 2 sub-blocks.
10671067 //
1068- // This might overflow, so we accumulate in two steps.
1068+ // This might overflow, so we accumulate in 4 steps.
10691069 //
10701070 // Recheck.
1071- for (int k = 0 ; k < 2 ; k++) {
1071+ for (int k = 0 ; k < 4 ; k++) {
10721072 // 4xM integer accumulators
10731073 vint16m1_t sumi_s_0_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
10741074 vint16m1_t sumi_s_1_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
10751075
1076- for (int i = k * 16 ; i < k * 16 + QK4_0 / 2 ; i++) {
1076+ for (int i = k * 8 ; i < (k + 1 ) * 8 ; i++) {
10771077 // Load `b_ptr`.
10781078 const vuint8mf2_t b_lo_packed = __riscv_vle8_v_u8mf2 (&b_ptr[l].qs [j * 64 * ncols_interleaved + i * ncols_interleaved], ncols_interleaved);
10791079 const vint8mf2_t b_s_lo_0 = __riscv_vreinterpret_v_u8mf2_i8mf2 (__riscv_vand_vx_u8mf2 (b_lo_packed, 0xF , ncols_interleaved));
@@ -1099,15 +1099,15 @@ void ggml_gemv_q5_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
10991099 }
11001100 // Accumulation for 2 sub-blocks.
11011101 //
1102- // This might overflow, so we accumulate in two steps.
1102+ // This might overflow, so we accumulate in 4 steps.
11031103 //
11041104 // Recheck.
1105- for (int k = 0 ; k < 2 ; k++) {
1105+ for (int k = 0 ; k < 4 ; k++) {
11061106 // 4xM integer accumulators
11071107 vint16m1_t sumi_s_0_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
11081108 vint16m1_t sumi_s_1_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
11091109
1110- for (int i = k * 16 ; i < k * 16 + QK4_0 / 2 ; i++) {
1110+ for (int i = k * 8 ; i < (k + 1 ) * 8 ; i++) {
11111111 // Load `b_ptr`.
11121112 const vuint8mf2_t b_lo_packed = __riscv_vle8_v_u8mf2 (&b_ptr[l].qs [j * 64 * ncols_interleaved + 32 * ncols_interleaved + i * ncols_interleaved], ncols_interleaved);
11131113 const vint8mf2_t b_s_lo_0 = __riscv_vreinterpret_v_u8mf2_i8mf2 (__riscv_vand_vx_u8mf2 (b_lo_packed, 0xF , ncols_interleaved));
@@ -1202,7 +1202,7 @@ void ggml_gemv_q6_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
12021202 for (int l = 0 ; l < nb; l++) {
12031203 vint32m2_t sumi = __riscv_vmv_v_x_i32m2 (0 , ncols_interleaved);
12041204
1205- // We process 2 16-element sub-blocks at once.
1205+ // We process 4 16-element sub-blocks at once.
12061206 for (int j = 0 ; j < QK_K / 16 ; j += 4 ) {
12071207 // Load the scales.
12081208 //
@@ -2225,7 +2225,7 @@ static void ggml_gemm_q3_K_Mx1_q8_K(int n,
22252225 for (int group = 0 ; group < 4 ; ++group) {
22262226 // High scales are needed for all 4 sub-blocks (0.5 register)
22272227 vuint8mf2_t v_sc_h_quad = __riscv_vle8_v_u8mf2 (rhs_sc_high_ptr, 16 );
2228- rhs_sc_high_ptr += ncols_interleaved;
2228+ rhs_sc_high_ptr += ncols_interleaved;
22292229
22302230 // --- Scope 1: Sub-blocks 1 & 2 (Pair 0) ---
22312231 // By scoping this, v_sc_l_pair0 dies before we load pair1
@@ -2926,10 +2926,10 @@ void ggml_gemm_q5_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
29262926
29272927 // Accumulation for 2 sub-blocks.
29282928 //
2929- // This might overflow, so we accumulate in two steps.
2929+ // This might overflow, so we accumulate in 4 steps.
29302930 //
29312931 // Recheck.
2932- for (int k = 0 ; k < 2 ; k++) {
2932+ for (int k = 0 ; k < 4 ; k++) {
29332933 // 4xM integer accumulators
29342934 vint16m1_t sumi_0_s_0_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
29352935 vint16m1_t sumi_1_s_0_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
@@ -2940,7 +2940,7 @@ void ggml_gemm_q5_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
29402940 vint16m1_t sumi_2_s_1_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
29412941 vint16m1_t sumi_3_s_1_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
29422942
2943- for (int i = k * 16 ; i < k * 16 + QK4_0 / 2 ; i++) {
2943+ for (int i = k * 8 ; i < (k + 1 ) * 8 ; i++) {
29442944 // Load `b_ptr`.
29452945 const vuint8mf2_t b_lo_packed = __riscv_vle8_v_u8mf2 (&b_ptr[l].qs [j * 64 * ncols_interleaved + i * ncols_interleaved], ncols_interleaved);
29462946 const vint8mf2_t b_s_lo_0 = __riscv_vreinterpret_v_u8mf2_i8mf2 (__riscv_vand_vx_u8mf2 (b_lo_packed, 0xF , ncols_interleaved));
@@ -2994,7 +2994,7 @@ void ggml_gemm_q5_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2994- // This might overflow, so we accumulate in two steps.
2994+ // This might overflow, so we accumulate in 4 steps.
29952995 //
29962996 // Recheck.
2997- for (int k = 0 ; k < 2 ; k++) {
2997+ for (int k = 0 ; k < 4 ; k++) {
29982998 // 4xM integer accumulators
29992999 vint16m1_t sumi_0_s_0_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
30003000 vint16m1_t sumi_1_s_0_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
@@ -3005,7 +3005,7 @@ void ggml_gemm_q5_K_Mx1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
30053005 vint16m1_t sumi_2_s_1_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
30063006 vint16m1_t sumi_3_s_1_16 = __riscv_vmv_v_x_i16m1 (0 .0f , ncols_interleaved);
30073007
3008- for (int i = k * 16 ; i < k * 16 + QK4_0 / 2 ; i++) {
3008+ for (int i = k * 8 ; i < (k + 1 ) * 8 ; i++) {
30093009 // Load `b_ptr`.
30103010 const vuint8mf2_t b_lo_packed = __riscv_vle8_v_u8mf2 (&b_ptr[l].qs [j * 64 * ncols_interleaved + 32 * ncols_interleaved + i * ncols_interleaved], ncols_interleaved);
30113011 const vint8mf2_t b_s_lo_0 = __riscv_vreinterpret_v_u8mf2_i8mf2 (__riscv_vand_vx_u8mf2 (b_lo_packed, 0xF , ncols_interleaved));
0 commit comments