Skip to content
This repository was archived by the owner on Sep 23, 2024. It is now read-only.

Commit 4f3ba52

Browse files
committed
Optimize MHA for multi-query / grouped-query attention: index K/V by head group (i1 / h_each_group_len) instead of per-head, relax the K/V head-count check, and replace the per-object buffC/weiBuff members with stack-local buffers in the int8-weight Matmul path.
1 parent 581a3cf commit 4f3ba52

2 files changed

Lines changed: 17 additions & 17 deletions

File tree

src/mha_gpt_amx.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ void mha_gpt_impl_amx::mha_bf16(const tensor& q, const tensor& k, const tensor&
116116
auto head_size = q.m_dims[3];
117117
auto key_seq_len = k.m_dims[2];
118118
bool is_bloom = k.m_strides[3] > k.m_strides[2];
119+
auto h_group_num = k.m_dims[1];
120+
size_t h_each_group_len = head_num / h_group_num;
119121

120122
uint8_t* out = output.data<uint8_t>();
121123

@@ -130,8 +132,8 @@ void mha_gpt_impl_amx::mha_bf16(const tensor& q, const tensor& k, const tensor&
130132
if (use_gemv) {
131133
parallel_for2d(batch, head_num, [&](size_t thread_id, size_t i0, size_t i1) {
132134
auto q_sub = &q.at<uint8_t>({i0, i1});
133-
auto k_sub = &k.at<uint8_t>({i0, i1});
134-
auto v_sub = &v.at<uint8_t>({i0, i1});
135+
auto k_sub = &k.at<uint8_t>({i0, i1 / h_each_group_len});
136+
auto v_sub = &v.at<uint8_t>({i0, i1 / h_each_group_len});
135137

136138
auto mat0_out = reinterpret_cast<uint8_t*>(_buffer_mat0_out + thread_id * _buffer_mat0_out_size);
137139
auto mat1_out = reinterpret_cast<uint8_t*>(_buffer_mat1_out + thread_id * _buffer_mat1_out_size);
@@ -177,8 +179,8 @@ void mha_gpt_impl_amx::mha_bf16(const tensor& q, const tensor& k, const tensor&
177179
// k: [batch, head_num, key_seq_len, head_size]
178180
// v: [batch, head_num, value_seq_len, head_size]
179181
auto q_sub = &q.at<ov::bfloat16>({i0, i1, seq_start});
180-
auto k_sub = &k.at<ov::bfloat16>({i0, i1});
181-
auto v_sub = &v.at<ov::bfloat16>({i0, i1});
182+
auto k_sub = &k.at<ov::bfloat16>({i0, i1 / h_each_group_len});
183+
auto v_sub = &v.at<ov::bfloat16>({i0, i1 / h_each_group_len});
182184

183185
auto mat0_out = reinterpret_cast<float*>(_buffer_mat0_out + thread_id * _buffer_mat0_out_size);
184186
auto mat1_out = reinterpret_cast<float*>(_buffer_mat1_out + thread_id * _buffer_mat1_out_size);
@@ -279,7 +281,7 @@ status_t mha_gpt_impl_amx::exec(const tensor& q, const tensor& k, const tensor&
279281
auto key_seq_len = k.m_dims[2];
280282

281283
if (!(batch == k.m_dims[0] && batch == v.m_dims[0] &&
282-
head_num == k.m_dims[1] && head_num == v.m_dims[1] &&
284+
k.m_dims[1] == v.m_dims[1] &&
283285
key_seq_len == v.m_dims[2] &&
284286
head_size == k.m_dims[3] && head_size == v.m_dims[3])) {
285287
DEBUG_LOG << "dim of q,k,v is error.\n";

src/mm_kernel_common_amx.hpp

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2476,21 +2476,13 @@ template<>
24762476
struct Matmul<ov::bfloat16, uint8_t, float> {
24772477
tensor2D<uint8_t> internalBI8;
24782478

2479-
// wei_buff is ping-pong buffer containing ov::bfloat16 weights decompressed on the fly.
2480-
tensor2D<ov::bfloat16> weiBuff;
2481-
24822479
bool constB;
24832480
bool transposeB;
24842481

24852482
constexpr static int kStep = 32;
24862483

2487-
// 2x2 C tiles buffer
2488-
// most usecase requires post-processing with AVX, thus buffC
2489-
// is used to transfer data to AVX register
2490-
tensor2D<float> buffC;
2491-
24922484
Matmul(bool constB = false, bool transposeB = false) :
2493-
constB(constB), transposeB(transposeB), buffC(32, 32) {}
2485+
constB(constB), transposeB(transposeB) {}
24942486

24952487
float* dequant_scale_B;
24962488
float* zp;
@@ -2500,6 +2492,14 @@ struct Matmul<ov::bfloat16, uint8_t, float> {
25002492
tensor2D<ov::bfloat16> & _matB,
25012493
int n0, int n1,
25022494
PP ppkernel) {
2495+
alignas(64) float buff[32 * 32];
2496+
// wei_buff is ping-pong buffer containing ov::bfloat16 weights decompressed on the fly.
2497+
alignas(64) ov::bfloat16 weiBuff[32 * 2 * 32];
2498+
// 2x2 C tiles buffer
2499+
// most usecase requires post-processing with AVX, thus buffC
2500+
// is used to transfer data to AVX register
2501+
tensor2D<float> buffC(32, 32, buff, 32 * sizeof(float));
2502+
25032503
auto matB = getSubMatB(_matB, n0, n1, transposeB);
25042504
int M = matA.dims[0];
25052505
int K = matA.dims[1];
@@ -2523,9 +2523,7 @@ struct Matmul<ov::bfloat16, uint8_t, float> {
25232523
//constexpr int prefetch_ahead = 64*1024;
25242524
tileconfig_t tfg(1, 0, {M,M,M,16,16}, 64);
25252525
auto * pBint = reinterpret_cast<int8_t*>(&internalBI8[0]);
2526-
auto & B2buff = weiBuff;
2527-
B2buff.resize(32*2, 32);
2528-
auto * const pB = &B2buff[0];
2526+
auto * const pB = weiBuff;
25292527
auto * pBsrc = pB + (32*32) * 0;
25302528
auto * pBdst = pB + (32*32) * 1;
25312529
auto * const pC0 = &buffC[0];

0 commit comments

Comments
 (0)