From 839c7e22b8b733a61e71de1fc8a8f8966abf3e47 Mon Sep 17 00:00:00 2001 From: PMZFX Date: Thu, 9 Apr 2026 19:29:38 -0400 Subject: [PATCH] SYCL: use native subgroup size for K-quant DMMV kernels Use WARP_SIZE (16) instead of QK_WARP_SIZE (32) for K-quant DMMV kernel dispatch (Q2_K through Q6_K) on Intel SYCL targets. The original kernels were migrated from CUDA via DPCT and retained a 32-wide subgroup size. Intel Xe2 natively uses 16-lane subgroups, and the DPCT tool itself flagged these kernels with register pressure warnings recommending a smaller subgroup size. Each kernel thread now processes both halves of the QK_K=256 block via a loop, preserving identical total work and numerical results. Tested on Intel Arc Pro B70 (Xe2/Battlemage): - test-backend-ops: all K-quant types pass (debug + release) - perplexity: unchanged (Q4_K_M and Q6_K, wikitext-2) - llama-bench: 2.3-2.7x prefill improvement, neutral tg Co-Authored-By: Claude Opus 4.6 (1M context) --- ggml/src/ggml-sycl/dmmv.cpp | 437 +++++++++++++++++------------------- 1 file changed, 208 insertions(+), 229 deletions(-) diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 1c8b6f3771f..1ed896e9d51 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -217,13 +217,6 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, } } -/* -DPCT1110:4: The total declared local variable size in device function -dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -245,19 +238,15 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int in = tid % step; // 0...15 or 0...7 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; uint32_t aux[4]; const uint8_t * d = (const uint8_t *)aux; @@ -265,33 +254,39 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm[0]; const float dmin = x[i].dm[1]; - const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + } + tmp += dall * sum1 - dmin * sum2; } - tmp += dall * sum1 - dmin * sum2; } #else @@ -333,7 +328,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -343,13 +338,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, } } -/* -DPCT1110:5: The total declared local variable size in device function -dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -373,52 +361,52 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const uint16_t kmask2 = 0x0f0f; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); + const int in = tid % step; // 0...15 or 0...7 const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; uint16_t utmp[4]; const int8_t * s = (const int8_t *)utmp; - const uint16_t s_shift = 4*im; - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; const uint8_t * h = x[i].hmask + l0; - - const uint16_t * a = (const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - const float d = x[i].d; - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + const uint16_t s_shift = 4*im; + const uint8_t m = 1 << (4*im); + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; } - tmp += d * sum; } #else @@ -452,7 +440,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -461,14 +449,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, dst[row] = tmp; } } - -/* -DPCT1110:6: The total declared local variable size in device function -dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -489,22 +469,19 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const uint16_t kmask3 = 0xc0c0; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 + const int il_base = tid/step; // 0 or 1 (was 0...3) + const int ir = tid - step*il_base; // 0...7 or 0...3 const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; + const int in = il_base%2; const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; @@ -521,55 +498,60 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - const float dall = x[i].dm[0]; const float dmin = x[i].dm[1]; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); #if K_QUANTS_PER_ITERATION == 2 - const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); - const uint32_t * q2 = q1 + 16; - - q32[0] = q1[0] & 0x0f0f0f0f; - q32[1] = q1[0] & 0xf0f0f0f0; - q32[2] = q2[0] & 0x0f0f0f0f; - q32[3] = q2[0] & 0xf0f0f0f0; - - sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 4; ++l) { - s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; - s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + - s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - - dmin * smin; + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; #else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; #endif + } } #else @@ -605,7 +587,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -615,13 +597,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, } } -/* -DPCT1110:7: The total declared local variable size in device function -dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -641,22 +616,16 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; - const int tid = item_ct1.get_local_id(2) / 2; // 0...15 + const int tid = item_ct1.get_local_id(2) / 2; // 0...7 const int ix = item_ct1.get_local_id(2) % 2; - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 + const int il_base = tid/4; // 0 or 1 (was 0...3) + const int ir = tid - 4*il_base;// 0...3 const int n = 2; - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; + const int in = il_base%2; const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; @@ -666,51 +635,59 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, for (int i = ix; i < num_blocks_per_row; i += 2) { - const uint8_t * ql1 = x[i].qs + q_offset; const uint8_t * qh = x[i].qh + l0; - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - const float dall = x[i].dm[0]; const float dmin = x[i].dm[1]; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - const uint16_t * q1 = (const uint16_t *)ql1; - const uint16_t * q2 = q1 + 32; - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[8] & 0x0f0f; - q16[2] = (q1[0] >> 4) & 0x0f0f; - q16[3] = (q1[8] >> 4) & 0x0f0f; - q16[4] = q2[0] & 0x0f0f; - q16[5] = q2[8] & 0x0f0f; - q16[6] = (q2[0] >> 4) & 0x0f0f; - q16[7] = (q2[8] >> 4) & 0x0f0f; - for (int l = 0; l < n; ++l) { - sum.x() += - y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + - y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); - sum.y() += - y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + - y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); - sum.z() += - y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + - y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); - sum.w() += - y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + - y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + const uint8_t * ql1 = x[i].qs + q_offset; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; } - tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + - sum.w() * sc[5]) - - dmin * smin; } #else @@ -739,7 +716,7 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -766,14 +743,13 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int in = tid % step; // 0...15 or 0...7 #if K_QUANTS_PER_ITERATION == 1 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 @@ -782,42 +758,45 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; #endif - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; float tmp = 0; // partial sum for thread in warp for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y = yy + i * QK_K + y_offset; - const uint8_t * ql = x[i].ql + ql_offset; - const uint8_t * qh = x[i].qh + qh_offset; - const int8_t * s = x[i].scales + s_offset; - const float d = x[i].d; + for (int im = 0; im < 2; ++im) { + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + #if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; #else - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp += sum; + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; #endif + } } @@ -854,7 +833,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -1098,10 +1077,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1114,10 +1093,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1130,10 +1109,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1143,10 +1122,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); }); } @@ -1159,10 +1138,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); }); }