Commit db9d8aa

ggml-cuda: native bf16 flash attention for vec kernel (ggml-org#20525)

* ggml-cuda: native bf16 flash attention for vec and tile kernels

  mma kernel still converts bf16 to fp16 before launch, native mma bf16 todo

* ggml-cuda: address code owner review feedback

  reverted tile kernel changes to avoid larger refactor

* fix ci failures on turing and hip

* fix bf16 vec kernel compile on hip v_dot2 platforms

* add comments

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

1 parent ccb87fa commit db9d8aa

21 files changed: 197 additions & 25 deletions

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 5 additions & 6 deletions

@@ -116,12 +116,11 @@ if (CUDAToolkit_FOUND)
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
         add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
     else()
-        file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
+        list(APPEND GGML_SOURCES_CUDA
+            template-instances/fattn-vec-instance-f16-f16.cu
+            template-instances/fattn-vec-instance-q4_0-q4_0.cu
+            template-instances/fattn-vec-instance-q8_0-q8_0.cu
+            template-instances/fattn-vec-instance-bf16-bf16.cu)
     endif()
 
 ggml_add_backend_library(ggml-cuda

ggml/src/ggml-cuda/convert.cuh

Lines changed: 6 additions & 0 deletions

@@ -41,6 +41,12 @@ template<typename dst_t, typename src_t>
         return __bfloat162float(x);
     } else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, half2>) {
         return __float22half2_rn(x);
+    } else if constexpr(std::is_same_v<src_t, nv_bfloat162> && std::is_same_v<dst_t, float2>) {
+#if !defined(GGML_USE_HIP) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+        return __bfloat1622float2(x);
+#else
+        return make_float2(__bfloat162float(__low2bfloat16(x)), __bfloat162float(__high2bfloat16(x)));
+#endif // !defined(GGML_USE_HIP) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     } else if constexpr(std::is_same_v<src_t, float2> && std::is_same_v<dst_t, nv_bfloat162>) {
         // bypass compile error on cuda 12.0.1
 #ifdef GGML_USE_HIP
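The new branch widens a packed nv_bfloat162 to float2, using the native __bfloat1622float2 intrinsic where the guard allows it (CUDA, compute capability 8.0+) and falling back to per-element conversion otherwise. As a minimal host-side sketch of what that widening computes (bf16 stores the upper 16 bits of an IEEE-754 binary32, so the scalar conversion amounts to a 16-bit left shift):

// Host-side reference sketch for bf16 -> float widening; plain C++, no CUDA
// needed. A bf16 value holds the upper 16 bits of a binary32 float, so
// widening shifts it into the high half of the word.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float bf16_to_float(uint16_t bf) {
    const uint32_t bits = (uint32_t) bf << 16; // low mantissa bits become zero
    float f;
    std::memcpy(&f, &bits, sizeof(f));         // type-pun without UB
    return f;
}

int main() {
    // 0x3F80 is bf16 for 1.0f, 0xC040 is bf16 for -3.0f
    std::printf("%f %f\n", bf16_to_float(0x3F80), bf16_to_float(0xC040));
    return 0;
}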

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 48 additions & 0 deletions

@@ -74,6 +74,37 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
     return sum;
 }
 
+template <int D, int nthreads>
+static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_bf16(
+    const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
+
+    const nv_bfloat162 * K_bf16 = (const nv_bfloat162 *) K_c;
+    GGML_UNUSED(Q_q8);
+    GGML_UNUSED(Q_ds_v);
+
+    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
+    constexpr int cpy_ne = cpy_nb / 4;
+
+    float sum = 0.0f;
+
+#pragma unroll
+    for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
+        __align__(16) nv_bfloat162 tmp[cpy_ne];
+        ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_bf16 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
+#pragma unroll
+        for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
+#ifdef V_DOT2_F32_F16_AVAILABLE
+            // FIXME replace macros in vector FA kernel with templating and use FP32 for BF16
+            ggml_cuda_mad(sum, ggml_cuda_cast<float2>(tmp[k_KQ_1]), __half22float2(((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]));
+#else
+            ggml_cuda_mad(sum, ggml_cuda_cast<float2>(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
+#endif // V_DOT2_F32_F16_AVAILABLE
+        }
+    }
+
+    return sum;
+}
+
 template<int D, int nthreads>
 static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {

@@ -321,6 +352,19 @@ static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     }
 }
 
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_bf16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
+    static_assert(std::is_same_v<T, float>, "BF16 V dequantization only supports float output");
+    static_assert(ne % 2 == 0, "bad ne");
+    __align__(16) nv_bfloat162 tmp[ne/2];
+    ggml_cuda_memcpy_1<ne*sizeof(nv_bfloat16)>(tmp, (const nv_bfloat16 *) vx + i0);
+    float2 * dst_f2 = (float2 *) dst;
+#pragma unroll
+    for (int l = 0; l < ne/2; ++l) {
+        dst_f2[l] = ggml_cuda_cast<float2>(tmp[l]);
+    }
+}
+
 template <typename T, int ne>
 static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     const block_q4_0 * x = (const block_q4_0 *) vx;

@@ -547,6 +591,8 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
         return vec_dot_fattn_vec_KQ_q5_1<D, nthreads>;
     } else if constexpr (type_K == GGML_TYPE_Q8_0) {
         return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_BF16) {
+        return vec_dot_fattn_vec_KQ_bf16<D, nthreads>;
     } else {
         static_assert(type_K == -1, "bad type");
         return nullptr;

@@ -567,6 +613,8 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
         return dequantize_V_q5_1<T, ne>;
     } else if constexpr (type_V == GGML_TYPE_Q8_0) {
         return dequantize_V_q8_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_BF16) {
+        return dequantize_V_bf16<float, ne>;
     } else {
         static_assert(type_V == -1, "bad type");
         return nullptr;
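The strided access in vec_dot_fattn_vec_KQ_bf16 is the same cooperative scheme the f16 path uses: each of the nthreads lanes working on a K row loads an aligned chunk of cpy_ne bf16 pairs per outer step and accumulates in fp32. A simplified CPU reference of that indexing (toy sizes and hypothetical names, not the ggml API), showing that the lanes together cover every pair of the row exactly once:

// CPU reference sketch (hypothetical, simplified) for the access pattern of
// vec_dot_fattn_vec_KQ_bf16: lanes stride through the row in chunks of
// cpy_ne pairs; summing the per-lane partials gives the full K·Q dot.
#include <cstdio>

int main() {
    constexpr int D        = 16; // head size in elements; the kernel walks D/2 pairs
    constexpr int nthreads = 2;  // lanes cooperating on one K row
    constexpr int cpy_ne   = 2;  // pairs loaded per ggml_cuda_memcpy_1

    float K[D], Q[D], partial[nthreads] = {0.0f, 0.0f};
    for (int i = 0; i < D; ++i) { K[i] = 0.5f; Q[i] = 2.0f; } // toy data

    for (int lane = 0; lane < nthreads; ++lane) {
        for (int k0 = 0; k0 < D/2; k0 += nthreads*cpy_ne) {   // outer strided loop
            for (int k1 = 0; k1 < cpy_ne; ++k1) {             // one loaded chunk
                const int pair = k0 + lane*cpy_ne + k1;       // pair index in the row
                partial[lane] += K[2*pair+0]*Q[2*pair+0] + K[2*pair+1]*Q[2*pair+1];
            }
        }
    }
    float sum = 0.0f;
    for (int lane = 0; lane < nthreads; ++lane) sum += partial[lane];
    printf("dot = %f (expect %f)\n", sum, (float) D * 0.5f * 2.0f);
    return 0;
}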

ggml/src/ggml-cuda/fattn-vec.cuh

Lines changed: 20 additions & 6 deletions

@@ -75,17 +75,17 @@ static __global__ void flash_attn_ext_vec(
 #endif // GGML_USE_HIP
 
     constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
-    constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q;
-    constexpr int nthreads_V  = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q;
+    constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q;
+    constexpr int nthreads_V  = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q;
 
     static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
     static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
 
-    constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 2*cpy_ne : 4;
+    constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4;
     constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
 
     constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
-    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
+    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16;
 #ifdef V_DOT2_F32_F16_AVAILABLE
     constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
 #else

@@ -323,8 +323,18 @@ static __global__ void flash_attn_ext_vec(
 #pragma unroll
         for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
             half2 tmp[V_rows_per_thread/2];
-            dequantize_V(V + k*nb21, tmp,
-                2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+            if constexpr (type_V == GGML_TYPE_BF16) {
+                float2 tmp_f[V_rows_per_thread/2];
+                dequantize_V(V + k*nb21, tmp_f,
+                    2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+#pragma unroll
+                for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
+                    tmp[i_VKQ_1] = __float22half2_rn(tmp_f[i_VKQ_1]);
+                }
+            } else {
+                dequantize_V(V + k*nb21, tmp,
+                    2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
+            }
 #pragma unroll
             for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
 #pragma unroll

@@ -563,24 +573,28 @@ void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \
     extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \
     extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \
+    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_BF16); \
 
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0)
+EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_BF16)
 
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0)
+EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_BF16)
 
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
 EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
+EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16)
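On the V_DOT2_F32_F16_AVAILABLE path the accumulator consumes half2 rows, so the bf16 branch above dequantizes V to float2 and then narrows to half2, letting the rest of the fp16 accumulation loop run untouched (hence the FIXME about eventually templating the kernel to keep BF16 in fp32). A standalone device-side sketch of that widen-then-narrow step, assuming a toolkit/arch combination where __bfloat1622float2 is available (per the guard in convert.cuh):

// Sketch: widen one bf16 pair of V to float2, then narrow to half2 so the
// existing fp16 accumulation code can consume it unchanged.
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstdio>

__global__ void widen_narrow(const nv_bfloat162 * v, half2 * out) {
    const float2 tmp_f = __bfloat1622float2(v[threadIdx.x]); // dequantize_V_bf16 step
    out[threadIdx.x]   = __float22half2_rn(tmp_f);           // narrowing done in the kernel
}

int main() {
    nv_bfloat162 * v; half2 * out;
    cudaMallocManaged(&v, sizeof(*v));
    cudaMallocManaged(&out, sizeof(*out));
    *v = __floats2bfloat162_rn(1.5f, -2.0f);
    widen_narrow<<<1, 1>>>(v, out);
    cudaDeviceSynchronize();
    printf("%f %f\n", __half2float(__low2half(*out)), __half2float(__high2half(*out)));
    cudaFree(v); cudaFree(out);
    return 0;
}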

ggml/src/ggml-cuda/fattn.cu

Lines changed: 16 additions & 0 deletions

@@ -224,45 +224,60 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_F16)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_F16)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_F16)
 
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_Q4_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q4_0)
 
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_Q4_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q4_1)
 
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_Q5_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q5_0)
 
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_Q5_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q5_1)
 
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q8_0)
+
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_BF16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_BF16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_BF16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_BF16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_BF16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_BF16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
 #else
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16,  GGML_TYPE_F16)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
     FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16)
 #endif // GGML_CUDA_FA_ALL_QUANTS
 
     GGML_ABORT("fatal error");

@@ -355,6 +370,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const ggml_tensor * dst) {
 #endif // GGML_CUDA_FA_ALL_QUANTS
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_BF16:
             break;
         default:
             return BEST_FATTN_KERNEL_NONE;
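The second hunk widens the K/V type gate so a BF16 cache counts as a supported candidate instead of falling through to "no kernel". A self-contained sketch of that gate (hypothetical enum and names, not the ggml API):

// Minimal sketch of the kernel-selection gate extended above: BF16 now falls
// through to the vec path rather than hitting the "no kernel" default.
#include <cstdio>

enum kv_type_t { T_F16, T_BF16, T_Q4_0, T_Q8_0, T_Q6_K };
enum best_kernel_t { KERNEL_NONE, KERNEL_VEC };

static best_kernel_t best_fattn_kernel(kv_type_t type_KV) {
    switch (type_KV) {
        case T_F16:
        case T_Q4_0:
        case T_Q8_0:
        case T_BF16: // new in this commit: native bf16 K/V is accepted
            return KERNEL_VEC;
        default:
            return KERNEL_NONE;
    }
}

int main() {
    printf("bf16 -> %s\n", best_fattn_kernel(T_BF16) == KERNEL_VEC ? "vec" : "none");
    printf("q6_k -> %s\n", best_fattn_kernel(T_Q6_K) == KERNEL_VEC ? "vec" : "none");
    return 0;
}

In practice this is what lets a bf16 KV cache (e.g. llama.cpp's -ctk bf16 -ctv bf16, assuming a CUDA build with flash attention enabled) run natively instead of being converted to fp16 first.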
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_BF16);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_BF16);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_BF16);
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_F16);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_F16);
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q4_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q4_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q4_0);
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q4_1);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q4_1);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q4_1);
ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec.cuh"
+
+DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q5_0);
+DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q5_0);
+DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q5_0);
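Each of these autogenerated translation units pins one (head size, type_K, type_V) combination, which keeps per-file compile time and memory bounded and lets the build parallelize across instances. Roughly, DECL_FATTN_VEC_CASE is an explicit template instantiation of the launch wrapper; a sketch of its shape (the real definition lives in fattn-vec.cuh and may differ):

// Sketch only: the actual macro is defined in fattn-vec.cuh and may differ.
// The idea is an explicit instantiation, so each (D, type_K, type_V) case is
// compiled exactly once, in its own .cu file, rather than in every includer.
#define DECL_FATTN_VEC_CASE(D, type_K, type_V)                          \
    template void ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V> \
        (ggml_backend_cuda_context & ctx, ggml_tensor * dst)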
