From 2374b99c16937cf00d327c93b1d2d12d708d4f30 Mon Sep 17 00:00:00 2001 From: Biogenic Ooze Date: Fri, 8 May 2026 10:52:15 +0300 Subject: [PATCH] Enhance CUDA flash attention kernel selection for DKQ=512 with low gqa_ratio This update modifies the kernel selection logic in the CUDA implementation of the flash attention mechanism. Specifically, when the Q/K dimension DKQ (Q->ne[0]) is 512 and gqa_ratio is less than 3, the selector now returns the TILE kernel instead of the MMA kernel. The MMA template instances for DKQ=DV=512 only exist for ncols2 in {4, 8}, so with gqa_ratio < 3 the MMA dispatcher (switch_ncols2<512,512>) previously fell through to GGML_ABORT. This change avoids that abort for models that use DKQ=512 with a low gqa_ratio, such as Gemma 4 E4B (gqa_ratio=2). --- ggml/src/ggml-cuda/fattn.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 38d222b7016..42fe4d34337 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -519,6 +519,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const return BEST_FATTN_KERNEL_VEC; } } + // MMA template instances for (DKQ=512, DV=512) only exist with ncols2 in {4, 8} + // (see template-instances/generate_cu_files.py). When gqa_ratio < 3 the MMA + // dispatcher in switch_ncols2<512,512> falls through to GGML_ABORT (fattn.cu:109). + // fattn-tile has DKQ=DV=512 with ncols2 fallback to {2,1} (commit 425db5b), + // so route DKQ=512 + low gqa_ratio to TILE (e.g. Gemma 4 E4B with gqa_ratio=2). + if (Q->ne[0] == 512 && gqa_ratio < 3) { + return BEST_FATTN_KERNEL_TILE; + } return BEST_FATTN_KERNEL_MMA_F16; }