diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 38d222b7016..42fe4d34337 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -519,6 +519,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                 return BEST_FATTN_KERNEL_VEC;
             }
         }
+        // MMA template instances for (DKQ=512, DV=512) only exist with ncols2 in {4, 8}
+        // (see template-instances/generate_cu_files.py). When gqa_ratio < 3 the MMA
+        // dispatcher in switch_ncols2<512,512> falls through to GGML_ABORT (fattn.cu:109).
+        // The fattn-tile kernel supports DKQ=DV=512 and falls back to ncols2 in {2, 1} (commit 425db5b),
+        // so route DKQ=512 with gqa_ratio < 3 to TILE instead (e.g. Gemma 4 E4B with gqa_ratio=2).
+        if (Q->ne[0] == 512 && gqa_ratio < 3) {
+            return BEST_FATTN_KERNEL_TILE;
+        }
         return BEST_FATTN_KERNEL_MMA_F16;
     }