From b345c42e71b3870f6386ffabf07e85b157bd1592 Mon Sep 17 00:00:00 2001
From: rasbid <104773487+rasbid@users.noreply.github.com>
Date: Mon, 13 Oct 2025 20:00:31 +0300
Subject: [PATCH] Allow subgroup reductions on stable AMD GCN drivers

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 31 +++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ebbb412e55f..862dedae82c 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -378,6 +378,24 @@ enum shader_reduction_mode {
 static constexpr uint32_t num_argsort_pipelines = 11;
 static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);
 
+// Reports whether subgroup arithmetic is treated as unreliable for this architecture/driver pair.
+// Only AMD_GCN combined with the eMesaRadv or eAmdOpenSource driver ID is flagged.
+static bool ggml_vk_driver_has_unstable_subgroup_arithmetic(vk_device_architecture architecture, vk::DriverId driver_id) {
+    const bool is_gcn = architecture == vk_device_architecture::AMD_GCN;
+
+    // The two driver IDs this check flags when paired with GCN hardware.
+    const bool is_flagged_driver =
+        driver_id == vk::DriverId::eMesaRadv ||
+        driver_id == vk::DriverId::eAmdOpenSource;
+
+    // Both conditions must hold; any other architecture or driver yields false.
+    return is_gcn && is_flagged_driver;
+}
+
+static bool ggml_vk_supports_subgroup_reduction(vk_device_architecture architecture, vk::DriverId driver_id, bool subgroup_arithmetic) {
+    return subgroup_arithmetic ? !ggml_vk_driver_has_unstable_subgroup_arithmetic(architecture, driver_id) : false; // needs the feature flag AND a driver that is not flagged unstable
+}
+
 struct vk_device_struct {
     std::recursive_mutex mutex;
 
@@ -3114,7 +3132,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         rm_stdq = 2;
     uint32_t rm_iq = 2 * rm_kq;
 
-    const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN;
+    const bool use_subgroups = ggml_vk_supports_subgroup_reduction(device->architecture, device->driver_id, device->subgroup_arithmetic);
     // Ensure a subgroup size >= 16 is available
     const bool use_subgroups16 = use_subgroups && subgroup_min_size_16;
 
@@ -9699,6 +9717,17 @@ static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx
 }
 
 #ifdef GGML_VULKAN_RUN_TESTS
+// Self-check of ggml_vk_supports_subgroup_reduction, placed inside the GGML_VULKAN_RUN_TESTS section.
+// The constructor runs the assertions when the static instance declared at the end is initialized.
+static struct ggml_vk_subgroup_support_tests {
+    ggml_vk_subgroup_support_tests() {
+        GGML_ASSERT(!ggml_vk_supports_subgroup_reduction(vk_device_architecture::AMD_GCN, vk::DriverId::eMesaRadv, true)); // GCN + RADV: rejected
+        GGML_ASSERT(!ggml_vk_supports_subgroup_reduction(vk_device_architecture::AMD_GCN, vk::DriverId::eAmdOpenSource, true)); // GCN + eAmdOpenSource: rejected
+        GGML_ASSERT(ggml_vk_supports_subgroup_reduction(vk_device_architecture::AMD_GCN, vk::DriverId::eAmdProprietary, true)); // GCN + eAmdProprietary: accepted
+        GGML_ASSERT(!ggml_vk_supports_subgroup_reduction(vk_device_architecture::AMD_GCN, vk::DriverId::eAmdProprietary, false)); // no subgroup arithmetic: rejected
+    }
+} ggml_vk_subgroup_support_tests_instance;
+
 static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
     if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
         return;