From 7f1fed8f813213a5c95f06419433e1a4df0d137f Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 28 May 2026 09:06:38 +0200
Subject: [PATCH] Don't unconditionally set the sub group size.

The `intel_reqd_sub_group_size` kernel attribute is currently unconditionally
set to the device's reported subgroup size (or a heuristic default). However,
the spec mentions:

> Note that there is no guarantee for the value of get_sub_group_size()
> even when this attribute is present, particularly when the work-group size
> is not evenly divisible by the required sub-group size.

Specifically, PoCL reports a subgroup count of 0 when using a work-group size
that's smaller than the chosen subgroup size. (The example output illustrating
this is missing from this message.)

That example was produced with the fix from this PR already applied, which only
sets the attribute when a subgroup size is explicitly requested. Normally, PoCL
determines an appropriate subgroup size per launch, so revert to that
by not setting the attribute by default.

This bug broke the RNG, which queries the subgroup count. FWIW, this only
surfaced with https://github.com/JuliaGPU/GPUCompiler.jl/pull/812, because
previously PoCL simply removed the exception trap, so the subsequent
out-of-bounds memory access went ahead as if nothing was wrong.
---
 src/compiler/compilation.jl | 16 +++++++---------
 src/compiler/execution.jl   |  2 +-
 test/intrinsics.jl          |  6 +++---
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 5f9ae484..0059c24c 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -1,7 +1,8 @@
 ## gpucompiler interface
 
 Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
-    sub_group_size::Int # Some devices support multiple sizes. This is used to force one when needed
+    # request a fixed sub-group width via `intel_reqd_sub_group_size`
+    sub_group_size::Union{Nothing,Int} = nothing
 end
 
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
@@ -32,9 +33,8 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
                    Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
                    job, mod, entry)
 
-    # Set the subgroup size if supported
     sg_size = job.config.params.sub_group_size
-    if sg_size >= 0
+    if sg_size !== nothing
         metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
     end
 
@@ -136,15 +136,13 @@ function compiler_config(dev::cl.Device; kwargs...)
     end
     return config
 end
-@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...)
+@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
+                                     sub_group_size::Union{Nothing,Int}=nothing, kwargs...)
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
 
-    # Set to -1 if specifying a subgroup size is not supported
-    sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
-        cl.sub_group_size(dev)
-    else
-        -1
+    if sub_group_size !== nothing && !("cl_intel_required_subgroup_size" in dev.extensions)
+        error("Device does not support cl_intel_required_subgroup_size")
     end
 
     # create GPUCompiler objects
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 68885b8f..f32433e0 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction
 ## high-level @opencl interface
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions, :backend, :validate]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions, :backend, :validate, :sub_group_size]
 const LAUNCH_KWARGS = [:global_size, :local_size, :queue]
 
 macro opencl(ex...)
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 107ec0a3..98bc4e33 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -213,7 +213,7 @@ end
         N = local_size * numworkgroups
 
         results = CLVector{SubgroupData}(undef, N)
-        kernel = @opencl launch = false test_subgroup_kernel(results)
+        kernel = @opencl launch = false sub_group_size = sg_size test_subgroup_kernel(results)
 
         kernel(results; local_size, global_size=N)
 
@@ -248,7 +248,7 @@ end
         @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
             a = rand(T, sg_size)
             d_a = CLArray(a)
-            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+            @opencl local_size = sg_size global_size = sg_size sub_group_size = sg_size shfl_idx_kernel(d_a)
             @test Array(d_a) == reverse(a)
         end
     end
@@ -267,7 +267,7 @@ end
             in = rand(T, sg_size)
             idxs = xor.(0:(sg_size - 1), 1) .+ 1
             d_in = CLArray(in)
-            @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
+            @opencl local_size = sg_size global_size = sg_size sub_group_size = sg_size shfl_xor_kernel(d_in)
             @test Array(d_in) == in[idxs]
         end
     end