From 7f1fed8f813213a5c95f06419433e1a4df0d137f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 28 May 2026 09:06:38 +0200 Subject: [PATCH] Don't unconditionally set the sub group size. `intel_reqd_sub_group_size` is currently unconditionally set to the device's reported subgroup size (or a heuristic default). However, the spec mentions: > Note that there is no guarantee for the value of get_sub_group_size() > even when this attribute is present, particularly when the work-group size > is not evenly divisible by the required sub-group size. Specifically, PoCL reports a subgroup count of 0 when using a work-group size that's smaller than the chosen subgroup size. That was observed with the fix from this PR already applied, which only sets the attribute when a subgroup size is explicitly requested. Normally, PoCL determines an appropriate subgroup size per launch, so revert to that by not setting the attribute by default. This bug broke the RNG, which queries the sub group count. FWIW, this only surfaced on https://github.com/JuliaGPU/GPUCompiler.jl/pull/812, because previously the exception trap was simply removed by PoCL, resulting in the subsequent memory access happening as if there was no OOB. --- src/compiler/compilation.jl | 16 +++++++--------- src/compiler/execution.jl | 2 +- test/intrinsics.jl | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 5f9ae484..0059c24c 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -1,7 +1,8 @@ ## gpucompiler interface Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams - sub_group_size::Int # Some devices support multiple sizes. 
This is used to force one when needed + # request a fixed sub-group width via `intel_reqd_sub_group_size` + sub_group_size::Union{Nothing,Int} = nothing end const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} @@ -32,9 +33,8 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob), Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function}, job, mod, entry) - # Set the subgroup size if supported sg_size = job.config.params.sub_group_size - if sg_size >= 0 + if sg_size !== nothing metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))]) end @@ -136,15 +136,13 @@ function compiler_config(dev::cl.Device; kwargs...) end return config end -@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...) +@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, + sub_group_size::Union{Nothing,Int}=nothing, kwargs...) supports_fp16 = "cl_khr_fp16" in dev.extensions supports_fp64 = "cl_khr_fp64" in dev.extensions - # Set to -1 if specifying a subgroup size is not supported - sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions - cl.sub_group_size(dev) - else - -1 + if sub_group_size !== nothing && !("cl_intel_required_subgroup_size" in dev.extensions) + error("Device does not support cl_intel_required_subgroup_size") end # create GPUCompiler objects diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 68885b8f..f32433e0 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -4,7 +4,7 @@ export @opencl, clfunction ## high-level @opencl interface const MACRO_KWARGS = [:launch] -const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions, :backend, :validate] +const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions, :backend, :validate, :sub_group_size] const LAUNCH_KWARGS = [:global_size, :local_size, :queue] macro opencl(ex...) 
diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 107ec0a3..98bc4e33 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -213,7 +213,7 @@ end N = local_size * numworkgroups results = CLVector{SubgroupData}(undef, N) - kernel = @opencl launch = false test_subgroup_kernel(results) + kernel = @opencl launch = false sub_group_size = sg_size test_subgroup_kernel(results) kernel(results; local_size, global_size=N) @@ -248,7 +248,7 @@ end @testset for T in cl.sub_group_shuffle_supported_types(cl.device()) a = rand(T, sg_size) d_a = CLArray(a) - @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a) + @opencl local_size = sg_size global_size = sg_size sub_group_size = sg_size shfl_idx_kernel(d_a) @test Array(d_a) == reverse(a) end end @@ -267,7 +267,7 @@ end in = rand(T, sg_size) idxs = xor.(0:(sg_size - 1), 1) .+ 1 d_in = CLArray(in) - @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in) + @opencl local_size = sg_size global_size = sg_size sub_group_size = sg_size shfl_xor_kernel(d_in) @test Array(d_in) == in[idxs] end end