diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 5f9ae484..0059c24c 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -1,7 +1,8 @@
 ## gpucompiler interface
 
 Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
-    sub_group_size::Int # Some devices support multiple sizes. This is used to force one when needed
+    # sub-group size to force via `intel_reqd_sub_group_size` metadata; `nothing` leaves it unset
+    sub_group_size::Union{Nothing,Int} = nothing
 end
 
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
@@ -32,9 +33,8 @@ function GPUCompiler.finish_module!(@nospecialize(job::OpenCLCompilerJob),
                    Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
                    job, mod, entry)
 
-    # Set the subgroup size if supported
     sg_size = job.config.params.sub_group_size
-    if sg_size >= 0
+    if sg_size !== nothing
         metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(sg_size))])
     end
 
@@ -136,15 +136,13 @@ function compiler_config(dev::cl.Device; kwargs...)
     end
     return config
 end
-@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...)
+@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false,
+                                     sub_group_size::Union{Nothing,Int}=nothing, kwargs...)
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
 
-    # Set to -1 if specifying a subgroup size is not supported
-    sub_group_size = if "cl_intel_required_subgroup_size" in dev.extensions
-        cl.sub_group_size(dev)
-    else
-        -1
+    if sub_group_size !== nothing && !("cl_intel_required_subgroup_size" in dev.extensions)
+        error("Device does not support cl_intel_required_subgroup_size")
     end
 
     # create GPUCompiler objects
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 68885b8f..f32433e0 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction
 ## high-level @opencl interface
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions, :backend, :validate]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :extensions, :backend, :validate, :sub_group_size]
 const LAUNCH_KWARGS = [:global_size, :local_size, :queue]
 
 macro opencl(ex...)
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 107ec0a3..98bc4e33 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -213,7 +213,7 @@ end
         N = local_size * numworkgroups
 
         results = CLVector{SubgroupData}(undef, N)
-        kernel = @opencl launch = false test_subgroup_kernel(results)
+        kernel = @opencl launch = false sub_group_size = sg_size test_subgroup_kernel(results)
 
         kernel(results; local_size, global_size=N)
 
@@ -248,7 +248,7 @@ end
         @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
             a = rand(T, sg_size)
             d_a = CLArray(a)
-            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+            @opencl local_size = sg_size global_size = sg_size sub_group_size = sg_size shfl_idx_kernel(d_a)
             @test Array(d_a) == reverse(a)
         end
     end
@@ -267,7 +267,7 @@ end
             in = rand(T, sg_size)
             idxs = xor.(0:(sg_size - 1), 1) .+ 1
             d_in = CLArray(in)
-            @opencl local_size = sg_size global_size = sg_size shfl_xor_kernel(d_in)
+            @opencl local_size = sg_size global_size = sg_size sub_group_size = sg_size shfl_xor_kernel(d_in)
             @test Array(d_in) == in[idxs]
         end
     end