GCN: use byref instead of byval+lower_byval for kernel arguments#772
GCN: use byref instead of byval+lower_byval for kernel arguments #772 — gbaraldi wants to merge 14 commits into JuliaGPU:master from
Conversation
On AMDGPU, kernel arguments already reside in the read-only kernarg segment. The current pipeline adds `byval` attributes and then `lower_byval` expands them into first-class aggregates (FCAs), which forces LLVM to extractvalue every field and store the entire struct into scratch memory via alloca — even when only a few fields are used. For large structs (e.g. Oceananigans' ImmersedBoundaryGrid), this produces dozens of dead scratch stores. Using `byref` instead keeps the pointer semantics, allowing LLVM to generate scalar loads directly from the kernarg segment on demand. The invariant.load and TBAA metadata that Julia emits remain valid since the kernarg memory is immutable. The byref pointer parameters are rewritten to addrspace(4) (AMDGPU constant/kernarg address space), with addrspacecasts inserted so the function body can continue using generic pointers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/src/gcn.jl b/src/gcn.jl
index e310b5c..2903354 100644
--- a/src/gcn.jl
+++ b/src/gcn.jl
@@ -64,7 +64,7 @@ function finish_ir!(
# optimize after address space rewriting: propagate addrspace(4) through
# the addrspacecast chains, then clean up newly-exposed opportunities
tm = llvm_machine(job.config.target)
- @dispose pb=NewPMPassBuilder() begin
+ @dispose pb = NewPMPassBuilder() begin
add!(pb, NewPMFunctionPassManager()) do fpm
add!(fpm, InferAddressSpacesPass())
add!(fpm, SROAPass())
@@ -139,7 +139,7 @@ function add_kernarg_address_spaces!(
# (which expects flat pointers) continues to work. The AMDGPU backend's
# AMDGPULowerKernelArguments traces these casts and produces s_load.
new_args = LLVM.Value[]
- @dispose builder=IRBuilder() begin
+ @dispose builder = IRBuilder() begin
entry_bb = BasicBlock(new_f, "conversion")
position!(builder, entry_bb)
@@ -185,7 +185,7 @@ function add_kernarg_address_spaces!(
LLVM.name!(new_f, fn)
# clean up the extra conversion block
- @dispose pb=NewPMPassBuilder() begin
+ @dispose pb = NewPMPassBuilder() begin
add!(pb, NewPMFunctionPassManager()) do fpm
add!(fpm, SimplifyCFGPass())
end
diff --git a/test/gcn.jl b/test/gcn.jl
index 5b49cf5..c667cb8 100644
--- a/test/gcn.jl
+++ b/test/gcn.jl
@@ -37,121 +37,127 @@ end
end
end
-@testset "kernarg address space for byref parameters" begin
- mod = @eval module $(gensym())
- struct MyStruct
- x::Float64
- y::Float64
- end
-
- function kernel(s::MyStruct)
- s.x + s.y
- return
- end
- end
+ @testset "kernarg address space for byref parameters" begin
+ mod = @eval module $(gensym())
+ struct MyStruct
+ x::Float64
+ y::Float64
+ end
- # byref struct params should be ptr addrspace(4) in kernel IR
- @test @filecheck begin
- check"CHECK: define amdgpu_kernel void @_Z6kernel8MyStruct(ptr addrspace(4)"
- GCN.code_llvm(mod.kernel, Tuple{mod.MyStruct}; dump_module=true, kernel=true)
- end
+ function kernel(s::MyStruct)
+ s.x + s.y
+ return
+ end
+ end
- # non-kernel should NOT have addrspace(4)
- @test @filecheck begin
- check"CHECK-NOT: addrspace(4)"
- GCN.code_llvm(mod.kernel, Tuple{mod.MyStruct}; dump_module=true, kernel=false)
- end
-end
+ # byref struct params should be ptr addrspace(4) in kernel IR
+ @test @filecheck begin
+ check"CHECK: define amdgpu_kernel void @_Z6kernel8MyStruct(ptr addrspace(4)"
+ GCN.code_llvm(mod.kernel, Tuple{mod.MyStruct}; dump_module = true, kernel = true)
+ end
-@testset "byref attribute preserved on kernarg parameters" begin
- mod = @eval module $(gensym())
- struct LargeStruct
- a::Float64
- b::Float64
- c::Float64
- d::Float64
+ # non-kernel should NOT have addrspace(4)
+ @test @filecheck begin
+ check"CHECK-NOT: addrspace(4)"
+ GCN.code_llvm(mod.kernel, Tuple{mod.MyStruct}; dump_module = true, kernel = false)
+ end
end
- function kernel(s::LargeStruct, out::Ptr{Float64})
- unsafe_store!(out, s.a + s.b + s.c + s.d)
- return
- end
- end
+ @testset "byref attribute preserved on kernarg parameters" begin
+ mod = @eval module $(gensym())
+ struct LargeStruct
+ a::Float64
+ b::Float64
+ c::Float64
+ d::Float64
+ end
- # the byref attribute must survive the addrspace rewrite (clone_into! can drop it)
- @test @filecheck begin
- check"CHECK: byref"
- check"CHECK: addrspace(4)"
- GCN.code_llvm(mod.kernel, Tuple{mod.LargeStruct, Ptr{Float64}};
- dump_module=true, kernel=true)
- end
-end
+ function kernel(s::LargeStruct, out::Ptr{Float64})
+ unsafe_store!(out, s.a + s.b + s.c + s.d)
+ return
+ end
+ end
-@testset "mixed byref and scalar kernel parameters" begin
- mod = @eval module $(gensym())
- struct Params
- x::Float64
- y::Float64
+ # the byref attribute must survive the addrspace rewrite (clone_into! can drop it)
+ @test @filecheck begin
+ check"CHECK: byref"
+ check"CHECK: addrspace(4)"
+ GCN.code_llvm(
+ mod.kernel, Tuple{mod.LargeStruct, Ptr{Float64}};
+ dump_module = true, kernel = true
+ )
+ end
end
- function kernel(a::Float64, s::Params, out::Ptr{Float64})
- unsafe_store!(out, a + s.x + s.y)
- return
- end
- end
+ @testset "mixed byref and scalar kernel parameters" begin
+ mod = @eval module $(gensym())
+ struct Params
+ x::Float64
+ y::Float64
+ end
- # scalar Float64 should NOT be in addrspace(4),
- # only the struct byref param should be.
- # NOTE: Ptr{Float64} is lowered to i64 on Julia ≤1.11 and ptr on Julia 1.12+.
- @test @filecheck begin
- check"CHECK: define amdgpu_kernel void"
- check"CHECK-SAME: double"
- check"CHECK-SAME: ptr addrspace(4)"
- check"CHECK-SAME: {{(i64|ptr)}}"
- GCN.code_llvm(mod.kernel, Tuple{Float64, mod.Params, Ptr{Float64}};
- dump_module=true, kernel=true)
- end
-end
+ function kernel(a::Float64, s::Params, out::Ptr{Float64})
+ unsafe_store!(out, a + s.x + s.y)
+ return
+ end
+ end
-@testset "add_kernarg_address_spaces! rewrites IR correctly" begin
- mod = @eval module $(gensym())
- struct KernelArgs
- x::Float64
- y::Float64
- z::Float64
+ # scalar Float64 should NOT be in addrspace(4),
+ # only the struct byref param should be.
+ # NOTE: Ptr{Float64} is lowered to i64 on Julia ≤1.11 and ptr on Julia 1.12+.
+ @test @filecheck begin
+ check"CHECK: define amdgpu_kernel void"
+ check"CHECK-SAME: double"
+ check"CHECK-SAME: ptr addrspace(4)"
+ check"CHECK-SAME: {{(i64|ptr)}}"
+ GCN.code_llvm(
+ mod.kernel, Tuple{Float64, mod.Params, Ptr{Float64}};
+ dump_module = true, kernel = true
+ )
+ end
end
- function kernel(s::KernelArgs, scale::Float64, out::Ptr{Float64})
- unsafe_store!(out, (s.x + s.y + s.z) * scale)
- return
- end
- end
+ @testset "add_kernarg_address_spaces! rewrites IR correctly" begin
+ mod = @eval module $(gensym())
+ struct KernelArgs
+ x::Float64
+ y::Float64
+ z::Float64
+ end
- job, _ = GCN.create_job(mod.kernel, Tuple{mod.KernelArgs, Float64, Ptr{Float64}};
- kernel=true)
- JuliaContext() do ctx
- ir, meta = GPUCompiler.compile(:llvm, job)
+ function kernel(s::KernelArgs, scale::Float64, out::Ptr{Float64})
+ unsafe_store!(out, (s.x + s.y + s.z) * scale)
+ return
+ end
+ end
- entry = meta.entry
- ft = function_type(entry)
- params = parameters(ft)
+ job, _ = GCN.create_job(
+ mod.kernel, Tuple{mod.KernelArgs, Float64, Ptr{Float64}};
+ kernel = true
+ )
+ JuliaContext() do ctx
+ ir, meta = GPUCompiler.compile(:llvm, job)
- # the struct byref param should be ptr addrspace(4)
- has_as4 = any(p -> p isa LLVM.PointerType && addrspace(p) == 4, params)
- @test has_as4
+ entry = meta.entry
+ ft = function_type(entry)
+ params = parameters(ft)
- # non-struct params (double, and i64/ptr for Ptr{Float64}) should NOT
- # be in addrspace(4). Ptr{Float64} is i64 on Julia ≤1.11, ptr on 1.12+.
- non_byref = filter(p -> !(p isa LLVM.PointerType && addrspace(p) == 4), params)
- @test !isempty(non_byref) # double (and i64 or ptr) params
+ # the struct byref param should be ptr addrspace(4)
+ has_as4 = any(p -> p isa LLVM.PointerType && addrspace(p) == 4, params)
+ @test has_as4
- # byref attribute must be present
- ir_str = string(ir)
- @test occursin("byref", ir_str)
+ # non-struct params (double, and i64/ptr for Ptr{Float64}) should NOT
+ # be in addrspace(4). Ptr{Float64} is i64 on Julia ≤1.11, ptr on 1.12+.
+ non_byref = filter(p -> !(p isa LLVM.PointerType && addrspace(p) == 4), params)
+ @test !isempty(non_byref) # double (and i64 or ptr) params
- dispose(ir)
- end
-end
+ # byref attribute must be present
+ ir_str = string(ir)
+ @test occursin("byref", ir_str)
+
+ dispose(ir)
+ end
+ end
@testset "https://github.com/JuliaGPU/AMDGPU.jl/issues/846" begin
ir, rt = GCN.code_typed((Tuple{Tuple{Val{4}}, Tuple{Float32}},); always_inline=true) do t
@@ -165,47 +171,49 @@ end
############################################################################################
@testset "assembly" begin
-@testset "s_load for kernarg struct access" begin
- mod = @eval module $(gensym())
- struct MyStruct
- x::Float64
- y::Float64
- end
+ @testset "s_load for kernarg struct access" begin
+ mod = @eval module $(gensym())
+ struct MyStruct
+ x::Float64
+ y::Float64
+ end
- function kernel(s::MyStruct, out::Ptr{Float64})
- unsafe_store!(out, s.x + s.y)
- return
+ function kernel(s::MyStruct, out::Ptr{Float64})
+ unsafe_store!(out, s.x + s.y)
+ return
+ end
+ end
+
+ # struct field loads from kernarg should use s_load, not flat_load
+ @test @filecheck begin
+ check"CHECK: s_load_dwordx"
+ check"CHECK-NOT: flat_load"
+ GCN.code_native(mod.kernel, Tuple{mod.MyStruct, Ptr{Float64}}; kernel = true)
+ end
end
- end
- # struct field loads from kernarg should use s_load, not flat_load
- @test @filecheck begin
- check"CHECK: s_load_dwordx"
- check"CHECK-NOT: flat_load"
- GCN.code_native(mod.kernel, Tuple{mod.MyStruct, Ptr{Float64}}; kernel=true)
- end
-end
+ @testset "no scratch spills for small struct kernarg" begin
+ mod = @eval module $(gensym())
+ struct SmallStruct
+ x::Float64
+ y::Float64
+ end
-@testset "no scratch spills for small struct kernarg" begin
- mod = @eval module $(gensym())
- struct SmallStruct
- x::Float64
- y::Float64
- end
+ function kernel(s::SmallStruct, out::Ptr{Float64})
+ unsafe_store!(out, s.x + s.y)
+ return
+ end
+ end
- function kernel(s::SmallStruct, out::Ptr{Float64})
- unsafe_store!(out, s.x + s.y)
- return
+ # a small struct kernel should not need scratch memory
+ @test @filecheck begin
+ check"CHECK: .private_segment_fixed_size: 0"
+ GCN.code_native(
+ mod.kernel, Tuple{mod.SmallStruct, Ptr{Float64}};
+ dump_module = true, kernel = true
+ )
+ end
end
- end
-
- # a small struct kernel should not need scratch memory
- @test @filecheck begin
- check"CHECK: .private_segment_fixed_size: 0"
- GCN.code_native(mod.kernel, Tuple{mod.SmallStruct, Ptr{Float64}};
- dump_module=true, kernel=true)
- end
-end
@testset "skip scalar trap" begin
mod = @eval module $(gensym()) |
[commit title truncated: "…addrspacecast"] The addrspacecast from addrspace(4) to addrspace(0) caused "illegal VGPR to SGPR copy" errors because LLVM couldn't properly lower generic pointer accesses back to the constant address space. Instead, follow Metal's approach: load the struct from the addrspace(4) kernarg pointer into a local alloca (addrspace 5), and let SROA decompose it during the optimization pipeline. This avoids the address space mismatch while still benefiting from byref semantics — the load from addrspace(4) is a scalar load from the kernarg segment, and SROA will eliminate dead fields. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
The byref TypeAttribute may be dropped when copying attributes to the new function with changed parameter types (ptr -> ptr addrspace(4)). Explicitly re-add it to ensure the AMDGPU backend knows the kernarg contains the struct data inline, not a pointer to it. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Remove manual rewrite_byref_addrspaces!. The AMDGPU backend's AMDGPULowerKernelArguments pass already knows how to handle ptr byref(T) on amdgpu_kernel functions — it rewrites the pointer to load from the kernarg segment (addrspace 4) automatically. The previous manual approaches (addrspacecast, load→alloca→store) conflicted with the backend's own lowering. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Add finish_ir! for GCN that rewrites byref kernel parameters from flat (addrspace 0) to kernarg (addrspace 4) after optimization. Clang emits byref params as ptr addrspace(4) from the frontend, but Julia's RemoveJuliaAddrspacesPass strips them to flat. This causes struct field loads to use flat_load instead of s_load. The pass creates a new function with ptr addrspace(4) parameters, inserts addrspacecasts back to flat for the cloned IR, then runs InferAddressSpaces to propagate addrspace(4) through all GEPs and loads. The result is that all kernel argument struct field accesses become s_load (scalar, cached, one per wavefront) instead of flat_load (per-lane, address disambiguation overhead). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
InferAddressSpaces needs TargetTransformInfo to determine the flat address space (0 on AMDGPU). Without passing a TargetMachine, the pass has no TTI and skips all promotions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
CloneFunctionInto rebuilds the AttributeList from scratch using VMap. For byref params, VMap maps old args to addrspacecast instructions (not Arguments), so dyn_cast<Argument> fails and byref(T) is silently dropped. The subsequent setAttributes() overwrites any attrs we set before clone_into!. Without byref, the backend emits global_buffer(8) metadata instead of by_value(sizeof(T)), causing HIP to copy only 8 bytes of struct data as a "pointer" — leading to illegal address errors at runtime. Also remove InferAddressSpaces (rely on AMDGPULowerKernelArguments in codegen to trace addrspacecast chains for s_load). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Verify that byref struct kernel parameters are rewritten to ptr addrspace(4) in IR and that the backend emits s_load (not flat_load) for struct field access. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
|
@vchuravy after a small session of being nerd sniped. This + JuliaGPU/AMDGPU.jl#894 does VERY AI AHEAD (I couldn't be bothered).

GCN Kernarg Address Space Rewriting: Performance Summary

What changed: Byref kernel parameters are now rewritten from flat (addrspace 0) to the constant address space (addrspace 4); inline code spans were lost in extraction. The fix lives in the GCN `finish_ir!` pass.

Why this matters: On AMDGPU, Julia's previous byval lowering forced struct kernel arguments through scratch memory instead of scalar kernarg loads.

Results: 48 Oceananigans kernels on gfx942 — Aggregate
Per-kernel breakdown (sorted by VGPR reduction)
Notes on remaining register-capped kernels: The 4 SGPR increase is expected and beneficial. SGPRs increased by ~50% because scalar registers now hold uniform kernel argument data that previously consumed vector registers (via a stripped inline-code span — likely per-lane flat loads; original text lost in extraction).

Implementation details — Key bug fix:
|
Run InferAddressSpaces (with TargetMachine) after add_kernarg_address_spaces! to propagate addrspace(4) through addrspacecast chains. Follow up with SROA, InstCombine, EarlyCSE, and SimplifyCFG to clean up newly-exposed opportunities. The earlier illegal address errors were caused by byref attribute loss in clone_into!, not by InferAddressSpaces itself. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
src/gcn.jl
Outdated
| return entry | ||
| end | ||
|
|
||
| # Rewrite byref kernel parameters from flat (addrspace 0) to kernarg (addrspace 4). |
There was a problem hiding this comment.
Small nit, addrspace 4 is the constant one and not kernarg.
src/gcn.jl
Outdated
| # Clang emits byref parameters as `ptr addrspace(4)` from the frontend, but Julia's | ||
| # RemoveJuliaAddrspacesPass strips all address spaces to flat. This pass restores the |
There was a problem hiding this comment.
This part of the comment makes no sense it seems to imply that somehow Julia emits addrspace(4) and then strips it. Julia doesn't known that it ought to emit in addrspace(4) (which we could fix), since we already fixed alloca addrspace emission.
There was a problem hiding this comment.
The comment is half wrong. The argument is first emitted as addresspace(11). Which gets stripped to 0 and then we need to replace it to 4
vchuravy
left a comment
There was a problem hiding this comment.
Looks generally good, but we should add more tests :)
- Fix addrspace(4) comment: it's the "constant" address space, not "kernarg" - Rewrite doc comment to accurately describe the Julia → AS(11) → AS(0) → AS(4) flow - Add InferAddressSpaces + SROA + InstCombine + EarlyCSE after kernarg rewrite - Add tests: byref attribute preservation, mixed byref/scalar params, programmatic IR inspection via compile(:llvm), zero scratch spills - Apply Runic formatting to new code Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
The @dispose macro on older LLVM.jl requires `pb=expr()` without spaces, not `pb = expr()`. The Runic-suggested formatting breaks precompilation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
On LLVM 16 (Julia ≤1.11) with typed pointers, add_kernarg_address_spaces!
was creating opaque `ptr addrspace(4)` via `LLVM.PointerType(4)`, which
introduced opaque pointers into a typed-pointer module. When
InferAddressSpaces then propagated these through memcpy intrinsics
(which use typed `i8 addrspace(N)*`), LLVM's verifier rejected the
type mismatch.
Fix by following Metal's pattern: use `LLVM.PointerType(eltype, 4)`
on typed-pointer contexts and the original param type as the
addrspacecast target.
Also fix tests: Ptr{Float64} is lowered to i64 on Julia ≤1.11 and
ptr on 1.12+, so use {{(i64|ptr)}} in filecheck and check for
non-byref params instead of non-AS4 pointer params.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
classify_arguments can fail post-optimization on typed-pointer LLVM because convert(LLVMType, source_typ) may produce a different element type than the post-optimization codegen type, hitting the assertion at irgen.jl:322. This was triggered by the Symbols test in AMDGPU.jl. Instead of re-classifying arguments, just check for the byref attribute directly on each parameter — we know it's present because irgen.jl added it. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Summary
- Replace byval + lower_byval (FCA expansion) with byref for GCN kernel arguments
- Add a pass_by_ref interface method (defaults to false; GCN overrides to true)
- Rewrite byref pointer parameters to addrspace(4) (kernarg) with addrspacecasts back to generic for the function body

Motivation
On AMDGPU, kernel arguments reside in the read-only kernarg segment. The current pipeline:
- irgen adds byval attributes to aggregate parameters
- lower_byval replaces ptr byval(T) with the FCA type directly
- this extractvalues every field and stores the entire struct into an alloca in scratch memory (addrspace 5)
ImmersedBoundaryGrid — hundreds of bytes with ~28 array descriptors), this produces 78 scratch stores but only 4 scratch loads. ~95% of the stores are dead, but LLVM's DSE/SROA can't eliminate them from the massive nested aggregate.

With byref, the pointer semantics are preserved. LLVM generates s_load directly from the kernarg segment on demand, only loading fields that are actually used. The invariant.load and TBAA metadata Julia emits remain valid since kernarg memory is immutable.

Test plan: checked the mask_immersed_field! kernel to confirm reduced scratch usage.

🤖 Generated with Claude Code