From d0fbeb8293ee3c3cc5f7a390bd51696410f12394 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 11:34:58 -0300 Subject: [PATCH 01/14] GCN: use byref instead of byval+lower_byval for kernel arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On AMDGPU, kernel arguments already reside in the read-only kernarg segment. The current pipeline adds `byval` attributes and then `lower_byval` expands them into first-class aggregates (FCAs), which forces LLVM to extractvalue every field and store the entire struct into scratch memory via alloca — even when only a few fields are used. For large structs (e.g. Oceananigans' ImmersedBoundaryGrid), this produces dozens of dead scratch stores. Using `byref` instead keeps the pointer semantics, allowing LLVM to generate scalar loads directly from the kernarg segment on demand. The invariant.load and TBAA metadata that Julia emits remain valid since the kernarg memory is immutable. The byref pointer parameters are rewritten to addrspace(4) (AMDGPU constant/kernarg address space), with addrspacecasts inserted so the function body can continue using generic pointers. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 88 ++++++++++++++++++++++++++++++++++++++++++++++-- src/interface.jl | 6 ++++ src/irgen.jl | 6 +++- 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 146d9a33..578a6b4c 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -40,6 +40,90 @@ runtime_slug(job::CompilerJob{GCNCompilerTarget}) = "gcn-$(job.config.target.dev const gcn_intrinsics = () # TODO: ("vprintf", "__assertfail", "malloc", "free") isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsics) +pass_by_ref(@nospecialize(job::CompilerJob{GCNCompilerTarget})) = true + +# AMDGPU constant/kernarg address space +const GCN_ADDRSPACE_CONSTANT = 4 + +# Rewrite byref pointer parameters from addrspace 0 to addrspace 4 (kernarg), +# inserting addrspacecasts so the function body can continue using generic pointers. +function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), + mod::LLVM.Module, f::LLVM.Function) + ft = function_type(f) + + # find byref parameters + byref = BitVector(undef, length(parameters(ft))) + for i in 1:length(byref) + byref[i] = false + for attr in collect(parameter_attributes(f, i)) + if kind(attr) == kind(TypeAttribute("byref", LLVM.VoidType())) + byref[i] = true + end + end + end + any(byref) || return f + + # build new function type with addrspace(4) pointers for byref params + new_types = LLVMType[] + for (i, param) in enumerate(parameters(ft)) + if byref[i] + push!(new_types, LLVM.PointerType(GCN_ADDRSPACE_CONSTANT)) + else + push!(new_types, param) + end + end + new_ft = LLVM.FunctionType(return_type(ft), new_types) + new_f = LLVM.Function(mod, "", new_ft) + linkage!(new_f, linkage(f)) + callconv!(new_f, callconv(f)) + for (arg, new_arg) in zip(parameters(f), parameters(new_f)) + LLVM.name!(new_arg, LLVM.name(arg)) + end + + # copy parameter attributes + for (i, _) in enumerate(parameters(ft)) + for attr in collect(parameter_attributes(f, i)) 
+ push!(parameter_attributes(new_f, i), attr) + end + end + + # insert addrspacecasts in entry block + new_args = LLVM.Value[] + @dispose builder=IRBuilder() begin + entry = BasicBlock(new_f, "conversion") + position!(builder, entry) + + for (i, param) in enumerate(parameters(ft)) + if byref[i] + # cast from addrspace(4) to addrspace(0) for the function body + ptr = addrspacecast!(builder, parameters(new_f)[i], param) + push!(new_args, ptr) + else + push!(new_args, parameters(new_f)[i]) + end + end + + value_map = Dict{LLVM.Value, LLVM.Value}( + param => new_args[i] for (i, param) in enumerate(parameters(f)) + ) + value_map[f] = new_f + clone_into!(new_f, f; value_map, + changes=LLVM.API.LLVMCloneFunctionChangeTypeGlobalChanges) + + br!(builder, blocks(new_f)[2]) + end + + # replace old function + fn = LLVM.name(f) + prune_constexpr_uses!(f) + @assert isempty(uses(f)) + replace_metadata_uses!(f, new_f) + erase!(f) + LLVM.name!(new_f, fn) + + return new_f +end + function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, entry::LLVM.Function) lower_throw_extra!(mod) @@ -48,8 +132,8 @@ function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), # calling convention callconv!(entry, LLVM.API.LLVMAMDGPUKERNELCallConv) - # work around bad byval codegen (JuliaGPU/GPUCompiler.jl#92) - entry = lower_byval(job, mod, entry) + # rewrite byref parameters to use the kernarg address space + entry = rewrite_byref_addrspaces!(job, mod, entry) end return entry diff --git a/src/interface.jl b/src/interface.jl index 21ddcf57..fc65c888 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -272,6 +272,12 @@ kernel_state_type(@nospecialize(job::CompilerJob)) = Nothing # Does the target need to pass kernel arguments by value? pass_by_value(@nospecialize(job::CompilerJob)) = true +# Should the target use byref instead of byval+lower_byval for kernel arguments? 
+# When true, aggregate arguments are passed as pointers with the byref attribute, +# allowing the backend to load fields directly from the argument memory (e.g. kernarg +# segment on AMDGPU) instead of materializing the entire struct via first-class aggregates. +pass_by_ref(@nospecialize(job::CompilerJob)) = false + # whether pointer is a valid call target valid_function_pointer(@nospecialize(job::CompilerJob), ptr::Ptr{Cvoid}) = false diff --git a/src/irgen.jl b/src/irgen.jl index 5149e9f0..744904e5 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -94,7 +94,11 @@ function irgen(@nospecialize(job::CompilerJob)) for arg in args if arg.cc == BITS_REF llvm_typ = convert(LLVMType, arg.typ) - attr = TypeAttribute("byval", llvm_typ) + if pass_by_ref(job) + attr = TypeAttribute("byref", llvm_typ) + else + attr = TypeAttribute("byval", llvm_typ) + end push!(parameter_attributes(entry, arg.idx), attr) end end From 5e07342732175b156b7d54529ccf5236733d3eb5 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 12:02:17 -0300 Subject: [PATCH 02/14] Fix byref rewrite: load from addrspace(4) into alloca instead of addrspacecast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The addrspacecast from addrspace(4) to addrspace(0) caused "illegal VGPR to SGPR copy" errors because LLVM couldn't properly lower generic pointer accesses back to the constant address space. Instead, follow Metal's approach: load the struct from the addrspace(4) kernarg pointer into a local alloca (addrspace 5), and let SROA decompose it during the optimization pipeline. This avoids the address space mismatch while still benefiting from byref semantics — the load from addrspace(4) is a scalar load from the kernarg segment, and SROA will eliminate dead fields. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 578a6b4c..cea66e02 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -46,13 +46,20 @@ pass_by_ref(@nospecialize(job::CompilerJob{GCNCompilerTarget})) = true const GCN_ADDRSPACE_CONSTANT = 4 # Rewrite byref pointer parameters from addrspace 0 to addrspace 4 (kernarg), -# inserting addrspacecasts so the function body can continue using generic pointers. +# loading the data into local allocas so the function body can use generic pointers. +# SROA will decompose the allocas during the optimization pipeline that follows. function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, f::LLVM.Function) ft = function_type(f) - # find byref parameters + # find byref parameters and their types + args = classify_arguments(job, ft) + filter!(args) do arg + arg.cc != GHOST + end + byref = BitVector(undef, length(parameters(ft))) + byref_types = Vector{Any}(undef, length(parameters(ft))) for i in 1:length(byref) byref[i] = false for attr in collect(parameter_attributes(f, i)) @@ -61,6 +68,11 @@ function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTar end end end + for arg in args + if arg.idx !== nothing && byref[arg.idx] + byref_types[arg.idx] = arg.typ + end + end any(byref) || return f # build new function type with addrspace(4) pointers for byref params @@ -87,7 +99,7 @@ function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTar end end - # insert addrspacecasts in entry block + # load byref arguments from addrspace(4) into local allocas new_args = LLVM.Value[] @dispose builder=IRBuilder() begin entry = BasicBlock(new_f, "conversion") @@ -95,8 +107,13 @@ function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTar for (i, param) in enumerate(parameters(ft)) if byref[i] - # cast from 
addrspace(4) to addrspace(0) for the function body - ptr = addrspacecast!(builder, parameters(new_f)[i], param) + # load the value from the kernarg pointer and store into a stack slot, + # so the function body can keep using addrspace(0) pointers. + # SROA will decompose this during optimization. + llvm_typ = convert(LLVMType, byref_types[i]) + val = load!(builder, llvm_typ, parameters(new_f)[i]) + ptr = alloca!(builder, llvm_typ) + store!(builder, val, ptr) push!(new_args, ptr) else push!(new_args, parameters(new_f)[i]) From 2cce5a4a6d6628bd31eb5ec60965aa88b39df34f Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 12:05:33 -0300 Subject: [PATCH 03/14] Ensure byref attribute is preserved on rewritten parameters The byref TypeAttribute may be dropped when copying attributes to the new function with changed parameter types (ptr -> ptr addrspace(4)). Explicitly re-add it to ensure the AMDGPU backend knows the kernarg contains the struct data inline, not a pointer to it. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gcn.jl b/src/gcn.jl index cea66e02..8da94f7c 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -92,11 +92,17 @@ function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTar LLVM.name!(new_arg, LLVM.name(arg)) end - # copy parameter attributes + # copy parameter attributes, ensuring byref is preserved with correct type for (i, _) in enumerate(parameters(ft)) for attr in collect(parameter_attributes(f, i)) push!(parameter_attributes(new_f, i), attr) end + # explicitly re-add byref with the correct type, in case the copy + # dropped it due to the parameter type change + if byref[i] + llvm_typ = convert(LLVMType, byref_types[i]) + push!(parameter_attributes(new_f, i), TypeAttribute("byref", llvm_typ)) + end end # load byref arguments from addrspace(4) into local allocas From 047e172360b8e2ffbb38a9a540a29e6d6c4d12a9 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 12:09:40 -0300 Subject: [PATCH 04/14] Simplify: let AMDGPU backend handle byref natively MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove manual rewrite_byref_addrspaces!. The AMDGPU backend's AMDGPULowerKernelArguments pass already knows how to handle ptr byref(T) on amdgpu_kernel functions — it rewrites the pointer to load from the kernarg segment (addrspace 4) automatically. The previous manual approaches (addrspacecast, load→alloca→store) conflicted with the backend's own lowering. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 110 ++--------------------------------------------------- 1 file changed, 3 insertions(+), 107 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 8da94f7c..4164ea8b 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -42,111 +42,6 @@ isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsic pass_by_ref(@nospecialize(job::CompilerJob{GCNCompilerTarget})) = true -# AMDGPU constant/kernarg address space -const GCN_ADDRSPACE_CONSTANT = 4 - -# Rewrite byref pointer parameters from addrspace 0 to addrspace 4 (kernarg), -# loading the data into local allocas so the function body can use generic pointers. -# SROA will decompose the allocas during the optimization pipeline that follows. -function rewrite_byref_addrspaces!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), - mod::LLVM.Module, f::LLVM.Function) - ft = function_type(f) - - # find byref parameters and their types - args = classify_arguments(job, ft) - filter!(args) do arg - arg.cc != GHOST - end - - byref = BitVector(undef, length(parameters(ft))) - byref_types = Vector{Any}(undef, length(parameters(ft))) - for i in 1:length(byref) - byref[i] = false - for attr in collect(parameter_attributes(f, i)) - if kind(attr) == kind(TypeAttribute("byref", LLVM.VoidType())) - byref[i] = true - end - end - end - for arg in args - if arg.idx !== nothing && byref[arg.idx] - byref_types[arg.idx] = arg.typ - end - end - any(byref) || return f - - # build new function type with addrspace(4) pointers for byref params - new_types = LLVMType[] - for (i, param) in enumerate(parameters(ft)) - if byref[i] - push!(new_types, LLVM.PointerType(GCN_ADDRSPACE_CONSTANT)) - else - push!(new_types, param) - end - end - new_ft = LLVM.FunctionType(return_type(ft), new_types) - new_f = LLVM.Function(mod, "", new_ft) - linkage!(new_f, linkage(f)) - callconv!(new_f, callconv(f)) - for (arg, new_arg) in zip(parameters(f), parameters(new_f)) - LLVM.name!(new_arg, 
LLVM.name(arg)) - end - - # copy parameter attributes, ensuring byref is preserved with correct type - for (i, _) in enumerate(parameters(ft)) - for attr in collect(parameter_attributes(f, i)) - push!(parameter_attributes(new_f, i), attr) - end - # explicitly re-add byref with the correct type, in case the copy - # dropped it due to the parameter type change - if byref[i] - llvm_typ = convert(LLVMType, byref_types[i]) - push!(parameter_attributes(new_f, i), TypeAttribute("byref", llvm_typ)) - end - end - - # load byref arguments from addrspace(4) into local allocas - new_args = LLVM.Value[] - @dispose builder=IRBuilder() begin - entry = BasicBlock(new_f, "conversion") - position!(builder, entry) - - for (i, param) in enumerate(parameters(ft)) - if byref[i] - # load the value from the kernarg pointer and store into a stack slot, - # so the function body can keep using addrspace(0) pointers. - # SROA will decompose this during optimization. - llvm_typ = convert(LLVMType, byref_types[i]) - val = load!(builder, llvm_typ, parameters(new_f)[i]) - ptr = alloca!(builder, llvm_typ) - store!(builder, val, ptr) - push!(new_args, ptr) - else - push!(new_args, parameters(new_f)[i]) - end - end - - value_map = Dict{LLVM.Value, LLVM.Value}( - param => new_args[i] for (i, param) in enumerate(parameters(f)) - ) - value_map[f] = new_f - clone_into!(new_f, f; value_map, - changes=LLVM.API.LLVMCloneFunctionChangeTypeGlobalChanges) - - br!(builder, blocks(new_f)[2]) - end - - # replace old function - fn = LLVM.name(f) - prune_constexpr_uses!(f) - @assert isempty(uses(f)) - replace_metadata_uses!(f, new_f) - erase!(f) - LLVM.name!(new_f, fn) - - return new_f -end - function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, entry::LLVM.Function) lower_throw_extra!(mod) @@ -155,8 +50,9 @@ function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), # calling convention callconv!(entry, LLVM.API.LLVMAMDGPUKERNELCallConv) - # rewrite byref 
parameters to use the kernarg address space - entry = rewrite_byref_addrspaces!(job, mod, entry) + # with byref, the AMDGPU backend's AMDGPULowerKernelArguments pass + # will handle loading from the kernarg segment directly. + # no need for lower_byval or manual rewriting. end return entry From 1c2064756755f23eaa3a462db7ffa300ea6d8825 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 17:39:05 -0300 Subject: [PATCH 05/14] GCN: rewrite byref kernel params to addrspace(4) for s_load Add finish_ir! for GCN that rewrites byref kernel parameters from flat (addrspace 0) to kernarg (addrspace 4) after optimization. Clang emits byref params as ptr addrspace(4) from the frontend, but Julia's RemoveJuliaAddrspacesPass strips them to flat. This causes struct field loads to use flat_load instead of s_load. The pass creates a new function with ptr addrspace(4) parameters, inserts addrspacecasts back to flat for the cloned IR, then runs InferAddressSpaces to propagate addrspace(4) through all GEPs and loads. The result is that all kernel argument struct field accesses become s_load (scalar, cached, one per wavefront) instead of flat_load (per-lane, address disambiguation overhead). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 4164ea8b..19aa146b 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -49,15 +49,126 @@ function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), if job.config.kernel # calling convention callconv!(entry, LLVM.API.LLVMAMDGPUKERNELCallConv) - - # with byref, the AMDGPU backend's AMDGPULowerKernelArguments pass - # will handle loading from the kernarg segment directly. - # no need for lower_byval or manual rewriting. 
end return entry end +function finish_ir!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, + entry::LLVM.Function) + if job.config.kernel + entry = add_kernarg_address_spaces!(job, mod, entry) + end + return entry +end + +# Rewrite byref kernel parameters from flat (addrspace 0) to kernarg (addrspace 4). +# +# On AMDGPU, the kernarg segment is in address space 4 and is scalar-loadable via s_load. +# Clang emits byref parameters as `ptr addrspace(4)` from the frontend, but Julia's +# RemoveJuliaAddrspacesPass strips all address spaces to flat. This pass restores the +# correct address space so that struct field loads from byref arguments become s_load +# instead of flat_load. +# +# NOTE: must run after optimization, where RemoveJuliaAddrspacesPass has already +# converted Julia's addrspace(11) to flat (addrspace 0) on these parameters. +function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM.Module, + f::LLVM.Function) + ft = function_type(f) + + # find the byref parameters + byref_mask = BitVector(undef, length(parameters(ft))) + args = classify_arguments(job, ft; post_optimization=job.config.optimize) + filter!(args) do arg + arg.cc != GHOST + end + for arg in args + byref_mask[arg.idx] = (arg.cc == BITS_REF || arg.cc == KERNEL_STATE) + end + + # check if any flat pointer byref params need rewriting + needs_rewrite = false + for (i, param) in enumerate(parameters(ft)) + if byref_mask[i] && param isa LLVM.PointerType && addrspace(param) == 0 + needs_rewrite = true + break + end + end + needs_rewrite || return f + + # generate the new function type with kernarg address space on byref params + new_types = LLVMType[] + for (i, param) in enumerate(parameters(ft)) + if byref_mask[i] && param isa LLVM.PointerType && addrspace(param) == 0 + push!(new_types, LLVM.PointerType(#=kernarg=# 4)) + else + push!(new_types, param) + end + end + new_ft = LLVM.FunctionType(return_type(ft), new_types) + new_f = LLVM.Function(mod, "", 
new_ft) + linkage!(new_f, linkage(f)) + for (arg, new_arg) in zip(parameters(f), parameters(new_f)) + LLVM.name!(new_arg, LLVM.name(arg)) + end + + # insert addrspacecasts from kernarg (4) back to flat (0) so that the cloned IR + # (which expects flat pointers) continues to work. InferAddressSpaces will then + # propagate addrspace(4) through GEPs and loads, eliminating the casts. + new_args = LLVM.Value[] + @dispose builder=IRBuilder() begin + entry_bb = BasicBlock(new_f, "conversion") + position!(builder, entry_bb) + + for (i, param) in enumerate(parameters(ft)) + if byref_mask[i] && param isa LLVM.PointerType && addrspace(param) == 0 + cast = addrspacecast!(builder, parameters(new_f)[i], LLVM.PointerType(0)) + push!(new_args, cast) + else + push!(new_args, parameters(new_f)[i]) + end + for attr in collect(parameter_attributes(f, i)) + push!(parameter_attributes(new_f, i), attr) + end + end + + # clone the original function body + value_map = Dict{LLVM.Value, LLVM.Value}( + param => new_args[i] for (i, param) in enumerate(parameters(f)) + ) + value_map[f] = new_f + clone_into!(new_f, f; value_map, + changes=LLVM.API.LLVMCloneFunctionChangeTypeGlobalChanges) + + # fall through from conversion block to cloned entry + br!(builder, blocks(new_f)[2]) + end + + # replace the old function + fn = LLVM.name(f) + prune_constexpr_uses!(f) + @assert isempty(uses(f)) + replace_metadata_uses!(f, new_f) + erase!(f) + LLVM.name!(new_f, fn) + + # propagate addrspace(4) through GEPs and loads, then clean up + @dispose pb=NewPMPassBuilder() begin + add!(pb, NewPMFunctionPassManager()) do fpm + add!(fpm, InferAddressSpacesPass()) + end + add!(pb, NewPMFunctionPassManager()) do fpm + add!(fpm, SimplifyCFGPass()) + add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) + add!(fpm, InstCombinePass()) + end + run!(pb, mod) + end + + return functions(mod)[fn] +end + ## LLVM passes From 9cc22e3efe033deea3f002a440728b86fa3bc457 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 
2026 17:46:20 -0300 Subject: [PATCH 06/14] Pass TargetMachine to InferAddressSpaces in finish_ir! cleanup InferAddressSpaces needs TargetTransformInfo to determine the flat address space (0 on AMDGPU). Without passing a TargetMachine, the pass has no TTI and skips all promotions. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 19aa146b..947d370c 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -152,7 +152,10 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. erase!(f) LLVM.name!(new_f, fn) - # propagate addrspace(4) through GEPs and loads, then clean up + # propagate addrspace(4) through GEPs and loads, then clean up. + # InferAddressSpaces needs TargetTransformInfo (via TargetMachine) to know + # that flat address space is 0 on AMDGPU. + tm = llvm_machine(job.config.target) @dispose pb=NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, InferAddressSpacesPass()) @@ -163,8 +166,9 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. add!(fpm, EarlyCSEPass()) add!(fpm, InstCombinePass()) end - run!(pb, mod) + run!(pb, mod, tm) end + dispose(tm) return functions(mod)[fn] end From 6e80b8b25636b75423a31601e418419bdf3d75c4 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 18:31:00 -0300 Subject: [PATCH 07/14] Fix byref attribute loss: copy param attrs after clone_into! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CloneFunctionInto rebuilds the AttributeList from scratch using VMap. For byref params, VMap maps old args to addrspacecast instructions (not Arguments), so dyn_cast fails and byref(T) is silently dropped. The subsequent setAttributes() overwrites any attrs we set before clone_into!. 
Without byref, the backend emits global_buffer(8) metadata instead of by_value(sizeof(T)), causing HIP to copy only 8 bytes of struct data as a "pointer" — leading to illegal address errors at runtime. Also remove InferAddressSpaces (rely on AMDGPULowerKernelArguments in codegen to trace addrspacecast chains for s_load). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 947d370c..27d8cb08 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -113,8 +113,8 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. end # insert addrspacecasts from kernarg (4) back to flat (0) so that the cloned IR - # (which expects flat pointers) continues to work. InferAddressSpaces will then - # propagate addrspace(4) through GEPs and loads, eliminating the casts. + # (which expects flat pointers) continues to work. The AMDGPU backend's + # AMDGPULowerKernelArguments traces these casts and produces s_load. new_args = LLVM.Value[] @dispose builder=IRBuilder() begin entry_bb = BasicBlock(new_f, "conversion") @@ -127,9 +127,6 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. else push!(new_args, parameters(new_f)[i]) end - for attr in collect(parameter_attributes(f, i)) - push!(parameter_attributes(new_f, i), attr) - end end # clone the original function body @@ -144,6 +141,16 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. br!(builder, blocks(new_f)[2]) end + # copy parameter attributes AFTER clone_into!, because CloneFunctionInto + # overwrites all attributes via setAttributes. For byref params, the VMap + # maps old args to addrspacecast instructions (not Arguments), so LLVM's + # attribute remapping silently drops them. We must re-add them here. 
+ for i in 1:length(parameters(ft)) + for attr in collect(parameter_attributes(f, i)) + push!(parameter_attributes(new_f, i), attr) + end + end + # replace the old function fn = LLVM.name(f) prune_constexpr_uses!(f) @@ -152,23 +159,18 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. erase!(f) LLVM.name!(new_f, fn) - # propagate addrspace(4) through GEPs and loads, then clean up. - # InferAddressSpaces needs TargetTransformInfo (via TargetMachine) to know - # that flat address space is 0 on AMDGPU. - tm = llvm_machine(job.config.target) + # clean up the extra conversion block. + # NOTE: we do NOT run InferAddressSpaces here — the AMDGPU backend's + # AMDGPULowerKernelArguments pass traces addrspacecast chains during codegen + # and correctly produces s_load for addrspace(4) provenance. Running + # InferAddressSpaces with a TargetMachine can over-propagate addrspace(4) + # into pointer values loaded from the struct (which should remain flat/global). @dispose pb=NewPMPassBuilder() begin - add!(pb, NewPMFunctionPassManager()) do fpm - add!(fpm, InferAddressSpacesPass()) - end add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, SimplifyCFGPass()) - add!(fpm, SROAPass()) - add!(fpm, EarlyCSEPass()) - add!(fpm, InstCombinePass()) end - run!(pb, mod, tm) + run!(pb, mod) end - dispose(tm) return functions(mod)[fn] end From 14e010b2256d0bf570e422f1c8651bf9b9e11d76 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 19:33:42 -0300 Subject: [PATCH 08/14] Add tests for kernarg addrspace(4) rewriting on GCN Verify that byref struct kernel parameters are rewritten to ptr addrspace(4) in IR and that the backend emits s_load (not flat_load) for struct field access. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- test/gcn.jl | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/gcn.jl b/test/gcn.jl index 95641a44..3f08e518 100644 --- a/test/gcn.jl +++ b/test/gcn.jl @@ -37,6 +37,32 @@ end end end +@testset "kernarg address space for byref parameters" begin + mod = @eval module $(gensym()) + struct MyStruct + x::Float64 + y::Float64 + end + + function kernel(s::MyStruct) + s.x + s.y + return + end + end + + # byref struct params should be ptr addrspace(4) in kernel IR + @test @filecheck begin + check"CHECK: define amdgpu_kernel void @_Z6kernel8MyStruct(ptr addrspace(4)" + GCN.code_llvm(mod.kernel, Tuple{mod.MyStruct}; dump_module=true, kernel=true) + end + + # non-kernel should NOT have addrspace(4) + @test @filecheck begin + check"CHECK-NOT: addrspace(4)" + GCN.code_llvm(mod.kernel, Tuple{mod.MyStruct}; dump_module=true, kernel=false) + end +end + @testset "https://github.com/JuliaGPU/AMDGPU.jl/issues/846" begin ir, rt = GCN.code_typed((Tuple{Tuple{Val{4}}, Tuple{Float32}},); always_inline=true) do t t[1] @@ -49,6 +75,26 @@ end ############################################################################################ @testset "assembly" begin +@testset "s_load for kernarg struct access" begin + mod = @eval module $(gensym()) + struct MyStruct + x::Float64 + y::Float64 + end + + function kernel(s::MyStruct, out::Ptr{Float64}) + unsafe_store!(out, s.x + s.y) + return + end + end + + @test @filecheck begin + check"CHECK: s_load_dwordx" + check"CHECK-NOT: flat_load" + GCN.code_native(mod.kernel, Tuple{mod.MyStruct, Ptr{Float64}}; kernel=true) + end +end + @testset "skip scalar trap" begin mod = @eval module $(gensym()) workitem_idx_x() = ccall("llvm.amdgcn.workitem.id.x", llvmcall, Int32, ()) From 3caabbf6388d739e25977e360f9b1f3e5091e749 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 20:16:04 -0300 Subject: [PATCH 09/14] Add InferAddressSpaces + optimization 
passes after kernarg rewrite Run InferAddressSpaces (with TargetMachine) after add_kernarg_address_spaces! to propagate addrspace(4) through addrspacecast chains. Follow up with SROA, InstCombine, EarlyCSE, and SimplifyCFG to clean up newly-exposed opportunities. The earlier illegal address errors were caused by byref attribute loss in clone_into!, not by InferAddressSpaces itself. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 27d8cb08..013e288b 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -58,6 +58,20 @@ function finish_ir!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLV entry::LLVM.Function) if job.config.kernel entry = add_kernarg_address_spaces!(job, mod, entry) + + # optimize after address space rewriting: propagate addrspace(4) through + # the addrspacecast chains, then clean up newly-exposed opportunities + tm = llvm_machine(job.config.target) + @dispose pb=NewPMPassBuilder() tm begin + add!(pb, NewPMFunctionPassManager()) do fpm + add!(fpm, InferAddressSpacesPass()) + add!(fpm, SROAPass()) + add!(fpm, InstCombinePass()) + add!(fpm, EarlyCSEPass()) + add!(fpm, SimplifyCFGPass()) + end + run!(pb, mod, tm) + end end return entry end @@ -159,12 +173,7 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. erase!(f) LLVM.name!(new_f, fn) - # clean up the extra conversion block. - # NOTE: we do NOT run InferAddressSpaces here — the AMDGPU backend's - # AMDGPULowerKernelArguments pass traces addrspacecast chains during codegen - # and correctly produces s_load for addrspace(4) provenance. Running - # InferAddressSpaces with a TargetMachine can over-propagate addrspace(4) - # into pointer values loaded from the struct (which should remain flat/global). 
+ # clean up the extra conversion block @dispose pb=NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, SimplifyCFGPass()) From f98636843f7b5c4c0540ed1cdbc687e17ba211f6 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 19 Mar 2026 09:43:15 -0300 Subject: [PATCH 10/14] Address review feedback: fix comments, add tests, Runic formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix addrspace(4) comment: it's the "constant" address space, not "kernarg" - Rewrite doc comment to accurately describe the Julia → AS(11) → AS(0) → AS(4) flow - Add InferAddressSpaces + SROA + InstCombine + EarlyCSE after kernarg rewrite - Add tests: byref attribute preservation, mixed byref/scalar params, programmatic IR inspection via compile(:llvm), zero scratch spills - Apply Runic formatting to new code Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 41 +++++++++++-------- test/gcn.jl | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 17 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 013e288b..c52d92bd 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -54,15 +54,17 @@ function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), return entry end -function finish_ir!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, - entry::LLVM.Function) +function finish_ir!( + @nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, + entry::LLVM.Function + ) if job.config.kernel entry = add_kernarg_address_spaces!(job, mod, entry) # optimize after address space rewriting: propagate addrspace(4) through # the addrspacecast chains, then clean up newly-exposed opportunities tm = llvm_machine(job.config.target) - @dispose pb=NewPMPassBuilder() tm begin + @dispose pb = NewPMPassBuilder() tm begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, InferAddressSpacesPass()) add!(fpm, SROAPass()) @@ -76,23 +78,26 @@ 
function finish_ir!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLV return entry end -# Rewrite byref kernel parameters from flat (addrspace 0) to kernarg (addrspace 4). +# Rewrite byref kernel parameters from flat (addrspace 0) to constant (addrspace 4). # -# On AMDGPU, the kernarg segment is in address space 4 and is scalar-loadable via s_load. -# Clang emits byref parameters as `ptr addrspace(4)` from the frontend, but Julia's -# RemoveJuliaAddrspacesPass strips all address spaces to flat. This pass restores the -# correct address space so that struct field loads from byref arguments become s_load -# instead of flat_load. +# On AMDGPU, kernel arguments reside in the constant address space (addrspace 4), +# which is scalar-loadable via s_load. Julia initially emits byref parameters as +# pointers in addrspace(11) (derived), but RemoveJuliaAddrspacesPass strips +# all non-integral address spaces to flat (addrspace 0) during optimization. This pass +# restores addrspace(4) on byref parameters so that the backend can emit s_load +# instead of flat_load for struct field accesses. # # NOTE: must run after optimization, where RemoveJuliaAddrspacesPass has already # converted Julia's addrspace(11) to flat (addrspace 0) on these parameters. -function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM.Module, - f::LLVM.Function) +function add_kernarg_address_spaces!( + @nospecialize(job::CompilerJob), mod::LLVM.Module, + f::LLVM.Function + ) ft = function_type(f) # find the byref parameters byref_mask = BitVector(undef, length(parameters(ft))) - args = classify_arguments(job, ft; post_optimization=job.config.optimize) + args = classify_arguments(job, ft; post_optimization = job.config.optimize) filter!(args) do arg arg.cc != GHOST end @@ -114,7 +119,7 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM.
new_types = LLVMType[] for (i, param) in enumerate(parameters(ft)) if byref_mask[i] && param isa LLVM.PointerType && addrspace(param) == 0 - push!(new_types, LLVM.PointerType(#=kernarg=# 4)) + push!(new_types, LLVM.PointerType(#=constant=# 4)) else push!(new_types, param) end @@ -130,7 +135,7 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. # (which expects flat pointers) continues to work. The AMDGPU backend's # AMDGPULowerKernelArguments traces these casts and produces s_load. new_args = LLVM.Value[] - @dispose builder=IRBuilder() begin + @dispose builder = IRBuilder() begin entry_bb = BasicBlock(new_f, "conversion") position!(builder, entry_bb) @@ -148,8 +153,10 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. param => new_args[i] for (i, param) in enumerate(parameters(f)) ) value_map[f] = new_f - clone_into!(new_f, f; value_map, - changes=LLVM.API.LLVMCloneFunctionChangeTypeGlobalChanges) + clone_into!( + new_f, f; value_map, + changes = LLVM.API.LLVMCloneFunctionChangeTypeGlobalChanges + ) # fall through from conversion block to cloned entry br!(builder, blocks(new_f)[2]) @@ -174,7 +181,7 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM. LLVM.name!(new_f, fn) # clean up the extra conversion block - @dispose pb=NewPMPassBuilder() begin + @dispose pb = NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, SimplifyCFGPass()) end diff --git a/test/gcn.jl b/test/gcn.jl index 3f08e518..ca8e8a68 100644 --- a/test/gcn.jl +++ b/test/gcn.jl @@ -63,6 +63,96 @@ end end end +@testset "byref attribute preserved on kernarg parameters" begin + mod = @eval module $(gensym()) + struct LargeStruct + a::Float64 + b::Float64 + c::Float64 + d::Float64 + end + + function kernel(s::LargeStruct, out::Ptr{Float64}) + unsafe_store!(out, s.a + s.b + s.c + s.d) + return + end + end + + # the byref attribute must survive the addrspace rewrite (clone_into! 
can drop it) + @test @filecheck begin + check"CHECK: byref" + check"CHECK: addrspace(4)" + GCN.code_llvm(mod.kernel, Tuple{mod.LargeStruct, Ptr{Float64}}; + dump_module=true, kernel=true) + end +end + +@testset "mixed byref and scalar kernel parameters" begin + mod = @eval module $(gensym()) + struct Params + x::Float64 + y::Float64 + end + + function kernel(a::Float64, s::Params, out::Ptr{Float64}) + unsafe_store!(out, a + s.x + s.y) + return + end + end + + # scalar Float64 and Ptr should NOT be in addrspace(4), + # only the struct byref param should be + @test @filecheck begin + check"CHECK: define amdgpu_kernel void" + check"CHECK-SAME: double" + check"CHECK-SAME: ptr addrspace(4)" + check"CHECK-SAME: ptr" + GCN.code_llvm(mod.kernel, Tuple{Float64, mod.Params, Ptr{Float64}}; + dump_module=true, kernel=true) + end +end + +@testset "add_kernarg_address_spaces! rewrites IR correctly" begin + mod = @eval module $(gensym()) + struct KernelArgs + x::Float64 + y::Float64 + z::Float64 + end + + function kernel(s::KernelArgs, scale::Float64, out::Ptr{Float64}) + unsafe_store!(out, (s.x + s.y + s.z) * scale) + return + end + end + + job, _ = GCN.create_job(mod.kernel, Tuple{mod.KernelArgs, Float64, Ptr{Float64}}; + kernel=true) + JuliaContext() do ctx + ir, meta = GPUCompiler.compile(:llvm, job) + + entry = meta.entry + ft = function_type(entry) + params = parameters(ft) + + # the struct byref param should be ptr addrspace(4) + has_as4 = any(p -> p isa LLVM.PointerType && addrspace(p) == 4, params) + @test has_as4 + + # non-struct params (double, ptr) should NOT be in addrspace(4) + non_as4_ptrs = filter(params) do p + p isa LLVM.PointerType && addrspace(p) != 4 + end + @test !isempty(non_as4_ptrs) # the Ptr{Float64} out param + + # byref attribute must be present + ir_str = string(ir) + @test occursin("byref", ir_str) + + dispose(ir) + end +end + @testset "https://github.com/JuliaGPU/AMDGPU.jl/issues/846" begin ir, rt = GCN.code_typed((Tuple{Tuple{Val{4}}, 
Tuple{Float32}},); always_inline=true) do t t[1] @@ -88,6 +178,7 @@ end end end + # struct field loads from kernarg should use s_load, not flat_load @test @filecheck begin check"CHECK: s_load_dwordx" check"CHECK-NOT: flat_load" @@ -95,6 +186,27 @@ end end end +@testset "no scratch spills for small struct kernarg" begin + mod = @eval module $(gensym()) + struct SmallStruct + x::Float64 + y::Float64 + end + + function kernel(s::SmallStruct, out::Ptr{Float64}) + unsafe_store!(out, s.x + s.y) + return + end + end + + # a small struct kernel should not need scratch memory + @test @filecheck begin + check"CHECK: .private_segment_fixed_size: 0" + GCN.code_native(mod.kernel, Tuple{mod.SmallStruct, Ptr{Float64}}; + dump_module=true, kernel=true) + end +end + @testset "skip scalar trap" begin mod = @eval module $(gensym()) workitem_idx_x() = ccall("llvm.amdgcn.workitem.id.x", llvmcall, Int32, ()) From 369d6ebe44e06ad43c93c690d0c6cc38b5eb840f Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 19 Mar 2026 11:17:54 -0300 Subject: [PATCH 11/14] Fix @dispose macro: revert space around = for older LLVM.jl compat The @dispose macro on older LLVM.jl requires `pb=expr()` without spaces, not `pb = expr()`. The Runic-suggested formatting breaks precompilation. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index c52d92bd..bfbe572d 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -64,7 +64,7 @@ function finish_ir!( # optimize after address space rewriting: propagate addrspace(4) through # the addrspacecast chains, then clean up newly-exposed opportunities tm = llvm_machine(job.config.target) - @dispose pb = NewPMPassBuilder() tm begin + @dispose pb=NewPMPassBuilder() tm begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, InferAddressSpacesPass()) add!(fpm, SROAPass()) @@ -135,7 +135,7 @@ function add_kernarg_address_spaces!( # (which expects flat pointers) continues to work. The AMDGPU backend's # AMDGPULowerKernelArguments traces these casts and produces s_load. new_args = LLVM.Value[] - @dispose builder = IRBuilder() begin + @dispose builder=IRBuilder() begin entry_bb = BasicBlock(new_f, "conversion") position!(builder, entry_bb) @@ -181,7 +181,7 @@ function add_kernarg_address_spaces!( LLVM.name!(new_f, fn) # clean up the extra conversion block - @dispose pb = NewPMPassBuilder() begin + @dispose pb=NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, SimplifyCFGPass()) end From 7c72b3d223d4192366f31e4a0ca058735b0c05a8 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 19 Mar 2026 11:47:39 -0300 Subject: [PATCH 12/14] Fix @dispose: tm is not a managed resource, just pass to run! 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gcn.jl b/src/gcn.jl index bfbe572d..d6598b4e 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -64,7 +64,7 @@ function finish_ir!( # optimize after address space rewriting: propagate addrspace(4) through # the addrspacecast chains, then clean up newly-exposed opportunities tm = llvm_machine(job.config.target) - @dispose pb=NewPMPassBuilder() tm begin + @dispose pb=NewPMPassBuilder() begin add!(pb, NewPMFunctionPassManager()) do fpm add!(fpm, InferAddressSpacesPass()) add!(fpm, SROAPass()) From 687f79dcfbd2caeaedcc58a28ed9ba7a84dff7ee Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 20 Mar 2026 12:33:18 -0300 Subject: [PATCH 13/14] Fix typed-pointer compat and test failures for kernarg rewrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On LLVM 16 (Julia ≤1.11) with typed pointers, add_kernarg_address_spaces! was creating opaque `ptr addrspace(4)` via `LLVM.PointerType(4)`, which introduced opaque pointers into a typed-pointer module. When InferAddressSpaces then propagated these through memcpy intrinsics (which use typed `i8 addrspace(N)*`), LLVM's verifier rejected the type mismatch. Fix by following Metal's pattern: use `LLVM.PointerType(eltype, 4)` on typed-pointer contexts and the original param type as the addrspacecast target. Also fix tests: Ptr{Float64} is lowered to i64 on Julia ≤1.11 and ptr on 1.12+, so use {{(i64|ptr)}} in filecheck and check for non-byref params instead of non-AS4 pointer params. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 10 +++++++--- test/gcn.jl | 16 ++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index d6598b4e..1c9547f8 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -115,11 +115,15 @@ function add_kernarg_address_spaces!( end needs_rewrite || return f - # generate the new function type with kernarg address space on byref params + # generate the new function type with constant address space on byref params new_types = LLVMType[] for (i, param) in enumerate(parameters(ft)) if byref_mask[i] && param isa LLVM.PointerType && addrspace(param) == 0 - push!(new_types, LLVM.PointerType(#=constant=# 4)) + if supports_typed_pointers(context()) + push!(new_types, LLVM.PointerType(eltype(param), #=constant=# 4)) + else + push!(new_types, LLVM.PointerType(#=constant=# 4)) + end else push!(new_types, param) end @@ -141,7 +145,7 @@ function add_kernarg_address_spaces!( for (i, param) in enumerate(parameters(ft)) if byref_mask[i] && param isa LLVM.PointerType && addrspace(param) == 0 - cast = addrspacecast!(builder, parameters(new_f)[i], LLVM.PointerType(0)) + cast = addrspacecast!(builder, parameters(new_f)[i], param) push!(new_args, cast) else push!(new_args, parameters(new_f)[i]) diff --git a/test/gcn.jl b/test/gcn.jl index ca8e8a68..5b49cf59 100644 --- a/test/gcn.jl +++ b/test/gcn.jl @@ -100,13 +100,14 @@ end end end - # scalar Float64 and Ptr should NOT be in addrspace(4), - # only the struct byref param should be + # scalar Float64 should NOT be in addrspace(4), + # only the struct byref param should be. + # NOTE: Ptr{Float64} is lowered to i64 on Julia ≤1.11 and ptr on Julia 1.12+. 
@test @filecheck begin check"CHECK: define amdgpu_kernel void" check"CHECK-SAME: double" check"CHECK-SAME: ptr addrspace(4)" - check"CHECK-SAME: ptr" + check"CHECK-SAME: {{(i64|ptr)}}" GCN.code_llvm(mod.kernel, Tuple{Float64, mod.Params, Ptr{Float64}}; dump_module=true, kernel=true) end @@ -139,11 +140,10 @@ end has_as4 = any(p -> p isa LLVM.PointerType && addrspace(p) == 4, params) @test has_as4 - # non-struct params (double, ptr) should NOT be in addrspace(4) - non_as4_ptrs = filter(params) do p - p isa LLVM.PointerType && addrspace(p) != 4 - end - @test !isempty(non_as4_ptrs) # the Ptr{Float64} out param + # non-struct params (double, and i64/ptr for Ptr{Float64}) should NOT + # be in addrspace(4). Ptr{Float64} is i64 on Julia ≤1.11, ptr on 1.12+. + non_byref = filter(p -> !(p isa LLVM.PointerType && addrspace(p) == 4), params) + @test !isempty(non_byref) # double (and i64 or ptr) params # byref attribute must be present ir_str = string(ir) From b08a764a0efcff63d80596c157c4c7cbda78d10b Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 20 Mar 2026 14:00:35 -0300 Subject: [PATCH 14/14] Detect byref params via attribute instead of classify_arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit classify_arguments can fail post-optimization on typed-pointer LLVM because convert(LLVMType, source_typ) may produce a different element type than the post-optimization codegen type, hitting the assertion at irgen.jl:322. This was triggered by the Symbols test in AMDGPU.jl. Instead of re-classifying arguments, just check for the byref attribute directly on each parameter — we know it's present because irgen.jl added it. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/gcn.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gcn.jl b/src/gcn.jl index 1c9547f8..e310b5c5 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -95,14 +95,14 @@ function add_kernarg_address_spaces!( ) ft = function_type(f) - # find the byref parameters + # find the byref parameters by checking for the byref attribute directly, + # rather than re-classifying arguments (which can fail on typed-pointer LLVM + # due to element type mismatches in classify_arguments assertions). + byref_kind = LLVM.API.LLVMGetEnumAttributeKindForName("byref", 5) byref_mask = BitVector(undef, length(parameters(ft))) - args = classify_arguments(job, ft; post_optimization = job.config.optimize) - filter!(args) do arg - arg.cc != GHOST - end - for arg in args - byref_mask[arg.idx] = (arg.cc == BITS_REF || arg.cc == KERNEL_STATE) + for i in 1:length(parameters(ft)) + attrs = collect(parameter_attributes(f, i)) + byref_mask[i] = any(a -> a isa TypeAttribute && kind(a) == byref_kind, attrs) end # check if any flat pointer byref params need rewriting