From 0e8077e74fead08e6bf4ab87bc272780649fda11 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 30 May 2026 23:35:57 +0200 Subject: [PATCH 1/9] Metal: inline exception-reporting runtime functions before AS inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exception-reporting runtime functions (report_exception and friends) read GPUCompiler's deduced type-name and stack-frame string globals through a generic pointer argument. Out of line, an address-space inference pass can't trace those reads back to the constant globals, so they stay in the flat/generic space — and Metal's shader validator crashes its compiler service on a generic-space load of a constant global. Force-inlining the functions before InferAddressSpaces lets it resolve the reads to the constant globals (a clean constant-space load the validator accepts). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/metal.jl b/src/metal.jl index 7cf9115d..33463228 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -171,6 +171,21 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L entry = add_parameter_address_spaces!(job, mod, entry) entry = add_global_address_spaces!(job, mod, entry) + # the exception-reporting runtime functions read the deduced type-name and + # stack-frame string globals through a generic pointer argument. inline them so the + # address-space inference below can trace those reads back to the constant globals + # and keep them in the constant space: an out-of-line generic-space load of a + # constant global makes Metal's shader validator crash its compiler service. 
+ for f in functions(mod) + if startswith(LLVM.name(f), "gpu_report_exception") + push!(function_attributes(f), EnumAttribute("alwaysinline", 0)) + end + end + @dispose pb=NewPMPassBuilder() begin + add!(pb, AlwaysInlinerPass()) + run!(pb, mod) + end + # propagate specific address spaces through addrspacecast chains introduced # by the rewrites above, so that loads/stores happen in the right address # space (e.g. constant globals in addrspace 2 rather than via a cast to 0, From f291a190f7219ca9fd87f3fc3bef49028f5f22a0 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 09:37:18 +0200 Subject: [PATCH 2/9] Initial implementation. --- src/metal.jl | 164 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 14 deletions(-) diff --git a/src/metal.jl b/src/metal.jl index 33463228..7ddaedbe 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -171,20 +171,14 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L entry = add_parameter_address_spaces!(job, mod, entry) entry = add_global_address_spaces!(job, mod, entry) - # the exception-reporting runtime functions read the deduced type-name and - # stack-frame string globals through a generic pointer argument. inline them so the - # address-space inference below can trace those reads back to the constant globals - # and keep them in the constant space: an out-of-line generic-space load of a - # constant global makes Metal's shader validator crash its compiler service. - for f in functions(mod) - if startswith(LLVM.name(f), "gpu_report_exception") - push!(function_attributes(f), EnumAttribute("alwaysinline", 0)) - end - end - @dispose pb=NewPMPassBuilder() begin - add!(pb, AlwaysInlinerPass()) - run!(pb, mod) - end + # `add_global_address_spaces!` puts constant globals (e.g. 
the deduced exception + # type-name and stack-frame strings) in the constant space, but a global passed to an + # out-of-line runtime function still reaches it through a *generic* pointer parameter, + # so the read is a generic-space load of constant data. Metal's shader validator + # crashes its compiler service on exactly that. Narrow such parameters to the address + # space their callers actually pass (the interprocedural complement to the + # `InferAddressSpaces` run below), so the read happens in the constant space directly. + propagate_argument_address_spaces!(mod) # propagate specific address spaces through addrspacecast chains introduced # by the rewrites above, so that loads/stores happen in the right address @@ -457,6 +451,148 @@ function add_global_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM.M end +# interprocedural address-space narrowing +# +# `add_global_address_spaces!` places constant data in the constant address space, but a +# global handed to an out-of-line function still reaches it through a *generic* pointer +# parameter (GPUCompiler's runtime functions, e.g. the exception reporters, take `Ptr` +# arguments). The callee then reads constant data with a generic-space load — which makes +# Metal's shader validator crash its compiler service. +# +# `InferAddressSpaces` rewrites such generic accesses into the concrete space, but only +# within a function; it cannot cross the call boundary. This pass is its interprocedural +# complement: where every caller passes a constant global for a pointer parameter (as +# `addrspacecast( -> generic)`), it retargets that parameter to the global's address +# space and drops the casts at the call sites. The callee body is left untouched — the +# narrowed parameter is cast straight back to generic on entry — so the rewrite is trivially +# correct; the subsequent `InferAddressSpaces` run then folds that intra-function cast away, +# turning the read into a constant-space load. 
No name matching, no inlining, no per-target +# address-space table: the space is read from the IR, so any back-end can run it. + +# If `v` is an `addrspacecast` (instruction or constant expression) of a constant global from +# a non-generic address space to the generic one, return the global; otherwise `nothing`. +function constant_global_addrspacecast_source(@nospecialize(v)) + (v isa LLVM.Instruction || v isa LLVM.ConstantExpr) || return nothing + opcode(v) == LLVM.API.LLVMAddrSpaceCast || return nothing + addrspace(value_type(v)) == 0 || return nothing + src = operands(v)[1] + (src isa LLVM.GlobalVariable && isconstant(src) && addrspace(value_type(src)) != 0) || + return nothing + return src +end + +function propagate_argument_address_spaces!(mod::LLVM.Module) + changed = false + for f in collect(functions(mod)) + isempty(blocks(f)) && continue # only functions we can rewrite (have a body) + param_types = parameters(function_type(f)) + + # collect call sites; bail unless every use is a direct call we can update + callsites = LLVM.CallInst[] + only_calls = true + for use in uses(f) + v = user(use) + if v isa LLVM.CallInst && called_operand(v) == f + push!(callsites, v) + else + only_calls = false + break + end + end + (only_calls && !isempty(callsites)) || continue + + # for each generic pointer parameter, find the address space its callers agree on + new_addrspaces = fill(-1, length(param_types)) + for (i, pty) in enumerate(param_types) + (pty isa LLVM.PointerType && addrspace(pty) == 0) || continue + as = -1 + for cs in callsites + src = constant_global_addrspacecast_source(arguments(cs)[i]) + if src === nothing + as = -1; break + end + src_as = addrspace(value_type(src)) + as == -1 ? 
(as = src_as) : (as == src_as || (as = -1; break)) + end + as > 0 && (new_addrspaces[i] = as) + end + any(>=(0), new_addrspaces) || continue + + narrow_pointer_parameters!(mod, f, new_addrspaces, callsites) + changed = true + end + return changed +end + +# Clone `f` with the pointer parameters listed in `new_addrspaces` (index => address space, +# `-1` to leave alone) retargeted to those address spaces, casting each retargeted parameter +# back to generic on entry so the cloned body is unchanged. Rewrite `callsites` to pass the +# un-casted source value for each retargeted argument. +function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, + new_addrspaces::Vector{Int}, callsites) + ft = function_type(f) + retarget(pty::LLVM.PointerType, as::Integer) = + supports_typed_pointers(context()) ? LLVM.PointerType(eltype(pty), as) : + LLVM.PointerType(as) + new_types = LLVM.LLVMType[new_addrspaces[i] >= 0 ? + retarget(param_typ::LLVM.PointerType, new_addrspaces[i]) : + param_typ + for (i, param_typ) in enumerate(parameters(ft))] + new_ft = LLVM.FunctionType(return_type(ft), new_types) + + new_f = LLVM.Function(mod, "", new_ft) + linkage!(new_f, linkage(f)) + callconv!(new_f, callconv(f)) + for (old_arg, new_arg) in zip(parameters(f), parameters(new_f)) + LLVM.name!(new_arg, LLVM.name(old_arg)) + end + + # cast each retargeted parameter back to generic so the cloned body keeps using it + # unchanged (InferAddressSpaces folds the cast away afterwards) + @dispose builder=IRBuilder() begin + entry = BasicBlock(new_f, "conversion") + position!(builder, entry) + new_args = LLVM.Value[] + for (i, param_typ) in enumerate(parameters(ft)) + if new_addrspaces[i] >= 0 + push!(new_args, addrspacecast!(builder, parameters(new_f)[i], param_typ)) + else + push!(new_args, parameters(new_f)[i]) + end + end + + value_map = Dict{LLVM.Value, LLVM.Value}( + param => new_args[i] for (i, param) in enumerate(parameters(f))) + value_map[f] = new_f + clone_into!(new_f, f; value_map, 
+ changes=LLVM.API.LLVMCloneFunctionChangeTypeGlobalChanges) + + br!(builder, blocks(new_f)[2]) # fall through to the cloned entry block + end + + # rewrite call sites to pass the un-casted source value for each retargeted argument + @dispose builder=IRBuilder() begin + for cs in callsites + position!(builder, cs) + new_args = LLVM.Value[new_addrspaces[i] >= 0 ? + constant_global_addrspacecast_source(arg) : arg + for (i, arg) in enumerate(arguments(cs))] + new_call = call!(builder, new_ft, new_f, new_args, operand_bundles(cs)) + callconv!(new_call, callconv(cs)) + replace_uses!(cs, new_call) + erase!(cs) + end + end + + fn = LLVM.name(f) + @assert isempty(uses(f)) # every use was a call site we just rewrote + replace_metadata_uses!(f, new_f) + erase!(f) + LLVM.name!(new_f, fn) + return new_f +end + + # value-to-reference conversion # # Metal doesn't support passing values, so we need to convert those to references instead From 4221628dcdcf461f72e84633aca74b95166b6552 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 09:47:46 +0200 Subject: [PATCH 3/9] Metal: only narrow argument address spaces for local functions Changing a function's signature is unsound if it has callers outside the module, so restrict the IPO address-space pass to internal/private linkage. By finish_ir! the pipeline has already internalized everything but the kernel entrypoints, so the targeted runtime helpers still qualify. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/metal.jl b/src/metal.jl index 7ddaedbe..8a10aaf4 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -485,6 +485,15 @@ function propagate_argument_address_spaces!(mod::LLVM.Module) changed = false for f in collect(functions(mod)) isempty(blocks(f)) && continue # only functions we can rewrite (have a body) + + # changing a function's signature is only sound when it has no callers we cannot + # see; require local (internal/private) linkage, which rules out symbols that may + # be called from outside the module. by the time `finish_ir!` runs this, the + # pipeline has already internalized everything except the kernel entrypoints (see + # `InternalizePass` in `driver.jl`), so the runtime helpers we target qualify while + # the externally-visible entry — which has no in-module callers anyway — does not. + linkage(f) in (LLVM.API.LLVMInternalLinkage, LLVM.API.LLVMPrivateLinkage) || continue + param_types = parameters(function_type(f)) # collect call sites; bail unless every use is a direct call we can update From bc8e600e5a0a7b7cd45faf45f510f3de03c76f6e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 09:48:37 +0200 Subject: [PATCH 4/9] Metal: preserve attributes when narrowing argument address spaces clone_into! drops a parameter's attributes when it is remapped to the entry addrspacecast rather than to a new argument, so reattach them to the retargeted parameters. Also carry the call-site attributes over to the rewritten calls. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/metal.jl b/src/metal.jl index 8a10aaf4..c27acca1 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -533,6 +533,23 @@ function propagate_argument_address_spaces!(mod::LLVM.Module) return changed end +# copy the call-site attributes (function/return/per-argument) from `src` onto `dst`. the +# narrowing keeps argument positions unchanged, so they map across one-to-one. +function copy_callsite_attributes!(dst::LLVM.CallInst, src::LLVM.CallInst) + for attr in collect(function_attributes(src)) + push!(function_attributes(dst), attr) + end + for attr in collect(return_attributes(src)) + push!(return_attributes(dst), attr) + end + for i in 1:length(arguments(src)) + for attr in collect(argument_attributes(src, i)) + push!(argument_attributes(dst, i), attr) + end + end + return dst +end + # Clone `f` with the pointer parameters listed in `new_addrspaces` (index => address space, # `-1` to leave alone) retargeted to those address spaces, casting each retargeted parameter # back to generic on entry so the cloned body is unchanged. Rewrite `callsites` to pass the @@ -579,6 +596,17 @@ function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, br!(builder, blocks(new_f)[2]) # fall through to the cloned entry block end + # `clone_into!` copies a parameter's attributes only when it maps to a new *argument*; + # the retargeted parameters map to the entry addrspacecast instead, so their attributes + # (nonnull, dereferenceable, align, ...) are dropped. Reattach them — they remain valid + # for the narrowed (specific-AS) pointer, and the non-retargeted ones are already copied. 
+ for i in 1:length(new_addrspaces) + new_addrspaces[i] >= 0 || continue + for attr in collect(parameter_attributes(f, i)) + push!(parameter_attributes(new_f, i), attr) + end + end + # rewrite call sites to pass the un-casted source value for each retargeted argument @dispose builder=IRBuilder() begin for cs in callsites @@ -588,6 +616,7 @@ function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, for (i, arg) in enumerate(arguments(cs))] new_call = call!(builder, new_ft, new_f, new_args, operand_bundles(cs)) callconv!(new_call, callconv(cs)) + copy_callsite_attributes!(new_call, cs) replace_uses!(cs, new_call) erase!(cs) end From 31bf6562c3e7ae52dccc94af8de0eaec2446a2ab Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 09:50:03 +0200 Subject: [PATCH 5/9] Metal: handle recursion in argument address-space narrowing Cloning remaps a function's recursive self-calls to the clone but leaves them with the old signature. Collect those self-calls from the clone and rewrite them through the same path as the external call sites, so a narrowed function that calls itself stays well-typed. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/metal.jl b/src/metal.jl index c27acca1..9c86d14f 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -550,10 +550,28 @@ function copy_callsite_attributes!(dst::LLVM.CallInst, src::LLVM.CallInst) return dst end +# rewrite a single call so it targets `new_f`/`new_ft`, passing the un-casted source value +# for each retargeted argument (and the original argument otherwise). Preserves calling +# convention, operand bundles and attributes; replaces and erases the old call. +function rewrite_narrowed_call!(builder::IRBuilder, cs::LLVM.CallInst, + new_f::LLVM.Function, new_ft::LLVM.FunctionType, + new_addrspaces::Vector{Int}) + position!(builder, cs) + new_args = LLVM.Value[new_addrspaces[i] >= 0 ? 
+ constant_global_addrspacecast_source(arg) : arg + for (i, arg) in enumerate(arguments(cs))] + new_call = call!(builder, new_ft, new_f, new_args, operand_bundles(cs)) + callconv!(new_call, callconv(cs)) + copy_callsite_attributes!(new_call, cs) + replace_uses!(cs, new_call) + erase!(cs) + return new_call +end + # Clone `f` with the pointer parameters listed in `new_addrspaces` (index => address space, # `-1` to leave alone) retargeted to those address spaces, casting each retargeted parameter # back to generic on entry so the cloned body is unchanged. Rewrite `callsites` to pass the -# un-casted source value for each retargeted argument. +# un-casted source value for each retargeted argument; recursive self-calls are handled too. function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, new_addrspaces::Vector{Int}, callsites) ft = function_type(f) @@ -607,18 +625,23 @@ function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, end end + # if `f` was (directly) recursive, cloning remapped its self-calls to `new_f` but left + # them with the old signature; collect them from the clone so they get rewritten too. + # (these are distinct from the recursive call still sitting in the old `f`, which is in + # `callsites` and gets erased along with `f`.) collect before rewriting so the freshly + # built calls — which also target `new_f` — are not revisited. + self_calls = LLVM.CallInst[] + for bb in blocks(new_f), inst in instructions(bb) + inst isa LLVM.CallInst && called_operand(inst) == new_f && push!(self_calls, inst) + end + # rewrite call sites to pass the un-casted source value for each retargeted argument @dispose builder=IRBuilder() begin for cs in callsites - position!(builder, cs) - new_args = LLVM.Value[new_addrspaces[i] >= 0 ? 
- constant_global_addrspacecast_source(arg) : arg - for (i, arg) in enumerate(arguments(cs))] - new_call = call!(builder, new_ft, new_f, new_args, operand_bundles(cs)) - callconv!(new_call, callconv(cs)) - copy_callsite_attributes!(new_call, cs) - replace_uses!(cs, new_call) - erase!(cs) + rewrite_narrowed_call!(builder, cs, new_f, new_ft, new_addrspaces) + end + for cs in self_calls + rewrite_narrowed_call!(builder, cs, new_f, new_ft, new_addrspaces) end end From 96f0b57c91ea6ee46831e0ba7bc935f611a98001 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 09:50:58 +0200 Subject: [PATCH 6/9] Metal: narrow argument address spaces for any specific-AS source The narrowing only relocates a side-effect-free addrspacecast across the call boundary, so it is correct for any pointer with a known address space, not just constant globals. Drop the global restriction so device data threaded through helpers benefits too, and rename the predicate accordingly. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/metal.jl b/src/metal.jl index 9c86d14f..146a66be 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -453,30 +453,35 @@ end # interprocedural address-space narrowing # -# `add_global_address_spaces!` places constant data in the constant address space, but a -# global handed to an out-of-line function still reaches it through a *generic* pointer -# parameter (GPUCompiler's runtime functions, e.g. the exception reporters, take `Ptr` -# arguments). The callee then reads constant data with a generic-space load — which makes -# Metal's shader validator crash its compiler service. +# `InferAddressSpaces` rewrites a generic (flat) memory access into the concrete address +# space when it can trace the pointer back to an `addrspacecast` from that space — but only +# within a single function. 
A pointer that crosses a call boundary as a generic parameter +# loses that provenance: e.g. `add_global_address_spaces!` puts constant data in the +# constant space, yet a global handed to an out-of-line runtime function (the exception +# reporters take `Ptr` arguments) still arrives through a generic parameter, so the callee +# reads it with a generic-space load — which makes Metal's shader validator crash. # -# `InferAddressSpaces` rewrites such generic accesses into the concrete space, but only -# within a function; it cannot cross the call boundary. This pass is its interprocedural -# complement: where every caller passes a constant global for a pointer parameter (as -# `addrspacecast( -> generic)`), it retargets that parameter to the global's address -# space and drops the casts at the call sites. The callee body is left untouched — the -# narrowed parameter is cast straight back to generic on entry — so the rewrite is trivially -# correct; the subsequent `InferAddressSpaces` run then folds that intra-function cast away, -# turning the read into a constant-space load. No name matching, no inlining, no per-target -# address-space table: the space is read from the IR, so any back-end can run it. - -# If `v` is an `addrspacecast` (instruction or constant expression) of a constant global from -# a non-generic address space to the generic one, return the global; otherwise `nothing`. -function constant_global_addrspacecast_source(@nospecialize(v)) +# This pass is the interprocedural complement. Where every caller passes the same shape of +# value for a generic pointer parameter — `addrspacecast( -> +# generic)` — it retargets the parameter to that space and drops the casts at the call +# sites, casting the parameter straight back to generic on entry so the callee body is left +# untouched. 
That is a pure relocation of a side-effect-free cast across the boundary (the +# source flows in as the argument and the identical pointer is recomputed inside the +# callee), hence trivially correct; the subsequent `InferAddressSpaces` run then folds the +# entry cast away, turning the read into a specific-space load. The source need not be a +# constant global — any pointer whose address space is known qualifies (e.g. device data +# threaded through a helper). No name matching, no inlining, no per-target address-space +# table: the space is read from the IR, so any back-end can run it. + +# If `v` is an `addrspacecast` (instruction or constant expression) of a pointer from a +# specific (non-generic) address space to the generic one, return that source pointer; +# otherwise `nothing`. +function addrspacecast_to_generic_source(@nospecialize(v)) (v isa LLVM.Instruction || v isa LLVM.ConstantExpr) || return nothing opcode(v) == LLVM.API.LLVMAddrSpaceCast || return nothing addrspace(value_type(v)) == 0 || return nothing src = operands(v)[1] - (src isa LLVM.GlobalVariable && isconstant(src) && addrspace(value_type(src)) != 0) || + (value_type(src) isa LLVM.PointerType && addrspace(value_type(src)) != 0) || return nothing return src end @@ -516,7 +521,7 @@ function propagate_argument_address_spaces!(mod::LLVM.Module) (pty isa LLVM.PointerType && addrspace(pty) == 0) || continue as = -1 for cs in callsites - src = constant_global_addrspacecast_source(arguments(cs)[i]) + src = addrspacecast_to_generic_source(arguments(cs)[i]) if src === nothing as = -1; break end @@ -558,7 +563,7 @@ function rewrite_narrowed_call!(builder::IRBuilder, cs::LLVM.CallInst, new_addrspaces::Vector{Int}) position!(builder, cs) new_args = LLVM.Value[new_addrspaces[i] >= 0 ? 
- constant_global_addrspacecast_source(arg) : arg + addrspacecast_to_generic_source(arg) : arg for (i, arg) in enumerate(arguments(cs))] new_call = call!(builder, new_ft, new_f, new_args, operand_bundles(cs)) callconv!(new_call, callconv(cs)) From e6795439be18917b69255ad3fb4db18d6a725f55 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 09:58:54 +0200 Subject: [PATCH 7/9] Metal: test argument address-space narrowing Cover the narrowing of agreeing call sites, attribute preservation, and the bail-out cases (disagreeing sources, address-taken or externally-visible callees), plus self-recursion and a non-global (device-pointer) source. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/metal.jl | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/test/metal.jl b/test/metal.jl index 87539b26..4e43ce38 100644 --- a/test/metal.jl +++ b/test/metal.jl @@ -208,4 +208,132 @@ end end +@testset "argument address-space narrowing" begin + # pointer type in address space `as`, typed- and opaque-pointer compatible + asptr(as) = supports_typed_pointers() ? LLVM.PointerType(LLVM.Int8Type(), as) : + LLVM.PointerType(as) + + # build a module with an internal `callee` that loads through a generic (AS 0) pointer + # parameter, reached from one `caller` per entry in `caller_src_as`, each passing a + # constant global in that address space cast to generic. 
+ function narrowing_module(caller_src_as::Vector{Int}; + callee_linkage=LLVM.API.LLVMInternalLinkage, + recursive=false, address_taken=false) + mod = LLVM.Module("test") + i8 = LLVM.Int8Type() + callee_ft = LLVM.FunctionType(i8, LLVM.LLVMType[asptr(0)]) + callee = LLVM.Function(mod, "callee", callee_ft) + linkage!(callee, callee_linkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(callee, "entry")) + v = load!(builder, i8, parameters(callee)[1]) + if recursive + # a (would-be infinite) self-call passing a constant global, only to + # exercise the recursion path; not meant to run. + g = GlobalVariable(mod, i8, "gself", caller_src_as[1]) + initializer!(g, ConstantInt(i8, 7)); constant!(g, true) + call!(builder, callee_ft, callee, [const_addrspacecast(g, asptr(0))]) + end + ret!(builder, v) + end + for (n, as) in enumerate(caller_src_as) + g = GlobalVariable(mod, i8, "g$n", as) + initializer!(g, ConstantInt(i8, n)); constant!(g, true) + caller = LLVM.Function(mod, "caller$n", LLVM.FunctionType(i8, LLVM.LLVMType[])) + linkage!(caller, LLVM.API.LLVMInternalLinkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(caller, "entry")) + ret!(builder, call!(builder, callee_ft, callee, + [const_addrspacecast(g, asptr(0))])) + end + end + if address_taken + # a non-call use of the callee: stash its address in a global + initializer!(GlobalVariable(mod, value_type(callee), "fp"), callee) + end + return mod + end + + callee_param_as(mod) = addrspace(parameters(function_type(functions(mod)["callee"]))[1]) + function calls_to(mod, fname) + f = functions(mod)[fname] + [inst for g in functions(mod) for bb in blocks(g) for inst in instructions(bb) + if inst isa LLVM.CallInst && called_operand(inst) == f] + end + + # all callers agree -> the parameter is narrowed; attributes survive; IR stays valid + Context() do ctx + mod = narrowing_module([2, 2]) + callee = functions(mod)["callee"] + push!(parameter_attributes(callee, 1), 
EnumAttribute("nonnull", 0)) + push!(function_attributes(callee), EnumAttribute("nounwind", 0)) + + @test GPUCompiler.propagate_argument_address_spaces!(mod) + @test callee_param_as(mod) == 2 + @test all(c -> addrspace(value_type(arguments(c)[1])) == 2, calls_to(mod, "callee")) + + callee = functions(mod)["callee"] + @test kind(EnumAttribute("nonnull", 0)) in kind.(collect(parameter_attributes(callee, 1))) + @test kind(EnumAttribute("nounwind", 0)) in kind.(collect(function_attributes(callee))) + @test (verify(mod); true) + end + + # callers disagree on the source address space -> left alone + Context() do ctx + mod = narrowing_module([2, 1]) + @test !GPUCompiler.propagate_argument_address_spaces!(mod) + @test callee_param_as(mod) == 0 + end + + # the callee's address is taken (a non-call use) -> left alone + Context() do ctx + mod = narrowing_module([2]; address_taken=true) + @test !GPUCompiler.propagate_argument_address_spaces!(mod) + @test callee_param_as(mod) == 0 + end + + # externally-visible callee -> left alone (its signature may be observed elsewhere) + Context() do ctx + mod = narrowing_module([2]; callee_linkage=LLVM.API.LLVMExternalLinkage) + @test !GPUCompiler.propagate_argument_address_spaces!(mod) + @test callee_param_as(mod) == 0 + end + + # a self-recursive callee is narrowed and the self-call rewritten to stay well-typed: + # every call to it (recursive included) must now pass the constant-space pointer + Context() do ctx + mod = narrowing_module([2]; recursive=true) + @test GPUCompiler.propagate_argument_address_spaces!(mod) + @test callee_param_as(mod) == 2 + @test length(calls_to(mod, "callee")) == 2 + @test all(c -> addrspace(value_type(arguments(c)[1])) == 2, calls_to(mod, "callee")) + @test (verify(mod); true) + end + + # the source need not be a global: a device pointer (AS 1) threaded through a helper + # as a generic pointer is narrowed to AS 1 just the same + Context() do ctx + mod = LLVM.Module("test") + i8 = LLVM.Int8Type() + callee_ft 
= LLVM.FunctionType(i8, LLVM.LLVMType[asptr(0)]) + callee = LLVM.Function(mod, "callee", callee_ft) + linkage!(callee, LLVM.API.LLVMInternalLinkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(callee, "entry")) + ret!(builder, load!(builder, i8, parameters(callee)[1])) + end + caller = LLVM.Function(mod, "caller", LLVM.FunctionType(i8, LLVM.LLVMType[asptr(1)])) + linkage!(caller, LLVM.API.LLVMInternalLinkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(caller, "entry")) + gen = addrspacecast!(builder, parameters(caller)[1], asptr(0)) + ret!(builder, call!(builder, callee_ft, callee, [gen])) + end + + @test GPUCompiler.propagate_argument_address_spaces!(mod) + @test callee_param_as(mod) == 1 + @test (verify(mod); true) + end +end + end From 70a987ce245be092527515f82c21bf28f044e46d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 10:02:25 +0200 Subject: [PATCH 8/9] Metal: tighten address-space narrowing comments Trim the pass comments to the essentials, drop the duplicated rationale at the call site, and remove em dashes. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 65 ++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/src/metal.jl b/src/metal.jl index 146a66be..9eca5639 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -171,13 +171,9 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L entry = add_parameter_address_spaces!(job, mod, entry) entry = add_global_address_spaces!(job, mod, entry) - # `add_global_address_spaces!` puts constant globals (e.g. the deduced exception - # type-name and stack-frame strings) in the constant space, but a global passed to an - # out-of-line runtime function still reaches it through a *generic* pointer parameter, - # so the read is a generic-space load of constant data. Metal's shader validator - # crashes its compiler service on exactly that. 
Narrow such parameters to the address - # space their callers actually pass (the interprocedural complement to the - # `InferAddressSpaces` run below), so the read happens in the constant space directly. + # narrow generic pointer parameters whose callers all pass a specific-AS pointer, so + # the constant globals read by out-of-line runtime functions (e.g. the exception + # reporters) load from the constant space rather than crashing Metal's validator. propagate_argument_address_spaces!(mod) # propagate specific address spaces through addrspacecast chains introduced @@ -453,25 +449,20 @@ end # interprocedural address-space narrowing # -# `InferAddressSpaces` rewrites a generic (flat) memory access into the concrete address -# space when it can trace the pointer back to an `addrspacecast` from that space — but only -# within a single function. A pointer that crosses a call boundary as a generic parameter -# loses that provenance: e.g. `add_global_address_spaces!` puts constant data in the -# constant space, yet a global handed to an out-of-line runtime function (the exception -# reporters take `Ptr` arguments) still arrives through a generic parameter, so the callee -# reads it with a generic-space load — which makes Metal's shader validator crash. +# `InferAddressSpaces` rewrites a generic (flat) load/store into a concrete address space +# when it can trace the pointer back to an `addrspacecast` from that space, but only within +# one function. A pointer crossing a call boundary as a generic parameter loses that +# provenance: a constant global passed to an out-of-line runtime function (the exception +# reporters take `Ptr` arguments) arrives generic and is read with a generic-space load, +# which crashes Metal's shader validator. # -# This pass is the interprocedural complement. 
Where every caller passes the same shape of -# value for a generic pointer parameter — `addrspacecast(<specific-AS pointer> -> -# generic)` — it retargets the parameter to that space and drops the casts at the call -# sites, casting the parameter straight back to generic on entry so the callee body is left -# untouched. That is a pure relocation of a side-effect-free cast across the boundary (the -# source flows in as the argument and the identical pointer is recomputed inside the -# callee), hence trivially correct; the subsequent `InferAddressSpaces` run then folds the -# entry cast away, turning the read into a specific-space load. The source need not be a -# constant global — any pointer whose address space is known qualifies (e.g. device data -# threaded through a helper). No name matching, no inlining, no per-target address-space -# table: the space is read from the IR, so any back-end can run it. +# This pass is the interprocedural complement. When every caller passes the same kind of +# value for a generic pointer parameter, `addrspacecast(<specific-AS pointer> -> +# generic)`, it retargets the parameter to that space, drops the casts at the call sites, +# and casts back to generic on entry so the body is unchanged. That only relocates a +# side-effect-free cast across the boundary, so it is trivially correct; the following +# `InferAddressSpaces` run folds the entry cast away. The source need not be a constant +# global; any pointer with a known address space qualifies, so any back-end can run it. 
# If `v` is an `addrspacecast` (instruction or constant expression) of a pointer from a # specific (non-generic) address space to the generic one, return that source pointer; @@ -491,12 +482,9 @@ function propagate_argument_address_spaces!(mod::LLVM.Module) for f in collect(functions(mod)) isempty(blocks(f)) && continue # only functions we can rewrite (have a body) - # changing a function's signature is only sound when it has no callers we cannot - # see; require local (internal/private) linkage, which rules out symbols that may - # be called from outside the module. by the time `finish_ir!` runs this, the - # pipeline has already internalized everything except the kernel entrypoints (see - # `InternalizePass` in `driver.jl`), so the runtime helpers we target qualify while - # the externally-visible entry — which has no in-module callers anyway — does not. + # rewriting a signature is only sound with no callers outside the module, so require + # local (internal/private) linkage. by `finish_ir!` the pipeline has internalized + # everything but the kernel entrypoints, so the runtime helpers we target qualify. linkage(f) in (LLVM.API.LLVMInternalLinkage, LLVM.API.LLVMPrivateLinkage) || continue param_types = parameters(function_type(f)) @@ -619,10 +607,9 @@ function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, br!(builder, blocks(new_f)[2]) # fall through to the cloned entry block end - # `clone_into!` copies a parameter's attributes only when it maps to a new *argument*; - # the retargeted parameters map to the entry addrspacecast instead, so their attributes - # (nonnull, dereferenceable, align, ...) are dropped. Reattach them — they remain valid - # for the narrowed (specific-AS) pointer, and the non-retargeted ones are already copied. + # `clone_into!` copies a parameter's attributes only when it maps to a new argument; the + # retargeted ones map to the entry addrspacecast instead, so theirs are dropped. 
Reattach + # them; they stay valid on the narrowed pointer, and non-retargeted params keep theirs. for i in 1:length(new_addrspaces) new_addrspaces[i] >= 0 || continue for attr in collect(parameter_attributes(f, i)) @@ -630,11 +617,9 @@ function narrow_pointer_parameters!(mod::LLVM.Module, f::LLVM.Function, end end - # if `f` was (directly) recursive, cloning remapped its self-calls to `new_f` but left - # them with the old signature; collect them from the clone so they get rewritten too. - # (these are distinct from the recursive call still sitting in the old `f`, which is in - # `callsites` and gets erased along with `f`.) collect before rewriting so the freshly - # built calls — which also target `new_f` — are not revisited. + # a (directly) recursive `f` has self-calls that cloning retargeted to `new_f` but left + # with the old signature; collect them from the clone for rewriting. collect first, since + # the rewritten calls also target `new_f` and must not be revisited. self_calls = LLVM.CallInst[] for bb in blocks(new_f), inst in instructions(bb) inst isa LLVM.CallInst && called_operand(inst) == new_f && push!(self_calls, inst) From f1bf67e532d0317b7c9005d0b940dc44aa197785 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 1 Jun 2026 12:22:43 +0200 Subject: [PATCH 9/9] Metal: iterate argument address-space narrowing to a fixed point A single sweep only narrows a function once all its callers already pass an addrspacecast-from-specific, so a constant reaching a deep callee through a delegating helper was missed unless functions happened to be visited in the right order. Iterate until no change so narrowing is order-independent and transitive; this lets back-ends delegate exception reporters instead of duplicating their bodies. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/metal.jl | 16 ++++++++++++++++ test/metal.jl | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/metal.jl b/src/metal.jl index 9eca5639..9f2541f1 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -463,6 +463,13 @@ end # side-effect-free cast across the boundary, so it is trivially correct; the following # `InferAddressSpaces` run folds the entry cast away. The source need not be a constant # global; any pointer with a known address space qualifies, so any back-end can run it. +# +# Narrowing one function makes its body forward an `addrspacecast`-from-specific to the +# functions it calls, exposing them in turn. We therefore iterate to a fixed point so a +# constant reaches an arbitrarily deep callee (e.g. an exception reporter that delegates to +# another) regardless of the order functions are visited in. This terminates: each sweep +# that changes anything strictly reduces the number of generic pointer parameters in the +# module, and narrowing never introduces a new one. # If `v` is an `addrspacecast` (instruction or constant expression) of a pointer from a # specific (non-generic) address space to the generic one, return that source pointer; @@ -478,6 +485,15 @@ function addrspacecast_to_generic_source(@nospecialize(v)) end function propagate_argument_address_spaces!(mod::LLVM.Module) + changed = false + while propagate_argument_address_spaces_once!(mod) + changed = true + end + return changed +end + +# a single narrowing sweep over the module; returns whether anything changed. 
+function propagate_argument_address_spaces_once!(mod::LLVM.Module) changed = false for f in collect(functions(mod)) isempty(blocks(f)) && continue # only functions we can rewrite (have a body) diff --git a/test/metal.jl b/test/metal.jl index 4e43ce38..ae837ccd 100644 --- a/test/metal.jl +++ b/test/metal.jl @@ -334,6 +334,51 @@ end @test callee_param_as(mod) == 1 @test (verify(mod); true) end + + # a two-level delegation chain (caller -> mid -> leaf) needs the fixpoint: one sweep + # narrows `mid` (its caller passes a constant global), which only then exposes `leaf`, + # since `mid` now forwards an addrspacecast-from-constant. iterate until both narrow. + Context() do ctx + mod = LLVM.Module("test") + i8 = LLVM.Int8Type() + ft = LLVM.FunctionType(i8, LLVM.LLVMType[asptr(0)]) + param_as(name) = addrspace(parameters(function_type(functions(mod)[name]))[1]) + + # leaf: loads through its generic pointer parameter + leaf = LLVM.Function(mod, "leaf", ft) + linkage!(leaf, LLVM.API.LLVMInternalLinkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(leaf, "entry")) + ret!(builder, load!(builder, i8, parameters(leaf)[1])) + end + + # mid: forwards its generic pointer parameter to leaf + mid = LLVM.Function(mod, "mid", ft) + linkage!(mid, LLVM.API.LLVMInternalLinkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(mid, "entry")) + ret!(builder, call!(builder, ft, leaf, [parameters(mid)[1]])) + end + + # caller: passes a constant global (AS 2) cast to generic into mid + g = GlobalVariable(mod, i8, "g", 2) + initializer!(g, ConstantInt(i8, 1)); constant!(g, true) + caller = LLVM.Function(mod, "caller", LLVM.FunctionType(i8, LLVM.LLVMType[])) + linkage!(caller, LLVM.API.LLVMInternalLinkage) + @dispose builder=IRBuilder() begin + position!(builder, BasicBlock(caller, "entry")) + ret!(builder, call!(builder, ft, mid, [const_addrspacecast(g, asptr(0))])) + end + + # a single sweep reaches only `mid`; the fixpoint must then narrow `leaf` 
too + @test GPUCompiler.propagate_argument_address_spaces_once!(mod) + @test param_as("mid") == 2 + @test param_as("leaf") == 0 + + @test GPUCompiler.propagate_argument_address_spaces!(mod) + @test param_as("leaf") == 2 + @test (verify(mod); true) + end end end