Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 74 additions & 50 deletions src/metal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -230,13 +230,16 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L
# JuliaGPU/Metal.jl#113
hide_noreturn!(job, mod)

# get rid of unreachable control flow (JuliaGPU/Metal.jl#370).
# note that this currently works in tandem with the `hide_noreturn!` pass above,
# as `replace_unreachable!` doesn't handle functions that _only_ contain `unreachable`.
if job.config.target.macos < v"15"
for f in functions(mod)
replace_unreachable!(job, f)
end
# rewrite unreachable control flow into clean returns. two Apple-specific reasons:
# - JuliaGPU/Metal.jl#370: divergent `unreachable` crashes the back-end (pre-macOS 15).
# - JuliaGPU/Metal.jl#433: device-side exceptions lower to `llvm.trap`, but a compute
# trap wedges the whole Apple GPU (no compute watchdog; only a reboot clears it).
# `replace_unreachable!` strips the trap and turns the throw into a return instead.
#
# `hide_noreturn!` above must still run first: it drops the `noreturn` attribute (which
# the back-end would otherwise rediscover, #113) and inlines such functions.
for f in functions(mod)
replace_unreachable!(job, f)
end

# lower LLVM intrinsics that AIR doesn't support
Expand Down Expand Up @@ -1112,15 +1115,24 @@ function annotate_air_intrinsics!(@nospecialize(job::CompilerJob), mod::LLVM.Mod
return changed
end

# replace unreachable control flow with branches to the exit block
# replace unreachable control flow (and the trap that precedes it) with a return.
#
# before macOS 15, code generated by Julia 1.11 causes compilation failures in the back-end.
# the reduced example contains unreachable control flow executed divergently, so this is
# similar to the issue encountered with NVIDIA, albeit causing crashes instead of
# miscompiles.
# two reasons:
# - before macOS 15, code generated by Julia 1.11 causes compilation failures in the
# back-end: the reduced example contains unreachable control flow executed divergently,
# similar to what we hit on NVIDIA, but causing crashes instead of miscompiles (#370).
# - device-side exceptions lower to an `llvm.trap` followed by `unreachable`, but a compute
# trap wedges the whole Apple GPU: there is no compute watchdog, so only a reboot clears
# it (JuliaGPU/Metal.jl#433).
#
# the proposed solution is to avoid (divergent) unreachable control flow, instead replacing
# it by branches to the exit block. since `unreachable` doesn't lower to anything that
# aborts the kernel anyway (can we fix this?), this transformation should be safe.
# so we replace `unreachable` (and any immediately preceding `llvm.trap`) by a branch to a
# return block — reusing the function's existing `ret`, or synthesizing one (`ret void`, or
# `ret undef` for value-returning functions) when the function _only_ contains `unreachable`.
#
# this returns from *this function* only (returning undef to the caller), not the whole
# kernel; it is not a true abort. if a `threadgroup_barrier` sits between the throw and the
# return, the faulting lane still skips it and execution will deadlock — but that already
# wedges today via the trap, so this is no worse, and it fixes the common (barrier-free)
# case. swallowed exceptions should be surfaced separately via a `signal_exception`
# host-visible flag.
function replace_unreachable!(@nospecialize(job::CompilerJob), f::LLVM.Function)
# find unreachable instructions and exit blocks
unreachables = Instruction[]
Expand All @@ -1135,47 +1147,59 @@ function replace_unreachable!(@nospecialize(job::CompilerJob), f::LLVM.Function)
end
isempty(unreachables) && return false

# if we don't have an exit block, we can't do much. we could insert a return, but that
# would probably keep the problematic control flow just as it is.
isempty(exit_blocks) && return false

@dispose builder=IRBuilder() begin
# if we have multiple exit blocks, take the last one, which is hopefully the least
# divergent (assuming divergent control flow is the root of the problem here).
exit_block = last(exit_blocks)
ret = terminator(exit_block)

# create a return block with only the return instruction, so that we only have to
# care about any values returned, and not about any other SSA value in the block.
if first(instructions(exit_block)) == ret
# we can reuse the exit block if it only contains the return
return_block = exit_block
else
# split the exit block right before the ret
local return_block
if isempty(exit_blocks)
# the function has no normal return (e.g. a kernel whose only path
# is a `throw`, which lowers to trap + unreachable). synthesize a
# return block so we can still strip the trap and turn the
# `unreachable` into a clean return.
return_block = BasicBlock(f, "ret")
move_after(return_block, exit_block)

# emit a branch
position!(builder, ret)
br!(builder, return_block)

# move the return
remove!(ret)
position!(builder, return_block)
insert!(builder, ret)
end
rt = return_type(function_type(f))
if rt == LLVM.VoidType()
ret!(builder)
else
ret!(builder, UndefValue(rt))
end
else
# if we have multiple exit blocks, take the last one, which is hopefully the least
# divergent (assuming divergent control flow is the root of the problem here).
exit_block = last(exit_blocks)
ret = terminator(exit_block)

# create a return block with only the return instruction, so that we only have to
# care about any values returned, and not about any other SSA value in the block.
if first(instructions(exit_block)) == ret
# we can reuse the exit block if it only contains the return
return_block = exit_block
else
# split the exit block right before the ret
return_block = BasicBlock(f, "ret")
move_after(return_block, exit_block)

# emit a branch
position!(builder, ret)
br!(builder, return_block)

# move the return
remove!(ret)
position!(builder, return_block)
insert!(builder, ret)
end

# when returning a value, add a phi node to the return block, so that we can later
# add incoming undef values when branching from `unreachable` blocks
if !isempty(operands(ret))
position!(builder, ret)
# XXX: support aggregate returns?
val = only(operands(ret))
phi = phi!(builder, value_type(val))
for pred in predecessors(return_block)
push!(incoming(phi), (val, pred))
# when returning a value, add a phi node to the return block, so that we can later
# add incoming undef values when branching from `unreachable` blocks
if !isempty(operands(ret))
position!(builder, ret)
# XXX: support aggregate returns?
val = only(operands(ret))
phi = phi!(builder, value_type(val))
for pred in predecessors(return_block)
push!(incoming(phi), (val, pred))
end
operands(ret)[1] = phi
end
operands(ret)[1] = phi
end

# replace the unreachable with a branch to the return block
Expand Down
Loading