From 7ea489b09c4fbebee46b5b926ec00815a2d2a4ff Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 27 May 2026 13:13:24 +0200 Subject: [PATCH] Metal: turn device-exception traps into returns to avoid GPU hangs On Apple GPUs a compute `trap` wedges the whole device (no compute watchdog; only a reboot clears it), so device-side exceptions hung the GPU on macOS 15+. Run `replace_unreachable!` unconditionally and have it strip the preceding `llvm.trap` and synthesize a return when a function only contains `unreachable`. See JuliaGPU/Metal.jl#433. Co-Authored-By: Claude Opus 4.7 --- src/metal.jl | 124 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 50 deletions(-) diff --git a/src/metal.jl b/src/metal.jl index cc192294..3452fe1c 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -230,13 +230,16 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L # JuliaGPU/Metal.jl#113 hide_noreturn!(job, mod) - # get rid of unreachable control flow (JuliaGPU/Metal.jl#370). - # note that this currently works in tandem with the `hide_noreturn!` pass above, - # as `replace_unreachable!` doesn't handle functions that _only_ contain `unreachable`. - if job.config.target.macos < v"15" - for f in functions(mod) - replace_unreachable!(job, f) - end + # rewrite unreachable control flow into clean returns. two Apple-specific reasons: + # - JuliaGPU/Metal.jl#370: divergent `unreachable` crashes the back-end (pre-macOS 15). + # - JuliaGPU/Metal.jl#433: device-side exceptions lower to `llvm.trap`, but a compute + # trap wedges the whole Apple GPU (no compute watchdog; only a reboot clears it). + # `replace_unreachable!` strips the trap and turns the throw into a return instead. + # + # `hide_noreturn!` above must still run first: it drops the `noreturn` attribute (which + # the back-end would otherwise rediscover, #113) and inlines such functions. + for f in functions(mod) + replace_unreachable!(job, f) end # lower LLVM intrinsics that AIR doesn't support @@ -1112,15 +1115,24 @@ function annotate_air_intrinsics!(@nospecialize(job::CompilerJob), mod::LLVM.Mod return changed end -# replace unreachable control flow with branches to the exit block +# replace unreachable control flow (and the trap that precedes it) with a return. # -# before macOS 15, code generated by Julia 1.11 causes compilation failures in the back-end. -# the reduced example contains unreachable control flow executed divergently, so this is a -# similar issue as encountered with NVIDIA, albeit causing crashes instead of miscompiles. +# two reasons: +# - before macOS 15, code generated by Julia 1.11 causes compilation failures in the +# back-end: the reduced example contains unreachable control flow executed divergently, +# similar to what we hit on NVIDIA, but causing crashes instead of miscompiles (#370). +# - device-side exceptions lower to a `llvm.trap` followed by `unreachable`, but a compute +# trap wedges the whole Apple GPU (no watchdog; reboot to clear, JuliaGPU/Metal.jl#433). # -# the proposed solution is to avoid (divergent) unreachable control flow, instead replacing -# it by branches to the exit block. since `unreachable` doesn't lower to anything that -# aborts the kernel anyway (can we fix this?), this transformation should be safe. +# so we replace `unreachable` (and any immediately preceding `llvm.trap`) by a branch to a +# return block — reusing the function's existing `ret`, or synthesizing one (`ret void`, or +# `ret undef` for value-returning functions) when the function _only_ contains `unreachable`. +# +# this returns from *this function* only (returning undef to the caller), not the whole +# kernel; it is not a true abort. a `threadgroup_barrier` between the throw and the return +# is still skipped by the faulting lane and will deadlock — but that already wedges today +# via the trap, so this is no worse, and it fixes the common (barrier-free) case. swallowed +# exceptions should be surfaced separately via a `signal_exception` host-visible flag. function replace_unreachable!(@nospecialize(job::CompilerJob), f::LLVM.Function) # find unreachable instructions and exit blocks unreachables = Instruction[] @@ -1135,47 +1147,59 @@ function replace_unreachable!(@nospecialize(job::CompilerJob), f::LLVM.Function) end isempty(unreachables) && return false - # if we don't have an exit block, we can't do much. we could insert a return, but that - # would probably keep the problematic control flow just as it is. - isempty(exit_blocks) && return false - @dispose builder=IRBuilder() begin - # if we have multiple exit blocks, take the last one, which is hopefully the least - # divergent (assuming divergent control flow is the root of the problem here). - exit_block = last(exit_blocks) - ret = terminator(exit_block) - - # create a return block with only the return instruction, so that we only have to - # care about any values returned, and not about any other SSA value in the block. - if first(instructions(exit_block)) == ret - # we can reuse the exit block if it only contains the return - return_block = exit_block - else - # split the exit block right before the ret + local return_block + if isempty(exit_blocks) + # the function has no normal return (e.g. a kernel whose only path + # is a `throw`, which lowers to trap + unreachable). synthesize a + # return block so we can still strip the trap and turn the + # `unreachable` into a clean return. return_block = BasicBlock(f, "ret") - move_after(return_block, exit_block) - - # emit a branch - position!(builder, ret) - br!(builder, return_block) - - # move the return - remove!(ret) position!(builder, return_block) - insert!(builder, ret) - end + rt = return_type(function_type(f)) + if rt == LLVM.VoidType() + ret!(builder) + else + ret!(builder, UndefValue(rt)) + end + else + # if we have multiple exit blocks, take the last one, which is hopefully the least + # divergent (assuming divergent control flow is the root of the problem here). + exit_block = last(exit_blocks) + ret = terminator(exit_block) + + # create a return block with only the return instruction, so that we only have to + # care about any values returned, and not about any other SSA value in the block. + if first(instructions(exit_block)) == ret + # we can reuse the exit block if it only contains the return + return_block = exit_block + else + # split the exit block right before the ret + return_block = BasicBlock(f, "ret") + move_after(return_block, exit_block) + + # emit a branch + position!(builder, ret) + br!(builder, return_block) + + # move the return + remove!(ret) + position!(builder, return_block) + insert!(builder, ret) + end - # when returning a value, add a phi node to the return block, so that we can later - # add incoming undef values when branching from `unreachable` blocks - if !isempty(operands(ret)) - position!(builder, ret) - # XXX: support aggregate returns? - val = only(operands(ret)) - phi = phi!(builder, value_type(val)) - for pred in predecessors(return_block) - push!(incoming(phi), (val, pred)) + # when returning a value, add a phi node to the return block, so that we can later + # add incoming undef values when branching from `unreachable` blocks + if !isempty(operands(ret)) + position!(builder, ret) + # XXX: support aggregate returns? + val = only(operands(ret)) + phi = phi!(builder, value_type(val)) + for pred in predecessors(return_block) + push!(incoming(phi), (val, pred)) + end + operands(ret)[1] = phi end - operands(ret)[1] = phi end # replace the unreachable with a branch to the return block