From 76d1cf3fd5db7e36b9419f5961ac8fe8a72db713 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 26 May 2026 15:15:46 +0200 Subject: [PATCH 1/2] Fix atomic for 1.13 --- src/compiler/codegen.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/compiler/codegen.jl b/src/compiler/codegen.jl index fce8fca44..8fc3b1335 100644 --- a/src/compiler/codegen.jl +++ b/src/compiler/codegen.jl @@ -91,6 +91,19 @@ function GPUCompiler.finish_module!( end end + # LLVM 20+ requires !amdgpu.no.fine.grained.memory per-instruction metadata + # on FP atomicrmw to select native hardware atomics (e.g. global_atomic_add_f32) + # instead of expanding to a CAS loop on some targets (e.g. gfx1100). + if job.config.params.unsafe_fp_atomics + fp_binops = (LLVM.API.LLVMAtomicRMWBinOpFAdd, LLVM.API.LLVMAtomicRMWBinOpFSub, + LLVM.API.LLVMAtomicRMWBinOpFMax, LLVM.API.LLVMAtomicRMWBinOpFMin) + empty_md = MDNode(Metadata[]) + for fn in LLVM.functions(mod), bb in LLVM.blocks(fn), inst in LLVM.instructions(bb) + inst isa LLVM.AtomicRMWInst && LLVM.binop(inst) ∈ fp_binops && + (LLVM.metadata(inst)["amdgpu.no.fine.grained.memory"] = empty_md) + end + end + return entry end From a106753b3931839a477e8ec7591ce6e67cf1d39f Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 26 May 2026 16:55:41 +0200 Subject: [PATCH 2/2] Fixup --- src/compiler/codegen.jl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/compiler/codegen.jl b/src/compiler/codegen.jl index 8fc3b1335..570edfc3d 100644 --- a/src/compiler/codegen.jl +++ b/src/compiler/codegen.jl @@ -91,16 +91,22 @@ function GPUCompiler.finish_module!( end end - # LLVM 20+ requires !amdgpu.no.fine.grained.memory per-instruction metadata - # on FP atomicrmw to select native hardware atomics (e.g. global_atomic_add_f32) - # instead of expanding to a CAS loop on some targets (e.g. gfx1100). + # LLVM 20+ requires !amdgpu.no.fine.grained.memory on FP atomicrmw to emit + # native hardware atomics (e.g. global_atomic_add_f32) instead of a CAS loop. + # Mirrors Clang's setTargetAtomicMetadata; unsafe_fp_atomics is the opt-in. if job.config.params.unsafe_fp_atomics fp_binops = (LLVM.API.LLVMAtomicRMWBinOpFAdd, LLVM.API.LLVMAtomicRMWBinOpFSub, LLVM.API.LLVMAtomicRMWBinOpFMax, LLVM.API.LLVMAtomicRMWBinOpFMin) empty_md = MDNode(Metadata[]) for fn in LLVM.functions(mod), bb in LLVM.blocks(fn), inst in LLVM.instructions(bb) - inst isa LLVM.AtomicRMWInst && LLVM.binop(inst) ∈ fp_binops && - (LLVM.metadata(inst)["amdgpu.no.fine.grained.memory"] = empty_md) + inst isa LLVM.AtomicRMWInst || continue + op = LLVM.binop(inst) + op ∈ fp_binops || continue + md = LLVM.metadata(inst) + md["amdgpu.no.fine.grained.memory"] = empty_md + if op == LLVM.API.LLVMAtomicRMWBinOpFAdd && LLVM.value_type(inst) == LLVM.FloatType() + md["amdgpu.ignore.denormal.mode"] = empty_md + end end end @@ -213,7 +219,7 @@ function hipcompile(@nospecialize(job::CompilerJob)) GPUCompiler.compile(:obj, job) end - global_hostcalls = pop!(_global_hostcalls, hash(job)) + global_hostcalls = pop!(_global_hostcalls, hash(job), Symbol[]) # Late global hostcalls detection. append!(global_hostcalls, find_global_hostcalls(meta.ir))