From 052750721da546dab69e022f17b453bfad48c2d9 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 13:14:38 +0200
Subject: [PATCH 01/10] Drop fdiv/sqrt/inv device overrides handled by
 GPUCompiler.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

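GPUCompiler's `PTXFDivFastPass` handles `afn`-flagged fdiv (covering
`@fastmath` per-call and the `fastmath=true` job kwarg), and NVPTX
already pattern-matches plain `fdiv 1.0, x` to `rcp.rn`, so the
`Base.inv` and `FastMath.div_fast` overrides can go.

`Base.sqrt` now inherits from Julia as well (`llvm.sqrt.f{32,64}`):
routing through libdevice's `__nv_sqrtf` would force `sqrt.approx.*`
unconditionally, since LLVM's NVVMReflectPass folds `__CUDA_PREC_SQRT`
to 0.

The only remaining override is
`FastMath.inv_fast(::Union{Float32, Float64})`: Julia upstream doesn't
implement `inv_fast` for floats and the catch-all fallback drops `afn`,
so route it through `div_fast` so the pass sees the flag.

This also bumps the GPUCompiler compat bound to 1.13.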

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CUDACore/Project.toml                  |  2 +-
 CUDACore/src/device/intrinsics/math.jl | 37 +++++++++-----------------
 test/core/codegen.jl                   | 37 ++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index 2c9992781e..174a776d0b 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.12"
+GPUCompiler = "1.13"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index 95735015f2..bd6d3dd54b 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -319,10 +319,9 @@ end
 
 ## roots and powers
 
-@device_override Base.sqrt(x::Float64) = ccall("extern __nv_sqrt", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.sqrt(x::Float32) = ccall("extern __nv_sqrtf", llvmcall, Cfloat, (Cfloat,), x)
-# sqrt(::Float16) inherits from Julia (Float16(sqrt(Float32(x)))), routing through __nv_sqrtf.
-@device_override FastMath.sqrt_fast(x::Union{Float32, Float64}) = sqrt(x)
+# `Base.sqrt` inherits from Julia (`llvm.sqrt.f{32,64}`); routing through
+# libdevice's `__nv_sqrtf` would force `sqrt.approx.*` unconditionally, since
+# LLVM's NVVMReflectPass folds `__CUDA_PREC_SQRT` to 0.
 
 @device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x)
 @device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x)
@@ -508,27 +507,15 @@ end
 @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
 
-@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)
-
-@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
-@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)
-
-@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
-@device_override function FastMath.inv_fast(x::Float64)
-    # Get the approximate reciprocal
-    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
-    # This instruction chops off last 32bits of mantissa and computes inverse
-    # while treating all subnormal numbers as 0.0
-    # If reciprocal would be subnormal, underflows to 0.0
-    # 32 least significant bits of the result are filled with 0s
-    inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)
-
-    # Approximate the missing 32bits of mantissa with a single cubic iteration
-    e = fma(inv_x, -x, 1.0)
-    e = fma(e, e, e)
-    inv_x = fma(e, inv_x, inv_x)
-end
+# `Base.{/, inv}` and `Base.FastMath.div_fast` inherit from Julia:
+# GPUCompiler's `PTXFDivFastPass` handles the `afn`-flagged fdiv these emit,
+# and NVPTX pattern-matches plain `fdiv 1.0, x` to `rcp.rn`.
+#
+# `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only
+# `Complex` has a method) and the catch-all fallback drops `afn`; route it
+# through `div_fast` so the pass sees the flag.
+@device_override FastMath.inv_fast(x::Union{Float32, Float64}) =
+    FastMath.div_fast(one(x), x)
 
 ## distributions
 
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index db431db4ed..ea6b1b6c47 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -203,6 +203,43 @@ end
 
     asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
     @test occursin("sqrt.approx.ftz", asm)
+
+    # Pin down PTX for `/` and `inv` across {f32, f64} × {plain, @fastmath}
+    # × {default, fastmath=true job}, since `Base.{/, inv}` and their fast
+    # variants are now handled entirely by GPUCompiler's `PTXFDivFastPass`.
+    fdiv(x, y) = x / y
+    finv(x) = inv(x)
+    fdiv_fast(x, y) = @fastmath x / y
+    finv_fast(x) = @fastmath inv(x)
+
+    # plain: precise lowering (inv via NVPTX matching `fdiv 1.0, x` to rcp.rn).
+    for T in (Float32, Float64)
+        suffix = T === Float32 ? "f32" : "f64"
+        @test occursin("div.rn.$suffix", sprint(io->CUDA.code_ptx(io, fdiv, Tuple{T,T})))
+        @test occursin("rcp.rn.$suffix", sprint(io->CUDA.code_ptx(io, finv, Tuple{T,})))
+    end
+
+    # @fastmath: per-call `afn` reaches the pass. f64 → rcp + Newton (no
+    # native fast f64 fdiv); f32 stays non-FTZ since the job isn't fast.
+    asm = sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float32,Float32}))
+    @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm)
+    asm = sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float32,}))
+    @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm)
+    @test occursin("rcp.approx.ftz.f64",
+                   sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float64,Float64})))
+    @test occursin("rcp.approx.ftz.f64",
+                   sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float64,})))
+
+    # fastmath=true job: `apply_fastmath!` stamps `afn` on plain ops; f32
+    # additionally picks up FTZ.
+    @test occursin("div.approx.ftz.f32",
+                   sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float32,Float32}; fastmath=true)))
+    @test occursin("div.approx.ftz.f32",
+                   sprint(io->CUDA.code_ptx(io, finv, Tuple{Float32,}; fastmath=true)))
+    @test occursin("rcp.approx.ftz.f64",
+                   sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float64,Float64}; fastmath=true)))
+    @test occursin("rcp.approx.ftz.f64",
+                   sprint(io->CUDA.code_ptx(io, finv, Tuple{Float64,}; fastmath=true)))
 end
 
 @testset "fma/muladd emit fma.rn" begin

From 5f69f39fb11be9708c1ae09551ab396c61de6311 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 13:36:05 +0200
Subject: [PATCH 02/10] Drop more device overrides handled by Julia + NVPTX
 directly.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

isfinite/isinf/isnan, signbit/copysign/abs, trunc/ceil/floor, fma, and
muladd inherit from Julia. Julia emits canonical LLVM ops (`llvm.fabs`,
`llvm.floor`, `llvm.copysign`, `llvm.fma`, `fmul contract + fadd contract`,
etc.), and the NVPTX backend lowers them to the same single-instruction
PTX the libdevice overrides used to produce after inlining.

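`Base.fma(::Float16,...)` is the lone exception — its `jl_have_fma`
runtime call isn't recognized by GPUCompiler's `cpu_features!`, so the
branch survives the optimizer. The explicit `llvm.fma.f16` override is
kept for that one type.

The IR-level "muladd uses LLVM intrinsic" test is removed because
`muladd` no longer emits `llvm.fmuladd`; the PTX-level "fma/muladd emit
fma.rn" testset still checks the end result.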

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CUDACore/src/device/intrinsics/math.jl | 56 ++++++++++----------------
 test/core/codegen.jl                   | 14 ++-----
 2 files changed, 24 insertions(+), 46 deletions(-)

diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index bd6d3dd54b..6621355d8b 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -286,15 +286,11 @@ end
 
 ## floating-point handling
 
-@device_override Base.isfinite(x::Float32) = (ccall("extern __nv_finitef", llvmcall, Int32, (Cfloat,), x)) != 0
-@device_override Base.isfinite(x::Float64) = (ccall("extern __nv_isfinited", llvmcall, Int32, (Cdouble,), x)) != 0
-
-@device_override Base.isinf(x::Float64) = (ccall("extern __nv_isinfd", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.isinf(x::Float32) = (ccall("extern __nv_isinff", llvmcall, Int32, (Cfloat,), x)) != 0
-
-@device_override Base.isnan(x::Float64) = (ccall("extern __nv_isnand", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.isnan(x::Float32) = (ccall("extern __nv_isnanf", llvmcall, Int32, (Cfloat,), x)) != 0
-# isnan(::Float16) inherits from Julia (x != x), which compiles to a single setp.neu.f16.
+# isfinite/isinf/isnan inherit from Julia, which uses pure FP comparisons and
+# integer bit tricks: e.g. `isnan(x) = x != x` compiles to `setp.neu.f{16,32,64}`,
+# `isinf(x) = abs(x) == Inf` to `abs.f32 + setp.eq.f32`. The libdevice
+# `__nv_is{inf,nan,finite}*` wrappers do the same in their bodies, so the
+# overrides produced equivalent PTX after inlining.
 
 @device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x)
 @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x)
@@ -305,17 +301,10 @@ end
 
 ## sign handling
 
-@device_override Base.signbit(x::Float64) = (ccall("extern __nv_signbitd", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.signbit(x::Float32) = (ccall("extern __nv_signbitf", llvmcall, Int32, (Cfloat,), x)) != 0
-
-@device_override Base.copysign(x::Float64, y::Float64) = ccall("extern __nv_copysign", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-@device_override Base.copysign(x::Float32, y::Float32) = ccall("extern __nv_copysignf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-
-@device_override Base.abs(x::Int32) =   ccall("extern __nv_abs", llvmcall, Int32, (Int32,), x)
-@device_override Base.abs(f::Float64) = ccall("extern __nv_fabs", llvmcall, Cdouble, (Cdouble,), f)
-@device_override Base.abs(f::Float32) = ccall("extern __nv_fabsf", llvmcall, Cfloat, (Cfloat,), f)
-# abs(::Float16) inherits from Julia (abs_float intrinsic), lowering to and.b16.
-@device_override Base.abs(x::Int64) =   ccall("extern __nv_llabs", llvmcall, Int64, (Int64,), x)
+# signbit/copysign/abs inherit from Julia. Julia emits canonical LLVM ops:
+# `signbit` is a sign-bit extract; `copysign` is `llvm.copysign.f{32,64}`
+# (NVPTX has custom lowering to bit-twiddle); `abs` is `llvm.fabs.f{32,64}`
+# (NVPTX legal → `abs.f{32,64}`) for floats, or two-complement for ints.
 
 ## roots and powers
 
@@ -394,14 +383,8 @@ end
 #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x)
 #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x)
 
-@device_override Base.trunc(x::Float64) = ccall("extern __nv_trunc", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.trunc(x::Float32) = ccall("extern __nv_truncf", llvmcall, Cfloat, (Cfloat,), x)
-
-@device_override Base.ceil(x::Float64) = ccall("extern __nv_ceil", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.ceil(x::Float32) = ccall("extern __nv_ceilf", llvmcall, Cfloat, (Cfloat,), x)
-
-@device_override Base.floor(f::Float64) = ccall("extern __nv_floor", llvmcall, Cdouble, (Cdouble,), f)
-@device_override Base.floor(f::Float32) = ccall("extern __nv_floorf", llvmcall, Cfloat, (Cfloat,), f)
+# trunc/ceil/floor inherit from Julia (`llvm.{trunc,ceil,floor}.f{32,64}`),
+# which NVPTX lowers natively to `cvt.r{zi,pi,mi}.f{32,64}.f{32,64}`.
 
 #@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y)
 #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
@@ -536,13 +519,16 @@ end
 @device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
-@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
-@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
-@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
-
-@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
-@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
-@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
+# fma/muladd inherit from Julia. `Base.fma(::Float{32,64})` branches on
+# `julia.cpu.have_fma.f{32,64}`, which GPUCompiler folds to `true` on PTX,
+# leaving `llvm.fma`. `Base.muladd` emits `fmul contract + fadd contract`,
+# fused by the backend. Both lower to `fma.rn.f{32,64}` on NVPTX.
+#
+# `Base.fma(::Float16,...)` instead branches on a runtime `jl_have_fma` call
+# that GPUCompiler can't fold (see GPUCompiler's `cpu_features!`), so we
+# keep an explicit override for that one type.
+@device_override Base.fma(x::Float16, y::Float16, z::Float16) =
+    ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
 
 # Directed rounding for binary arithmetic and fma. NVPTX exposes
 # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub`
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index ea6b1b6c47..6694539a26 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -35,17 +35,9 @@ end
     end
 end
 
-@testset "muladd uses LLVM intrinsic" begin
-    function muladd_kernel(ptr)
-        unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-        return
-    end
-
-    for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
-        ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}}))
-        @test occursin("llvm.fmuladd.$suffix", ir)
-    end
-end
+# muladd is no longer overridden — Julia emits `fmul contract + fadd contract`,
+# which the backend fuses. The "fma/muladd emit fma.rn" PTX testset below
+# verifies the actual end-to-end result on every supported FP type.
 
 @testset "assume" begin
     foo(i) = cld(42, i)

From c9391fc9c873828e1a31a909e844ec6972a6de6f Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 14:31:25 +0200
Subject: [PATCH 03/10] Test math intrinsics lower without libdevice.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pin down PTX for the ops whose `@device_override`s were dropped — abs,
floor/ceil/trunc, isnan/isinf/isfinite/signbit, copysign, min/max — across
{f32, f64}, plain vs. `@fastmath` where it matters, and with job-wide
`fastmath=true` (which also flips f32 ops to their `.ftz` variants via
`apply_fastmath!`'s `denormal-fp-math-f32` attribute).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/core/codegen.jl | 77 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 6694539a26..562cff0b0d 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -257,6 +257,83 @@ end
     end
 end
 
+@testset "math intrinsics lower without libdevice" begin
+    # Pin down PTX for the ops whose `@device_override`s were dropped, since
+    # they now rely on Julia emitting canonical LLVM IR and NVPTX lowering it.
+    # No libdevice (`__nv_*`) calls should remain in any of these.
+
+    suffix(::Type{Float32}) = "f32"
+    suffix(::Type{Float64}) = "f64"
+    asm_of(f, T) = sprint(io->CUDA.code_ptx(io, f, T))
+    asm_fast(f, T) = sprint(io->CUDA.code_ptx(io, f, T; fastmath=true))
+
+    # `abs` lowers to a single instruction for both floats and ints.
+    for T in (Float32, Float64)
+        asm = asm_of(x->abs(x), Tuple{T})
+        @test occursin("abs.$(suffix(T))", asm) && !occursin("__nv_", asm)
+    end
+    for (T, s) in ((Int32, "s32"), (Int64, "s64"))
+        asm = asm_of(x->abs(x), Tuple{T})
+        @test occursin("abs.$s", asm) && !occursin("__nv_", asm)
+    end
+
+    # floor/ceil/trunc → `cvt.r{m,p,z}i.fXX.fXX`.
+    for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi"))
+        for T in (Float32, Float64)
+            s = suffix(T)
+            asm = asm_of(x->op(x), Tuple{T})
+            @test occursin("cvt.$rnd.$s.$s", asm) && !occursin("__nv_", asm)
+        end
+    end
+
+    # FP predicates: should be pure compare/bit-test, no libdevice.
+    for T in (Float32, Float64)
+        for op in (isnan, isinf, isfinite, signbit)
+            asm = asm_of(x->op(x), Tuple{T})
+            @test !occursin("__nv_", asm)
+        end
+    end
+    # isnan is the cleanest: a single `setp.nan.fXX`.
+    @test occursin("setp.nan.f32", asm_of(x->isnan(x), Tuple{Float32}))
+    @test occursin("setp.nan.f64", asm_of(x->isnan(x), Tuple{Float64}))
+
+    # copysign: no libdevice, no single instruction (bit-twiddle on NVPTX).
+    for T in (Float32, Float64)
+        asm = asm_of((x, y) -> copysign(x, y), Tuple{T, T})
+        @test !occursin("__nv_", asm)
+    end
+
+    # Default `min`/`max` propagate NaN (Julia semantics). f32 on sm_80+ +
+    # LLVM 14+ gets the dedicated `min.NaN`/`max.NaN`; f64 emulates via PTX.
+    for (op, instr) in ((min, "min"), (max, "max"))
+        for T in (Float32, Float64)
+            asm = asm_of((x, y) -> op(x, y), Tuple{T, T})
+            @test occursin("$instr.", asm)
+        end
+    end
+    @test occursin("min.NaN.f32", asm_of((x, y) -> min(x, y), Tuple{Float32, Float32}))
+    @test occursin("max.NaN.f32", asm_of((x, y) -> max(x, y), Tuple{Float32, Float32}))
+
+    # `@fastmath min/max` drops NaN handling, becoming a plain compare+select.
+    for T in (Float32, Float64)
+        asm = asm_of((x, y) -> @fastmath(min(x, y)), Tuple{T, T})
+        @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm)
+        asm = asm_of((x, y) -> @fastmath(max(x, y)), Tuple{T, T})
+        @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm)
+    end
+
+    # job-wide `fastmath=true` propagates `afn` via `apply_fastmath!`. The
+    # ops here don't use `afn` to pick variants, but `apply_fastmath!` also
+    # sets `denormal-fp-math-f32="preserve-sign"`, which NVPTX' `useF32FTZ`
+    # reads to pick FTZ variants for *every* f32 op (`abs.ftz.f32`,
+    # `cvt.r{m,p,z}i.ftz.*`, etc.). f64 stays unchanged since NVPTX has no
+    # FTZ behavior on f64.
+    @test occursin("abs.ftz.f32", asm_fast(x->abs(x), Tuple{Float32}))
+    @test occursin("abs.f64", asm_fast(x->abs(x), Tuple{Float64}))
+    @test occursin("cvt.rmi.ftz.f32.f32", asm_fast(x->floor(x), Tuple{Float32}))
+    @test occursin("cvt.rmi.f64.f64", asm_fast(x->floor(x), Tuple{Float64}))
+end
+
 @testset "header rewrite (.target/.version bump)" begin
     # When LLVM's NVPTX backend can't reach the device cap (e.g. Julia 1.12 +
     # LLVM 18 on a Blackwell device), `_compiler_config` produces a split

From 30a9b7e769c955a37a16bdf1ce34af53b7e6a462 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 14:35:52 +0200
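Subject: [PATCH 04/10] Simplify.

Remove or shorten the explanatory comments left behind in math.jl and
codegen.jl where device overrides were dropped. Comment-only change; no
code is touched.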
---
 CUDACore/src/device/intrinsics/math.jl | 37 +++-----------------------
 test/core/codegen.jl                   |  4 ---
 2 files changed, 3 insertions(+), 38 deletions(-)

diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index 6621355d8b..d52160d7c8 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -286,12 +286,6 @@ end
 
 ## floating-point handling
 
-# isfinite/isinf/isnan inherit from Julia, which uses pure FP comparisons and
-# integer bit tricks: e.g. `isnan(x) = x != x` compiles to `setp.neu.f{16,32,64}`,
-# `isinf(x) = abs(x) == Inf` to `abs.f32 + setp.eq.f32`. The libdevice
-# `__nv_is{inf,nan,finite}*` wrappers do the same in their bodies, so the
-# overrides produced equivalent PTX after inlining.
-
 @device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x)
 @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x)
 
@@ -299,19 +293,8 @@ end
 @device_function nextafter(x::Float32, y::Float32) = ccall("extern __nv_nextafterf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
 
-## sign handling
-
-# signbit/copysign/abs inherit from Julia. Julia emits canonical LLVM ops:
-# `signbit` is a sign-bit extract; `copysign` is `llvm.copysign.f{32,64}`
-# (NVPTX has custom lowering to bit-twiddle); `abs` is `llvm.fabs.f{32,64}`
-# (NVPTX legal → `abs.f{32,64}`) for floats, or two-complement for ints.
-
 ## roots and powers
 
-# `Base.sqrt` inherits from Julia (`llvm.sqrt.f{32,64}`); routing through
-# libdevice's `__nv_sqrtf` would force `sqrt.approx.*` unconditionally, since
-# LLVM's NVVMReflectPass folds `__CUDA_PREC_SQRT` to 0.
-
 @device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x)
 @device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x)
 @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x)))
@@ -383,9 +366,6 @@ end
 #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x)
 #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x)
 
-# trunc/ceil/floor inherit from Julia (`llvm.{trunc,ceil,floor}.f{32,64}`),
-# which NVPTX lowers natively to `cvt.r{zi,pi,mi}.f{32,64}.f{32,64}`.
-
 #@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y)
 #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
 #@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y))
@@ -490,16 +470,12 @@ end
 @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
 
-# `Base.{/, inv}` and `Base.FastMath.div_fast` inherit from Julia:
-# GPUCompiler's `PTXFDivFastPass` handles the `afn`-flagged fdiv these emit,
-# and NVPTX pattern-matches plain `fdiv 1.0, x` to `rcp.rn`.
-#
 # `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only
-# `Complex` has a method) and the catch-all fallback drops `afn`; route it
-# through `div_fast` so the pass sees the flag.
+# `Complex` has a method) and the catch-all fallback drops `afn`
 @device_override FastMath.inv_fast(x::Union{Float32, Float64}) =
     FastMath.div_fast(one(x), x)
 
+
 ## distributions
 
 # TODO: override StatsFun.jl?
@@ -519,14 +495,7 @@ end
 @device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
-# fma/muladd inherit from Julia. `Base.fma(::Float{32,64})` branches on
-# `julia.cpu.have_fma.f{32,64}`, which GPUCompiler folds to `true` on PTX,
-# leaving `llvm.fma`. `Base.muladd` emits `fmul contract + fadd contract`,
-# fused by the backend. Both lower to `fma.rn.f{32,64}` on NVPTX.
-#
-# `Base.fma(::Float16,...)` instead branches on a runtime `jl_have_fma` call
-# that GPUCompiler can't fold (see GPUCompiler's `cpu_features!`), so we
-# keep an explicit override for that one type.
+# `Base.fma(::Float16,...)` branches on `jl_have_fma`
 @device_override Base.fma(x::Float16, y::Float16, z::Float16) =
     ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
 
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 562cff0b0d..bc9b92f6a9 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -35,10 +35,6 @@ end
     end
 end
 
-# muladd is no longer overridden — Julia emits `fmul contract + fadd contract`,
-# which the backend fuses. The "fma/muladd emit fma.rn" PTX testset below
-# verifies the actual end-to-end result on every supported FP type.
-
 @testset "assume" begin
     foo(i) = cld(42, i)
     ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))

From f51889672801e9dbf3c900bbc99987a25fda331e Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 15:20:50 +0200
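Subject: [PATCH 05/10] Improve tests.

Move the PTX-level math checks (fastmath div/sqrt/inv, fma/muladd, and
the "lower without libdevice" testset) out of test/core/codegen.jl into
a new test/core/math.jl, with one testset per op family written with
`@filecheck`. FileCheck is added as a test dependency.

Also bump the GPUCompiler compat bound to 1.13.1 and add a header
comment to math.jl stating that libdevice is only used where needed.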
---
 CUDACore/Project.toml                  |   2 +-
 CUDACore/src/device/intrinsics/math.jl |   3 +
 test/Project.toml                      |   1 +
 test/core/codegen.jl                   | 162 +-----------------
 test/core/math.jl                      | 226 +++++++++++++++++++++++++
 test/setup.jl                          |   1 +
 6 files changed, 235 insertions(+), 160 deletions(-)
 create mode 100644 test/core/math.jl

diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index 174a776d0b..820b3cf86e 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.13"
+GPUCompiler = "1.13.1"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index d52160d7c8..6ffe352425 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -1,5 +1,8 @@
 # math functionality
 
+# we only use libdevice where needed. if possible, we go through LLVM instead,
+# ideally relying on Julia's existing definitions.
+
 @public fma, rsqrt, saturate, byte_perm, assume
 @public add_rn, add_rz, add_rm, add_rp
 @public sub_rn, sub_rz, sub_rm, sub_rp
diff --git a/test/Project.toml b/test/Project.toml
index b37fcf9ed4..510d5844af 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -11,6 +11,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index bc9b92f6a9..b0a95cd982 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -170,165 +170,9 @@ end
     @test !occursin(".local", asm)
 end
 
-@testset "fastmath" begin
-    function div_kernel(x)
-        i = threadIdx().x
-        @fastmath @inbounds x[i] = 1 / x[i]
-        return
-    end
-
-    asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("div.approx.ftz", asm)
-
-    function sqrt_kernel(x)
-        i = threadIdx().x
-        @inbounds x[i] = sqrt(x[i])
-        return
-    end
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
-    @test occursin("sqrt.r", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("sqrt.approx.ftz", asm)
-
-    # Pin down PTX for `/` and `inv` across {f32, f64} × {plain, @fastmath}
-    # × {default, fastmath=true job}, since `Base.{/, inv}` and their fast
-    # variants are now handled entirely by GPUCompiler's `PTXFDivFastPass`.
-    fdiv(x, y) = x / y
-    finv(x) = inv(x)
-    fdiv_fast(x, y) = @fastmath x / y
-    finv_fast(x) = @fastmath inv(x)
-
-    # plain: precise lowering (inv via NVPTX matching `fdiv 1.0, x` to rcp.rn).
-    for T in (Float32, Float64)
-        suffix = T === Float32 ? "f32" : "f64"
-        @test occursin("div.rn.$suffix", sprint(io->CUDA.code_ptx(io, fdiv, Tuple{T,T})))
-        @test occursin("rcp.rn.$suffix", sprint(io->CUDA.code_ptx(io, finv, Tuple{T,})))
-    end
-
-    # @fastmath: per-call `afn` reaches the pass. f64 → rcp + Newton (no
-    # native fast f64 fdiv); f32 stays non-FTZ since the job isn't fast.
-    asm = sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float32,Float32}))
-    @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm)
-    asm = sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float32,}))
-    @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm)
-    @test occursin("rcp.approx.ftz.f64",
-                   sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float64,Float64})))
-    @test occursin("rcp.approx.ftz.f64",
-                   sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float64,})))
-
-    # fastmath=true job: `apply_fastmath!` stamps `afn` on plain ops; f32
-    # additionally picks up FTZ.
-    @test occursin("div.approx.ftz.f32",
-                   sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float32,Float32}; fastmath=true)))
-    @test occursin("div.approx.ftz.f32",
-                   sprint(io->CUDA.code_ptx(io, finv, Tuple{Float32,}; fastmath=true)))
-    @test occursin("rcp.approx.ftz.f64",
-                   sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float64,Float64}; fastmath=true)))
-    @test occursin("rcp.approx.ftz.f64",
-                   sprint(io->CUDA.code_ptx(io, finv, Tuple{Float64,}; fastmath=true)))
-end
-
-@testset "fma/muladd emit fma.rn" begin
-    # fma and muladd should both lower to fma.rn in PTX
-    function fma_kernel(a, b, c)
-        @inbounds a[] = fma(b[], c[], a[])
-        return
-    end
-    function muladd_kernel(a, b, c)
-        @inbounds a[] = muladd(b[], c[], a[])
-        return
-    end
-
-    for T in (Float16, Float32, Float64)
-        asm = sprint(io->CUDA.code_ptx(io, fma_kernel,
-            NTuple{3,CuDeviceArray{T,1,AS.Global}}))
-        @test occursin("fma.rn", asm)
-        @test !occursin("__nv_fma", asm)
-
-        asm = sprint(io->CUDA.code_ptx(io, muladd_kernel,
-            NTuple{3,CuDeviceArray{T,1,AS.Global}}))
-        @test occursin("fma.rn", asm)
-    end
-end
-
-@testset "math intrinsics lower without libdevice" begin
-    # Pin down PTX for the ops whose `@device_override`s were dropped, since
-    # they now rely on Julia emitting canonical LLVM IR and NVPTX lowering it.
-    # No libdevice (`__nv_*`) calls should remain in any of these.
-
-    suffix(::Type{Float32}) = "f32"
-    suffix(::Type{Float64}) = "f64"
-    asm_of(f, T) = sprint(io->CUDA.code_ptx(io, f, T))
-    asm_fast(f, T) = sprint(io->CUDA.code_ptx(io, f, T; fastmath=true))
-
-    # `abs` lowers to a single instruction for both floats and ints.
-    for T in (Float32, Float64)
-        asm = asm_of(x->abs(x), Tuple{T})
-        @test occursin("abs.$(suffix(T))", asm) && !occursin("__nv_", asm)
-    end
-    for (T, s) in ((Int32, "s32"), (Int64, "s64"))
-        asm = asm_of(x->abs(x), Tuple{T})
-        @test occursin("abs.$s", asm) && !occursin("__nv_", asm)
-    end
-
-    # floor/ceil/trunc → `cvt.r{m,p,z}i.fXX.fXX`.
-    for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi"))
-        for T in (Float32, Float64)
-            s = suffix(T)
-            asm = asm_of(x->op(x), Tuple{T})
-            @test occursin("cvt.$rnd.$s.$s", asm) && !occursin("__nv_", asm)
-        end
-    end
-
-    # FP predicates: should be pure compare/bit-test, no libdevice.
-    for T in (Float32, Float64)
-        for op in (isnan, isinf, isfinite, signbit)
-            asm = asm_of(x->op(x), Tuple{T})
-            @test !occursin("__nv_", asm)
-        end
-    end
-    # isnan is the cleanest: a single `setp.nan.fXX`.
-    @test occursin("setp.nan.f32", asm_of(x->isnan(x), Tuple{Float32}))
-    @test occursin("setp.nan.f64", asm_of(x->isnan(x), Tuple{Float64}))
-
-    # copysign: no libdevice, no single instruction (bit-twiddle on NVPTX).
-    for T in (Float32, Float64)
-        asm = asm_of((x, y) -> copysign(x, y), Tuple{T, T})
-        @test !occursin("__nv_", asm)
-    end
-
-    # Default `min`/`max` propagate NaN (Julia semantics). f32 on sm_80+ +
-    # LLVM 14+ gets the dedicated `min.NaN`/`max.NaN`; f64 emulates via PTX.
-    for (op, instr) in ((min, "min"), (max, "max"))
-        for T in (Float32, Float64)
-            asm = asm_of((x, y) -> op(x, y), Tuple{T, T})
-            @test occursin("$instr.", asm)
-        end
-    end
-    @test occursin("min.NaN.f32", asm_of((x, y) -> min(x, y), Tuple{Float32, Float32}))
-    @test occursin("max.NaN.f32", asm_of((x, y) -> max(x, y), Tuple{Float32, Float32}))
-
-    # `@fastmath min/max` drops NaN handling, becoming a plain compare+select.
-    for T in (Float32, Float64)
-        asm = asm_of((x, y) -> @fastmath(min(x, y)), Tuple{T, T})
-        @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm)
-        asm = asm_of((x, y) -> @fastmath(max(x, y)), Tuple{T, T})
-        @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm)
-    end
-
-    # job-wide `fastmath=true` propagates `afn` via `apply_fastmath!`. The
-    # ops here don't use `afn` to pick variants, but `apply_fastmath!` also
-    # sets `denormal-fp-math-f32="preserve-sign"`, which NVPTX' `useF32FTZ`
-    # reads to pick FTZ variants for *every* f32 op (`abs.ftz.f32`,
-    # `cvt.r{m,p,z}i.ftz.*`, etc.). f64 stays unchanged since NVPTX has no
-    # FTZ behavior on f64.
-    @test occursin("abs.ftz.f32", asm_fast(x->abs(x), Tuple{Float32}))
-    @test occursin("abs.f64", asm_fast(x->abs(x), Tuple{Float64}))
-    @test occursin("cvt.rmi.ftz.f32.f32", asm_fast(x->floor(x), Tuple{Float32}))
-    @test occursin("cvt.rmi.f64.f64", asm_fast(x->floor(x), Tuple{Float64}))
-end
+# PTX-level checks for math intrinsics (abs/floor/ceil/trunc/isnan/isinf/
+# isfinite/signbit/copysign/min/max/fma/muladd/sqrt/div/inv, both plain and
+# `@fastmath`) live in `test/core/math.jl` with one testset per op family.
 
 @testset "header rewrite (.target/.version bump)" begin
     # When LLVM's NVPTX backend can't reach the device cap (e.g. Julia 1.12 +
diff --git a/test/core/math.jl b/test/core/math.jl
new file mode 100644
index 0000000000..a070eb4d38
--- /dev/null
+++ b/test/core/math.jl
@@ -0,0 +1,226 @@
+# Verify PTX lowering of math intrinsics across {f32, f64} × {plain,
+# `@fastmath`} × {default, job-wide `fastmath=true`}. Most of these used to
+# require `@device_override`s pointing at libdevice; now they're handled by
+# Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!` and
+# `PTXFDivFastPass`. Each testset pins down the actual PTX so the wiring
+# stays put.
+
+@testset "math" begin
+
+@testset "abs" begin
+    for fastmath in (false, true)
+        # f32: job-wide fastmath flips to the `.ftz` variant.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
+            @check cond=fastmath  "abs.ftz.f32"
+            @check cond=!fastmath "abs.f32"
+            @check_not "__nv_"
+            abs(x)
+        end
+        # f64: no FTZ on PTX for f64.
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
+            @check "abs.f64"
+            @check_not "__nv_"
+            abs(x)
+        end
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x
+        @check "abs.s32"
+        @check_not "__nv_"
+        abs(x)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x
+        @check "abs.s64"
+        @check_not "__nv_"
+        abs(x)
+    end
+end
+
+@testset "floor/ceil/trunc" begin
+    for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi"))
+        for fastmath in (false, true)
+            @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
+                @check cond=fastmath  "cvt.$rnd.ftz.f32.f32"
+                @check cond=!fastmath "cvt.$rnd.f32.f32"
+                @check_not "__nv_"
+                op(x)
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
+                @check "cvt.$rnd.f64.f64"
+                @check_not "__nv_"
+                op(x)
+            end
+        end
+    end
+end
+
+@testset "isnan/isinf/isfinite" begin
+    # All three should be pure FP compares / bit-tests, no libdevice.
+    for T in (Float32, Float64), op in (isnan, isinf, isfinite)
+        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+            @check_not "__nv_"
+            op(x)
+        end
+    end
+
+    # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`.
+    @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+        @check "setp.nan.f32"
+        isnan(x)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+        @check "setp.nan.f64"
+        isnan(x)
+    end
+end
+
+@testset "signbit" begin
+    for T in (Float32, Float64)
+        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+            @check_not "__nv_"
+            signbit(x)
+        end
+    end
+end
+
+@testset "copysign" begin
+    # NVPTX has no single copysign instruction (custom-lowered to bit ops);
+    # we just verify libdevice isn't on the path.
+    for T in (Float32, Float64)
+        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+            @check_not "__nv_"
+            copysign(x, y)
+        end
+    end
+end
+
+@testset "min/max" begin
+    # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+ &
+    # LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to emulate
+    # since PTX has no `.NaN` variant for f64.
+    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+        @check "min.NaN.f32"
+        min(x, y)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+        @check "max.NaN.f32"
+        max(x, y)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
+        @check_not "__nv_"
+        min(x, y)
+    end
+
+    # `@fastmath min` = `ifelse(y > x, x, y)` (max swaps the arms): compare + select.
+    for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+            @check "setp.lt.$s"
+            @check "selp.$s"
+            @fastmath min(x, y)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+            @check "setp.lt.$s"
+            @check "selp.$s"
+            @fastmath max(x, y)
+        end
+    end
+end
+
+@testset "fma/muladd" begin
+    # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for f32/f64
+    # by GPUCompiler; for f16 we keep an explicit override). `Base.muladd`
+    # lowers to `fmul contract + fadd contract`, which the backend fuses.
+    # Either way: a single `fma.rn` per type.
+    for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64"))
+        @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
+            @check "fma.rn.$s"
+            @check_not "__nv_fma"
+            fma(x, y, z)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
+            @check "fma.rn.$s"
+            muladd(x, y, z)
+        end
+    end
+end
+
+@testset "sqrt" begin
+    # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`; per-call
+    # `@fastmath` f32 → `sqrt.approx.f32`; job-wide `fastmath=true` → the FTZ
+    # variant via `apply_fastmath!`.
+    for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+            @check "sqrt.rn.$s"
+            @check_not "sqrt.approx"
+            sqrt(x)
+        end
+    end
+
+    @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+        @check "sqrt.approx.f32"
+        @check_not "sqrt.approx.ftz"
+        @fastmath sqrt(x)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
+        @check "sqrt.approx.ftz.f32"
+        sqrt(x)
+    end
+    # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp.
+    @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+        @check "rsqrt.approx.f64"
+        @fastmath sqrt(x)
+    end
+end
+
+@testset "div/inv" begin
+    # `Base.{/, inv}` and their fast variants are now handled entirely by
+    # GPUCompiler's `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches
+    # `fdiv 1.0, x` to `rcp.rn`.
+    for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+            @check "div.rn.$s"
+            x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+            @check "rcp.rn.$s"
+            inv(x)
+        end
+    end
+
+    # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since the
+    # job isn't fast; f64 always uses rcp+Newton (no native fast f64 fdiv).
+    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+        @check "div.approx.f32"
+        @check_not "div.approx.ftz"
+        @fastmath x / y
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+        @check "div.approx.f32"
+        @check_not "div.approx.ftz"
+        @fastmath inv(x)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
+        @check "rcp.approx.ftz.f64"
+        @fastmath x / y
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+        @check "rcp.approx.ftz.f64"
+        @fastmath inv(x)
+    end
+
+    # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as @fastmath,
+    # and f32 additionally picks up FTZ.
+    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y
+        @check "div.approx.ftz.f32"
+        x / y
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
+        @check "div.approx.ftz.f32"
+        inv(x)
+    end
+    @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y
+        @check "rcp.approx.ftz.f64"
+        x / y
+    end
+end
+
+end
diff --git a/test/setup.jl b/test/setup.jl
index dc28bd9dc9..2561cbe7a1 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -1,6 +1,7 @@
 using Test
 using CUDA
 using CUDACore
+using FileCheck
 using GPUArrays
 using NVML: has_nvml, NVML
 using ParallelTestRunner

From 7ab5b8c54bec753b85331c1c464f676aab3a2989 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 15:36:38 +0200
Subject: [PATCH 06/10] Use FileCheck more widely.

---
 test/core/codegen.jl                |  92 ++++------
 test/core/device/array.jl           |  22 +--
 test/core/device/intrinsics.jl      |   8 +-
 test/core/device/intrinsics/math.jl | 273 +++++++++++++++++++++++-----
 test/core/device/intrinsics/wmma.jl |  22 +--
 test/core/device/ldg.jl             |  21 +--
 test/core/execution.jl              |  36 ++--
 test/core/math.jl                   | 226 -----------------------
 8 files changed, 317 insertions(+), 383 deletions(-)
 delete mode 100644 test/core/math.jl

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index b0a95cd982..97d1771792 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -1,49 +1,45 @@
 @testset "LLVM IR" begin
 
 @testset "JuliaLang/julia#21121" begin
-    function foobar()
+    @test @filecheck CUDA.code_llvm(Tuple{}) do
+        @check_not "inttoptr"
         weight_matrix = CuStaticSharedArray(Float32, (16, 16))
         sync_threads()
         weight_matrix[1, 16] *= 2
         sync_threads()
     end
-
-    ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}))
-    @test !occursin("inttoptr", ir)
 end
 
 @testset "CUDA.jl#553" begin
-    function kernel(ptr)
-       unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-       return
+    @test @filecheck CUDA.code_llvm(Tuple{Ptr{Float32}}) do ptr
+        @check_not "@__nv_fmaf"
+        unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+        return
     end
-
-    ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{Ptr{Float32}}))
-    @test !occursin("@__nv_fmaf", ir)
 end
 
 @testset "fma uses LLVM intrinsic" begin
-    function fma_kernel(ptr)
-        unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-        return
-    end
-
     for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
-        ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}}))
-        @test occursin("llvm.fma.$suffix", ir)
-        @test !occursin("__nv_fma", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
+            @check "llvm.fma.$suffix"
+            @check_not "__nv_fma"
+            unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+            return
+        end
     end
 end
 
 @testset "assume" begin
-    foo(i) = cld(42, i)
-    ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))
-    @test occursin("@gpu_report_exception", ir)
-
+    @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
+        @check "@gpu_report_exception"
+        cld(42, i)
+    end
 
-    bar(i) = (CUDA.assume(i > 0); cld(42, i))
-    ir = sprint(io->CUDA.code_llvm(io, bar, Tuple{Int}))
-    @test !occursin("gpu_report_exception", ir)
+    @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
+        @check_not "gpu_report_exception"
+        CUDA.assume(i > 0)
+        cld(42, i)
+    end
 end
 
 @testset "stripping invariant.load" begin
@@ -132,48 +128,32 @@ end
 @testset "PTX" begin
 
 @testset "always_inline" begin
-    function f_expensive(x)
-        Base.Cartesian.@nexprs 30 i -> x = sin(x)+i
-    end
-
-    function g(x)
-        f_expensive(x)
-        return
-    end
-    function h(x)
-        f_expensive(x)
-        return
+    # without `always_inline`, the helper survives as a separate `.func`;
+    # with it set, the helper is inlined and no `.func julia_f_expensive`
+    # declaration remains. `f_expensive` is defined once, outside the loop;
+    # the loop compiles the same do-block caller with and without the kwarg
+    # and flips the expectation via `cond=`.
+    f_expensive(x) = (Base.Cartesian.@nexprs 30 i -> x = sin(x)+i; x)
+    for always_inline in (false, true)
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}; always_inline) do x
+            @check     cond=!always_inline "{{\\.func .*julia_f_expensive}}"
+            @check_not cond=always_inline  "{{\\.func .*julia_f_expensive}}"
+            f_expensive(x)
+            return
+        end
     end
-
-    asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}))
-    @test occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}; always_inline=true))
-    @test !occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}; always_inline=true))
-    @test !occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}))
-    @test occursin(r"\.func .*julia_f_expensive", asm)
 end
 
 @testset "local memory stores due to byval" begin
     # JuliaGPU/GPUCompiler.jl#92
-    function kernel(y1, y2)
+    @test @filecheck CUDA.code_ptx(NTuple{2,CuDeviceArray{Float32,1,AS.Global}}) do y1, y2
+        @check_not ".local"
         y = threadIdx().x == 1 ? y1 : y2
         @inbounds y[] = 0
         return
     end
-
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
-    @test !occursin(".local", asm)
 end
 
-# PTX-level checks for math intrinsics (abs/floor/ceil/trunc/isnan/isinf/
-# isfinite/signbit/copysign/min/max/fma/muladd/sqrt/div/inv, both plain and
-# `@fastmath`) live in `test/core/math.jl` with one testset per op family.
-
 @testset "header rewrite (.target/.version bump)" begin
     # When LLVM's NVPTX backend can't reach the device cap (e.g. Julia 1.12 +
     # LLVM 18 on a Blackwell device), `_compiler_config` produces a split
diff --git a/test/core/device/array.jl b/test/core/device/array.jl
index 240fed288b..f8f5b7a142 100644
--- a/test/core/device/array.jl
+++ b/test/core/device/array.jl
@@ -68,22 +68,23 @@ end
 
 @testset "bounds checking" begin
     @testset "#313" begin
-        function kernel(dest)
+        kernel = dest -> (dest[1] = 1; nothing)
+        tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
+                            Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
+        @test @filecheck CUDA.code_llvm(tt) do dest
+            @check_not "jl_invoke"
             dest[1] = 1
             nothing
         end
-        tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
-                            Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
-
-        ir = sprint(io->CUDA.code_llvm(io, kernel, tt))
-        @test !occursin("jl_invoke", ir)
+        # also smoke-test that PTX codegen succeeds for this signature.
         CUDA.code_ptx(devnull, kernel, tt)
     end
 
     # test that we don't do needless bounds checking when the kernel already does it
     # (enabled by the fact that we store `len` next to `dims`)
-    let
-        function kernel(A)
+    for N in 1:3
+        @test @filecheck CUDA.code_llvm(Tuple{CuDeviceArray{Int,N,AS.Global}}) do A
+            @check_not "boundserror"
             idx = threadIdx().x
             if idx <= length(A)
                 # we did our own bounds checking, so no check should be left!
@@ -91,11 +92,6 @@ end
             end
             return
         end
-
-        for N in 1:3
-            ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{CuDeviceArray{Int,N,AS.Global}}))
-            @test !occursin("boundserror", ir)
-        end
     end
 end
 
diff --git a/test/core/device/intrinsics.jl b/test/core/device/intrinsics.jl
index 2491e33423..504ef8c3b5 100644
--- a/test/core/device/intrinsics.jl
+++ b/test/core/device/intrinsics.jl
@@ -31,10 +31,10 @@
     @on_device active_mask()
 
     @testset "range metadata" begin
-        foobar() = threadIdx().x
-        ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true))
-
-        @test occursin(r"call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{}; raw=true) do
+            @check "{{call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range}}"
+            threadIdx().x
+        end
     end
 end
 
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index 1843f6213f..5eea318449 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -338,72 +338,34 @@ using SpecialFunctions
 
     @testset "@fastmath sincos" begin
         # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
-        function kernel(a, b, c)
+        @test @filecheck CUDA.code_ptx(NTuple{3,CuDeviceArray{Float32,1,AS.Global}}) do a, b, c
+            @check "sin.approx.f32"
+            @check "cos.approx.f32"
+            @check_not "__nv"  # from libdevice
             @inbounds b[], c[] = @fastmath sincos(a[])
             return
         end
-        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
-        @assert contains(asm, "sin.approx.f32")
-        @assert contains(asm, "cos.approx.f32")
-        @assert !contains(asm, "__nv")  # from libdevice
     end
 
     @testset "inv" begin
-        # Base.inv should use accurate rcp instructions (rcp.rn)
+        # Base.inv should use accurate rcp instructions (rcp.rn).
+        # PTX-level patterns for inv / inv_fast / div / div_fast are pinned by the
+        # "div/inv PTX" testset further down this file; here we only check results.
         for T in (Float32, Float64)
             @test testf(x -> inv.(x), rand(T, 10) .+ T(0.1))
             @test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0])
         end
-
-        function kernel_inv_f32(a)
-            @inbounds a[] = inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
-        @test contains(asm, "rcp.rn.f32")
-
-        function kernel_inv_f64(a)
-            @inbounds a[] = inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
-        @test contains(asm, "rcp.rn.f64")
     end
 
     @testset "inv_fast" begin
-        # inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa)
-        function kernel_inv_fast_f32(a)
-            @inbounds a[] = @fastmath inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
-        @test contains(asm, "rcp.approx.ftz.f32")
-
         fast_inv(x) = @fastmath inv(x)
         xs32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
         @test Array(map(fast_inv, cu(xs32))) ≈ inv.(xs32) rtol = 1.0f-4
-
-        # inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson
-        function kernel_inv_fast_f64(a)
-            @inbounds a[] = @fastmath inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
-        @test contains(asm, "rcp.approx.ftz.f64")
-
         xs64 = Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
         @test Array(map(fast_inv, CuArray(xs64))) ≈ inv.(xs64) rtol = 1.0e-10
     end
 
     @testset "div_fast Float64" begin
-        # FastMath.div_fast(Float64) uses fast reciprocal: x * inv_fast(y)
-        function kernel_div_fast_f64(a, b, c)
-            @inbounds c[] = @fastmath a[] / b[]
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast_f64, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}}))
-        @test contains(asm, "rcp.approx.ftz.f64")
-
         fast_div(x, y) = @fastmath x / y
         xs = rand(Float64, 10) .+ 0.1
         ys = rand(Float64, 10) .+ 0.1
@@ -420,4 +382,225 @@ using SpecialFunctions
             @test isequal(maximum(AT([NaN])), NaN)
         end
     end
+
+    # PTX lowering pins for the standard math ops. Most of these used to
+    # require `@device_override`s pointing at libdevice; now they're handled
+    # by Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!`,
+    # `PTXFDivFastPass`, and `PTXFSqrtFastPass`. Each testset pins the actual
+    # PTX so the wiring stays put across {f32, f64} × {plain, `@fastmath`} ×
+    # {default, job-wide `fastmath=true`}.
+
+    @testset "abs PTX" begin
+        for fastmath in (false, true)
+            # f32: job-wide fastmath flips to the `.ftz` variant.
+            @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
+                @check cond=fastmath  "abs.ftz.f32"
+                @check cond=!fastmath "abs.f32"
+                @check_not "__nv_"
+                abs(x)
+            end
+            # f64: no FTZ on PTX for f64.
+            @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
+                @check "abs.f64"
+                @check_not "__nv_"
+                abs(x)
+            end
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x
+            @check "abs.s32"
+            @check_not "__nv_"
+            abs(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x
+            @check "abs.s64"
+            @check_not "__nv_"
+            abs(x)
+        end
+    end
+
+    @testset "floor/ceil/trunc PTX" begin
+        for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi"))
+            for fastmath in (false, true)
+                @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
+                    @check cond=fastmath  "cvt.$rnd.ftz.f32.f32"
+                    @check cond=!fastmath "cvt.$rnd.f32.f32"
+                    @check_not "__nv_"
+                    op(x)
+                end
+                @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
+                    @check "cvt.$rnd.f64.f64"
+                    @check_not "__nv_"
+                    op(x)
+                end
+            end
+        end
+    end
+
+    @testset "isnan/isinf/isfinite PTX" begin
+        # All three should be pure FP compares / bit-tests, no libdevice.
+        for T in (Float32, Float64), op in (isnan, isinf, isfinite)
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check_not "__nv_"
+                op(x)
+            end
+        end
+        # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+            @check "setp.nan.f32"
+            isnan(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+            @check "setp.nan.f64"
+            isnan(x)
+        end
+    end
+
+    @testset "signbit PTX" begin
+        for T in (Float32, Float64)
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check_not "__nv_"
+                signbit(x)
+            end
+        end
+    end
+
+    @testset "copysign PTX" begin
+        # NVPTX has no single copysign instruction (custom-lowered to bit ops);
+        # we just verify libdevice isn't on the path.
+        for T in (Float32, Float64)
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check_not "__nv_"
+                copysign(x, y)
+            end
+        end
+    end
+
+    @testset "min/max PTX" begin
+        # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+
+        # + LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to
+        # emulate since PTX has no `.NaN` variant for f64.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+            @check "min.NaN.f32"
+            min(x, y)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+            @check "max.NaN.f32"
+            max(x, y)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
+            @check_not "__nv_"
+            min(x, y)
+        end
+
+        # `@fastmath min` = `ifelse(y > x, x, y)` (max swaps the arms): compare + select.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check "setp.lt.$s"
+                @check "selp.$s"
+                @fastmath min(x, y)
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check "setp.lt.$s"
+                @check "selp.$s"
+                @fastmath max(x, y)
+            end
+        end
+    end
+
+    @testset "fma/muladd PTX" begin
+        # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for
+        # f32/f64 by GPUCompiler; for f16 we keep an explicit override).
+        # `Base.muladd` lowers to `fmul contract + fadd contract`, which the
+        # backend fuses. Either way: a single `fma.rn` per type.
+        for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
+                @check "fma.rn.$s"
+                @check_not "__nv_fma"
+                fma(x, y, z)
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
+                @check "fma.rn.$s"
+                muladd(x, y, z)
+            end
+        end
+    end
+
+    @testset "sqrt PTX" begin
+        # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`;
+        # per-call `@fastmath` f32 → `sqrt.approx.f32` (via `PTXFSqrtFastPass`);
+        # job-wide `fastmath=true` → the FTZ variant via `apply_fastmath!`.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check "sqrt.rn.$s"
+                @check_not "sqrt.approx"
+                sqrt(x)
+            end
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+            @check "sqrt.approx.f32"
+            @check_not "sqrt.approx.ftz"
+            @fastmath sqrt(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
+            @check "sqrt.approx.ftz.f32"
+            sqrt(x)
+        end
+        # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp.
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+            @check "rsqrt.approx.f64"
+            @fastmath sqrt(x)
+        end
+    end
+
+    @testset "div/inv PTX" begin
+        # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's
+        # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches
+        # `fdiv 1.0, x` to `rcp.rn`.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check "div.rn.$s"
+                x / y
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check "rcp.rn.$s"
+                inv(x)
+            end
+        end
+
+        # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since
+        # the job isn't fast; f64 always uses rcp+Newton.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+            @check "div.approx.f32"
+            @check_not "div.approx.ftz"
+            @fastmath x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+            @check "div.approx.f32"
+            @check_not "div.approx.ftz"
+            @fastmath inv(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
+            @check "rcp.approx.ftz.f64"
+            @fastmath x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+            @check "rcp.approx.ftz.f64"
+            @fastmath inv(x)
+        end
+
+        # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as
+        # `@fastmath`, and f32 additionally picks up FTZ.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y
+            @check "div.approx.ftz.f32"
+            x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
+            @check "div.approx.ftz.f32"
+            inv(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y
+            @check "rcp.approx.ftz.f64"
+            x / y
+        end
+    end
 end
diff --git a/test/core/device/intrinsics/wmma.jl b/test/core/device/intrinsics/wmma.jl
index f4ef5fdf4c..3281c56157 100644
--- a/test/core/device/intrinsics/wmma.jl
+++ b/test/core/device/intrinsics/wmma.jl
@@ -478,36 +478,26 @@ end
 
 @testset "Codegen addressing" begin
     @testset "Global" begin
-        function kernel(d)
+        @test @filecheck CUDA.code_ptx((CuDeviceArray{Float32,1,CUDA.AS.Global},)) do d
+            @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32}}"
+            @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}"
             conf = WMMA.Config{16, 16, 16, Float32}
-
             d_frag = WMMA.fill_c(Float32(0), conf)
             WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)
-
             return
         end
-
-        ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
-
-        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-        @test  occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
     end
 
     @testset "Shared" begin
-        function kernel()
+        @test @filecheck CUDA.code_ptx(()) do
+            @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32}}"
+            @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}"
             shmem = CuStaticSharedArray(Float32, (16, 16))
             conf = WMMA.Config{16, 16, 16, Float32}
-
             d_frag = WMMA.fill_c(Float32(0), conf)
             WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)
-
             return
         end
-
-        ptx = sprint(io -> CUDA.code_ptx(io, kernel, ()))
-
-        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-        @test  occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
     end
 end
 
diff --git a/test/core/device/ldg.jl b/test/core/device/ldg.jl
index 0385e4e34a..b25f953449 100644
--- a/test/core/device/ldg.jl
+++ b/test/core/device/ldg.jl
@@ -1,15 +1,14 @@
 @testset "ldg" begin
-    # NOTE: This is necessary because it seems that code_llvm has a bug which causes it to ignore
-    #       the method table. Wrapping it in a function gets us what we want currently but the PR
-    #       here: https://github.com/JuliaLang/julia/pull/60718 will likely fix this according to
-    #       @vchuravy. It is currently not backported.
-    ir = sprint(io->CUDA.code_llvm(io, (args...)->CUDACore.pointerref_ldg(args...), Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true))
-    if Base.libllvm_version >= v"20"
-        # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with
-        # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`.
-        @test occursin("!invariant.load", ir)
-    else
-        @test occursin("@llvm.nvvm.ldg", ir)
+    # NOTE: the wrapping function is necessary because code_llvm has a bug
+    # that causes it to ignore the method table; JuliaLang/julia#60718 will
+    # likely fix this according to @vchuravy.
+    # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with
+    # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`.
+    new_llvm = Base.libllvm_version >= v"20"
+    @test @filecheck CUDA.code_llvm(Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true) do args...
+        @check cond=new_llvm  "!invariant.load"
+        @check cond=!new_llvm "@llvm.nvvm.ldg"
+        CUDACore.pointerref_ldg(args...)
     end
 end
 
diff --git a/test/core/execution.jl b/test/core/execution.jl
index 387af3547e..4f8a8d3495 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -61,16 +61,22 @@ end
     # to constructing the SMVersion directly.
     @cuda launch=false arch=dev_cap dummy()
     # but we should be able to see it in the generated PTX code
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50"))
-    @test contains(asm, ".target sm_50")
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0"))
-    @test contains(asm, ".target sm_50")
+    @test @filecheck CUDA.code_ptx((); arch=sm"50") do
+        @check ".target sm_50"
+        dummy()
+    end
+    @test @filecheck CUDA.code_ptx((); arch=v"5.0") do
+        @check ".target sm_50"
+        dummy()
+    end
 
     # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the
     # `.version` line should match what was asked for, independently of what LLVM and
     # ptxas would natively pick.
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3"))
-    @test contains(asm, ".version 6.3")
+    @test @filecheck CUDA.code_ptx((); ptx=v"6.3") do
+        @check ".version 6.3"
+        dummy()
+    end
 
     # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it
     # used to be); a clearly out-of-range value must error at config time.
@@ -84,14 +90,18 @@ end
     sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
 
     if dev_cap >= v"9.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a))
-        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+        @test @filecheck CUDA.code_ptx((); arch=sm_a) do
+            @check ".target $(CUDACore.cpu_name(sm_a))"
+            dummy()
+        end
         # arch-specific cubin should also actually launch on the matching device
         @cuda arch=sm_a dummy()
     end
     if dev_cap >= v"10.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f))
-        @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))")
+        @test @filecheck CUDA.code_ptx((); arch=sm_f) do
+            @check ".target $(CUDACore.cpu_name(sm_f))"
+            dummy()
+        end
         @cuda arch=sm_f dummy()
     end
 
@@ -103,8 +113,10 @@ end
     # since we know the exact device. The cuda-side `.target` is the variant regardless of
     # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs.
     if dev_cap >= v"9.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, ()))
-        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+        @test @filecheck CUDA.code_ptx(()) do
+            @check ".target $(CUDACore.cpu_name(sm_a))"
+            dummy()
+        end
     end
 
     # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built
diff --git a/test/core/math.jl b/test/core/math.jl
deleted file mode 100644
index a070eb4d38..0000000000
--- a/test/core/math.jl
+++ /dev/null
@@ -1,226 +0,0 @@
-# Verify PTX lowering of math intrinsics across {f32, f64} × {plain,
-# `@fastmath`} × {default, job-wide `fastmath=true`}. Most of these used to
-# require `@device_override`s pointing at libdevice; now they're handled by
-# Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!` and
-# `PTXFDivFastPass`. Each testset pins down the actual PTX so the wiring
-# stays put.
-
-@testset "math" begin
-
-@testset "abs" begin
-    for fastmath in (false, true)
-        # f32: job-wide fastmath flips to the `.ftz` variant.
-        @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
-            @check cond=fastmath  "abs.ftz.f32"
-            @check cond=!fastmath "abs.f32"
-            @check_not "__nv_"
-            abs(x)
-        end
-        # f64: no FTZ on PTX for f64.
-        @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
-            @check "abs.f64"
-            @check_not "__nv_"
-            abs(x)
-        end
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x
-        @check "abs.s32"
-        @check_not "__nv_"
-        abs(x)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x
-        @check "abs.s64"
-        @check_not "__nv_"
-        abs(x)
-    end
-end
-
-@testset "floor/ceil/trunc" begin
-    for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi"))
-        for fastmath in (false, true)
-            @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
-                @check cond=fastmath  "cvt.$rnd.ftz.f32.f32"
-                @check cond=!fastmath "cvt.$rnd.f32.f32"
-                @check_not "__nv_"
-                op(x)
-            end
-            @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
-                @check "cvt.$rnd.f64.f64"
-                @check_not "__nv_"
-                op(x)
-            end
-        end
-    end
-end
-
-@testset "isnan/isinf/isfinite" begin
-    # All three should be pure FP compares / bit-tests, no libdevice.
-    for T in (Float32, Float64), op in (isnan, isinf, isfinite)
-        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
-            @check_not "__nv_"
-            op(x)
-        end
-    end
-
-    # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`.
-    @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
-        @check "setp.nan.f32"
-        isnan(x)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
-        @check "setp.nan.f64"
-        isnan(x)
-    end
-end
-
-@testset "signbit" begin
-    for T in (Float32, Float64)
-        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
-            @check_not "__nv_"
-            signbit(x)
-        end
-    end
-end
-
-@testset "copysign" begin
-    # NVPTX has no single copysign instruction (custom-lowered to bit ops);
-    # we just verify libdevice isn't on the path.
-    for T in (Float32, Float64)
-        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
-            @check_not "__nv_"
-            copysign(x, y)
-        end
-    end
-end
-
-@testset "min/max" begin
-    # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+ &
-    # LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to emulate
-    # since PTX has no `.NaN` variant for f64.
-    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
-        @check "min.NaN.f32"
-        min(x, y)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
-        @check "max.NaN.f32"
-        max(x, y)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
-        @check_not "__nv_"
-        min(x, y)
-    end
-
-    # `@fastmath min/max` = `ifelse(y > x, x, y)`, a plain compare + select.
-    for (T, s) in ((Float32, "f32"), (Float64, "f64"))
-        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
-            @check "setp.lt.$s"
-            @check "selp.$s"
-            @fastmath min(x, y)
-        end
-        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
-            @check "setp.lt.$s"
-            @check "selp.$s"
-            @fastmath max(x, y)
-        end
-    end
-end
-
-@testset "fma/muladd" begin
-    # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for f32/f64
-    # by GPUCompiler; for f16 we keep an explicit override). `Base.muladd`
-    # lowers to `fmul contract + fadd contract`, which the backend fuses.
-    # Either way: a single `fma.rn` per type.
-    for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64"))
-        @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
-            @check "fma.rn.$s"
-            @check_not "__nv_fma"
-            fma(x, y, z)
-        end
-        @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
-            @check "fma.rn.$s"
-            muladd(x, y, z)
-        end
-    end
-end
-
-@testset "sqrt" begin
-    # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`; per-call
-    # `@fastmath` → `sqrt.approx.fXX`; job-wide `fastmath=true` → the FTZ
-    # variant via `apply_fastmath!`.
-    for (T, s) in ((Float32, "f32"), (Float64, "f64"))
-        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
-            @check "sqrt.rn.$s"
-            @check_not "sqrt.approx"
-            sqrt(x)
-        end
-    end
-
-    @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
-        @check "sqrt.approx.f32"
-        @check_not "sqrt.approx.ftz"
-        @fastmath sqrt(x)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
-        @check "sqrt.approx.ftz.f32"
-        sqrt(x)
-    end
-    # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp.
-    @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
-        @check "rsqrt.approx.f64"
-        @fastmath sqrt(x)
-    end
-end
-
-@testset "div/inv" begin
-    # `Base.{/, inv}` and their fast variants are now handled entirely by
-    # GPUCompiler's `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches
-    # `fdiv 1.0, x` to `rcp.rn`.
-    for (T, s) in ((Float32, "f32"), (Float64, "f64"))
-        @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
-            @check "div.rn.$s"
-            x / y
-        end
-        @test @filecheck CUDA.code_ptx(Tuple{T}) do x
-            @check "rcp.rn.$s"
-            inv(x)
-        end
-    end
-
-    # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since the
-    # job isn't fast; f64 always uses rcp+Newton (no native fast f64 fdiv).
-    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
-        @check "div.approx.f32"
-        @check_not "div.approx.ftz"
-        @fastmath x / y
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
-        @check "div.approx.f32"
-        @check_not "div.approx.ftz"
-        @fastmath inv(x)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
-        @check "rcp.approx.ftz.f64"
-        @fastmath x / y
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
-        @check "rcp.approx.ftz.f64"
-        @fastmath inv(x)
-    end
-
-    # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as @fastmath,
-    # and f32 additionally picks up FTZ.
-    @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y
-        @check "div.approx.ftz.f32"
-        x / y
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
-        @check "div.approx.ftz.f32"
-        inv(x)
-    end
-    @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y
-        @check "rcp.approx.ftz.f64"
-        x / y
-    end
-end
-
-end

From bc47367138093ac218509123436007348d60e2b1 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 17:24:49 +0200
Subject: [PATCH 07/10] Fix CI.

---
 test/helpers.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/helpers.jl b/test/helpers.jl
index 2809a4b4a8..7ad4eb1a5e 100644
--- a/test/helpers.jl
+++ b/test/helpers.jl
@@ -6,6 +6,7 @@
 using CUDA, CUDACore, GPUArrays
 using CUDA: i32
 using Adapt
+using FileCheck
 using ..Main: TestSuite, can_use_cupti, sanitize
 
 testf(@nospecialize(f), @nospecialize(xs...); kwargs...) =

From 0b0dc669ee02af7c40741d0d1b31ed5f78249708 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 20 May 2026 17:24:58 +0200
Subject: [PATCH 08/10] Address review comments.

---
 CUDACore/src/device/intrinsics/math.jl | 13 ++++++++++++-
 test/core/codegen.jl                   | 14 ++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index 6ffe352425..d6140cb1ae 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -475,7 +475,7 @@ end
 
 # `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only
 # `Complex` has a method) and the catch-all fallback drops `afn`
-@device_override FastMath.inv_fast(x::Union{Float32, Float64}) =
+@device_override FastMath.inv_fast(x::Union{Float16, Float32, Float64}) =
     FastMath.div_fast(one(x), x)
 
 
@@ -502,6 +502,17 @@ end
 @device_override Base.fma(x::Float16, y::Float16, z::Float16) =
     ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
 
+# Lowering `Base.muladd(x, y, z)` to `llvm.fmuladd.fXX` is the natural choice on
+# GPU: NVPTX always lowers that intrinsic to `fma.rn`, and routing through
+# `llvm.fmuladd` (rather than Julia's default `fmul contract + fadd contract`)
+# keeps the fusion robust under vectorization (per JuliaGPU/CUDA.jl#3149).
+@device_override Base.muladd(x::Float64, y::Float64, z::Float64) =
+    ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
+@device_override Base.muladd(x::Float32, y::Float32, z::Float32) =
+    ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
+@device_override Base.muladd(x::Float16, y::Float16, z::Float16) =
+    ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
+
 # Directed rounding for binary arithmetic and fma. NVPTX exposes
 # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub`
 # intrinsic, so subtraction reuses add(x, -y) (negation is bit-exact for IEEE
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index 97d1771792..1941f289cc 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -29,6 +29,20 @@ end
     end
 end
 
+@testset "muladd uses LLVM intrinsic" begin
+    # `Base.muladd` emits `fmul contract + fadd contract` upstream, which the
+    # backend usually fuses to `fma.rn`. On GPU the fusion is unreliable under
+    # vectorization (JuliaGPU/CUDA.jl#3149), so the override routes through
+    # `llvm.fmuladd.fXX` directly.
+    for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
+        @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
+            @check "llvm.fmuladd.$suffix"
+            unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+            return
+        end
+    end
+end
+
 @testset "assume" begin
     @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
         @check "@gpu_report_exception"

From dffd57f14d7b88b9b03764d0f875049eedcd7a04 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 21 May 2026 09:07:52 +0200
Subject: [PATCH 09/10] Simplify rsqrt, fix min/max PTX test for sm<80.

rsqrt now uses high-level `@fastmath 1/sqrt(x)`; GPUCompiler's new
PTXRSqrtFastPass lowers it to `nvvm.rsqrt.approx.{f,d}` directly. Adds a
PTX FileCheck test pinning the lowering.

Pin `arch=sm"80"` on the min.NaN.f32 / max.NaN.f32 PTX checks so they
pass on sm_75 CI runners.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CUDACore/Project.toml                  |  2 +-
 CUDACore/src/device/intrinsics/math.jl |  4 ++--
 test/core/device/intrinsics/math.jl    | 27 +++++++++++++++++++++++---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index 820b3cf86e..62d4e196e5 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.13.1"
+GPUCompiler = "1.13.2"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index d6140cb1ae..bda26d9a42 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -298,8 +298,8 @@ end
 
 ## roots and powers
 
-@device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x)
-@device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x)
+@device_function rsqrt(x::Float64) = @fastmath 1.0 / sqrt(x)
+@device_function rsqrt(x::Float32) = @fastmath 1f0 / sqrt(x)
 @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x)))
 
 @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x)
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index 5eea318449..13c5b4e5ae 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -478,12 +478,13 @@ using SpecialFunctions
     @testset "min/max PTX" begin
         # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+
         # + LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to
-        # emulate since PTX has no `.NaN` variant for f64.
-        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+        # emulate since PTX has no `.NaN` variant for f64. Pin `arch=sm"80"`
+        # so the test is deterministic regardless of the CI runner's device.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y
             @check "min.NaN.f32"
             min(x, y)
         end
-        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y
             @check "max.NaN.f32"
             max(x, y)
         end
@@ -552,6 +553,26 @@ using SpecialFunctions
         end
     end
 
+    @testset "rsqrt PTX" begin
+        # `CUDA.rsqrt(x)` is `@fastmath 1/sqrt(x)`; GPUCompiler's
+        # `PTXRSqrtFastPass` folds the `afn 1/sqrt(x)` pattern to a single
+        # `nvvm.rsqrt.approx.{f,d}` call. f16 computes in f32, so it still
+        # hits the f32 instruction.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check "rsqrt.approx.$s"
+                @check_not "sqrt.approx"
+                @check_not "__nv_"
+                CUDA.rsqrt(x)
+            end
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float16}) do x
+            @check "rsqrt.approx.f32"
+            @check_not "__nv_"
+            CUDA.rsqrt(x)
+        end
+    end
+
     @testset "div/inv PTX" begin
         # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's
         # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches

From 986ca42b55f766b21fb4378e2468b138ac4058f9 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Thu, 21 May 2026 09:49:27 +0200
Subject: [PATCH 10/10] Define rsqrt via direct NVPTX intrinsic, not @fastmath.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `@fastmath 1/sqrt(x)` form stamps `fast` (nnan/ninf/...) on the IR
operations, which let LLVM DCE caller-side `isnan(rsqrt(x))` and
`isinf(rsqrt(x))` checks before our PTXRSqrtFastPass folded the pattern —
a behavior regression versus the libdevice path. Direct `ccall` to
`llvm.nvvm.rsqrt.approx.{f,d}` is opaque to fast-math reasoning, matches
what libdevice itself does (a thin wrapper around the same intrinsic),
and produces strictly cleaner IR than libdevice (single rsqrt call +
select rather than phi + duplicate call).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CUDACore/Project.toml                  | 2 +-
 CUDACore/src/device/intrinsics/math.jl | 7 +++++--
 test/core/device/intrinsics/math.jl    | 8 ++++----
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index 62d4e196e5..820b3cf86e 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.13.2"
+GPUCompiler = "1.13.1"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index bda26d9a42..16f9cc8738 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -298,8 +298,11 @@ end
 
 ## roots and powers
 
-@device_function rsqrt(x::Float64) = @fastmath 1.0 / sqrt(x)
-@device_function rsqrt(x::Float32) = @fastmath 1f0 / sqrt(x)
+# NVPTX has native `rsqrt.approx.{f32,f64}`; call the intrinsic directly.
+# `@fastmath 1/sqrt(x)` lowers to the same instruction (via `PTXRSqrtFastPass`),
+# but its `nnan`/`ninf` flags let LLVM delete caller-side `isnan`/`isinf` checks.
+@device_function rsqrt(x::Float64) = ccall("llvm.nvvm.rsqrt.approx.d", llvmcall, Cdouble, (Cdouble,), x)
+@device_function rsqrt(x::Float32) = ccall("llvm.nvvm.rsqrt.approx.f", llvmcall, Cfloat, (Cfloat,), x)
 @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x)))
 
 @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x)
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index 13c5b4e5ae..4ad292390c 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -554,10 +554,10 @@ using SpecialFunctions
     end
 
     @testset "rsqrt PTX" begin
-        # `CUDA.rsqrt(x)` is `@fastmath 1/sqrt(x)`; GPUCompiler's
-        # `PTXRSqrtFastPass` folds the `afn 1/sqrt(x)` pattern to a single
-        # `nvvm.rsqrt.approx.{f,d}` call. f16 computes in f32, so it still
-        # hits the f32 instruction.
+        # `CUDA.rsqrt(x)` directly calls the NVPTX `rsqrt.approx.{f,d}`
+        # intrinsic — no libdevice, and no `@fastmath` so caller-side NaN/Inf
+        # checks aren't DCE'd by `nnan`/`ninf` propagation. f16 computes in
+        # f32, so it still hits the f32 instruction.
         for (T, s) in ((Float32, "f32"), (Float64, "f64"))
             @test @filecheck CUDA.code_ptx(Tuple{T}) do x
                 @check "rsqrt.approx.$s"