JuliaGPU · maleadt · May 21, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.12"
+GPUCompiler = "1.13.1"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"

diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
@@ -1,5 +1,8 @@
 # math functionality
 
+# we only use libdevice where needed. if possible, we go through LLVM instead,
+# ideally relying on Julia's existing definitions.
+
 @public fma, rsqrt, saturate, byte_perm, assume
 @public add_rn, add_rz, add_rm, add_rp
 @public sub_rn, sub_rz, sub_rm, sub_rp
@@ -286,46 +289,20 @@ end
 
 ## floating-point handling
 
-@device_override Base.isfinite(x::Float32) = (ccall("extern __nv_finitef", llvmcall, Int32, (Cfloat,), x)) != 0
-@device_override Base.isfinite(x::Float64) = (ccall("extern __nv_isfinited", llvmcall, Int32, (Cdouble,), x)) != 0
-
-@device_override Base.isinf(x::Float64) = (ccall("extern __nv_isinfd", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.isinf(x::Float32) = (ccall("extern __nv_isinff", llvmcall, Int32, (Cfloat,), x)) != 0
-
-@device_override Base.isnan(x::Float64) = (ccall("extern __nv_isnand", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.isnan(x::Float32) = (ccall("extern __nv_isnanf", llvmcall, Int32, (Cfloat,), x)) != 0
-# isnan(::Float16) inherits from Julia (x != x), which compiles to a single setp.neu.f16.
-
 @device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x)
 @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x)
 
 @device_function nextafter(x::Float64, y::Float64) = ccall("extern __nv_nextafter", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 @device_function nextafter(x::Float32, y::Float32) = ccall("extern __nv_nextafterf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
 
-## sign handling
-
-@device_override Base.signbit(x::Float64) = (ccall("extern __nv_signbitd", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.signbit(x::Float32) = (ccall("extern __nv_signbitf", llvmcall, Int32, (Cfloat,), x)) != 0
-
-@device_override Base.copysign(x::Float64, y::Float64) = ccall("extern __nv_copysign", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-@device_override Base.copysign(x::Float32, y::Float32) = ccall("extern __nv_copysignf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-
-@device_override Base.abs(x::Int32) =   ccall("extern __nv_abs", llvmcall, Int32, (Int32,), x)
-@device_override Base.abs(f::Float64) = ccall("extern __nv_fabs", llvmcall, Cdouble, (Cdouble,), f)
-@device_override Base.abs(f::Float32) = ccall("extern __nv_fabsf", llvmcall, Cfloat, (Cfloat,), f)
-# abs(::Float16) inherits from Julia (abs_float intrinsic), lowering to and.b16.
-@device_override Base.abs(x::Int64) =   ccall("extern __nv_llabs", llvmcall, Int64, (Int64,), x)
-
 ## roots and powers
 
-@device_override Base.sqrt(x::Float64) = ccall("extern __nv_sqrt", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.sqrt(x::Float32) = ccall("extern __nv_sqrtf", llvmcall, Cfloat, (Cfloat,), x)
-# sqrt(::Float16) inherits from Julia (Float16(sqrt(Float32(x)))), routing through __nv_sqrtf.
-@device_override FastMath.sqrt_fast(x::Union{Float32, Float64}) = sqrt(x)
-
-@device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x)
-@device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x)
+# NVPTX has native `rsqrt.approx.{f32,f64}`; call the intrinsic directly. The
+# obvious alternative, `@fastmath 1/sqrt(x)`, also lowers to `rsqrt.approx`
+# (via `PTXRSqrtFastPass`), but is too aggressive wrt. fast-math behavior.
+@device_function rsqrt(x::Float64) = ccall("llvm.nvvm.rsqrt.approx.d", llvmcall, Cdouble, (Cdouble,), x)
+@device_function rsqrt(x::Float32) = ccall("llvm.nvvm.rsqrt.approx.f", llvmcall, Cfloat, (Cfloat,), x)
 @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x)))
 
 @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x)
@@ -395,15 +372,6 @@ end
 #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x)
 #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x)
 
-@device_override Base.trunc(x::Float64) = ccall("extern __nv_trunc", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.trunc(x::Float32) = ccall("extern __nv_truncf", llvmcall, Cfloat, (Cfloat,), x)
-
-@device_override Base.ceil(x::Float64) = ccall("extern __nv_ceil", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.ceil(x::Float32) = ccall("extern __nv_ceilf", llvmcall, Cfloat, (Cfloat,), x)
-
-@device_override Base.floor(f::Float64) = ccall("extern __nv_floor", llvmcall, Cdouble, (Cdouble,), f)
-@device_override Base.floor(f::Float32) = ccall("extern __nv_floorf", llvmcall, Cfloat, (Cfloat,), f)
-
 #@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y)
 #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
 #@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y))
@@ -508,27 +476,11 @@ end
 @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
 
-@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)
-
-@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
-@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)
-
-@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
-@device_override function FastMath.inv_fast(x::Float64)
-    # Get the approximate reciprocal
-    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
-    # This instruction chops off last 32bits of mantissa and computes inverse
-    # while treating all subnormal numbers as 0.0
-    # If reciprocal would be subnormal, underflows to 0.0
-    # 32 least significant bits of the result are filled with 0s
-    inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)
-
-    # Approximate the missing 32bits of mantissa with a single cubic iteration
-    e = fma(inv_x, -x, 1.0)
-    e = fma(e, e, e)
-    inv_x = fma(e, inv_x, inv_x)
-end
+# `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only
+# `Complex` has a method) and the catch-all fallback drops `afn`
+@device_override FastMath.inv_fast(x::Union{Float16, Float32, Float64}) =
+    FastMath.div_fast(one(x), x)
+
 
 ## distributions
 
@@ -549,13 +501,20 @@ end
 @device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
-@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
-@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
-@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
-
-@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
-@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
-@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
+# `Base.fma(::Float16,...)` branches on `jl_have_fma`
+@device_override Base.fma(x::Float16, y::Float16, z::Float16) =
+    ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
+
+# `Base.muladd(x, y, z) = fma(x, y, z)` is the natural choice on GPU: NVPTX
+# always lowers `llvm.fmuladd.fXX` to `fma.rn`, and routing through
+# `llvm.fmuladd` (rather than Julia's default `fmul contract + fadd contract`)
+# keeps the fusion robust under vectorization (per JuliaGPU/CUDA.jl#3149).
+@device_override Base.muladd(x::Float64, y::Float64, z::Float64) =
+    ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
+@device_override Base.muladd(x::Float32, y::Float32, z::Float32) =
+    ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
+@device_override Base.muladd(x::Float16, y::Float16, z::Float16) =
+    ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
 
 # Directed rounding for binary arithmetic and fma. NVPTX exposes
 # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub`

diff --git a/test/Project.toml b/test/Project.toml
@@ -11,6 +11,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

diff --git a/test/core/codegen.jl b/test/core/codegen.jl
@@ -1,61 +1,59 @@
 @testset "LLVM IR" begin
 
 @testset "JuliaLang/julia#21121" begin
-    function foobar()
+    @test @filecheck CUDA.code_llvm(Tuple{}) do
+        @check_not "inttoptr"
         weight_matrix = CuStaticSharedArray(Float32, (16, 16))
         sync_threads()
         weight_matrix[1, 16] *= 2
         sync_threads()
     end
-
-    ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}))
-    @test !occursin("inttoptr", ir)
 end
 
 @testset "CUDA.jl#553" begin
-    function kernel(ptr)
-       unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-       return
+    @test @filecheck CUDA.code_llvm(Tuple{Ptr{Float32}}) do ptr
+        @check_not "@__nv_fmaf"
+        unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+        return
     end
-
-    ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{Ptr{Float32}}))
-    @test !occursin("@__nv_fmaf", ir)
 end
 
 @testset "fma uses LLVM intrinsic" begin
-    function fma_kernel(ptr)
-        unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-        return
-    end
-
     for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
-        ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}}))
-        @test occursin("llvm.fma.$suffix", ir)
-        @test !occursin("__nv_fma", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
+            @check "llvm.fma.$suffix"
+            @check_not "__nv_fma"
+            unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+            return
+        end
     end
 end
 
 @testset "muladd uses LLVM intrinsic" begin
-    function muladd_kernel(ptr)
-        unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-        return
-    end
-
+    # `Base.muladd` emits `fmul contract + fadd contract` upstream, which the
+    # backend usually fuses to `fma.rn`. On GPU the fusion is unreliable under
+    # vectorization (JuliaGPU/CUDA.jl#3149), so the override routes through
+    # `llvm.fmuladd.fXX` directly.
     for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
-        ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}}))
-        @test occursin("llvm.fmuladd.$suffix", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
+            @check "llvm.fmuladd.$suffix"
+            unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+            return
+        end
     end
 end
 
 @testset "assume" begin
-    foo(i) = cld(42, i)
-    ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))
-    @test occursin("@gpu_report_exception", ir)
-
+    @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
+        @check "@gpu_report_exception"
+        cld(42, i)
+    end
 
-    bar(i) = (CUDA.assume(i > 0); cld(42, i))
-    ir = sprint(io->CUDA.code_llvm(io, bar, Tuple{Int}))
-    @test !occursin("gpu_report_exception", ir)
+    @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
+        @check_not "gpu_report_exception"
+        CUDA.assume(i > 0)
+        cld(42, i)
+    end
 end
 
 @testset "stripping invariant.load" begin
@@ -144,88 +142,30 @@ end
 @testset "PTX" begin
 
 @testset "always_inline" begin
-    function f_expensive(x)
-        Base.Cartesian.@nexprs 30 i -> x = sin(x)+i
-    end
-
-    function g(x)
-        f_expensive(x)
-        return
-    end
-    function h(x)
-        f_expensive(x)
-        return
+    # without `always_inline`, the helper survives as a separate `.func`;
+    # with it set, the helper is inlined and no `.func julia_f_expensive`
+    # declaration remains. The closure-form lambdas below recreate the
+    # `f_expensive` helper at each test site, so each parent has its own
+    # call edge to verify the kwarg sticks.
+    f_expensive(x) = (Base.Cartesian.@nexprs 30 i -> x = sin(x)+i; x)
+    for always_inline in (false, true)
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}; always_inline) do x
+            @check     cond=!always_inline "{{\\.func .*julia_f_expensive}}"
+            @check_not cond=always_inline  "{{\\.func .*julia_f_expensive}}"
+            f_expensive(x)
+            return
+        end
     end
-
-    asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}))
-    @test occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}; always_inline=true))
-    @test !occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}; always_inline=true))
-    @test !occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}))
-    @test occursin(r"\.func .*julia_f_expensive", asm)
 end
 
 @testset "local memory stores due to byval" begin
     # JuliaGPU/GPUCompiler.jl#92
-    function kernel(y1, y2)
+    @test @filecheck CUDA.code_ptx(NTuple{2,CuDeviceArray{Float32,1,AS.Global}}) do y1, y2
+        @check_not ".local"
         y = threadIdx().x == 1 ? y1 : y2
         @inbounds y[] = 0
         return
     end
-
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
-    @test !occursin(".local", asm)
-end
-
-@testset "fastmath" begin
-    function div_kernel(x)
-        i = threadIdx().x
-        @fastmath @inbounds x[i] = 1 / x[i]
-        return
-    end
-
-    asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("div.approx.ftz", asm)
-
-    function sqrt_kernel(x)
-        i = threadIdx().x
-        @inbounds x[i] = sqrt(x[i])
-        return
-    end
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
-    @test occursin("sqrt.r", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("sqrt.approx.ftz", asm)
-end
-
-@testset "fma/muladd emit fma.rn" begin
-    # fma and muladd should both lower to fma.rn in PTX
-    function fma_kernel(a, b, c)
-        @inbounds a[] = fma(b[], c[], a[])
-        return
-    end
-    function muladd_kernel(a, b, c)
-        @inbounds a[] = muladd(b[], c[], a[])
-        return
-    end
-
-    for T in (Float16, Float32, Float64)
-        asm = sprint(io->CUDA.code_ptx(io, fma_kernel,
-            NTuple{3,CuDeviceArray{T,1,AS.Global}}))
-        @test occursin("fma.rn", asm)
-        @test !occursin("__nv_fma", asm)
-
-        asm = sprint(io->CUDA.code_ptx(io, muladd_kernel,
-            NTuple{3,CuDeviceArray{T,1,AS.Global}}))
-        @test occursin("fma.rn", asm)
-    end
 end
 
 @testset "header rewrite (.target/.version bump)" begin

diff --git a/test/core/device/array.jl b/test/core/device/array.jl
@@ -68,34 +68,30 @@ end
 
 @testset "bounds checking" begin
     @testset "#313" begin
-        function kernel(dest)
+        kernel = dest -> (dest[1] = 1; nothing)
+        tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
+                            Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
+        @test @filecheck CUDA.code_llvm(tt) do dest
+            @check_not "jl_invoke"
             dest[1] = 1
             nothing
         end
-        tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
-                            Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
-
-        ir = sprint(io->CUDA.code_llvm(io, kernel, tt))
-        @test !occursin("jl_invoke", ir)
+        # also smoke-test that PTX codegen succeeds for this signature.
         CUDA.code_ptx(devnull, kernel, tt)
     end
 
     # test that we don't do needless bounds checking when the kernel already does it
     # (enabled by the fact that we store `len` next to `dims`)
-    let
-        function kernel(A)
+    for N in 1:3
+        @test @filecheck CUDA.code_llvm(Tuple{CuDeviceArray{Int,N,AS.Global}}) do A
+            @check_not "boundserror"
             idx = threadIdx().x
             if idx <= length(A)
                 # we did our own bounds checking, so no check should be left!
                 A[idx] = 1
             end
             return
         end
-
-        for N in 1:3
-            ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{CuDeviceArray{Int,N,AS.Global}}))
-            @test !occursin("boundserror", ir)
-        end
     end
 end