From 052750721da546dab69e022f17b453bfad48c2d9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 13:14:38 +0200 Subject: [PATCH 01/10] Drop fdiv/sqrt/inv device overrides handled by GPUCompiler. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPUCompiler's `PTXFDivFastPass` handles `afn`-flagged fdiv (covering `@fastmath` per-call and the `fastmath=true` job kwarg), and NVPTX already pattern-matches plain `fdiv 1.0, x` to `rcp.rn`. The only remaining override is `FastMath.inv_fast(::AbstractFloat)`, which Julia upstream doesn't implement for floats — route through `div_fast` so the pass sees `afn`. Co-Authored-By: Claude Opus 4.7 (1M context) --- CUDACore/Project.toml | 2 +- CUDACore/src/device/intrinsics/math.jl | 37 +++++++++----------------- test/core/codegen.jl | 37 ++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 2c9992781e..174a776d0b 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.12" +GPUCompiler = "1.13" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index 95735015f2..bd6d3dd54b 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -319,10 +319,9 @@ end ## roots and powers -@device_override Base.sqrt(x::Float64) = ccall("extern __nv_sqrt", llvmcall, Cdouble, (Cdouble,), x) -@device_override Base.sqrt(x::Float32) = ccall("extern __nv_sqrtf", llvmcall, Cfloat, (Cfloat,), x) -# sqrt(::Float16) inherits from Julia (Float16(sqrt(Float32(x)))), routing through __nv_sqrtf. 
-@device_override FastMath.sqrt_fast(x::Union{Float32, Float64}) = sqrt(x) +# `Base.sqrt` inherits from Julia (`llvm.sqrt.f{32,64}`); routing through +# libdevice's `__nv_sqrtf` would force `sqrt.approx.*` unconditionally, since +# LLVM's NVVMReflectPass folds `__CUDA_PREC_SQRT` to 0. @device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x) @device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x) @@ -508,27 +507,15 @@ end @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest)) -@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y) - -@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x) -@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x) - -@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x) -@device_override function FastMath.inv_fast(x::Float64) - # Get the approximate reciprocal - # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 - # This instruction chops off last 32bits of mantissa and computes inverse - # while treating all subnormal numbers as 0.0 - # If reciprocal would be subnormal, underflows to 0.0 - # 32 least significant bits of the result are filled with 0s - inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x) - - # Approximate the missing 32bits of mantissa with a single cubic iteration - e = fma(inv_x, -x, 1.0) - e = fma(e, e, e) - inv_x = 
fma(e, inv_x, inv_x) -end +# `Base.{/, inv}` and `Base.FastMath.div_fast` inherit from Julia: +# GPUCompiler's `PTXFDivFastPass` handles the `afn`-flagged fdiv these emit, +# and NVPTX pattern-matches plain `fdiv 1.0, x` to `rcp.rn`. +# +# `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only +# `Complex` has a method) and the catch-all fallback drops `afn`; route it +# through `div_fast` so the pass sees the flag. +@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = + FastMath.div_fast(one(x), x) ## distributions diff --git a/test/core/codegen.jl b/test/core/codegen.jl index db431db4ed..ea6b1b6c47 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -203,6 +203,43 @@ end asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) @test occursin("sqrt.approx.ftz", asm) + + # Pin down PTX for `/` and `inv` across {f32, f64} × {plain, @fastmath} + # × {default, fastmath=true job}, since `Base.{/, inv}` and their fast + # variants are now handled entirely by GPUCompiler's `PTXFDivFastPass`. + fdiv(x, y) = x / y + finv(x) = inv(x) + fdiv_fast(x, y) = @fastmath x / y + finv_fast(x) = @fastmath inv(x) + + # plain: precise lowering (inv via NVPTX matching `fdiv 1.0, x` to rcp.rn). + for T in (Float32, Float64) + suffix = T === Float32 ? "f32" : "f64" + @test occursin("div.rn.$suffix", sprint(io->CUDA.code_ptx(io, fdiv, Tuple{T,T}))) + @test occursin("rcp.rn.$suffix", sprint(io->CUDA.code_ptx(io, finv, Tuple{T,}))) + end + + # @fastmath: per-call `afn` reaches the pass. f64 → rcp + Newton (no + # native fast f64 fdiv); f32 stays non-FTZ since the job isn't fast. 
+ asm = sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float32,Float32})) + @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm) + asm = sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float32,})) + @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm) + @test occursin("rcp.approx.ftz.f64", + sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float64,Float64}))) + @test occursin("rcp.approx.ftz.f64", + sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float64,}))) + + # fastmath=true job: `apply_fastmath!` stamps `afn` on plain ops; f32 + # additionally picks up FTZ. + @test occursin("div.approx.ftz.f32", + sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float32,Float32}; fastmath=true))) + @test occursin("div.approx.ftz.f32", + sprint(io->CUDA.code_ptx(io, finv, Tuple{Float32,}; fastmath=true))) + @test occursin("rcp.approx.ftz.f64", + sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float64,Float64}; fastmath=true))) + @test occursin("rcp.approx.ftz.f64", + sprint(io->CUDA.code_ptx(io, finv, Tuple{Float64,}; fastmath=true))) end @testset "fma/muladd emit fma.rn" begin From 5f69f39fb11be9708c1ae09551ab396c61de6311 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 13:36:05 +0200 Subject: [PATCH 02/10] Drop more device overrides handled by Julia + NVPTX directly. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit isfinite/isinf/isnan, signbit/copysign/abs, trunc/ceil/floor, fma, and muladd inherit from Julia. Julia emits canonical LLVM ops (`llvm.fabs`, `llvm.floor`, `llvm.copysign`, `llvm.fma`, `fmul contract + fadd contract`, etc.), and the NVPTX backend lowers them to the same single-instruction PTX the libdevice overrides used to produce after inlining. `Base.fma(::Float16,...)` is the lone exception — its `jl_have_fma` runtime call isn't recognized by GPUCompiler's `cpu_features!`, so the branch survives the optimizer. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CUDACore/src/device/intrinsics/math.jl | 56 ++++++++++---------------- test/core/codegen.jl | 14 ++----- 2 files changed, 24 insertions(+), 46 deletions(-) diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index bd6d3dd54b..6621355d8b 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -286,15 +286,11 @@ end ## floating-point handling -@device_override Base.isfinite(x::Float32) = (ccall("extern __nv_finitef", llvmcall, Int32, (Cfloat,), x)) != 0 -@device_override Base.isfinite(x::Float64) = (ccall("extern __nv_isfinited", llvmcall, Int32, (Cdouble,), x)) != 0 - -@device_override Base.isinf(x::Float64) = (ccall("extern __nv_isinfd", llvmcall, Int32, (Cdouble,), x)) != 0 -@device_override Base.isinf(x::Float32) = (ccall("extern __nv_isinff", llvmcall, Int32, (Cfloat,), x)) != 0 - -@device_override Base.isnan(x::Float64) = (ccall("extern __nv_isnand", llvmcall, Int32, (Cdouble,), x)) != 0 -@device_override Base.isnan(x::Float32) = (ccall("extern __nv_isnanf", llvmcall, Int32, (Cfloat,), x)) != 0 -# isnan(::Float16) inherits from Julia (x != x), which compiles to a single setp.neu.f16. +# isfinite/isinf/isnan inherit from Julia, which uses pure FP comparisons and +# integer bit tricks: e.g. `isnan(x) = x != x` compiles to `setp.neu.f{16,32,64}`, +# `isinf(x) = abs(x) == Inf` to `abs.f32 + setp.eq.f32`. The libdevice +# `__nv_is{inf,nan,finite}*` wrappers do the same in their bodies, so the +# overrides produced equivalent PTX after inlining. 
@device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x) @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x) @@ -305,17 +301,10 @@ end ## sign handling -@device_override Base.signbit(x::Float64) = (ccall("extern __nv_signbitd", llvmcall, Int32, (Cdouble,), x)) != 0 -@device_override Base.signbit(x::Float32) = (ccall("extern __nv_signbitf", llvmcall, Int32, (Cfloat,), x)) != 0 - -@device_override Base.copysign(x::Float64, y::Float64) = ccall("extern __nv_copysign", llvmcall, Cdouble, (Cdouble, Cdouble), x, y) -@device_override Base.copysign(x::Float32, y::Float32) = ccall("extern __nv_copysignf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) - -@device_override Base.abs(x::Int32) = ccall("extern __nv_abs", llvmcall, Int32, (Int32,), x) -@device_override Base.abs(f::Float64) = ccall("extern __nv_fabs", llvmcall, Cdouble, (Cdouble,), f) -@device_override Base.abs(f::Float32) = ccall("extern __nv_fabsf", llvmcall, Cfloat, (Cfloat,), f) -# abs(::Float16) inherits from Julia (abs_float intrinsic), lowering to and.b16. -@device_override Base.abs(x::Int64) = ccall("extern __nv_llabs", llvmcall, Int64, (Int64,), x) +# signbit/copysign/abs inherit from Julia. Julia emits canonical LLVM ops: +# `signbit` is a sign-bit extract; `copysign` is `llvm.copysign.f{32,64}` +# (NVPTX has custom lowering to bit-twiddle); `abs` is `llvm.fabs.f{32,64}` +# (NVPTX legal → `abs.f{32,64}`) for floats, or two-complement for ints. 
## roots and powers @@ -394,14 +383,8 @@ end #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x) #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x) -@device_override Base.trunc(x::Float64) = ccall("extern __nv_trunc", llvmcall, Cdouble, (Cdouble,), x) -@device_override Base.trunc(x::Float32) = ccall("extern __nv_truncf", llvmcall, Cfloat, (Cfloat,), x) - -@device_override Base.ceil(x::Float64) = ccall("extern __nv_ceil", llvmcall, Cdouble, (Cdouble,), x) -@device_override Base.ceil(x::Float32) = ccall("extern __nv_ceilf", llvmcall, Cfloat, (Cfloat,), x) - -@device_override Base.floor(f::Float64) = ccall("extern __nv_floor", llvmcall, Cdouble, (Cdouble,), f) -@device_override Base.floor(f::Float32) = ccall("extern __nv_floorf", llvmcall, Cfloat, (Cfloat,), f) +# trunc/ceil/floor inherit from Julia (`llvm.{trunc,ceil,floor}.f{32,64}`), +# which NVPTX lowers natively to `cvt.r{zi,pi,mi}.f{32,64}.f{32,64}`. 
#@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y) #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y) @@ -536,13 +519,16 @@ end @device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y) @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z) -@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z) -@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) - -@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z) -@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z) -@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) +# fma/muladd inherit from Julia. `Base.fma(::Float{32,64})` branches on +# `julia.cpu.have_fma.f{32,64}`, which GPUCompiler folds to `true` on PTX, +# leaving `llvm.fma`. `Base.muladd` emits `fmul contract + fadd contract`, +# fused by the backend. Both lower to `fma.rn.f{32,64}` on NVPTX. +# +# `Base.fma(::Float16,...)` instead branches on a runtime `jl_have_fma` call +# that GPUCompiler can't fold (see GPUCompiler's `cpu_features!`), so we +# keep an explicit override for that one type. 
+@device_override Base.fma(x::Float16, y::Float16, z::Float16) = + ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) # Directed rounding for binary arithmetic and fma. NVPTX exposes # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub` diff --git a/test/core/codegen.jl b/test/core/codegen.jl index ea6b1b6c47..6694539a26 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -35,17 +35,9 @@ end end end -@testset "muladd uses LLVM intrinsic" begin - function muladd_kernel(ptr) - unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) - return - end - - for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16")) - ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}})) - @test occursin("llvm.fmuladd.$suffix", ir) - end -end +# muladd is no longer overridden — Julia emits `fmul contract + fadd contract`, +# which the backend fuses. The "fma/muladd emit fma.rn" PTX testset below +# verifies the actual end-to-end result on every supported FP type. @testset "assume" begin foo(i) = cld(42, i) From c9391fc9c873828e1a31a909e844ec6972a6de6f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 14:31:25 +0200 Subject: [PATCH 03/10] Test math intrinsics lower without libdevice. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin down PTX for the ops whose `@device_override`s were dropped — abs, floor/ceil/trunc, isnan/isinf/isfinite/signbit, copysign, min/max — across {f32, f64}, plain vs. `@fastmath` where it matters, and with job-wide `fastmath=true` (which also flips f32 ops to their `.ftz` variants via `apply_fastmath!`'s `denormal-fp-math-f32` attribute). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/core/codegen.jl | 77 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 6694539a26..562cff0b0d 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -257,6 +257,83 @@ end end end +@testset "math intrinsics lower without libdevice" begin + # Pin down PTX for the ops whose `@device_override`s were dropped, since + # they now rely on Julia emitting canonical LLVM IR and NVPTX lowering it. + # No libdevice (`__nv_*`) calls should remain in any of these. + + suffix(::Type{Float32}) = "f32" + suffix(::Type{Float64}) = "f64" + asm_of(f, T) = sprint(io->CUDA.code_ptx(io, f, T)) + asm_fast(f, T) = sprint(io->CUDA.code_ptx(io, f, T; fastmath=true)) + + # `abs` lowers to a single instruction for both floats and ints. + for T in (Float32, Float64) + asm = asm_of(x->abs(x), Tuple{T}) + @test occursin("abs.$(suffix(T))", asm) && !occursin("__nv_", asm) + end + for (T, s) in ((Int32, "s32"), (Int64, "s64")) + asm = asm_of(x->abs(x), Tuple{T}) + @test occursin("abs.$s", asm) && !occursin("__nv_", asm) + end + + # floor/ceil/trunc → `cvt.r{m,p,z}i.fXX.fXX`. + for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi")) + for T in (Float32, Float64) + s = suffix(T) + asm = asm_of(x->op(x), Tuple{T}) + @test occursin("cvt.$rnd.$s.$s", asm) && !occursin("__nv_", asm) + end + end + + # FP predicates: should be pure compare/bit-test, no libdevice. + for T in (Float32, Float64) + for op in (isnan, isinf, isfinite, signbit) + asm = asm_of(x->op(x), Tuple{T}) + @test !occursin("__nv_", asm) + end + end + # isnan is the cleanest: a single `setp.nan.fXX`. + @test occursin("setp.nan.f32", asm_of(x->isnan(x), Tuple{Float32})) + @test occursin("setp.nan.f64", asm_of(x->isnan(x), Tuple{Float64})) + + # copysign: no libdevice, no single instruction (bit-twiddle on NVPTX). 
+ for T in (Float32, Float64) + asm = asm_of((x, y) -> copysign(x, y), Tuple{T, T}) + @test !occursin("__nv_", asm) + end + + # Default `min`/`max` propagate NaN (Julia semantics). f32 on sm_80+ + + # LLVM 14+ gets the dedicated `min.NaN`/`max.NaN`; f64 emulates via PTX. + for (op, instr) in ((min, "min"), (max, "max")) + for T in (Float32, Float64) + asm = asm_of((x, y) -> op(x, y), Tuple{T, T}) + @test occursin("$instr.", asm) + end + end + @test occursin("min.NaN.f32", asm_of((x, y) -> min(x, y), Tuple{Float32, Float32})) + @test occursin("max.NaN.f32", asm_of((x, y) -> max(x, y), Tuple{Float32, Float32})) + + # `@fastmath min/max` drops NaN handling, becoming a plain compare+select. + for T in (Float32, Float64) + asm = asm_of((x, y) -> @fastmath(min(x, y)), Tuple{T, T}) + @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm) + asm = asm_of((x, y) -> @fastmath(max(x, y)), Tuple{T, T}) + @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm) + end + + # job-wide `fastmath=true` propagates `afn` via `apply_fastmath!`. The + # ops here don't use `afn` to pick variants, but `apply_fastmath!` also + # sets `denormal-fp-math-f32="preserve-sign"`, which NVPTX' `useF32FTZ` + # reads to pick FTZ variants for *every* f32 op (`abs.ftz.f32`, + # `cvt.r{m,p,z}i.ftz.*`, etc.). f64 stays unchanged since NVPTX has no + # FTZ behavior on f64. + @test occursin("abs.ftz.f32", asm_fast(x->abs(x), Tuple{Float32})) + @test occursin("abs.f64", asm_fast(x->abs(x), Tuple{Float64})) + @test occursin("cvt.rmi.ftz.f32.f32", asm_fast(x->floor(x), Tuple{Float32})) + @test occursin("cvt.rmi.f64.f64", asm_fast(x->floor(x), Tuple{Float64})) +end + @testset "header rewrite (.target/.version bump)" begin # When LLVM's NVPTX backend can't reach the device cap (e.g. 
Julia 1.12 + # LLVM 18 on a Blackwell device), `_compiler_config` produces a split From 30a9b7e769c955a37a16bdf1ce34af53b7e6a462 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 14:35:52 +0200 Subject: [PATCH 04/10] Simplify. --- CUDACore/src/device/intrinsics/math.jl | 37 +++----------------------- test/core/codegen.jl | 4 --- 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index 6621355d8b..d52160d7c8 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -286,12 +286,6 @@ end ## floating-point handling -# isfinite/isinf/isnan inherit from Julia, which uses pure FP comparisons and -# integer bit tricks: e.g. `isnan(x) = x != x` compiles to `setp.neu.f{16,32,64}`, -# `isinf(x) = abs(x) == Inf` to `abs.f32 + setp.eq.f32`. The libdevice -# `__nv_is{inf,nan,finite}*` wrappers do the same in their bodies, so the -# overrides produced equivalent PTX after inlining. - @device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x) @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x) @@ -299,19 +293,8 @@ end @device_function nextafter(x::Float32, y::Float32) = ccall("extern __nv_nextafterf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -## sign handling - -# signbit/copysign/abs inherit from Julia. Julia emits canonical LLVM ops: -# `signbit` is a sign-bit extract; `copysign` is `llvm.copysign.f{32,64}` -# (NVPTX has custom lowering to bit-twiddle); `abs` is `llvm.fabs.f{32,64}` -# (NVPTX legal → `abs.f{32,64}`) for floats, or two-complement for ints. - ## roots and powers -# `Base.sqrt` inherits from Julia (`llvm.sqrt.f{32,64}`); routing through -# libdevice's `__nv_sqrtf` would force `sqrt.approx.*` unconditionally, since -# LLVM's NVVMReflectPass folds `__CUDA_PREC_SQRT` to 0. 
- @device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x) @device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x) @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x))) @@ -383,9 +366,6 @@ end #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x) #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x) -# trunc/ceil/floor inherit from Julia (`llvm.{trunc,ceil,floor}.f{32,64}`), -# which NVPTX lowers natively to `cvt.r{zi,pi,mi}.f{32,64}.f{32,64}`. - #@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y) #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y) #@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y)) @@ -490,16 +470,12 @@ end @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest)) -# `Base.{/, inv}` and `Base.FastMath.div_fast` inherit from Julia: -# GPUCompiler's `PTXFDivFastPass` handles the `afn`-flagged fdiv these emit, -# and NVPTX pattern-matches plain `fdiv 1.0, x` to `rcp.rn`. -# # `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only -# `Complex` has a method) and the catch-all fallback drops `afn`; route it -# through `div_fast` so the pass sees the flag. +# `Complex` has a method) and the catch-all fallback drops `afn` @device_override FastMath.inv_fast(x::Union{Float32, Float64}) = FastMath.div_fast(one(x), x) + ## distributions # TODO: override StatsFun.jl? 
@@ -519,14 +495,7 @@ end @device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y) @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -# fma/muladd inherit from Julia. `Base.fma(::Float{32,64})` branches on -# `julia.cpu.have_fma.f{32,64}`, which GPUCompiler folds to `true` on PTX, -# leaving `llvm.fma`. `Base.muladd` emits `fmul contract + fadd contract`, -# fused by the backend. Both lower to `fma.rn.f{32,64}` on NVPTX. -# -# `Base.fma(::Float16,...)` instead branches on a runtime `jl_have_fma` call -# that GPUCompiler can't fold (see GPUCompiler's `cpu_features!`), so we -# keep an explicit override for that one type. +# `Base.fma(::Float16,...)` branches on `jl_have_fma` @device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 562cff0b0d..bc9b92f6a9 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -35,10 +35,6 @@ end end end -# muladd is no longer overridden — Julia emits `fmul contract + fadd contract`, -# which the backend fuses. The "fma/muladd emit fma.rn" PTX testset below -# verifies the actual end-to-end result on every supported FP type. - @testset "assume" begin foo(i) = cld(42, i) ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int})) From f51889672801e9dbf3c900bbc99987a25fda331e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 15:20:50 +0200 Subject: [PATCH 05/10] Improve tests. 
--- CUDACore/Project.toml | 2 +- CUDACore/src/device/intrinsics/math.jl | 3 + test/Project.toml | 1 + test/core/codegen.jl | 162 +----------------- test/core/math.jl | 226 +++++++++++++++++++++++++ test/setup.jl | 1 + 6 files changed, 235 insertions(+), 160 deletions(-) create mode 100644 test/core/math.jl diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 174a776d0b..820b3cf86e 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.13" +GPUCompiler = "1.13.1" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index d52160d7c8..6ffe352425 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -1,5 +1,8 @@ # math functionality +# we only use libdevice where needed. if possible, we go through LLVM instead, +# ideally relying on Julia's existing definitions. 
+ @public fma, rsqrt, saturate, byte_perm, assume @public add_rn, add_rz, add_rm, add_rp @public sub_rn, sub_rz, sub_rm, sub_rp diff --git a/test/Project.toml b/test/Project.toml index b37fcf9ed4..510d5844af 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -11,6 +11,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" diff --git a/test/core/codegen.jl b/test/core/codegen.jl index bc9b92f6a9..b0a95cd982 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -170,165 +170,9 @@ end @test !occursin(".local", asm) end -@testset "fastmath" begin - function div_kernel(x) - i = threadIdx().x - @fastmath @inbounds x[i] = 1 / x[i] - return - end - - asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) - @test occursin("div.approx.ftz", asm) - - function sqrt_kernel(x) - i = threadIdx().x - @inbounds x[i] = sqrt(x[i]) - return - end - - asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}})) - @test occursin("sqrt.r", asm) - - asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) - @test occursin("sqrt.approx.ftz", asm) - - # Pin down PTX for `/` and `inv` across {f32, f64} × {plain, @fastmath} - # × {default, fastmath=true job}, since `Base.{/, inv}` and their fast - # variants are now handled entirely by GPUCompiler's `PTXFDivFastPass`. - fdiv(x, y) = x / y - finv(x) = inv(x) - fdiv_fast(x, y) = @fastmath x / y - finv_fast(x) = @fastmath inv(x) - - # plain: precise lowering (inv via NVPTX matching `fdiv 1.0, x` to rcp.rn). 
- for T in (Float32, Float64) - suffix = T === Float32 ? "f32" : "f64" - @test occursin("div.rn.$suffix", sprint(io->CUDA.code_ptx(io, fdiv, Tuple{T,T}))) - @test occursin("rcp.rn.$suffix", sprint(io->CUDA.code_ptx(io, finv, Tuple{T,}))) - end - - # @fastmath: per-call `afn` reaches the pass. f64 → rcp + Newton (no - # native fast f64 fdiv); f32 stays non-FTZ since the job isn't fast. - asm = sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float32,Float32})) - @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm) - asm = sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float32,})) - @test occursin("div.approx.f32", asm) && !occursin("div.approx.ftz", asm) - @test occursin("rcp.approx.ftz.f64", - sprint(io->CUDA.code_ptx(io, fdiv_fast, Tuple{Float64,Float64}))) - @test occursin("rcp.approx.ftz.f64", - sprint(io->CUDA.code_ptx(io, finv_fast, Tuple{Float64,}))) - - # fastmath=true job: `apply_fastmath!` stamps `afn` on plain ops; f32 - # additionally picks up FTZ. - @test occursin("div.approx.ftz.f32", - sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float32,Float32}; fastmath=true))) - @test occursin("div.approx.ftz.f32", - sprint(io->CUDA.code_ptx(io, finv, Tuple{Float32,}; fastmath=true))) - @test occursin("rcp.approx.ftz.f64", - sprint(io->CUDA.code_ptx(io, fdiv, Tuple{Float64,Float64}; fastmath=true))) - @test occursin("rcp.approx.ftz.f64", - sprint(io->CUDA.code_ptx(io, finv, Tuple{Float64,}; fastmath=true))) -end - -@testset "fma/muladd emit fma.rn" begin - # fma and muladd should both lower to fma.rn in PTX - function fma_kernel(a, b, c) - @inbounds a[] = fma(b[], c[], a[]) - return - end - function muladd_kernel(a, b, c) - @inbounds a[] = muladd(b[], c[], a[]) - return - end - - for T in (Float16, Float32, Float64) - asm = sprint(io->CUDA.code_ptx(io, fma_kernel, - NTuple{3,CuDeviceArray{T,1,AS.Global}})) - @test occursin("fma.rn", asm) - @test !occursin("__nv_fma", asm) - - asm = sprint(io->CUDA.code_ptx(io, muladd_kernel, - 
NTuple{3,CuDeviceArray{T,1,AS.Global}})) - @test occursin("fma.rn", asm) - end -end - -@testset "math intrinsics lower without libdevice" begin - # Pin down PTX for the ops whose `@device_override`s were dropped, since - # they now rely on Julia emitting canonical LLVM IR and NVPTX lowering it. - # No libdevice (`__nv_*`) calls should remain in any of these. - - suffix(::Type{Float32}) = "f32" - suffix(::Type{Float64}) = "f64" - asm_of(f, T) = sprint(io->CUDA.code_ptx(io, f, T)) - asm_fast(f, T) = sprint(io->CUDA.code_ptx(io, f, T; fastmath=true)) - - # `abs` lowers to a single instruction for both floats and ints. - for T in (Float32, Float64) - asm = asm_of(x->abs(x), Tuple{T}) - @test occursin("abs.$(suffix(T))", asm) && !occursin("__nv_", asm) - end - for (T, s) in ((Int32, "s32"), (Int64, "s64")) - asm = asm_of(x->abs(x), Tuple{T}) - @test occursin("abs.$s", asm) && !occursin("__nv_", asm) - end - - # floor/ceil/trunc → `cvt.r{m,p,z}i.fXX.fXX`. - for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi")) - for T in (Float32, Float64) - s = suffix(T) - asm = asm_of(x->op(x), Tuple{T}) - @test occursin("cvt.$rnd.$s.$s", asm) && !occursin("__nv_", asm) - end - end - - # FP predicates: should be pure compare/bit-test, no libdevice. - for T in (Float32, Float64) - for op in (isnan, isinf, isfinite, signbit) - asm = asm_of(x->op(x), Tuple{T}) - @test !occursin("__nv_", asm) - end - end - # isnan is the cleanest: a single `setp.nan.fXX`. - @test occursin("setp.nan.f32", asm_of(x->isnan(x), Tuple{Float32})) - @test occursin("setp.nan.f64", asm_of(x->isnan(x), Tuple{Float64})) - - # copysign: no libdevice, no single instruction (bit-twiddle on NVPTX). - for T in (Float32, Float64) - asm = asm_of((x, y) -> copysign(x, y), Tuple{T, T}) - @test !occursin("__nv_", asm) - end - - # Default `min`/`max` propagate NaN (Julia semantics). f32 on sm_80+ + - # LLVM 14+ gets the dedicated `min.NaN`/`max.NaN`; f64 emulates via PTX. 
- for (op, instr) in ((min, "min"), (max, "max")) - for T in (Float32, Float64) - asm = asm_of((x, y) -> op(x, y), Tuple{T, T}) - @test occursin("$instr.", asm) - end - end - @test occursin("min.NaN.f32", asm_of((x, y) -> min(x, y), Tuple{Float32, Float32})) - @test occursin("max.NaN.f32", asm_of((x, y) -> max(x, y), Tuple{Float32, Float32})) - - # `@fastmath min/max` drops NaN handling, becoming a plain compare+select. - for T in (Float32, Float64) - asm = asm_of((x, y) -> @fastmath(min(x, y)), Tuple{T, T}) - @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm) - asm = asm_of((x, y) -> @fastmath(max(x, y)), Tuple{T, T}) - @test occursin("setp.lt.$(suffix(T))", asm) && occursin("selp.$(suffix(T))", asm) - end - - # job-wide `fastmath=true` propagates `afn` via `apply_fastmath!`. The - # ops here don't use `afn` to pick variants, but `apply_fastmath!` also - # sets `denormal-fp-math-f32="preserve-sign"`, which NVPTX' `useF32FTZ` - # reads to pick FTZ variants for *every* f32 op (`abs.ftz.f32`, - # `cvt.r{m,p,z}i.ftz.*`, etc.). f64 stays unchanged since NVPTX has no - # FTZ behavior on f64. - @test occursin("abs.ftz.f32", asm_fast(x->abs(x), Tuple{Float32})) - @test occursin("abs.f64", asm_fast(x->abs(x), Tuple{Float64})) - @test occursin("cvt.rmi.ftz.f32.f32", asm_fast(x->floor(x), Tuple{Float32})) - @test occursin("cvt.rmi.f64.f64", asm_fast(x->floor(x), Tuple{Float64})) -end +# PTX-level checks for math intrinsics (abs/floor/ceil/trunc/isnan/isinf/ +# isfinite/signbit/copysign/min/max/fma/muladd/sqrt/div/inv, both plain and +# `@fastmath`) live in `test/core/math.jl` with one testset per op family. @testset "header rewrite (.target/.version bump)" begin # When LLVM's NVPTX backend can't reach the device cap (e.g. 
Julia 1.12 + diff --git a/test/core/math.jl b/test/core/math.jl new file mode 100644 index 0000000000..a070eb4d38 --- /dev/null +++ b/test/core/math.jl @@ -0,0 +1,226 @@ +# Verify PTX lowering of math intrinsics across {f32, f64} × {plain, +# `@fastmath`} × {default, job-wide `fastmath=true`}. Most of these used to +# require `@device_override`s pointing at libdevice; now they're handled by +# Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!` and +# `PTXFDivFastPass`. Each testset pins down the actual PTX so the wiring +# stays put. + +@testset "math" begin + +@testset "abs" begin + for fastmath in (false, true) + # f32: job-wide fastmath flips to the `.ftz` variant. + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x + @check cond=fastmath "abs.ftz.f32" + @check cond=!fastmath "abs.f32" + @check_not "__nv_" + abs(x) + end + # f64: no FTZ on PTX for f64. + @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x + @check "abs.f64" + @check_not "__nv_" + abs(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x + @check "abs.s32" + @check_not "__nv_" + abs(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x + @check "abs.s64" + @check_not "__nv_" + abs(x) + end +end + +@testset "floor/ceil/trunc" begin + for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi")) + for fastmath in (false, true) + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x + @check cond=fastmath "cvt.$rnd.ftz.f32.f32" + @check cond=!fastmath "cvt.$rnd.f32.f32" + @check_not "__nv_" + op(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x + @check "cvt.$rnd.f64.f64" + @check_not "__nv_" + op(x) + end + end + end +end + +@testset "isnan/isinf/isfinite" begin + # All three should be pure FP compares / bit-tests, no libdevice. 
+ for T in (Float32, Float64), op in (isnan, isinf, isfinite) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check_not "__nv_" + op(x) + end + end + + # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`. + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "setp.nan.f32" + isnan(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "setp.nan.f64" + isnan(x) + end +end + +@testset "signbit" begin + for T in (Float32, Float64) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check_not "__nv_" + signbit(x) + end + end +end + +@testset "copysign" begin + # NVPTX has no single copysign instruction (custom-lowered to bit ops); + # we just verify libdevice isn't on the path. + for T in (Float32, Float64) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check_not "__nv_" + copysign(x, y) + end + end +end + +@testset "min/max" begin + # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+ & + # LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to emulate + # since PTX has no `.NaN` variant for f64. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "min.NaN.f32" + min(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "max.NaN.f32" + max(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y + @check_not "__nv_" + min(x, y) + end + + # `@fastmath min/max` = `ifelse(y > x, x, y)`, a plain compare + select. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "setp.lt.$s" + @check "selp.$s" + @fastmath min(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "setp.lt.$s" + @check "selp.$s" + @fastmath max(x, y) + end + end +end + +@testset "fma/muladd" begin + # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for f32/f64 + # by GPUCompiler; for f16 we keep an explicit override). 
`Base.muladd` + # lowers to `fmul contract + fadd contract`, which the backend fuses. + # Either way: a single `fma.rn` per type. + for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z + @check "fma.rn.$s" + @check_not "__nv_fma" + fma(x, y, z) + end + @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z + @check "fma.rn.$s" + muladd(x, y, z) + end + end +end + +@testset "sqrt" begin + # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`; per-call + # `@fastmath` → `sqrt.approx.fXX`; job-wide `fastmath=true` → the FTZ + # variant via `apply_fastmath!`. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "sqrt.rn.$s" + @check_not "sqrt.approx" + sqrt(x) + end + end + + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "sqrt.approx.f32" + @check_not "sqrt.approx.ftz" + @fastmath sqrt(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x + @check "sqrt.approx.ftz.f32" + sqrt(x) + end + # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp. + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "rsqrt.approx.f64" + @fastmath sqrt(x) + end +end + +@testset "div/inv" begin + # `Base.{/, inv}` and their fast variants are now handled entirely by + # GPUCompiler's `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches + # `fdiv 1.0, x` to `rcp.rn`. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "div.rn.$s" + x / y + end + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "rcp.rn.$s" + inv(x) + end + end + + # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since the + # job isn't fast; f64 always uses rcp+Newton (no native fast f64 fdiv). 
+ @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "div.approx.f32" + @check_not "div.approx.ftz" + @fastmath x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "div.approx.f32" + @check_not "div.approx.ftz" + @fastmath inv(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y + @check "rcp.approx.ftz.f64" + @fastmath x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "rcp.approx.ftz.f64" + @fastmath inv(x) + end + + # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as @fastmath, + # and f32 additionally picks up FTZ. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y + @check "div.approx.ftz.f32" + x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x + @check "div.approx.ftz.f32" + inv(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y + @check "rcp.approx.ftz.f64" + x / y + end +end + +end diff --git a/test/setup.jl b/test/setup.jl index dc28bd9dc9..2561cbe7a1 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,6 +1,7 @@ using Test using CUDA using CUDACore +using FileCheck using GPUArrays using NVML: has_nvml, NVML using ParallelTestRunner From 7ab5b8c54bec753b85331c1c464f676aab3a2989 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 15:36:38 +0200 Subject: [PATCH 06/10] Use FileCheck more widely. 
--- test/core/codegen.jl | 92 ++++------ test/core/device/array.jl | 22 +-- test/core/device/intrinsics.jl | 8 +- test/core/device/intrinsics/math.jl | 273 +++++++++++++++++++++++----- test/core/device/intrinsics/wmma.jl | 22 +-- test/core/device/ldg.jl | 21 +-- test/core/execution.jl | 36 ++-- test/core/math.jl | 226 ----------------------- 8 files changed, 317 insertions(+), 383 deletions(-) delete mode 100644 test/core/math.jl diff --git a/test/core/codegen.jl b/test/core/codegen.jl index b0a95cd982..97d1771792 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -1,49 +1,45 @@ @testset "LLVM IR" begin @testset "JuliaLang/julia#21121" begin - function foobar() + @test @filecheck CUDA.code_llvm(Tuple{}) do + @check_not "inttoptr" weight_matrix = CuStaticSharedArray(Float32, (16, 16)) sync_threads() weight_matrix[1, 16] *= 2 sync_threads() end - - ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{})) - @test !occursin("inttoptr", ir) end @testset "CUDA.jl#553" begin - function kernel(ptr) - unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) - return + @test @filecheck CUDA.code_llvm(Tuple{Ptr{Float32}}) do ptr + @check_not "@__nv_fmaf" + unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) + return end - - ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{Ptr{Float32}})) - @test !occursin("@__nv_fmaf", ir) end @testset "fma uses LLVM intrinsic" begin - function fma_kernel(ptr) - unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) - return - end - for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16")) - ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}})) - @test occursin("llvm.fma.$suffix", ir) - @test !occursin("__nv_fma", ir) + @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr + @check "llvm.fma.$suffix" + @check_not "__nv_fma" + unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) + return + 
end end end @testset "assume" begin - foo(i) = cld(42, i) - ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int})) - @test occursin("@gpu_report_exception", ir) - + @test @filecheck CUDA.code_llvm(Tuple{Int}) do i + @check "@gpu_report_exception" + cld(42, i) + end - bar(i) = (CUDA.assume(i > 0); cld(42, i)) - ir = sprint(io->CUDA.code_llvm(io, bar, Tuple{Int})) - @test !occursin("gpu_report_exception", ir) + @test @filecheck CUDA.code_llvm(Tuple{Int}) do i + @check_not "gpu_report_exception" + CUDA.assume(i > 0) + cld(42, i) + end end @testset "stripping invariant.load" begin @@ -132,48 +128,32 @@ end @testset "PTX" begin @testset "always_inline" begin - function f_expensive(x) - Base.Cartesian.@nexprs 30 i -> x = sin(x)+i - end - - function g(x) - f_expensive(x) - return - end - function h(x) - f_expensive(x) - return + # without `always_inline`, the helper survives as a separate `.func`; + # with it set, the helper is inlined and no `.func julia_f_expensive` + # declaration remains. `f_expensive` is defined once; the `do`-block + # closure compiled in each iteration calls it, so each parent has its own + # call edge to verify the kwarg sticks.
+ f_expensive(x) = (Base.Cartesian.@nexprs 30 i -> x = sin(x)+i; x) + for always_inline in (false, true) + @test @filecheck CUDA.code_ptx(Tuple{Float64}; always_inline) do x + @check cond=!always_inline "{{\\.func .*julia_f_expensive}}" + @check_not cond=always_inline "{{\\.func .*julia_f_expensive}}" + f_expensive(x) + return + end end - - asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64})) - @test occursin(r"\.func .*julia_f_expensive", asm) - - asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}; always_inline=true)) - @test !occursin(r"\.func .*julia_f_expensive", asm) - - asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}; always_inline=true)) - @test !occursin(r"\.func .*julia_f_expensive", asm) - - asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64})) - @test occursin(r"\.func .*julia_f_expensive", asm) end @testset "local memory stores due to byval" begin # JuliaGPU/GPUCompiler.jl#92 - function kernel(y1, y2) + @test @filecheck CUDA.code_ptx(NTuple{2,CuDeviceArray{Float32,1,AS.Global}}) do y1, y2 + @check_not ".local" y = threadIdx().x == 1 ? y1 : y2 @inbounds y[] = 0 return end - - asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}})) - @test !occursin(".local", asm) end -# PTX-level checks for math intrinsics (abs/floor/ceil/trunc/isnan/isinf/ -# isfinite/signbit/copysign/min/max/fma/muladd/sqrt/div/inv, both plain and -# `@fastmath`) live in `test/core/math.jl` with one testset per op family. - @testset "header rewrite (.target/.version bump)" begin # When LLVM's NVPTX backend can't reach the device cap (e.g. 
Julia 1.12 + # LLVM 18 on a Blackwell device), `_compiler_config` produces a split diff --git a/test/core/device/array.jl b/test/core/device/array.jl index 240fed288b..f8f5b7a142 100644 --- a/test/core/device/array.jl +++ b/test/core/device/array.jl @@ -68,22 +68,23 @@ end @testset "bounds checking" begin @testset "#313" begin - function kernel(dest) + kernel = dest -> (dest[1] = 1; nothing) + tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global}, + Tuple{UnitRange{Int64},UnitRange{Int64}},false}} + @test @filecheck CUDA.code_llvm(tt) do dest + @check_not "jl_invoke" dest[1] = 1 nothing end - tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global}, - Tuple{UnitRange{Int64},UnitRange{Int64}},false}} - - ir = sprint(io->CUDA.code_llvm(io, kernel, tt)) - @test !occursin("jl_invoke", ir) + # also smoke-test that PTX codegen succeeds for this signature. CUDA.code_ptx(devnull, kernel, tt) end # test that we don't do needless bounds checking when the kernel already does it # (enabled by the fact that we store `len` next to `dims`) - let - function kernel(A) + for N in 1:3 + @test @filecheck CUDA.code_llvm(Tuple{CuDeviceArray{Int,N,AS.Global}}) do A + @check_not "boundserror" idx = threadIdx().x if idx <= length(A) # we did our own bounds checking, so no check should be left! 
@@ -91,11 +92,6 @@ end end return end - - for N in 1:3 - ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{CuDeviceArray{Int,N,AS.Global}})) - @test !occursin("boundserror", ir) - end end end diff --git a/test/core/device/intrinsics.jl b/test/core/device/intrinsics.jl index 2491e33423..504ef8c3b5 100644 --- a/test/core/device/intrinsics.jl +++ b/test/core/device/intrinsics.jl @@ -31,10 +31,10 @@ @on_device active_mask() @testset "range metadata" begin - foobar() = threadIdx().x - ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true)) - - @test occursin(r"call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range", ir) + @test @filecheck CUDA.code_llvm(Tuple{}; raw=true) do + @check "{{call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range}}" + threadIdx().x + end end end diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl index 1843f6213f..5eea318449 100644 --- a/test/core/device/intrinsics/math.jl +++ b/test/core/device/intrinsics/math.jl @@ -338,72 +338,34 @@ using SpecialFunctions @testset "@fastmath sincos" begin # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos - function kernel(a, b, c) + @test @filecheck CUDA.code_ptx(NTuple{3,CuDeviceArray{Float32,1,AS.Global}}) do a, b, c + @check "sin.approx.f32" + @check "cos.approx.f32" + @check_not "__nv" # from libdevice @inbounds b[], c[] = @fastmath sincos(a[]) return end - asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}})) - @assert contains(asm, "sin.approx.f32") - @assert contains(asm, "cos.approx.f32") - @assert !contains(asm, "__nv") # from libdevice end @testset "inv" begin - # Base.inv should use accurate rcp instructions (rcp.rn) + # Base.inv should use accurate rcp instructions (rcp.rn). + # PTX-level patterns for inv / inv_fast / div / div_fast live in the + # "div/inv PTX" testset below; here we only sanity-check correctness on GPU.
for T in (Float32, Float64) @test testf(x -> inv.(x), rand(T, 10) .+ T(0.1)) @test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]) end - - function kernel_inv_f32(a) - @inbounds a[] = inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}})) - @test contains(asm, "rcp.rn.f32") - - function kernel_inv_f64(a) - @inbounds a[] = inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}})) - @test contains(asm, "rcp.rn.f64") end @testset "inv_fast" begin - # inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa) - function kernel_inv_fast_f32(a) - @inbounds a[] = @fastmath inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}})) - @test contains(asm, "rcp.approx.ftz.f32") - fast_inv(x) = @fastmath inv(x) xs32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0] @test Array(map(fast_inv, cu(xs32))) ≈ inv.(xs32) rtol = 1.0f-4 - - # inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson - function kernel_inv_fast_f64(a) - @inbounds a[] = @fastmath inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}})) - @test contains(asm, "rcp.approx.ftz.f64") - xs64 = Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0] @test Array(map(fast_inv, CuArray(xs64))) ≈ inv.(xs64) rtol = 1.0e-10 end @testset "div_fast Float64" begin - # FastMath.div_fast(Float64) uses fast reciprocal: x * inv_fast(y) - function kernel_div_fast_f64(a, b, c) - @inbounds c[] = @fastmath a[] / b[] - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast_f64, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}})) - @test contains(asm, "rcp.approx.ftz.f64") - fast_div(x, y) = @fastmath x / y xs = rand(Float64, 10) .+ 0.1 ys = rand(Float64, 10) .+ 0.1 @@ -420,4 +382,225 @@ using SpecialFunctions @test 
isequal(maximum(AT([NaN])), NaN) end end + + # PTX lowering pins for the standard math ops. Most of these used to + # require `@device_override`s pointing at libdevice; now they're handled + # by Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!`, + # `PTXFDivFastPass`, and `PTXFSqrtFastPass`. Each testset pins the actual + # PTX so the wiring stays put across {f32, f64} × {plain, `@fastmath`} × + # {default, job-wide `fastmath=true`}. + + @testset "abs PTX" begin + for fastmath in (false, true) + # f32: job-wide fastmath flips to the `.ftz` variant. + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x + @check cond=fastmath "abs.ftz.f32" + @check cond=!fastmath "abs.f32" + @check_not "__nv_" + abs(x) + end + # f64: no FTZ on PTX for f64. + @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x + @check "abs.f64" + @check_not "__nv_" + abs(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x + @check "abs.s32" + @check_not "__nv_" + abs(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x + @check "abs.s64" + @check_not "__nv_" + abs(x) + end + end + + @testset "floor/ceil/trunc PTX" begin + for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi")) + for fastmath in (false, true) + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x + @check cond=fastmath "cvt.$rnd.ftz.f32.f32" + @check cond=!fastmath "cvt.$rnd.f32.f32" + @check_not "__nv_" + op(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x + @check "cvt.$rnd.f64.f64" + @check_not "__nv_" + op(x) + end + end + end + end + + @testset "isnan/isinf/isfinite PTX" begin + # All three should be pure FP compares / bit-tests, no libdevice. + for T in (Float32, Float64), op in (isnan, isinf, isfinite) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check_not "__nv_" + op(x) + end + end + # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`. 
+ @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "setp.nan.f32" + isnan(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "setp.nan.f64" + isnan(x) + end + end + + @testset "signbit PTX" begin + for T in (Float32, Float64) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check_not "__nv_" + signbit(x) + end + end + end + + @testset "copysign PTX" begin + # NVPTX has no single copysign instruction (custom-lowered to bit ops); + # we just verify libdevice isn't on the path. + for T in (Float32, Float64) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check_not "__nv_" + copysign(x, y) + end + end + end + + @testset "min/max PTX" begin + # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+ + # + LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to + # emulate since PTX has no `.NaN` variant for f64. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "min.NaN.f32" + min(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "max.NaN.f32" + max(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y + @check_not "__nv_" + min(x, y) + end + + # `@fastmath min/max` = `ifelse(y > x, x, y)`, a plain compare + select. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "setp.lt.$s" + @check "selp.$s" + @fastmath min(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "setp.lt.$s" + @check "selp.$s" + @fastmath max(x, y) + end + end + end + + @testset "fma/muladd PTX" begin + # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for + # f32/f64 by GPUCompiler; for f16 we keep an explicit override). + # `Base.muladd` lowers to `fmul contract + fadd contract`, which the + # backend fuses. Either way: a single `fma.rn` per type. 
+ for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z + @check "fma.rn.$s" + @check_not "__nv_fma" + fma(x, y, z) + end + @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z + @check "fma.rn.$s" + muladd(x, y, z) + end + end + end + + @testset "sqrt PTX" begin + # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`; + # per-call `@fastmath` → `sqrt.approx.fXX` (via `PTXFSqrtFastPass`); + # job-wide `fastmath=true` → the FTZ variant via `apply_fastmath!`. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "sqrt.rn.$s" + @check_not "sqrt.approx" + sqrt(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "sqrt.approx.f32" + @check_not "sqrt.approx.ftz" + @fastmath sqrt(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x + @check "sqrt.approx.ftz.f32" + sqrt(x) + end + # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp. + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "rsqrt.approx.f64" + @fastmath sqrt(x) + end + end + + @testset "div/inv PTX" begin + # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's + # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches + # `fdiv 1.0, x` to `rcp.rn`. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "div.rn.$s" + x / y + end + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "rcp.rn.$s" + inv(x) + end + end + + # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since + # the job isn't fast; f64 always uses rcp+Newton. 
+ @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "div.approx.f32" + @check_not "div.approx.ftz" + @fastmath x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "div.approx.f32" + @check_not "div.approx.ftz" + @fastmath inv(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y + @check "rcp.approx.ftz.f64" + @fastmath x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "rcp.approx.ftz.f64" + @fastmath inv(x) + end + + # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as + # `@fastmath`, and f32 additionally picks up FTZ. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y + @check "div.approx.ftz.f32" + x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x + @check "div.approx.ftz.f32" + inv(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y + @check "rcp.approx.ftz.f64" + x / y + end + end end diff --git a/test/core/device/intrinsics/wmma.jl b/test/core/device/intrinsics/wmma.jl index f4ef5fdf4c..3281c56157 100644 --- a/test/core/device/intrinsics/wmma.jl +++ b/test/core/device/intrinsics/wmma.jl @@ -478,36 +478,26 @@ end @testset "Codegen addressing" begin @testset "Global" begin - function kernel(d) + @test @filecheck CUDA.code_ptx((CuDeviceArray{Float32,1,CUDA.AS.Global},)) do d + @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32}}" + @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}" conf = WMMA.Config{16, 16, 16, Float32} - d_frag = WMMA.fill_c(Float32(0), conf) WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf) - return end - - ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},))) - - @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx) - @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx) end @testset "Shared" begin - function kernel() + @test 
@filecheck CUDA.code_ptx(()) do + @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32}}" + @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}" shmem = CuStaticSharedArray(Float32, (16, 16)) conf = WMMA.Config{16, 16, 16, Float32} - d_frag = WMMA.fill_c(Float32(0), conf) WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf) - return end - - ptx = sprint(io -> CUDA.code_ptx(io, kernel, ())) - - @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx) - @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx) end end diff --git a/test/core/device/ldg.jl b/test/core/device/ldg.jl index 0385e4e34a..b25f953449 100644 --- a/test/core/device/ldg.jl +++ b/test/core/device/ldg.jl @@ -1,15 +1,14 @@ @testset "ldg" begin - # NOTE: This is necessary because it seems that code_llvm has a bug which causes it to ignore - # the method table. Wrapping it in a function gets us what we want currently but the PR - # here: https://github.com/JuliaLang/julia/pull/60718 will likely fix this according to - # @vchuravy. It is currently not backported. - ir = sprint(io->CUDA.code_llvm(io, (args...)->CUDACore.pointerref_ldg(args...), Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true)) - if Base.libllvm_version >= v"20" - # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with - # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`. - @test occursin("!invariant.load", ir) - else - @test occursin("@llvm.nvvm.ldg", ir) + # NOTE: the wrapping function is necessary because code_llvm has a bug + # that causes it to ignore the method table; JuliaLang/julia#60718 will + # likely fix this according to @vchuravy. + # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with + # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`. + new_llvm = Base.libllvm_version >= v"20" + @test @filecheck CUDA.code_llvm(Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true) do args... 
+ @check cond=new_llvm "!invariant.load" + @check cond=!new_llvm "@llvm.nvvm.ldg" + CUDACore.pointerref_ldg(args...) end end diff --git a/test/core/execution.jl b/test/core/execution.jl index 387af3547e..4f8a8d3495 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -61,16 +61,22 @@ end # to constructing the SMVersion directly. @cuda launch=false arch=dev_cap dummy() # but we should be able to see it in the generated PTX code - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50")) - @test contains(asm, ".target sm_50") - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0")) - @test contains(asm, ".target sm_50") + @test @filecheck CUDA.code_ptx((); arch=sm"50") do + @check ".target sm_50" + dummy() + end + @test @filecheck CUDA.code_ptx((); arch=v"5.0") do + @check ".target sm_50" + dummy() + end # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the # `.version` line should match what was asked for, independently of what LLVM and # ptxas would natively pick. - asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3")) - @test contains(asm, ".version 6.3") + @test @filecheck CUDA.code_ptx((); ptx=v"6.3") do + @check ".version 6.3" + dummy() + end # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it # used to be); a clearly out-of-range value must error at config time. 
@@ -84,14 +90,18 @@ end sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) if dev_cap >= v"9.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a)) - @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + @test @filecheck CUDA.code_ptx((); arch=sm_a) do + @check ".target $(CUDACore.cpu_name(sm_a))" + dummy() + end # arch-specific cubin should also actually launch on the matching device @cuda arch=sm_a dummy() end if dev_cap >= v"10.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f)) - @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))") + @test @filecheck CUDA.code_ptx((); arch=sm_f) do + @check ".target $(CUDACore.cpu_name(sm_f))" + dummy() + end @cuda arch=sm_f dummy() end @@ -103,8 +113,10 @@ end # since we know the exact device. The cuda-side `.target` is the variant regardless of # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs. if dev_cap >= v"9.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, ())) - @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + @test @filecheck CUDA.code_ptx(()) do + @check ".target $(CUDACore.cpu_name(sm_a))" + dummy() + end end # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built diff --git a/test/core/math.jl b/test/core/math.jl deleted file mode 100644 index a070eb4d38..0000000000 --- a/test/core/math.jl +++ /dev/null @@ -1,226 +0,0 @@ -# Verify PTX lowering of math intrinsics across {f32, f64} × {plain, -# `@fastmath`} × {default, job-wide `fastmath=true`}. Most of these used to -# require `@device_override`s pointing at libdevice; now they're handled by -# Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!` and -# `PTXFDivFastPass`. Each testset pins down the actual PTX so the wiring -# stays put. - -@testset "math" begin - -@testset "abs" begin - for fastmath in (false, true) - # f32: job-wide fastmath flips to the `.ftz` variant. 
- @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x - @check cond=fastmath "abs.ftz.f32" - @check cond=!fastmath "abs.f32" - @check_not "__nv_" - abs(x) - end - # f64: no FTZ on PTX for f64. - @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x - @check "abs.f64" - @check_not "__nv_" - abs(x) - end - end - @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x - @check "abs.s32" - @check_not "__nv_" - abs(x) - end - @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x - @check "abs.s64" - @check_not "__nv_" - abs(x) - end -end - -@testset "floor/ceil/trunc" begin - for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi")) - for fastmath in (false, true) - @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x - @check cond=fastmath "cvt.$rnd.ftz.f32.f32" - @check cond=!fastmath "cvt.$rnd.f32.f32" - @check_not "__nv_" - op(x) - end - @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x - @check "cvt.$rnd.f64.f64" - @check_not "__nv_" - op(x) - end - end - end -end - -@testset "isnan/isinf/isfinite" begin - # All three should be pure FP compares / bit-tests, no libdevice. - for T in (Float32, Float64), op in (isnan, isinf, isfinite) - @test @filecheck CUDA.code_ptx(Tuple{T}) do x - @check_not "__nv_" - op(x) - end - end - - # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`. - @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x - @check "setp.nan.f32" - isnan(x) - end - @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x - @check "setp.nan.f64" - isnan(x) - end -end - -@testset "signbit" begin - for T in (Float32, Float64) - @test @filecheck CUDA.code_ptx(Tuple{T}) do x - @check_not "__nv_" - signbit(x) - end - end -end - -@testset "copysign" begin - # NVPTX has no single copysign instruction (custom-lowered to bit ops); - # we just verify libdevice isn't on the path. 
- for T in (Float32, Float64) - @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y - @check_not "__nv_" - copysign(x, y) - end - end -end - -@testset "min/max" begin - # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+ & - # LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to emulate - # since PTX has no `.NaN` variant for f64. - @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y - @check "min.NaN.f32" - min(x, y) - end - @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y - @check "max.NaN.f32" - max(x, y) - end - @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y - @check_not "__nv_" - min(x, y) - end - - # `@fastmath min/max` = `ifelse(y > x, x, y)`, a plain compare + select. - for (T, s) in ((Float32, "f32"), (Float64, "f64")) - @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y - @check "setp.lt.$s" - @check "selp.$s" - @fastmath min(x, y) - end - @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y - @check "setp.lt.$s" - @check "selp.$s" - @fastmath max(x, y) - end - end -end - -@testset "fma/muladd" begin - # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for f32/f64 - # by GPUCompiler; for f16 we keep an explicit override). `Base.muladd` - # lowers to `fmul contract + fadd contract`, which the backend fuses. - # Either way: a single `fma.rn` per type. - for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64")) - @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z - @check "fma.rn.$s" - @check_not "__nv_fma" - fma(x, y, z) - end - @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z - @check "fma.rn.$s" - muladd(x, y, z) - end - end -end - -@testset "sqrt" begin - # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`; per-call - # `@fastmath` → `sqrt.approx.fXX`; job-wide `fastmath=true` → the FTZ - # variant via `apply_fastmath!`. 
- for (T, s) in ((Float32, "f32"), (Float64, "f64")) - @test @filecheck CUDA.code_ptx(Tuple{T}) do x - @check "sqrt.rn.$s" - @check_not "sqrt.approx" - sqrt(x) - end - end - - @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x - @check "sqrt.approx.f32" - @check_not "sqrt.approx.ftz" - @fastmath sqrt(x) - end - @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x - @check "sqrt.approx.ftz.f32" - sqrt(x) - end - # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp. - @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x - @check "rsqrt.approx.f64" - @fastmath sqrt(x) - end -end - -@testset "div/inv" begin - # `Base.{/, inv}` and their fast variants are now handled entirely by - # GPUCompiler's `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches - # `fdiv 1.0, x` to `rcp.rn`. - for (T, s) in ((Float32, "f32"), (Float64, "f64")) - @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y - @check "div.rn.$s" - x / y - end - @test @filecheck CUDA.code_ptx(Tuple{T}) do x - @check "rcp.rn.$s" - inv(x) - end - end - - # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since the - # job isn't fast; f64 always uses rcp+Newton (no native fast f64 fdiv). - @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y - @check "div.approx.f32" - @check_not "div.approx.ftz" - @fastmath x / y - end - @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x - @check "div.approx.f32" - @check_not "div.approx.ftz" - @fastmath inv(x) - end - @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y - @check "rcp.approx.ftz.f64" - @fastmath x / y - end - @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x - @check "rcp.approx.ftz.f64" - @fastmath inv(x) - end - - # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as @fastmath, - # and f32 additionally picks up FTZ. 
- @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y - @check "div.approx.ftz.f32" - x / y - end - @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x - @check "div.approx.ftz.f32" - inv(x) - end - @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y - @check "rcp.approx.ftz.f64" - x / y - end -end - -end From bc47367138093ac218509123436007348d60e2b1 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 17:24:49 +0200 Subject: [PATCH 07/10] Fix CI. --- test/helpers.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/helpers.jl b/test/helpers.jl index 2809a4b4a8..7ad4eb1a5e 100644 --- a/test/helpers.jl +++ b/test/helpers.jl @@ -6,6 +6,7 @@ using CUDA, CUDACore, GPUArrays using CUDA: i32 using Adapt +using FileCheck using ..Main: TestSuite, can_use_cupti, sanitize testf(@nospecialize(f), @nospecialize(xs...); kwargs...) = From 0b0dc669ee02af7c40741d0d1b31ed5f78249708 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 May 2026 17:24:58 +0200 Subject: [PATCH 08/10] Address review comments. 
--- CUDACore/src/device/intrinsics/math.jl | 13 ++++++++++++- test/core/codegen.jl | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index 6ffe352425..d6140cb1ae 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -475,7 +475,7 @@ end # `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only # `Complex` has a method) and the catch-all fallback drops `afn` -@device_override FastMath.inv_fast(x::Union{Float32, Float64}) = +@device_override FastMath.inv_fast(x::Union{Float16, Float32, Float64}) = FastMath.div_fast(one(x), x) @@ -502,6 +502,17 @@ end @device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) +# `Base.muladd(x, y, z) = fma(x, y, z)` is the natural choice on GPU: NVPTX +# always lowers `llvm.fmuladd.fXX` to `fma.rn`, and routing through +# `llvm.fmuladd` (rather than Julia's default `fmul contract + fadd contract`) +# keeps the fusion robust under vectorization (per JuliaGPU/CUDA.jl#3149). +@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = + ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z) +@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = + ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z) +@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = + ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) + # Directed rounding for binary arithmetic and fma. 
NVPTX exposes # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub` # intrinsic, so subtraction reuses add(x, -y) (negation is bit-exact for IEEE diff --git a/test/core/codegen.jl b/test/core/codegen.jl index 97d1771792..1941f289cc 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -29,6 +29,20 @@ end end end +@testset "muladd uses LLVM intrinsic" begin + # `Base.muladd` emits `fmul contract + fadd contract` upstream, which the + # backend usually fuses to `fma.rn`. On GPU the fusion is unreliable under + # vectorization (JuliaGPU/CUDA.jl#3149), so the override routes through + # `llvm.fmuladd.fXX` directly. + for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16")) + @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr + @check "llvm.fmuladd.$suffix" + unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) + return + end + end +end + @testset "assume" begin @test @filecheck CUDA.code_llvm(Tuple{Int}) do i @check "@gpu_report_exception" From dffd57f14d7b88b9b03764d0f875049eedcd7a04 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 May 2026 09:07:52 +0200 Subject: [PATCH 09/10] Simplify rsqrt, fix min/max PTX test for sm<80. rsqrt now uses high-level `@fastmath 1/sqrt(x)`; GPUCompiler's new PTXRSqrtFastPass lowers it to `nvvm.rsqrt.approx.{f,d}` directly. Adds a PTX FileCheck test pinning the lowering. Pin `arch=sm"80"` on the min.NaN.f32 / max.NaN.f32 PTX checks so they pass on sm_75 CI runners. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CUDACore/Project.toml | 2 +- CUDACore/src/device/intrinsics/math.jl | 4 ++-- test/core/device/intrinsics/math.jl | 27 +++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 820b3cf86e..62d4e196e5 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.13.1" +GPUCompiler = "1.13.2" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index d6140cb1ae..bda26d9a42 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -298,8 +298,8 @@ end ## roots and powers -@device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x) -@device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x) +@device_function rsqrt(x::Float64) = @fastmath 1.0 / sqrt(x) +@device_function rsqrt(x::Float32) = @fastmath 1f0 / sqrt(x) @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x))) @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x) diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl index 5eea318449..13c5b4e5ae 100644 --- a/test/core/device/intrinsics/math.jl +++ b/test/core/device/intrinsics/math.jl @@ -478,12 +478,13 @@ using SpecialFunctions @testset "min/max PTX" begin # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+ # + LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to - # emulate since PTX has no `.NaN` variant for f64. - @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + # emulate since PTX has no `.NaN` variant for f64. 
Pin `arch=sm"80"` + # so the test is deterministic regardless of the CI runner's device. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y @check "min.NaN.f32" min(x, y) end - @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y @check "max.NaN.f32" max(x, y) end @@ -552,6 +553,26 @@ using SpecialFunctions end end + @testset "rsqrt PTX" begin + # `CUDA.rsqrt(x)` is `@fastmath 1/sqrt(x)`; GPUCompiler's + # `PTXRSqrtFastPass` folds the `afn 1/sqrt(x)` pattern to a single + # `nvvm.rsqrt.approx.{f,d}` call. f16 computes in f32, so it still + # hits the f32 instruction. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "rsqrt.approx.$s" + @check_not "sqrt.approx" + @check_not "__nv_" + CUDA.rsqrt(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Float16}) do x + @check "rsqrt.approx.f32" + @check_not "__nv_" + CUDA.rsqrt(x) + end + end + @testset "div/inv PTX" begin # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches From 986ca42b55f766b21fb4378e2468b138ac4058f9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 May 2026 09:49:27 +0200 Subject: [PATCH 10/10] Define rsqrt via direct NVPTX intrinsic, not @fastmath. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `@fastmath 1/sqrt(x)` form stamps `fast` (nnan/ninf/...) on the IR operations, which let LLVM DCE caller-side `isnan(rsqrt(x))` and `isinf(rsqrt(x))` checks before our PTXRSqrtFastPass folded the pattern — a behavior regression versus the libdevice path. 
Direct `ccall` to `llvm.nvvm.rsqrt.approx.{f,d}` is opaque to fast-math reasoning, matches what libdevice itself does (a thin wrapper around the same intrinsic), and produces strictly cleaner IR than libdevice (single rsqrt call + select rather than phi + duplicate call). Co-Authored-By: Claude Opus 4.7 (1M context) --- CUDACore/Project.toml | 2 +- CUDACore/src/device/intrinsics/math.jl | 7 +++++-- test/core/device/intrinsics/math.jl | 8 ++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 62d4e196e5..820b3cf86e 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.13.2" +GPUCompiler = "1.13.1" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index bda26d9a42..16f9cc8738 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -298,8 +298,11 @@ end ## roots and powers -@device_function rsqrt(x::Float64) = @fastmath 1.0 / sqrt(x) -@device_function rsqrt(x::Float32) = @fastmath 1f0 / sqrt(x) +# NVPTX has native `rsqrt.approx.{f32,f64}`; call the intrinsic directly. The +# obvious alternative, `@fastmath 1/sqrt(x)`, also lowers to `rsqrt.approx` +# (via `PTXRSqrtFastPass`), but is too aggressive wrt. fast-math behavior. 
+@device_function rsqrt(x::Float64) = ccall("llvm.nvvm.rsqrt.approx.d", llvmcall, Cdouble, (Cdouble,), x) +@device_function rsqrt(x::Float32) = ccall("llvm.nvvm.rsqrt.approx.f", llvmcall, Cfloat, (Cfloat,), x) @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x))) @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x) diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl index 13c5b4e5ae..4ad292390c 100644 --- a/test/core/device/intrinsics/math.jl +++ b/test/core/device/intrinsics/math.jl @@ -554,10 +554,10 @@ using SpecialFunctions end @testset "rsqrt PTX" begin - # `CUDA.rsqrt(x)` is `@fastmath 1/sqrt(x)`; GPUCompiler's - # `PTXRSqrtFastPass` folds the `afn 1/sqrt(x)` pattern to a single - # `nvvm.rsqrt.approx.{f,d}` call. f16 computes in f32, so it still - # hits the f32 instruction. + # `CUDA.rsqrt(x)` directly calls the NVPTX `rsqrt.approx.{f,d}` + # intrinsic — no libdevice, and no `@fastmath` so caller-side NaN/Inf + # checks aren't DCE'd by `nnan`/`ninf` propagation. f16 computes in + # f32, so it still hits the f32 instruction. for (T, s) in ((Float32, "f32"), (Float64, "f64")) @test @filecheck CUDA.code_ptx(Tuple{T}) do x @check "rsqrt.approx.$s"