diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml index 2c9992781e..820b3cf86e 100644 --- a/CUDACore/Project.toml +++ b/CUDACore/Project.toml @@ -53,7 +53,7 @@ ChainRulesCore = "1" EnzymeCore = "0.8.2" ExprTools = "0.1" GPUArrays = "11.5.4" -GPUCompiler = "1.12" +GPUCompiler = "1.13.1" GPUToolbox = "1.1" KernelAbstractions = "0.9.38" LLVM = "9.6" diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl index 95735015f2..16f9cc8738 100644 --- a/CUDACore/src/device/intrinsics/math.jl +++ b/CUDACore/src/device/intrinsics/math.jl @@ -1,5 +1,8 @@ # math functionality +# we only use libdevice where needed. if possible, we go through LLVM instead, +# ideally relying on Julia's existing definitions. + @public fma, rsqrt, saturate, byte_perm, assume @public add_rn, add_rz, add_rm, add_rp @public sub_rn, sub_rz, sub_rm, sub_rp @@ -286,16 +289,6 @@ end ## floating-point handling -@device_override Base.isfinite(x::Float32) = (ccall("extern __nv_finitef", llvmcall, Int32, (Cfloat,), x)) != 0 -@device_override Base.isfinite(x::Float64) = (ccall("extern __nv_isfinited", llvmcall, Int32, (Cdouble,), x)) != 0 - -@device_override Base.isinf(x::Float64) = (ccall("extern __nv_isinfd", llvmcall, Int32, (Cdouble,), x)) != 0 -@device_override Base.isinf(x::Float32) = (ccall("extern __nv_isinff", llvmcall, Int32, (Cfloat,), x)) != 0 - -@device_override Base.isnan(x::Float64) = (ccall("extern __nv_isnand", llvmcall, Int32, (Cdouble,), x)) != 0 -@device_override Base.isnan(x::Float32) = (ccall("extern __nv_isnanf", llvmcall, Int32, (Cfloat,), x)) != 0 -# isnan(::Float16) inherits from Julia (x != x), which compiles to a single setp.neu.f16. 
- @device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x) @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x) @@ -303,29 +296,13 @@ end @device_function nextafter(x::Float32, y::Float32) = ccall("extern __nv_nextafterf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -## sign handling - -@device_override Base.signbit(x::Float64) = (ccall("extern __nv_signbitd", llvmcall, Int32, (Cdouble,), x)) != 0 -@device_override Base.signbit(x::Float32) = (ccall("extern __nv_signbitf", llvmcall, Int32, (Cfloat,), x)) != 0 - -@device_override Base.copysign(x::Float64, y::Float64) = ccall("extern __nv_copysign", llvmcall, Cdouble, (Cdouble, Cdouble), x, y) -@device_override Base.copysign(x::Float32, y::Float32) = ccall("extern __nv_copysignf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) - -@device_override Base.abs(x::Int32) = ccall("extern __nv_abs", llvmcall, Int32, (Int32,), x) -@device_override Base.abs(f::Float64) = ccall("extern __nv_fabs", llvmcall, Cdouble, (Cdouble,), f) -@device_override Base.abs(f::Float32) = ccall("extern __nv_fabsf", llvmcall, Cfloat, (Cfloat,), f) -# abs(::Float16) inherits from Julia (abs_float intrinsic), lowering to and.b16. -@device_override Base.abs(x::Int64) = ccall("extern __nv_llabs", llvmcall, Int64, (Int64,), x) - ## roots and powers -@device_override Base.sqrt(x::Float64) = ccall("extern __nv_sqrt", llvmcall, Cdouble, (Cdouble,), x) -@device_override Base.sqrt(x::Float32) = ccall("extern __nv_sqrtf", llvmcall, Cfloat, (Cfloat,), x) -# sqrt(::Float16) inherits from Julia (Float16(sqrt(Float32(x)))), routing through __nv_sqrtf. 
-@device_override FastMath.sqrt_fast(x::Union{Float32, Float64}) = sqrt(x) - -@device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x) -@device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x) +# NVPTX has native `rsqrt.approx.{f32,f64}`; call the intrinsic directly. The +# obvious alternative, `@fastmath 1/sqrt(x)`, also lowers to `rsqrt.approx` +# (via `PTXRSqrtFastPass`), but is too aggressive wrt. fast-math behavior. +@device_function rsqrt(x::Float64) = ccall("llvm.nvvm.rsqrt.approx.d", llvmcall, Cdouble, (Cdouble,), x) +@device_function rsqrt(x::Float32) = ccall("llvm.nvvm.rsqrt.approx.f", llvmcall, Cfloat, (Cfloat,), x) @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x))) @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x) @@ -395,15 +372,6 @@ end #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x) #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x) -@device_override Base.trunc(x::Float64) = ccall("extern __nv_trunc", llvmcall, Cdouble, (Cdouble,), x) -@device_override Base.trunc(x::Float32) = ccall("extern __nv_truncf", llvmcall, Cfloat, (Cfloat,), x) - -@device_override Base.ceil(x::Float64) = ccall("extern __nv_ceil", llvmcall, Cdouble, (Cdouble,), x) -@device_override Base.ceil(x::Float32) = ccall("extern __nv_ceilf", llvmcall, Cfloat, (Cfloat,), x) - -@device_override Base.floor(f::Float64) = ccall("extern __nv_floor", llvmcall, Cdouble, (Cdouble,), f) -@device_override Base.floor(f::Float32) = ccall("extern __nv_floorf", llvmcall, Cfloat, (Cfloat,), f) - #@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y) #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y) #@device_override Base.min(x::UInt32, 
y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y)) @@ -508,27 +476,11 @@ end @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest)) -@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y) - -@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x) -@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x) - -@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x) -@device_override function FastMath.inv_fast(x::Float64) - # Get the approximate reciprocal - # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 - # This instruction chops off last 32bits of mantissa and computes inverse - # while treating all subnormal numbers as 0.0 - # If reciprocal would be subnormal, underflows to 0.0 - # 32 least significant bits of the result are filled with 0s - inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x) - - # Approximate the missing 32bits of mantissa with a single cubic iteration - e = fma(inv_x, -x, 1.0) - e = fma(e, e, e) - inv_x = fma(e, inv_x, inv_x) -end +# `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only +# `Complex` has a method) and the catch-all fallback drops `afn` +@device_override FastMath.inv_fast(x::Union{Float16, Float32, Float64}) = + FastMath.div_fast(one(x), x) + ## distributions @@ -549,13 +501,20 @@ end @device_override Base.hypot(x::Float64, y::Float64) = 
ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y) @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y) -@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z) -@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z) -@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) - -@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z) -@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z) -@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) +# `Base.fma(::Float16,...)` branches on `jl_have_fma` +@device_override Base.fma(x::Float16, y::Float16, z::Float16) = + ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) + +# `Base.muladd(x, y, z) = fma(x, y, z)` is the natural choice on GPU: NVPTX +# always lowers `llvm.fmuladd.fXX` to `fma.rn`, and routing through +# `llvm.fmuladd` (rather than Julia's default `fmul contract + fadd contract`) +# keeps the fusion robust under vectorization (per JuliaGPU/CUDA.jl#3149). 
+@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = + ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z) +@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = + ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z) +@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = + ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z) # Directed rounding for binary arithmetic and fma. NVPTX exposes # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub` diff --git a/test/Project.toml b/test/Project.toml index b37fcf9ed4..510d5844af 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -11,6 +11,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" diff --git a/test/core/codegen.jl b/test/core/codegen.jl index db431db4ed..1941f289cc 100644 --- a/test/core/codegen.jl +++ b/test/core/codegen.jl @@ -1,61 +1,59 @@ @testset "LLVM IR" begin @testset "JuliaLang/julia#21121" begin - function foobar() + @test @filecheck CUDA.code_llvm(Tuple{}) do + @check_not "inttoptr" weight_matrix = CuStaticSharedArray(Float32, (16, 16)) sync_threads() weight_matrix[1, 16] *= 2 sync_threads() end - - ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{})) - @test !occursin("inttoptr", ir) end @testset "CUDA.jl#553" begin - function kernel(ptr) - unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) - return + @test @filecheck CUDA.code_llvm(Tuple{Ptr{Float32}}) do ptr + @check_not "@__nv_fmaf" + unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), 
unsafe_load(ptr,2), unsafe_load(ptr,3))) + return end - - ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{Ptr{Float32}})) - @test !occursin("@__nv_fmaf", ir) end @testset "fma uses LLVM intrinsic" begin - function fma_kernel(ptr) - unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) - return - end - for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16")) - ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}})) - @test occursin("llvm.fma.$suffix", ir) - @test !occursin("__nv_fma", ir) + @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr + @check "llvm.fma.$suffix" + @check_not "__nv_fma" + unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) + return + end end end @testset "muladd uses LLVM intrinsic" begin - function muladd_kernel(ptr) - unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) - return - end - + # `Base.muladd` emits `fmul contract + fadd contract` upstream, which the + # backend usually fuses to `fma.rn`. On GPU the fusion is unreliable under + # vectorization (JuliaGPU/CUDA.jl#3149), so the override routes through + # `llvm.fmuladd.fXX` directly. 
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16")) - ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}})) - @test occursin("llvm.fmuladd.$suffix", ir) + @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr + @check "llvm.fmuladd.$suffix" + unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3))) + return + end end end @testset "assume" begin - foo(i) = cld(42, i) - ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int})) - @test occursin("@gpu_report_exception", ir) - + @test @filecheck CUDA.code_llvm(Tuple{Int}) do i + @check "@gpu_report_exception" + cld(42, i) + end - bar(i) = (CUDA.assume(i > 0); cld(42, i)) - ir = sprint(io->CUDA.code_llvm(io, bar, Tuple{Int})) - @test !occursin("gpu_report_exception", ir) + @test @filecheck CUDA.code_llvm(Tuple{Int}) do i + @check_not "gpu_report_exception" + CUDA.assume(i > 0) + cld(42, i) + end end @testset "stripping invariant.load" begin @@ -144,88 +142,30 @@ end @testset "PTX" begin @testset "always_inline" begin - function f_expensive(x) - Base.Cartesian.@nexprs 30 i -> x = sin(x)+i - end - - function g(x) - f_expensive(x) - return - end - function h(x) - f_expensive(x) - return + # without `always_inline`, the helper survives as a separate `.func`; + # with it set, the helper is inlined and no `.func julia_f_expensive` + # declaration remains. Each do-block closure in the loop below calls the + # single `f_expensive` helper defined here, so each parent has its own + # call edge to verify the kwarg sticks. 
+ f_expensive(x) = (Base.Cartesian.@nexprs 30 i -> x = sin(x)+i; x) + for always_inline in (false, true) + @test @filecheck CUDA.code_ptx(Tuple{Float64}; always_inline) do x + @check cond=!always_inline "{{\\.func .*julia_f_expensive}}" + @check_not cond=always_inline "{{\\.func .*julia_f_expensive}}" + f_expensive(x) + return + end end - - asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64})) - @test occursin(r"\.func .*julia_f_expensive", asm) - - asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}; always_inline=true)) - @test !occursin(r"\.func .*julia_f_expensive", asm) - - asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}; always_inline=true)) - @test !occursin(r"\.func .*julia_f_expensive", asm) - - asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64})) - @test occursin(r"\.func .*julia_f_expensive", asm) end @testset "local memory stores due to byval" begin # JuliaGPU/GPUCompiler.jl#92 - function kernel(y1, y2) + @test @filecheck CUDA.code_ptx(NTuple{2,CuDeviceArray{Float32,1,AS.Global}}) do y1, y2 + @check_not ".local" y = threadIdx().x == 1 ? 
y1 : y2 @inbounds y[] = 0 return end - - asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}})) - @test !occursin(".local", asm) -end - -@testset "fastmath" begin - function div_kernel(x) - i = threadIdx().x - @fastmath @inbounds x[i] = 1 / x[i] - return - end - - asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) - @test occursin("div.approx.ftz", asm) - - function sqrt_kernel(x) - i = threadIdx().x - @inbounds x[i] = sqrt(x[i]) - return - end - - asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}})) - @test occursin("sqrt.r", asm) - - asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true)) - @test occursin("sqrt.approx.ftz", asm) -end - -@testset "fma/muladd emit fma.rn" begin - # fma and muladd should both lower to fma.rn in PTX - function fma_kernel(a, b, c) - @inbounds a[] = fma(b[], c[], a[]) - return - end - function muladd_kernel(a, b, c) - @inbounds a[] = muladd(b[], c[], a[]) - return - end - - for T in (Float16, Float32, Float64) - asm = sprint(io->CUDA.code_ptx(io, fma_kernel, - NTuple{3,CuDeviceArray{T,1,AS.Global}})) - @test occursin("fma.rn", asm) - @test !occursin("__nv_fma", asm) - - asm = sprint(io->CUDA.code_ptx(io, muladd_kernel, - NTuple{3,CuDeviceArray{T,1,AS.Global}})) - @test occursin("fma.rn", asm) - end end @testset "header rewrite (.target/.version bump)" begin diff --git a/test/core/device/array.jl b/test/core/device/array.jl index 240fed288b..f8f5b7a142 100644 --- a/test/core/device/array.jl +++ b/test/core/device/array.jl @@ -68,22 +68,23 @@ end @testset "bounds checking" begin @testset "#313" begin - function kernel(dest) + kernel = dest -> (dest[1] = 1; nothing) + tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global}, + Tuple{UnitRange{Int64},UnitRange{Int64}},false}} + @test @filecheck CUDA.code_llvm(tt) do dest + @check_not "jl_invoke" dest[1] = 
1 nothing end - tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global}, - Tuple{UnitRange{Int64},UnitRange{Int64}},false}} - - ir = sprint(io->CUDA.code_llvm(io, kernel, tt)) - @test !occursin("jl_invoke", ir) + # also smoke-test that PTX codegen succeeds for this signature. CUDA.code_ptx(devnull, kernel, tt) end # test that we don't do needless bounds checking when the kernel already does it # (enabled by the fact that we store `len` next to `dims`) - let - function kernel(A) + for N in 1:3 + @test @filecheck CUDA.code_llvm(Tuple{CuDeviceArray{Int,N,AS.Global}}) do A + @check_not "boundserror" idx = threadIdx().x if idx <= length(A) # we did our own bounds checking, so no check should be left! @@ -91,11 +92,6 @@ end end return end - - for N in 1:3 - ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{CuDeviceArray{Int,N,AS.Global}})) - @test !occursin("boundserror", ir) - end end end diff --git a/test/core/device/intrinsics.jl b/test/core/device/intrinsics.jl index 2491e33423..504ef8c3b5 100644 --- a/test/core/device/intrinsics.jl +++ b/test/core/device/intrinsics.jl @@ -31,10 +31,10 @@ @on_device active_mask() @testset "range metadata" begin - foobar() = threadIdx().x - ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true)) - - @test occursin(r"call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range", ir) + @test @filecheck CUDA.code_llvm(Tuple{}; raw=true) do + @check "{{call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range}}" + threadIdx().x + end end end diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl index 1843f6213f..4ad292390c 100644 --- a/test/core/device/intrinsics/math.jl +++ b/test/core/device/intrinsics/math.jl @@ -338,72 +338,34 @@ using SpecialFunctions @testset "@fastmath sincos" begin # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos - function kernel(a, b, c) + @test @filecheck CUDA.code_ptx(NTuple{3,CuDeviceArray{Float32,1,AS.Global}}) do a, b, c + @check "sin.approx.f32" + @check 
"cos.approx.f32" + @check_not "__nv" # from libdevice @inbounds b[], c[] = @fastmath sincos(a[]) return end - asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}})) - @assert contains(asm, "sin.approx.f32") - @assert contains(asm, "cos.approx.f32") - @assert !contains(asm, "__nv") # from libdevice end @testset "inv" begin - # Base.inv should use accurate rcp instructions (rcp.rn) + # Base.inv should use accurate rcp instructions (rcp.rn). + # PTX-level patterns for inv / inv_fast / div / div_fast live in the + # "div/inv PTX" testset below; here we only sanity-check correctness on GPU. for T in (Float32, Float64) @test testf(x -> inv.(x), rand(T, 10) .+ T(0.1)) @test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]) end - - function kernel_inv_f32(a) - @inbounds a[] = inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}})) - @test contains(asm, "rcp.rn.f32") - - function kernel_inv_f64(a) - @inbounds a[] = inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}})) - @test contains(asm, "rcp.rn.f64") end @testset "inv_fast" begin - # inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa) - function kernel_inv_fast_f32(a) - @inbounds a[] = @fastmath inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}})) - @test contains(asm, "rcp.approx.ftz.f32") - fast_inv(x) = @fastmath inv(x) xs32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0] @test Array(map(fast_inv, cu(xs32))) ≈ inv.(xs32) rtol = 1.0f-4 - - # inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson - function kernel_inv_fast_f64(a) - @inbounds a[] = @fastmath inv(a[]) - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}})) - @test contains(asm, "rcp.approx.ftz.f64") - xs64 = 
Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0] @test Array(map(fast_inv, CuArray(xs64))) ≈ inv.(xs64) rtol = 1.0e-10 end @testset "div_fast Float64" begin - # FastMath.div_fast(Float64) uses fast reciprocal: x * inv_fast(y) - function kernel_div_fast_f64(a, b, c) - @inbounds c[] = @fastmath a[] / b[] - return - end - asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast_f64, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}})) - @test contains(asm, "rcp.approx.ftz.f64") - fast_div(x, y) = @fastmath x / y xs = rand(Float64, 10) .+ 0.1 ys = rand(Float64, 10) .+ 0.1 @@ -420,4 +382,246 @@ using SpecialFunctions @test isequal(maximum(AT([NaN])), NaN) end end + + # PTX lowering pins for the standard math ops. Most of these used to + # require `@device_override`s pointing at libdevice; now they're handled + # by Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!`, + # `PTXFDivFastPass`, and `PTXFSqrtFastPass`. Each testset pins the actual + # PTX so the wiring stays put across {f32, f64} × {plain, `@fastmath`} × + # {default, job-wide `fastmath=true`}. + + @testset "abs PTX" begin + for fastmath in (false, true) + # f32: job-wide fastmath flips to the `.ftz` variant. + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x + @check cond=fastmath "abs.ftz.f32" + @check cond=!fastmath "abs.f32" + @check_not "__nv_" + abs(x) + end + # f64: no FTZ on PTX for f64. 
+ @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x + @check "abs.f64" + @check_not "__nv_" + abs(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x + @check "abs.s32" + @check_not "__nv_" + abs(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x + @check "abs.s64" + @check_not "__nv_" + abs(x) + end + end + + @testset "floor/ceil/trunc PTX" begin + for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi")) + for fastmath in (false, true) + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x + @check cond=fastmath "cvt.$rnd.ftz.f32.f32" + @check cond=!fastmath "cvt.$rnd.f32.f32" + @check_not "__nv_" + op(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x + @check "cvt.$rnd.f64.f64" + @check_not "__nv_" + op(x) + end + end + end + end + + @testset "isnan/isinf/isfinite PTX" begin + # All three should be pure FP compares / bit-tests, no libdevice. + for T in (Float32, Float64), op in (isnan, isinf, isfinite) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check_not "__nv_" + op(x) + end + end + # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`. + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "setp.nan.f32" + isnan(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "setp.nan.f64" + isnan(x) + end + end + + @testset "signbit PTX" begin + for T in (Float32, Float64) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check_not "__nv_" + signbit(x) + end + end + end + + @testset "copysign PTX" begin + # NVPTX has no single copysign instruction (custom-lowered to bit ops); + # we just verify libdevice isn't on the path. + for T in (Float32, Float64) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check_not "__nv_" + copysign(x, y) + end + end + end + + @testset "min/max PTX" begin + # Plain `min`/`max` propagate NaN (Julia semantics). 
f32 with sm_80+ + # + LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to + # emulate since PTX has no `.NaN` variant for f64. Pin `arch=sm"80"` + # so the test is deterministic regardless of the CI runner's device. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y + @check "min.NaN.f32" + min(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y + @check "max.NaN.f32" + max(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y + @check_not "__nv_" + min(x, y) + end + + # `@fastmath min/max` = `ifelse(y > x, x, y)`, a plain compare + select. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "setp.lt.$s" + @check "selp.$s" + @fastmath min(x, y) + end + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "setp.lt.$s" + @check "selp.$s" + @fastmath max(x, y) + end + end + end + + @testset "fma/muladd PTX" begin + # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for + # f32/f64 by GPUCompiler; for f16 we keep an explicit override). + # `Base.muladd` is overridden to call `llvm.fmuladd.fXX`, which NVPTX + # always fuses. Either way: a single `fma.rn` per type. + for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z + @check "fma.rn.$s" + @check_not "__nv_fma" + fma(x, y, z) + end + @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z + @check "fma.rn.$s" + muladd(x, y, z) + end + end + end + + @testset "sqrt PTX" begin + # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`; + # per-call `@fastmath` → `sqrt.approx.fXX` (via `PTXFSqrtFastPass`); + # job-wide `fastmath=true` → the FTZ variant via `apply_fastmath!`. 
+ for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "sqrt.rn.$s" + @check_not "sqrt.approx" + sqrt(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "sqrt.approx.f32" + @check_not "sqrt.approx.ftz" + @fastmath sqrt(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x + @check "sqrt.approx.ftz.f32" + sqrt(x) + end + # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp. + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "rsqrt.approx.f64" + @fastmath sqrt(x) + end + end + + @testset "rsqrt PTX" begin + # `CUDA.rsqrt(x)` directly calls the NVPTX `rsqrt.approx.{f,d}` + # intrinsic — no libdevice, and no `@fastmath` so caller-side NaN/Inf + # checks aren't DCE'd by `nnan`/`ninf` propagation. f16 computes in + # f32, so it still hits the f32 instruction. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "rsqrt.approx.$s" + @check_not "sqrt.approx" + @check_not "__nv_" + CUDA.rsqrt(x) + end + end + @test @filecheck CUDA.code_ptx(Tuple{Float16}) do x + @check "rsqrt.approx.f32" + @check_not "__nv_" + CUDA.rsqrt(x) + end + end + + @testset "div/inv PTX" begin + # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's + # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches + # `fdiv 1.0, x` to `rcp.rn`. + for (T, s) in ((Float32, "f32"), (Float64, "f64")) + @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y + @check "div.rn.$s" + x / y + end + @test @filecheck CUDA.code_ptx(Tuple{T}) do x + @check "rcp.rn.$s" + inv(x) + end + end + + # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since + # the job isn't fast; f64 always uses rcp+Newton. 
+ @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y + @check "div.approx.f32" + @check_not "div.approx.ftz" + @fastmath x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x + @check "div.approx.f32" + @check_not "div.approx.ftz" + @fastmath inv(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y + @check "rcp.approx.ftz.f64" + @fastmath x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x + @check "rcp.approx.ftz.f64" + @fastmath inv(x) + end + + # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as + # `@fastmath`, and f32 additionally picks up FTZ. + @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y + @check "div.approx.ftz.f32" + x / y + end + @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x + @check "div.approx.ftz.f32" + inv(x) + end + @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y + @check "rcp.approx.ftz.f64" + x / y + end + end end diff --git a/test/core/device/intrinsics/wmma.jl b/test/core/device/intrinsics/wmma.jl index f4ef5fdf4c..3281c56157 100644 --- a/test/core/device/intrinsics/wmma.jl +++ b/test/core/device/intrinsics/wmma.jl @@ -478,36 +478,26 @@ end @testset "Codegen addressing" begin @testset "Global" begin - function kernel(d) + @test @filecheck CUDA.code_ptx((CuDeviceArray{Float32,1,CUDA.AS.Global},)) do d + @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32}}" + @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}" conf = WMMA.Config{16, 16, 16, Float32} - d_frag = WMMA.fill_c(Float32(0), conf) WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf) - return end - - ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},))) - - @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx) - @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx) end @testset "Shared" begin - function kernel() + @test 
@filecheck CUDA.code_ptx(()) do + @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32}}" + @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}" shmem = CuStaticSharedArray(Float32, (16, 16)) conf = WMMA.Config{16, 16, 16, Float32} - d_frag = WMMA.fill_c(Float32(0), conf) WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf) - return end - - ptx = sprint(io -> CUDA.code_ptx(io, kernel, ())) - - @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx) - @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx) end end diff --git a/test/core/device/ldg.jl b/test/core/device/ldg.jl index 0385e4e34a..b25f953449 100644 --- a/test/core/device/ldg.jl +++ b/test/core/device/ldg.jl @@ -1,15 +1,14 @@ @testset "ldg" begin - # NOTE: This is necessary because it seems that code_llvm has a bug which causes it to ignore - # the method table. Wrapping it in a function gets us what we want currently but the PR - # here: https://github.com/JuliaLang/julia/pull/60718 will likely fix this according to - # @vchuravy. It is currently not backported. - ir = sprint(io->CUDA.code_llvm(io, (args...)->CUDACore.pointerref_ldg(args...), Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true)) - if Base.libllvm_version >= v"20" - # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with - # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`. - @test occursin("!invariant.load", ir) - else - @test occursin("@llvm.nvvm.ldg", ir) + # NOTE: the wrapping function is necessary because code_llvm has a bug + # that causes it to ignore the method table; JuliaLang/julia#60718 will + # likely fix this according to @vchuravy. + # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with + # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`. + new_llvm = Base.libllvm_version >= v"20" + @test @filecheck CUDA.code_llvm(Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true) do args... 
+ @check cond=new_llvm "!invariant.load" + @check cond=!new_llvm "@llvm.nvvm.ldg" + CUDACore.pointerref_ldg(args...) end end diff --git a/test/core/execution.jl b/test/core/execution.jl index 387af3547e..4f8a8d3495 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -61,16 +61,22 @@ end # to constructing the SMVersion directly. @cuda launch=false arch=dev_cap dummy() # but we should be able to see it in the generated PTX code - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50")) - @test contains(asm, ".target sm_50") - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0")) - @test contains(asm, ".target sm_50") + @test @filecheck CUDA.code_ptx((); arch=sm"50") do + @check ".target sm_50" + dummy() + end + @test @filecheck CUDA.code_ptx((); arch=v"5.0") do + @check ".target sm_50" + dummy() + end # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the # `.version` line should match what was asked for, independently of what LLVM and # ptxas would natively pick. - asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3")) - @test contains(asm, ".version 6.3") + @test @filecheck CUDA.code_ptx((); ptx=v"6.3") do + @check ".version 6.3" + dummy() + end # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it # used to be); a clearly out-of-range value must error at config time. 
@@ -84,14 +90,18 @@ end sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family) if dev_cap >= v"9.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a)) - @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + @test @filecheck CUDA.code_ptx((); arch=sm_a) do + @check ".target $(CUDACore.cpu_name(sm_a))" + dummy() + end # arch-specific cubin should also actually launch on the matching device @cuda arch=sm_a dummy() end if dev_cap >= v"10.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f)) - @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))") + @test @filecheck CUDA.code_ptx((); arch=sm_f) do + @check ".target $(CUDACore.cpu_name(sm_f))" + dummy() + end @cuda arch=sm_f dummy() end @@ -103,8 +113,10 @@ end # since we know the exact device. The cuda-side `.target` is the variant regardless of # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs. if dev_cap >= v"9.0" - asm = sprint(io->CUDA.code_ptx(io, dummy, ())) - @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))") + @test @filecheck CUDA.code_ptx(()) do + @check ".target $(CUDACore.cpu_name(sm_a))" + dummy() + end end # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built diff --git a/test/helpers.jl b/test/helpers.jl index 2809a4b4a8..7ad4eb1a5e 100644 --- a/test/helpers.jl +++ b/test/helpers.jl @@ -6,6 +6,7 @@ using CUDA, CUDACore, GPUArrays using CUDA: i32 using Adapt +using FileCheck using ..Main: TestSuite, can_use_cupti, sanitize testf(@nospecialize(f), @nospecialize(xs...); kwargs...) = diff --git a/test/setup.jl b/test/setup.jl index dc28bd9dc9..2561cbe7a1 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,6 +1,7 @@ using Test using CUDA using CUDACore +using FileCheck using GPUArrays using NVML: has_nvml, NVML using ParallelTestRunner