Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CUDACore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ ChainRulesCore = "1"
EnzymeCore = "0.8.2"
ExprTools = "0.1"
GPUArrays = "11.5.4"
GPUCompiler = "1.12"
GPUCompiler = "1.13.1"
GPUToolbox = "1.1"
KernelAbstractions = "0.9.38"
LLVM = "9.6"
Expand Down
95 changes: 27 additions & 68 deletions CUDACore/src/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# math functionality

# we only use libdevice where needed. if possible, we go through LLVM instead,
# ideally relying on Julia's existing definitions.

@public fma, rsqrt, saturate, byte_perm, assume
@public add_rn, add_rz, add_rm, add_rp
@public sub_rn, sub_rz, sub_rm, sub_rp
Expand Down Expand Up @@ -286,46 +289,20 @@ end

## floating-point handling

@device_override Base.isfinite(x::Float32) = (ccall("extern __nv_finitef", llvmcall, Int32, (Cfloat,), x)) != 0
@device_override Base.isfinite(x::Float64) = (ccall("extern __nv_isfinited", llvmcall, Int32, (Cdouble,), x)) != 0

@device_override Base.isinf(x::Float64) = (ccall("extern __nv_isinfd", llvmcall, Int32, (Cdouble,), x)) != 0
@device_override Base.isinf(x::Float32) = (ccall("extern __nv_isinff", llvmcall, Int32, (Cfloat,), x)) != 0

@device_override Base.isnan(x::Float64) = (ccall("extern __nv_isnand", llvmcall, Int32, (Cdouble,), x)) != 0
@device_override Base.isnan(x::Float32) = (ccall("extern __nv_isnanf", llvmcall, Int32, (Cfloat,), x)) != 0
# isnan(::Float16) inherits from Julia (x != x), which compiles to a single setp.neu.f16.

@device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x)
@device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x)

@device_function nextafter(x::Float64, y::Float64) = ccall("extern __nv_nextafter", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
@device_function nextafter(x::Float32, y::Float32) = ccall("extern __nv_nextafterf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)


## sign handling

@device_override Base.signbit(x::Float64) = (ccall("extern __nv_signbitd", llvmcall, Int32, (Cdouble,), x)) != 0
@device_override Base.signbit(x::Float32) = (ccall("extern __nv_signbitf", llvmcall, Int32, (Cfloat,), x)) != 0

@device_override Base.copysign(x::Float64, y::Float64) = ccall("extern __nv_copysign", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
@device_override Base.copysign(x::Float32, y::Float32) = ccall("extern __nv_copysignf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)

@device_override Base.abs(x::Int32) = ccall("extern __nv_abs", llvmcall, Int32, (Int32,), x)
@device_override Base.abs(f::Float64) = ccall("extern __nv_fabs", llvmcall, Cdouble, (Cdouble,), f)
@device_override Base.abs(f::Float32) = ccall("extern __nv_fabsf", llvmcall, Cfloat, (Cfloat,), f)
# abs(::Float16) inherits from Julia (abs_float intrinsic), lowering to and.b16.
@device_override Base.abs(x::Int64) = ccall("extern __nv_llabs", llvmcall, Int64, (Int64,), x)

## roots and powers

@device_override Base.sqrt(x::Float64) = ccall("extern __nv_sqrt", llvmcall, Cdouble, (Cdouble,), x)
@device_override Base.sqrt(x::Float32) = ccall("extern __nv_sqrtf", llvmcall, Cfloat, (Cfloat,), x)
# sqrt(::Float16) inherits from Julia (Float16(sqrt(Float32(x)))), routing through __nv_sqrtf.
@device_override FastMath.sqrt_fast(x::Union{Float32, Float64}) = sqrt(x)

@device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x)
@device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x)
# NVPTX has native `rsqrt.approx.{f32,f64}`; call the intrinsic directly. The
# obvious alternative, `@fastmath 1/sqrt(x)`, also lowers to `rsqrt.approx`
# (via `PTXRSqrtFastPass`), but is too aggressive wrt. fast-math behavior.
@device_function rsqrt(x::Float64) = ccall("llvm.nvvm.rsqrt.approx.d", llvmcall, Cdouble, (Cdouble,), x)
@device_function rsqrt(x::Float32) = ccall("llvm.nvvm.rsqrt.approx.f", llvmcall, Cfloat, (Cfloat,), x)
@device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x)))

@device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x)
Expand Down Expand Up @@ -395,15 +372,6 @@ end
#@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x)
#@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x)

@device_override Base.trunc(x::Float64) = ccall("extern __nv_trunc", llvmcall, Cdouble, (Cdouble,), x)
@device_override Base.trunc(x::Float32) = ccall("extern __nv_truncf", llvmcall, Cfloat, (Cfloat,), x)

@device_override Base.ceil(x::Float64) = ccall("extern __nv_ceil", llvmcall, Cdouble, (Cdouble,), x)
@device_override Base.ceil(x::Float32) = ccall("extern __nv_ceilf", llvmcall, Cfloat, (Cfloat,), x)

@device_override Base.floor(f::Float64) = ccall("extern __nv_floor", llvmcall, Cdouble, (Cdouble,), f)
@device_override Base.floor(f::Float32) = ccall("extern __nv_floorf", llvmcall, Cfloat, (Cfloat,), f)

#@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y)
#@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
#@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y))
Expand Down Expand Up @@ -508,27 +476,11 @@ end
@device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
@device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))

@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)

@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)

@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
@device_override function FastMath.inv_fast(x::Float64)
# Get the approximate reciprocal
# https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
# This instruction chops off last 32bits of mantissa and computes inverse
# while treating all subnormal numbers as 0.0
# If reciprocal would be subnormal, underflows to 0.0
# 32 least significant bits of the result are filled with 0s
inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)

# Approximate the missing 32bits of mantissa with a single cubic iteration
e = fma(inv_x, -x, 1.0)
e = fma(e, e, e)
inv_x = fma(e, inv_x, inv_x)
end
# `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only
# `Complex` has a method) and the catch-all fallback drops `afn`
@device_override FastMath.inv_fast(x::Union{Float16, Float32, Float64}) =
FastMath.div_fast(one(x), x)
Comment thread
maleadt marked this conversation as resolved.


## distributions

Expand All @@ -549,13 +501,20 @@ end
@device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
@device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)

@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)

@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
Comment thread
maleadt marked this conversation as resolved.
# `Base.fma(::Float16,...)` branches on `jl_have_fma`
@device_override Base.fma(x::Float16, y::Float16, z::Float16) =
ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)

# `Base.muladd(x, y, z) = fma(x, y, z)` is the natural choice on GPU: NVPTX
# always lowers `llvm.fmuladd.fXX` to `fma.rn`, and routing through
# `llvm.fmuladd` (rather than Julia's default `fmul contract + fadd contract`)
# keeps the fusion robust under vectorization (per JuliaGPU/CUDA.jl#3149).
@device_override Base.muladd(x::Float64, y::Float64, z::Float64) =
ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should really do this upstream as well. Our mulladd could technically leak in the way we emit it.

@device_override Base.muladd(x::Float32, y::Float32, z::Float32) =
ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
@device_override Base.muladd(x::Float16, y::Float16, z::Float16) =
ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)

# Directed rounding for binary arithmetic and fma. NVPTX exposes
# `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub`
Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
Expand Down
150 changes: 45 additions & 105 deletions test/core/codegen.jl
Original file line number Diff line number Diff line change
@@ -1,61 +1,59 @@
@testset "LLVM IR" begin

@testset "JuliaLang/julia#21121" begin
function foobar()
@test @filecheck CUDA.code_llvm(Tuple{}) do
@check_not "inttoptr"
weight_matrix = CuStaticSharedArray(Float32, (16, 16))
sync_threads()
weight_matrix[1, 16] *= 2
sync_threads()
end

ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}))
@test !occursin("inttoptr", ir)
end

@testset "CUDA.jl#553" begin
function kernel(ptr)
unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
return
@test @filecheck CUDA.code_llvm(Tuple{Ptr{Float32}}) do ptr
@check_not "@__nv_fmaf"
unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
return
end

ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{Ptr{Float32}}))
@test !occursin("@__nv_fmaf", ir)
end

@testset "fma uses LLVM intrinsic" begin
function fma_kernel(ptr)
unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
return
end

for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}}))
@test occursin("llvm.fma.$suffix", ir)
@test !occursin("__nv_fma", ir)
@test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
@check "llvm.fma.$suffix"
@check_not "__nv_fma"
unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
return
end
end
end

@testset "muladd uses LLVM intrinsic" begin
function muladd_kernel(ptr)
unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
return
end

# `Base.muladd` emits `fmul contract + fadd contract` upstream, which the
# backend usually fuses to `fma.rn`. On GPU the fusion is unreliable under
# vectorization (JuliaGPU/CUDA.jl#3149), so the override routes through
# `llvm.fmuladd.fXX` directly.
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}}))
@test occursin("llvm.fmuladd.$suffix", ir)
Comment thread
maleadt marked this conversation as resolved.
@test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
@check "llvm.fmuladd.$suffix"
unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
return
end
end
end

@testset "assume" begin
foo(i) = cld(42, i)
ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))
@test occursin("@gpu_report_exception", ir)

@test @filecheck CUDA.code_llvm(Tuple{Int}) do i
@check "@gpu_report_exception"
cld(42, i)
end

bar(i) = (CUDA.assume(i > 0); cld(42, i))
ir = sprint(io->CUDA.code_llvm(io, bar, Tuple{Int}))
@test !occursin("gpu_report_exception", ir)
@test @filecheck CUDA.code_llvm(Tuple{Int}) do i
@check_not "gpu_report_exception"
CUDA.assume(i > 0)
cld(42, i)
end
end

@testset "stripping invariant.load" begin
Expand Down Expand Up @@ -144,88 +142,30 @@ end
@testset "PTX" begin

@testset "always_inline" begin
function f_expensive(x)
Base.Cartesian.@nexprs 30 i -> x = sin(x)+i
end

function g(x)
f_expensive(x)
return
end
function h(x)
f_expensive(x)
return
# without `always_inline`, the helper survives as a separate `.func`;
# with it set, the helper is inlined and no `.func julia_f_expensive`
# declaration remains. The closure-form lambdas below recreate the
# `f_expensive` helper at each test site, so each parent has its own
# call edge to verify the kwarg sticks.
f_expensive(x) = (Base.Cartesian.@nexprs 30 i -> x = sin(x)+i; x)
for always_inline in (false, true)
@test @filecheck CUDA.code_ptx(Tuple{Float64}; always_inline) do x
@check cond=!always_inline "{{\\.func .*julia_f_expensive}}"
@check_not cond=always_inline "{{\\.func .*julia_f_expensive}}"
f_expensive(x)
return
end
end

asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}))
@test occursin(r"\.func .*julia_f_expensive", asm)

asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}; always_inline=true))
@test !occursin(r"\.func .*julia_f_expensive", asm)

asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}; always_inline=true))
@test !occursin(r"\.func .*julia_f_expensive", asm)

asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}))
@test occursin(r"\.func .*julia_f_expensive", asm)
end

@testset "local memory stores due to byval" begin
# JuliaGPU/GPUCompiler.jl#92
function kernel(y1, y2)
@test @filecheck CUDA.code_ptx(NTuple{2,CuDeviceArray{Float32,1,AS.Global}}) do y1, y2
@check_not ".local"
y = threadIdx().x == 1 ? y1 : y2
@inbounds y[] = 0
return
end

asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
@test !occursin(".local", asm)
end

@testset "fastmath" begin
function div_kernel(x)
i = threadIdx().x
@fastmath @inbounds x[i] = 1 / x[i]
return
end

asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
@test occursin("div.approx.ftz", asm)

function sqrt_kernel(x)
i = threadIdx().x
@inbounds x[i] = sqrt(x[i])
return
end

asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
@test occursin("sqrt.r", asm)

asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
@test occursin("sqrt.approx.ftz", asm)
end

@testset "fma/muladd emit fma.rn" begin
# fma and muladd should both lower to fma.rn in PTX
function fma_kernel(a, b, c)
@inbounds a[] = fma(b[], c[], a[])
return
end
function muladd_kernel(a, b, c)
@inbounds a[] = muladd(b[], c[], a[])
return
end

for T in (Float16, Float32, Float64)
asm = sprint(io->CUDA.code_ptx(io, fma_kernel,
NTuple{3,CuDeviceArray{T,1,AS.Global}}))
@test occursin("fma.rn", asm)
@test !occursin("__nv_fma", asm)

asm = sprint(io->CUDA.code_ptx(io, muladd_kernel,
NTuple{3,CuDeviceArray{T,1,AS.Global}}))
@test occursin("fma.rn", asm)
end
end

@testset "header rewrite (.target/.version bump)" begin
Expand Down
22 changes: 9 additions & 13 deletions test/core/device/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,34 +68,30 @@ end

@testset "bounds checking" begin
@testset "#313" begin
function kernel(dest)
kernel = dest -> (dest[1] = 1; nothing)
tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
@test @filecheck CUDA.code_llvm(tt) do dest
@check_not "jl_invoke"
dest[1] = 1
nothing
end
tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
Tuple{UnitRange{Int64},UnitRange{Int64}},false}}

ir = sprint(io->CUDA.code_llvm(io, kernel, tt))
@test !occursin("jl_invoke", ir)
# also smoke-test that PTX codegen succeeds for this signature.
CUDA.code_ptx(devnull, kernel, tt)
end

# test that we don't do needless bounds checking when the kernel already does it
# (enabled by the fact that we store `len` next to `dims`)
let
function kernel(A)
for N in 1:3
@test @filecheck CUDA.code_llvm(Tuple{CuDeviceArray{Int,N,AS.Global}}) do A
@check_not "boundserror"
idx = threadIdx().x
if idx <= length(A)
# we did our own bounds checking, so no check should be left!
A[idx] = 1
end
return
end

for N in 1:3
ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{CuDeviceArray{Int,N,AS.Global}}))
@test !occursin("boundserror", ir)
end
end
end

Expand Down
Loading