Make Ahead-of-Time compilation feasible for packages using CUDA.jl#2998
Make Ahead-of-Time compilation feasible for packages using CUDA.jl #2998 — maleadt merged 6 commits into JuliaGPU:master from
Conversation
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## master #2998 +/- ##
=======================================
Coverage 90.42% 90.42%
=======================================
Files 141 141
Lines 11993 11993
=======================================
Hits 10845 10845
Misses 1148 1148 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index 0874adf77..1a6d64784 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -151,7 +151,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Int16, :UInt16)
end
intr = "atom$scope.cas.b16 \$0, [\$1], \$2, \$3;"
- @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
+ @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T, $A}, cmp::$T, val::$T) =
@asmcall($intr, "=h,l,h,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T,$T}, ptr, cmp, val)
end
@@ -172,7 +172,7 @@ for A in (AS.Generic, AS.Global, AS.Shared)
nb = sizeof(T)*8
fn = Symbol("atomic_$(op)!")
intr = "llvm.nvvm.atomic.load.$op.$nb.p$(convert(Int, A))i$nb"
- @eval @device_function @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
+ @eval @device_function @inline $fn(ptr::LLVMPtr{$T, $A}, val::$T) =
@typed_ccall($intr, llvmcall, $T, (LLVMPtr{$T,$A}, $T), ptr, val)
end
end
@@ -192,7 +192,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
end
intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
- @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
+ @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T, $A}, val::$T) =
@asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
end
diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index 0c07e3117..4bf553ce9 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -24,7 +24,7 @@ Noteworthy missing functionality:
module CG
using ..CUDA
-using ..CUDA: i32, Aligned, alignment, @device_function
+ using ..CUDA: i32, Aligned, alignment, @device_function
import ..LLVM
using ..LLVM.Interop
@@ -73,7 +73,7 @@ const grid_workspace = Ptr{grid_workspace_st}
end
end
-@device_function function get_grid_workspace()
+ @device_function function get_grid_workspace()
# interpret the address from envreg 1 and 2 as the driver's grid workspace
hi = ccall("llvm.nvvm.read.ptx.sreg.envreg1", llvmcall, UInt32, ())
lo = ccall("llvm.nvvm.read.ptx.sreg.envreg2", llvmcall, UInt32, ())
@@ -561,13 +561,13 @@ end
## pipeline operations
-@device_function pipeline_commit() =
- ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
+ @device_function pipeline_commit() =
+ ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
-@device_function pipeline_wait_prior(n) =
+ @device_function pipeline_wait_prior(n) =
ccall("llvm.nvvm.cp.async.wait.group", llvmcall, Cvoid, (Int32,), n)
-@device_function @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where T
+ @device_function @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where {T}
size_and_align = sizeof(T)
size_and_align in (4, 8, 16) || :(return error($"Unsupported size $size_and_align"))
intr = "llvm.nvvm.cp.async.ca.shared.global.$(sizeof(T))"
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 2b4922e74..e4fb488f8 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -49,7 +49,7 @@ for dim in (:x, :y, :z)
# Thread index in block
fn = Symbol("threadIdx_$dim")
intr = Symbol("tid.$dim")
- @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:max_block_size[dim]-1))) + 1i32
+ @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:(max_block_size[dim] - 1)))) + 1i32
# Block size (#threads per block)
fn = Symbol("blockDim_$dim")
@@ -59,7 +59,7 @@ for dim in (:x, :y, :z)
# Block index in grid
fn = Symbol("blockIdx_$dim")
intr = Symbol("ctaid.$dim")
- @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:max_grid_size[dim]-1))) + 1i32
+ @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:(max_grid_size[dim] - 1)))) + 1i32
# Grid size (#blocks per grid)
fn = Symbol("gridDim_$dim")
@@ -69,7 +69,7 @@ for dim in (:x, :y, :z)
# Block index in cluster
fn = Symbol("blockIdxInCluster_$dim")
intr = Symbol("cluster.ctaid.$dim")
- @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:max_cluster_size[dim]-1))) + 1i32
+ @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:(max_cluster_size[dim] - 1)))) + 1i32
# Cluster size (#blocks per cluster)
fn = Symbol("clusterDim_$dim")
@@ -79,7 +79,7 @@ for dim in (:x, :y, :z)
# Cluster index in grid
fn = Symbol("clusterIdx_$dim")
intr = Symbol("clusterid.$dim")
- @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:max_grid_size[dim]-1))) + 1i32
+ @eval @device_function @inline $fn() = _index($(Val(intr)), $(Val(0:(max_grid_size[dim] - 1)))) + 1i32
# Grid size in clusters (#clusters per grid)
fn = Symbol("gridClusterDim_$dim")
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index f7e0a0cdd..2df2d3815 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -67,7 +67,7 @@ end # @device_functions
export barrier_sync
-@device_function barrier_sync(id=0) =
+@device_function barrier_sync(id = 0) =
ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
export cluster_arrive, cluster_arrive_relaxed, cluster_wait
diff --git a/src/device/intrinsics/warp.jl b/src/device/intrinsics/warp.jl
index 63913c5aa..ed60d495f 100644
--- a/src/device/intrinsics/warp.jl
+++ b/src/device/intrinsics/warp.jl
@@ -26,7 +26,7 @@ for (name, mode, mask, offset) in (("_up", :up, UInt32(0x00), src->src),
for (T,typ) in ((Int32, "i32"), (UInt32, "i32"), (Float32, "f32"))
intrinsic = "llvm.nvvm.shfl.sync.$mode.$typ"
@eval begin
- @device_function @inline $fname(mask, val::$T, src, width=$ws) =
+ @device_function @inline $fname(mask, val::$T, src, width = $ws) =
ccall($intrinsic, llvmcall, $T,
(UInt32, $T, UInt32, UInt32),
mask, val, $(offset(:src)), pack(width, $mask))
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index 214a4a537..3b085371b 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -2,7 +2,7 @@ export WMMA
module WMMA
import ..LLVM
-using ..CUDA: AS, @device_function
+ using ..CUDA: AS, @device_function
using Core: LLVMPtr
using BFloat16s: BFloat16
@@ -214,10 +214,10 @@ for ops in all_ldst_ops,
ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
if sz == 1
- @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
+ @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
else
struct_ty = Symbol("LLVMStruct$sz")
- @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
+ @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
end
@eval export $func_name
@eval @doc (@doc llvm_wmma_load) $func_name
@@ -284,7 +284,7 @@ export llvm_wmma_store
ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
- @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
+ @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
@eval export $func_name
@eval @doc (@doc llvm_wmma_store) $func_name
end
@@ -361,10 +361,10 @@ for ops in all_wmma_ops,
c_vars = ntuple(i -> :(c[$i]), c_sz)
if d_sz == 1
- @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+ @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
else
struct_ty = Symbol("LLVMStruct$d_sz")
- @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+ @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
end
@eval export $func_name
@eval @doc (@doc llvm_wmma_mma) $func_name
diff --git a/src/device/pointer.jl b/src/device/pointer.jl
index a1eb2e8c2..3f87840bf 100644
--- a/src/device/pointer.jl
+++ b/src/device/pointer.jl
@@ -34,7 +34,8 @@ for T in LDGTypes
typ = Symbol(class, width)
intr = "llvm.nvvm.ldg.global.$class.$typ.p1$typ"
- @eval @device_function @inline function pointerref_ldg(base_ptr::LLVMPtr{$T,AS.Global}, i::Integer,
+ @eval @device_function @inline function pointerref_ldg(
+ base_ptr::LLVMPtr{$T, AS.Global}, i::Integer,
::Val{align}) where align
offset = i-one(i) # in elements
ptr = base_ptr + offset*sizeof($T)
@@ -53,7 +54,8 @@ for (N, T) in ((4, Float32), (2, Float64), (4, Int8), (4, Int16), (4, Int32), (2
typ = Symbol(class, width)
intr = "llvm.nvvm.ldg.global.$class.v$N$typ.p1v$N$typ"
- @eval @device_function @inline function pointerref_ldg(base_ptr::LLVMPtr{NTuple{$N, Base.VecElement{$T}},AS.Global}, i::Integer,
+ @eval @device_function @inline function pointerref_ldg(
+ base_ptr::LLVMPtr{NTuple{$N, Base.VecElement{$T}}, AS.Global}, i::Integer,
::Val{align}) where align
offset = i-one(i) # in elements
ptr = base_ptr + offset*$N*sizeof($T)
diff --git a/src/device/texture.jl b/src/device/texture.jl
index f675ff583..e28b60810 100644
--- a/src/device/texture.jl
+++ b/src/device/texture.jl
@@ -67,7 +67,7 @@ for (dispatch_rettyp, julia_rettyp, llvm_rettyp) in
NTuple{4,$dispatch_rettyp}})
# tex1D only supports array memory
- @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp,1,ArrayMemorySource}, x::Number) =
+ @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp, 1, ArrayMemorySource}, x::Number) =
Tuple(ccall($("llvm.nvvm.tex.unified.1d.$llvm_rettyp.f32"), llvmcall,
$julia_rettyp, (CUtexObject, Float32), texObject, x))
@@ -78,7 +78,7 @@ for (dispatch_rettyp, julia_rettyp, llvm_rettyp) in
julia_sig = ntuple(_->Float32, dims)
julia_params = ntuple(i->:($(julia_args[i])::Number), dims)
- @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp,$dims}, $(julia_params...)) =
+ @eval @device_function tex(texObject::CuDeviceTexture{<:$eltyp, $dims}, $(julia_params...)) =
Tuple(ccall($("llvm.nvvm.tex.unified.$llvm_dim.$llvm_rettyp.f32"), llvmcall,
$julia_rettyp, (CUtexObject, $(julia_sig...)), texObject, $(julia_args...)))
end
diff --git a/src/device/utils.jl b/src/device/utils.jl
index a5cbda460..a342de7c5 100644
--- a/src/device/utils.jl
+++ b/src/device/utils.jl
@@ -19,7 +19,7 @@ macro device_override(ex)
end)
else
esc(quote
- Base.Experimental.@overlay($(CUDA).method_table, $ex)
+ Base.Experimental.@overlay($(CUDA).method_table, $ex)
end)
end
end
@@ -37,7 +37,7 @@ macro device_function(ex)
$(combinedef(def))
# NOTE: no use of `@consistent_overlay` here because the regular function errors
- Base.Experimental.@overlay($(CUDA).method_table, $ex)
+ Base.Experimental.@overlay($(CUDA).method_table, $ex)
end)
end
diff --git a/test/core/device/ldg.jl b/test/core/device/ldg.jl
index 07e1644f8..c0a1505b8 100644
--- a/test/core/device/ldg.jl
+++ b/test/core/device/ldg.jl
@@ -3,7 +3,7 @@
# the method table. Wrapping it in a function gets us what we want currently but the PR
# here: https://github.com/JuliaLang/julia/pull/60718 will likely fix this according to
# @vchuravy. It is currently not backported.
- ir = sprint(io->CUDA.code_llvm(io, (args...)->CUDA.pointerref_ldg(args...), Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true))
+ ir = sprint(io -> CUDA.code_llvm(io, (args...) -> CUDA.pointerref_ldg(args...), Tuple{Core.LLVMPtr{Int, AS.Global}, Int, Val{1}}; raw = true))
if Base.libllvm_version >= v"20"
# `@llvm.nvvm.ldg` was removed in LLVM 20; the auto-upgrade
# replaces it with a load bearing `!invariant.load` metadata |
|
Can confirm. Error Remaining issue is as you state at point 4. Previously it was observed in a comment when trying to AOT compile GPUCompiler. I assume |
I started some digging into this yesterday and I think this is a bit of a red herring. If this was the core issue I would expect the methods that leak through to be all of the ones that need to be specified by the target or only the ones that are provided defaults. The two that currently are causing issues are |
|
I've started to mark all the methods in CUDA.jl as device methods. I still have to clean it up. Using @apozharski 's branch https://github.com/apozharski/GPUCompiler.jl/tree/ap/aot-compilation I can now get past the compilation. However, the next hurdle is that Core.InitError(mod=:CUDA_Driver_jll, error=Base.TaskFailedException(task=Core.Task(next=nothing, queue=nothing, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Core.Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=Base.IOError(msg="could not spawn setenv(`/home/michel/git/ExaPF.jl/app/build/bin/julia -C native -g1 -O0 --compile=min -t1 --startup-file=no -e '...' |
@michel2323 the workaround I found for this while trying to compile libMad is to bundle the julia executable (see the latest commit on my fork of JuliaC). Ideally |
3b831b0 to
784cad3
Compare
|
Nice! Thank you! @apozharski |
We used to do |
0a61a43 to
df004da
Compare
This is out of scope, but a user changing their drivers/hardware while code is running causing segfaults seems to me like "expected" behavior 😅. Unless you mean this only runs once and not each time a new Julia process calls |
Doing some more digging my little hack of bundling the julia executable (in truth I think this is a horrible hack) is insufficient to work around this. This just means the run fails and the aot compiled object just always falls back to loading the system version of the driver. To get the hack to work correctly I would further need to include a standard sysimage as well in the distribution of a library which is just yet more bloat I would hope to avoid. @maleadt would you be willing to accept a PR/change to |
|
@michel2323 this is also relevant to your work (as I misled you with the initial hack 😅 ) https://github.com/apozharski/CUDA_Driver_jll.jl/tree/ap/in-memory-driver-check <- This branch of |
|
|
I am currently stuck 100% on some other topics so pushing this over the line fell back in my priorities. I should be able to try to push this over the line in the next couple of weeks. This is the branch I have been using in my work and it "works on my machine" but could probably use some replication. |
df004da to
dc99a48
Compare
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 710e176 | Previous: cbd36b0 | Ratio |
|---|---|---|---|
latency/precompile |
4593923087 ns |
4596549945 ns |
1.00 |
latency/ttfp |
4434475791.5 ns |
4384041577.5 ns |
1.01 |
latency/import |
3831057860.5 ns |
3815804889.5 ns |
1.00 |
integration/volumerhs |
9433885.5 ns |
9438630 ns |
1.00 |
integration/byval/slices=1 |
145758 ns |
145743 ns |
1.00 |
integration/byval/slices=3 |
422877.5 ns |
423271 ns |
1.00 |
integration/byval/reference |
144152 ns |
144005 ns |
1.00 |
integration/byval/slices=2 |
284629 ns |
284820 ns |
1.00 |
integration/cudadevrt |
102603 ns |
102501 ns |
1.00 |
kernel/indexing |
13408 ns |
13325 ns |
1.01 |
kernel/indexing_checked |
14239 ns |
14152 ns |
1.01 |
kernel/occupancy |
669.2452830188679 ns |
658.3765432098766 ns |
1.02 |
kernel/launch |
2135.3333333333335 ns |
2168 ns |
0.98 |
kernel/rand |
14775 ns |
18024 ns |
0.82 |
array/reverse/1d |
18246 ns |
18361 ns |
0.99 |
array/reverse/2dL_inplace |
65924 ns |
66059 ns |
1.00 |
array/reverse/1dL |
68784.5 ns |
68935 ns |
1.00 |
array/reverse/2d |
20643 ns |
20424 ns |
1.01 |
array/reverse/1d_inplace |
9818.666666666666 ns |
10273.666666666666 ns |
0.96 |
array/reverse/2d_inplace |
10924 ns |
10677 ns |
1.02 |
array/reverse/2dL |
72575 ns |
72483 ns |
1.00 |
array/reverse/1dL_inplace |
65861 ns |
65932 ns |
1.00 |
array/copy |
18932 ns |
18710 ns |
1.01 |
array/iteration/findall/int |
147627.5 ns |
149223 ns |
0.99 |
array/iteration/findall/bool |
131450 ns |
132253 ns |
0.99 |
array/iteration/findfirst/int |
83028.5 ns |
83319 ns |
1.00 |
array/iteration/findfirst/bool |
81087 ns |
81194 ns |
1.00 |
array/iteration/scalar |
68317.5 ns |
67940 ns |
1.01 |
array/iteration/logical |
199329.5 ns |
203059 ns |
0.98 |
array/iteration/findmin/1d |
85686.5 ns |
87925 ns |
0.97 |
array/iteration/findmin/2d |
117060 ns |
116883.5 ns |
1.00 |
array/reductions/reduce/Int64/1d |
42214 ns |
43431 ns |
0.97 |
array/reductions/reduce/Int64/dims=1 |
42221.5 ns |
45333 ns |
0.93 |
array/reductions/reduce/Int64/dims=2 |
59536 ns |
59667 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
87758 ns |
87800 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
84572 ns |
84635 ns |
1.00 |
array/reductions/reduce/Float32/1d |
35057 ns |
35342 ns |
0.99 |
array/reductions/reduce/Float32/dims=1 |
49243 ns |
49503 ns |
0.99 |
array/reductions/reduce/Float32/dims=2 |
56701 ns |
56667 ns |
1.00 |
array/reductions/reduce/Float32/dims=1L |
51861 ns |
51807 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
69317 ns |
70151.5 ns |
0.99 |
array/reductions/mapreduce/Int64/1d |
42593.5 ns |
43078 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=1 |
44860 ns |
42508 ns |
1.06 |
array/reductions/mapreduce/Int64/dims=2 |
59587 ns |
59834 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=1L |
87712.5 ns |
87965 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
84713 ns |
84897 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
34959 ns |
35085 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1 |
49561 ns |
41547 ns |
1.19 |
array/reductions/mapreduce/Float32/dims=2 |
56681 ns |
56696 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1L |
51927 ns |
51840 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2L |
69415.5 ns |
70021 ns |
0.99 |
array/broadcast |
20524 ns |
20569 ns |
1.00 |
array/copyto!/gpu_to_gpu |
11395 ns |
11387 ns |
1.00 |
array/copyto!/cpu_to_gpu |
215143 ns |
216250 ns |
0.99 |
array/copyto!/gpu_to_cpu |
281876 ns |
285879 ns |
0.99 |
array/accumulate/Int64/1d |
118653 ns |
118904.5 ns |
1.00 |
array/accumulate/Int64/dims=1 |
79786 ns |
80426 ns |
0.99 |
array/accumulate/Int64/dims=2 |
156706 ns |
156465.5 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1705442 ns |
1703915 ns |
1.00 |
array/accumulate/Int64/dims=2L |
960989 ns |
961803 ns |
1.00 |
array/accumulate/Float32/1d |
101871 ns |
101423 ns |
1.00 |
array/accumulate/Float32/dims=1 |
77214.5 ns |
76736 ns |
1.01 |
array/accumulate/Float32/dims=2 |
143815 ns |
143893 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1594405 ns |
1591735.5 ns |
1.00 |
array/accumulate/Float32/dims=2L |
660420.5 ns |
659940 ns |
1.00 |
array/construct |
1304.4 ns |
1328.3 ns |
0.98 |
array/random/randn/Float32 |
43909 ns |
44421 ns |
0.99 |
array/random/randn!/Float32 |
31453 ns |
30091 ns |
1.05 |
array/random/rand!/Int64 |
34091 ns |
26869 ns |
1.27 |
array/random/rand!/Float32 |
8566 ns |
8394.333333333334 ns |
1.02 |
array/random/rand/Int64 |
37277 ns |
29750 ns |
1.25 |
array/random/rand/Float32 |
13038 ns |
13048 ns |
1.00 |
array/permutedims/4d |
51917 ns |
52066.5 ns |
1.00 |
array/permutedims/2d |
52386 ns |
52335 ns |
1.00 |
array/permutedims/3d |
52531 ns |
52691 ns |
1.00 |
array/sorting/1d |
2735243.5 ns |
2744370 ns |
1.00 |
array/sorting/by |
3328206 ns |
3314416 ns |
1.00 |
array/sorting/2d |
1071848.5 ns |
1072423 ns |
1.00 |
cuda/synchronization/stream/auto |
986.2 ns |
1088.5454545454545 ns |
0.91 |
cuda/synchronization/stream/nonblocking |
7546.3 ns |
7527.299999999999 ns |
1.00 |
cuda/synchronization/stream/blocking |
838.2358490566038 ns |
842.5068493150685 ns |
0.99 |
cuda/synchronization/context/auto |
1178.8 ns |
1182.6 ns |
1.00 |
cuda/synchronization/context/nonblocking |
8453.2 ns |
7454.700000000001 ns |
1.13 |
cuda/synchronization/context/blocking |
929.8936170212766 ns |
939.9677419354839 ns |
0.99 |
This comment was automatically generated by workflow using github-action-benchmark.
beff30c to
ba0bf0f
Compare
|
Ok I now suspect that CI will pass (except on nightly it seems), so this is ready for re-review. |
|
Hmmm, I am not sure the failure on CUDA 13.2 is related It seems to be an OOM issue which it seems occurred on previous commits as well. |
ba0bf0f to
2da39a7
Compare
|
It is failing now. You dropped last commit in rebase? |
Ah yea, good catch |
Currently, Ahead-of-Time (AOT) compilation for any package
using CUDA.jl is impossible (with or without `--trim`). This is due to several things as far as I can tell: methods in CUDA.jl which are visible in the global method table, `llvmcall` LLVM IR with `llvm.nvvm` intrinsics which therefore leak through to the AOT step which generates assembly, leading to compilation failures. MethodInstances exist for the methods, the methods are concrete (unsure if this is the correct Julia terminology). This is because they either are defined to take concrete types (for example the `clock` methods) or no arguments at all. `libcudadevrt.jl`: the "concrete" methods in that file contain `ccall`s to external functions which do not exist until they are dynamically loaded. `gpu_malloc`, `gpu_signal_exception`, and `gpu_report_oom` in the produced LLVM IR. These functions don't exist as they would come from the runtime compilation of the runtime. This is likely a problem more fixable in `GPUCompiler.jl`, but maybe @vchuravy or @maleadt you have a better understanding of that.The relevant issues and other PRs:
`@device_function` to provide erroring copies of the methods for the global method table