Benchmark reverse on bigger arrays#2833
Merged
maleadt merged 2 commits into JuliaGPU:master on Sep 11, 2025
Merged
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/perf/array.jl b/perf/array.jl
index 30348a512..7adb375db 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -55,11 +55,11 @@ let group = addgroup!(group, "reverse")
group["1d"] = @async_benchmarkable reverse($gpu_vec)
group["1dL"] = @async_benchmarkable reverse($gpu_vec_long)
group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
- group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims=1)
+ group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims = 1)
group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long)
group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
- group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)
+ group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims = 2)
end
group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 |
maleadt
approved these changes
Aug 7, 2025
auto-merge was automatically disabled
September 5, 2025 17:53
Head branch was pushed to by a user without write access
391638b to
33f73e0
Compare
Contributor
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 7c005df | Previous: d670186 | Ratio |
|---|---|---|---|
latency/precompile |
43572403994.5 ns |
43647228816 ns |
1.00 |
latency/ttfp |
7274860573 ns |
7277435679 ns |
1.00 |
latency/import |
3835703622.5 ns |
3852618873.5 ns |
1.00 |
integration/volumerhs |
9609940 ns |
9627931.5 ns |
1.00 |
integration/byval/slices=1 |
147028 ns |
147110 ns |
1.00 |
integration/byval/slices=3 |
426147 ns |
426020 ns |
1.00 |
integration/byval/reference |
145137 ns |
145041 ns |
1.00 |
integration/byval/slices=2 |
286450 ns |
286551.5 ns |
1.00 |
integration/cudadevrt |
103552 ns |
103546 ns |
1.00 |
kernel/indexing |
14270 ns |
14322 ns |
1.00 |
kernel/indexing_checked |
14989 ns |
14915 ns |
1.00 |
kernel/occupancy |
688.0526315789474 ns |
667.2075471698113 ns |
1.03 |
kernel/launch |
2132.95 ns |
2223.222222222222 ns |
0.96 |
kernel/rand |
14752 ns |
18438 ns |
0.80 |
array/reverse/1d |
19648 ns |
19972 ns |
0.98 |
array/reverse/2dL_inplace |
66921 ns |
||
array/reverse/1dL |
69924 ns |
||
array/reverse/2d |
22043 ns |
24087.5 ns |
0.92 |
array/reverse/1d_inplace |
9561 ns |
10265 ns |
0.93 |
array/reverse/2d_inplace |
11078 ns |
11666 ns |
0.95 |
array/reverse/2dL |
74018 ns |
||
array/reverse/1dL_inplace |
66771 ns |
||
array/copy |
20334 ns |
21022 ns |
0.97 |
array/iteration/findall/int |
157708 ns |
157451 ns |
1.00 |
array/iteration/findall/bool |
139787 ns |
138964 ns |
1.01 |
array/iteration/findfirst/int |
2161061.5 ns |
2145633 ns |
1.01 |
array/iteration/findfirst/bool |
2143779 ns |
2125537.5 ns |
1.01 |
array/iteration/scalar |
72525 ns |
72116 ns |
1.01 |
array/iteration/logical |
236269.5 ns |
235859 ns |
1.00 |
array/iteration/findmin/1d |
258429 ns |
258194 ns |
1.00 |
array/iteration/findmin/2d |
96303 ns |
96186 ns |
1.00 |
array/reductions/reduce/Int64/1d |
147342 ns |
147739.5 ns |
1.00 |
array/reductions/reduce/Int64/dims=1 |
43938 ns |
44174 ns |
0.99 |
array/reductions/reduce/Int64/dims=2 |
61319 ns |
61550 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
88919 ns |
89011.5 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
654799 ns |
657807 ns |
1.00 |
array/reductions/reduce/Float32/1d |
103721 ns |
104011 ns |
1.00 |
array/reductions/reduce/Float32/dims=1 |
40824 ns |
40950 ns |
1.00 |
array/reductions/reduce/Float32/dims=2 |
59404 ns |
59626 ns |
1.00 |
array/reductions/reduce/Float32/dims=1L |
52297 ns |
52408 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
544052 ns |
547881 ns |
0.99 |
array/reductions/mapreduce/Int64/1d |
148541 ns |
149756 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=1 |
43984 ns |
44269 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=2 |
61397 ns |
61933 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=1L |
88787 ns |
89034 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
680736 ns |
685908.5 ns |
0.99 |
array/reductions/mapreduce/Float32/1d |
104121 ns |
105015.5 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=1 |
40907 ns |
40993 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2 |
59419 ns |
59573 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1L |
52511 ns |
52874 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=2L |
546452 ns |
550366 ns |
0.99 |
array/broadcast |
20216 ns |
20326 ns |
0.99 |
array/copyto!/gpu_to_gpu |
12817 ns |
12862 ns |
1.00 |
array/copyto!/cpu_to_gpu |
216578 ns |
214253 ns |
1.01 |
array/copyto!/gpu_to_cpu |
286120.5 ns |
286370.5 ns |
1.00 |
array/accumulate/Int64/1d |
124945 ns |
124932 ns |
1.00 |
array/accumulate/Int64/dims=1 |
83680 ns |
83506 ns |
1.00 |
array/accumulate/Int64/dims=2 |
157660.5 ns |
157937 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1710260 ns |
1720004 ns |
0.99 |
array/accumulate/Int64/dims=2L |
966715 ns |
968081 ns |
1.00 |
array/accumulate/Float32/1d |
109842 ns |
109339 ns |
1.00 |
array/accumulate/Float32/dims=1 |
80526 ns |
80455 ns |
1.00 |
array/accumulate/Float32/dims=2 |
147358 ns |
147609.5 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1618698.5 ns |
1618340 ns |
1.00 |
array/accumulate/Float32/dims=2L |
698592 ns |
700504 ns |
1.00 |
array/construct |
1616.4 ns |
1612.7 ns |
1.00 |
array/random/randn/Float32 |
44402 ns |
44195.5 ns |
1.00 |
array/random/randn!/Float32 |
24940 ns |
24926 ns |
1.00 |
array/random/rand!/Int64 |
27447 ns |
27742 ns |
0.99 |
array/random/rand!/Float32 |
8834.333333333334 ns |
8671.666666666666 ns |
1.02 |
array/random/rand/Int64 |
30044 ns |
30303 ns |
0.99 |
array/random/rand/Float32 |
12934.5 ns |
13171 ns |
0.98 |
array/permutedims/4d |
59971 ns |
60857.5 ns |
0.99 |
array/permutedims/2d |
53804 ns |
54404 ns |
0.99 |
array/permutedims/3d |
54872 ns |
55128 ns |
1.00 |
array/sorting/1d |
2757614 ns |
2756220.5 ns |
1.00 |
array/sorting/by |
3344832.5 ns |
3354879 ns |
1.00 |
array/sorting/2d |
1081338 ns |
1084703.5 ns |
1.00 |
cuda/synchronization/stream/auto |
1004.6923076923077 ns |
1064.1 ns |
0.94 |
cuda/synchronization/stream/nonblocking |
7875.1 ns |
7654.4 ns |
1.03 |
cuda/synchronization/stream/blocking |
798.6210526315789 ns |
827.0243902439024 ns |
0.97 |
cuda/synchronization/context/auto |
1170 ns |
1151.2 ns |
1.02 |
cuda/synchronization/context/nonblocking |
7330.2 ns |
7325.5 ns |
1.00 |
cuda/synchronization/context/blocking |
899.3617021276596 ns |
916.8333333333334 ns |
0.98 |
This comment was automatically generated by workflow using github-action-benchmark.
Member
Author
|
Test failures seem unrelated? |
33f73e0 to
7c005df
Compare
Member
|
Yeah, that's #2885 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Currently part of #2832, but this can (and should) be merged separately beforehand.