[CUSPARSE] Fix wrong method for kron with Diagonal and add support to kron with FillArrays#2973
[CUSPARSE] Fix wrong method for kron with Diagonal and add support to kron with FillArrays #2973 — albertomercurio wants to merge 7 commits into JuliaGPU:master from
albertomercurio's branch: "Fix wrong method for kron with Diagonal and add support to kron with FillArrays" #2973 — Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/ext/FillArraysExt.jl b/ext/FillArraysExt.jl
index 865faebf5..9fbe77f1f 100644
--- a/ext/FillArraysExt.jl
+++ b/ext/FillArraysExt.jl
@@ -28,13 +28,13 @@ function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T
col = repeat(col, inner = nB)
data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = nB)
- row .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
- col .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
+ row .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
+ col .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
# Multiply by the fill value (already promoted type T)
data .*= fill_value
- CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+ return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end
# kron between Diagonal{T, AbstractFill} and CuSparseMatrixCOO
@@ -52,9 +52,9 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
end
- row = (0:nA-1) .* mB
+ row = (0:(nA - 1)) .* mB
row = CUDA.CuVector(repeat(row, inner = Bnnz))
- col = (0:nA-1) .* nB
+ col = (0:(nA - 1)) .* nB
col = CUDA.CuVector(repeat(col, inner = Bnnz))
data = CUDA.fill(T(fill_value), nA * Bnnz)
@@ -63,7 +63,7 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
data .*= repeat(B.nzVal, outer = nA)
- CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+ return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end
end # extension module
diff --git a/test/libraries/cusparse/linalg.jl b/test/libraries/cusparse/linalg.jl
index 6c6308003..153fbebaf 100644
--- a/test/libraries/cusparse/linalg.jl
+++ b/test/libraries/cusparse/linalg.jl
@@ -6,7 +6,7 @@ m = 10
A = sprand(T, m, m, 0.2)
B = sprand(T, m, m, 0.3)
ZA = spzeros(T, m, m)
- C = Diagonal(rand(T, div(m, 2)))
+ C = Diagonal(rand(T, div(m, 2)))
dC = adapt(CuArray, C)
@testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
@@ -38,45 +38,45 @@ m = 10
end
@testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
@test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
- @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
+ @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
@test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
end
end
-
+
# Test type promotion for kron with Diagonal
@testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64)]
A_T1 = sprand(T1, m, m, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
C_T2 = collect(dC_T2)
-
+
# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, dC_T2)
result_cpu = kron(A_T1, C_T2)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
-
+
# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(dC_T2, dA_T1)
result_cpu = kron(C_T2, A_T1)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
end
-
+
# Test kron with zero diagonal matrices
@testset "kron with zero Diagonal" begin
A = sprand(T, m, m, 0.2)
dA = CuSparseMatrixCSR(A)
dC_zeros = Diagonal(CUDA.zeros(T, div(m, 2)))
C_zeros = collect(dC_zeros)
-
+
# Test kron(sparse, zero_diagonal)
result_gpu = kron(dA, dC_zeros)
result_cpu = kron(A, C_zeros)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
-
+
# Test kron(zero_diagonal, sparse)
result_gpu = kron(dC_zeros, dA)
result_cpu = kron(C_zeros, A)
diff --git a/test/libraries/fillarrays.jl b/test/libraries/fillarrays.jl
index e4d97c0c8..9df35256f 100644
--- a/test/libraries/fillarrays.jl
+++ b/test/libraries/fillarrays.jl
@@ -6,91 +6,91 @@ using FillArrays
@testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
m, n = 100, 80
A = sprand(T, m, n, 0.2)
-
+
# Test with Ones
@testset "Ones - size $diag_size" for diag_size in [5, 10]
C_ones = Diagonal(Ones{T}(diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = collect(kron(dA, C_ones))
result_cpu = kron(A, collect(C_ones))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, sparse)
result_gpu = collect(kron(C_ones, dA))
result_cpu = kron(collect(C_ones), A)
@test result_gpu ≈ result_cpu
end
-
+
# Test with Fill (constant value)
@testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
C_fill = Diagonal(Fill(val, diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = collect(kron(dA, C_fill))
result_cpu = kron(A, collect(C_fill))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, sparse)
result_gpu = collect(kron(C_fill, dA))
result_cpu = kron(collect(C_fill), A)
@test result_gpu ≈ result_cpu
end
-
+
# Test with Zeros
@testset "Zeros - size $diag_size" for diag_size in [5, 10]
C_zeros = Diagonal(Zeros{T}(diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = kron(dA, C_zeros)
result_cpu = kron(A, collect(C_zeros))
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
-
+
# Test kron(diagonal, sparse)
result_gpu = kron(C_zeros, dA)
result_cpu = kron(collect(C_zeros), A)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
end
-
+
# Test with transpose and adjoint wrappers
@testset "Transpose/Adjoint with Fill" begin
diag_size = 5
val = T(3.0)
C_fill = Diagonal(Fill(val, diag_size))
-
+
@testset "opa = $opa" for opa in (identity, transpose, adjoint)
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(opa(sparse), diagonal)
result_gpu = collect(kron(opa(dA), C_fill))
result_cpu = kron(opa(A), collect(C_fill))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, opa(sparse))
result_gpu = collect(kron(C_fill, opa(dA)))
result_cpu = kron(collect(C_fill), opa(A))
@test result_gpu ≈ result_cpu
end
end
-
+
# Test type promotion
@testset "Type promotion" begin
@testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64), (ComplexF32, Float32)]
A_T1 = sprand(T1, m, n, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
C_fill_T2 = Diagonal(Fill(T2(2.5), 5))
-
+
# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, C_fill_T2)
result_cpu = kron(A_T1, collect(C_fill_T2))
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
-
+
# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(C_fill_T2, dA_T1)
result_cpu = kron(collect(C_fill_T2), A_T1) |
|
Replaces #2804? |
|
Oh I wasn't aware of that PR. I would say that both fix the bug. This PR also implements the |
|
Hello, any thoughts on this? |
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 2ee822e | Previous: 5d9474a | Ratio |
|---|---|---|---|
latency/precompile |
54976267490 ns |
55510377029.5 ns |
0.99 |
latency/ttfp |
7892269137 ns |
7790703567 ns |
1.01 |
latency/import |
4163528684.5 ns |
4122189304 ns |
1.01 |
integration/volumerhs |
9623297.5 ns |
9624973 ns |
1.00 |
integration/byval/slices=1 |
147104 ns |
147064 ns |
1.00 |
integration/byval/slices=3 |
425930 ns |
425893 ns |
1.00 |
integration/byval/reference |
145081 ns |
145082 ns |
1.00 |
integration/byval/slices=2 |
286421.5 ns |
286384 ns |
1.00 |
integration/cudadevrt |
103563 ns |
103602 ns |
1.00 |
kernel/indexing |
14245 ns |
14225 ns |
1.00 |
kernel/indexing_checked |
14857 ns |
14969 ns |
0.99 |
kernel/occupancy |
730.8037037037036 ns |
732.5227272727273 ns |
1.00 |
kernel/launch |
2282.3333333333335 ns |
2249.4444444444443 ns |
1.01 |
kernel/rand |
14871 ns |
18642 ns |
0.80 |
array/reverse/1d |
19817 ns |
19990 ns |
0.99 |
array/reverse/2dL_inplace |
66891 ns |
66917 ns |
1.00 |
array/reverse/1dL |
70066 ns |
70158 ns |
1.00 |
array/reverse/2d |
21887 ns |
21954 ns |
1.00 |
array/reverse/1d_inplace |
9936 ns |
9677 ns |
1.03 |
array/reverse/2d_inplace |
13478 ns |
11077 ns |
1.22 |
array/reverse/2dL |
73893 ns |
74051.5 ns |
1.00 |
array/reverse/1dL_inplace |
67245 ns |
66880 ns |
1.01 |
array/copy |
20582 ns |
20660 ns |
1.00 |
array/iteration/findall/int |
157809.5 ns |
158373 ns |
1.00 |
array/iteration/findall/bool |
139937 ns |
140139 ns |
1.00 |
array/iteration/findfirst/int |
161392 ns |
161271 ns |
1.00 |
array/iteration/findfirst/bool |
162120.5 ns |
162049 ns |
1.00 |
array/iteration/scalar |
71973 ns |
72812.5 ns |
0.99 |
array/iteration/logical |
216531 ns |
216894.5 ns |
1.00 |
array/iteration/findmin/1d |
51408 ns |
50981 ns |
1.01 |
array/iteration/findmin/2d |
96465 ns |
96704 ns |
1.00 |
array/reductions/reduce/Int64/1d |
43712 ns |
43491 ns |
1.01 |
array/reductions/reduce/Int64/dims=1 |
45340.5 ns |
52642.5 ns |
0.86 |
array/reductions/reduce/Int64/dims=2 |
61470 ns |
61484 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
88983 ns |
88879 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
88210 ns |
87977 ns |
1.00 |
array/reductions/reduce/Float32/1d |
37257.5 ns |
37248.5 ns |
1.00 |
array/reductions/reduce/Float32/dims=1 |
45350.5 ns |
43278 ns |
1.05 |
array/reductions/reduce/Float32/dims=2 |
59694 ns |
60066 ns |
0.99 |
array/reductions/reduce/Float32/dims=1L |
52331 ns |
52282 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
71952 ns |
72365.5 ns |
0.99 |
array/reductions/mapreduce/Int64/1d |
43926 ns |
43561 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=1 |
45095.5 ns |
44306 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=2 |
61364 ns |
61482 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=1L |
88947 ns |
89001 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
87932 ns |
88320 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
37014 ns |
38092.5 ns |
0.97 |
array/reductions/mapreduce/Float32/dims=1 |
51969.5 ns |
41962 ns |
1.24 |
array/reductions/mapreduce/Float32/dims=2 |
59883 ns |
60039 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1L |
52655.5 ns |
52636 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2L |
72106 ns |
72310 ns |
1.00 |
array/broadcast |
20168 ns |
20127 ns |
1.00 |
array/copyto!/gpu_to_gpu |
11228 ns |
12738 ns |
0.88 |
array/copyto!/cpu_to_gpu |
217858.5 ns |
217857 ns |
1.00 |
array/copyto!/gpu_to_cpu |
283913 ns |
287088 ns |
0.99 |
array/accumulate/Int64/1d |
124690 ns |
124778 ns |
1.00 |
array/accumulate/Int64/dims=1 |
83792 ns |
83708 ns |
1.00 |
array/accumulate/Int64/dims=2 |
158389.5 ns |
158367 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1720126 ns |
1710164 ns |
1.01 |
array/accumulate/Int64/dims=2L |
968168 ns |
967254 ns |
1.00 |
array/accumulate/Float32/1d |
109079 ns |
109314 ns |
1.00 |
array/accumulate/Float32/dims=1 |
80980 ns |
80184 ns |
1.01 |
array/accumulate/Float32/dims=2 |
148191 ns |
147922 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1619247 ns |
1618786 ns |
1.00 |
array/accumulate/Float32/dims=2L |
698719 ns |
698724 ns |
1.00 |
array/construct |
1291.75 ns |
1295.5 ns |
1.00 |
array/random/randn/Float32 |
45186.5 ns |
47861 ns |
0.94 |
array/random/randn!/Float32 |
24986 ns |
24875 ns |
1.00 |
array/random/rand!/Int64 |
27569 ns |
27408 ns |
1.01 |
array/random/rand!/Float32 |
8848.833333333332 ns |
8909.666666666666 ns |
0.99 |
array/random/rand/Int64 |
30658 ns |
30055 ns |
1.02 |
array/random/rand/Float32 |
13117 ns |
13184 ns |
0.99 |
array/permutedims/4d |
55312 ns |
55109 ns |
1.00 |
array/permutedims/2d |
54319 ns |
53832 ns |
1.01 |
array/permutedims/3d |
54902 ns |
54841 ns |
1.00 |
array/sorting/1d |
2757494 ns |
2757534 ns |
1.00 |
array/sorting/by |
3344816 ns |
3344541 ns |
1.00 |
array/sorting/2d |
1081979 ns |
1081521 ns |
1.00 |
cuda/synchronization/stream/auto |
1030 ns |
1036.5 ns |
0.99 |
cuda/synchronization/stream/nonblocking |
7308.700000000001 ns |
7410.8 ns |
0.99 |
cuda/synchronization/stream/blocking |
833.3855421686746 ns |
820.6336633663366 ns |
1.02 |
cuda/synchronization/context/auto |
1162.7 ns |
1154.3 ns |
1.01 |
cuda/synchronization/context/nonblocking |
6919.4 ns |
7124.4 ns |
0.97 |
cuda/synchronization/context/blocking |
905.9565217391304 ns |
887.4107142857143 ns |
1.02 |
This comment was automatically generated by workflow using github-action-benchmark.
|
These changes seem to cause test failures. |
|
Ok I should have fixed the errors. There are some errors on Julia v1.12 and nightly related to |
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## master #2973 +/- ##
==========================================
+ Coverage 89.43% 89.47% +0.03%
==========================================
Files 148 148
Lines 12991 13001 +10
==========================================
+ Hits 11619 11632 +13
+ Misses 1372 1369 -3 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
|
Ok by updating the branch all the tests pass |
The previous implementation of
`kron` between a CUSPARSE matrix and a `Diagonal` was wrong. It was assuming the diagonal to be the identity, which is not always the case. I have extended it to any general `Diagonal` matrix embedding a `CuVector`.

In order to support
`kron` with identity-like matrices, I have added the extension to FillArrays.jl. Here the diagonal is filled with a unique value.