diff --git a/Project.toml b/Project.toml index 7e82876c5..22379ab5c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TensorKit" uuid = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec" -authors = ["Jutho Haegeman, Lukas Devos"] version = "0.16.3" +authors = ["Jutho Haegeman, Lukas Devos"] [deps] LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" @@ -53,7 +53,7 @@ Printf = "1" Random = "1" SafeTestsets = "0.1" ScopedValues = "1.3.0" -Strided = "2" +Strided = "2.3.4" TensorKitSectors = "0.3.5" TensorOperations = "5.1" Test = "1" @@ -87,3 +87,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [targets] test = ["ArgParse", "Adapt", "Aqua", "AllocCheck", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote", "Mooncake", "JET"] + +[sources] +Strided = {url = "https://github.com/QuantumKitHub/Strided.jl", rev = "ksh/copyto"} diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl index f5efb98bb..4ee4865f1 100644 --- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl +++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl @@ -10,7 +10,7 @@ using TensorKit.Factorizations using TensorKit.Strided using TensorKit.Factorizations: AbstractAlgorithm using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check -import TensorKit: randisometry, rand, randn +import TensorKit: randisometry, rand, randn, _copyto!, _add_general_kernel_nonthreaded!, blocktype using TensorKit: MatrixAlgebraKit diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index f065c2ec1..2932c863f 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -17,6 +17,10 @@ function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::Abstr return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V) end +function TensorKit.blocktype(::Type{<:CuTensorMap{T, S}}) where {T, S} + return CuMatrix{T, CUDA.DeviceMemory} +end + for (fname, felt) in ((:zeros, :zero), (:ones, :one)) @eval begin function CUDA.$fname( @@ -101,18 +105,6 @@ function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S} return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)] end -function Base.convert( - TT::Type{CuTensorMap{T, S, N₁, N₂}}, - t::AbstractTensorMap{<:Any, S, N₁, N₂} - ) where {T, S, N₁, N₂} - if typeof(t) === TT - return t - else - tnew = TT(undef, space(t)) - return copy!(tnew, t) - end -end - function LinearAlgebra.isposdef(t::CuTensorMap) domain(t) == codomain(t) || throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same")) @@ -138,10 +130,9 @@ function Base.promote_rule( return CuTensorMap{T, S, N₁, N₂} end -TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} = +TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} = CuArray{T, N, CUDA.default_memory} - # CuTensorMap exponentation: function TensorKit.exp!(t::CuTensorMap) domain(t) == codomain(t) || @@ -168,3 +159,30 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) return tf end end + +function TensorKit._add_general_kernel_nonthreaded!( + tdst::CuTensorMap, tsrc::CuTensorMap, p, transformer::TensorKit.GenericTreeTransformer, α, β, backend... + ) + # preallocate buffers + buffers = TensorKit.allocate_buffers(tdst, tsrc, transformer) + + for subtransformer in transformer.data + # Special case without intermediate buffers whenever there is only a single block + if length(subtransformer[1]) == 1 + TensorKit._add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + else + cu_subtransformer = tuple(CUDA.adapt(CuArray, subtransformer[1]), subtransformer[2:end]...) + TensorKit._add_transform_multi!(tdst, tsrc, p, cu_subtransformer, buffers, α, β, backend...) + end + end + return nothing +end + +function TensorKit.allocate_buffers( + tdst::CuTensorMap, tsrc::CuTensorMap, transformer::TensorKit.GenericTreeTransformer + ) + sz = TensorKit.buffersize(transformer) + # force zeros to ensure the buffers are empty + # otherwise memory re-use can fill them with garbage data + return CUDA.zeros(eltype(tdst.data), sz), CUDA.zeros(eltype(tsrc.data), sz) +end diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index a7105cda6..a62429dfe 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -57,10 +57,18 @@ function _interleave(a::NTuple{N}, b::NTuple{N}) where {N} return (a[1], b[1], _interleave(tail(a), tail(b))...) end +_copyto!(A::StridedView, B::StridedView) = copyto!(A, B) + # Low-overhead implementation of `copyto!` for specific case of `stride(B, 1) < stride(B, 2)` -# used in indexmanipulations: avoids the overhead of Strided.jl -function _copyto!(A::StridedView{<:Any, 1}, B::StridedView{<:Any, 2}) - length(A) == length(B) || throw(DimensionMismatch()) +# for CPU-hosted Arrays # used in indexmanipulations: avoids the overhead of Strided.jl + +@static if VERSION < v"1.11" # TODO: remove once support for v1.10 is dropped + const AT = Array +else + const AT = Memory +end +function _copyto!(A::StridedView{TA, 1, <:AT}, B::StridedView{TB, 2, <:AT}) where {TA <: Number, TB <: Number} + length(A) == length(B) || throw(DimensionMismatch(lazy"length of A ($(length(A))) does not match length of B ($(length(B))")) Adata = parent(A) Astr = stride(A, 1) diff --git a/src/tensors/abstracttensor.jl b/src/tensors/abstracttensor.jl index 2d7239460..9293f1375 100644 --- a/src/tensors/abstracttensor.jl +++ b/src/tensors/abstracttensor.jl @@ -53,9 +53,11 @@ storagetype(t) = storagetype(typeof(t)) function storagetype(::Type{T}) where {T <: AbstractTensorMap} if T isa Union # attempt to be slightly more specific by promoting unions - Ma = storagetype(T.a) - Mb = storagetype(T.b) - return promote_storagetype(Ma, Mb) + return promote_storagetype(T.a, T.b) + elseif eltype(T) isa Union + # attempt to be slightly more specific by promoting unions + TU = eltype(T) + return promote_storagetype(TU.a, TU.b) else # fallback definition by using scalartype return similarstoragetype(scalartype(T)) @@ -103,8 +105,9 @@ similarstoragetype(X::Type, ::Type{T}) where {T <: Number} = # implement on tensors similarstoragetype(::Type{TT}) where {TT <: AbstractTensorMap} = similarstoragetype(storagetype(TT)) -similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number} = - similarstoragetype(storagetype(TT), T) +function similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number} + return similarstoragetype(storagetype(TT), T) +end # implement on arrays similarstoragetype(::Type{A}) where {A <: DenseVector{<:Number}} = A diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl index 0070bc2d4..4fd68f7da 100644 --- a/src/tensors/braidingtensor.jl +++ b/src/tensors/braidingtensor.jl @@ -171,12 +171,15 @@ end has_shared_permute(t::BraidingTensor, ::Index2Tuple) = false function add_transform!( tdst::AbstractTensorMap, - tsrc::BraidingTensor, (p₁, p₂)::Index2Tuple, + tsrc::BraidingTensor{T, S}, + (p₁, p₂)::Index2Tuple, fusiontreetransform, α::Number, β::Number, backend::AbstractBackend... - ) + ) where {T, S} + tsrc_map = similar(tdst, storagetype(tdst), space(tsrc)) + copy!(tsrc_map, tsrc) return add_transform!( - tdst, TensorMap(tsrc), (p₁, p₂), fusiontreetransform, α, β, + tdst, tsrc_map, (p₁, p₂), fusiontreetransform, α, β, backend... ) end diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 0fad13473..4b4682763 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -98,8 +98,8 @@ for V in spacelist next = @constinferred Nothing iterate(bs, state) b2 = @constinferred block(t, first(blocksectors(t))) @test b1 == b2 - @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} - @test_broken typeof(b1) === TensorKit.blocktype(t) + @test eltype(bs) === Pair{typeof(c), typeof(b1)} + @test typeof(b1) === TensorKit.blocktype(t) @test typeof(c) === sectortype(t) end end @@ -162,8 +162,8 @@ for V in spacelist next = @constinferred Nothing iterate(bs, state) b2 = @constinferred block(t', first(blocksectors(t'))) @test b1 == b2 - @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} - @test_broken typeof(b1) === TensorKit.blocktype(t') + @test eltype(bs) === Pair{typeof(c), typeof(b1)} + @test typeof(b1) === TensorKit.blocktype(t') @test typeof(c) === sectortype(t) # linear algebra @test isa(@constinferred(norm(t)), real(T)) @@ -236,8 +236,8 @@ for V in spacelist α = rand(T) @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2) @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t)) - @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t) - @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t) + @test adapt(Vector{T}, (α * t)) ≈ α * adapt(Vector{T}, t) + @test adapt(Vector{T}, (t + t)) ≈ 2 * adapt(Vector{T}, t) end end @timedtestset "Real and imaginary parts" begin @@ -290,28 +290,29 @@ for V in spacelist @timedtestset "Permutations: test via inner product invariance" begin W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 t = CUDA.rand(ComplexF64, W) + ht = adapt(Vector{ComplexF64}, t) t′ = CUDA.randn!(similar(t)) + ht′ = adapt(Vector{ComplexF64}, t′) + dot_htt′ = dot(ht′, ht) + dot_tt′ = dot(t′, t) + @test dot_tt′ ≈ dot_htt′ + norm_t = norm(t) for k in 0:5 for p in permutations(1:5) p1 = ntuple(n -> p[n], k) p2 = ntuple(n -> p[k + n], 5 - k) - CUDA.@allowscalar begin - t2 = @constinferred permute(t, (p1, p2)) - t2 = permute(t, (p1, p2)) - @test norm(t2) ≈ norm(t) - t2′ = permute(t′, (p1, p2)) - @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2)) - end - end - - CUDA.@allowscalar begin - t3 = @constinferred repartition(t, $k) - t3 = repartition(t, k) - @test norm(t3) ≈ norm(t) - t3′ = @constinferred repartition!(similar(t3), t′) - @test norm(t3′) ≈ norm(t′) - @test dot(t′, t) ≈ dot(t3′, t3) + t2 = @constinferred permute(t, (p1, p2)) + t2′ = permute(t′, (p1, p2)) + @test norm(t2) ≈ norm_t + @test dot(t2′, t2) ≈ dot_tt′ + @test dot(transpose(t2′), transpose(t2)) ≈ dot_tt′ end + t3 = @constinferred repartition(t, $k) + t3 = repartition(t, k) + t3′ = @constinferred repartition!(similar(t3), t′) + @test norm(t3) ≈ norm(t) + @test norm(t3′) ≈ norm(t′) + @test dot(t′, t) ≈ dot(t3′, t3) end end if BraidingStyle(I) isa SymmetricBraiding @@ -322,34 +323,35 @@ for V in spacelist for p in permutations(1:5) p1 = ntuple(n -> p[n], k) p2 = ntuple(n -> p[k + n], 5 - k) - dt2 = CUDA.@allowscalar permute(t, (p1, p2)) - ht2 = permute(TensorKit.to_cpu(t), (p1, p2)) - @test ht2 == TensorKit.to_cpu(dt2) + ht2 = permute(adapt(Vector{ComplexF64}, t), (p1, p2)) + dt2 = permute(t, (p1, p2)) + @test ht2 ≈ adapt(Vector{ComplexF64}, dt2) + ht3 = transpose(adapt(Vector{ComplexF64}, dt2)) + dt3 = transpose(dt2) + hht3 = adapt(Vector{ComplexF64}, dt3) + @test ht3 ≈ hht3 end - - dt3 = CUDA.@allowscalar repartition(t, k) - ht3 = repartition(TensorKit.to_cpu(t), k) - @test ht3 == TensorKit.to_cpu(dt3) + dt4 = repartition(t, k) + ht4 = repartition(adapt(Vector{ComplexF64}, t), k) + @test ht4 == adapt(Vector{ComplexF64}, dt4) end end end @timedtestset "Full trace: test self-consistency" begin t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1') - CUDA.@allowscalar begin - t2 = permute(t, ((1, 2), (4, 3))) - s = @constinferred tr(t2) - @test conj(s) ≈ tr(t2') - if !isdual(V1) - t2 = twist!(t2, 1) - end - if isdual(V2) - t2 = twist!(t2, 2) - end - ss = tr(t2) - @tensor s2 = t[a, b, b, a] - @tensor t3[a, b] := t[a, c, c, b] - @tensor s3 = t3[a, a] + t2 = permute(t, ((1, 2), (4, 3))) + s = @constinferred tr(t2) + @test conj(s) ≈ tr(t2') + if !isdual(V1) + t2 = twist!(t2, 1) + end + if isdual(V2) + t2 = twist!(t2, 2) end + ss = tr(t2) + @tensor s2 = t[a, b, b, a] + @tensor t3[a, b] := t[a, c, c, b] + @tensor s3 = t3[a, a] @test ss ≈ s2 @test ss ≈ s3 end @@ -363,24 +365,20 @@ for V in spacelist if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) @timedtestset "Trace: test via conversion" begin t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') - CUDA.@allowscalar begin - @tensor t2[a, b] := t[c, d, b, d, c, a] - @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] - end + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] @test t3 ≈ ad(t2) end end @timedtestset "Trace and contraction" begin t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3) t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1') - CUDA.@allowscalar begin - t3 = t1 ⊗ t2 - @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] - @tensor tb[a, b] := t3[x, y, a, y, b, x] - end + t3 = t1 ⊗ t2 + @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] + @tensor tb[a, b] := t3[x, y, a, y, b, x] @test ta ≈ tb end - #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) @timedtestset "Tensor contraction: test via CPU" begin dA1 = CUDA.randn(ComplexF64, V1' * V2', V3') dA2 = CUDA.randn(ComplexF64, V3 * V4, V5) @@ -395,45 +393,39 @@ for V in spacelist TensorKit.to_cpu(dH)[s1, s2, t1, t2] @test TensorKit.to_cpu(dHrA12) ≈ hHrA12 end - end=# # doesn't yet work because of AdjointTensor + end @timedtestset "Index flipping: test flipping inverse" begin t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) for i in 1:4 - CUDA.@allowscalar begin - @test t ≈ flip(flip(t, i), i; inv = true) - @test t ≈ flip(flip(t, i; inv = true), i) - end + @test t ≈ flip(flip(t, i), i; inv = true) + @test t ≈ flip(flip(t, i; inv = true), i) end end - #=@timedtestset "Index flipping: test via explicit flip" begin + @timedtestset "Index flipping: test via explicit flip" begin t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) - F1 = unitary(flip(V1), V1) + F1 = adapt(CuArray{ComplexF64}, unitary(flip(V1), V1)) - CUDA.@allowscalar begin - @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d] - @test flip(t, 1) ≈ tf - @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d] - @test twist!(flip(t, 2), 2) ≈ tf - @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d] - @test flip(t, 3) ≈ tf - @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d'] - @test twist!(flip(t, 4), 4) ≈ tf - end + @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d] + @test flip(t, 1) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d] + @test twist!(flip(t, 2), 2) ≈ tf + @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d] + @test flip(t, 3) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d'] + @test twist!(flip(t, 4), 4) ≈ tf end @timedtestset "Index flipping: test via contraction" begin t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4) t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1) - CUDA.@allowscalar begin - @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x] - @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x] - @test ta ≈ tb - @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x] - @test ta ≈ tb - @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x] - @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] - @test flip(ta, (1, 2)) ≈ tb - end - end=# # TODO + @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x] + @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x] + @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] + @test flip(ta, (1, 2)) ≈ tb + end @timedtestset "Multiplication of isometries: test properties" begin W2 = V4 ⊗ V5 W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1))