Working braiding transpose and more tests

kshyatt · kshyatt · commit 813e2c2bd0fe · 2026-03-18T10:31:01.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,9 +1,10 @@
 name = "TensorKit"
 uuid = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec"
-authors = ["Jutho Haegeman, Lukas Devos"]
 version = "0.16.3"
+authors = ["Jutho Haegeman, Lukas Devos"]
 
 [deps]
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MatrixAlgebraKit = "6c742aac-3347-4629-af66-fc926824e5e4"
@@ -53,7 +54,7 @@ Printf = "1"
 Random = "1"
 SafeTestsets = "0.1"
 ScopedValues = "1.3.0"
-Strided = "2.3.4"
+Strided = "2.3.5"
 TensorKitSectors = "0.3.5"
 TensorOperations = "5.1"
 Test = "1"
@@ -87,3 +88,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["ArgParse", "Adapt", "Aqua", "AllocCheck", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote", "Mooncake", "JET"]
+
+[sources]
+Strided = {url = "https://github.com/QuantumKitHub/Strided.jl", rev = "ksh/copyto"}
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -177,3 +177,12 @@ function TensorKit._add_general_kernel_nonthreaded!(
     end
     return nothing
 end
+
+function TensorKit.allocate_buffers(
+        tdst::CuTensorMap, tsrc::CuTensorMap, transformer::TensorKit.GenericTreeTransformer
+    )
+    sz = TensorKit.buffersize(transformer)
+    # Zero-fill both buffers explicitly rather than leaving them uninitialised:
+    # memory re-use can otherwise leave garbage data from earlier allocations in them.
+    return CUDA.zeros(eltype(tdst.data), sz), CUDA.zeros(eltype(tsrc.data), sz)
+end
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
@@ -290,28 +290,29 @@ for V in spacelist
         @timedtestset "Permutations: test via inner product invariance" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
             t = CUDA.rand(ComplexF64, W)
+            ht = adapt(Vector{ComplexF64}, t)
             t′ = CUDA.randn!(similar(t))
+            ht′ = adapt(Vector{ComplexF64}, t′)
+            dot_htt′ = dot(ht′, ht)
+            dot_tt′ = dot(t′, t)
+            @test dot_tt′ ≈ dot_htt′
+            norm_t = norm(t)
             for k in 0:5
                 for p in permutations(1:5)
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
-                    CUDA.@allowscalar begin
-                        t2 = @constinferred permute(t, (p1, p2))
-                        t2 = permute(t, (p1, p2))
-                        @test norm(t2) ≈ norm(t)
-                        t2′ = permute(t′, (p1, p2))
-                        @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
-                    end
-                end
-
-                CUDA.@allowscalar begin
-                    t3 = @constinferred repartition(t, $k)
-                    t3 = repartition(t, k)
-                    @test norm(t3) ≈ norm(t)
-                    t3′ = @constinferred repartition!(similar(t3), t′)
-                    @test norm(t3′) ≈ norm(t′)
-                    @test dot(t′, t) ≈ dot(t3′, t3)
+                    t2 = @constinferred permute(t, (p1, p2))
+                    t2′ = permute(t′, (p1, p2))
+                    @test norm(t2) ≈ norm_t
+                    @test dot(t2′, t2) ≈ dot_tt′
+                    @test dot(transpose(t2′), transpose(t2)) ≈ dot_tt′
                 end
+                t3 = @constinferred repartition(t, $k)
+                t3 = repartition(t, k)
+                t3′ = @constinferred repartition!(similar(t3), t′)
+                @test norm(t3) ≈ norm(t)
+                @test norm(t3′) ≈ norm(t′)
+                @test dot(t′, t) ≈ dot(t3′, t3)
             end
         end
         if BraidingStyle(I) isa SymmetricBraiding
@@ -322,34 +323,35 @@ for V in spacelist
                     for p in permutations(1:5)
                         p1 = ntuple(n -> p[n], k)
                         p2 = ntuple(n -> p[k + n], 5 - k)
-                        dt2 = CUDA.@allowscalar permute(t, (p1, p2))
-                        ht2 = permute(TensorKit.to_cpu(t), (p1, p2))
-                        @test ht2 == adapt(Vector{ComplexF64}, dt2)
+                        ht2 = permute(adapt(Vector{ComplexF64}, t), (p1, p2))
+                        dt2 = permute(t, (p1, p2))
+                        @test ht2 ≈ adapt(Vector{ComplexF64}, dt2)
+                        ht3 = transpose(adapt(Vector{ComplexF64}, dt2))
+                        dt3 = transpose(dt2)
+                        hht3 = adapt(Vector{ComplexF64}, dt3)
+                        @test ht3 ≈ hht3
                     end
-
-                    dt3 = CUDA.@allowscalar repartition(t, k)
-                    ht3 = repartition(adapt(Vector{ComplexF64}, t), k)
-                    @test ht3 == adapt(Vector{ComplexF64}, dt3)
+                    dt4 = repartition(t, k)
+                    ht4 = repartition(adapt(Vector{ComplexF64}, t), k)
+                    @test ht4 == adapt(Vector{ComplexF64}, dt4)
                 end
             end
         end
         @timedtestset "Full trace: test self-consistency" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
-            CUDA.@allowscalar begin
-                t2 = permute(t, ((1, 2), (4, 3)))
-                s = @constinferred tr(t2)
-                @test conj(s) ≈ tr(t2')
-                if !isdual(V1)
-                    t2 = twist!(t2, 1)
-                end
-                if isdual(V2)
-                    t2 = twist!(t2, 2)
-                end
-                ss = tr(t2)
-                @tensor s2 = t[a, b, b, a]
-                @tensor t3[a, b] := t[a, c, c, b]
-                @tensor s3 = t3[a, a]
+            t2 = permute(t, ((1, 2), (4, 3)))
+            s = @constinferred tr(t2)
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
             end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
+            end
+            ss = tr(t2)
+            @tensor s2 = t[a, b, b, a]
+            @tensor t3[a, b] := t[a, c, c, b]
+            @tensor s3 = t3[a, a]
             @test ss ≈ s2
             @test ss ≈ s3
         end
@@ -363,24 +365,20 @@ for V in spacelist
         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Trace: test via conversion" begin
                 t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
-                CUDA.@allowscalar begin
-                    @tensor t2[a, b] := t[c, d, b, d, c, a]
-                    @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
-                end
+                @tensor t2[a, b] := t[c, d, b, d, c, a]
+                @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
                 @test t3 ≈ ad(t2)
             end
         end
         @timedtestset "Trace and contraction" begin
             t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
             t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
-            CUDA.@allowscalar begin
-                t3 = t1 ⊗ t2
-                @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
-                @tensor tb[a, b] := t3[x, y, a, y, b, x]
-            end
+            t3 = t1 ⊗ t2
+            @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+            @tensor tb[a, b] := t3[x, y, a, y, b, x]
             @test ta ≈ tb
         end
-        #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Tensor contraction: test via CPU" begin
                 dA1 = CUDA.randn(ComplexF64, V1' * V2', V3')
                 dA2 = CUDA.randn(ComplexF64, V3 * V4, V5)
@@ -395,7 +393,7 @@ for V in spacelist
                     TensorKit.to_cpu(dH)[s1, s2, t1, t2]
                 @test TensorKit.to_cpu(dHrA12) ≈ hHrA12
             end
-        end=# # doesn't yet work because of AdjointTensor
+        end
         @timedtestset "Index flipping: test flipping inverse" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
             for i in 1:4
@@ -405,7 +403,7 @@ for V in spacelist
                 end
             end
         end
-        #=@timedtestset "Index flipping: test via explicit flip" begin
+        @timedtestset "Index flipping: test via explicit flip" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
             F1 = unitary(flip(V1), V1)
 
@@ -433,7 +431,7 @@ for V in spacelist
                 @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
                 @test flip(ta, (1, 2)) ≈ tb
             end
-        end=# # TODO
+        end
         @timedtestset "Multiplication of isometries: test properties" begin
             W2 = V4 ⊗ V5
             W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1))