diff --git a/Project.toml b/Project.toml
index 7e82876c5..22379ab5c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TensorKit"
 uuid = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec"
-authors = ["Jutho Haegeman, Lukas Devos"]
 version = "0.16.3"
+authors = ["Jutho Haegeman, Lukas Devos"]
 
 [deps]
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
@@ -53,7 +53,7 @@ Printf = "1"
 Random = "1"
 SafeTestsets = "0.1"
 ScopedValues = "1.3.0"
-Strided = "2"
+Strided = "2.3.4"
 TensorKitSectors = "0.3.5"
 TensorOperations = "5.1"
 Test = "1"
@@ -87,3 +87,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["ArgParse", "Adapt", "Aqua", "AllocCheck", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote", "Mooncake", "JET"]
+
+[sources]
+Strided = {url = "https://github.com/QuantumKitHub/Strided.jl", rev = "ksh/copyto"}
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
index f5efb98bb..4ee4865f1 100644
--- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -10,7 +10,7 @@ using TensorKit.Factorizations
 using TensorKit.Strided
 using TensorKit.Factorizations: AbstractAlgorithm
 using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check
-import TensorKit: randisometry, rand, randn
+import TensorKit: randisometry, rand, randn, _copyto!, _add_general_kernel_nonthreaded!, blocktype
 
 using TensorKit: MatrixAlgebraKit
 
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index f065c2ec1..2932c863f 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -17,6 +17,10 @@ function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::Abstr
     return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V)
 end
 
+function TensorKit.blocktype(::Type{<:CuTensorMap{T, S}}) where {T, S}
+    return CuMatrix{T, CUDA.DeviceMemory}
+end
+
 for (fname, felt) in ((:zeros, :zero), (:ones, :one))
     @eval begin
         function CUDA.$fname(
@@ -101,18 +105,6 @@ function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
     return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
 end
 
-function Base.convert(
-        TT::Type{CuTensorMap{T, S, N₁, N₂}},
-        t::AbstractTensorMap{<:Any, S, N₁, N₂}
-    ) where {T, S, N₁, N₂}
-    if typeof(t) === TT
-        return t
-    else
-        tnew = TT(undef, space(t))
-        return copy!(tnew, t)
-    end
-end
-
 function LinearAlgebra.isposdef(t::CuTensorMap)
     domain(t) == codomain(t) ||
         throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
@@ -138,10 +130,9 @@ function Base.promote_rule(
     return CuTensorMap{T, S, N₁, N₂}
 end
 
-TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
+TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
     CuArray{T, N, CUDA.default_memory}
 
-
 # CuTensorMap exponentation:
 function TensorKit.exp!(t::CuTensorMap)
     domain(t) == codomain(t) ||
@@ -168,3 +159,30 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
+
+function TensorKit._add_general_kernel_nonthreaded!(
+        tdst::CuTensorMap, tsrc::CuTensorMap, p, transformer::TensorKit.GenericTreeTransformer, α, β, backend...
+    )
+    # preallocate buffers
+    buffers = TensorKit.allocate_buffers(tdst, tsrc, transformer)
+
+    for subtransformer in transformer.data
+        # Special case without intermediate buffers whenever there is only a single block
+        if length(subtransformer[1]) == 1
+            TensorKit._add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+        else
+            cu_subtransformer = tuple(CUDA.adapt(CuArray, subtransformer[1]), subtransformer[2:end]...)
+            TensorKit._add_transform_multi!(tdst, tsrc, p, cu_subtransformer, buffers, α, β, backend...)
+        end
+    end
+    return nothing
+end
+
+function TensorKit.allocate_buffers(
+        tdst::CuTensorMap, tsrc::CuTensorMap, transformer::TensorKit.GenericTreeTransformer
+    )
+    sz = TensorKit.buffersize(transformer)
+    # force zeros to ensure the buffers are empty
+    # otherwise memory re-use can fill them with garbage data
+    return CUDA.zeros(eltype(tdst.data), sz), CUDA.zeros(eltype(tsrc.data), sz)
+end
diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl
index a7105cda6..a62429dfe 100644
--- a/src/auxiliary/auxiliary.jl
+++ b/src/auxiliary/auxiliary.jl
@@ -57,10 +57,18 @@ function _interleave(a::NTuple{N}, b::NTuple{N}) where {N}
     return (a[1], b[1], _interleave(tail(a), tail(b))...)
 end
 
+_copyto!(A::StridedView, B::StridedView) = copyto!(A, B)
+
 # Low-overhead implementation of `copyto!` for specific case of `stride(B, 1) < stride(B, 2)`
-# used in indexmanipulations: avoids the overhead of Strided.jl
-function _copyto!(A::StridedView{<:Any, 1}, B::StridedView{<:Any, 2})
-    length(A) == length(B) || throw(DimensionMismatch())
+# for CPU-hosted Arrays # used in indexmanipulations: avoids the overhead of Strided.jl
+
+@static if VERSION < v"1.11" # TODO: remove once support for v1.10 is dropped
+    const AT = Array
+else
+    const AT = Memory
+end
+function _copyto!(A::StridedView{TA, 1, <:AT}, B::StridedView{TB, 2, <:AT}) where {TA <: Number, TB <: Number}
+    length(A) == length(B) || throw(DimensionMismatch(lazy"length of A ($(length(A))) does not match length of B ($(length(B))"))
 
     Adata = parent(A)
     Astr = stride(A, 1)
diff --git a/src/tensors/abstracttensor.jl b/src/tensors/abstracttensor.jl
index 2d7239460..9293f1375 100644
--- a/src/tensors/abstracttensor.jl
+++ b/src/tensors/abstracttensor.jl
@@ -53,9 +53,11 @@ storagetype(t) = storagetype(typeof(t))
 function storagetype(::Type{T}) where {T <: AbstractTensorMap}
     if T isa Union
         # attempt to be slightly more specific by promoting unions
-        Ma = storagetype(T.a)
-        Mb = storagetype(T.b)
-        return promote_storagetype(Ma, Mb)
+        return promote_storagetype(T.a, T.b)
+    elseif eltype(T) isa Union
+        # attempt to be slightly more specific by promoting unions
+        TU = eltype(T)
+        return promote_storagetype(TU.a, TU.b)
     else
         # fallback definition by using scalartype
         return similarstoragetype(scalartype(T))
@@ -103,8 +105,9 @@ similarstoragetype(X::Type, ::Type{T}) where {T <: Number} =
 
 # implement on tensors
 similarstoragetype(::Type{TT}) where {TT <: AbstractTensorMap} = similarstoragetype(storagetype(TT))
-similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number} =
-    similarstoragetype(storagetype(TT), T)
+function similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number}
+    return similarstoragetype(storagetype(TT), T)
+end
 
 # implement on arrays
 similarstoragetype(::Type{A}) where {A <: DenseVector{<:Number}} = A
diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index 0070bc2d4..4fd68f7da 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -171,12 +171,15 @@ end
 has_shared_permute(t::BraidingTensor, ::Index2Tuple) = false
 function add_transform!(
         tdst::AbstractTensorMap,
-        tsrc::BraidingTensor, (p₁, p₂)::Index2Tuple,
+        tsrc::BraidingTensor{T, S},
+        (p₁, p₂)::Index2Tuple,
         fusiontreetransform,
         α::Number, β::Number, backend::AbstractBackend...
-    )
+    ) where {T, S}
+    tsrc_map = similar(tdst, storagetype(tdst), space(tsrc))
+    copy!(tsrc_map, tsrc)
     return add_transform!(
-        tdst, TensorMap(tsrc), (p₁, p₂), fusiontreetransform, α, β,
+        tdst, tsrc_map, (p₁, p₂), fusiontreetransform, α, β,
         backend...
     )
 end
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
index 0fad13473..4b4682763 100644
--- a/test/cuda/tensors.jl
+++ b/test/cuda/tensors.jl
@@ -98,8 +98,8 @@ for V in spacelist
                 next = @constinferred Nothing iterate(bs, state)
                 b2 = @constinferred block(t, first(blocksectors(t)))
                 @test b1 == b2
-                @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)}
-                @test_broken typeof(b1) === TensorKit.blocktype(t)
+                @test eltype(bs) === Pair{typeof(c), typeof(b1)}
+                @test typeof(b1) === TensorKit.blocktype(t)
                 @test typeof(c) === sectortype(t)
             end
         end
@@ -162,8 +162,8 @@ for V in spacelist
                 next = @constinferred Nothing iterate(bs, state)
                 b2 = @constinferred block(t', first(blocksectors(t')))
                 @test b1 == b2
-                @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)}
-                @test_broken typeof(b1) === TensorKit.blocktype(t')
+                @test eltype(bs) === Pair{typeof(c), typeof(b1)}
+                @test typeof(b1) === TensorKit.blocktype(t')
                 @test typeof(c) === sectortype(t)
                 # linear algebra
                 @test isa(@constinferred(norm(t)), real(T))
@@ -236,8 +236,8 @@ for V in spacelist
                     α = rand(T)
                     @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
                     @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
-                    @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t)
-                    @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t)
+                    @test adapt(Vector{T}, (α * t)) ≈ α * adapt(Vector{T}, t)
+                    @test adapt(Vector{T}, (t + t)) ≈ 2 * adapt(Vector{T}, t)
                 end
             end
             @timedtestset "Real and imaginary parts" begin
@@ -290,28 +290,29 @@ for V in spacelist
         @timedtestset "Permutations: test via inner product invariance" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
             t = CUDA.rand(ComplexF64, W)
+            ht = adapt(Vector{ComplexF64}, t)
             t′ = CUDA.randn!(similar(t))
+            ht′ = adapt(Vector{ComplexF64}, t′)
+            dot_htt′ = dot(ht′, ht)
+            dot_tt′ = dot(t′, t)
+            @test dot_tt′ ≈ dot_htt′
+            norm_t = norm(t)
             for k in 0:5
                 for p in permutations(1:5)
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
-                    CUDA.@allowscalar begin
-                        t2 = @constinferred permute(t, (p1, p2))
-                        t2 = permute(t, (p1, p2))
-                        @test norm(t2) ≈ norm(t)
-                        t2′ = permute(t′, (p1, p2))
-                        @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
-                    end
-                end
-
-                CUDA.@allowscalar begin
-                    t3 = @constinferred repartition(t, $k)
-                    t3 = repartition(t, k)
-                    @test norm(t3) ≈ norm(t)
-                    t3′ = @constinferred repartition!(similar(t3), t′)
-                    @test norm(t3′) ≈ norm(t′)
-                    @test dot(t′, t) ≈ dot(t3′, t3)
+                    t2 = @constinferred permute(t, (p1, p2))
+                    t2′ = permute(t′, (p1, p2))
+                    @test norm(t2) ≈ norm_t
+                    @test dot(t2′, t2) ≈ dot_tt′
+                    @test dot(transpose(t2′), transpose(t2)) ≈ dot_tt′
                 end
+                t3 = @constinferred repartition(t, $k)
+                t3 = repartition(t, k)
+                t3′ = @constinferred repartition!(similar(t3), t′)
+                @test norm(t3) ≈ norm(t)
+                @test norm(t3′) ≈ norm(t′)
+                @test dot(t′, t) ≈ dot(t3′, t3)
             end
         end
         if BraidingStyle(I) isa SymmetricBraiding
@@ -322,34 +323,35 @@ for V in spacelist
                     for p in permutations(1:5)
                         p1 = ntuple(n -> p[n], k)
                         p2 = ntuple(n -> p[k + n], 5 - k)
-                        dt2 = CUDA.@allowscalar permute(t, (p1, p2))
-                        ht2 = permute(TensorKit.to_cpu(t), (p1, p2))
-                        @test ht2 == TensorKit.to_cpu(dt2)
+                        ht2 = permute(adapt(Vector{ComplexF64}, t), (p1, p2))
+                        dt2 = permute(t, (p1, p2))
+                        @test ht2 ≈ adapt(Vector{ComplexF64}, dt2)
+                        ht3 = transpose(adapt(Vector{ComplexF64}, dt2))
+                        dt3 = transpose(dt2)
+                        hht3 = adapt(Vector{ComplexF64}, dt3)
+                        @test ht3 ≈ hht3
                     end
-
-                    dt3 = CUDA.@allowscalar repartition(t, k)
-                    ht3 = repartition(TensorKit.to_cpu(t), k)
-                    @test ht3 == TensorKit.to_cpu(dt3)
+                    dt4 = repartition(t, k)
+                    ht4 = repartition(adapt(Vector{ComplexF64}, t), k)
+                    @test ht4 == adapt(Vector{ComplexF64}, dt4)
                 end
             end
         end
         @timedtestset "Full trace: test self-consistency" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
-            CUDA.@allowscalar begin
-                t2 = permute(t, ((1, 2), (4, 3)))
-                s = @constinferred tr(t2)
-                @test conj(s) ≈ tr(t2')
-                if !isdual(V1)
-                    t2 = twist!(t2, 1)
-                end
-                if isdual(V2)
-                    t2 = twist!(t2, 2)
-                end
-                ss = tr(t2)
-                @tensor s2 = t[a, b, b, a]
-                @tensor t3[a, b] := t[a, c, c, b]
-                @tensor s3 = t3[a, a]
+            t2 = permute(t, ((1, 2), (4, 3)))
+            s = @constinferred tr(t2)
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
+            end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
             end
+            ss = tr(t2)
+            @tensor s2 = t[a, b, b, a]
+            @tensor t3[a, b] := t[a, c, c, b]
+            @tensor s3 = t3[a, a]
             @test ss ≈ s2
             @test ss ≈ s3
         end
@@ -363,24 +365,20 @@ for V in spacelist
         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Trace: test via conversion" begin
                 t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
-                CUDA.@allowscalar begin
-                    @tensor t2[a, b] := t[c, d, b, d, c, a]
-                    @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
-                end
+                @tensor t2[a, b] := t[c, d, b, d, c, a]
+                @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
                 @test t3 ≈ ad(t2)
             end
         end
         @timedtestset "Trace and contraction" begin
             t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
             t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
-            CUDA.@allowscalar begin
-                t3 = t1 ⊗ t2
-                @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
-                @tensor tb[a, b] := t3[x, y, a, y, b, x]
-            end
+            t3 = t1 ⊗ t2
+            @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+            @tensor tb[a, b] := t3[x, y, a, y, b, x]
             @test ta ≈ tb
         end
-        #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Tensor contraction: test via CPU" begin
                 dA1 = CUDA.randn(ComplexF64, V1' * V2', V3')
                 dA2 = CUDA.randn(ComplexF64, V3 * V4, V5)
@@ -395,45 +393,39 @@ for V in spacelist
                     TensorKit.to_cpu(dH)[s1, s2, t1, t2]
                 @test TensorKit.to_cpu(dHrA12) ≈ hHrA12
             end
-        end=# # doesn't yet work because of AdjointTensor
+        end
         @timedtestset "Index flipping: test flipping inverse" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
             for i in 1:4
-                CUDA.@allowscalar begin
-                    @test t ≈ flip(flip(t, i), i; inv = true)
-                    @test t ≈ flip(flip(t, i; inv = true), i)
-                end
+                @test t ≈ flip(flip(t, i), i; inv = true)
+                @test t ≈ flip(flip(t, i; inv = true), i)
             end
         end
-        #=@timedtestset "Index flipping: test via explicit flip" begin
+        @timedtestset "Index flipping: test via explicit flip" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
-            F1 = unitary(flip(V1), V1)
+            F1 = adapt(CuArray{ComplexF64}, unitary(flip(V1), V1))
 
-            CUDA.@allowscalar begin
-                @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
-                @test flip(t, 1) ≈ tf
-                @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
-                @test twist!(flip(t, 2), 2) ≈ tf
-                @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
-                @test flip(t, 3) ≈ tf
-                @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
-                @test twist!(flip(t, 4), 4) ≈ tf
-            end
+            @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
+            @test flip(t, 1) ≈ tf
+            @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
+            @test twist!(flip(t, 2), 2) ≈ tf
+            @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
+            @test flip(t, 3) ≈ tf
+            @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
+            @test twist!(flip(t, 4), 4) ≈ tf
         end
         @timedtestset "Index flipping: test via contraction" begin
             t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
             t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
-            CUDA.@allowscalar begin
-                @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
-                @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
-                @test ta ≈ tb
-                @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
-                @test ta ≈ tb
-                @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
-                @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
-                @test flip(ta, (1, 2)) ≈ tb
-            end
-        end=# # TODO
+            @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
+            @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
+            @test ta ≈ tb
+            @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
+            @test ta ≈ tb
+            @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
+            @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
+            @test flip(ta, (1, 2)) ≈ tb
+        end
         @timedtestset "Multiplication of isometries: test properties" begin
             W2 = V4 ⊗ V5
             W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1))