From 011f496062ecb1435452406d00c324b40a0e2a64 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 3 Feb 2026 02:18:46 -0500
Subject: [PATCH 1/7] Changes for non-CPU array support

Fix sources

Don't use a storagetype as input for MPS's

Remove duplicates in extension

more comments

Apply suggestions from code review

Co-authored-by: Lukas Devos <ldevos98@gmail.com>

Fix sources

Remove some extra storagetypes
---
 Project.toml                      |  3 ++
 src/algorithms/toolbox.jl         |  6 +--
 src/operators/mpohamiltonian.jl   |  1 +
 src/states/abstractmps.jl         | 12 ++---
 src/states/infinitemps.jl         | 29 +++++++-----
 src/states/multilinemps.jl        | 10 ----
 src/states/ortho.jl               | 12 +++--
 src/states/quasiparticle_state.jl |  2 +-
 src/utility/multiline.jl          | 11 +++++
 test/cuda/operators.jl            | 78 +++++++++++++++++++++++++++++++
 test/cuda/states.jl               | 52 +++++++++++++++++++++
 test/operators/mpohamiltonian.jl  |  5 ++
 test/states/infinitemps.jl        |  6 +--
 test/states/multilinemps.jl       |  2 +
 test/states/quasiparticle.jl      |  4 ++
 15 files changed, 195 insertions(+), 38 deletions(-)
 create mode 100644 test/cuda/operators.jl
 create mode 100644 test/cuda/states.jl

diff --git a/Project.toml b/Project.toml
index 51e51efdf..115c923b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -76,3 +76,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["Aqua", "Adapt", "CUDA", "cuTENSOR", "Pkg", "Test", "TestExtras", "Plots", "Combinatorics", "ParallelTestRunner", "TensorKitTensors"]
+
+[sources]
+TensorKit = {url="https://github.com/QuantumKitHub/TensorKit.jl", rev="ksh/cuda_tweaks"}
diff --git a/src/algorithms/toolbox.jl b/src/algorithms/toolbox.jl
index f24d6230a..c33238dfe 100644
--- a/src/algorithms/toolbox.jl
+++ b/src/algorithms/toolbox.jl
@@ -19,11 +19,9 @@ function entropy(spectrum::TensorKit.SectorVector{T}) where {T}
     S = zero(T)
     tol = eps(T)
     for (c, b) in pairs(spectrum)
-        s = zero(S)
-        for x in b
-            x < tol && break
+        s = sum(b; init = zero(S)) do x
             x² = x^2
-            s += x² * log(x²)
+            return x < tol ? zero(x) : x² * log(x²)
         end
         S += oftype(S, dim(c) * s)
     end
diff --git a/src/operators/mpohamiltonian.jl b/src/operators/mpohamiltonian.jl
index fe3ffdb1c..1f2e7f980 100644
--- a/src/operators/mpohamiltonian.jl
+++ b/src/operators/mpohamiltonian.jl
@@ -32,6 +32,7 @@ struct MPOHamiltonian{TO <: JordanMPOTensor, V <: AbstractVector{TO}} <: Abstrac
     W::V
 end
 OperatorStyle(::Type{<:MPOHamiltonian}) = HamiltonianStyle()
+TensorKit.storagetype(::Type{MPOHamiltonian{O, V}}) where {O, V} = storagetype(O)
 
 const FiniteMPOHamiltonian{O <: MPOTensor} = MPOHamiltonian{O, Vector{O}}
 Base.isfinite(::Type{<:FiniteMPOHamiltonian}) = true
diff --git a/src/states/abstractmps.jl b/src/states/abstractmps.jl
index a8b704127..cbc019dfb 100644
--- a/src/states/abstractmps.jl
+++ b/src/states/abstractmps.jl
@@ -33,15 +33,15 @@ Construct an `MPSTensor` with given physical and virtual spaces.
 - `right_D::Int`: right virtual dimension
 """
 function MPSTensor(
-        ::UndefInitializer, eltype, P::Union{S, CompositeSpace{S}}, Vₗ::S, Vᵣ::S = Vₗ
+        ::UndefInitializer, T, P::Union{S, CompositeSpace{S}}, Vₗ::S, Vᵣ::S = Vₗ
     ) where {S <: ElementarySpace}
-    TT = tensormaptype(S, 1 + (P isa S ? 1 : length(P)), 1, eltype)
+    TT = tensormaptype(S, 1 + (P isa S ? 1 : length(P)), 1, T)
     return TT(undef, Vₗ ⊗ P ← Vᵣ)
 end
 function MPSTensor(
-        f, eltype, P::Union{S, CompositeSpace{S}}, Vₗ::S, Vᵣ::S = Vₗ
+        f, T, P::Union{S, CompositeSpace{S}}, Vₗ::S, Vᵣ::S = Vₗ
     ) where {S <: ElementarySpace}
-    A = MPSTensor(undef, eltype, P, Vₗ, Vᵣ)
+    A = MPSTensor(undef, T, P, Vₗ, Vᵣ)
     if f === rand
         return rand!(A)
     elseif f === randn
@@ -71,7 +71,7 @@ Construct an `MPSTensor` with given physical and virtual dimensions.
 - `Dₗ::Int`: left virtual dimension
 - `Dᵣ::Int`: right virtual dimension
 """
-MPSTensor(f, eltype, d::Int, Dₗ::Int, Dᵣ::Int = Dₗ) = MPSTensor(f, eltype, ℂ^d, ℂ^Dₗ, ℂ^Dᵣ)
+MPSTensor(f, T, d::Int, Dₗ::Int, Dᵣ::Int = Dₗ) = MPSTensor(f, T, ℂ^d, ℂ^Dₗ, ℂ^Dᵣ)
 MPSTensor(d::Int, Dₗ::Int; Dᵣ::Int = Dₗ) = MPSTensor(ℂ^d, ℂ^Dₗ, ℂ^Dᵣ)
 
 """
@@ -79,7 +79,7 @@ MPSTensor(d::Int, Dₗ::Int; Dᵣ::Int = Dₗ) = MPSTensor(ℂ^d, ℂ^Dₗ, ℂ^
 
 Convert an array to an `MPSTensor`.
 """
-function MPSTensor(A::AbstractArray{T}) where {T <: Number}
+function MPSTensor(A::AbstractArray{<:Number})
     @assert ndims(A) > 2 "MPSTensor should have at least 3 dims, but has $ndims(A)"
     sz = size(A)
     V = foldl(⊗, ComplexSpace.(sz[1:(end - 1)])) ← ℂ^sz[end]
diff --git a/src/states/infinitemps.jl b/src/states/infinitemps.jl
index 1cd07bab4..33636e53d 100644
--- a/src/states/infinitemps.jl
+++ b/src/states/infinitemps.jl
@@ -116,35 +116,42 @@ function InfiniteMPS(
         convert(PeriodicVector{B}, C), convert(PeriodicVector{A}, AC)
     )
 end
-
+function InfiniteMPS(
+        T::Type,
+        pspaces::AbstractVector{S}, Dspaces::AbstractVector{S};
+        kwargs...
+    ) where {S <: IndexSpace}
+    return InfiniteMPS(MPSTensor.(rand, T, pspaces, circshift(Dspaces, 1), Dspaces); kwargs...)
+end
 function InfiniteMPS(
         pspaces::AbstractVector{S}, Dspaces::AbstractVector{S};
         kwargs...
     ) where {S <: IndexSpace}
-    return InfiniteMPS(MPSTensor.(pspaces, circshift(Dspaces, 1), Dspaces); kwargs...)
+    return InfiniteMPS(MPSTensor.(rand, ComplexF64, pspaces, circshift(Dspaces, 1), Dspaces); kwargs...)
 end
 function InfiniteMPS(
-        f, elt::Type{<:Number}, pspaces::AbstractVector{S}, Dspaces::AbstractVector{S};
+        f, T::Type, pspaces::AbstractVector{S}, Dspaces::AbstractVector{S};
         kwargs...
     ) where {S <: IndexSpace}
     return InfiniteMPS(
-        MPSTensor.(f, elt, pspaces, circshift(Dspaces, 1), Dspaces);
+        MPSTensor.(f, T, pspaces, circshift(Dspaces, 1), Dspaces);
         kwargs...
     )
 end
-InfiniteMPS(d::S, D::S) where {S <: Union{Int, <:IndexSpace}} = InfiniteMPS([d], [D])
+InfiniteMPS(T::Type, d::S, D::S; kwargs...) where {S <: Union{Int, <:IndexSpace}} = InfiniteMPS(T, [d], [D]; kwargs...)
+InfiniteMPS(d::S, D::S; kwargs...) where {S <: Union{Int, <:IndexSpace}} = InfiniteMPS([d], [D]; kwargs...)
 function InfiniteMPS(
-        f, elt::Type{<:Number}, d::S, D::S
+        f, T::Type, d::S, D::S; kwargs...
     ) where {S <: Union{Int, <:IndexSpace}}
-    return InfiniteMPS(f, elt, [d], [D])
+    return InfiniteMPS(f, T, [d], [D]; kwargs...)
 end
-function InfiniteMPS(ds::AbstractVector{Int}, Ds::AbstractVector{Int})
-    return InfiniteMPS(ComplexSpace.(ds), ComplexSpace.(Ds))
+function InfiniteMPS(ds::AbstractVector{Int}, Ds::AbstractVector{Int}; kwargs...)
+    return InfiniteMPS(ComplexSpace.(ds), ComplexSpace.(Ds); kwargs...)
 end
 function InfiniteMPS(
-        f, elt::Type{<:Number}, ds::AbstractVector{Int}, Ds::AbstractVector{Int}, kwargs...
+        f, T::Type, ds::AbstractVector{Int}, Ds::AbstractVector{Int}, kwargs...
     )
-    return InfiniteMPS(f, elt, ComplexSpace.(ds), ComplexSpace.(Ds); kwargs...)
+    return InfiniteMPS(f, T, ComplexSpace.(ds), ComplexSpace.(Ds); kwargs...)
 end
 
 function InfiniteMPS(A::AbstractVector{<:GenericMPSTensor}; kwargs...)
diff --git a/src/states/multilinemps.jl b/src/states/multilinemps.jl
index 6a66437b4..a090b67cd 100644
--- a/src/states/multilinemps.jl
+++ b/src/states/multilinemps.jl
@@ -81,16 +81,6 @@ for f in (:r_RR, :r_RL, :r_LR, :r_LL)
     @eval $f(t::MultilineMPS, i, j = size(t, 2)) = $f(t[i], j)
 end
 
-site_type(::Type{Multiline{S}}) where {S} = site_type(S)
-bond_type(::Type{Multiline{S}}) where {S} = bond_type(S)
-site_type(st::Multiline) = site_type(typeof(st))
-bond_type(st::Multiline) = bond_type(typeof(st))
-VectorInterface.scalartype(::Multiline{T}) where {T} = scalartype(T)
-TensorKit.sectortype(t::Multiline) = sectortype(typeof(t))
-TensorKit.sectortype(::Type{Multiline{T}}) where {T} = sectortype(T)
-TensorKit.spacetype(t::Multiline) = spacetype(typeof(t))
-TensorKit.spacetype(::Type{Multiline{T}}) where {T} = spacetype(T)
-
 function TensorKit.dot(a::MultilineMPS, b::MultilineMPS; kwargs...)
     return sum(dot.(parent(a), parent(b); kwargs...))
 end
diff --git a/src/states/ortho.jl b/src/states/ortho.jl
index 22a8685fc..a8158a325 100644
--- a/src/states/ortho.jl
+++ b/src/states/ortho.jl
@@ -21,7 +21,7 @@ $(TYPEDFIELDS)
     verbosity::Int = VERBOSE_WARN
 
     "algorithm used for orthogonalization of the tensors"
-    alg_orth = LAPACK_HouseholderQR(; positive = true)
+    alg_orth = Defaults.alg_qr()
     "algorithm used for the eigensolver"
     alg_eigsolve = _GAUGE_ALG_EIGSOLVE
     "minimal amount of iterations before using the eigensolver steps"
@@ -46,7 +46,7 @@ $(TYPEDFIELDS)
     verbosity::Int = VERBOSE_WARN
 
     "algorithm used for orthogonalization of the tensors"
-    alg_orth = LAPACK_HouseholderLQ(; positive = true)
+    alg_orth = Defaults.alg_lq()
     "algorithm used for the eigensolver"
     alg_eigsolve = _GAUGE_ALG_EIGSOLVE
     "minimal amount of iterations before using the eigensolver steps"
@@ -73,16 +73,22 @@ end
 
 function MixedCanonical(;
         tol::Real = Defaults.tolgauge, maxiter::Int = Defaults.maxiter,
-        verbosity::Int = VERBOSE_WARN, alg_orth = LAPACK_HouseholderQR(; positive = true),
+        verbosity::Int = VERBOSE_WARN, alg_orth = Defaults.alg_qr(),
         alg_eigsolve = _GAUGE_ALG_EIGSOLVE,
         eig_miniter::Int = 10, order::Symbol = :LR
     )
     if alg_orth isa LAPACK_HouseholderQR
         alg_leftorth = alg_orth
         alg_rightorth = LAPACK_HouseholderLQ(; alg_orth.kwargs...)
+    elseif alg_orth isa CUSOLVER_HouseholderQR
+        alg_leftorth = alg_orth
+        alg_rightorth = LQViaTransposedQR(CUSOLVER_HouseholderQR(; alg_orth.kwargs...))
     elseif alg_orth isa LAPACK_HouseholderLQ
         alg_leftorth = LAPACK_HouseholderQR(; alg_orth.kwargs...)
         alg_rightorth = alg_orth
+    elseif alg_orth isa LQViaTransposedQR
+        alg_leftorth = alg_orth
+        alg_rightorth = alg_orth.qr_alg
     else
         alg_leftorth = alg_rightorth = alg_orth
     end
diff --git a/src/states/quasiparticle_state.jl b/src/states/quasiparticle_state.jl
index 7e9d78e46..83ef7553c 100644
--- a/src/states/quasiparticle_state.jl
+++ b/src/states/quasiparticle_state.jl
@@ -1,7 +1,7 @@
 #=
 Should not be constructed by the user - acts like a vector (used in eigsolve)
 I think it makes sense to see these things as an actual state instead of return an array of B tensors (what we used to do)
-This will allow us to plot energy density (finite qp) and measure observeables.
+This will allow us to plot energy density (finite qp) and measure observables.
 =#
 
 struct LeftGaugedQP{S, T1, T2, E <: Number}
diff --git a/src/utility/multiline.jl b/src/utility/multiline.jl
index ebcff438f..53b02cfc6 100644
--- a/src/utility/multiline.jl
+++ b/src/utility/multiline.jl
@@ -106,3 +106,14 @@ function VectorInterface.inner(x::Multiline, y::Multiline)
 end
 
 LinearAlgebra.norm(x::Multiline) = sqrt(real(inner(x, x)))
+
+# TensorKit
+#----------
+
+site_type(::Type{Multiline{S}}) where {S} = site_type(S)
+bond_type(::Type{Multiline{S}}) where {S} = bond_type(S)
+site_type(st::Multiline) = site_type(typeof(st))
+bond_type(st::Multiline) = bond_type(typeof(st))
+TensorKit.sectortype(::Type{Multiline{T}}) where {T} = sectortype(T)
+TensorKit.spacetype(::Type{Multiline{T}}) where {T} = spacetype(T)
+TensorKit.storagetype(::Type{Multiline{T}}) where {T} = storagetype(T)
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
new file mode 100644
index 000000000..02334797b
--- /dev/null
+++ b/test/cuda/operators.jl
@@ -0,0 +1,78 @@
+using .TestSetup
+using Test, TestExtras
+using MPSKit
+using MPSKit: GeometryStyle, FiniteChainStyle, InfiniteChainStyle, OperatorStyle, MPOStyle
+using TensorKit
+using TensorKit: ℙ, tensormaptype, TensorMapWithStorage
+using Adapt, CUDA, cuTENSOR
+
+@testset "CuFiniteMPO" for V in (ℂ^2, U1Space(0 => 1, 1 => 1))
+    # start from random operators
+    L = 4
+    T = ComplexF64
+
+    O₁ = rand(T, V^L, V^L)
+    O₂ = rand(T, space(O₁))
+    O₃ = rand(real(T), space(O₁))
+
+    mpo₁ = adapt(CuArray, FiniteMPO(O₁))
+    mpo₂ = adapt(CuArray, FiniteMPO(O₂))
+    mpo₃ = adapt(CuArray, FiniteMPO(O₃))
+
+    @test isfinite(mpo₁)
+    @test isfinite(typeof(mpo₁))
+    @test GeometryStyle(typeof(mpo₁)) == FiniteChainStyle()
+    @test GeometryStyle(mpo₁) == FiniteChainStyle()
+    @test OperatorStyle(typeof(mpo₁)) == MPOStyle()
+
+    @test @constinferred physicalspace(mpo₁) == fill(V, L)
+    Vleft = @constinferred left_virtualspace(mpo₁)
+    Vright = @constinferred right_virtualspace(mpo₂)
+    for i in 1:L
+        @test Vleft[i] == left_virtualspace(mpo₁, i)
+        @test Vright[i] == right_virtualspace(mpo₁, i)
+    end
+
+    TM = TensorMapWithStorage{T, CuVector{T, CUDA.DeviceMemory}}
+    #@test convert(TM, mpo₁) ≈ O₁
+    #@test convert(TM, -mpo₂) ≈ -O₂
+    #@test convert(TM, @constinferred complex(mpo₃)) ≈ complex(O₃)
+
+
+    # test scalar multiplication
+    α = rand(T)
+    #@test convert(TM, α * mpo₁) ≈ α * O₁
+    #@test convert(TM, mpo₁ * α) ≈ O₁ * α
+    @test α * mpo₃ ≈ α * complex(mpo₃) atol = 1.0e-6
+
+    # test addition and multiplication
+    #@test convert(TM, mpo₁ + mpo₂) ≈ O₁ + O₂
+    #@test convert(TM, mpo₁ + mpo₃) ≈ O₁ + O₃
+    #@test convert(TM, mpo₁ * mpo₂) ≈ O₁ * O₂
+    #@test convert(TM, mpo₁ * mpo₃) ≈ O₁ * O₃
+
+    # test application to a state
+    ψ₁ = adapt(CuArray, rand(T, domain(O₁)))
+    #ψ₂ = adapt(CuArray, rand(real(T), domain(O₂))) # not allowed due to cuTENSOR
+    mps₁ = adapt(CuArray, FiniteMPS(ψ₁))
+    #mps₂ = adapt(CuArray, FiniteMPS(ψ₂))
+
+    @test @constinferred GeometryStyle(mps₁, mpo₁, mps₁) == GeometryStyle(mps₁)
+
+    #@test convert(TM, mpo₁ * mps₁) ≈ O₁ * ψ₁
+    @test mpo₁ * ψ₁ ≈ O₁ * ψ₁
+    #@test convert(TM, mpo₃ * mps₁) ≈ O₃ * ψ₁
+    @test mpo₃ * ψ₁ ≈ O₃ * ψ₁
+    #@test convert(TM, mpo₁ * mps₂) ≈ O₁ * ψ₂
+    #@test mpo₁ * ψ₂ ≈ O₁ * ψ₂
+
+    @test dot(mps₁, mpo₁, mps₁) ≈ dot(ψ₁, O₁, ψ₁)
+    @test dot(mps₁, mpo₁, mps₁) ≈ dot(mps₁, mpo₁ * mps₁)
+    # test conversion to and from mps
+    mpomps₁ = convert(FiniteMPS, mpo₁)
+    mpompsmpo₁ = convert(FiniteMPO, mpomps₁)
+
+    @test convert(FiniteMPO, mpomps₁) ≈ mpo₁ rtol = 1.0e-6
+
+    @test dot(mpomps₁, mpomps₁) ≈ dot(mpo₁, mpo₁)
+end
diff --git a/test/cuda/states.jl b/test/cuda/states.jl
new file mode 100644
index 000000000..3722a2be1
--- /dev/null
+++ b/test/cuda/states.jl
@@ -0,0 +1,52 @@
+using MPSKit
+using MPSKit: _transpose_front, _transpose_tail
+using MPSKit: GeometryStyle, InfiniteChainStyle, TransferMatrix
+using TensorKit
+using TensorKit: ℙ
+using Adapt, CUDA
+
+@testset "CuMPS ($(sectortype(D)), $elt)" for (D, d, elt) in
+    [(ℙ^10, ℙ^2, ComplexF64), (Rep[U₁](1 => 3), Rep[U₁](0 => 1), ComplexF64)]
+    tol = Float64(eps(real(elt)) * 100)
+
+    ψ = adapt(CuArray, InfiniteMPS([rand(elt, D * d, D), rand(elt, D * d, D)]; tol))
+    @test TensorKit.storagetype(ψ) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test eltype(ψ) == eltype(typeof(ψ))
+
+    #=for i in 1:length(ψ)
+        @plansor difference[-1 -2; -3] := ψ.AL[i][-1 -2; 1] * ψ.C[i][1; -3] -
+            ψ.C[i - 1][-1; 1] * ψ.AR[i][1 -2; -3]
+        @test norm(difference, Inf) < tol * 10
+
+        @test l_LL(ψ, i) * TransferMatrix(ψ.AL[i], ψ.AL[i]) ≈ l_LL(ψ, i + 1)
+        @test l_LR(ψ, i) * TransferMatrix(ψ.AL[i], ψ.AR[i]) ≈ l_LR(ψ, i + 1)
+        @test l_RL(ψ, i) * TransferMatrix(ψ.AR[i], ψ.AL[i]) ≈ l_RL(ψ, i + 1)
+        @test l_RR(ψ, i) * TransferMatrix(ψ.AR[i], ψ.AR[i]) ≈ l_RR(ψ, i + 1)
+
+        @test TransferMatrix(ψ.AL[i], ψ.AL[i]) * r_LL(ψ, i) ≈ r_LL(ψ, i + 1)
+        @test TransferMatrix(ψ.AL[i], ψ.AR[i]) * r_LR(ψ, i) ≈ r_LR(ψ, i + 1)
+        @test TransferMatrix(ψ.AR[i], ψ.AL[i]) * r_RL(ψ, i) ≈ r_RL(ψ, i + 1)
+        @test TransferMatrix(ψ.AR[i], ψ.AR[i]) * r_RR(ψ, i) ≈ r_RR(ψ, i + 1)
+    end=# # TODO
+
+    L = rand(3:20)
+    ψ = adapt(CuArray, FiniteMPS(rand, elt, L, d, D))
+    @test TensorKit.storagetype(ψ) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test eltype(ψ) == eltype(typeof(ψ))
+    #=ovl = dot(ψ, ψ)
+
+    @test ovl ≈ norm(ψ.AC[1])^2
+
+    for i in 1:length(ψ)
+        @test ψ.AC[i] ≈ ψ.AL[i] * ψ.C[i]
+        #@test ψ.AC[i] ≈ _transpose_front(ψ.C[i - 1] * _transpose_tail(ψ.AR[i])) # TODO
+    end
+
+    @test ComplexF64 == scalartype(ψ)
+    ψ = ψ * 3
+    @test ovl * 9 ≈ norm(ψ)^2
+    ψ = 3 * ψ
+    @test ovl * 9 * 9 ≈ norm(ψ)^2
+
+    @test norm(2 * ψ + ψ - 3 * ψ) ≈ 0.0 atol = sqrt(eps(real(ComplexF64)))=# # TODO
+end
diff --git a/test/operators/mpohamiltonian.jl b/test/operators/mpohamiltonian.jl
index e199df30f..a6d0003bf 100644
--- a/test/operators/mpohamiltonian.jl
+++ b/test/operators/mpohamiltonian.jl
@@ -50,6 +50,8 @@ vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5
     @test OperatorStyle(typeof(H)) == HamiltonianStyle()
     @test OperatorStyle(H) == HamiltonianStyle()
     @test OperatorStyle(H, H′) == OperatorStyle(H)
+    @test TensorKit.storagetype(H) == Vector{T}
+    @test TensorKit.storagetype(typeof(H)) == Vector{T}
 
     # Infinite
     Ws = [Wmid]
@@ -68,6 +70,8 @@ vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5
     @test GeometryStyle(H) == InfiniteChainStyle()
     @test OperatorStyle(typeof(H)) == HamiltonianStyle()
     @test OperatorStyle(H) == HamiltonianStyle()
+    @test TensorKit.storagetype(H′) == Vector{T}
+    @test TensorKit.storagetype(typeof(H′)) == Vector{T}
 end
 
 @testset "Finite MPOHamiltonian" begin
@@ -278,6 +282,7 @@ end
         @test H4 isa InfiniteMPOHamiltonian
         @test scalartype(H4) == T
         @test storagetype(H4) == Vector{T}
+        @test storagetype(typeof(H4)) == Vector{T}
         @test expectation_value(mps2, H3) ≈ expectation_value(mps2, H4)
     end
 end
diff --git a/test/states/infinitemps.jl b/test/states/infinitemps.jl
index d701d8cb3..a704edd47 100644
--- a/test/states/infinitemps.jl
+++ b/test/states/infinitemps.jl
@@ -48,7 +48,7 @@ using Adapt
 end
 
 @testset "InfiniteMPS copying" begin
-    mps1 = InfiniteMPS(rand, ComplexF64, ℂ^2, ℂ^5)
+    mps1 = InfiniteMPS(rand, ComplexF64, [2], [5])
     mps2 = copy(mps1)
 
     @test mps1 !== mps2
@@ -74,7 +74,7 @@ end
 
 @testset "Adapt" begin
     for (d, D) in [(ℂ^2, ℂ^4), (ℙ^2, ℙ^4)]
-        mps1 = InfiniteMPS(rand, Float32, d, D)
+        mps1 = InfiniteMPS(Float32, d, D)
         for T in (Float64, ComplexF64)
             mps2 = @testinferred adapt(Vector{T}, mps1)
             @test mps2 isa InfiniteMPS
@@ -108,7 +108,7 @@ end
 end
 
 @testset "InfiniteMPS copying" begin
-    mps1 = InfiniteMPS(rand, ComplexF64, ℂ^2, ℂ^5)
+    mps1 = InfiniteMPS(2, 5)
     mps2 = copy(mps1)
 
     @test mps1 !== mps2
diff --git a/test/states/multilinemps.jl b/test/states/multilinemps.jl
index cf0d04316..e9fb8a569 100644
--- a/test/states/multilinemps.jl
+++ b/test/states/multilinemps.jl
@@ -23,6 +23,8 @@ using TensorKit: ℙ
 
     @test GeometryStyle(typeof(ψ)) == InfiniteChainStyle()
     @test GeometryStyle(ψ) == InfiniteChainStyle()
+    @test TensorKit.storagetype(ψ) == Vector{elt}
+    @test TensorKit.sectortype(ψ) == sectortype(D)
 
     @test !isfinite(typeof(ψ))
 
diff --git a/test/states/quasiparticle.jl b/test/states/quasiparticle.jl
index 5ce88975c..08362ccf3 100644
--- a/test/states/quasiparticle.jl
+++ b/test/states/quasiparticle.jl
@@ -24,6 +24,10 @@ using TensorKit: ℙ
         #rand_quasiparticle is a private non-exported function
         ϕ₁ = LeftGaugedQP(rand, ψ)
         ϕ₂ = LeftGaugedQP(rand, ψ)
+        @test TensorKit.storagetype(ϕ₁) == TensorKit.storagetype(ψ)
+        @test TensorKit.storagetype(typeof(ϕ₁)) == TensorKit.storagetype(ψ)
+        @test TensorKit.storagetype(ϕ₂) == TensorKit.storagetype(ψ)
+        @test TensorKit.storagetype(typeof(ϕ₂)) == TensorKit.storagetype(ψ)
 
         @test GeometryStyle(ϕ₁) == FiniteChainStyle()
         @test GeometryStyle(typeof(ϕ₂)) == FiniteChainStyle()

From 2c66082cfe410c9a0feea8c13bca751f3debc3a0 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 19 Mar 2026 10:43:51 -0400
Subject: [PATCH 2/7] Set algo

---
 test/cuda/operators.jl | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index 02334797b..0b954b818 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -3,9 +3,12 @@ using Test, TestExtras
 using MPSKit
 using MPSKit: GeometryStyle, FiniteChainStyle, InfiniteChainStyle, OperatorStyle, MPOStyle
 using TensorKit
+using MatrixAlgebraKit
 using TensorKit: ℙ, tensormaptype, TensorMapWithStorage
 using Adapt, CUDA, cuTENSOR
 
+MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
+
 @testset "CuFiniteMPO" for V in (ℂ^2, U1Space(0 => 1, 1 => 1))
     # start from random operators
     L = 4
@@ -15,15 +18,18 @@ using Adapt, CUDA, cuTENSOR
     O₂ = rand(T, space(O₁))
     O₃ = rand(real(T), space(O₁))
 
-    mpo₁ = adapt(CuArray, FiniteMPO(O₁))
-    mpo₂ = adapt(CuArray, FiniteMPO(O₂))
-    mpo₃ = adapt(CuArray, FiniteMPO(O₃))
+    mpo₁ = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPO(O₁))
+    mpo₂ = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPO(O₂))
+    mpo₃ = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPO(O₃))
 
     @test isfinite(mpo₁)
     @test isfinite(typeof(mpo₁))
     @test GeometryStyle(typeof(mpo₁)) == FiniteChainStyle()
     @test GeometryStyle(mpo₁) == FiniteChainStyle()
     @test OperatorStyle(typeof(mpo₁)) == MPOStyle()
+    @test TensorKit.storagetype(mpo₁) == CuVector{T, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(mpo₂) == CuVector{T, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(mpo₃) == CuVector{T, CUDA.DeviceMemory}
 
     @test @constinferred physicalspace(mpo₁) == fill(V, L)
     Vleft = @constinferred left_virtualspace(mpo₁)

From 3ad5f044639861de9a815f5e473e8287398ee8ab Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 19 Mar 2026 11:46:18 -0400
Subject: [PATCH 3/7] More dumb fixes

---
 test/cuda/operators.jl | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index 0b954b818..c843c6169 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -17,6 +17,10 @@ MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
     O₁ = rand(T, V^L, V^L)
     O₂ = rand(T, space(O₁))
     O₃ = rand(real(T), space(O₁))
+    
+    dO₁ = adapt(CuArray, O₁)
+    dO₂ = adapt(CuArray, O₂)
+    dO₃ = adapt(CuArray, O₃)
 
     mpo₁ = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPO(O₁))
     mpo₂ = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPO(O₂))
@@ -65,14 +69,14 @@ MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
 
     @test @constinferred GeometryStyle(mps₁, mpo₁, mps₁) == GeometryStyle(mps₁)
 
-    #@test convert(TM, mpo₁ * mps₁) ≈ O₁ * ψ₁
-    @test mpo₁ * ψ₁ ≈ O₁ * ψ₁
-    #@test convert(TM, mpo₃ * mps₁) ≈ O₃ * ψ₁
-    @test mpo₃ * ψ₁ ≈ O₃ * ψ₁
-    #@test convert(TM, mpo₁ * mps₂) ≈ O₁ * ψ₂
-    #@test mpo₁ * ψ₂ ≈ O₁ * ψ₂
+    #@test convert(TM, mpo₁ * mps₁) ≈ dO₁ * ψ₁
+    @test mpo₁ * ψ₁ ≈ dO₁ * ψ₁
+    #@test convert(TM, mpo₃ * mps₁) ≈ dO₃ * ψ₁
+    @test mpo₃ * ψ₁ ≈ dO₃ * ψ₁
+    #@test convert(TM, mpo₁ * mps₂) ≈ dO₁ * ψ₂
+    #@test mpo₁ * ψ₂ ≈ dO₁ * ψ₂
 
-    @test dot(mps₁, mpo₁, mps₁) ≈ dot(ψ₁, O₁, ψ₁)
+    @test dot(mps₁, mpo₁, mps₁) ≈ dot(ψ₁, dO₁, ψ₁)
     @test dot(mps₁, mpo₁, mps₁) ≈ dot(mps₁, mpo₁ * mps₁)
     # test conversion to and from mps
     mpomps₁ = convert(FiniteMPS, mpo₁)

From ee113fa555abf34011cc74545694a47405898e1d Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 19 Mar 2026 13:23:38 -0400
Subject: [PATCH 4/7] More stuff working

---
 src/operators/abstractmpo.jl |  4 ++--
 src/utility/utility.jl       |  4 ++--
 test/cuda/operators.jl       | 27 +++++++++++++--------------
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/operators/abstractmpo.jl b/src/operators/abstractmpo.jl
index 2ad30033b..2e3d3ffda 100644
--- a/src/operators/abstractmpo.jl
+++ b/src/operators/abstractmpo.jl
@@ -151,13 +151,13 @@ end
 Compute the mpo tensor that arises from multiplying MPOs.
 """
 function fuse_mul_mpo(O1, O2)
-    T = promote_type(scalartype(O1), scalartype(O2))
+    T = TensorKit.promote_storagetype(storagetype(O1), storagetype(O2))
     F_left = fuser(T, left_virtualspace(O2), left_virtualspace(O1))
     F_right = fuser(T, right_virtualspace(O2), right_virtualspace(O1))
     return _fuse_mpo_mpo(O1, O2, F_left, F_right)
 end
 function fuse_mul_mpo(O1::BraidingTensor, O2::BraidingTensor)
-    T = promote_type(scalartype(O1), scalartype(O2))
+    T = TensorKit.promote_storagetype(storagetype(O1), storagetype(O2))
     V = fuse(left_virtualspace(O2) ⊗ left_virtualspace(O1)) ⊗ physicalspace(O1) ←
         physicalspace(O2) ⊗ fuse(right_virtualspace(O2) ⊗ right_virtualspace(O1))
     return BraidingTensor{T}(V)
diff --git a/src/utility/utility.jl b/src/utility/utility.jl
index f5841b7e2..973f37889 100644
--- a/src/utility/utility.jl
+++ b/src/utility/utility.jl
@@ -123,8 +123,8 @@ function check_length(a, b...)
     return L
 end
 
-function fuser(::Type{T}, V1::S, V2::S) where {T, S <: IndexSpace}
-    return isomorphism(T, fuse(V1 ⊗ V2), V1 ⊗ V2)
+function fuser(::Type{TorA}, V1::S, V2::S) where {TorA, S <: IndexSpace}
+    return isomorphism(TorA, fuse(V1 ⊗ V2), V1 ⊗ V2)
 end
 
 """
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index c843c6169..6fc4d4b7b 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -17,7 +17,7 @@ MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
     O₁ = rand(T, V^L, V^L)
     O₂ = rand(T, space(O₁))
     O₃ = rand(real(T), space(O₁))
-    
+
     dO₁ = adapt(CuArray, O₁)
     dO₂ = adapt(CuArray, O₂)
     dO₃ = adapt(CuArray, O₃)
@@ -43,23 +43,22 @@ MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
         @test Vright[i] == right_virtualspace(mpo₁, i)
     end
 
-    TM = TensorMapWithStorage{T, CuVector{T, CUDA.DeviceMemory}}
-    #@test convert(TM, mpo₁) ≈ O₁
-    #@test convert(TM, -mpo₂) ≈ -O₂
-    #@test convert(TM, @constinferred complex(mpo₃)) ≈ complex(O₃)
-
+    TM = TensorMap
+    @test convert(TM, mpo₁) ≈ dO₁
+    @test convert(TM, -mpo₂) ≈ -dO₂
+    @test convert(TM, @constinferred complex(mpo₃)) ≈ complex(dO₃)
 
     # test scalar multiplication
     α = rand(T)
-    #@test convert(TM, α * mpo₁) ≈ α * O₁
-    #@test convert(TM, mpo₁ * α) ≈ O₁ * α
+    @test convert(TM, α * mpo₁) ≈ α * dO₁
+    @test convert(TM, mpo₁ * α) ≈ dO₁ * α
     @test α * mpo₃ ≈ α * complex(mpo₃) atol = 1.0e-6
 
     # test addition and multiplication
-    #@test convert(TM, mpo₁ + mpo₂) ≈ O₁ + O₂
-    #@test convert(TM, mpo₁ + mpo₃) ≈ O₁ + O₃
-    #@test convert(TM, mpo₁ * mpo₂) ≈ O₁ * O₂
-    #@test convert(TM, mpo₁ * mpo₃) ≈ O₁ * O₃
+    @test convert(TM, mpo₁ + mpo₂) ≈ dO₁ + dO₂
+    @test convert(TM, mpo₁ + mpo₃) ≈ dO₁ + dO₃
+    @test convert(TM, mpo₁ * mpo₂) ≈ dO₁ * dO₂
+    @test convert(TM, mpo₁ * mpo₃) ≈ dO₁ * dO₃
 
     # test application to a state
     ψ₁ = adapt(CuArray, rand(T, domain(O₁)))
@@ -69,9 +68,9 @@ MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
 
     @test @constinferred GeometryStyle(mps₁, mpo₁, mps₁) == GeometryStyle(mps₁)
 
-    #@test convert(TM, mpo₁ * mps₁) ≈ dO₁ * ψ₁
+    @test convert(TM, mpo₁ * mps₁) ≈ dO₁ * ψ₁
     @test mpo₁ * ψ₁ ≈ dO₁ * ψ₁
-    #@test convert(TM, mpo₃ * mps₁) ≈ dO₃ * ψ₁
+    @test convert(TM, mpo₃ * mps₁) ≈ dO₃ * ψ₁
     @test mpo₃ * ψ₁ ≈ dO₃ * ψ₁
     #@test convert(TM, mpo₁ * mps₂) ≈ dO₁ * ψ₂
     #@test mpo₁ * ψ₂ ≈ dO₁ * ψ₂

From 6790f37d4550e7a70381018946a15b23f469d641 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Fri, 20 Mar 2026 04:17:58 -0400
Subject: [PATCH 5/7] Another fix and no sources needed

---
 Project.toml                 | 3 ---
 src/operators/abstractmpo.jl | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index 115c923b6..51e51efdf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -76,6 +76,3 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["Aqua", "Adapt", "CUDA", "cuTENSOR", "Pkg", "Test", "TestExtras", "Plots", "Combinatorics", "ParallelTestRunner", "TensorKitTensors"]
-
-[sources]
-TensorKit = {url="https://github.com/QuantumKitHub/TensorKit.jl", rev="ksh/cuda_tweaks"}
diff --git a/src/operators/abstractmpo.jl b/src/operators/abstractmpo.jl
index 2e3d3ffda..5988027e2 100644
--- a/src/operators/abstractmpo.jl
+++ b/src/operators/abstractmpo.jl
@@ -157,7 +157,7 @@ function fuse_mul_mpo(O1, O2)
     return _fuse_mpo_mpo(O1, O2, F_left, F_right)
 end
 function fuse_mul_mpo(O1::BraidingTensor, O2::BraidingTensor)
-    T = TensorKit.promote_storagetype(storagetype(O1), storagetype(O2))
+    T = promote_type(scalartype(O1), scalartype(O2))
     V = fuse(left_virtualspace(O2) ⊗ left_virtualspace(O1)) ⊗ physicalspace(O1) ←
         physicalspace(O2) ⊗ fuse(right_virtualspace(O2) ⊗ right_virtualspace(O1))
     return BraidingTensor{T}(V)

From b8169a93140add3546b5e446898b9f91c92835e3 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <kslimes@gmail.com>
Date: Fri, 20 Mar 2026 11:29:03 +0100
Subject: [PATCH 6/7] One more try

---
 src/operators/abstractmpo.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operators/abstractmpo.jl b/src/operators/abstractmpo.jl
index 5988027e2..54ecf0358 100644
--- a/src/operators/abstractmpo.jl
+++ b/src/operators/abstractmpo.jl
@@ -151,7 +151,8 @@ end
 Compute the mpo tensor that arises from multiplying MPOs.
 """
 function fuse_mul_mpo(O1, O2)
-    T = TensorKit.promote_storagetype(storagetype(O1), storagetype(O2))
+    TT = promote_type(scalartype(O1), scalartype(O2))
+    T = TensorKit.similarstoragetype(storagetype(O1), TT)
     F_left = fuser(T, left_virtualspace(O2), left_virtualspace(O1))
     F_right = fuser(T, right_virtualspace(O2), right_virtualspace(O1))
     return _fuse_mpo_mpo(O1, O2, F_left, F_right)

From e428a7e5f76ddec821d3760d7a43789ad479550e Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Fri, 20 Mar 2026 15:09:27 -0400
Subject: [PATCH 7/7] Enable more tests

---
 test/cuda/operators.jl |  2 ++
 test/cuda/states.jl    | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index 6fc4d4b7b..b3f03796c 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -7,6 +7,8 @@ using MatrixAlgebraKit
 using TensorKit: ℙ, tensormaptype, TensorMapWithStorage
 using Adapt, CUDA, cuTENSOR
 
+# TODO revisit this once https://github.com/QuantumKitHub/MatrixAlgebraKit.jl/issues/176
+# is resolved
 MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
 
 @testset "CuFiniteMPO" for V in (ℂ^2, U1Space(0 => 1, 1 => 1))
diff --git a/test/cuda/states.jl b/test/cuda/states.jl
index 3722a2be1..39627ac41 100644
--- a/test/cuda/states.jl
+++ b/test/cuda/states.jl
@@ -3,7 +3,7 @@ using MPSKit: _transpose_front, _transpose_tail
 using MPSKit: GeometryStyle, InfiniteChainStyle, TransferMatrix
 using TensorKit
 using TensorKit: ℙ
-using Adapt, CUDA
+using Adapt, CUDA, cuTENSOR
 
 @testset "CuMPS ($(sectortype(D)), $elt)" for (D, d, elt) in
     [(ℙ^10, ℙ^2, ComplexF64), (Rep[U₁](1 => 3), Rep[U₁](0 => 1), ComplexF64)]
@@ -13,7 +13,7 @@ using Adapt, CUDA
     @test TensorKit.storagetype(ψ) == CuVector{ComplexF64, CUDA.DeviceMemory}
     @test eltype(ψ) == eltype(typeof(ψ))
 
-    #=for i in 1:length(ψ)
+    for i in 1:length(ψ)
         @plansor difference[-1 -2; -3] := ψ.AL[i][-1 -2; 1] * ψ.C[i][1; -3] -
             ψ.C[i - 1][-1; 1] * ψ.AR[i][1 -2; -3]
         @test norm(difference, Inf) < tol * 10
@@ -27,19 +27,19 @@ using Adapt, CUDA
         @test TransferMatrix(ψ.AL[i], ψ.AR[i]) * r_LR(ψ, i) ≈ r_LR(ψ, i + 1)
         @test TransferMatrix(ψ.AR[i], ψ.AL[i]) * r_RL(ψ, i) ≈ r_RL(ψ, i + 1)
         @test TransferMatrix(ψ.AR[i], ψ.AR[i]) * r_RR(ψ, i) ≈ r_RR(ψ, i + 1)
-    end=# # TODO
+    end
 
     L = rand(3:20)
     ψ = adapt(CuArray, FiniteMPS(rand, elt, L, d, D))
     @test TensorKit.storagetype(ψ) == CuVector{ComplexF64, CUDA.DeviceMemory}
     @test eltype(ψ) == eltype(typeof(ψ))
-    #=ovl = dot(ψ, ψ)
+    ovl = dot(ψ, ψ)
 
     @test ovl ≈ norm(ψ.AC[1])^2
 
     for i in 1:length(ψ)
         @test ψ.AC[i] ≈ ψ.AL[i] * ψ.C[i]
-        #@test ψ.AC[i] ≈ _transpose_front(ψ.C[i - 1] * _transpose_tail(ψ.AR[i])) # TODO
+        @test ψ.AC[i] ≈ _transpose_front(ψ.C[i - 1] * _transpose_tail(ψ.AR[i]))
     end
 
     @test ComplexF64 == scalartype(ψ)
@@ -48,5 +48,5 @@ using Adapt, CUDA
     ψ = 3 * ψ
     @test ovl * 9 * 9 ≈ norm(ψ)^2
 
-    @test norm(2 * ψ + ψ - 3 * ψ) ≈ 0.0 atol = sqrt(eps(real(ComplexF64)))=# # TODO
+    @test norm(2 * ψ + ψ - 3 * ψ) ≈ 0.0 atol = sqrt(eps(real(ComplexF64)))
 end