
Commit 9261a26

Replace deprecated torch APIs with modern torch.linalg equivalents
torch.inverse, torch.pinverse, and torch.norm have been deprecated since PyTorch 1.9. This updates all usage to their modern replacements and, critically, registers torch.linalg.inv for __torch_function__ dispatch so that torch.linalg.inv(linear_op) works correctly.

Changes:
- Register torch.linalg.inv alongside torch.inverse for LinearOperator dispatch (fixes torch.linalg.inv not working on LinearOperators)
- Replace torch.pinverse() with torch.linalg.pinv()
- Replace torch.norm() with torch.linalg.vector_norm() (source files) and torch.linalg.norm() (test files)
- Update stale comments referencing torch.cholesky, torch.solve, torch.symeig, and torch.eig to their modern equivalents
1 parent cdebc6e commit 9261a26

17 files changed

Lines changed: 72 additions & 52 deletions
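The heart of the fix is the dispatch registration in _linear_operator.py. Below is a minimal, self-contained sketch of the mechanism, for orientation only: DiagOperator, _register, and _HANDLED are hypothetical stand-ins for the library's LinearOperator and _implements, and only the __torch_function__ protocol itself comes from PyTorch. Dispatch keys on the exact function object, and torch.inverse and torch.linalg.inv are distinct objects, which is why each must be registered separately.

import torch

class DiagOperator:
    _HANDLED = {}  # maps a torch function object to its custom implementation

    def __init__(self, diag):
        self.diag = diag

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func not in cls._HANDLED:
            return NotImplemented
        return cls._HANDLED[func](*args, **kwargs)

def _register(*funcs):
    def decorator(impl):
        for f in funcs:
            DiagOperator._HANDLED[f] = impl
        return impl
    return decorator

# Registering only torch.inverse (the old behavior) would leave
# torch.linalg.inv(op) raising a TypeError, because the two are
# distinct function objects.
@_register(torch.inverse, torch.linalg.inv)
def _diag_inverse(op):
    return DiagOperator(op.diag.reciprocal())  # O(n) inverse of a diagonal

op = DiagOperator(torch.tensor([2.0, 4.0]))
print(torch.linalg.inv(op).diag)  # tensor([0.5000, 0.2500])
print(torch.inverse(op).diag)     # tensor([0.5000, 0.2500])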

examples/LinearOperator_demo.ipynb

Lines changed: 4 additions & 4 deletions
@@ -220,7 +220,7 @@
    "source": [
     "#### Eigendecomposition\n",
     "\n",
-    "This uses `__torch_function__` in order to dispatch `torch.symeig` to a custom implementation that essentially just returns the diagonal elements and the identity matrix (should sort the evals and permute the evecs to have the exact same behavior, that's an easy thing to do).\n",
+    "This uses `__torch_function__` in order to dispatch `torch.linalg.eigh` to a custom implementation that essentially just returns the diagonal elements and the identity matrix (should sort the evals and permute the evecs to have the exact same behavior, that's an easy thing to do).\n",
     "\n",
     "Time complexity goes from $\\mathcal O(n^3)$ to $\\mathcal O(1)$ (without sorting). Memory complexity goes from $\\mathcal O(n^2)$ to $\\mathcal O(n)$. \n",
     "\n",
@@ -858,8 +858,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tri_inv = torch.inverse(tri)\n",
-    "tri_lo_inv = tri_lo.inverse() # TODO: Handle in torch.inverse by registering via __torch_function__\n",
+    "tri_inv = torch.linalg.inv(tri)\n",
+    "tri_lo_inv = torch.linalg.inv(tri_lo)\n",
     "\n",
     "assert torch.allclose(tri_inv, tri_lo_inv.to_dense())"
    ]
@@ -879,7 +879,7 @@
     }
    ],
    "source": [
-    "t_d = %timeit -o torch.inverse(tri)"
+    "t_d = %timeit -o torch.linalg.inv(tri)"
    ]
   },
   {

linear_operator/functions/_inv_quad_logdet.py

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ def forward(
         else:
             probe_vectors = precond_lt.zero_mean_mvn_samples(num_random_probes)
             probe_vectors = probe_vectors.unsqueeze(-2).transpose(0, -2).squeeze(0).mT.contiguous()
-        probe_vector_norms = torch.norm(probe_vectors, p=2, dim=-2, keepdim=True)
+        probe_vector_norms = torch.linalg.vector_norm(probe_vectors, ord=2, dim=-2, keepdim=True)
         probe_vectors = probe_vectors.div(probe_vector_norms)
 
         # Probe vectors
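For the torch.norm to torch.linalg.vector_norm changes in this and the following source files, the two calls are numerically interchangeable when reducing over a single dimension; only the parameter name changes (p= becomes ord=). A quick illustrative check, not part of the commit:

import torch

x = torch.randn(5, 3)

# ord=2 over one dim: same value as the deprecated call it replaces here.
assert torch.allclose(
    torch.linalg.vector_norm(x, ord=2, dim=-2, keepdim=True),
    torch.norm(x, p=2, dim=-2, keepdim=True),
)

# The ord=1 replacements in _pivoted_cholesky.py rest on the same equivalence.
assert torch.allclose(
    torch.linalg.vector_norm(x, ord=1, dim=-1),
    torch.norm(x, 1, dim=-1),
)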

linear_operator/functions/_pivoted_cholesky.py

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ def forward(ctx, representation_tree, max_iter, error_tol, *matrix_args):
             device=matrix.device,
         )
         orig_error = torch.max(matrix_diag, dim=-1)[0]
-        errors = torch.norm(matrix_diag, 1, dim=-1) / orig_error
+        errors = torch.linalg.vector_norm(matrix_diag, ord=1, dim=-1) / orig_error
 
         # The permutation
         permutation = torch.arange(0, matrix_shape[-1], dtype=torch.long, device=matrix_diag.device)
@@ -96,7 +96,7 @@ def forward(ctx, representation_tree, max_iter, error_tol, *matrix_args):
             L[..., m, :] = L_m
 
             # Keep track of errors - for potential early stopping
-            errors = torch.norm(matrix_diag.gather(-1, pi_i), 1, dim=-1) / orig_error
+            errors = torch.linalg.vector_norm(matrix_diag.gather(-1, pi_i), ord=1, dim=-1) / orig_error
 
             m = m + 1

linear_operator/operators/_linear_operator.py

Lines changed: 8 additions & 4 deletions
@@ -80,7 +80,7 @@ def _implements_second_arg(torch_function: Callable) -> Callable:
     where the first argument of the function is a torch.Tensor and the
     second argument is a LinearOperator
 
-    Examples of this include :meth:`torch.cholesky_solve`, `torch.solve`, or `torch.matmul`.
+    Examples of this include :meth:`torch.cholesky_solve`, `torch.linalg.solve`, or `torch.matmul`.
     """
 
     @functools.wraps(torch_function)
@@ -1803,13 +1803,17 @@ def inv_quad_logdet(
             inv_quad_term = inv_quad_term.sum(-1)
         return inv_quad_term, logdet_term
 
+    @_implements(torch.linalg.inv)
     @_implements(torch.inverse)
     def inverse(
         self: LinearOperator,  # shape: (*batch, N, N)
     ) -> LinearOperator:  # shape: (*batch, N, N)
         # Only implemented by some LinearOperator subclasses
-        # We define it here so that we can map the torch function torch.inverse to the LinearOperator method
-        raise NotImplementedError(f"torch.inverse({self.__class__.__name__}) is not implemented.")
+        # We define it here so that we can map torch.linalg.inv / torch.inverse to the LinearOperator method
+        raise NotImplementedError(
+            f"torch.linalg.inv({self.__class__.__name__}) is not implemented. "
+            "The LinearOperator subclass must implement the `inverse` method."
+        )
 
     @property
     def is_square(self) -> bool:
@@ -2296,7 +2300,7 @@ def root_inv_decomposition(
         elif method == "pinverse":
             # this is numerically unstable and should rarely be used
             root = self.root_decomposition().root.to_dense()
-            inv_root = torch.pinverse(root).mT
+            inv_root = torch.linalg.pinv(root).mT
         else:
             raise RuntimeError(f"Unknown root inv decomposition method '{method}'")
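A side note on the pinverse hunk: torch.linalg.pinv is the documented replacement for the deprecated torch.pinverse and computes the same Moore-Penrose pseudo-inverse, so the change is behavior-preserving. Illustrative spot-check, not part of the commit:

import torch

root = torch.randn(4, 3)
# Same pseudo-inverse, new spelling; .mT in the hunk then transposes it as before.
assert torch.allclose(torch.pinverse(root), torch.linalg.pinv(root))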

linear_operator/operators/kronecker_product_added_diag_linear_operator.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ def _logdet(
 
         else:
             # we use the same matrix determinant identity: |K + D| = |D| |I + D^{-1}K|
-            # but have to symmetrize the second matrix because torch.eig may not be
+            # but have to symmetrize the second matrix because torch.linalg.eig may not be
             # completely differentiable.
             lt = self.linear_op
             dlt = self.diag_tensor
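For context on the comment being updated: after symmetrizing, one can take the symmetric-eigenvalue path, which yields real eigenvalues with well-defined gradients, whereas the gradient of a general (non-symmetric) eigendecomposition is only defined under extra conditions. An illustrative sketch, not code from this repository:

import torch

M = torch.randn(3, 3, requires_grad=True)
sym = 0.5 * (M + M.mT)              # symmetrize first, as the comment describes
evals = torch.linalg.eigvalsh(sym)  # real eigenvalues, differentiable
evals.sum().backward()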

linear_operator/utils/cholesky.py

Lines changed: 2 additions & 2 deletions
@@ -56,9 +56,9 @@ def psd_safe_cholesky(A, upper=False, out=None, jitter=None, max_tries=None):
     :attr:`A` (Tensor):
         The tensor to compute the Cholesky decomposition of
     :attr:`upper` (bool, optional):
-        See torch.cholesky
+        See torch.linalg.cholesky
     :attr:`out` (Tensor, optional):
-        See torch.cholesky
+        See torch.linalg.cholesky
     :attr:`jitter` (float, optional):
         The jitter to add to the diagonal of A in case A is only p.s.d. If omitted,
         uses settings.cholesky_jitter.value()
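The docstring above only gestures at the jitter behavior, so here is an illustrative sketch of the retry strategy it refers to. This is simplified, not the real psd_safe_cholesky: the actual function also consults settings.cholesky_jitter and forwards upper/out. RuntimeError is caught because older PyTorch raises it directly, while newer releases raise torch.linalg.LinAlgError, which subclasses it.

import torch

def psd_safe_cholesky_sketch(A, jitter=1e-6, max_tries=3):
    try:
        return torch.linalg.cholesky(A)  # plain factorization first
    except RuntimeError:
        pass
    # A is (numerically) only p.s.d.: retry with growing diagonal jitter.
    eye = torch.eye(A.size(-1), dtype=A.dtype, device=A.device)
    for i in range(max_tries):
        try:
            return torch.linalg.cholesky(A + jitter * (10**i) * eye)
        except RuntimeError:
            continue
    raise RuntimeError(f"Matrix not positive definite, even with jitter of {jitter * 10 ** (max_tries - 1):.1e}")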

linear_operator/utils/lanczos.py

Lines changed: 4 additions & 4 deletions
@@ -78,7 +78,7 @@ def lanczos_tridiag(
 
     # Begin algorithm
     # Initial Q vector: q_0_vec
-    q_0_vec = init_vecs / torch.norm(init_vecs, 2, dim=dim_dimension).unsqueeze(dim_dimension)
+    q_0_vec = init_vecs / torch.linalg.vector_norm(init_vecs, ord=2, dim=dim_dimension).unsqueeze(dim_dimension)
     q_mat[0].copy_(q_0_vec)
 
     # Initial alpha value: alpha_0
@@ -87,7 +87,7 @@ def lanczos_tridiag(
 
     # Initial beta value: beta_0
     r_vec.sub_(alpha_0.unsqueeze(dim_dimension).mul(q_0_vec))
-    beta_0 = torch.norm(r_vec, 2, dim=dim_dimension)
+    beta_0 = torch.linalg.vector_norm(r_vec, ord=2, dim=dim_dimension)
 
     # Copy over alpha_0 and beta_0 to t_mat
     t_mat[0, 0].copy_(alpha_0)
@@ -118,7 +118,7 @@ def lanczos_tridiag(
             correction = r_vec.unsqueeze(0).mul(q_mat[: k + 1]).sum(dim_dimension, keepdim=True)
             correction = q_mat[: k + 1].mul(correction).sum(0)
             r_vec.sub_(correction)
-            r_vec_norm = torch.norm(r_vec, 2, dim=dim_dimension, keepdim=True)
+            r_vec_norm = torch.linalg.vector_norm(r_vec, ord=2, dim=dim_dimension, keepdim=True)
             r_vec.div_(r_vec_norm)
 
             # Get next beta value
@@ -137,7 +137,7 @@ def lanczos_tridiag(
             correction = r_vec.unsqueeze(0).mul(q_mat[: k + 1]).sum(dim_dimension, keepdim=True)
             correction = q_mat[: k + 1].mul(correction).sum(0)
             r_vec.sub_(correction)
-            r_vec_norm = torch.norm(r_vec, 2, dim=dim_dimension, keepdim=True)
+            r_vec_norm = torch.linalg.vector_norm(r_vec, ord=2, dim=dim_dimension, keepdim=True)
             r_vec.div_(r_vec_norm)
             inner_products = q_mat[: k + 1].mul(r_vec.unsqueeze(0)).sum(dim_dimension)

linear_operator/utils/linear_cg.py

Lines changed: 1 addition & 1 deletion
@@ -296,7 +296,7 @@ def linear_cg(
             curr_conjugate_vec,
         )
 
-        torch.norm(residual, 2, dim=-2, keepdim=True, out=residual_norm)
+        torch.linalg.vector_norm(residual, ord=2, dim=-2, keepdim=True, out=residual_norm)
         residual_norm.masked_fill_(rhs_is_zero, 0)
         torch.lt(residual_norm, stop_updating_after, out=has_converged)

linear_operator/utils/minres.py

Lines changed: 2 additions & 2 deletions
@@ -182,8 +182,8 @@
 
         # Check convergence criterion
         if (i + 1) % 10 == 0:
-            torch.norm(search_update, dim=-2, out=search_update_norm)
-            torch.norm(solution, dim=-2, out=solution_norm)
+            torch.linalg.vector_norm(search_update, dim=-2, out=search_update_norm)
+            torch.linalg.vector_norm(solution, dim=-2, out=solution_norm)
             conv = search_update_norm.div_(solution_norm).mean().item()
             if conv < settings.minres_tolerance.value():
                 break
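Two details the linear_cg and minres hunks above rely on, verified here illustratively (not part of the commit): torch.linalg.vector_norm accepts an out= tensor just as torch.norm did, so the in-place convergence bookkeeping keeps working, and its default ord is 2, which matches what the deprecated default computed when reducing over a single dimension.

import torch

x = torch.randn(4, 3)
out = torch.empty(3)

torch.linalg.vector_norm(x, dim=-2, out=out)       # default ord=2, written in place
assert torch.allclose(out, torch.norm(x, dim=-2))  # deprecated default, same value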

test/functions/test_dsmm.py

Lines changed: 14 additions & 14 deletions
@@ -16,7 +16,7 @@ def test_forward(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.mm(sparse.to_dense(), dense)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
     def test_forward_batch(self):
         i = torch.tensor(
@@ -29,7 +29,7 @@ def test_forward_batch(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.matmul(sparse.to_dense(), dense)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
     def test_forward_multi_batch(self):
         i = torch.tensor(
@@ -47,7 +47,7 @@ def test_forward_multi_batch(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.matmul(sparse.to_dense(), dense)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
     def test_backward(self):
         i = torch.tensor([[0, 1, 1], [2, 0, 2]], dtype=torch.long)
@@ -61,7 +61,7 @@ def test_backward(self):
         res.backward(grad_output)
         actual = torch.mm(sparse.to_dense(), dense_copy)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
     def test_backward_batch(self):
         i = torch.tensor(
@@ -78,7 +78,7 @@ def test_backward_batch(self):
         res.backward(grad_output)
         actual = torch.matmul(sparse.to_dense(), dense_copy)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
     def test_backward_multi_batch(self):
         i = torch.tensor(
@@ -100,7 +100,7 @@ def test_backward_multi_batch(self):
         res.backward(grad_output)
         actual = torch.matmul(sparse.to_dense(), dense_copy)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
     def test_broadcast_rhs(self):
         i = torch.tensor([[0, 1, 1, 0, 1, 1], [2, 0, 2, 2, 0, 2]], dtype=torch.long)
@@ -111,12 +111,12 @@ def test_broadcast_rhs(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.matmul(sparse.to_dense(), dense_copy)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
         grad_output = torch.randn(4, 2, 2, 4)
         res.backward(grad_output)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
         i = torch.tensor(
             [[0, 0, 0, 1, 1, 1], [0, 1, 1, 0, 1, 1], [2, 0, 2, 2, 0, 2]],
@@ -129,12 +129,12 @@ def test_broadcast_rhs(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.matmul(sparse.to_dense(), dense_copy)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
         grad_output = torch.randn(4, 2, 2, 4)
         res.backward(grad_output)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
     def test_broadcast_sparse(self):
         i = torch.tensor(
@@ -148,12 +148,12 @@ def test_broadcast_sparse(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.matmul(sparse.to_dense(), dense_copy)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
         grad_output = torch.randn(2, 2, 4)
         res.backward(grad_output)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
     def test_broadcast_singleton(self):
         i = torch.tensor(
@@ -167,12 +167,12 @@ def test_broadcast_singleton(self):
 
         res = linear_operator.dsmm(sparse, dense)
         actual = torch.matmul(sparse.to_dense(), dense_copy)
-        self.assertLess(torch.norm(res - actual), 1e-5)
+        self.assertLess(torch.linalg.norm(res - actual), 1e-5)
 
         grad_output = torch.randn(2, 2, 4)
         res.backward(grad_output)
         actual.backward(grad_output)
-        self.assertLess(torch.norm(dense.grad - dense_copy.grad).item(), 1e-5)
+        self.assertLess(torch.linalg.norm(dense.grad - dense_copy.grad).item(), 1e-5)
 
 
 if __name__ == "__main__":
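In the tests the norm argument is a matrix or a batch of matrices, so the drop-in replacement is torch.linalg.norm: with ord unspecified it flattens its input and takes the 2-norm, the same value as the deprecated torch.norm default. Illustrative check, not part of the commit:

import torch

res, actual = torch.randn(2, 2, 4), torch.randn(2, 2, 4)
assert torch.allclose(torch.linalg.norm(res - actual), torch.norm(res - actual))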
