From e9ececb389f12fbdb5ca7a8a20cd5dbbdcb397f1 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Fri, 27 Feb 2026 01:43:37 +0000
Subject: [PATCH 01/23] support for dpnp.scipy.linalg.lu()

---
 dpnp/scipy/linalg/__init__.py   |   3 +-
 dpnp/scipy/linalg/_decomp_lu.py | 142 ++++++++++
 dpnp/scipy/linalg/_utils.py     | 284 ++++++++++++++++++++
 dpnp/tests/test_linalg.py       | 457 ++++++++++++++++++++++++++++++++
 dpnp/tests/test_sycl_queue.py   |   7 +
 dpnp/tests/test_usm_type.py     |   7 +
 6 files changed, 899 insertions(+), 1 deletion(-)

diff --git a/dpnp/scipy/linalg/__init__.py b/dpnp/scipy/linalg/__init__.py
index 3afc08a6fdb9..81eadd890fa9 100644
--- a/dpnp/scipy/linalg/__init__.py
+++ b/dpnp/scipy/linalg/__init__.py
@@ -35,9 +35,10 @@
 
 """
 
-from ._decomp_lu import lu_factor, lu_solve
+from ._decomp_lu import lu, lu_factor, lu_solve
 
 __all__ = [
+    "lu",
     "lu_factor",
     "lu_solve",
 ]
diff --git a/dpnp/scipy/linalg/_decomp_lu.py b/dpnp/scipy/linalg/_decomp_lu.py
index 292d7fffe4b4..85161cec2c6f 100644
--- a/dpnp/scipy/linalg/_decomp_lu.py
+++ b/dpnp/scipy/linalg/_decomp_lu.py
@@ -46,11 +46,153 @@
 )
 
 from ._utils import (
+    dpnp_lu,
     dpnp_lu_factor,
     dpnp_lu_solve,
 )
 
 
+def lu(a, permute_l=False, overwrite_a=False, check_finite=True,
+       p_indices=False):
+    """
+    Compute LU decomposition of a matrix with partial pivoting.
+
+    The decomposition satisfies::
+
+        A = P @ L @ U
+
+    where `P` is a permutation matrix, `L` is lower triangular with unit
+    diagonal elements, and `U` is upper triangular. If `permute_l` is set to
+    ``True`` then `L` is returned already permuted and hence satisfying
+    ``A = L @ U``.
+
+    For full documentation refer to :obj:`scipy.linalg.lu`.
+
+    Parameters
+    ----------
+    a : (..., M, N) {dpnp.ndarray, usm_ndarray}
+        Input array to decompose.
+    permute_l : bool, optional
+        Perform the multiplication ``P @ L`` (Default: do not permute).
+
+        Default: ``False``.
+    overwrite_a : {None, bool}, optional
+        Whether to overwrite data in `a` (may increase performance).
+
+        Default: ``False``.
+    check_finite : {None, bool}, optional
+        Whether to check that the input matrix contains only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+
+        Default: ``True``.
+    p_indices : bool, optional
+        If ``True`` the permutation information is returned as row indices
+        instead of a permutation matrix.
+
+        Default: ``False``.
+
+    Returns
+    -------
+    **(If ``permute_l`` is ``False``)**
+
+    p : (..., M, M) dpnp.ndarray or (..., M) dpnp.ndarray
+        If `p_indices` is ``False`` (default), the permutation matrix.
+        The permutation matrix always has a real dtype (``float32`` or
+        ``float64``) even when `a` is complex, since it only contains
+        0s and 1s.
+        If `p_indices` is ``True``, a 1-D (or batched) array of row
+        permutation indices such that ``A = L[p] @ U``.
+    l : (..., M, K) dpnp.ndarray
+        Lower triangular or trapezoidal matrix with unit diagonal.
+        ``K = min(M, N)``.
+    u : (..., K, N) dpnp.ndarray
+        Upper triangular or trapezoidal matrix.
+
+    **(If ``permute_l`` is ``True``)**
+
+    pl : (..., M, K) dpnp.ndarray
+        Permuted ``L`` matrix: ``pl = P @ L``.
+        ``K = min(M, N)``.
+    u : (..., K, N) dpnp.ndarray
+        Upper triangular or trapezoidal matrix.
+
+    Notes
+    -----
+    Permutation matrices are costly since they are nothing but row reorder of
+    ``L`` and hence indices are strongly recommended to be used instead if the
+    permutation is required. The relation in the 2D case then becomes simply
+    ``A = L[P, :] @ U``. In higher dimensions, it is better to use `permute_l`
+    to avoid complicated indexing tricks.
+
+    In the 2D case, if one has the indices however, for some reason, the
+    permutation matrix is still needed then it can be constructed by
+    ``dpnp.eye(M)[P, :]``.
+
+    Warning
+    -------
+    This function synchronizes in order to validate array elements
+    when ``check_finite=True``, and also synchronizes to compute the
+    permutation from LAPACK pivot indices.
+
+    See Also
+    --------
+    :obj:`dpnp.scipy.linalg.lu_factor` : LU factorize a matrix
+                                         (compact representation).
+    :obj:`dpnp.scipy.linalg.lu_solve` : Solve an equation system using
+                                        the LU factorization of a matrix.
+
+    Examples
+    --------
+    >>> import dpnp as np
+    >>> A = np.array([[2, 5, 8, 7], [5, 2, 2, 8],
+    ...               [7, 5, 6, 6], [5, 4, 4, 8]])
+    >>> p, l, u = np.scipy.linalg.lu(A)
+    >>> np.allclose(A, p @ l @ u)
+    array(True)
+
+    Retrieve the permutation as row indices with ``p_indices=True``:
+
+    >>> p, l, u = np.scipy.linalg.lu(A, p_indices=True)
+    >>> p
+    array([1, 3, 0, 2])
+    >>> np.allclose(A, l[p] @ u)
+    array(True)
+
+    Return the permuted ``L`` directly with ``permute_l=True``:
+
+    >>> pl, u = np.scipy.linalg.lu(A, permute_l=True)
+    >>> np.allclose(A, pl @ u)
+    array(True)
+
+    Non-square matrices are supported:
+
+    >>> B = np.array([[1, 2, 3], [4, 5, 6]])
+    >>> p, l, u = np.scipy.linalg.lu(B)
+    >>> np.allclose(B, p @ l @ u)
+    array(True)
+
+    Batched input:
+
+    >>> C = np.random.randn(3, 2, 4, 4)
+    >>> p, l, u = np.scipy.linalg.lu(C)
+    >>> np.allclose(C, p @ l @ u)
+    array(True)
+
+    """
+
+    dpnp.check_supported_arrays_type(a)
+    assert_stacked_2d(a)
+
+    return dpnp_lu(
+        a,
+        overwrite_a=overwrite_a,
+        check_finite=check_finite,
+        p_indices=p_indices,
+        permute_l=permute_l,
+    )
+
+
 def lu_factor(a, overwrite_a=False, check_finite=True):
     """
     Compute the pivoted LU decomposition of `a` matrix.
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index f00db6fdfb92..0f6df9289a54 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -83,6 +83,64 @@ def _align_lu_solve_broadcast(lu, b):
     return lu, b
 
 
+def _apply_permutation_to_rows(mat, perm_indices):
+    """
+    Apply a permutation to the rows (axis=-2) of a matrix.
+
+    Returns ``out`` such that
+    ``out[..., i, :] = mat[..., perm_indices[..., i], :]``.
+
+    For 2-D inputs this is equivalent to ``mat[perm_indices]`` (a single
+    device gather).  For batched inputs :func:`dpnp.take_along_axis` is
+    used so the operation stays entirely on the device.
+
+    Parameters
+    ----------
+    mat : dpnp.ndarray, shape (..., M, N)
+        Matrix whose rows are to be permuted.
+    perm_indices : dpnp.ndarray, shape (..., M)
+        Permutation indices (dtype int64).
+
+    Returns
+    -------
+    out : dpnp.ndarray, shape (..., M, N)
+        Row-permuted matrix.
+    """
+
+    if perm_indices.ndim == 1:
+        # 2-D case: simple fancy indexing, single kernel launch.
+        return mat[perm_indices]
+
+    # Batched case: ensure *mat* has the same batch dimensions as
+    # *perm_indices*.  This is needed, for example, when permuting
+    # a shared identity matrix across a batch.
+    target_shape = perm_indices.shape[:-1] + mat.shape[-2:]
+    if mat.shape != target_shape:
+        mat = dpnp.broadcast_to(mat, target_shape)
+
+    # Expand (..., M) → (..., M, 1), then broadcast to the full shape
+    # of *mat* so take_along_axis can gather along axis -2.
+    idx = dpnp.expand_dims(perm_indices, axis=-1)
+    idx = dpnp.broadcast_to(idx, target_shape).copy()
+    return dpnp.take_along_axis(mat, idx, axis=-2)
+
+
+def _get_real_dtype(res_type):
+    """
+    Return the real floating-point counterpart of *res_type*.
+
+    SciPy uses the real dtype for the permutation matrix ``P`` even when
+    the input is complex (``P`` only contains 0s and 1s).
+
+    ``float32`` and ``complex64`` → ``float32``;
+    ``float64`` and ``complex128`` → ``float64``.
+    """
+
+    if dpnp.issubdtype(res_type, dpnp.complexfloating):
+        return dpnp.float32 if dpnp.dtype(res_type).itemsize <= 8 else dpnp.float64
+    return res_type
+
+
 def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
     """SciPy-compatible LU factorization for batched inputs."""
 
@@ -338,6 +396,71 @@ def _map_trans_to_mkl(trans):
     raise ValueError("`trans` must be 0 (N), 1 (T), or 2 (C)")
 
 
+def _pivots_to_permutation(piv, m):
+    """
+    Convert 0-based LAPACK pivot indices (sequential row swaps)
+    to a permutation array.
+
+    The returned permutation ``perm`` satisfies ``A[perm] = L @ U``
+    (i.e. the forward row-permutation produced by LAPACK).
+
+    The computation is performed entirely on the device.  A host-side
+    Python loop of ``K = min(M, N)`` iterations drives the sequential
+    swap logic, but each iteration only launches device kernels
+    (:func:`dpnp.take_along_axis` for gather,
+    :func:`dpnp.put_along_axis` for scatter); **no data is transferred
+    between host and device**.
+
+    .. note::
+
+        A future custom SYCL kernel could fuse all ``K`` swap steps
+        into a single launch to eliminate per-step kernel overhead.
+
+    Parameters
+    ----------
+    piv : dpnp.ndarray, shape (..., K)
+        0-based pivot indices as returned by :obj:`dpnp_lu_factor`.
+    m : int
+        Number of rows of the original matrix.
+
+    Returns
+    -------
+    perm : dpnp.ndarray, shape (..., M), dtype int64
+        Permutation indices.
+    """
+
+    batch_shape = piv.shape[:-1]
+    k = piv.shape[-1]
+
+    # Initialise the identity permutation on the device.
+    perm = dpnp.broadcast_to(
+        dpnp.arange(
+            m,
+            dtype=dpnp.int64,
+            usm_type=piv.usm_type,
+            sycl_queue=piv.sycl_queue,
+        ),
+        (*batch_shape, m),
+    ).copy()
+
+    # Apply sequential row swaps entirely on the device.
+    # Each iteration launches a small number of device kernels (gather +
+    # slice-assign + scatter) but never transfers data to the host.
+    for i in range(k):
+        # Pivot target for step *i*: shape (..., 1)
+        j = piv[..., i : i + 1]
+
+        # Gather the two values to be swapped.
+        val_i = perm[..., i : i + 1].copy()                   # slice (free)
+        val_j = dpnp.take_along_axis(perm, j, axis=-1)        # gather
+
+        # Perform the swap.
+        perm[..., i : i + 1] = val_j                           # slice assign
+        dpnp.put_along_axis(perm, j, val_i, axis=-1)           # scatter
+
+    return perm
+
+
 def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     """
     dpnp_lu_factor(a, overwrite_a=False, check_finite=True)
@@ -432,6 +555,167 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     return (a_h, ipiv_h)
 
 
+def dpnp_lu(
+    a,
+    overwrite_a=False,
+    check_finite=True,
+    p_indices=False,
+    permute_l=False,
+):
+    """
+    dpnp_lu(a, overwrite_a=False, check_finite=True, p_indices=False,
+            permute_l=False)
+
+    Compute pivoted LU decomposition and return separate P, L, U matrices
+    (SciPy-compatible behavior).
+
+    This function mimics the behavior of `scipy.linalg.lu` including
+    support for `permute_l`, `p_indices`, `overwrite_a`, and `check_finite`.
+
+    """
+
+    a_sycl_queue = a.sycl_queue
+    a_usm_type = a.usm_type
+    m, n = a.shape[-2:]
+    k = min(m, n)
+    batch_shape = a.shape[:-2]
+
+    res_type = _common_type(a)
+
+    # The permutation matrix P uses a real dtype (SciPy convention):
+    # P only contains 0s and 1s, so complex storage would be wasteful.
+    real_type = _get_real_dtype(res_type)
+
+    # ---- Fast path: scalar (1x1) matrices ----
+    # For 1x1 input, P = I, L = I, U = A.  This avoids invoking LAPACK
+    # entirely (matches SciPy's scalar fast path).
+    if m == 1 and n == 1:
+        if check_finite:
+            if not dpnp.isfinite(a).all():
+                raise ValueError("array must not contain infs or NaNs")
+
+        _L = dpnp.ones(
+            a.shape,
+            dtype=res_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        _U = dpnp.array(a, dtype=res_type)
+
+        if permute_l:
+            return _L.copy(), _U
+
+        if p_indices:
+            _p = dpnp.zeros(
+                (*batch_shape, 1),
+                dtype=dpnp.int64,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            return _p, _L, _U
+
+        _P = dpnp.ones(
+            a.shape,
+            dtype=real_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        return _P, _L, _U
+
+    # ---- Fast path: empty arrays ----
+    if a.size == 0:
+        _L = dpnp.empty(
+            (*batch_shape, m, k),
+            dtype=res_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        _U = dpnp.empty(
+            (*batch_shape, k, n),
+            dtype=res_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+
+        if permute_l:
+            return _L, _U
+
+        if p_indices:
+            _p = dpnp.empty(
+                (*batch_shape, m),
+                dtype=dpnp.int64,
+                usm_type=a_usm_type,
+                sycl_queue=a_sycl_queue,
+            )
+            return _p, _L, _U
+
+        _P = dpnp.empty(
+            (*batch_shape, m, m),
+            dtype=real_type,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
+        return _P, _L, _U
+
+    # ---- General case: LAPACK factorization ----
+    lu_compact, piv = dpnp_lu_factor(
+        a, overwrite_a=overwrite_a, check_finite=check_finite
+    )
+
+    # ---- Extract L: lower-triangular with unit diagonal ----
+    # L has shape (..., M, K).
+    _L = dpnp.tril(lu_compact[..., :, :k], k=-1)
+    _L += dpnp.eye(
+        m,
+        k,
+        dtype=lu_compact.dtype,
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+
+    # ---- Extract U: upper-triangular ----
+    # U has shape (..., K, N).
+    _U = dpnp.triu(lu_compact[..., :k, :])
+
+    # ---- Convert pivot indices → row permutation ----
+    # ``perm`` (forward): A[perm] = L @ U.
+    # This is the only step that requires a host transfer because the
+    # sequential swap semantics of LAPACK pivots cannot be parallelised.
+    # Only the small pivot array (min(M, N) elements per slice) is
+    # transferred; all subsequent work stays on the device.
+    perm = _pivots_to_permutation(piv, m)
+
+    # ``inv_perm`` (inverse): A = L[inv_perm] @ U.
+    # This is SciPy's ``p_indices`` convention.
+    # ``dpnp.argsort`` is an efficient on-device O(M log M) operation
+    # that avoids a second host round-trip.
+    inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
+
+    if permute_l:
+        # Return (PL, U) where PL = P @ L = L[inv_perm].
+        # A = PL @ U directly.
+        _PL = _apply_permutation_to_rows(_L, inv_perm)
+        return _PL, _U
+
+    if p_indices:
+        # SciPy convention: A = L[inv_perm] @ U.
+        return inv_perm, _L, _U
+
+    # ---- Build full permutation matrix P = I[inv_perm] ----
+    # P has shape (..., M, M) with real dtype (SciPy convention).
+    # The gather from an identity matrix is efficient on device:
+    # each output row selects one row of the identity (one hot encoding).
+    _I = dpnp.eye(
+        m,
+        dtype=real_type,
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+    _P = _apply_permutation_to_rows(_I, inv_perm)
+
+    return _P, _L, _U
+
+
 def dpnp_lu_solve(lu, piv, b, trans=0, overwrite_b=False, check_finite=True):
     """
     dpnp_lu_solve(lu, piv, b, trans=0, overwrite_b=False, check_finite=True)
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 31d99d71ce49..e92b9f65c182 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2605,6 +2605,463 @@ def test_invalid_shapes(self, a_shape, b_shape):
             dpnp.scipy.linalg.lu_solve((lu, piv), b, check_finite=False)
 
 
+class TestLu:
+    @staticmethod
+    def _make_nonsingular_np(shape, dtype, order):
+        A = generate_random_numpy_array(shape, dtype, order)
+        m, n = shape
+        k = min(m, n)
+        for i in range(k):
+            off = numpy.sum(numpy.abs(A[i, :n])) - numpy.abs(A[i, i])
+            A[i, i] = A.dtype.type(off + 1.0)
+        return A
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_default(self, shape, order, dtype):
+        a_np = self._make_nonsingular_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+
+        m, n = shape
+        k = min(m, n)
+        assert P.shape == (m, m)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_permute_l(self, shape, order, dtype):
+        a_np = self._make_nonsingular_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        PL, U = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+
+        m, n = shape
+        k = min(m, n)
+        assert PL.shape == (m, k)
+        assert U.shape == (k, n)
+
+        A_cast = a_dp.astype(PL.dtype, copy=False)
+        A_rec = PL @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_p_indices(self, shape, order, dtype):
+        a_np = self._make_nonsingular_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        p, L, U = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+
+        m, n = shape
+        k = min(m, n)
+        assert p.shape == (m,)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+        assert dpnp.issubdtype(p.dtype, dpnp.integer)
+
+        p_np = dpnp.asnumpy(p)
+        L_np = dpnp.asnumpy(L)
+        U_np = dpnp.asnumpy(U)
+        A_rec = L_np[p_np] @ U_np
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        assert_allclose(A_rec, dpnp.asnumpy(A_cast), rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "in_dtype, expected_p_dtype",
+        [
+            (dpnp.float32, dpnp.float32),
+            (dpnp.float64, dpnp.float64),
+            (dpnp.complex64, dpnp.float32),
+            (dpnp.complex128, dpnp.float64),
+        ],
+    )
+    def test_p_matrix_dtype(self, in_dtype, expected_p_dtype):
+        if in_dtype in (dpnp.float64, dpnp.complex128):
+            if not has_support_aspect64():
+                pytest.skip("fp64 not supported on this device")
+
+        a_np = self._make_nonsingular_np((4, 4), in_dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+
+        assert P.dtype == expected_p_dtype
+        assert dpnp.issubdtype(P.dtype, dpnp.floating)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_indices_dtype(self, dtype):
+        a_np = self._make_nonsingular_np((4, 4), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        p, _, _ = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        assert dpnp.issubdtype(p.dtype, dpnp.integer)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_l_structure(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        _, L, _ = dpnp.scipy.linalg.lu(a_dp)
+        L_np = dpnp.asnumpy(L)
+
+        # unit diagonal
+        diag_abs = numpy.abs(numpy.diag(L_np))
+        assert_allclose(diag_abs, numpy.ones(5, dtype=diag_abs.dtype))
+        # lower triangular
+        assert_allclose(numpy.triu(L_np, 1), numpy.zeros_like(L_np))
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_u_upper_triangular(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        _, _, U = dpnp.scipy.linalg.lu(a_dp)
+        U_np = dpnp.asnumpy(U)
+        assert_allclose(numpy.tril(U_np, -1), numpy.zeros_like(U_np))
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_is_permutation(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, _, _ = dpnp.scipy.linalg.lu(a_dp)
+        P_np = dpnp.asnumpy(P)
+
+        assert_allclose(P_np.sum(axis=0), numpy.ones(5, dtype=P_np.dtype))
+        assert_allclose(P_np.sum(axis=1), numpy.ones(5, dtype=P_np.dtype))
+        assert_allclose(P_np.T @ P_np, numpy.eye(5, dtype=P_np.dtype), atol=1e-15)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_modes_consistency(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        PL, U2 = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        p, L3, U3 = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A1 = P @ L @ U
+        A2 = PL @ U2
+        p_np = dpnp.asnumpy(p)
+        A3_np = dpnp.asnumpy(L3)[p_np] @ dpnp.asnumpy(U3)
+
+        assert dpnp.allclose(A1, A_cast, rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A2, A_cast, rtol=1e-6, atol=1e-6)
+        assert_allclose(A3_np, dpnp.asnumpy(A_cast), rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_times_l_equals_pl(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, L, _ = dpnp.scipy.linalg.lu(a_dp)
+        PL, _ = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        assert dpnp.allclose(P @ L, PL, rtol=1e-12, atol=1e-12)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_p_indices_to_matrix(self, dtype):
+        a_np = self._make_nonsingular_np((5, 5), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        P, _, _ = dpnp.scipy.linalg.lu(a_dp)
+        p, _, _ = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        P_from_idx = dpnp.eye(5, dtype=P.dtype)[p]
+        assert dpnp.allclose(P_from_idx, P, rtol=1e-15, atol=1e-15)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_overwrite_a_false(self, dtype):
+        a_dp = dpnp.array([[4, 3], [6, 3]], dtype=dtype, order="F")
+        a_dp_orig = a_dp.copy()
+        dpnp.scipy.linalg.lu(a_dp, overwrite_a=False)
+        assert dpnp.allclose(a_dp, a_dp_orig)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_inputs(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        m, n = shape
+        k = min(m, n)
+        assert P.shape == (m, m)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_permute_l(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        PL, U = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        m, n = shape
+        k = min(m, n)
+        assert PL.shape == (m, k)
+        assert U.shape == (k, n)
+
+    @pytest.mark.parametrize("shape", [(0, 0), (0, 2), (2, 0)])
+    def test_empty_p_indices(self, shape):
+        a_dp = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+        p, L, U = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+        m, n = shape
+        k = min(m, n)
+        assert p.shape == (m,)
+        assert L.shape == (m, k)
+        assert U.shape == (k, n)
+
+    @pytest.mark.parametrize(
+        "sl",
+        [
+            (slice(None, None, 2), slice(None, None, 2)),
+            (slice(None, None, -1), slice(None, None, -1)),
+        ],
+    )
+    def test_strided(self, sl):
+        base = self._make_nonsingular_np(
+            (7, 7), dpnp.default_float_type(), "F"
+        )
+        a_np = base[sl]
+        a_dp = dpnp.array(a_np)
+
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, a_dp, rtol=1e-6, atol=1e-6)
+
+    def test_singular_matrix(self):
+        a_np = numpy.array([[1.0, 2.0], [2.0, 4.0]])
+        a_dp = dpnp.array(a_np)
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        A_rec = dpnp.asnumpy(P @ L @ U)
+        assert_allclose(A_rec, a_np, atol=1e-12)
+
+    def test_identity_matrix(self):
+        n = 4
+        I_dp = dpnp.eye(n, dtype=dpnp.default_float_type())
+        P, L, U = dpnp.scipy.linalg.lu(I_dp)
+        I_np = numpy.eye(n)
+        assert_allclose(dpnp.asnumpy(P), I_np, atol=1e-15)
+        assert_allclose(dpnp.asnumpy(L), I_np, atol=1e-15)
+        assert_allclose(dpnp.asnumpy(U), I_np, atol=1e-15)
+
+    def test_1d_input_raises(self):
+        a_dp = dpnp.array([1.0, 2.0, 3.0])
+        with pytest.raises(ValueError):
+            dpnp.scipy.linalg.lu(a_dp)
+
+    @pytest.mark.parametrize("bad", [numpy.inf, -numpy.inf, numpy.nan])
+    def test_check_finite_raises(self, bad):
+        a_dp = dpnp.array([[1.0, 2.0], [3.0, bad]], order="F")
+        assert_raises(
+            ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True
+        )
+
+    def test_check_finite_disabled(self):
+        a_dp = dpnp.array([[1.0, numpy.nan], [3.0, 4.0]])
+        result = dpnp.scipy.linalg.lu(a_dp, check_finite=False)
+        assert len(result) == 3
+
+
+class TestLuBatched:
+    @staticmethod
+    def _make_nonsingular_nd_np(shape, dtype, order):
+        A = generate_random_numpy_array(shape, dtype, order)
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        A3 = A.reshape((-1, m, n))
+        for B in A3:
+            for i in range(k):
+                off = numpy.sum(numpy.abs(B[i, :n])) - numpy.abs(B[i, i])
+                B[i, i] = A.dtype.type(off + 1.0)
+        A = A3.reshape(shape)
+        A = numpy.array(A, order=order)
+        return A
+
+    @staticmethod
+    def _reconstruct_p_indices(p, L, U):
+        """Reconstruct A from (p, L, U) for batched p_indices mode."""
+        idx = dpnp.expand_dims(p, axis=-1)
+        idx = dpnp.broadcast_to(idx, L.shape).copy()
+        PL = dpnp.take_along_axis(L, idx, axis=-2)
+        return PL @ U
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2, 2), (3, 4, 4), (2, 3, 5, 2), (4, 1, 3)],
+        ids=["(2,2,2)", "(3,4,4)", "(2,3,5,2)", "(4,1,3)"],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_default_batched(self, shape, order, dtype):
+        a_np = self._make_nonsingular_nd_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        assert P.shape == (*shape[:-2], m, m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2, 2), (3, 4, 4), (2, 3, 5, 2), (4, 1, 3)],
+        ids=["(2,2,2)", "(3,4,4)", "(2,3,5,2)", "(4,1,3)"],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_permute_l_batched(self, shape, order, dtype):
+        a_np = self._make_nonsingular_nd_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        PL, U = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        assert PL.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+        A_cast = a_dp.astype(PL.dtype, copy=False)
+        A_rec = PL @ U
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape",
+        [(2, 2, 2), (3, 4, 4), (2, 3, 5, 2), (4, 1, 3)],
+        ids=["(2,2,2)", "(3,4,4)", "(2,3,5,2)", "(4,1,3)"],
+    )
+    @pytest.mark.parametrize("order", ["C", "F"])
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    def test_lu_p_indices_batched(self, shape, order, dtype):
+        a_np = self._make_nonsingular_nd_np(shape, dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+
+        p, L, U = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+
+        m, n = shape[-2], shape[-1]
+        k = min(m, n)
+        assert p.shape == (*shape[:-2], m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+        assert dpnp.issubdtype(p.dtype, dpnp.integer)
+
+        A_cast = a_dp.astype(L.dtype, copy=False)
+        A_rec = self._reconstruct_p_indices(p, L, U)
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    @pytest.mark.parametrize("order", ["C", "F"])
+    def test_overwrite_a(self, dtype, order):
+        a_np = self._make_nonsingular_nd_np((3, 2, 2), dtype, order)
+        a_dp = dpnp.array(a_np, order=order)
+        a_dp_orig = a_dp.copy()
+
+        dpnp.scipy.linalg.lu(a_dp, overwrite_a=False)
+        assert dpnp.allclose(a_dp, a_dp_orig)
+
+    @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
+    def test_modes_consistency_batched(self, dtype):
+        a_np = self._make_nonsingular_nd_np((3, 4, 4), dtype, "F")
+        a_dp = dpnp.array(a_np, order="F")
+        A_cast = a_dp.astype(
+            dpnp.complex128 if dpnp.issubdtype(dtype, dpnp.complexfloating)
+            else dpnp.float64, copy=False
+        )
+
+        P, L, U = dpnp.scipy.linalg.lu(a_dp)
+        PL, U2 = dpnp.scipy.linalg.lu(a_dp, permute_l=True)
+        p, L3, U3 = dpnp.scipy.linalg.lu(a_dp, p_indices=True)
+
+        A1 = P @ L @ U
+        A2 = PL @ U2
+        A3 = self._reconstruct_p_indices(p, L3, U3)
+
+        A_cast2 = a_dp.astype(L.dtype, copy=False)
+        assert dpnp.allclose(A1, A_cast2, rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A2, A_cast2, rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A3, A_cast2, rtol=1e-6, atol=1e-6)
+
+    @pytest.mark.parametrize(
+        "shape", [(0, 2, 2), (2, 0, 2), (2, 2, 0), (0, 0, 0)]
+    )
+    def test_empty_inputs(self, shape):
+        a = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+
+        P, L, U = dpnp.scipy.linalg.lu(a)
+        m, n = shape[-2:]
+        k = min(m, n)
+        assert P.shape == (*shape[:-2], m, m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+    @pytest.mark.parametrize(
+        "shape", [(0, 2, 2), (2, 0, 2), (2, 2, 0), (0, 0, 0)]
+    )
+    def test_empty_permute_l(self, shape):
+        a = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+
+        PL, U = dpnp.scipy.linalg.lu(a, permute_l=True)
+        m, n = shape[-2:]
+        k = min(m, n)
+        assert PL.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+    @pytest.mark.parametrize(
+        "shape", [(0, 2, 2), (2, 0, 2), (2, 2, 0), (0, 0, 0)]
+    )
+    def test_empty_p_indices(self, shape):
+        a = dpnp.empty(shape, dtype=dpnp.default_float_type(), order="F")
+
+        p, L, U = dpnp.scipy.linalg.lu(a, p_indices=True)
+        m, n = shape[-2:]
+        k = min(m, n)
+        assert p.shape == (*shape[:-2], m)
+        assert L.shape == (*shape[:-2], m, k)
+        assert U.shape == (*shape[:-2], k, n)
+
+    def test_strided(self):
+        a_np = self._make_nonsingular_nd_np(
+            (5, 3, 3), dpnp.default_float_type(), "F"
+        )
+        a_dp = dpnp.array(a_np, order="F")
+        a_stride = a_dp[::2]
+
+        P, L, U = dpnp.scipy.linalg.lu(a_stride)
+        for i in range(a_stride.shape[0]):
+            A_rec = dpnp.asnumpy(P[i] @ L[i] @ U[i])
+            A_orig = dpnp.asnumpy(a_stride[i].astype(L.dtype, copy=False))
+            assert_allclose(A_rec, A_orig, rtol=1e-6, atol=1e-6)
+
+    def test_singular_matrix(self):
+        a = dpnp.zeros((3, 2, 2), dtype=dpnp.default_float_type())
+        a[0] = dpnp.array([[1.0, 2.0], [2.0, 4.0]])
+        a[1] = dpnp.eye(2)
+        a[2] = dpnp.array([[1.0, 1.0], [1.0, 1.0]])
+
+        P, L, U = dpnp.scipy.linalg.lu(a)
+        A_rec = P @ L @ U
+        assert dpnp.allclose(A_rec, a, rtol=1e-6, atol=1e-6)
+
+    def test_check_finite_raises(self):
+        a = dpnp.ones((2, 3, 3), dtype=dpnp.default_float_type(), order="F")
+        a[1, 0, 0] = dpnp.nan
+        assert_raises(
+            ValueError, dpnp.scipy.linalg.lu, a, check_finite=True
+        )
+
+
 class TestMatrixPower:
     @pytest.mark.parametrize("dtype", get_all_dtypes())
     @pytest.mark.parametrize(
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index d1853579036a..e392f4b733c4 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1675,6 +1675,13 @@ def test_lu_factor(self, data, device):
             ([[[1.0, 2.0], [3.0, 5.0]]], numpy.empty((2, 0, 2))),
         ],
     )
+    def test_lu(self, data, device):
+        a = dpnp.array(data, device=device)
+        result = dpnp.scipy.linalg.lu(a)
+
+        for param in result:
+            param_queue = param.sycl_queue
+            assert_sycl_queue_equal(param_queue, a.sycl_queue)
     def test_lu_solve(self, a_data, b_data, device):
         a = dpnp.array(a_data, device=device)
         lu, piv = dpnp.scipy.linalg.lu_factor(a)
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 4fc0f2b958fa..6eecbb243edc 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -1531,6 +1531,13 @@ def test_lstsq(self, m, n, nrhs, usm_type, usm_type_other):
         "data",
         [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
     )
+    def test_lu(self, data, usm_type):
+        a = dpnp.array(data, usm_type=usm_type)
+        result = dpnp.scipy.linalg.lu(a)
+
+        assert a.usm_type == usm_type
+        for param in result:
+            assert param.usm_type == a.usm_type
     def test_lu_factor(self, data, usm_type):
         a = dpnp.array(data, usm_type=usm_type)
         result = dpnp.scipy.linalg.lu_factor(a)

From 206f4ee83b121f05fcfc0afe1d48833fd8a1eb91 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Fri, 27 Feb 2026 20:50:17 +0000
Subject: [PATCH 02/23] Fix the test in test_sycl_queue.py

---
 dpnp/tests/test_sycl_queue.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index e392f4b733c4..77d659305c06 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1662,6 +1662,13 @@ def test_lu_factor(self, data, device):
         a = dpnp.array(data, device=device)
         result = dpnp.scipy.linalg.lu_factor(a)
 
+        for param in result:
+            param_queue = param.sycl_queue
+            assert_sycl_queue_equal(param_queue, a.sycl_queue)
+    def test_lu(self, data, device):
+        a = dpnp.array(data, device=device)
+        result = dpnp.scipy.linalg.lu(a)
+
         for param in result:
             param_queue = param.sycl_queue
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
@@ -1675,13 +1682,6 @@ def test_lu_factor(self, data, device):
             ([[[1.0, 2.0], [3.0, 5.0]]], numpy.empty((2, 0, 2))),
         ],
     )
-    def test_lu(self, data, device):
-        a = dpnp.array(data, device=device)
-        result = dpnp.scipy.linalg.lu(a)
-
-        for param in result:
-            param_queue = param.sycl_queue
-            assert_sycl_queue_equal(param_queue, a.sycl_queue)
     def test_lu_solve(self, a_data, b_data, device):
         a = dpnp.array(a_data, device=device)
         lu, piv = dpnp.scipy.linalg.lu_factor(a)

From 7a6b5bd4d37099e338eb245aef84f79f98a13126 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Sat, 28 Feb 2026 16:31:04 +0000
Subject: [PATCH 03/23] Fix black formatting

---
 dpnp/scipy/linalg/_decomp_lu.py |  5 +++--
 dpnp/scipy/linalg/_utils.py     | 12 +++++++-----
 dpnp/tests/test_linalg.py       | 24 ++++++++++++------------
 dpnp/tests/test_sycl_queue.py   |  1 +
 dpnp/tests/test_usm_type.py     |  1 +
 5 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/dpnp/scipy/linalg/_decomp_lu.py b/dpnp/scipy/linalg/_decomp_lu.py
index 85161cec2c6f..c890bb8cc090 100644
--- a/dpnp/scipy/linalg/_decomp_lu.py
+++ b/dpnp/scipy/linalg/_decomp_lu.py
@@ -52,8 +52,9 @@
 )
 
 
-def lu(a, permute_l=False, overwrite_a=False, check_finite=True,
-       p_indices=False):
+def lu(
+    a, permute_l=False, overwrite_a=False, check_finite=True, p_indices=False
+):
     """
     Compute LU decomposition of a matrix with partial pivoting.
 
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 0f6df9289a54..1a62e8da7212 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -137,7 +137,9 @@ def _get_real_dtype(res_type):
     """
 
     if dpnp.issubdtype(res_type, dpnp.complexfloating):
-        return dpnp.float32 if dpnp.dtype(res_type).itemsize <= 8 else dpnp.float64
+        return (
+            dpnp.float32 if dpnp.dtype(res_type).itemsize <= 8 else dpnp.float64
+        )
     return res_type
 
 
@@ -451,12 +453,12 @@ def _pivots_to_permutation(piv, m):
         j = piv[..., i : i + 1]
 
         # Gather the two values to be swapped.
-        val_i = perm[..., i : i + 1].copy()                   # slice (free)
-        val_j = dpnp.take_along_axis(perm, j, axis=-1)        # gather
+        val_i = perm[..., i : i + 1].copy()  # slice (free)
+        val_j = dpnp.take_along_axis(perm, j, axis=-1)  # gather
 
         # Perform the swap.
-        perm[..., i : i + 1] = val_j                           # slice assign
-        dpnp.put_along_axis(perm, j, val_i, axis=-1)           # scatter
+        perm[..., i : i + 1] = val_j  # slice assign
+        dpnp.put_along_axis(perm, j, val_i, axis=-1)  # scatter
 
     return perm
 
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index e92b9f65c182..39ddff426973 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2743,7 +2743,9 @@ def test_p_is_permutation(self, dtype):
 
         assert_allclose(P_np.sum(axis=0), numpy.ones(5, dtype=P_np.dtype))
         assert_allclose(P_np.sum(axis=1), numpy.ones(5, dtype=P_np.dtype))
-        assert_allclose(P_np.T @ P_np, numpy.eye(5, dtype=P_np.dtype), atol=1e-15)
+        assert_allclose(
+            P_np.T @ P_np, numpy.eye(5, dtype=P_np.dtype), atol=1e-15
+        )
 
     @pytest.mark.parametrize("dtype", get_float_complex_dtypes())
     def test_modes_consistency(self, dtype):
@@ -2825,9 +2827,7 @@ def test_empty_p_indices(self, shape):
         ],
     )
     def test_strided(self, sl):
-        base = self._make_nonsingular_np(
-            (7, 7), dpnp.default_float_type(), "F"
-        )
+        base = self._make_nonsingular_np((7, 7), dpnp.default_float_type(), "F")
         a_np = base[sl]
         a_dp = dpnp.array(a_np)
 
@@ -2859,9 +2859,7 @@ def test_1d_input_raises(self):
     @pytest.mark.parametrize("bad", [numpy.inf, -numpy.inf, numpy.nan])
     def test_check_finite_raises(self, bad):
         a_dp = dpnp.array([[1.0, 2.0], [3.0, bad]], order="F")
-        assert_raises(
-            ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True
-        )
+        assert_raises(ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True)
 
     def test_check_finite_disabled(self):
         a_dp = dpnp.array([[1.0, numpy.nan], [3.0, 4.0]])
@@ -2976,8 +2974,12 @@ def test_modes_consistency_batched(self, dtype):
         a_np = self._make_nonsingular_nd_np((3, 4, 4), dtype, "F")
         a_dp = dpnp.array(a_np, order="F")
         A_cast = a_dp.astype(
-            dpnp.complex128 if dpnp.issubdtype(dtype, dpnp.complexfloating)
-            else dpnp.float64, copy=False
+            (
+                dpnp.complex128
+                if dpnp.issubdtype(dtype, dpnp.complexfloating)
+                else dpnp.float64
+            ),
+            copy=False,
         )
 
         P, L, U = dpnp.scipy.linalg.lu(a_dp)
@@ -3057,9 +3059,7 @@ def test_singular_matrix(self):
     def test_check_finite_raises(self):
         a = dpnp.ones((2, 3, 3), dtype=dpnp.default_float_type(), order="F")
         a[1, 0, 0] = dpnp.nan
-        assert_raises(
-            ValueError, dpnp.scipy.linalg.lu, a, check_finite=True
-        )
+        assert_raises(ValueError, dpnp.scipy.linalg.lu, a, check_finite=True)
 
 
 class TestMatrixPower:
diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 77d659305c06..b0155658a4df 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1665,6 +1665,7 @@ def test_lu_factor(self, data, device):
         for param in result:
             param_queue = param.sycl_queue
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
+
     def test_lu(self, data, device):
         a = dpnp.array(data, device=device)
         result = dpnp.scipy.linalg.lu(a)
diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 6eecbb243edc..59a6b05f688e 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -1538,6 +1538,7 @@ def test_lu(self, data, usm_type):
         assert a.usm_type == usm_type
         for param in result:
             assert param.usm_type == a.usm_type
+
     def test_lu_factor(self, data, usm_type):
         a = dpnp.array(data, usm_type=usm_type)
         result = dpnp.scipy.linalg.lu_factor(a)

From 1f6c92849e72ff3c1165844ead66ea3d85efecda Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Sun, 1 Mar 2026 19:59:58 +0000
Subject: [PATCH 04/23] fix formatting and apply plint

---
 dpnp/scipy/linalg/_decomp_lu.py |  10 +--
 dpnp/scipy/linalg/_utils.py     | 126 +++++++++++++-------------------
 2 files changed, 55 insertions(+), 81 deletions(-)

diff --git a/dpnp/scipy/linalg/_decomp_lu.py b/dpnp/scipy/linalg/_decomp_lu.py
index c890bb8cc090..823b2fccc230 100644
--- a/dpnp/scipy/linalg/_decomp_lu.py
+++ b/dpnp/scipy/linalg/_decomp_lu.py
@@ -323,13 +323,13 @@ def lu_solve(lu_and_piv, b, trans=0, overwrite_b=False, check_finite=True):
 
     """
 
-    lu, piv = lu_and_piv
-    dpnp.check_supported_arrays_type(lu, piv, b)
-    assert_stacked_2d(lu)
-    assert_stacked_square(lu)
+    lu_matrix, piv = lu_and_piv
+    dpnp.check_supported_arrays_type(lu_matrix, piv, b)
+    assert_stacked_2d(lu_matrix)
+    assert_stacked_square(lu_matrix)
 
     return dpnp_lu_solve(
-        lu,
+        lu_matrix,
         piv,
         b,
         trans=trans,
diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 1a62e8da7212..76e0db6752b0 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -596,126 +596,100 @@ def dpnp_lu(
             if not dpnp.isfinite(a).all():
                 raise ValueError("array must not contain infs or NaNs")
 
-        _L = dpnp.ones(
+        low = dpnp.ones(
             a.shape,
             dtype=res_type,
             usm_type=a_usm_type,
             sycl_queue=a_sycl_queue,
         )
-        _U = dpnp.array(a, dtype=res_type)
-
-        if permute_l:
-            return _L.copy(), _U
-
-        if p_indices:
-            _p = dpnp.zeros(
-                (*batch_shape, 1),
-                dtype=dpnp.int64,
-                usm_type=a_usm_type,
-                sycl_queue=a_sycl_queue,
-            )
-            return _p, _L, _U
-
-        _P = dpnp.ones(
-            a.shape,
-            dtype=real_type,
+        up = dpnp.array(a, dtype=res_type)
+        inv_perm = dpnp.zeros(
+            (*batch_shape, 1),
+            dtype=dpnp.int64,
             usm_type=a_usm_type,
             sycl_queue=a_sycl_queue,
         )
-        return _P, _L, _U
 
     # ---- Fast path: empty arrays ----
-    if a.size == 0:
-        _L = dpnp.empty(
+    elif a.size == 0:
+        low = dpnp.empty(
             (*batch_shape, m, k),
             dtype=res_type,
             usm_type=a_usm_type,
             sycl_queue=a_sycl_queue,
         )
-        _U = dpnp.empty(
+        up = dpnp.empty(
             (*batch_shape, k, n),
             dtype=res_type,
             usm_type=a_usm_type,
             sycl_queue=a_sycl_queue,
         )
-
-        if permute_l:
-            return _L, _U
-
-        if p_indices:
-            _p = dpnp.empty(
-                (*batch_shape, m),
-                dtype=dpnp.int64,
-                usm_type=a_usm_type,
-                sycl_queue=a_sycl_queue,
-            )
-            return _p, _L, _U
-
-        _P = dpnp.empty(
-            (*batch_shape, m, m),
-            dtype=real_type,
+        inv_perm = dpnp.empty(
+            (*batch_shape, m),
+            dtype=dpnp.int64,
             usm_type=a_usm_type,
             sycl_queue=a_sycl_queue,
         )
-        return _P, _L, _U
 
     # ---- General case: LAPACK factorization ----
-    lu_compact, piv = dpnp_lu_factor(
-        a, overwrite_a=overwrite_a, check_finite=check_finite
-    )
-
-    # ---- Extract L: lower-triangular with unit diagonal ----
-    # L has shape (..., M, K).
-    _L = dpnp.tril(lu_compact[..., :, :k], k=-1)
-    _L += dpnp.eye(
-        m,
-        k,
-        dtype=lu_compact.dtype,
-        usm_type=a_usm_type,
-        sycl_queue=a_sycl_queue,
-    )
-
-    # ---- Extract U: upper-triangular ----
-    # U has shape (..., K, N).
-    _U = dpnp.triu(lu_compact[..., :k, :])
-
-    # ---- Convert pivot indices → row permutation ----
-    # ``perm`` (forward): A[perm] = L @ U.
-    # This is the only step that requires a host transfer because the
-    # sequential swap semantics of LAPACK pivots cannot be parallelised.
-    # Only the small pivot array (min(M, N) elements per slice) is
-    # transferred; all subsequent work stays on the device.
-    perm = _pivots_to_permutation(piv, m)
+    else:
+        lu_compact, piv = dpnp_lu_factor(
+            a, overwrite_a=overwrite_a, check_finite=check_finite
+        )
 
-    # ``inv_perm`` (inverse): A = L[inv_perm] @ U.
-    # This is SciPy's ``p_indices`` convention.
-    # ``dpnp.argsort`` is an efficient on-device O(M log M) operation
-    # that avoids a second host round-trip.
-    inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
+        # ---- Extract L: lower-triangular with unit diagonal ----
+        # L has shape (..., M, K).
+        low = dpnp.tril(lu_compact[..., :, :k], k=-1)
+        low += dpnp.eye(
+            m,
+            k,
+            dtype=lu_compact.dtype,
+            usm_type=a_usm_type,
+            sycl_queue=a_sycl_queue,
+        )
 
+        # ---- Extract U: upper-triangular ----
+        # U has shape (..., K, N).
+        up = dpnp.triu(lu_compact[..., :k, :])
+
+        # ---- Convert pivot indices → row permutation ----
+        # ``perm`` (forward): A[perm] = L @ U.
+        # This is the only step that requires a host transfer because the
+        # sequential swap semantics of LAPACK pivots cannot be parallelised.
+        # Only the small pivot array (min(M, N) elements per slice) is
+        # transferred; all subsequent work stays on the device.
+        perm = _pivots_to_permutation(piv, m)
+
+        # ``inv_perm`` (inverse): A = L[inv_perm] @ U.
+        # This is SciPy's ``p_indices`` convention.
+        # ``dpnp.argsort`` is an efficient on-device O(M log M) operation
+        # that avoids a second host round-trip.
+        inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
+
+    # ---- Assemble output (SciPy convention) ----
     if permute_l:
         # Return (PL, U) where PL = P @ L = L[inv_perm].
         # A = PL @ U directly.
-        _PL = _apply_permutation_to_rows(_L, inv_perm)
-        return _PL, _U
+        perm_low = _apply_permutation_to_rows(low, inv_perm)
+        return perm_low, up
 
     if p_indices:
         # SciPy convention: A = L[inv_perm] @ U.
-        return inv_perm, _L, _U
+        return inv_perm, low, up
 
     # ---- Build full permutation matrix P = I[inv_perm] ----
     # P has shape (..., M, M) with real dtype (SciPy convention).
     # The gather from an identity matrix is efficient on device:
     # each output row selects one row of the identity (one hot encoding).
-    _I = dpnp.eye(
+    eye_m = dpnp.eye(
         m,
         dtype=real_type,
         usm_type=a_usm_type,
         sycl_queue=a_sycl_queue,
     )
-    _P = _apply_permutation_to_rows(_I, inv_perm)
+    perm_matrix = _apply_permutation_to_rows(eye_m, inv_perm)
 
-    return _P, _L, _U
+    return perm_matrix, low, up
 
 
 def dpnp_lu_solve(lu, piv, b, trans=0, overwrite_b=False, check_finite=True):

From 67314b7a3de00d6e590cb6dc8e309844fda0f66e Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 2 Mar 2026 09:18:08 -0600
Subject: [PATCH 05/23] Update dpnp/scipy/linalg/_utils.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/scipy/linalg/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 76e0db6752b0..00dddd78bed1 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -112,7 +112,7 @@ def _apply_permutation_to_rows(mat, perm_indices):
         return mat[perm_indices]
 
     # Batched case: ensure *mat* has the same batch dimensions as
-    # *perm_indices*.  This is needed, for example, when permuting
+    # *perm_indices*. This is needed, for example, when permuting
     # a shared identity matrix across a batch.
     target_shape = perm_indices.shape[:-1] + mat.shape[-2:]
     if mat.shape != target_shape:

From 384618b0c2ee2d90decc1cdc7de02f7a0ad23fc1 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Mon, 2 Mar 2026 09:18:32 -0600
Subject: [PATCH 06/23] Update dpnp/scipy/linalg/_utils.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/scipy/linalg/_utils.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 00dddd78bed1..12b32fb5dc0b 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -596,12 +596,7 @@ def dpnp_lu(
             if not dpnp.isfinite(a).all():
                 raise ValueError("array must not contain infs or NaNs")
 
-        low = dpnp.ones(
-            a.shape,
-            dtype=res_type,
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
+        low = dpnp.ones_like(a, dtype=res_type)
         up = dpnp.array(a, dtype=res_type)
         inv_perm = dpnp.zeros(
             (*batch_shape, 1),

From 5b83946c5f7c05c0cf77afc32c1ae798cdba3ed4 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 2 Mar 2026 15:28:35 +0000
Subject: [PATCH 07/23] udapte CHANGELOG

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f177be311f84..cef92676b708 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.ndarray.__bytes__` method [#2671](https://github.com/IntelPython/dpnp/pull/2671)
 * Added implementation of `dpnp.divmod` [#2674](https://github.com/IntelPython/dpnp/pull/2674)
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
+* Added implementation of `dpnp.scipy.linalg.lu` for 2D inputs (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
 
 ### Changed
 

From 7161aef58e26983fcab25986cae5d4f7b376eeaa Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 2 Mar 2026 17:20:23 +0000
Subject: [PATCH 08/23] address comments from PR

---
 dpnp/scipy/linalg/_utils.py                   | 134 ++++++++----------
 .../linalg_tests/test_decomp_lu.py            |  15 +-
 2 files changed, 65 insertions(+), 84 deletions(-)

diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 12b32fb5dc0b..f74c78381577 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -49,7 +49,7 @@
 import dpnp
 import dpnp.backend.extensions.lapack._lapack_impl as li
 from dpnp.dpnp_utils import get_usm_allocations
-from dpnp.linalg.dpnp_utils_linalg import _common_type
+from dpnp.linalg.dpnp_utils_linalg import _common_type, _real_type
 
 
 def _align_lu_solve_broadcast(lu, b):
@@ -125,24 +125,6 @@ def _apply_permutation_to_rows(mat, perm_indices):
     return dpnp.take_along_axis(mat, idx, axis=-2)
 
 
-def _get_real_dtype(res_type):
-    """
-    Return the real floating-point counterpart of *res_type*.
-
-    SciPy uses the real dtype for the permutation matrix ``P`` even when
-    the input is complex (``P`` only contains 0s and 1s).
-
-    ``float32`` and ``complex64`` → ``float32``;
-    ``float64`` and ``complex128`` → ``float64``.
-    """
-
-    if dpnp.issubdtype(res_type, dpnp.complexfloating):
-        return (
-            dpnp.float32 if dpnp.dtype(res_type).itemsize <= 8 else dpnp.float64
-        )
-    return res_type
-
-
 def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
     """SciPy-compatible LU factorization for batched inputs."""
 
@@ -586,7 +568,7 @@ def dpnp_lu(
 
     # The permutation matrix P uses a real dtype (SciPy convention):
     # P only contains 0s and 1s, so complex storage would be wasteful.
-    real_type = _get_real_dtype(res_type)
+    real_type = _real_type(res_type)
 
     # ---- Fast path: scalar (1x1) matrices ----
     # For 1x1 input, P = I, L = I, U = A.  This avoids invoking LAPACK
@@ -596,70 +578,70 @@ def dpnp_lu(
             if not dpnp.isfinite(a).all():
                 raise ValueError("array must not contain infs or NaNs")
 
-        low = dpnp.ones_like(a, dtype=res_type)
-        up = dpnp.array(a, dtype=res_type)
-        inv_perm = dpnp.zeros(
-            (*batch_shape, 1),
-            dtype=dpnp.int64,
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
+        low = dpnp.ones_like(a, shape=a.shape, dtype=res_type)
+        up = dpnp.astype(a, res_type, copy=not overwrite_a)
+        inv_perm = dpnp.zeros_like(a, shape=(*batch_shape, 1), dtype=dpnp.int64)
+
+        if permute_l:
+            # P = I, so PL = L unchanged; no gather needed.
+            return low, up
+        if p_indices:
+            return inv_perm, low, up
+        # P = I exactly: return eye directly, no gather needed.
+        eye_m = dpnp.eye(
+            m, dtype=real_type, usm_type=a_usm_type, sycl_queue=a_sycl_queue
         )
+        return eye_m, low, up
 
     # ---- Fast path: empty arrays ----
-    elif a.size == 0:
-        low = dpnp.empty(
-            (*batch_shape, m, k),
-            dtype=res_type,
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
-        up = dpnp.empty(
-            (*batch_shape, k, n),
-            dtype=res_type,
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
-        inv_perm = dpnp.empty(
-            (*batch_shape, m),
-            dtype=dpnp.int64,
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
+    if a.size == 0:
+        low = dpnp.empty_like(a, shape=(*batch_shape, m, k), dtype=res_type)
+        up = dpnp.empty_like(a, shape=(*batch_shape, k, n), dtype=res_type)
+        inv_perm = dpnp.empty_like(a, shape=(*batch_shape, m), dtype=dpnp.int64)
+
+        if permute_l:
+            return low, up
+        if p_indices:
+            return inv_perm, low, up
+        # P is empty: allocate directly, no eye + gather needed.
+        perm_matrix = dpnp.empty_like(
+            a, shape=(*batch_shape, m, m), dtype=real_type
         )
+        return perm_matrix, low, up
 
     # ---- General case: LAPACK factorization ----
-    else:
-        lu_compact, piv = dpnp_lu_factor(
-            a, overwrite_a=overwrite_a, check_finite=check_finite
-        )
+    lu_compact, piv = dpnp_lu_factor(
+        a, overwrite_a=overwrite_a, check_finite=check_finite
+    )
 
-        # ---- Extract L: lower-triangular with unit diagonal ----
-        # L has shape (..., M, K).
-        low = dpnp.tril(lu_compact[..., :, :k], k=-1)
-        low += dpnp.eye(
-            m,
-            k,
-            dtype=lu_compact.dtype,
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
+    # ---- Extract L: lower-triangular with unit diagonal ----
+    # L has shape (..., M, K).
+    low = dpnp.tril(lu_compact[..., :, :k], k=-1)
+    low += dpnp.eye(
+        m,
+        k,
+        dtype=lu_compact.dtype,
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
 
-        # ---- Extract U: upper-triangular ----
-        # U has shape (..., K, N).
-        up = dpnp.triu(lu_compact[..., :k, :])
-
-        # ---- Convert pivot indices → row permutation ----
-        # ``perm`` (forward): A[perm] = L @ U.
-        # This is the only step that requires a host transfer because the
-        # sequential swap semantics of LAPACK pivots cannot be parallelised.
-        # Only the small pivot array (min(M, N) elements per slice) is
-        # transferred; all subsequent work stays on the device.
-        perm = _pivots_to_permutation(piv, m)
-
-        # ``inv_perm`` (inverse): A = L[inv_perm] @ U.
-        # This is SciPy's ``p_indices`` convention.
-        # ``dpnp.argsort`` is an efficient on-device O(M log M) operation
-        # that avoids a second host round-trip.
-        inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
+    # ---- Extract U: upper-triangular ----
+    # U has shape (..., K, N).
+    up = dpnp.triu(lu_compact[..., :k, :])
+
+    # ---- Convert pivot indices → row permutation ----
+    # ``perm`` (forward): A[perm] = L @ U.
+    # This is the only step that requires a host transfer because the
+    # sequential swap semantics of LAPACK pivots cannot be parallelised.
+    # Only the small pivot array (min(M, N) elements per slice) is
+    # transferred; all subsequent work stays on the device.
+    perm = _pivots_to_permutation(piv, m)
+
+    # ``inv_perm`` (inverse): A = L[inv_perm] @ U.
+    # This is SciPy's ``p_indices`` convention.
+    # ``dpnp.argsort`` is an efficient on-device O(M log M) operation
+    # that avoids a second host round-trip.
+    inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
 
     # ---- Assemble output (SciPy convention) ----
     if permute_l:
diff --git a/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py b/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py
index fb51c3e39244..440521419652 100644
--- a/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py
+++ b/dpnp/tests/third_party/cupyx/scipy_tests/linalg_tests/test_decomp_lu.py
@@ -124,7 +124,6 @@ def test_lu_factor_reconstruction_singular(self, dtype):
 )
 @testing.fix_random()
 @testing.with_requires("scipy")
-@pytest.mark.skip("lu() is not supported yet")
 class TestLU(unittest.TestCase):
 
     @testing.for_dtypes("fdFD")
@@ -132,7 +131,7 @@ def test_lu(self, dtype):
         a_cpu = testing.shaped_random(self.shape, numpy, dtype=dtype)
         a_gpu = cupy.asarray(a_cpu)
         result_cpu = scipy.linalg.lu(a_cpu, permute_l=self.permute_l)
-        result_gpu = cupy.linalg.lu(a_gpu, permute_l=self.permute_l)
+        result_gpu = cupy.scipy.linalg.lu(a_gpu, permute_l=self.permute_l)
         assert len(result_cpu) == len(result_gpu)
         if not self.permute_l:
             # check permutation matrix
@@ -140,22 +139,22 @@ def test_lu(self, dtype):
             result_gpu = list(result_gpu)
             P_cpu = result_cpu.pop(0)
             P_gpu = result_gpu.pop(0)
-            cupy.testing.assert_array_equal(P_gpu, P_cpu)
-        cupy.testing.assert_allclose(result_gpu[0], result_cpu[0], atol=1e-5)
-        cupy.testing.assert_allclose(result_gpu[1], result_cpu[1], atol=1e-5)
+            testing.assert_array_equal(P_gpu, P_cpu)
+        testing.assert_allclose(result_gpu[0], result_cpu[0], atol=1e-5)
+        testing.assert_allclose(result_gpu[1], result_cpu[1], atol=1e-5)
 
     @testing.for_dtypes("fdFD")
     def test_lu_reconstruction(self, dtype):
         m, n = self.shape
         A = testing.shaped_random(self.shape, cupy, dtype=dtype)
         if self.permute_l:
-            PL, U = cupy.linalg.lu(A, permute_l=self.permute_l)
+            PL, U = cupy.scipy.linalg.lu(A, permute_l=self.permute_l)
             PLU = PL @ U
         else:
-            P, L, U = cupy.linalg.lu(A, permute_l=self.permute_l)
+            P, L, U = cupy.scipy.linalg.lu(A, permute_l=self.permute_l)
             PLU = P @ L @ U
         # check that reconstruction is close to original
-        cupy.testing.assert_allclose(PLU, A, atol=1e-5)
+        testing.assert_allclose(PLU, A, atol=1e-5)
 
 
 @testing.parameterize(

From ac8117ba370162c6c8a7314cda821148dc7d9033 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 2 Mar 2026 17:37:39 +0000
Subject: [PATCH 09/23] Fix pylint multiple returns

---
 dpnp/scipy/linalg/_utils.py | 94 ++++++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 39 deletions(-)

diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index f74c78381577..7f9dff4b2847 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -539,6 +539,32 @@ def dpnp_lu_factor(a, overwrite_a=False, check_finite=True):
     return (a_h, ipiv_h)
 
 
+def _assemble_lu_output(
+    low,
+    up,
+    inv_perm,
+    permute_l,
+    p_indices,
+    m,
+    real_type,
+    a_usm_type,
+    a_sycl_queue,
+):
+    """Select and build the correct dpnp_lu return value."""
+    if permute_l:
+        return _apply_permutation_to_rows(low, inv_perm), up
+    if p_indices:
+        return inv_perm, low, up
+    eye_m = dpnp.eye(
+        m, dtype=real_type, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+    )
+    return (
+        _apply_permutation_to_rows(eye_m, inv_perm),
+        low,
+        up,
+    )  # perm_matrix, L, U
+
+
 def dpnp_lu(
     a,
     overwrite_a=False,
@@ -582,32 +608,34 @@ def dpnp_lu(
         up = dpnp.astype(a, res_type, copy=not overwrite_a)
         inv_perm = dpnp.zeros_like(a, shape=(*batch_shape, 1), dtype=dpnp.int64)
 
-        if permute_l:
-            # P = I, so PL = L unchanged; no gather needed.
-            return low, up
-        if p_indices:
-            return inv_perm, low, up
-        # P = I exactly: return eye directly, no gather needed.
-        eye_m = dpnp.eye(
-            m, dtype=real_type, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+        return _assemble_lu_output(
+            low,
+            up,
+            inv_perm,
+            permute_l,
+            p_indices,
+            m,
+            real_type,
+            a_usm_type,
+            a_sycl_queue,
         )
-        return eye_m, low, up
 
     # ---- Fast path: empty arrays ----
     if a.size == 0:
         low = dpnp.empty_like(a, shape=(*batch_shape, m, k), dtype=res_type)
         up = dpnp.empty_like(a, shape=(*batch_shape, k, n), dtype=res_type)
         inv_perm = dpnp.empty_like(a, shape=(*batch_shape, m), dtype=dpnp.int64)
-
-        if permute_l:
-            return low, up
-        if p_indices:
-            return inv_perm, low, up
-        # P is empty: allocate directly, no eye + gather needed.
-        perm_matrix = dpnp.empty_like(
-            a, shape=(*batch_shape, m, m), dtype=real_type
+        return _assemble_lu_output(
+            low,
+            up,
+            inv_perm,
+            permute_l,
+            p_indices,
+            m,
+            real_type,
+            a_usm_type,
+            a_sycl_queue,
         )
-        return perm_matrix, low, up
 
     # ---- General case: LAPACK factorization ----
     lu_compact, piv = dpnp_lu_factor(
@@ -644,29 +672,17 @@ def dpnp_lu(
     inv_perm = dpnp.argsort(perm, axis=-1).astype(dpnp.int64)
 
     # ---- Assemble output (SciPy convention) ----
-    if permute_l:
-        # Return (PL, U) where PL = P @ L = L[inv_perm].
-        # A = PL @ U directly.
-        perm_low = _apply_permutation_to_rows(low, inv_perm)
-        return perm_low, up
-
-    if p_indices:
-        # SciPy convention: A = L[inv_perm] @ U.
-        return inv_perm, low, up
-
-    # ---- Build full permutation matrix P = I[inv_perm] ----
-    # P has shape (..., M, M) with real dtype (SciPy convention).
-    # The gather from an identity matrix is efficient on device:
-    # each output row selects one row of the identity (one hot encoding).
-    eye_m = dpnp.eye(
+    return _assemble_lu_output(
+        low,
+        up,
+        inv_perm,
+        permute_l,
+        p_indices,
         m,
-        dtype=real_type,
-        usm_type=a_usm_type,
-        sycl_queue=a_sycl_queue,
+        real_type,
+        a_usm_type,
+        a_sycl_queue,
     )
-    perm_matrix = _apply_permutation_to_rows(eye_m, inv_perm)
-
-    return perm_matrix, low, up
 
 
 def dpnp_lu_solve(lu, piv, b, trans=0, overwrite_b=False, check_finite=True):

From 3033c0b5be1482b3169815c36bb226db2d1ed87e Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 2 Mar 2026 18:17:11 +0000
Subject: [PATCH 10/23] Fix the test failing in onemkl tests

---
 dpnp/tests/test_sycl_queue.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index b0155658a4df..47eb75b05204 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1654,10 +1654,11 @@ def test_lstsq(self, m, n, nrhs, device):
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
             assert_sycl_queue_equal(param_queue, b.sycl_queue)
 
-    @pytest.mark.parametrize(
-        "data",
+    _LU_TEST_DATA = (
         [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
     )
+
+    @pytest.mark.parametrize("data", _LU_TEST_DATA)
     def test_lu_factor(self, data, device):
         a = dpnp.array(data, device=device)
         result = dpnp.scipy.linalg.lu_factor(a)
@@ -1666,6 +1667,7 @@ def test_lu_factor(self, data, device):
             param_queue = param.sycl_queue
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
 
+    @pytest.mark.parametrize("data", _LU_TEST_DATA)
     def test_lu(self, data, device):
         a = dpnp.array(data, device=device)
         result = dpnp.scipy.linalg.lu(a)

From cdb5e324bb94e1753985104e66a32677740a1571 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 2 Mar 2026 19:13:00 +0000
Subject: [PATCH 11/23] one more try..

---
 dpnp/tests/test_sycl_queue.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py
index 47eb75b05204..ebe356adbbfc 100644
--- a/dpnp/tests/test_sycl_queue.py
+++ b/dpnp/tests/test_sycl_queue.py
@@ -1654,11 +1654,10 @@ def test_lstsq(self, m, n, nrhs, device):
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
             assert_sycl_queue_equal(param_queue, b.sycl_queue)
 
-    _LU_TEST_DATA = (
+    @pytest.mark.parametrize(
+        "data",
         [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
     )
-
-    @pytest.mark.parametrize("data", _LU_TEST_DATA)
     def test_lu_factor(self, data, device):
         a = dpnp.array(data, device=device)
         result = dpnp.scipy.linalg.lu_factor(a)
@@ -1667,7 +1666,10 @@ def test_lu_factor(self, data, device):
             param_queue = param.sycl_queue
             assert_sycl_queue_equal(param_queue, a.sycl_queue)
 
-    @pytest.mark.parametrize("data", _LU_TEST_DATA)
+    @pytest.mark.parametrize(
+        "data",
+        [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
+    )
     def test_lu(self, data, device):
         a = dpnp.array(data, device=device)
         result = dpnp.scipy.linalg.lu(a)

From d3dc5da7e4433c41a0a02f5cc5d202b4951d074e Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Mon, 2 Mar 2026 20:16:51 +0000
Subject: [PATCH 12/23] fix one more...

---
 dpnp/tests/test_usm_type.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py
index 59a6b05f688e..b73eb67d51ee 100644
--- a/dpnp/tests/test_usm_type.py
+++ b/dpnp/tests/test_usm_type.py
@@ -1539,6 +1539,10 @@ def test_lu(self, data, usm_type):
         for param in result:
             assert param.usm_type == a.usm_type
 
+    @pytest.mark.parametrize(
+        "data",
+        [[[1.0, 2.0], [3.0, 5.0]], [[]], [[[1.0, 2.0], [3.0, 5.0]]], [[[]]]],
+    )
     def test_lu_factor(self, data, usm_type):
         a = dpnp.array(data, usm_type=usm_type)
         result = dpnp.scipy.linalg.lu_factor(a)

From 6e94734891eca13ffc1f89de9e84280ae1586655 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:46:16 -0600
Subject: [PATCH 13/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 39ddff426973..f9434d4e654a 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2643,7 +2643,7 @@ def test_lu_default(self, shape, order, dtype):
         [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
     )
     @pytest.mark.parametrize("order", ["C", "F"])
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True, no_bool=True))
     def test_lu_permute_l(self, shape, order, dtype):
         a_np = self._make_nonsingular_np(shape, dtype, order)
         a_dp = dpnp.array(a_np, order=order)

From aa985e1e185b822162ed8d36463b0b3907adf571 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:46:31 -0600
Subject: [PATCH 14/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index f9434d4e654a..8cbf8ddf1c37 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2835,6 +2835,7 @@ def test_strided(self, sl):
         A_rec = P @ L @ U
         assert dpnp.allclose(A_rec, a_dp, rtol=1e-6, atol=1e-6)
 
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
     def test_singular_matrix(self):
         a_np = numpy.array([[1.0, 2.0], [2.0, 4.0]])
         a_dp = dpnp.array(a_np)

From 05e3a0faddcda965d070cb3f8ba2cd348e51492c Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:46:42 -0600
Subject: [PATCH 15/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 8cbf8ddf1c37..35ef4cb16d6b 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -3047,6 +3047,7 @@ def test_strided(self):
             A_orig = dpnp.asnumpy(a_stride[i].astype(L.dtype, copy=False))
             assert_allclose(A_rec, A_orig, rtol=1e-6, atol=1e-6)
 
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
     def test_singular_matrix(self):
         a = dpnp.zeros((3, 2, 2), dtype=dpnp.default_float_type())
         a[0] = dpnp.array([[1.0, 2.0], [2.0, 4.0]])

From c7a117a4244b94b2d0f33c5dd53bfbb421d2eecb Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:46:55 -0600
Subject: [PATCH 16/23] Update CHANGELOG.md

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cef92676b708..61cde1ddfefc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,7 +26,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Added implementation of `dpnp.ndarray.__bytes__` method [#2671](https://github.com/IntelPython/dpnp/pull/2671)
 * Added implementation of `dpnp.divmod` [#2674](https://github.com/IntelPython/dpnp/pull/2674)
 * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595)
-* Added implementation of `dpnp.scipy.linalg.lu` for 2D inputs (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
+* Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787)
 
 ### Changed
 

From 4bc7226943737e0dc1ad86996eac062838f4b6b9 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:47:06 -0600
Subject: [PATCH 17/23] Update dpnp/scipy/linalg/_utils.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/scipy/linalg/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py
index 7f9dff4b2847..d083f1c2c0a2 100644
--- a/dpnp/scipy/linalg/_utils.py
+++ b/dpnp/scipy/linalg/_utils.py
@@ -604,7 +604,7 @@ def dpnp_lu(
             if not dpnp.isfinite(a).all():
                 raise ValueError("array must not contain infs or NaNs")
 
-        low = dpnp.ones_like(a, shape=a.shape, dtype=res_type)
+        low = dpnp.ones_like(a, dtype=res_type)
         up = dpnp.astype(a, res_type, copy=not overwrite_a)
         inv_perm = dpnp.zeros_like(a, shape=(*batch_shape, 1), dtype=dpnp.int64)
 

From 643f34d5ab5f477ce90205ee22cf9a5cf4a9b440 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:47:21 -0600
Subject: [PATCH 18/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 35ef4cb16d6b..7e9f5c658634 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2621,7 +2621,7 @@ def _make_nonsingular_np(shape, dtype, order):
         [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
     )
     @pytest.mark.parametrize("order", ["C", "F"])
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True, no_bool=True))
     def test_lu_default(self, shape, order, dtype):
         a_np = self._make_nonsingular_np(shape, dtype, order)
         a_dp = dpnp.array(a_np, order=order)

From 28f26ebf276502d2455061a343356d3b1b41fc84 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 09:47:33 -0600
Subject: [PATCH 19/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 7e9f5c658634..280000d9ec86 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2664,7 +2664,7 @@ def test_lu_permute_l(self, shape, order, dtype):
         [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
     )
     @pytest.mark.parametrize("order", ["C", "F"])
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True, no_bool=True))
     def test_lu_p_indices(self, shape, order, dtype):
         a_np = self._make_nonsingular_np(shape, dtype, order)
         a_dp = dpnp.array(a_np, order=order)

From aa00062fa22df5a8b21522431f5862b39d61b6dc Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Tue, 3 Mar 2026 11:27:03 -0600
Subject: [PATCH 20/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 280000d9ec86..dc4f768049e4 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2685,19 +2685,9 @@ def test_lu_p_indices(self, shape, order, dtype):
         A_cast = a_dp.astype(L.dtype, copy=False)
         assert_allclose(A_rec, dpnp.asnumpy(A_cast), rtol=1e-6, atol=1e-6)
 
-    @pytest.mark.parametrize(
-        "in_dtype, expected_p_dtype",
-        [
-            (dpnp.float32, dpnp.float32),
-            (dpnp.float64, dpnp.float64),
-            (dpnp.complex64, dpnp.float32),
-            (dpnp.complex128, dpnp.float64),
-        ],
-    )
-    def test_p_matrix_dtype(self, in_dtype, expected_p_dtype):
-        if in_dtype in (dpnp.float64, dpnp.complex128):
-            if not has_support_aspect64():
-                pytest.skip("fp64 not supported on this device")
+    @pytest.mark.parametrize("in_dtype", get_float_complex_dtypes())
+    def test_p_matrix_dtype(self, in_dtype):
+        expected_p_dtype = numpy.dtype(in_dtype).char.lower()
 
         a_np = self._make_nonsingular_np((4, 4), in_dtype, "F")
         a_dp = dpnp.array(a_np, order="F")

From 9654380dd2b8aeee85b0781508216d0b835ddcd1 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Wed, 4 Mar 2026 00:14:34 -0600
Subject: [PATCH 21/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index dc4f768049e4..34e4111414e3 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2964,14 +2964,6 @@ def test_overwrite_a(self, dtype, order):
     def test_modes_consistency_batched(self, dtype):
         a_np = self._make_nonsingular_nd_np((3, 4, 4), dtype, "F")
         a_dp = dpnp.array(a_np, order="F")
-        A_cast = a_dp.astype(
-            (
-                dpnp.complex128
-                if dpnp.issubdtype(dtype, dpnp.complexfloating)
-                else dpnp.float64
-            ),
-            copy=False,
-        )
 
         P, L, U = dpnp.scipy.linalg.lu(a_dp)
         PL, U2 = dpnp.scipy.linalg.lu(a_dp, permute_l=True)

From 5de153df74c0b5642636d1e8b6e5f96a7d52b6f6 Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com>
Date: Wed, 4 Mar 2026 00:16:26 -0600
Subject: [PATCH 22/23] Update dpnp/tests/test_linalg.py

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 34e4111414e3..2550576ccb2e 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2683,7 +2683,7 @@ def test_lu_p_indices(self, shape, order, dtype):
         U_np = dpnp.asnumpy(U)
         A_rec = L_np[p_np] @ U_np
         A_cast = a_dp.astype(L.dtype, copy=False)
-        assert_allclose(A_rec, dpnp.asnumpy(A_cast), rtol=1e-6, atol=1e-6)
+        assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
 
     @pytest.mark.parametrize("in_dtype", get_float_complex_dtypes())
     def test_p_matrix_dtype(self, in_dtype):

From 69c51d890cddeb00d0dbb4e3097659101cfc2afd Mon Sep 17 00:00:00 2001
From: Abhishek Bagusetty <abagusetty@anl.gov>
Date: Wed, 4 Mar 2026 06:31:18 +0000
Subject: [PATCH 23/23] Fix the test_linalg.py, add a test for scalar

---
 dpnp/tests/test_linalg.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
index 2550576ccb2e..7d8018fa83a2 100644
--- a/dpnp/tests/test_linalg.py
+++ b/dpnp/tests/test_linalg.py
@@ -2621,7 +2621,9 @@ def _make_nonsingular_np(shape, dtype, order):
         [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
     )
     @pytest.mark.parametrize("order", ["C", "F"])
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True, no_bool=True))
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
     def test_lu_default(self, shape, order, dtype):
         a_np = self._make_nonsingular_np(shape, dtype, order)
         a_dp = dpnp.array(a_np, order=order)
@@ -2643,7 +2645,9 @@ def test_lu_default(self, shape, order, dtype):
         [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
     )
     @pytest.mark.parametrize("order", ["C", "F"])
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True, no_bool=True))
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
     def test_lu_permute_l(self, shape, order, dtype):
         a_np = self._make_nonsingular_np(shape, dtype, order)
         a_dp = dpnp.array(a_np, order=order)
@@ -2664,7 +2668,9 @@ def test_lu_permute_l(self, shape, order, dtype):
         [(1, 1), (2, 2), (3, 3), (1, 5), (5, 1), (2, 5), (5, 2)],
     )
     @pytest.mark.parametrize("order", ["C", "F"])
-    @pytest.mark.parametrize("dtype", get_all_dtypes(no_none=True, no_bool=True))
+    @pytest.mark.parametrize(
+        "dtype", get_all_dtypes(no_none=True, no_bool=True)
+    )
     def test_lu_p_indices(self, shape, order, dtype):
         a_np = self._make_nonsingular_np(shape, dtype, order)
         a_dp = dpnp.array(a_np, order=order)
@@ -2678,10 +2684,7 @@ def test_lu_p_indices(self, shape, order, dtype):
         assert U.shape == (k, n)
         assert dpnp.issubdtype(p.dtype, dpnp.integer)
 
-        p_np = dpnp.asnumpy(p)
-        L_np = dpnp.asnumpy(L)
-        U_np = dpnp.asnumpy(U)
-        A_rec = L_np[p_np] @ U_np
+        A_rec = L[p] @ U
         A_cast = a_dp.astype(L.dtype, copy=False)
         assert dpnp.allclose(A_rec, A_cast, rtol=1e-6, atol=1e-6)
 
@@ -2852,6 +2855,12 @@ def test_check_finite_raises(self, bad):
         a_dp = dpnp.array([[1.0, 2.0], [3.0, bad]], order="F")
         assert_raises(ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True)
 
+    @pytest.mark.parametrize("bad", [numpy.inf, -numpy.inf, numpy.nan])
+    def test_check_finite_raises_scalar(self, bad):
+        # Covers the 1x1 scalar fast path in dpnp_lu
+        a_dp = dpnp.array([[bad]])
+        assert_raises(ValueError, dpnp.scipy.linalg.lu, a_dp, check_finite=True)
+
     def test_check_finite_disabled(self):
         a_dp = dpnp.array([[1.0, numpy.nan], [3.0, 4.0]])
         result = dpnp.scipy.linalg.lu(a_dp, check_finite=False)