From 886cb7e1bb562299f69f78c0a76e4b97e18fa887 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski <marcin.zalewski@gmail.com>
Date: Wed, 30 Jul 2025 12:36:51 -0700
Subject: [PATCH 1/3] v25.07.00

---
 CMakeLists.txt                                |   2 +-
 README.md                                     |   3 +-
 cmake/thirdparty/get_legate.cmake             |   1 +
 cmake/versions.json                           |  14 +-
 conda/conda-build/conda_build_config.yaml     |   6 +-
 conda/conda-build/meta.yaml                   |  15 +-
 examples/common.py                            | 263 +++++++-
 examples/matrix_power.py                      | 145 +++--
 examples/pde.py                               |  45 +-
 examples/spgemm_microbenchmark.py             |  94 +++
 examples/spmv_microbenchmark.py               |  70 ++
 legate_sparse/__init__.py                     |   1 +
 legate_sparse/base.py                         | 190 +++++-
 legate_sparse/config.py                       |  85 ++-
 legate_sparse/csr.py                          | 605 ++++++++++++++++--
 legate_sparse/dia.py                          | 215 ++++++-
 legate_sparse/gallery.py                      |  49 +-
 legate_sparse/install_info.pyi                |  16 +
 legate_sparse/io.py                           |  32 +
 legate_sparse/linalg.py                       | 290 +++++++--
 legate_sparse/module.py                       |  66 +-
 legate_sparse/types.py                        |  11 +
 legate_sparse/utils.py                        | 330 ++++++++--
 setup.py                                      |   5 +-
 src/legate_sparse/array/conv/csr_to_dense.cc  |   6 +-
 src/legate_sparse/array/conv/csr_to_dense.h   |   3 +-
 src/legate_sparse/array/conv/dense_to_csr.cc  |   7 +-
 src/legate_sparse/array/conv/dense_to_csr.h   |   6 +-
 .../array/conv/pos_to_coordinates.cc          |   6 +-
 .../array/conv/pos_to_coordinates.h           |   3 +-
 src/legate_sparse/array/csr/get_diagonal.cc   |   7 +-
 src/legate_sparse/array/csr/get_diagonal.h    |   3 +-
 src/legate_sparse/array/csr/indexing.cc       |   7 +-
 src/legate_sparse/array/csr/indexing.h        |   3 +-
 .../array/csr/spgemm_csr_csr_csr.cc           |   7 +-
 .../array/csr/spgemm_csr_csr_csr.h            |   9 +-
 src/legate_sparse/array/csr/spmv.cc           |   7 +-
 src/legate_sparse/array/csr/spmv.h            |   3 +-
 src/legate_sparse/array/util/scale_rect.cc    |   6 +-
 src/legate_sparse/array/util/scale_rect.h     |   3 +-
 src/legate_sparse/array/util/unzip_rect.cc    |   6 +-
 src/legate_sparse/array/util/unzip_rect.h     |   3 +-
 src/legate_sparse/array/util/zip_to_rect.cc   |   6 +-
 src/legate_sparse/array/util/zip_to_rect.h    |   3 +-
 src/legate_sparse/cudalibs.cu                 |  12 +-
 src/legate_sparse/io/mtx_to_coo.cc            |  20 +-
 src/legate_sparse/io/mtx_to_coo.h             |   3 +-
 src/legate_sparse/linalg/axpby.cc             |   6 +-
 src/legate_sparse/linalg/axpby.h              |   3 +-
 src/legate_sparse/mapper/mapper.cc            |  20 +-
 .../partition/fast_image_partition.cc         |   8 +-
 .../partition/fast_image_partition.h          |   3 +-
 src/legate_sparse/util/upcast_future.cc       |   8 +-
 src/legate_sparse/util/upcast_future.h        |   3 +-
 test.py                                       |   3 +-
 tests/integration/conftest.py                 |  54 +-
 tests/integration/test_cg_solve.py            |  74 ++-
 tests/integration/test_comparison.py          |  46 +-
 tests/integration/test_diagonal.py            |  30 +
 tests/integration/test_gmres_solve.py         |  22 +
 tests/integration/test_indexing.py            | 107 +++-
 tests/integration/test_io.py                  |  48 ++
 tests/integration/test_spgemm.py              |  42 ++
 tests/integration/test_spmv.py                |  51 ++
 tests/integration/utils/banded_matrix.py      |  51 +-
 tests/integration/utils/sample.py             | 128 ++++
 66 files changed, 3035 insertions(+), 364 deletions(-)
 create mode 100644 legate_sparse/install_info.pyi

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82e98886..c32254c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ include(rapids-find)
 ###################################
 # Project
 
-set(legate_sparse_version 25.03.00)
+set(legate_sparse_version 25.07.00)
 
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
 set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g")
diff --git a/README.md b/README.md
index 5eb4af78..b03ea8c6 100644
--- a/README.md
+++ b/README.md
@@ -29,8 +29,7 @@ for [NumPy](https://numpy.org/doc/stable/reference/index.html#reference), to
 enable writing programs that operate on distributed dense and sparse arrays.
 Take a look at the `examples` directory for some applications that can 
 use Legate Sparse. We have implemented
-an explicit partial-differential equation (PDE) [solver](examples/pde.py) 
-and [Geometric multi-grid](examples/gmg.py) solver. 
+an explicit partial-differential equation (PDE) [solver](examples/pde.py).
 More complex and interesting applications are on the way -- stay tuned!
 
 Legate Sparse is currently in alpha and supports a subset of APIs 
diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake
index 709577a1..727671fd 100644
--- a/cmake/thirdparty/get_legate.cmake
+++ b/cmake/thirdparty/get_legate.cmake
@@ -24,6 +24,7 @@ function(find_or_configure_legate)
   include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
   rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all)
 
+  set(version ${PKG_VERSION})
   set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL})
   if(PKG_BRANCH)
     set(git_branch "${PKG_BRANCH}")
diff --git a/cmake/versions.json b/cmake/versions.json
index 6c8498d4..6c5440f4 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -3,22 +3,22 @@
     "legate" : {
       "repo": "legate.internal",
       "org": "nv-legate",
-      "version": "25.03.02",
-      "git_url" : "git@github.com:nv-legate/legate.internal.git",
+      "version": "25.07.00",
+      "git_url" : "git@github.com:nv-legate/legate.git",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "75dc0a92bbd2dfb79b6b680a0f37cbd0370d0181",
+      "git_tag" : "a46dc3d5b176ff9546bc831409c394c1bbc3b936",
       "anaconda_label": "main"
     },
     "cupynumeric" : {
       "repo": "cupynumeric.internal",
       "org": "nv-legate",
-      "version": "25.03.02",
-      "git_url" : "git@github.com:nv-legate/cupynumeric.internal",
+      "version": "25.07.00",
+      "git_url" : "git@github.com:nv-legate/cupynumeric",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "1fa45603c560068508c3be2e0df45aec62359019",
-      "anaconda_label": "experimental"
+      "git_tag" : "6132d8450049a7abd7786fb4d60444eb5b4e25db",
+      "anaconda_label": "main"
     }
   }
 }
diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml
index 79750a86..ada8dda2 100644
--- a/conda/conda-build/conda_build_config.yaml
+++ b/conda/conda-build/conda_build_config.yaml
@@ -6,12 +6,14 @@ upload_build:
   - false
 
 python:
-  - 3.10
   - 3.11
   - 3.12
+  - 3.13
 
 numpy_version:
-  - ">=1.22,<2"
+  # Not 2.1.0 which segfaults on asarray() sometimes, see
+  # https://github.com/numpy/numpy/pull/27249
+  - ">=1.22,!=2.1.0"
 
 cmake_version:
   - ">=3.20.1,!=3.23.0"
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index 03d1f9d3..9bdad28f 100644
--- a/conda/conda-build/meta.yaml
+++ b/conda/conda-build/meta.yaml
@@ -7,14 +7,6 @@
   {# We need to have a default value for the initial pass over the recipe #}
   {% set gpu_enabled_bool = false %}
 {% endif %}
-{% if upload_build == "true" %}
-  {% set upload_build_bool = true %}
-{% elif upload_build == "false" %}
-  {% set upload_build_bool = false %}
-{% else %}
-  {# We need to have a default value for the initial pass over the recipe #}
-  {% set upload_build_bool = false %}
-{% endif %}
 ## The placeholder version is strictly for making two-pass conda build process.
 ## It should not be used for any other purpose, and this is not a default version.
 {% set placeholder_version = '0.0.0.dev' %}
@@ -68,9 +60,9 @@ build:
 
 ## Create legate/cupynumeric version and build string
 {% set legate_version = os.environ.get("LEGATE_VERSION", "1.0.0") %}
-{% set legate_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("LEGATE_BUILDSTR", ""), cpu_gpu_tag]) %}
+{% set legate_buildstr = "_".join(["py" ~ py_version, "*" ~ cpu_gpu_tag.strip('_'), os.environ.get("LEGATE_BUILDSTR", "") ]) %}
 {% set cupynumeric_version = os.environ.get("CUPYNUMERIC_VERSION", "1.0.0") %}
-{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("CUPYNUMERIC_BUILDSTR", ""), cpu_gpu_tag]) %}
+{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, cpu_gpu_tag, os.environ.get("CUPYNUMERIC_BUILDSTR", "")]) %}
 
 {% if use_local_path is not defined %}
 # use git hash
@@ -126,7 +118,7 @@ requirements:
     #- libcurand-dev
     - openblas =* =*openmp*
     - llvm-openmp
-    - legate ={{ legate_version }}={{ legate_buildstr }}
+    - legate ={{ legate_version }}=*{{ legate_buildstr }}
     - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }}
 {% if gpu_enabled_bool %}
     # cupynumeric could be only in the run section and we could have just legate
@@ -146,7 +138,6 @@ requirements:
     - numpy {{ numpy_version }}
     - scipy
     - openblas =* =*openmp*
-    - legate ={{ legate_version }}={{ legate_buildstr }}
     - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }}
 {% if gpu_enabled_bool %}
     - libnvjitlink
diff --git a/examples/common.py b/examples/common.py
index b679e5b9..99174ed6 100644
--- a/examples/common.py
+++ b/examples/common.py
@@ -20,6 +20,30 @@
 
 
 def get_arg_number(arg):
+    """Parse a string argument that may contain size suffixes.
+
+    Parameters
+    ----------
+    arg : str
+        String argument that may end with 'k', 'm', or 'g' for
+        kilobytes, megabytes, or gigabytes respectively.
+
+    Returns
+    -------
+    int
+        The parsed number with appropriate multiplier applied.
+
+    Examples
+    --------
+    >>> get_arg_number("1024")
+    1024
+    >>> get_arg_number("1k")
+    1024
+    >>> get_arg_number("1m")
+    1048576
+    >>> get_arg_number("1g")
+    1073741824
+    """
     multiplier = 1
     arg = arg.lower()
     if len(arg) == 0:
@@ -38,28 +62,47 @@ def get_arg_number(arg):
 
 
 class Timer(Protocol):
+    """Protocol for timer implementations.
+
+    This protocol defines the interface that timer classes must implement
+    for measuring execution time in the examples.
+    """
+
     def start(self):
+        """Start timing."""
         ...
 
     def stop(self):
-        """
-        Blocks execution until everything before it has completed. Returns the
-        duration since the last call to start(), in milliseconds.
+        """Stop timing and return duration.
+
+        Blocks execution until everything before it has completed.
+
+        Returns
+        -------
+        float
+            Duration since the last call to start(), in milliseconds.
         """
         ...
 
 
 class LegateTimer(Timer):
+    """Timer implementation using Legate's timing facilities.
+
+    This timer uses Legate's internal timing mechanism for accurate
+    measurement of GPU operations.
+    """
+
     def __init__(self):
         self._start = None
 
     def start(self):
+        """Start timing using Legate's time function."""
         from legate.timing import time
 
         self._start = time()
 
-    # returns time in milliseconds
     def stop(self):
+        """Stop timing and return duration in milliseconds."""
         from legate.timing import time
 
         _end = time()
@@ -67,16 +110,24 @@ def stop(self):
 
 
 class CuPyTimer(Timer):
+    """Timer implementation using CuPy's CUDA events.
+
+    This timer uses CUDA events for accurate measurement of GPU operations
+    in CuPy applications.
+    """
+
     def __init__(self):
         self._start_event = None
 
     def start(self):
+        """Start timing using CUDA events."""
         from cupy import cuda
 
         self._start_event = cuda.Event()
         self._start_event.record()
 
     def stop(self):
+        """Stop timing and return duration in milliseconds."""
         from cupy import cuda
 
         end_event = cuda.Event()
@@ -86,15 +137,23 @@ def stop(self):
 
 
 class NumPyTimer(Timer):
+    """Timer implementation using Python's high-resolution timer.
+
+    This timer uses Python's perf_counter_ns for accurate measurement
+    of CPU operations in NumPy/SciPy applications.
+    """
+
     def __init__(self):
         self._start_time = None
 
     def start(self):
+        """Start timing using perf_counter_ns."""
         from time import perf_counter_ns
 
         self._start_time = perf_counter_ns() / 1000.0
 
     def stop(self):
+        """Stop timing and return duration in milliseconds."""
         from time import perf_counter_ns
 
         end_time = perf_counter_ns() / 1000.0
@@ -105,27 +164,61 @@ def stop(self):
 # manager so that we can run both CuPy and SciPy
 # programs with resource scoping.
 class DummyScope:
+    """No-op context manager for resource scoping.
+
+    This class provides a dummy context manager that does nothing,
+    allowing the same code to run with both CuPy and SciPy programs
+    that may or may not use resource scoping.
+    """
+
     def __init__(self):
         ...
 
     def __enter__(self):
+        """Enter the context (no-op)."""
         ...
 
     def __exit__(self, _, __, ___):
+        """Exit the context (no-op)."""
         ...
 
     def __getitem__(self, item):
+        """Return self for any indexing (no-op)."""
         return self
 
     def count(self, _):
+        """Return 1 for any count operation."""
         return 1
 
     @property
     def preferred_kind(self):
+        """Return None for preferred kind."""
         return None
 
 
 def get_phase_procs(use_legate: bool):
+    """Get processor configurations for different phases of computation.
+
+    Parameters
+    ----------
+    use_legate : bool
+        Whether to use Legate-specific processor configuration.
+
+    Returns
+    -------
+    tuple
+        (build_procs, solve_procs) - processor configurations for
+        build and solve phases respectively.
+
+    Notes
+    -----
+    When use_legate is True, this function queries the available
+    processors and assigns them to different phases:
+    - Build phase: Prefers CPUs, then OpenMP processors, then GPUs
+    - Solve phase: Prefers GPUs, then OpenMP processors, then CPUs
+
+    When use_legate is False, returns DummyScope objects.
+    """
     if use_legate:
         from legate.core import TaskTarget, get_machine
 
@@ -160,6 +253,27 @@ def get_phase_procs(use_legate: bool):
 
 
 def parse_common_args():
+    """Parse common command line arguments for example scripts.
+
+    Returns
+    -------
+    tuple
+        (package, timer, np, sparse, linalg, use_legate) where:
+        - package: str - the selected package ("legate", "cupy", or "scipy")
+        - timer: Timer - appropriate timer implementation
+        - np: module - numpy/cupy/cupynumeric module
+        - sparse: module - sparse matrix module
+        - linalg: module - linear algebra module
+        - use_legate: bool - whether Legate is being used
+
+    Notes
+    -----
+    This function sets up the global environment with the appropriate
+    modules based on the --package argument. It supports:
+    - "legate": Uses cupynumeric, legate_sparse, and legate_sparse.linalg
+    - "cupy": Uses cupy, cupyx.scipy.sparse, and cupyx.scipy.sparse.linalg
+    - "scipy": Uses numpy, scipy.sparse, and scipy.sparse.linalg
+    """
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--package",
@@ -204,6 +318,44 @@ def parse_common_args():
 # `diags` construct csr from dia array, while when from_diags=False
 # we construct csr arrya directly - might be slightly faster
 def banded_matrix(N, nnz_per_row, from_diags=False):
+    """Construct a banded matrix with 1.0 as values.
+
+    Parameters
+    ----------
+    N : int
+        Size of the square matrix (N x N).
+    nnz_per_row : int
+        Number of non-zeros per row. Must be odd.
+    from_diags : bool, optional
+        If True, construct using sparse.diags then convert to CSR.
+        If False, construct CSR array directly. Default is False.
+
+    Returns
+    -------
+    sparse matrix
+        A banded matrix in CSR format with 1.0 values.
+
+    Raises
+    ------
+    AssertionError
+        If N <= nnz_per_row or nnz_per_row is not odd.
+
+    Notes
+    -----
+    The matrix has a banded structure with nnz_per_row non-zeros per row,
+    centered around the main diagonal. The direct CSR construction method
+    (from_diags=False) may be slightly faster than the diags method.
+
+    Examples
+    --------
+    >>> A = banded_matrix(5, 3)
+    >>> print(A.toarray())
+    [[1. 1. 0. 0. 0.]
+     [1. 1. 1. 0. 0.]
+     [0. 1. 1. 1. 0.]
+     [0. 0. 1. 1. 1.]
+     [0. 0. 0. 1. 1.]]
+    """
     if from_diags:
         return sparse.diags(
             [1] * nnz_per_row,
@@ -248,6 +400,43 @@ def banded_matrix(N, nnz_per_row, from_diags=False):
 
 
 def stencil_grid(S, grid, dtype=None, format=None):
+    """Construct a sparse matrix resulting from a stencil
+    discretization on rectilinear grids.
+
+    Parameters
+    ----------
+    S : array_like
+        The stencil array defining the pattern of connections.
+    grid : tuple
+        Grid dimensions (e.g., (N, N) for 2D grid).
+    dtype : dtype, optional
+        Data type of the matrix. If None, uses S.dtype.
+    format : str, optional
+        Output format. If None, returns CSR format.
+
+    Returns
+    -------
+    sparse matrix
+        A sparse matrix in CSR format representing the stencil on the grid.
+
+    Notes
+    -----
+    This function constructs a sparse matrix that represents the application
+    of a stencil operator on a regular grid. The stencil defines the pattern
+    of connections between grid points.
+
+    The function handles:
+    - Boundary conditions by zeroing connections outside the grid
+    - Duplicate diagonals by summing their contributions
+    - Conversion to CSR format for efficient operations
+
+    Examples
+    --------
+    >>> # 5-point stencil for 2D grid
+    >>> S = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]])
+    >>> A = stencil_grid(S, (3, 3))
+    >>> print(A.toarray())
+    """
     N_v = int(numpy.prod(grid))  # number of vertices in the mesh
     N_s = int((S != 0).sum(dtype=int))  # number of nonzero stencil entries
 
@@ -309,6 +498,41 @@ def stencil_grid(S, grid, dtype=None, format=None):
 
 
 def poisson2D(N):
+    """Construct the 2D Poisson matrix.
+
+    Parameters
+    ----------
+    N : int
+        Grid size (N x N grid).
+
+    Returns
+    -------
+    sparse matrix
+        The 2D Poisson matrix in CSR format.
+
+    Notes
+    -----
+    This constructs the standard 5-point stencil discretization of
+    the 2D Poisson equation -u_xx - u_yy = f on an N x N grid.
+
+    The matrix has the following structure:
+    - Main diagonal: 4.0
+    - Off-diagonals: -1.0 for horizontal and vertical connections
+
+    Examples
+    --------
+    >>> A = poisson2D(3)
+    >>> print(A.toarray())
+    [[ 4. -1.  0. -1.  0.  0.  0.  0.  0.]
+     [-1.  4. -1.  0. -1.  0.  0.  0.  0.]
+     [ 0. -1.  4.  0.  0. -1.  0.  0.  0.]
+     [-1.  0.  0.  4. -1.  0. -1.  0.  0.]
+     [ 0. -1.  0. -1.  4. -1.  0. -1.  0.]
+     [ 0.  0. -1.  0. -1.  4.  0.  0. -1.]
+     [ 0.  0.  0. -1.  0.  0.  4. -1.  0.]
+     [ 0.  0.  0.  0. -1.  0. -1.  4. -1.]
+     [ 0.  0.  0.  0.  0. -1.  0. -1.  4.]]
+    """
     diag_size = N * N - 1
     first = np.full((N - 1), -1.0)
     chunks = np.concatenate([np.zeros(1), first])
@@ -326,6 +550,37 @@ def poisson2D(N):
 
 
 def diffusion2D(N, epsilon=1.0, theta=0.0):
+    """Construct a 2D diffusion matrix with anisotropy.
+
+    Parameters
+    ----------
+    N : int
+        Grid size (N x N grid).
+    epsilon : float, optional
+        Anisotropy parameter. Default is 1.0 (isotropic).
+    theta : float, optional
+        Rotation angle in radians. Default is 0.0.
+
+    Returns
+    -------
+    sparse matrix
+        The 2D diffusion matrix in CSR format.
+
+    Notes
+    -----
+    This constructs a 9-point stencil for the anisotropic diffusion equation:
+    -div(K * grad(u)) = f
+
+    where K is a diffusion tensor that depends on epsilon and theta.
+    The stencil coefficients are computed based on the rotated diffusion tensor.
+
+    Examples
+    --------
+    >>> # Isotropic diffusion
+    >>> A = diffusion2D(3, epsilon=1.0, theta=0.0)
+    >>> # Anisotropic diffusion
+    >>> A = diffusion2D(3, epsilon=0.1, theta=np.pi/4)
+    """
     eps = float(epsilon)  # for brevity
     theta = float(theta)
 
diff --git a/examples/matrix_power.py b/examples/matrix_power.py
index a43249c0..cc52c08b 100644
--- a/examples/matrix_power.py
+++ b/examples/matrix_power.py
@@ -12,9 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This example performs matrix power by repetitively multiplication. We assume
-# that the matrix is square, so the number of cols is same as the number of
-# rows in the matrix
+"""Sparse Matrix Power Microbenchmark.
+
+This script benchmarks sparse matrix power computation by performing repeated
+matrix multiplication (A^n) and measuring performance at each step. It supports:
+
+- Matrix generation with specified non-zeros per row or total non-zeros
+- Configurable number of matrix multiplications (power exponent)
+- Multiple backend support (Legate, CuPy, SciPy)
+
+Command line arguments:
+--nrows: Matrix size (supports k, m, g suffixes)
+--nnz-per-row: Number of non-zeros per row for generated matrix
+--nnz-total: Total number of non-zeros for generated matrix
+--k: Number of matrix multiplications to perform
+--nwarmups: Number of warmup iterations before timing
+--same-sparsity-for-cpu-and-gpu: Use NumPy for consistent sparsity patterns
+--random-seed: Random number seed for sparsity pattern generation
+--package: Backend to use (legate, cupy, scipy)
+"""
 
 import argparse
 from functools import reduce
@@ -31,17 +47,28 @@
 
 
 def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None):
-    """Return a CSR matrix with a prescribed number of nonzeros in each row.
-
-    Args:
-    ----
-
-    nrows: int
-        Number of rows in the matrix. Number of columns is same as number of rows
-    nnz_per_row: int
-        Desired number of nonzero entries in each row
-    dtype: npt.DTypeLike
-        Datatype of the values. This should be one of floating point datatypes
+    """Create a CSR matrix with a prescribed number of nonzeros in each row.
+
+    Parameters
+    ----------
+    nrows : int
+        Number of rows in the matrix. Number of columns is same as number of rows.
+    nnz_per_row : int
+        Desired number of nonzero entries in each row.
+    dtype : npt.DTypeLike, optional
+        Datatype of the values. Should be one of floating point datatypes.
+        Default is np.float32.
+
+    Returns
+    -------
+    sparse matrix
+        A CSR matrix with the specified sparsity pattern.
+
+    Notes
+    -----
+    This function creates a square matrix where each row has exactly
+    nnz_per_row non-zero entries. The column indices are randomly
+    generated and sorted within each row.
     """
     dtype = np.float32 if dtype is None else dtype
     ncols = nrows
@@ -58,18 +85,28 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike =
 
 
 def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None):
-    """Return a CSR matrix with a prescribed number of nonzeros in the matrix.
-
-    Args:
-    ----
-
-    nrows: int
-        Number of rows in the matrix. Number of columns is same as number of rows
-    nnz_total: int
-        Desired number of nonzero entries in the matrix with no expectation of
-        nonzeros in each row of the matrix
-    dtype: npt.DTypeLike
-        Datatype of the values. This should be one of floating point datatypes
+    """Create a CSR matrix with a prescribed total number of nonzeros.
+
+    Parameters
+    ----------
+    nrows : int
+        Number of rows in the matrix. Number of columns is same as number of rows.
+    nnz_total : int
+        Desired total number of nonzero entries in the matrix.
+    dtype : npt.DTypeLike, optional
+        Datatype of the values. Should be one of floating point datatypes.
+        Default is np.float32.
+
+    Returns
+    -------
+    sparse matrix
+        A CSR matrix with the specified total number of non-zeros.
+
+    Notes
+    -----
+    This function creates a square matrix with exactly nnz_total non-zero
+    entries distributed randomly across the matrix. There is no guarantee
+    about the number of non-zeros per row.
     """
     dtype = np.float32 if dtype is None else dtype
     ncols = nrows
@@ -86,21 +123,30 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None):
 # ------------------------
 
 
-def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4):
-    """Multiply matrix by self ntimes and print the time elapsed.
-    Args:
-    ----
-
-    A: csr_matrix
-        The input matrix
-    timer:
-        Instance of the timer class to measure elapsed time
-    ntimes:
-        Number of matrix multiplies or the exponent in A^n
-    nwarmups:
-        Number of warmup iterations before
+def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4):
+    """Compute A^k and measure performance.
+
+    Parameters
+    ----------
+    A : sparse matrix
+        The input matrix to compute A^k.
+    timer : Timer
+        Timer instance to measure elapsed time.
+    nwarmups : int, optional
+        Number of warmup iterations before timing. Default is 2.
+    k : int, optional
+        Number of matrix multiplies or the exponent in A^k. Default is 4.
+
+    Notes
+    -----
+    This function computes A^k by repeated matrix multiplication
+    and measures the time for each step. It prints detailed timing
+    information including:
+    - Matrix dimensions and sparsity
+    - Time for each multiplication step
+    - Time for copying intermediate results
+    - Overall sparsity of the final result
     """
-
     timer.start()
     B = A.copy()
     elapsed_time_init_copy = timer.stop()
@@ -108,10 +154,10 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4)
     for _ in range(nwarmups):
         output = A @ B
 
-    elapsed_time_spgemm = [-1.0] * ntimes
-    elapsed_time_copy = [-1.0] * ntimes
+    elapsed_time_spgemm = [-1.0] * k
+    elapsed_time_copy = [-1.0] * k
 
-    for hop in range(ntimes):
+    for hop in range(k):
         timer.start()
         output = A @ B
         elapsed_time_spgemm[hop] = timer.stop()
@@ -128,9 +174,9 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4)
     print(f"NNZ of A                               : {A.nnz}")
     print(f"NNZ of output                          : {output.nnz}")
     print(f"Sparsity of output (%)                 : {sparsity_output}")
-    print(f"Total number of hops                   : {ntimes}")
+    print(f"Total number of hops                   : {k}")
     print(f"Elapsed time for copy in init (ms)     : {elapsed_time_init_copy}")
-    for hop in range(ntimes):
+    for hop in range(k):
         print(
             f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}"
         )
@@ -168,10 +214,10 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4)
     )
 
     parser.add_argument(
-        "--ntimes",
+        "--k",
         type=int,
         default=4,
-        dest="ntimes",
+        dest="k",
         help="Number of times A @ A is performed",
     )
 
@@ -203,8 +249,7 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4)
     nnz_total = get_arg_number(args.nnz_total)
 
     # this is a global variable
-    global random_seed
-    global rng
+    global random_seed, rng
     random_seed = args.random_seed
 
     if args.same_sparsity_for_cpu_and_gpu:
@@ -230,6 +275,6 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4)
         print("Matrix created with number of nonzeros per row")
     elapsed_time_matrix_gen = timer.stop()
 
-    compute_matrix_multiply_ntimes(A, timer, int(args.nwarmups), int(args.ntimes))
+    compute_A_power_k(A, timer, int(args.nwarmups), int(args.k))
 
     print(f"Elapsed time in matrix creation (ms)   : {elapsed_time_matrix_gen}")
diff --git a/examples/pde.py b/examples/pde.py
index 2313f799..d9ca0095 100644
--- a/examples/pde.py
+++ b/examples/pde.py
@@ -12,6 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Partial Differential Equation (PDE) Solver Microbenchmark.
+
+This script benchmarks the solution of 2D Poisson equations using sparse
+linear algebra operations. It implements a finite difference discretization
+with Dirichlet boundary conditions and solves the resulting linear system
+using conjugate gradient iteration. It supports:
+
+- 2D Poisson equation with analytical right-hand side
+- Configurable mesh resolution (nx, ny grid points)
+- Performance measurement of linear solver iterations
+- Throughput mode for measuring solve performance only
+- Convergence analysis with relative residual norms
+- Multiple backend support (Legate, CuPy, SciPy)
+
+Command line arguments:
+--nx: Number of grid points along X axis
+--ny: Number of grid points along Y axis
+--plot: Enable residual plotting
+--plot_filename: Filename for plot output
+--throughput: Measure only solve iterations (requires max_iters)
+--tol: Convergence tolerance for linear solver
+--max-iters: Maximum number of linear solver iterations
+--warmup-iters: Number of warmup iterations (for throughput mode)
+--package: Backend to use (legate, cupy, scipy)
+"""
+
 # This PDE solving application is derived from
 # https://aquaulb.github.io/book_solving_pde_mooc/solving_pde_mooc/notebooks/05_IterativeMethods/05_01_Iteration_and_2D.html.
 
@@ -184,7 +210,6 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
         _ = A.dot(np.ones((A.shape[1],)))
 
         if throughput:
-            assert max_iters > warmup_iters
             p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=warmup_iters)
             max_iters = max_iters - warmup_iters
             print(f"max_iters has been updated to: {max_iters}")
@@ -192,7 +217,10 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
         timer.start()
         # If we're testing throughput, run only the prescribed number of iterations.
         if throughput:
-            p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters)
+            if use_legate:
+                p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters)
+            else:
+                p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters)
         else:
             p_sol, iters = linalg.cg(A, bflat, rtol=tol)
         total = timer.stop()
@@ -200,9 +228,10 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
         print(f"Mesh resolution                     : ({nx}, {ny})")
         print(f"Dimension of A                      : {A.shape}")
         print(f"Number of rows in A                 : {A.shape[0]}")
+        print(f"Total elapsed time (ms)             : {total}")
 
         if throughput:
-            print(f"Total elapsed time (ms)             : {total}")
+            print(f"Number of warmup iterations         : {warmup_iters}")
             print(f"Max number of iterations            : {max_iters}")
             print(f"Time per (max-)iteration (ms)       : {total / max_iters}")
 
@@ -215,9 +244,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
             convergence_status = True if norm_res <= norm_ini * tol else False
             print(f"Did the solution converge           : {convergence_status}")
             print(f"Final relative residual norm        : {norm_res / norm_ini}")
-            print(f"Number of iterations                : {iters}")
-            print(f"Total elapsed time (ms)             : {total}")
-            print(f"Time per iteration (ms)             : {total / iters}")
+            if iters > 0:
+                print(f"Number of iterations                : {iters}")
+                print(f"Time per iteration (ms)             : {total / iters}")
 
 
 if __name__ == "__main__":
@@ -294,8 +323,8 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
     args, _ = parser.parse_known_args()
     _, timer, np, sparse, linalg, use_legate = parse_common_args()
 
-    if args.throughput and args.max_iters is None:
-        print("Must provide --max-iters when using -throughput.")
+    if args.throughput and (args.max_iters is None or args.warmup_iters is None):
+        print("Must provide --max-iters and --warmup-iters when using --throughput.")
         sys.exit(1)
 
     execute(**vars(args), timer=timer)
diff --git a/examples/spgemm_microbenchmark.py b/examples/spgemm_microbenchmark.py
index 3741f400..e30c05dd 100644
--- a/examples/spgemm_microbenchmark.py
+++ b/examples/spgemm_microbenchmark.py
@@ -12,17 +12,80 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Sparse Matrix-Matrix Multiplication Microbenchmark.
+
+This script benchmarks sparse matrix-matrix multiplication performance
+with configurable matrix sizes and generation methods. It supports:
+
+- Banded matrix generation with specified non-zeros per row
+- Loading matrices from Matrix Market files
+- Stable mode for partition caching vs. fresh matrix creation
+- Multiple backend support (Legate, CuPy, SciPy)
+
+Command line arguments:
+--nrows: Matrix size (supports k, m, g suffixes)
+--nnz-per-row: Number of non-zeros per row for banded matrices
+--stable: Enable partition caching by reusing matrices
+--filename1: Load first matrix from Matrix Market file
+--filename2: Load second matrix from Matrix Market file
+--iters: Number of benchmark iterations
+--package: Backend to use (legate, cupy, scipy)
+"""
+
 import argparse
 
 from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args
 
 
 def spgemm_dispatch(A, B):
+    """Dispatch sparse matrix-matrix multiplication operation.
+
+    Parameters
+    ----------
+    A : sparse matrix
+        First sparse matrix operand.
+    B : sparse matrix
+        Second sparse matrix operand.
+
+    Returns
+    -------
+    sparse matrix
+        The result of A @ B.
+
+    Notes
+    -----
+    This function performs sparse matrix-matrix multiplication using
+    the @ operator, which is supported by all backends (Legate, CuPy, SciPy).
+    """
     C = A @ B
     return C
 
 
 def get_matrices(N, nnz_per_row, fname1, fname2):
+    """Get matrices for SpGEMM benchmark.
+
+    Parameters
+    ----------
+    N : int
+        Matrix size (N x N) for generated matrices.
+    nnz_per_row : int
+        Number of non-zeros per row for banded matrices.
+    fname1 : str
+        Filename for first matrix (empty string to generate).
+    fname2 : str
+        Filename for second matrix (empty string to use first matrix).
+
+    Returns
+    -------
+    tuple
+        (A, B) - two sparse matrices for multiplication.
+
+    Notes
+    -----
+    If fname1 is provided, loads matrices from Matrix Market files.
+    If fname2 is empty, uses the same matrix for both A and B.
+    Otherwise, generates banded matrices with specified parameters.
+    """
     if fname1 != "":
         # Read file from matrix
         A = sparse.mmread(fname1)
@@ -38,6 +101,37 @@ def get_matrices(N, nnz_per_row, fname1, fname2):
 
 
 def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer):
+    """Run sparse matrix-matrix multiplication benchmark.
+
+    Parameters
+    ----------
+    N : int
+        Matrix size for generated matrices.
+    nnz_per_row : int
+        Number of non-zeros per row for banded matrices.
+    fname1 : str
+        Filename for first matrix.
+    fname2 : str
+        Filename for second matrix.
+    iters : int
+        Number of benchmark iterations.
+    stable : bool
+        Whether to reuse matrices (allows partition caching).
+    timer : Timer
+        Timer object for measuring performance.
+
+    Notes
+    -----
+    This function runs a benchmark of sparse matrix-matrix multiplication.
+    It supports two modes:
+    - stable=True: Reuses matrices, allowing partition caching
+    - stable=False: Creates fresh matrices each iteration
+
+    The function prints:
+    - Matrix dimensions and non-zero counts
+    - Number of iterations
+    - Total time and time per iteration
+    """
     warmup_iterations = 5
 
     if stable:
diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py
index 559ccbfc..c6f11ff8 100644
--- a/examples/spmv_microbenchmark.py
+++ b/examples/spmv_microbenchmark.py
@@ -12,6 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Sparse Matrix-Vector Multiplication Microbenchmark.
+
+This script benchmarks sparse matrix-vector multiplication performance
+across different matrix sizes and configurations. It supports:
+
+- Matrix size sweeps with configurable min/max sizes
+- Banded matrix generation with specified non-zeros per row
+- Loading matrices from Matrix Market files
+- Optional repartitioning to simulate data updates
+- Multiple backend support (Legate, CuPy, SciPy)
+
+Command line arguments:
+--nmin: Minimum matrix size (supports k, m, g suffixes)
+--nmax: Maximum matrix size (supports k, m, g suffixes)
+--nnz-per-row: Number of non-zeros per row for banded matrices
+--repartition: Enable alternating x/y vectors
+--filename: Load matrix from Matrix Market file
+--iters: Number of benchmark iterations
+--from-diags: Use sparse.diags for matrix construction
+--package: Backend to use (legate, cupy, scipy)
+"""
+
 import argparse
 
 from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args
@@ -19,6 +41,30 @@
 
 # Writing to pre-allocated array is preferred
 def spmv_dispatch(A, x, y, i, repartition):
+    """Dispatch sparse matrix-vector multiplication operation.
+
+    Parameters
+    ----------
+    A : sparse matrix
+        The sparse matrix to multiply with.
+    x : array_like
+        Input vector.
+    y : array_like
+        Output vector (pre-allocated).
+    i : int
+        Iteration index.
+    repartition : bool
+        Whether to alternate between y=A*x and x=A*y.
+
+    Notes
+    -----
+    This function performs sparse matrix-vector multiplication with optional
+    repartitioning. When repartition is True, it alternates between computing
+    y = A*x and x = A*y to simulate data updates.
+
+    For Legate, it uses the dot method with pre-allocated output arrays.
+    For other backends, it uses the @ operator.
+    """
     if use_legate:
         if repartition and i % 2:
             A.dot(y, out=x)
@@ -32,6 +78,30 @@ def spmv_dispatch(A, x, y, i, repartition):
 
 
 def run_spmv(A, iters, repartition, timer):
+    """Run sparse matrix-vector multiplication benchmark.
+
+    Parameters
+    ----------
+    A : sparse matrix
+        The sparse matrix to benchmark.
+    iters : int
+        Number of iterations to run.
+    repartition : bool
+        Whether to use repartitioning (alternate x and y).
+    timer : Timer
+        Timer object for measuring performance.
+
+    Notes
+    -----
+    This function runs a benchmark of sparse matrix-vector multiplication.
+    It includes warm-up runs and measures the total time and time per iteration.
+
+    The function prints:
+    - Matrix dimensions and number of non-zeros
+    - Number of iterations
+    - Total elapsed time
+    - Time per iteration
+    """
     x = np.ones((A.shape[1],))
     y = np.zeros((A.shape[0],))
 
diff --git a/legate_sparse/__init__.py b/legate_sparse/__init__.py
index 8a6a077a..c8f44589 100644
--- a/legate_sparse/__init__.py
+++ b/legate_sparse/__init__.py
@@ -21,6 +21,7 @@
 
 from .coverage import clone_module  # noqa: F401
 from .csr import csr_array, csr_matrix  # noqa: F401
+from .dia import dia_array, dia_matrix  # noqa: F401
 from .module import *  # noqa: F401
 
 clone_module(_sp, globals())
diff --git a/legate_sparse/base.py b/legate_sparse/base.py
index fa46fc8b..c9d99a31 100644
--- a/legate_sparse/base.py
+++ b/legate_sparse/base.py
@@ -62,8 +62,36 @@
 # CompressedBase is a base class for several different kinds of sparse
 # matrices, such as CSR, CSC, COO and DIA.
 class CompressedBase:
+    """Base class for compressed sparse matrix formats.
+
+    This class provides common functionality for compressed sparse matrix
+    formats like CSR, CSC, COO, and DIA. It handles the conversion from
+    non-zero counts to position arrays and provides common operations.
+
+    Notes
+    -----
+    This is an internal base class and should not be instantiated directly.
+    Use specific format classes like csr_array instead.
+    """
+
     @classmethod
     def nnz_to_pos_cls(cls, q_nnz: LogicalStore):
+        """Convert non-zero counts to position arrays.
+
+        This class method converts an array of non-zero counts per row/column
+        into the position array used in compressed sparse formats.
+
+        Parameters
+        ----------
+        q_nnz : LogicalStore
+            Store containing the number of non-zeros per row/column.
+
+        Returns
+        -------
+        tuple
+            (pos, total_nnz) where pos is the position array and total_nnz
+            is the total number of non-zeros.
+        """
         q_nnz_arr = store_to_cupynumeric_array(q_nnz)
         cs = cupynumeric.cumsum(q_nnz_arr)
         cs_shifted = cs - q_nnz_arr
@@ -86,9 +114,43 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore):
         return pos, cs[-1]
 
     def nnz_to_pos(self, q_nnz: LogicalStore):
+        """Convert non-zero counts to position arrays for this instance.
+
+        Parameters
+        ----------
+        q_nnz : LogicalStore
+            Store containing the number of non-zeros per row/column.
+
+        Returns
+        -------
+        tuple
+            (pos, total_nnz) where pos is the position array and total_nnz
+            is the total number of non-zeros.
+        """
         return CompressedBase.nnz_to_pos_cls(q_nnz)
 
     def asformat(self, format, copy=False):
+        """Convert the matrix to a specified format.
+
+        Parameters
+        ----------
+        format : str
+            The desired format ('csr', 'csc', 'coo', etc.).
+        copy : bool, optional
+            Whether to create a copy. Default is False.
+
+        Returns
+        -------
+        sparse matrix
+            Matrix in the requested format.
+
+        Raises
+        ------
+        ValueError
+            If the format is unknown.
+        NotImplementedError
+            If conversion to the requested format is not implemented.
+        """
         if format is None or format == self.format:
             if copy:
                 raise NotImplementedError
@@ -108,35 +170,51 @@ def asformat(self, format, copy=False):
 
     # The implementation of sum is mostly lifted from scipy.sparse.
     def sum(self, axis=None, dtype=None, out=None):
-        """
-        Sum the matrix elements over a given axis.
+        """Sum the matrix elements over a given axis.
+
         Parameters
         ----------
-        axis : {-2, -1, 0, 1, None} optional
+        axis : {-2, -1, 0, 1, None}, optional
             Axis along which the sum is computed. The default is to
             compute the sum of all the matrix elements, returning a scalar
             (i.e., `axis` = `None`).
         dtype : dtype, optional
             The type of the returned matrix and of the accumulator in which
-            the elements are summed.  The dtype of `a` is used by default
+            the elements are summed. The dtype of `a` is used by default
             unless `a` has an integer dtype of less precision than the default
-            platform integer.  In that case, if `a` is signed then the platform
+            platform integer. In that case, if `a` is signed then the platform
             integer is used while if `a` is unsigned then an unsigned integer
             of the same precision as the platform integer is used.
-            .. versionadded:: 0.18.0
-        out : np.matrix, optional
-            Alternative output matrix in which to place the result. It must
+        out : cupynumeric.ndarray, optional
+            Alternative output array in which to place the result. It must
             have the same shape as the expected output, but the type of the
             output values will be cast if necessary.
-            .. versionadded:: 0.18.0
+
         Returns
         -------
-        sum_along_axis : np.matrix
+        sum_along_axis : cupynumeric.ndarray or scalar
             A matrix with the same shape as `self`, with the specified
-            axis removed.
+            axis removed, or a scalar if axis=None.
+
+        Raises
+        ------
+        NotImplementedError
+            If axis=0 (sum over columns) is requested.
+        ValueError
+            If out is provided but has incompatible shape.
+
+        Notes
+        -----
+        The implementation uses multiplication by a matrix of ones to achieve
+        the sum. For some sparse matrix formats more efficient methods are
+        possible and should override this function.
+
+        Currently, summing over columns (axis=0) is not implemented due to
+        the lack of right matrix multiplication support.
+
         See Also
         --------
-        numpy.matrix.sum : NumPy's implementation of 'sum' for matrices
+        cupynumeric.matrix.sum : NumPy's implementation of 'sum' for matrices
         """
 
         # We use multiplication by a matrix of ones to achieve this.
@@ -171,9 +249,27 @@ def sum(self, axis=None, dtype=None, out=None):
 
     # needed by _data_matrix
     def _with_data(self, data, copy=True):
-        """Returns a _different_ matrix object with the same sparsity structure as self,
-        but with different data.  By default the structure arrays
-        (i.e. .indptr and .indices) are copied. 'data' parameter is never copied.
+        """Returns a matrix object with the same sparsity structure as self,
+        but with different data.
+
+        Parameters
+        ----------
+        data : array_like
+            The new data array. This parameter is never copied.
+        copy : bool, optional
+            Whether to copy the structure arrays (indptr and indices).
+            Default is True.
+
+        Returns
+        -------
+        sparse matrix
+            A new matrix with the same sparsity structure but different data.
+
+        Notes
+        -----
+        This method creates a new matrix object with the same sparsity pattern
+        but replaces the data array. The structure arrays (indptr and indices)
+        are copied by default to avoid modifying the original matrix.
         """
 
         # For CSR and CSC compressed base we can just reuse compressed stores,
@@ -253,12 +349,42 @@ def method(self):
 # format of {Dense, Sparse}. For our purposes, that means CSC and CSR
 # matrices.
 class DenseSparseBase:
+    """Base class for sparse matrices with dense-sparse format.
+
+    This class provides functionality for sparse matrices that have a TACO
+    format of {Dense, Sparse}, which includes CSR and CSC matrices.
+
+    Notes
+    -----
+    This is an internal base class and should not be instantiated directly.
+    Use specific format classes like csr_array instead.
+    """
+
     def __init__(self):
+        """Initialize the DenseSparseBase class."""
         self._balanced_pos_partition = None
 
     # consider using _with_data() here
     @classmethod
     def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None):
+        """Create a new matrix with the same non-zero structure as mat.
+
+        Parameters
+        ----------
+        mat : sparse matrix
+            The reference matrix whose structure to copy.
+        arg : array_like
+            The data for the new matrix.
+        shape : tuple, optional
+            The shape of the new matrix. If None, uses mat.shape.
+        dtype : dtype, optional
+            The data type of the new matrix. If None, uses mat.dtype.
+
+        Returns
+        -------
+        sparse matrix
+            A new matrix with the same structure as mat but with data from arg.
+        """
         if shape is None:
             shape = mat.shape
         if dtype is None:
@@ -269,6 +395,21 @@ def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None):
 
 # unpack_rect1_store unpacks a rect1 store into two int64 stores.
 def unpack_rect1_store(pos):
+    """Unpack a rect1 store into two int64 stores.
+
+    This function unpacks the compressed position array used in CSR/CSC
+    formats into separate start and end position arrays.
+
+    Parameters
+    ----------
+    pos : LogicalStore
+        The rect1 store containing packed position information.
+
+    Returns
+    -------
+    tuple
+        (lo, hi) where lo contains start positions and hi contains end positions.
+    """
     out1 = runtime.create_store(int64, shape=pos.shape)
     out2 = runtime.create_store(int64, shape=pos.shape)
     task = runtime.create_auto_task(SparseOpCode.UNZIP_RECT1)
@@ -283,6 +424,25 @@ def unpack_rect1_store(pos):
 
 # pack_to_rect1_store packs two int64 stores into a rect1 store.
 def pack_to_rect1_store(lo, hi, output=None):
+    """Pack two int64 stores into a rect1 store.
+
+    This function packs separate start and end position arrays into the
+    compressed rect1 format used in CSR/CSC formats.
+
+    Parameters
+    ----------
+    lo : LogicalStore
+        Store containing start positions.
+    hi : LogicalStore
+        Store containing end positions.
+    output : LogicalStore, optional
+        Output store for the packed result. If None, creates a new store.
+
+    Returns
+    -------
+    LogicalStore
+        The packed rect1 store.
+    """
     if output is None:
         output = runtime.create_store(rect1, shape=(lo.shape[0],))
     task = runtime.create_auto_task(SparseOpCode.ZIP_TO_RECT1)
diff --git a/legate_sparse/config.py b/legate_sparse/config.py
index 3c9a3780..8c601981 100644
--- a/legate_sparse/config.py
+++ b/legate_sparse/config.py
@@ -23,6 +23,12 @@
 
 
 class _LegateSparseSharedLib:
+    """Internal class representing the shared library interface.
+
+    This class defines the interface to the C++ shared library that
+    implements the core sparse matrix operations.
+    """
+
     LEGATE_SPARSE_DENSE_TO_CSR: int
     LEGATE_SPARSE_DENSE_TO_CSR_NNZ: int
     LEGATE_SPARSE_ZIP_TO_RECT_1: int
@@ -46,6 +52,26 @@ class _LegateSparseSharedLib:
 
 
 def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any:
+    """Load a shared library without automatic closing.
+
+    Parameters
+    ----------
+    ffi : Any
+        The CFFI interface object.
+    lib_path : str
+        Path to the shared library to load.
+
+    Returns
+    -------
+    Any
+        The loaded library object.
+
+    Notes
+    -----
+    This function loads a shared library using CDLL and converts it to
+    a CFFI object without automatic closing. This prevents issues with
+    symbol cleanup during shutdown.
+    """
     # Use an already-opened library handle, which cffi will convert to a
     # regular FFI object (using the definitions previously added using
     # ffi.cdef), but will not automatically dlclose() on collection.
@@ -55,8 +81,21 @@ def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any:
 
 # Load the LegateSparse library first so we have a shard object that
 # we can use to initialize all these configuration enumerations
-class LegateSparseLib(Library):
+class LegateSparseLib:
+    """Legate sparse matrix library loader.
+
+    This class handles loading and registering the Legate sparse matrix
+    library with the Legate runtime.
+    """
+
     def __init__(self, name):
+        """Initialize the Legate sparse library.
+
+        Parameters
+        ----------
+        name : str
+            The name of the library to load.
+        """
         self.name = name
         self.runtime = None
         self.shared_object = None
@@ -78,24 +117,58 @@ def __init__(self, name):
         self.shared_object = cast(_LegateSparseSharedLib, shared_lib)
 
     def register(self) -> None:
+        """Register the library with the Legate runtime."""
         callback = getattr(self.shared_object, "legate_sparse_perform_registration")
         callback()
 
     def get_shared_library(self) -> str:
+        """Get the path to the shared library.
+
+        Returns
+        -------
+        str
+            The full path to the shared library file.
+        """
         from legate_sparse.install_info import libpath
 
         return os.path.join(libpath, "liblegate_sparse" + self.get_library_extension())
 
     def get_legate_library(self) -> Library:
+        """Get the Legate library object.
+
+        Returns
+        -------
+        Library
+            The Legate library object.
+        """
         return get_legate_runtime().find_library(self.name)
 
     def get_c_header(self) -> str:
+        """Get the C header for the library.
+
+        Returns
+        -------
+        str
+            The C header content.
+        """
         from legate_sparse.install_info import header
 
         return header
 
     @staticmethod
     def get_library_extension() -> str:
+        """Get the appropriate library extension for the current platform.
+
+        Returns
+        -------
+        str
+            The library extension ('.so' for Linux, '.dylib' for macOS).
+
+        Raises
+        ------
+        RuntimeError
+            If the platform is not supported.
+        """
         os_name = platform.system()
         if os_name == "Linux":
             return ".so"
@@ -105,6 +178,8 @@ def get_library_extension() -> str:
 
 
 SPARSE_LIB_NAME = "legate.sparse"
+"""Name of the Legate sparse library."""
+
 sparse_lib = LegateSparseLib(SPARSE_LIB_NAME)
 sparse_lib.register()
 _sparse = sparse_lib.shared_object
@@ -115,6 +190,13 @@ def get_library_extension() -> str:
 # Match these to entries in sparse_c.h
 @unique
 class SparseOpCode(IntEnum):
+    """Enumeration of sparse matrix operation codes.
+
+    These codes correspond to the operations implemented in the C++
+    shared library and are used to dispatch tasks to the appropriate
+    kernels.
+    """
+
     LOAD_CUDALIBS = _sparse.LEGATE_SPARSE_LOAD_CUDALIBS
     UNLOAD_CUDALIBS = _sparse.LEGATE_SPARSE_UNLOAD_CUDALIBS
 
@@ -146,3 +228,4 @@ class SparseOpCode(IntEnum):
 
 # Register some types for us to use.
 rect1 = types.rect_type(1)
+"""1-dimensional rectangle type used for compressed storage formats."""
diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py
index 6b1d69a4..3008356e 100644
--- a/legate_sparse/csr.py
+++ b/legate_sparse/csr.py
@@ -91,7 +91,140 @@
 
 @clone_scipy_arr_kind(scipy.sparse.csr_array)
 class csr_array(CompressedBase, DenseSparseBase):
+    """Compressed Sparse Row array.
+
+    This can be instantiated in several ways:
+        csr_array(D)
+            where D is a 2-D ndarray or cupynumeric.ndarray
+
+        csr_array(S)
+            with another sparse array or matrix S (equivalent to S.tocsr())
+
+        csr_array((M, N), [dtype])
+            to construct an empty array with shape (M, N)
+            dtype is optional, defaulting to dtype='d'.
+
+        csr_array((data, (row_ind, col_ind)), [shape=(M, N)])
+            where ``data``, ``row_ind`` and ``col_ind`` satisfy the
+            relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
+
+        csr_array((data, indices, indptr), [shape=(M, N)])
+            is the standard CSR representation where the column indices for
+            row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
+            corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
+            If the shape parameter is not supplied, the array dimensions
+            are inferred from the index arrays.
+
+    Attributes
+    ----------
+    dtype : dtype
+        Data type of the array
+    shape : 2-tuple
+        Shape of the array
+    ndim : int
+        Number of dimensions (this is always 2)
+    nnz : int
+        Number of stored values, including explicit zeros
+    data : cupynumeric.ndarray
+        CSR format data array of the array
+    indices : cupynumeric.ndarray
+        CSR format index array of the array
+    indptr : cupynumeric.ndarray
+        CSR format index pointer array of the array
+    has_sorted_indices : bool
+        Whether the indices are sorted
+    has_canonical_format : bool
+        Whether the matrix is in canonical format
+    T : csr_array
+        Transpose of the matrix
+
+    Notes
+    -----
+    Sparse arrays can be used in arithmetic operations: they support
+    addition, subtraction, multiplication, division, and matrix power.
+
+    Advantages of the CSR format:
+        - fast matrix vector products
+
+    Disadvantages of the CSR format:
+        - changes to the sparsity structure are expensive (consider LIL or DOK)
+
+    Canonical Format:
+        - Within each row, indices are sorted by column.
+        - There are no duplicate entries.
+
+    Differences from SciPy:
+        - Uses cupynumeric arrays instead of numpy arrays
+        - GPU acceleration via cuSPARSE when available
+        - Limited to supported datatypes on GPU: float32, float64, complex64, complex128
+        - Some operations may create implicit copies due to transformed arrays
+        - Element-wise operations with scalars only operate on existing non-zero elements
+        - Indexing with boolean masks only updates existing non-zero elements
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import csr_array
+    >>> csr_array((3, 4), dtype=np.int8).todense()
+    array([[0, 0, 0, 0],
+           [0, 0, 0, 0],
+           [0, 0, 0, 0]], dtype=int8)
+
+    >>> row = np.array([0, 0, 1, 2, 2, 2])
+    >>> col = np.array([0, 2, 2, 0, 1, 2])
+    >>> data = np.array([1, 2, 3, 4, 5, 6])
+    >>> csr_array((data, (row, col)), shape=(3, 3)).todense()
+    array([[1, 0, 2],
+           [0, 0, 3],
+           [4, 5, 6]])
+
+    >>> indptr = np.array([0, 2, 3, 6])
+    >>> indices = np.array([0, 2, 2, 0, 1, 2])
+    >>> data = np.array([1, 2, 3, 4, 5, 6])
+    >>> csr_array((data, indices, indptr), shape=(3, 3)).todense()
+    array([[1, 0, 2],
+           [0, 0, 3],
+           [4, 5, 6]])
+    """
+
     def __init__(self, arg, shape=None, dtype=None, copy=False):
+        """Initialize a CSR array.
+
+        Parameters
+        ----------
+        arg : array_like, tuple, or csr_array
+            The input data. Can be:
+            - A 2-D dense array (numpy.ndarray or cupynumeric.ndarray)
+            - A sparse array/matrix to convert to CSR format
+            - A tuple (M, N) for an empty array of shape (M, N)
+            - A tuple (data, (row_ind, col_ind)) for COO format data
+            - A tuple (data, indices, indptr) for CSR format data
+        shape : tuple, optional
+            Shape of the array (M, N). Required if not inferrable from input.
+        dtype : dtype, optional
+            Data type of the array. If None, inferred from input data.
+            Defaults to float64 if not specified.
+        copy : bool, optional
+            Whether to copy the input data. Default is False.
+
+        Raises
+        ------
+        NotImplementedError
+            If the input type is not supported for conversion to CSR.
+        AssertionError
+            If shape cannot be inferred and is not provided.
+        ValueError
+            If input data is inconsistent or invalid.
+
+        Notes
+        -----
+        When converting from dense arrays, the implementation uses a two-pass
+        algorithm that first counts non-zeros per row, then fills them in.
+        This may not scale well on distributed systems due to alignment constraints.
+
+        When converting from COO format, the data is automatically sorted by
+        rows and then by columns to ensure canonical format.
+        """
         self.ndim = 2
         self.indices_sorted = False
         self.canonical_format = False
@@ -189,7 +322,39 @@ def __init__(self, arg, shape=None, dtype=None, copy=False):
         self._dtype = dtype
 
     def _init_from_tuple_inputs(self, arg, dtype, shape, copy):
+        """Initialize CSR array from tuple inputs.
+
+        This internal method handles the various tuple-based constructor formats:
+        - (M, N) for empty arrays
+        - (data, (row_ind, col_ind)) for COO format
+        - (data, indices, indptr) for CSR format
+
+        Parameters
+        ----------
+        arg : tuple
+            The input tuple in one of the supported formats.
+        dtype : dtype, optional
+            The desired data type.
+        shape : tuple, optional
+            The shape of the array.
+        copy : bool
+            Whether to copy the input data.
+
+        Returns
+        -------
+        tuple
+            (dtype, shape) for the constructed array.
+
+        Raises
+        ------
+        AssertionError
+            If shape cannot be inferred or input is invalid.
+        NotImplementedError
+            If the tuple format is not supported.
+        """
+
         def _get_empty_csr(dtype, nrows_plus_one):
+            """Helper function to create empty CSR arrays."""
             return (
                 cupynumeric.zeros(0, dtype=dtype),
                 cupynumeric.zeros(0, dtype=coord_ty),
@@ -326,36 +491,96 @@ def _get_empty_csr(dtype, nrows_plus_one):
 
     @property
     def dim(self):
+        """Number of dimensions (always 2 for CSR arrays)."""
         return self.ndim
 
     @property
     def nnz(self):
+        """Number of stored values, including explicit zeros.
+
+        Returns
+        -------
+        int
+            The number of non-zero elements in the matrix.
+        """
         return self.vals.shape[0]
 
     @property
     def dtype(self):
+        """Data type of the array.
+
+        Returns
+        -------
+        dtype
+            The data type of the array elements.
+        """
         # We can just return self.vals.type, but bookkeep type separately now
         return self._dtype
 
     # Enable direct operation on the values array.
     def get_data(self):
+        """Get the data array of the CSR matrix.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            The data array containing the non-zero values.
+        """
         return store_to_cupynumeric_array(self.vals)
 
     # From array,
     def set_data(self, data):
+        """Set the data array of the CSR matrix.
+
+        Parameters
+        ----------
+        data : cupynumeric.ndarray
+            The new data array. Must have the same length as the current data array.
+
+        Raises
+        ------
+        AssertionError
+            If data is not a cupynumeric.ndarray.
+        """
         if isinstance(data, numpy.ndarray):
             data = cupynumeric.array(data)
         assert isinstance(data, cupynumeric.ndarray)
         self.vals = get_store_from_cupynumeric_array(data)
         self._dtype = data.dtype
 
-    data = property(fget=get_data, fset=set_data)
+    data = property(
+        fget=get_data, fset=set_data, doc="CSR format data array of the matrix"
+    )
 
     # Enable direct operation on the indices array.
     def get_indices(self):
+        """Get the column indices array of the CSR matrix.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            The column indices array.
+        """
         return store_to_cupynumeric_array(self.crd)
 
     def set_indices(self, indices):
+        """Set the column indices array of the CSR matrix.
+
+        Parameters
+        ----------
+        indices : cupynumeric.ndarray
+            The new column indices array. Must have the same length as the current indices array.
+
+        Raises
+        ------
+        AssertionError
+            If indices is not a cupynumeric.ndarray.
+
+        Notes
+        -----
+        Setting new indices will mark the matrix as not having sorted indices
+        and not being in canonical format.
+        """
         if isinstance(indices, numpy.ndarray):
             indices = cupynumeric.array(indices)
         assert isinstance(indices, cupynumeric.ndarray)
@@ -364,22 +589,46 @@ def set_indices(self, indices):
         self.canonical_format = False
         self.indices_sorted = False
 
-    indices = property(fget=get_indices, fset=set_indices)
+    indices = property(
+        fget=get_indices, fset=set_indices, doc="CSR format index array of the matrix"
+    )
 
     def get_indptr(self):
+        """Get the index pointer array of the CSR matrix.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            The index pointer array. For row i, the column indices are stored in
+            indices[indptr[i]:indptr[i+1]] and their corresponding values are
+            stored in data[indptr[i]:indptr[i+1]].
+        """
         row_start_st, row_end_st = unpack_rect1_store(self.pos)
         row_start = store_to_cupynumeric_array(row_start_st)
         return cupynumeric.append(row_start, [self.nnz])
 
     # Disallow changing intptrs directly
-    indptr = property(fget=get_indptr)
+    indptr = property(
+        fget=get_indptr, doc="CSR format index pointer array of the matrix"
+    )
 
     def _get_row_indices(self):
-        """Helper routine that converts pos to row indices"""
+        """Helper routine that converts pos to row indices.
+
+        This internal method expands the compressed row storage format's position
+        array into explicit row indices for each non-zero element.
 
-        # TODO: Add an option that caches the row_indices so that other binary
-        # operations don't have to recompute it.
+        Returns
+        -------
+        cupynumeric.ndarray
+            Array of row indices corresponding to each non-zero element.
 
+        Notes
+        -----
+        This method is used internally by comparison operations and other
+        methods that need explicit row indices. The result could be cached
+        for performance, but currently is recomputed each time.
+        """
         row_indices = runtime.create_store(coord_ty, shape=self.crd.shape)
         task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES)
         src_part = task.add_input(self.pos)
@@ -390,13 +639,56 @@ def _get_row_indices(self):
         return store_to_cupynumeric_array(row_indices)
 
     def has_sorted_indices(self):
+        """Determine whether the matrix has sorted indices.
+
+        Returns
+        -------
+        bool
+            True if the indices are sorted, False otherwise.
+        """
         return self.indices_sorted
 
     def has_canonical_format(self):
+        """Determine whether the matrix is in canonical format.
+
+        Returns
+        -------
+        bool
+            True if the matrix is in canonical format, False otherwise.
+
+        Notes
+        -----
+        A matrix is in canonical format if:
+        - Within each row, indices are sorted by column
+        - There are no duplicate entries
+        """
         return self.canonical_format
 
     # The rest of the methods
     def diagonal(self, k=0):
+        """Return the k-th diagonal of the matrix.
+
+        Parameters
+        ----------
+        k : int, optional
+            Which diagonal to retrieve. Default is 0 (main diagonal).
+            k > 0 for upper diagonals, k < 0 for lower diagonals.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            The k-th diagonal of the matrix.
+
+        Raises
+        ------
+        NotImplementedError
+            If k != 0 (only main diagonal is currently supported).
+
+        Notes
+        -----
+        Currently only supports k=0 (main diagonal). Other diagonals
+        are not implemented.
+        """
         rows, cols = self.shape
         if k <= -rows or k >= cols:
             return cupynumeric.empty(0, dtype=self.dtype)
@@ -422,6 +714,33 @@ def diagonal(self, k=0):
         return store_to_cupynumeric_array(output)
 
     def todense(self, order=None, out=None):
+        """Return a dense matrix representation of this matrix.
+
+        Parameters
+        ----------
+        order : str, optional
+            Not supported. Must be None.
+        out : cupynumeric.ndarray, optional
+            Output array for the result. Must have the same shape and dtype
+            as the expected output.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            A dense matrix with the same shape and dtype as this matrix.
+
+        Raises
+        ------
+        NotImplementedError
+            If order is not None.
+        ValueError
+            If out is provided but has incompatible dtype or shape.
+
+        Notes
+        -----
+        The order parameter is not supported and must be None.
+        If out is provided, it must have the correct shape and dtype.
+        """
         if order is not None:
             raise NotImplementedError
         if out is not None:
@@ -444,13 +763,63 @@ def todense(self, order=None, out=None):
         return store_to_cupynumeric_array(out)
 
     def multiply(self, other):
+        """Point-wise multiplication by another matrix, vector, or scalar.
+
+        Parameters
+        ----------
+        other : csr_array, cupynumeric.ndarray, or scalar
+            The object to multiply with.
+
+        Returns
+        -------
+        csr_array or cupynumeric.ndarray
+            The result of the multiplication.
+
+        Notes
+        -----
+        This is equivalent to the * operator.
+        """
         return self * other
 
     def __rmul__(self, other):
+        """Right multiplication by a scalar.
+
+        Parameters
+        ----------
+        other : scalar
+            The scalar to multiply with.
+
+        Returns
+        -------
+        csr_array
+            The result of the multiplication.
+        """
         return self * other
 
     # This is an element-wise operation now.
     def __mul__(self, other):
+        """Element-wise multiplication.
+
+        Parameters
+        ----------
+        other : scalar or array_like
+            The object to multiply with.
+
+        Returns
+        -------
+        csr_array
+            The result of the multiplication.
+
+        Raises
+        ------
+        NotImplementedError
+            If other is not a scalar.
+
+        Notes
+        -----
+        Currently only supports scalar multiplication. Array multiplication
+        is not implemented.
+        """
         if isinstance(other, numpy.ndarray):
             other = cupynumeric.array(other)
 
@@ -464,10 +833,48 @@ def __mul__(self, other):
 
     # rmatmul represents the operation other @ self.
     def __rmatmul__(self, other):
+        """Right matrix multiplication (other @ self).
+
+        Parameters
+        ----------
+        other : array_like
+            The left operand for matrix multiplication.
+
+        Returns
+        -------
+        cupynumeric.ndarray or csr_array
+            The result of the matrix multiplication.
+
+        Raises
+        ------
+        NotImplementedError
+            Currently not implemented.
+
+        Notes
+        -----
+        This method handles the case where a dense matrix is multiplied
+        with a CSR matrix from the left. Currently not implemented.
+        """
         # Handle dense @ CSR
         raise NotImplementedError
 
     def __matmul__(self, other):
+        """Matrix multiplication (self @ other).
+
+        Parameters
+        ----------
+        other : array_like or csr_array
+            The right operand for matrix multiplication.
+
+        Returns
+        -------
+        cupynumeric.ndarray or csr_array
+            The result of the matrix multiplication.
+
+        Notes
+        -----
+        This is equivalent to the dot method.
+        """
         return self.dot(other)
 
     def _compare_scalar(self, other, op):
@@ -742,15 +1149,54 @@ def dot(self, other, out=None):
 
         Parameters
         ----------
-        other : array_like
-            The object to compute dot product with
-        out : ndarray, optional
-            Output array for the result
+        other : array_like or csr_array
+            The object to compute dot product with. Can be:
+            - A dense vector (1-D array) for sparse matrix-vector multiplication (SpMV)
+            - A dense matrix (2-D array) for sparse matrix-matrix multiplication (SpMM)
+            - A CSR matrix for sparse matrix-sparse matrix multiplication (SpGEMM)
+        out : cupynumeric.ndarray, optional
+            Output array for the result. Only supported for SpMV operations.
+            Must have the correct shape and dtype.
 
         Returns
         -------
-        output : csr_array or cupynumeric.ndarray
-            Sparse matrix or dense array depending on input
+        cupynumeric.ndarray or csr_array
+            The result of the dot product:
+            - For SpMV: dense vector
+            - For SpMM: dense matrix
+            - For SpGEMM: CSR matrix
+
+        Raises
+        ------
+        NotImplementedError
+            If the operation is not supported or datatypes are not supported on GPU.
+        ValueError
+            If out is provided for SpGEMM operations or has incompatible dtype/shape.
+        RuntimeWarning
+            If an implicit copy is created due to transformed input arrays.
+
+        Notes
+        -----
+        Supported operations:
+        - SpMV (sparse matrix-vector): A @ x where x is a dense vector
+        - SpGEMM (sparse-sparse): A @ B where B is a CSR matrix
+
+        GPU limitations:
+        - Only floating point datatypes are supported: float32, float64, complex64, complex128
+        - Some operations may create implicit copies due to transformed arrays
+
+        The implementation automatically chooses the appropriate algorithm:
+        - For vectors: uses cuSPARSE SpMV when available
+        - For CSR matrices: uses cuSPARSE SpGEMM on GPU, custom implementation on CPU
+
+        Examples
+        --------
+        >>> import cupynumeric as np
+        >>> from legate_sparse import csr_array
+        >>> A = csr_array([[1, 2, 0], [0, 0, 3], [4, 0, 5]])
+        >>> v = np.array([1, 0, -1])
+        >>> A.dot(v)
+        array([ 1, -3, -1])
         """
         # If output specified - it should be cupynumeric array
         if out is not None:
@@ -840,7 +1286,9 @@ def _getpos(self):
         Returns
         -------
         list of tuple
-            List of (start, end) position tuples for each row in the matrix
+            List of (start, end) position tuples for each row in the matrix.
+            For row i, the non-zero elements are stored in positions
+            [start, end) in the data and indices arrays.
         """
         row_start_st, row_end_st = unpack_rect1_store(self.pos)
         row_start = store_to_cupynumeric_array(row_start_st)
@@ -853,7 +1301,7 @@ def copy(self):
         Returns
         -------
         csr_array
-            A copy of the matrix
+            A copy of the matrix with the same data and structure.
         """
         return csr_array(self, dtype=self.dtype)
 
@@ -863,12 +1311,17 @@ def conj(self, copy=True):
         Parameters
         ----------
         copy : bool, optional
-            Whether to create a new matrix or modify in-place
+            Whether to create a new matrix or modify in-place. Default is True.
 
         Returns
         -------
         csr_array
-            The conjugate matrix
+            The conjugate matrix.
+
+        Notes
+        -----
+        If copy=True, returns a new matrix. If copy=False, modifies the
+        current matrix in-place.
         """
         if copy:
             return self.copy().conj(copy=False)
@@ -882,14 +1335,25 @@ def transpose(self, axes=None, copy=False):
         Parameters
         ----------
         axes : None, optional
-            This argument is not supported
+            This argument is not supported and must be None.
         copy : bool, optional
-            Whether to create a copy (ignored - CSR transpose always creates copy)
+            Whether to create a copy. Ignored - CSR transpose always creates a copy.
 
         Returns
         -------
         csr_array
-            Transposed matrix
+            Transposed matrix with shape (N, M) where the original shape was (M, N).
+
+        Raises
+        ------
+        AssertionError
+            If axes is not None.
+
+        Notes
+        -----
+        The axes parameter is not supported and must be None.
+        CSR transpose always creates a copy due to the format conversion.
+        The implementation sorts the data by columns to maintain canonical format.
         """
         if axes is not None:
             raise AssertionError("axes parameter should be None")
@@ -922,22 +1386,31 @@ def transpose(self, axes=None, copy=False):
             copy=False,
         )
 
-    T = property(transpose)
+    T = property(transpose, doc="Transpose of the matrix")
 
-    def asformat(seld, format, copy=False):
+    def asformat(self, format, copy=False):
         """Convert this matrix to a specified format.
 
         Parameters
         ----------
         format : str
-            Desired sparse format ('csr' only)
+            Desired sparse format. Currently only 'csr' is supported.
         copy : bool, optional
-            Whether to create a copy
+            Whether to create a copy. Default is False.
 
         Returns
         -------
         csr_array
-            Matrix in the requested format
+            Matrix in the requested format.
+
+        Raises
+        ------
+        NotImplementedError
+            If format is not 'csr'.
+
+        Notes
+        -----
+        Currently only CSR format is supported. Other formats are not implemented.
         """
         if format == "csr":
             return self.copy() if copy else self
@@ -950,12 +1423,17 @@ def tocsr(self, copy=False):
         Parameters
         ----------
         copy : bool, optional
-            Whether to create a copy
+            Whether to create a copy. Default is False.
 
         Returns
         -------
         csr_array
-            The converted CSR matrix
+            The converted CSR matrix.
+
+        Notes
+        -----
+        Since this matrix is already in CSR format, this method simply
+        returns a copy if requested, or the matrix itself otherwise.
         """
         if copy:
             return self.copy().tocsr(copy=False)
@@ -967,7 +1445,13 @@ def nonzero(self):
         Returns
         -------
         (row, col) : tuple of cupynumeric.ndarrays
-            Row and column indices of non-zeros
+            Row and column indices of non-zeros. Only returns indices
+            where the values are actually non-zero (not just stored).
+
+        Notes
+        -----
+        This method filters out explicit zeros that may be stored in the
+        sparse matrix structure.
         """
         task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES)
 
@@ -986,23 +1470,30 @@ def nonzero(self):
 
 
 csr_matrix = csr_array
+"""Alias for csr_array for backward compatibility with SciPy naming conventions."""
 
 
 # spmv computes y = A @ x.
 def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray):
+    """Perform sparse matrix vector product y = A @ x.
+
+    Parameters
+    ----------
+    A : csr_array
+        Input sparse matrix of shape (M, N).
+    x : cupynumeric.ndarray
+        Dense vector of shape (N,) for the dot product.
+    y : cupynumeric.ndarray
+        Output array of shape (M,) to store the result.
+
+    Notes
+    -----
+    This function computes the sparse matrix-vector multiplication y = A @ x.
+    The implementation uses an auto-parallelized kernel that distributes
+    the computation across available processors.
+
+    The function modifies y in-place to store the result.
     """
-    Perform sparse matrix vector product y = A @ x
-
-    Parameters:
-    -----------
-    A: csr_array
-        Input sparse matrix
-    x: cupynumeric.ndarray
-        Dense vector for the dot product
-    y: cupynumeric.ndarray
-        Output array
-    """
-
     x_store = get_store_from_cupynumeric_array(x)
     y_store = get_store_from_cupynumeric_array(y)
 
@@ -1026,20 +1517,34 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray):
 # spgemm_csr_csr_csr computes C = A @ B when A and B and
 # both csr matrices, and returns the result C as a csr matrix.
 def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array:
-    """
-    Perform sparse matrix multiplication C = A @ B
+    """Perform sparse matrix multiplication C = A @ B.
 
-    Parameters:
-    -----------
-    A: csr_array
-        Input sparse matrix A
-    B: csr_array
-        Input sparse matrix B
+    Parameters
+    ----------
+    A : csr_array
+        Input sparse matrix A of shape (M, K).
+    B : csr_array
+        Input sparse matrix B of shape (K, N).
 
-    Returns:
-    --------
+    Returns
+    -------
     csr_array
-        The result of the sparse matrix multiplication
+        The result of the sparse matrix multiplication with shape (M, N).
+
+    Notes
+    -----
+    This function computes the sparse matrix-sparse matrix multiplication C = A @ B.
+
+    The implementation differs based on the available hardware:
+    - On GPU: Uses cuSPARSE SpGEMM with local CSR matrices that are aggregated
+    - On CPU: Uses a custom implementation with two-pass algorithm
+
+    The GPU implementation creates a set of local CSR matrices that are
+    aggregated into a global CSR matrix. The CPU implementation uses a
+    query phase to determine the number of non-zeros per row, followed
+    by the actual computation phase.
+
+    Both implementations maintain the CSR format throughout the computation.
     """
     # Due to limitations in cuSPARSE, we cannot use a uniform task
     # implementation for CSRxCSRxCSR SpGEMM across CPUs, OMPs and GPUs.
diff --git a/legate_sparse/dia.py b/legate_sparse/dia.py
index 0dd93735..20f2dc5c 100644
--- a/legate_sparse/dia.py
+++ b/legate_sparse/dia.py
@@ -63,7 +63,107 @@
 # Temporary implementation for matrix generation in examples
 @clone_scipy_arr_kind(scipy.sparse.dia_array)
 class dia_array(CompressedBase):
+    """Sparse matrix with DIAgonal storage.
+
+    This can be instantiated in several ways:
+        dia_array(D)
+            where D is a 2-D ndarray or cupynumeric.ndarray
+
+        dia_array((data, offsets), shape=(M, N))
+            where data is a 2-D array and offsets is a 1-D array of diagonal offsets
+
+        dia_array((data, offset), shape=(M, N))
+            where data is a 1-D array and offset is a single integer
+
+    Attributes
+    ----------
+    dtype : dtype
+        Data type of the array
+    shape : 2-tuple
+        Shape of the array
+    ndim : int
+        Number of dimensions (this is always 2)
+    nnz : int
+        Number of stored values, including explicit zeros
+    data : cupynumeric.ndarray
+        DIA format data array of the array
+    offsets : cupynumeric.ndarray
+        DIA format offset array of the array
+    T : dia_array
+        Transpose of the matrix
+
+    Notes
+    -----
+    The DIA (Diagonal) format stores a sparse matrix by diagonals.
+    The data array has shape (n_diagonals, max_diagonal_length) where
+    each row represents a diagonal. The offsets array contains the
+    diagonal offsets (k > 0 for upper diagonals, k < 0 for lower diagonals).
+
+    Advantages of the DIA format:
+        - efficient for matrices with few diagonals
+        - fast matrix-vector products
+        - simple structure
+
+    Disadvantages of the DIA format:
+        - inefficient for irregular sparsity patterns
+        - not suitable for general sparse matrices
+        - limited arithmetic operations
+
+    Differences from SciPy:
+        - Uses cupynumeric arrays instead of numpy arrays
+        - Limited functionality (mainly for matrix generation in examples)
+        - Some operations may not be fully optimized
+        - Primarily used as an intermediate format for conversion to CSR
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import dia_array
+    >>> data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    >>> offsets = np.array([-1, 0, 1])
+    >>> A = dia_array((data, offsets), shape=(3, 3))
+    >>> A.todense()
+    array([[5, 2, 0],
+           [4, 8, 3],
+           [0, 7, 9]])
+    """
+
     def __init__(self, arg, shape=None, dtype=None, copy=False):
+        """Initialize a DIA array.
+
+        Parameters
+        ----------
+        arg : tuple
+            The input data. Must be a tuple (data, offsets) where:
+            - data is a 2-D array containing the diagonal values
+            - offsets is a 1-D array or integer specifying diagonal offsets
+        shape : tuple, optional
+            Shape of the array (M, N). Required if not inferrable from input.
+        dtype : dtype, optional
+            Data type of the array. If None, inferred from input data.
+        copy : bool, optional
+            Whether to copy the input data. Default is False.
+
+        Raises
+        ------
+        NotImplementedError
+            If shape is not provided (shape is required for DIA arrays).
+        AssertionError
+            If arg is not a tuple or has invalid format.
+        ValueError
+            If input data is inconsistent or invalid.
+
+        Notes
+        -----
+        The DIA format is primarily used for matrix generation in examples
+        and as an intermediate format for conversion to CSR. The shape
+        parameter is required as it cannot be inferred from the diagonal data.
+
+        The offsets array specifies which diagonals are stored:
+        - k > 0: upper diagonal (kth diagonal above main diagonal)
+        - k = 0: main diagonal
+        - k < 0: lower diagonal (kth diagonal below main diagonal)
+        """
         if shape is None:
             raise NotImplementedError
         assert isinstance(arg, tuple)
@@ -89,6 +189,18 @@ def __init__(self, arg, shape=None, dtype=None, copy=False):
 
     @property
     def nnz(self):
+        """Number of stored values, including explicit zeros.
+
+        Returns
+        -------
+        int
+            The number of non-zero elements in the matrix.
+
+        Notes
+        -----
+        This property computes the number of non-zeros by iterating through
+        each diagonal and counting the valid elements within the matrix bounds.
+        """
         M, N = self.shape
         nnz = 0
         for k in self.offsets:
@@ -100,18 +212,73 @@ def nnz(self):
 
     @property
     def data(self):
+        """Get the data array of the DIA matrix.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            The data array containing the diagonal values. Each row represents
+            a diagonal, with shape (n_diagonals, max_diagonal_length).
+        """
         return store_to_cupynumeric_array(self._data)
 
     @property
     def offsets(self):
+        """Get the offsets array of the DIA matrix.
+
+        Returns
+        -------
+        cupynumeric.ndarray
+            The offsets array specifying which diagonals are stored.
+            Positive values indicate upper diagonals, negative values
+            indicate lower diagonals, and zero indicates the main diagonal.
+        """
         return store_to_cupynumeric_array(self._offsets)
 
     def copy(self):
+        """Returns a copy of this matrix.
+
+        Returns
+        -------
+        dia_array
+            A copy of the matrix with the same data and structure.
+        """
         data = cupynumeric.array(self.data)
         offsets = cupynumeric.array(self.offsets)
         return dia_array((data, offsets), shape=self.shape, dtype=self.dtype)
 
     def transpose(self, axes=None, copy=False):
+        """Reverses the dimensions of the sparse matrix.
+
+        Parameters
+        ----------
+        axes : None, optional
+            This argument is not supported and must be None.
+        copy : bool, optional
+            Whether to create a copy. Not supported - must be False.
+
+        Returns
+        -------
+        dia_array
+            Transposed matrix with shape (N, M) where the original shape was (M, N).
+
+        Raises
+        ------
+        ValueError
+            If axes is not None.
+        AssertionError
+            If copy is True (not supported).
+
+        Notes
+        -----
+        The axes parameter is not supported and must be None.
+        The copy parameter is not supported and must be False.
+
+        Transposing a DIA matrix involves:
+        1. Flipping the diagonal offsets (negating them)
+        2. Re-aligning the data matrix to account for the new offsets
+        3. Adjusting the shape from (M, N) to (N, M)
+        """
         if axes is not None:
             raise ValueError(
                 "Sparse matrices do not support "
@@ -147,9 +314,27 @@ def transpose(self, axes=None, copy=False):
             dtype=self.dtype,
         )
 
-    T = property(transpose)
+    T = property(transpose, doc="Transpose of the matrix")
 
     def tocsr(self, copy=False):
+        """Convert this matrix to a CSR matrix.
+
+        Parameters
+        ----------
+        copy : bool, optional
+            Whether to create a copy. Default is False.
+
+        Returns
+        -------
+        csr_array
+            The converted CSR matrix.
+
+        Notes
+        -----
+        The conversion to CSR is done by first transposing the matrix
+        and then converting the transposed matrix to CSR format.
+        This approach is used to simplify the conversion process.
+        """
         if copy:
             return self.copy().tocsr(copy=False)
         # we don't need secondary copy
@@ -157,6 +342,33 @@ def tocsr(self, copy=False):
 
     # This routine is lifted from scipy.sparse's converter.
     def _tocsr_transposed(self, copy=False):
+        """Convert the transposed DIA matrix to CSR format.
+
+        This internal method converts a transposed DIA matrix to CSR format.
+        It is used by the tocsr method after transposing the original matrix.
+
+        Parameters
+        ----------
+        copy : bool, optional
+            Whether to create a copy. Default is False.
+
+        Returns
+        -------
+        csr_array
+            The CSR representation of the transposed matrix.
+
+        Notes
+        -----
+        This method is adapted from SciPy's DIA to CSR converter.
+        It handles the conversion by:
+        1. Creating masks for valid diagonal elements
+        2. Computing the indptr array using cumulative sums
+        3. Extracting indices and data for non-zero elements
+        4. Constructing the CSR matrix
+
+        The method ensures that only elements within the matrix bounds
+        and with non-zero values are included in the CSR representation.
+        """
         if self.nnz == 0:
             return csr_array(self.shape, self.dtype)
 
@@ -192,3 +404,4 @@ def _tocsr_transposed(self, copy=False):
 
 # Declare an alias for this type.
 dia_matrix = dia_array
+"""Alias for dia_array for backward compatibility with SciPy naming conventions."""
diff --git a/legate_sparse/gallery.py b/legate_sparse/gallery.py
index 56e71054..371a4c44 100644
--- a/legate_sparse/gallery.py
+++ b/legate_sparse/gallery.py
@@ -75,8 +75,8 @@
 
 
 def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
-    """
-    Construct a sparse matrix from diagonals.
+    """Construct a sparse matrix from diagonals.
+
     Parameters
     ----------
     diagonals : sequence of array_like
@@ -90,44 +90,69 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
     shape : tuple of int, optional
         Shape of the result. If omitted, a square matrix large enough
         to contain the diagonals is returned.
-    format : {"dia", "csr", "csc", "lil", ...}, optional
-        Matrix format of the result. By default (format=None) an
-        appropriate sparse matrix format is returned. This choice is
-        subject to change.
+    format : {"dia", "csr"}, optional
+        Matrix format of the result. By default (format=None) a DIA
+        matrix is returned. Currently only "dia" and "csr" are supported.
     dtype : dtype, optional
-        Data type of the matrix.
+        Data type of the matrix. Must be specified.
+
+    Returns
+    -------
+    sparse matrix
+        A sparse matrix in the specified format with the given diagonals.
+
+    Raises
+    ------
+    ValueError
+        If the number of diagonals and offsets don't match, or if
+        diagonal lengths don't agree with matrix size.
+    NotImplementedError
+        If dtype is not specified or format is not supported.
+
     See Also
     --------
     spdiags : construct matrix from diagonals
+
     Notes
     -----
     This function differs from `spdiags` in the way it handles
     off-diagonals.
+
     The result from `diags` is the sparse equivalent of::
         np.diag(diagonals[0], offsets[0])
         + ...
         + np.diag(diagonals[k], offsets[k])
+
     Repeated diagonal offsets are disallowed.
-    .. versionadded:: 0.11
+
+    Differences from SciPy:
+        - Uses cupynumeric arrays instead of numpy arrays
+        - dtype parameter is required (cannot be None)
+        - Limited format support (only "dia" and "csr")
+        - Primarily used for matrix generation in examples
+
     Examples
     --------
-    >>> from scipy.sparse import diags
+    >>> import cupynumeric as np
+    >>> from legate_sparse import diags
     >>> diagonals = [[1, 2, 3, 4], [1, 2, 3], [1, 2]]
-    >>> diags(diagonals, [0, -1, 2]).toarray()
+    >>> diags(diagonals, [0, -1, 2], dtype=np.float64).todense()
     array([[1, 0, 1, 0],
            [1, 2, 0, 2],
            [0, 2, 3, 0],
            [0, 0, 3, 4]])
+
     Broadcasting of scalars is supported (but shape needs to be
     specified):
-    >>> diags([1, -2, 1], [-1, 0, 1], shape=(4, 4)).toarray()
+    >>> diags([1, -2, 1], [-1, 0, 1], shape=(4, 4), dtype=np.float64).todense()
     array([[-2.,  1.,  0.,  0.],
            [ 1., -2.,  1.,  0.],
            [ 0.,  1., -2.,  1.],
            [ 0.,  0.,  1., -2.]])
+
     If only one diagonal is wanted (as in `numpy.diag`), the following
     works as well:
-    >>> diags([1, 2, 3], 1).toarray()
+    >>> diags([1, 2, 3], 1, dtype=np.float64).todense()
     array([[ 0.,  1.,  0.,  0.],
            [ 0.,  0.,  2.,  0.],
            [ 0.,  0.,  0.,  3.],
diff --git a/legate_sparse/install_info.pyi b/legate_sparse/install_info.pyi
new file mode 100644
index 00000000..16574ccf
--- /dev/null
+++ b/legate_sparse/install_info.pyi
@@ -0,0 +1,16 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+#
+# See the LICENSE file for details.
+#
+
+# Stub file for install_info module to satisfy mypy
+# This module is generated during build time
+
+libpath: str
+header: str
diff --git a/legate_sparse/io.py b/legate_sparse/io.py
index 2b6a09e3..ecaf8e3c 100644
--- a/legate_sparse/io.py
+++ b/legate_sparse/io.py
@@ -24,6 +24,38 @@
 
 @track_provenance(runtime.sparse_library)
 def mmread(source):
+    """Read a sparse matrix from a Matrix Market (.mtx) file.
+
+    Parameters
+    ----------
+    source : str
+        The filename or path to the Matrix Market file to read.
+
+    Returns
+    -------
+    csr_array
+        A sparse matrix in CSR format loaded from the file.
+
+    Notes
+    -----
+    This function reads Matrix Market format files and converts them
+    to CSR format. The Matrix Market format is a standard format for
+    storing sparse matrices. For more information on the format, see
+    https://math.nist.gov/MatrixMarket/formats.html.
+
+    The function assumes that all nodes in the system can access the
+    file, so no special file distribution is needed.
+
+    The implementation reads the file in COO format and then converts
+    to CSR format for efficient storage and operations.
+
+    Examples
+    --------
+    >>> from legate_sparse import mmread
+    >>> A = mmread("matrix.mtx")
+    >>> print(A.shape)
+    (1000, 1000)
+    """
     # TODO (rohany): We'll assume for now that all of the nodes in the system
     # can access the file passed in, so we don't need to worry about where this
     # task gets mapped to.
diff --git a/legate_sparse/linalg.py b/legate_sparse/linalg.py
index 6d515502..82aa0edb 100644
--- a/legate_sparse/linalg.py
+++ b/legate_sparse/linalg.py
@@ -66,6 +66,32 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
+"""
+Sparse linear algebra (:mod:`legate_sparse.linalg`)
+===================================================
+
+.. currentmodule:: legate_sparse.linalg
+
+Abstract linear operators
+-------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   LinearOperator -- abstract representation of a linear operator
+
+Solving linear problems
+-----------------------
+
+Iterative methods for linear equation systems:
+
+.. autosummary::
+   :toctree: generated/
+
+   cg -- Use Conjugate Gradient iteration to solve Ax = b
+   gmres -- Use Generalized Minimal RESidual iteration to solve Ax = b
+
+"""
 
 import inspect
 import warnings
@@ -414,6 +440,23 @@ def _rmatvec(self, x, out=None):
 
 
 def make_linear_operator(A):
+    """Convert a matrix to a LinearOperator.
+
+    Parameters
+    ----------
+    A : array_like, sparse matrix, or LinearOperator
+        The matrix to convert.
+
+    Returns
+    -------
+    LinearOperator
+        A LinearOperator representation of A.
+
+    Notes
+    -----
+    If A is already a LinearOperator, it is returned unchanged.
+    Otherwise, A is wrapped in a _SparseMatrixLinearOperator.
+    """
     if isinstance(A, LinearOperator):
         return A
     else:
@@ -431,6 +474,39 @@ def make_linear_operator(A):
 # allocating unnecessary futures.
 @track_provenance(nested=True)
 def cg_axpby(y, x, a, b, isalpha=True, negate=False):
+    """Perform fused vector operation for CG solvers.
+
+    This function performs the operation y = alpha * x + beta * y where
+    alpha and beta are computed as a/b within the task. This avoids
+    unnecessary future operations and memory allocations.
+
+    Parameters
+    ----------
+    y : cupynumeric.ndarray
+        Output vector that will be modified in-place.
+    x : cupynumeric.ndarray
+        Input vector for the operation.
+    a : cupynumeric.ndarray
+        Numerator for computing alpha or beta.
+    b : cupynumeric.ndarray
+        Denominator for computing alpha or beta.
+    isalpha : bool, optional
+        If True, a/b is interpreted as alpha. If False, as beta.
+        Default is True.
+    negate : bool, optional
+        If True, negate the computed coefficient. Default is False.
+
+    Returns
+    -------
+    cupynumeric.ndarray
+        The modified y vector (same as input y).
+
+    Notes
+    -----
+    This is a specialized implementation for CG solvers that fuses
+    coefficient computation with vector operations to avoid unnecessary
+    memory allocations and future operations in the Legion runtime.
+    """
     y_store = get_store_from_cupynumeric_array(y)
     x_store = get_store_from_cupynumeric_array(x)
     task = runtime.create_auto_task(SparseOpCode.AXPBY)
@@ -451,6 +527,29 @@ def cg_axpby(y, x, a, b, isalpha=True, negate=False):
 
 
 def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5):
+    """Compute absolute and relative tolerances for convergence.
+
+    Parameters
+    ----------
+    b_norm : float
+        Norm of the right-hand side vector.
+    tol : float, optional
+        Legacy tolerance parameter. If provided, overrides rtol.
+    atol : float, optional
+        Absolute tolerance. Default is 0.0.
+    rtol : float, optional
+        Relative tolerance. Default is 1e-5.
+
+    Returns
+    -------
+    tuple
+        (atol, rtol) - computed absolute and relative tolerances.
+
+    Notes
+    -----
+    If atol is None, it is set to rtol. The final atol is the maximum
+    of the provided atol and rtol * b_norm.
+    """
     rtol = float(tol) if tol is not None else rtol
 
     if atol is None:
@@ -473,6 +572,60 @@ def cg(
     rtol=1e-5,
     conv_test_iters=25,
 ):
+    """Solve a linear system using the Conjugate Gradient method.
+
+    Parameters
+    ----------
+    A : sparse matrix or LinearOperator
+        The coefficient matrix of the linear system.
+    b : cupynumeric.ndarray
+        Right-hand side of the linear system.
+    x0 : cupynumeric.ndarray, optional
+        Initial guess for the solution. If None, uses zero vector.
+    tol : float, optional
+        Legacy tolerance parameter. If provided, overrides rtol.
+    maxiter : int, optional
+        Maximum number of iterations. If None, uses 10 * n.
+    M : sparse matrix or LinearOperator, optional
+        Preconditioner for A. If None, uses identity.
+    callback : callable, optional
+        User-specified function called after each iteration.
+    atol : float, optional
+        Absolute tolerance for convergence. Default is 0.0.
+    rtol : float, optional
+        Relative tolerance for convergence. Default is 1e-5.
+    conv_test_iters : int, optional
+        Number of iterations between convergence tests. Default is 25.
+
+    Returns
+    -------
+    tuple
+        (x, info) where x is the solution and info is zero if solution is
+        converged else number of iterations
+
+    Raises
+    ------
+    AssertionError
+        If b is not 1D or A is not square.
+
+    Notes
+    -----
+    This implementation follows SciPy's CG solver semantics closely.
+    The method uses fused vector operations to avoid unnecessary
+    memory allocations and improve performance.
+
+    Convergence is tested every conv_test_iters iterations to avoid
+    the overhead of computing the residual norm in every iteration.
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import csr_array, linalg
+    >>> A = csr_array([[4, 1], [1, 3]])
+    >>> b = np.array([1, 2])
+    >>> x, iters = linalg.cg(A, b)
+    >>> print(f"Solution: {x}, Iterations: {iters}")
+    """
     # We keep semantics as close as possible to scipy.cg.
     # https://github.com/scipy/scipy/blob/v1.9.0/scipy/sparse/linalg/_isolve/iterative.py#L298-L385
     assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1)
@@ -503,6 +656,7 @@ def cg(
     z = None
     q = None
 
+    converged = False
     while iters < maxiter:
         z = M.matvec(r, out=z)
         rho1 = rho
@@ -528,10 +682,15 @@ def cg(
         if (iters % conv_test_iters == 0 or iters == (maxiter - 1)) and np.linalg.norm(
             r
         ) < atol:
+            converged = True
             # Test convergence every conv_test_iters iterations.
             break
 
-    return x, iters
+    info = 0
+    if iters == maxiter and not converged:
+        info = iters
+
+    return x, info
 
 
 # This implementation of GMRES is lifted from the cupy implementation:
@@ -550,43 +709,77 @@ def gmres(
     callback_type=None,
     rtol=1e-5,
 ):
-    """Uses Generalized Minimal RESidual iteration to solve ``Ax = b``.
-    Args:
-        A (ndarray, spmatrix or LinearOperator): The real or complex
-            matrix of the linear system with shape ``(n, n)``. ``A`` must be
-            :class:`cupy.ndarray`, :class:`cupyx.scipy.sparse.spmatrix` or
-            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
-        b (cupy.ndarray): Right hand side of the linear system with shape
-            ``(n,)`` or ``(n, 1)``.
-        x0 (cupy.ndarray): Starting guess for the solution.
-        tol (float): Tolerance for convergence. This argument is optional,
-            deprecated in favour of ``rtol``.
-        restart (int): Number of iterations between restarts. Larger values
-            increase iteration cost, but may be necessary for convergence.
-        maxiter (int): Maximum number of iterations.
-        M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``.
-            The preconditioner should approximate the inverse of ``A``.
-            ``M`` must be :class:`cupy.ndarray`,
-            :class:`cupyx.scipy.sparse.spmatrix` or
-            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
-        callback (function): User-specified function to call on every restart.
-            It is called as ``callback(arg)``, where ``arg`` is selected by
-            ``callback_type``.
-        callback_type (str): 'x' or 'pr_norm'. If 'x', the current solution
-            vector is used as an argument of callback function. if 'pr_norm',
-            relative (preconditioned) residual norm is used as an arugment.
-        atol, rtol (float): Tolerance for convergence. For convergence,
-            ``norm(b - A @ x) <= max(rtol*norm(b), atol)`` should be satisfied.
-            The default is ``atol=0.`` and ``rtol=1e-5``.
-    Returns:
-        tuple:
-            It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is
-            the converged solution and ``info`` provides convergence
-            information.
-    Reference:
-        M. Wang, H. Klie, M. Parashar and H. Sudan, "Solving Sparse Linear
-        Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009).
-    .. seealso:: :func:`scipy.sparse.linalg.gmres`
+    """Solve a linear system using the Generalized Minimal Residual method.
+
+    Parameters
+    ----------
+    A : sparse matrix or LinearOperator
+        The coefficient matrix of the linear system.
+    b : cupynumeric.ndarray
+        Right-hand side of the linear system with shape (n,) or (n, 1).
+    x0 : cupynumeric.ndarray, optional
+        Starting guess for the solution. If None, uses zero vector.
+    tol : float, optional
+        Legacy tolerance parameter. If provided, overrides rtol.
+    restart : int, optional
+        Number of iterations between restarts. Larger values increase
+        iteration cost but may be necessary for convergence. Default is 20.
+    maxiter : int, optional
+        Maximum number of iterations. If None, uses 10 * n.
+    M : sparse matrix or LinearOperator, optional
+        Preconditioner for A. The preconditioner should approximate
+        the inverse of A. If None, uses identity.
+    callback : callable, optional
+        User-specified function called on every restart.
+    restrt : int, optional
+        Deprecated alias for restart parameter.
+    atol : float, optional
+        Absolute tolerance for convergence. Default is 0.0.
+    callback_type : str, optional
+        Type of callback argument: 'x' for current solution vector,
+        'pr_norm' for relative preconditioned residual norm. Default is 'pr_norm'.
+    rtol : float, optional
+        Relative tolerance for convergence. Default is 1e-5.
+
+    Returns
+    -------
+    tuple
+        (x, info) where x is the converged solution and info provides
+        convergence information.
+
+    Raises
+    ------
+    AssertionError
+        If b is not 1D or A is not square.
+    ValueError
+        If callback_type is not 'x' or 'pr_norm'.
+
+    Notes
+    -----
+    This implementation is adapted from CuPy's GMRES solver.
+    The method uses Arnoldi iteration to build a Krylov subspace
+    and solves the least squares problem in that subspace.
+
+    For convergence, the residual must satisfy:
+    norm(b - A @ x) <= max(rtol * norm(b), atol)
+
+    The restart parameter controls the trade-off between memory usage
+    and convergence rate. Larger restart values may improve convergence
+    but require more memory.
+
+    References
+    ----------
+    M. Wang, H. Klie, M. Parashar and H. Sudan, "Solving Sparse Linear
+    Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009).
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import csr_array, linalg
+    >>> A = csr_array([[4, 1, 0], [1, 3, 1], [0, 1, 2]])
+    >>> b = np.array([1, 2, 3])
+    >>> x, info = linalg.gmres(A, b, restart=10)
+    >>> print(f"Solution: {x}, Info: {info}")
     """
     assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1)
     assert len(A.shape) == 2 and A.shape[0] == A.shape[1]
@@ -625,6 +818,27 @@ def gmres(
     e = np.zeros((restart + 1,), dtype=A.dtype)
 
     def compute_hu(u, j):
+        """Compute Householder transformation for Arnoldi iteration.
+
+        Parameters
+        ----------
+        u : cupynumeric.ndarray
+            Vector to be transformed.
+        j : int
+            Current iteration index.
+
+        Returns
+        -------
+        tuple
+            (h, u) where h contains the Householder coefficients and
+            u is the transformed vector.
+
+        Notes
+        -----
+        This function computes the Householder transformation that
+        orthogonalizes the current vector against the previous basis
+        vectors in the Arnoldi iteration.
+        """
         h = V[:, : j + 1].conj().T @ u
         u -= V[:, : j + 1] @ h
         return h, u
diff --git a/legate_sparse/module.py b/legate_sparse/module.py
index a84abb0f..56f22fa1 100644
--- a/legate_sparse/module.py
+++ b/legate_sparse/module.py
@@ -55,16 +55,66 @@
 from .types import coord_ty, nnz_ty  # noqa: F401
 
 
-# is_sparse_matrix returns whether or not an object is a legate
-# sparse created sparse matrix.
-def is_sparse_matrix(o):
-    return any((isinstance(o, csr_array),))
+# returns whether or not an object is a legate sparse created sparse matrix.
+def _is_sparse_matrix(obj) -> bool:
+    return any((isinstance(obj, csr_array), isinstance(obj, dia_array)))
 
+def isspmatrix(obj) -> bool:
+    """Check if an object is a legate sparse matrix.
 
-issparse = is_sparse_matrix
-isspmatrix = is_sparse_matrix
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a legate sparse matrix, False otherwise.
+
+    Notes
+    -----
+    This function checks if the object is an instance of any supported
+    sparse matrix format in legate_sparse. Currently, only
+    CSR and DIA formats for supported.
+    """
+    return _is_sparse_matrix(obj)
+
+
+def issparse(obj) -> bool:
+    """Check if an object is a legate sparse matrix.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a legate sparse matrix, False otherwise.
+
+    Notes
+    -----
+    This function checks if the object is an instance of any supported
+    sparse matrix format in legate_sparse. Currently, only
+    CSR and DIA formats for supported.
+    """
+    return _is_sparse_matrix(obj)
 
 
 # Variants for each particular format type.
-def isspmatrix_csr(o):
-    return isinstance(o, csr_array)
+def isspmatrix_csr(obj):
+    """Check if an object is a CSR sparse matrix.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a CSR sparse matrix, False otherwise.
+    """
+    return isinstance(obj, csr_array)
diff --git a/legate_sparse/types.py b/legate_sparse/types.py
index 376fe116..923767f2 100644
--- a/legate_sparse/types.py
+++ b/legate_sparse/types.py
@@ -18,8 +18,19 @@
 # progress in generalizing the compute kernels, we can
 # remove this code.
 coord_ty = numpy.dtype(numpy.int64)
+"""Data type for coordinate indices in sparse matrices (int64)."""
+
 nnz_ty = numpy.dtype(numpy.uint64)
+"""Data type for non-zero counts in sparse matrices (uint64)."""
+
 float64 = numpy.dtype(numpy.float64)
+"""64-bit floating point data type."""
+
 int32 = numpy.dtype(numpy.int32)
+"""32-bit integer data type."""
+
 int64 = numpy.dtype(numpy.int64)
+"""64-bit integer data type."""
+
 uint64 = numpy.dtype(numpy.uint64)
+"""64-bit unsigned integer data type."""
diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py
index 11daf9fd..2c072f2b 100644
--- a/legate_sparse/utils.py
+++ b/legate_sparse/utils.py
@@ -31,11 +31,25 @@
     numpy.complex64,
     numpy.complex128,
 )
+"""Supported datatypes for sparse matrix operations (SpMV and SpGEMM)."""
 
 
 # find_last_user_stacklevel gets the last stack frame index
 # within legate sparse.
 def find_last_user_stacklevel() -> int:
+    """Find the last stack frame index within legate sparse.
+
+    Returns
+    -------
+    int
+        The stack level of the last user code frame.
+
+    Notes
+    -----
+    This function walks the stack to find the first frame that is not
+    within the legate_sparse module, which is useful for determining
+    the appropriate stack level for warnings.
+    """
     stacklevel = 1
     for frame, _ in traceback.walk_stack(None):
         if not frame.f_globals["__name__"].startswith("sparse"):
@@ -46,6 +60,18 @@ def find_last_user_stacklevel() -> int:
 
 # store_to_cupynumeric_array converts a store to a cuPyNumeric array.
 def store_to_cupynumeric_array(store: LogicalStore):
+    """Convert a LogicalStore to a cupynumeric array.
+
+    Parameters
+    ----------
+    store : LogicalStore
+        The store to convert.
+
+    Returns
+    -------
+    cupynumeric.ndarray
+        The cupynumeric array representation of the store.
+    """
     return cupynumeric.asarray(store)
 
 
@@ -54,6 +80,20 @@ def get_store_from_cupynumeric_array(
     arr: cupynumeric.ndarray,
     copy=False,
 ) -> LogicalStore:
+    """Extract a LogicalStore from a cupynumeric array.
+
+    Parameters
+    ----------
+    arr : cupynumeric.ndarray
+        The cupynumeric array to extract the store from.
+    copy : bool, optional
+        Whether to create a copy of the array first. Default is False.
+
+    Returns
+    -------
+    LogicalStore
+        The LogicalStore representation of the array.
+    """
     if copy:
         # If requested to make a copy, do so.
         arr = cupynumeric.array(arr)
@@ -67,6 +107,23 @@ def get_store_from_cupynumeric_array(
 
 # cast_to_store attempts to cast an arbitrary object into a store.
 def cast_to_store(arr):
+    """Cast an arbitrary object to a LogicalStore.
+
+    Parameters
+    ----------
+    arr : array_like or LogicalStore
+        The object to cast.
+
+    Returns
+    -------
+    LogicalStore
+        The LogicalStore representation of the input.
+
+    Raises
+    ------
+    NotImplementedError
+        If the object cannot be cast to a LogicalStore.
+    """
     if isinstance(arr, LogicalStore):
         return arr
     if isinstance(arr, numpy.ndarray):
@@ -79,6 +136,20 @@ def cast_to_store(arr):
 # cast_arr attempts to cast an arbitrary object into a cupynumeric
 # ndarray, with an optional desired type.
 def cast_arr(arr, dtype=None):
+    """Cast an arbitrary object to a cupynumeric array.
+
+    Parameters
+    ----------
+    arr : array_like or LogicalStore
+        The object to cast.
+    dtype : dtype, optional
+        The desired data type. If None, preserves the original type.
+
+    Returns
+    -------
+    cupynumeric.ndarray
+        The cupynumeric array representation of the input.
+    """
     if isinstance(arr, LogicalStore):
         arr = store_to_cupynumeric_array(arr)
     elif not isinstance(arr, cupynumeric.ndarray):
@@ -88,14 +159,32 @@ def cast_arr(arr, dtype=None):
     return arr
 
 
-# find_common_type performs a similar analysis to
-# cupynumeric.ndarray.find_common_type to find a common type
-# between all of the arguments.
 def find_common_type(*args):
+    """Find the common data type for a set of arrays.
+
+    This function performs a similar analysis to cupynumeric.ndarray.find_common_type
+    to find a common type between all of the arguments.
+
+    Parameters
+    ----------
+    *args : array_like
+        Arrays to find the common type for.
+
+    Returns
+    -------
+    numpy.dtype
+        The common data type that can represent all input arrays.
+
+    Notes
+    -----
+    The function handles sparse matrices, dense arrays, and scalars.
+    For sparse matrices, it uses their dtype. For scalars (size == 1),
+    they are treated separately from arrays.
+    """
     array_types = list()
     scalar_types = list()
     for array in args:
-        if legate_sparse.is_sparse_matrix(array):
+        if legate_sparse.isspmatrix(array):
             array_types.append(array.dtype)
         elif array.size == 1:
             scalar_types.append(array.dtype)
@@ -104,18 +193,47 @@ def find_common_type(*args):
     return numpy.result_type(*array_types, *scalar_types)
 
 
-# cast_to_common_type casts all arguments to the same common dtype.
 def cast_to_common_type(*args):
-    # Find a common type for all of the arguments.
+    """Cast all arguments to the same common data type.
+
+    Parameters
+    ----------
+    *args : array_like
+        Arrays to cast to a common type.
+
+    Returns
+    -------
+    tuple
+        Tuple of arrays, all cast to the same common data type.
+
+    Notes
+    -----
+    This function first finds the common type using find_common_type,
+    then casts each input to that type. If all arguments are already
+    the common type, this will be a no-op.
+    """
     common_type = find_common_type(*args)
-    # Cast each input to the common type. Ideally, if all of the
-    # arguments are already the common type then this will
-    # be a no-op.
     return tuple(arg.astype(common_type, copy=False) for arg in args)
 
 
-# factor_int decomposes an integer into a close to square grid.
 def factor_int(n):
+    """Split an integer into two close factors.
+
+    Parameters
+    ----------
+    n : int
+        The integer to factor.
+
+    Returns
+    -------
+    tuple
+        (val, val2) where val * val2 = n and val is close to sqrt(n).
+
+    Notes
+    -----
+    This function finds two factors of n such that their product equals n
+    and the first factor is close to the square root of n.
+    """
     val = math.ceil(math.sqrt(n))
     val2 = int(n / val)
     while val2 * val != float(n):
@@ -124,9 +242,31 @@ def factor_int(n):
     return val, val2
 
 
-# broadcast_store broadcasts a store to the desired input shape,
-# or throws an error if the broadcast is not possible.
 def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore:
+    """Broadcast a LogicalStore to the desired shape.
+
+    Parameters
+    ----------
+    store : LogicalStore
+        The store to broadcast.
+    shape : tuple
+        The target shape to broadcast to.
+
+    Returns
+    -------
+    LogicalStore
+        The broadcasted store.
+
+    Raises
+    ------
+    ValueError
+        If the broadcast is not possible.
+
+    Notes
+    -----
+    This function handles both dimension promotion (adding new dimensions)
+    and broadcasting (expanding dimensions of size 1).
+    """
     diff = len(shape) - store.ndim
     for dim in range(diff):
         store = store.promote(dim, shape[dim])
@@ -142,13 +282,43 @@ def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore:
 
 
 def copy_store(store: LogicalStore) -> LogicalStore:
+    """Create a copy of a LogicalStore.
+
+    Parameters
+    ----------
+    store : LogicalStore
+        The store to copy.
+
+    Returns
+    -------
+    LogicalStore
+        A new LogicalStore with the same data as the input.
+    """
     res = runtime.create_store(store.type, store.shape)  # type: ignore
     runtime.legate_runtime.issue_copy(res, store)
     return res
 
 
 def store_from_store_or_array(src, copy=False) -> LogicalStore:  # type: ignore
-    "Get LogicalStore from a LogicalStore or array, potentially creating a copy"
+    """Get LogicalStore from a LogicalStore or array, potentially creating a copy.
+
+    Parameters
+    ----------
+    src : LogicalStore or cupynumeric.ndarray
+        The source object to convert.
+    copy : bool, optional
+        Whether to create a copy. Default is False.
+
+    Returns
+    -------
+    LogicalStore
+        The LogicalStore representation of the input.
+
+    Raises
+    ------
+    AssertionError
+        If the input type is not supported.
+    """
     if isinstance(src, cupynumeric.ndarray):
         return get_store_from_cupynumeric_array(src, copy)
     elif isinstance(src, LogicalStore):
@@ -158,7 +328,25 @@ def store_from_store_or_array(src, copy=False) -> LogicalStore:  # type: ignore
 
 
 def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray:  # type: ignore
-    "Get array from a LogicalStore or array, potentially creating a copy"
+    """Get array from a LogicalStore or array, potentially creating a copy.
+
+    Parameters
+    ----------
+    src : LogicalStore or cupynumeric.ndarray
+        The source object to convert.
+    copy : bool, optional
+        Whether to create a copy. Default is False.
+
+    Returns
+    -------
+    cupynumeric.ndarray
+        The cupynumeric array representation of the input.
+
+    Raises
+    ------
+    AssertionError
+        If the input type is not supported.
+    """
     if isinstance(src, cupynumeric.ndarray):
         return src.copy() if copy else src
     elif isinstance(src, LogicalStore):
@@ -173,6 +361,23 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray:  # type:
 
 
 def get_storage_type(src):
+    """Get the storage type of an object.
+
+    Parameters
+    ----------
+    src : LogicalStore or cupynumeric.ndarray
+        The object to get the storage type for.
+
+    Returns
+    -------
+    numpy.dtype
+        The data type of the object.
+
+    Raises
+    ------
+    AssertionError
+        If the input type is not supported.
+    """
     if isinstance(src, cupynumeric.ndarray):
         return src.dtype
     elif isinstance(src, LogicalStore):
@@ -185,33 +390,58 @@ def get_storage_type(src):
 
 
 def is_dtype_supported(dtype: numpy.dtype) -> bool:
-    """
-    Does this datatype support spMV and spGEMM operations
+    """Check if a datatype supports SpMV and SpGEMM operations.
 
     Parameters
     ----------
-    dtype: np.dtype
-        Input datatype to check if it supports spMV and spGEMM
+    dtype : numpy.dtype
+        Input datatype to check if it supports SpMV and SpGEMM.
 
     Returns
     -------
-    valid: bool
-        True if  dtype supports spMV and spGEMM
-    """
+    bool
+        True if dtype supports SpMV and SpGEMM operations.
 
+    Notes
+    -----
+    Currently supported datatypes are float32, float64, complex64, and complex128.
+    """
     return dtype in SUPPORTED_DATATYPES
 
 
 def is_dense(x) -> bool:
-    """
-    Is this object a dense cupynumeric array
+    """Check if an object is a dense cupynumeric array.
+
+    Parameters
+    ----------
+    x : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if x is a cupynumeric.ndarray, False otherwise.
     """
     return isinstance(x, cupynumeric.ndarray)
 
 
 def is_scalar_like(x) -> bool:
-    """
-    Is this object a scalar like type
+    """Check if an object is a scalar-like type.
+
+    Parameters
+    ----------
+    x : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if x is a scalar or 0-dimensional array, False otherwise.
+
+    Notes
+    -----
+    This function returns False for strings, even though they are scalar-like
+    in some contexts, to avoid confusion with numeric scalars.
     """
     if isinstance(x, str):
         return False
@@ -219,32 +449,52 @@ def is_scalar_like(x) -> bool:
 
 
 def is_sparse(x) -> bool:
+    """Check if an object is a legate sparse matrix.
+
+    Parameters
+    ----------
+    x : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if x is a legate sparse matrix, False otherwise.
     """
-    Is this object a legate sparse matrix
-    """
-    return legate_sparse.is_sparse_matrix(x)
+    return legate_sparse.isspmatrix(x)
 
 
 def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray):
-    """
+    """Sort indices by rows first, then by columns.
+
     This function is a quick and dirty hack that does what np.lexsort does
-    using argsort, but only for two keys.
-    This is primarily used to to get the indices that we can use to sort data
-    first by rows and then by columns
+    using argsort, but only for two keys. This is primarily used to get
+    the indices that we can use to sort data first by rows and then by columns.
 
     Parameters
     ----------
-
-    rows: cupynumeric.ndarray
-        Indices of rows
-
-    cols: cupynumeric.ndarray
-        Indices of cols
+    rows : cupynumeric.ndarray
+        Indices of rows.
+    cols : cupynumeric.ndarray
+        Indices of columns.
 
     Returns
     -------
-    sorted_indices:cupynumeric.ndarray
-        Indices sorted by rows and then by columns, as given by numpy's lexsort
+    cupynumeric.ndarray
+        Indices sorted by rows and then by columns, as given by numpy's lexsort.
+
+    Notes
+    -----
+    This function is equivalent to np.lexsort((cols, rows)) but implemented
+    using stable sorting to ensure consistent results.
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> rows = np.array([1, 0, 1, 0])
+    >>> cols = np.array([2, 1, 1, 2])
+    >>> indices = sort_by_rows_then_cols(rows, cols)
+    >>> print(indices)  # [1, 3, 2, 0] - sorted by (row, col)
     """
     assert rows.size == cols.size
 
diff --git a/setup.py b/setup.py
index daa17216..68efb75c 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@
 
 setup(
     name="legate-sparse",
-    version="25.03.00",
+    version="25.07.00",
     description="An Aspiring Drop-In Replacement for SciPy Sparse module at Scale",
     author="NVIDIA Corporation",
     license="Apache 2.0",
@@ -48,8 +48,9 @@
         "Topic :: Scientific/Engineering",
         "License :: OSI Approved :: Apache Software License",
         "Programming Language :: Python",
-        "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
     ],
     packages=find_packages(
         where=".",
diff --git a/src/legate_sparse/array/conv/csr_to_dense.cc b/src/legate_sparse/array/conv/csr_to_dense.cc
index d865a805..de9a8958 100644
--- a/src/legate_sparse/array/conv/csr_to_dense.cc
+++ b/src/legate_sparse/array/conv/csr_to_dense.cc
@@ -55,7 +55,11 @@ struct CSRToDenseImplBody<VariantKind::CPU, INDEX_CODE, VAL_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { CSRToDense::register_variants(); }
+
+static const auto sparse_reg_task_ = []() -> char {
+  CSRToDense::register_variants();
+  return 0;
+}();
 
 }  // namespace
 
diff --git a/src/legate_sparse/array/conv/csr_to_dense.h b/src/legate_sparse/array/conv/csr_to_dense.h
index 7b5a947e..58ec4479 100644
--- a/src/legate_sparse/array/conv/csr_to_dense.h
+++ b/src/legate_sparse/array/conv/csr_to_dense.h
@@ -31,7 +31,8 @@ struct CSRToDenseArgs {
 
 class CSRToDense : public SparseTask<CSRToDense> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_TO_DENSE};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_TO_DENSE}};
 
  public:
   static void cpu_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/array/conv/dense_to_csr.cc b/src/legate_sparse/array/conv/dense_to_csr.cc
index 97a86fe7..3304b558 100644
--- a/src/legate_sparse/array/conv/dense_to_csr.cc
+++ b/src/legate_sparse/array/conv/dense_to_csr.cc
@@ -77,11 +77,12 @@ struct DenseToCSRImplBody<VariantKind::CPU, INDEX_CODE, VAL_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+
+static const auto sparse_reg_task_ = []() -> char {
   DenseToCSRNNZ::register_variants();
   DenseToCSR::register_variants();
-}
+  return 0;
+}();
 
 }  // namespace
 
diff --git a/src/legate_sparse/array/conv/dense_to_csr.h b/src/legate_sparse/array/conv/dense_to_csr.h
index a0ebddc0..c9cf504a 100644
--- a/src/legate_sparse/array/conv/dense_to_csr.h
+++ b/src/legate_sparse/array/conv/dense_to_csr.h
@@ -29,7 +29,8 @@ struct DenseToCSRNNZArgs {
 
 class DenseToCSRNNZ : public SparseTask<DenseToCSRNNZ> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR_NNZ};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR_NNZ}};
   static void cpu_variant(legate::TaskContext ctx);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext ctx);
@@ -48,7 +49,8 @@ struct DenseToCSRArgs {
 
 class DenseToCSR : public SparseTask<DenseToCSR> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR}};
   static void cpu_variant(legate::TaskContext ctx);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc
index 6b781134..7cadb10e 100644
--- a/src/legate_sparse/array/conv/pos_to_coordinates.cc
+++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc
@@ -44,10 +44,10 @@ struct ExpandPosToCoordinatesImplBody<VariantKind::CPU, INDEX_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   ExpandPosToCoordinates::register_variants();
-}
+  return 0;
+}();
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.h b/src/legate_sparse/array/conv/pos_to_coordinates.h
index 70e351a6..ad21ff95 100644
--- a/src/legate_sparse/array/conv/pos_to_coordinates.h
+++ b/src/legate_sparse/array/conv/pos_to_coordinates.h
@@ -29,7 +29,8 @@ struct ExpandPosToCoordinatesArgs {
 
 class ExpandPosToCoordinates : public SparseTask<ExpandPosToCoordinates> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES}};
 
  public:
   static void cpu_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/array/csr/get_diagonal.cc b/src/legate_sparse/array/csr/get_diagonal.cc
index 1e39b82b..cace6438 100644
--- a/src/legate_sparse/array/csr/get_diagonal.cc
+++ b/src/legate_sparse/array/csr/get_diagonal.cc
@@ -50,10 +50,11 @@ struct GetCSRDiagonalImplBody<VariantKind::CPU, INDEX_CODE, VAL_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   GetCSRDiagonal::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/csr/get_diagonal.h b/src/legate_sparse/array/csr/get_diagonal.h
index 6dd842bf..0c3d44a7 100644
--- a/src/legate_sparse/array/csr/get_diagonal.h
+++ b/src/legate_sparse/array/csr/get_diagonal.h
@@ -32,7 +32,8 @@ struct GetCSRDiagonalArgs {
 
 class GetCSRDiagonal : public SparseTask<GetCSRDiagonal> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_DIAGONAL};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_DIAGONAL}};
   // TODO (rohany): We could rewrite this having each implementation just make
   //  a call to thrust::transform, but the implementations are simple enough
   //  anyway.
diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc
index 8fc0c11b..f40c901b 100644
--- a/src/legate_sparse/array/csr/indexing.cc
+++ b/src/legate_sparse/array/csr/indexing.cc
@@ -90,10 +90,11 @@ struct CSRIndexingCSRImplBody<VariantKind::CPU, INDEX_CODE, VAL_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   CSRIndexingCSR::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/csr/indexing.h b/src/legate_sparse/array/csr/indexing.h
index 7bd6240c..8962370c 100644
--- a/src/legate_sparse/array/csr/indexing.h
+++ b/src/legate_sparse/array/csr/indexing.h
@@ -33,7 +33,8 @@ struct CSRIndexingCSRArgs {
 
 class CSRIndexingCSR : public SparseTask<CSRIndexingCSR> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR}};
 
   // TODO: The implementatio of the below three variants are
   // identical and hence need to be templated (DRY)
diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
index 47ed6d34..6c4945de 100644
--- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
+++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
@@ -177,12 +177,13 @@ struct SpGEMMCSRxCSRxCSRImplBody<VariantKind::CPU, INDEX_CODE, VAL_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   SpGEMMCSRxCSRxCSRNNZ::register_variants();
   SpGEMMCSRxCSRxCSR::register_variants();
   SpGEMMCSRxCSRxCSRGPU::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h
index bf5d526d..c1b004ce 100644
--- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h
+++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h
@@ -32,7 +32,8 @@ struct SpGEMMCSRxCSRxCSRNNZArgs {
 
 class SpGEMMCSRxCSRxCSRNNZ : public SparseTask<SpGEMMCSRxCSRxCSRNNZ> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ}};
 
   static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
@@ -60,7 +61,8 @@ struct SpGEMMCSRxCSRxCSRArgs {
 
 class SpGEMMCSRxCSRxCSR : public SparseTask<SpGEMMCSRxCSRxCSR> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR}};
 
   static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
@@ -94,7 +96,8 @@ struct SpGEMMCSRxCSRxCSRGPUArgs {
 // we take a different approach than on CPUs and OMPs.
 class SpGEMMCSRxCSRxCSRGPU : public SparseTask<SpGEMMCSRxCSRxCSRGPU> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU}};
 
   static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/legate_sparse/array/csr/spmv.cc b/src/legate_sparse/array/csr/spmv.cc
index 63c305c6..d9efa4fd 100644
--- a/src/legate_sparse/array/csr/spmv.cc
+++ b/src/legate_sparse/array/csr/spmv.cc
@@ -51,10 +51,11 @@ struct CSRSpMVRowSplitImplBody<VariantKind::CPU, INDEX_CODE, VAL_CODE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   CSRSpMVRowSplit::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/csr/spmv.h b/src/legate_sparse/array/csr/spmv.h
index 8c46ba7f..7d718990 100644
--- a/src/legate_sparse/array/csr/spmv.h
+++ b/src/legate_sparse/array/csr/spmv.h
@@ -32,7 +32,8 @@ struct CSRSpMVRowSplitArgs {
 
 class CSRSpMVRowSplit : public SparseTask<CSRSpMVRowSplit> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT}};
 
   static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/legate_sparse/array/util/scale_rect.cc b/src/legate_sparse/array/util/scale_rect.cc
index bad54157..c2d2df90 100644
--- a/src/legate_sparse/array/util/scale_rect.cc
+++ b/src/legate_sparse/array/util/scale_rect.cc
@@ -39,7 +39,11 @@ struct ScaleRect1ImplBody<VariantKind::CPU> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { ScaleRect1::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  ScaleRect1::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/util/scale_rect.h b/src/legate_sparse/array/util/scale_rect.h
index e9e7ffda..0b559036 100644
--- a/src/legate_sparse/array/util/scale_rect.h
+++ b/src/legate_sparse/array/util/scale_rect.h
@@ -29,7 +29,8 @@ struct ScaleRect1Args {
 
 class ScaleRect1 : public SparseTask<ScaleRect1> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SCALE_RECT_1};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SCALE_RECT_1}};
   static void cpu_variant(legate::TaskContext context);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext context);
diff --git a/src/legate_sparse/array/util/unzip_rect.cc b/src/legate_sparse/array/util/unzip_rect.cc
index 9a9e8708..1272e9cc 100644
--- a/src/legate_sparse/array/util/unzip_rect.cc
+++ b/src/legate_sparse/array/util/unzip_rect.cc
@@ -42,7 +42,11 @@ struct UnZipRect1ImplBody<VariantKind::CPU> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { UnZipRect1::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  UnZipRect1::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/util/unzip_rect.h b/src/legate_sparse/array/util/unzip_rect.h
index e470c541..08293ef2 100644
--- a/src/legate_sparse/array/util/unzip_rect.h
+++ b/src/legate_sparse/array/util/unzip_rect.h
@@ -30,7 +30,8 @@ struct UnZipRect1Args {
 
 class UnZipRect1 : public SparseTask<UnZipRect1> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UNZIP_RECT_1};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UNZIP_RECT_1}};
   static void cpu_variant(legate::TaskContext ctx);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/array/util/zip_to_rect.cc b/src/legate_sparse/array/util/zip_to_rect.cc
index 39634664..c8871583 100644
--- a/src/legate_sparse/array/util/zip_to_rect.cc
+++ b/src/legate_sparse/array/util/zip_to_rect.cc
@@ -41,7 +41,11 @@ struct ZipToRect1ImplBody<VariantKind::CPU, VAL> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { ZipToRect1::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  ZipToRect1::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/util/zip_to_rect.h b/src/legate_sparse/array/util/zip_to_rect.h
index 3851a195..4bc5ac70 100644
--- a/src/legate_sparse/array/util/zip_to_rect.h
+++ b/src/legate_sparse/array/util/zip_to_rect.h
@@ -30,7 +30,8 @@ struct ZipToRect1Args {
 
 class ZipToRect1 : public SparseTask<ZipToRect1> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_ZIP_TO_RECT_1};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_ZIP_TO_RECT_1}};
   static void cpu_variant(legate::TaskContext ctx);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/cudalibs.cu b/src/legate_sparse/cudalibs.cu
index 733ea8f1..6ec45bd5 100644
--- a/src/legate_sparse/cudalibs.cu
+++ b/src/legate_sparse/cudalibs.cu
@@ -77,7 +77,8 @@ cusparseHandle_t get_cusparse()
 
 class LoadCUDALibsTask : public SparseTask<LoadCUDALibsTask> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_LOAD_CUDALIBS};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_LOAD_CUDALIBS}};
 
  public:
   static void gpu_variant(legate::TaskContext context)
@@ -90,7 +91,8 @@ class LoadCUDALibsTask : public SparseTask<LoadCUDALibsTask> {
 
 class UnloadCUDALibsTask : public SparseTask<UnloadCUDALibsTask> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UNLOAD_CUDALIBS};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UNLOAD_CUDALIBS}};
 
  public:
   static void gpu_variant(legate::TaskContext context)
@@ -101,10 +103,10 @@ class UnloadCUDALibsTask : public SparseTask<UnloadCUDALibsTask> {
   }
 };
 
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   LoadCUDALibsTask::register_variants();
   UnloadCUDALibsTask::register_variants();
-}
+  return 0;
+}();
 
 }  // namespace sparse
diff --git a/src/legate_sparse/io/mtx_to_coo.cc b/src/legate_sparse/io/mtx_to_coo.cc
index d2c85667..71afd22e 100644
--- a/src/legate_sparse/io/mtx_to_coo.cc
+++ b/src/legate_sparse/io/mtx_to_coo.cc
@@ -35,13 +35,13 @@ using val_ty   = double;
   // within DISTAL.
   assert(ctx.is_single_task());
   // Regardless of how inputs are added, scalar future return values are at the front.
-  auto& m_store   = ctx.outputs()[0];
-  auto& n_store   = ctx.outputs()[1];
-  auto& nnz_store = ctx.outputs()[2];
-  auto& rows      = ctx.outputs()[3];
-  auto& cols      = ctx.outputs()[4];
-  auto& vals      = ctx.outputs()[5];
-  auto filename   = ctx.scalars()[0].value<std::string>();
+  auto m_store   = ctx.output(0);
+  auto n_store   = ctx.output(1);
+  auto nnz_store = ctx.output(2);
+  auto rows      = ctx.output(3);
+  auto cols      = ctx.output(4);
+  auto vals      = ctx.output(5);
+  auto filename  = ctx.scalar(0).value<std::string>();
   std::fstream file;
   file.open(filename, std::fstream::in);
 
@@ -148,7 +148,11 @@ using val_ty   = double;
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { ReadMTXToCOO::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  ReadMTXToCOO::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/io/mtx_to_coo.h b/src/legate_sparse/io/mtx_to_coo.h
index 4dde28fa..c8e1d7ff 100644
--- a/src/legate_sparse/io/mtx_to_coo.h
+++ b/src/legate_sparse/io/mtx_to_coo.h
@@ -24,7 +24,8 @@ namespace sparse {
 
 class ReadMTXToCOO : public SparseTask<ReadMTXToCOO> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_READ_MTX_TO_COO};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_READ_MTX_TO_COO}};
 
   static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/legate_sparse/linalg/axpby.cc b/src/legate_sparse/linalg/axpby.cc
index 1e61a1bf..547ad927 100644
--- a/src/legate_sparse/linalg/axpby.cc
+++ b/src/legate_sparse/linalg/axpby.cc
@@ -52,7 +52,11 @@ struct AXPBYImplBody<VariantKind::CPU, VAL_CODE, IS_ALPHA, NEGATE> {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { AXPBY::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  AXPBY::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/linalg/axpby.h b/src/legate_sparse/linalg/axpby.h
index 132c9f22..256e8070 100644
--- a/src/legate_sparse/linalg/axpby.h
+++ b/src/legate_sparse/linalg/axpby.h
@@ -33,7 +33,8 @@ struct AXPBYArgs {
 
 class AXPBY : public SparseTask<AXPBY> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_AXPBY};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_AXPBY}};
   static void cpu_variant(legate::TaskContext ctx);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc
index 43ed8b14..6357d898 100644
--- a/src/legate_sparse/mapper/mapper.cc
+++ b/src/legate_sparse/mapper/mapper.cc
@@ -32,9 +32,9 @@ std::vector<StoreMapping> LegateSparseMapper::store_mappings(
   const Task& task, const std::vector<StoreTarget>& options)
 {
   const auto& inputs = task.inputs();
-  std::vector<StoreMapping> mappings(inputs.size());
+  std::vector<StoreMapping> mappings;
   for (size_t i = 0; i < inputs.size(); i++) {
-    mappings[i] = StoreMapping::default_mapping(inputs[i].data(), options.front());
+    mappings.push_back(StoreMapping::default_mapping(inputs[i].data(), options.front()));
   }
   return std::move(mappings);
 }
@@ -64,9 +64,10 @@ std::optional<std::size_t> LegateSparseMapper::allocation_pool_size(const Task&
         auto crd  = task.inputs()[1];
         auto vals = task.inputs()[2];
 
-        std::size_t nrows_plus_one   = pos.domain().get_volume() + 1;
-        std::size_t nnz              = vals.domain().get_volume();
-        std::size_t factor_of_safety = 1.15;  // make sure we don't fail; 1.15 is arbitrary
+        std::size_t nrows_plus_one = pos.domain().get_volume() + 1;
+        std::size_t nnz            = vals.domain().get_volume();
+        // make sure we don't fail; 1.15 is arbitrary
+        std::size_t factor_of_safety = static_cast<std::size_t>(1.15);
         std::size_t cusparseSpMV_buffer_size =
           factor_of_safety * std::ceil(nnz / 32.0) * sizeof(double);
         std::size_t legate_buffer_size = nrows_plus_one * (vals.type().size() + crd.type().size());
@@ -124,10 +125,13 @@ std::optional<std::size_t> LegateSparseMapper::allocation_pool_size(const Task&
       // and then update the estimate here
       return std::nullopt;
     }
-  }
 
-  LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id));
-  return {};
+    default: {
+      // Handle any unhandled enum values
+      LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id));
+      return {};
+    }
+  }
 }
 
 Scalar LegateSparseMapper::tunable_value(legate::TunableID tunable_id)
diff --git a/src/legate_sparse/partition/fast_image_partition.cc b/src/legate_sparse/partition/fast_image_partition.cc
index c9ee27ef..13801faa 100644
--- a/src/legate_sparse/partition/fast_image_partition.cc
+++ b/src/legate_sparse/partition/fast_image_partition.cc
@@ -23,10 +23,10 @@ using namespace legate;
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   FastImageRange::register_variants();
-}
+  return 0;
+}();
 }  // namespace
 
-}  // namespace sparse
\ No newline at end of file
+}  // namespace sparse
diff --git a/src/legate_sparse/partition/fast_image_partition.h b/src/legate_sparse/partition/fast_image_partition.h
index 2d09ff08..992843c5 100644
--- a/src/legate_sparse/partition/fast_image_partition.h
+++ b/src/legate_sparse/partition/fast_image_partition.h
@@ -31,7 +31,8 @@ struct FastImageRangeArgs {
 // only for CSR SpGEMM on GPU right now
 class FastImageRange : public SparseTask<FastImageRange> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_FAST_IMAGE_RANGE};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_FAST_IMAGE_RANGE}};
 
   static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/legate_sparse/util/upcast_future.cc b/src/legate_sparse/util/upcast_future.cc
index c7e8f4ec..ad66b755 100644
--- a/src/legate_sparse/util/upcast_future.cc
+++ b/src/legate_sparse/util/upcast_future.cc
@@ -82,10 +82,12 @@ void upcast_impl(legate::TaskContext ctx)
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+
+static const auto sparse_reg_task_ = []() -> char {
   UpcastFutureToRegion::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/util/upcast_future.h b/src/legate_sparse/util/upcast_future.h
index 7c78df88..b38dbab9 100644
--- a/src/legate_sparse/util/upcast_future.h
+++ b/src/legate_sparse/util/upcast_future.h
@@ -24,7 +24,8 @@ namespace sparse {
 
 class UpcastFutureToRegion : public SparseTask<UpcastFutureToRegion> {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UPCAST_FUTURE_TO_REGION};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UPCAST_FUTURE_TO_REGION}};
   static void cpu_variant(legate::TaskContext ctx);
 
  private:
diff --git a/test.py b/test.py
index 581dac4b..72165ebf 100755
--- a/test.py
+++ b/test.py
@@ -44,5 +44,4 @@ def stage_env(self, feature: FeatureType) -> EnvDict:
 
     plan = TestPlan(config, system)
 
-    plan.execute()
-    sys.exit(0)
+    sys.exit(plan.execute())
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 8c7ffa2f..a8b2f17e 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -8,8 +8,30 @@
 
 @pytest.fixture
 def create_mask():
-    """
-    Create a boolean mask matrix with a random sparsity pattern
+    """Create a boolean mask matrix with a random sparsity pattern.
+
+    This fixture creates equivalent boolean mask matrices in both SciPy and
+    Legate Sparse formats for testing purposes.
+
+    Parameters
+    ----------
+    rows : int
+        Number of rows (and columns) in the square matrix.
+    density : float, optional
+        Density of non-zero elements. Default is 0.3.
+
+    Returns
+    -------
+    tuple
+        (A_scipy, A_sparse) - Equivalent boolean matrices in SciPy and
+        Legate Sparse formats.
+
+    Notes
+    -----
+    The fixture ensures that both matrices have identical sparsity patterns
+    and values. It verifies equivalence by converting both to dense format
+    and checking that they are numerically close.
+
     """
 
     def _create_mask(rows, density=0.3):
@@ -39,8 +61,32 @@ def _create_mask(rows, density=0.3):
 
 @pytest.fixture
 def create_matrix():
-    """
-    Create matrices in SciPy and Legate Sparse that are equivalent
+    """Create matrices in SciPy and Legate Sparse that are equivalent.
+
+    This fixture creates equivalent sparse matrices in both SciPy and
+    Legate Sparse formats for testing purposes.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows (and columns) in the square matrix.
+    tol : float, optional
+        Threshold for sparsity. Values below this threshold are set to zero.
+        Default is 0.5.
+
+    Returns
+    -------
+    tuple
+        (A_scipy, A_sparse) - Equivalent sparse matrices in SciPy and
+        Legate Sparse formats.
+
+    Notes
+    -----
+    The fixture uses simple_system_gen to create a dense matrix, then
+    converts it to sparse format in both libraries. It verifies equivalence
+    by converting both to dense format and checking that they are numerically
+    close.
+
     """
 
     def _create_matrix(N, tol=0.5):
diff --git a/tests/integration/test_cg_solve.py b/tests/integration/test_cg_solve.py
index fe8b3578..d8e046e3 100644
--- a/tests/integration/test_cg_solve.py
+++ b/tests/integration/test_cg_solve.py
@@ -21,8 +21,26 @@
 
 
 def test_cg_solve():
-    N, D = 1000, 1000
-    seed = 471014
+    """Test conjugate gradient solver with a positive definite matrix.
+
+    This test verifies that the conjugate gradient solver correctly
+    solves the linear system Ax = b for a positive definite matrix A.
+
+    Notes
+    -----
+    The test creates a random sparse matrix A and ensures it is positive
+    definite by:
+    1. Making it symmetric: A = 0.5 * (A + A.T)
+    2. Adding a multiple of the identity: A = A + N * I
+
+    It then generates a random solution vector x and computes b = Ax.
+    The CG solver is used to solve Ax = b, and the result is verified
+    by checking that A * x_pred ≈ b.
+
+    The test uses a tolerance of 1e-8 for convergence and verification.
+    """
+    N, D = 20, 20
+    seed = 42
     A = sample_dense(N, D, 0.1, seed)
     A = 0.5 * (A + A.T)
     A = A + N * np.eye(N)
@@ -36,8 +54,28 @@ def test_cg_solve():
 
 
 def test_cg_solve_with_callback():
-    N, D = 1000, 1000
-    seed = 471014
+    """Test conjugate gradient solver with a callback function.
+
+    This test verifies that the conjugate gradient solver correctly
+    handles callback functions during iteration.
+
+    Notes
+    -----
+    The test creates a positive definite matrix and solves the linear
+    system Ax = b using CG with a callback function. The callback
+    computes the residual at each iteration and stores it in a list.
+
+    The test verifies that:
+    1. The solver converges to the correct solution
+    2. The callback function is called during iteration
+    3. The residuals are computed correctly
+
+    This ensures that the callback mechanism works properly and can
+    be used for monitoring convergence or implementing custom stopping
+    criteria.
+    """
+    N, D = 20, 20
+    seed = 42
     A = sample_dense(N, D, 0.1, seed)
     A = 0.5 * (A + A.T)
     A = A + N * np.eye(N)
@@ -59,8 +97,8 @@ def callback(x):
 
 
 # def test_cg_solve_with_identity_preconditioner():
-#     N, D = 1000, 1000
-#     seed = 471014
+#     N, D = 20, 20
+#     seed = 42
 #     A = sample_dense(N, D, 0.1, seed)
 #     A = 0.5 * (A + A.T)
 #     A = A + N * np.eye(N)
@@ -75,8 +113,28 @@ def callback(x):
 
 
 def test_cg_solve_with_linear_operator():
-    N, D = 1000, 1000
-    seed = 471014
+    """Test conjugate gradient solver with LinearOperator objects.
+
+    This test verifies that the conjugate gradient solver correctly
+    works with LinearOperator objects that provide matrix-vector
+    multiplication functionality.
+
+    Notes
+    -----
+    The test creates a positive definite matrix A and wraps it in
+    a LinearOperator object. It then solves the linear system using
+    CG with the LinearOperator instead of the sparse matrix directly.
+
+    The test verifies two different LinearOperator implementations:
+    1. Using the @ operator: matvec(x) = A @ x
+    2. Using the dot method: matvec(x, out=None) = A.dot(x, out=out)
+
+    This ensures that the solver can work with any object that provides
+    the required matrix-vector multiplication interface, not just
+    sparse matrices.
+    """
+    N, D = 20, 20
+    seed = 42
     A = sample_dense(N, D, 0.1, seed)
     A = 0.5 * (A + A.T)
     A = A + N * np.eye(N)
diff --git a/tests/integration/test_comparison.py b/tests/integration/test_comparison.py
index 03c28128..3e58d9c6 100644
--- a/tests/integration/test_comparison.py
+++ b/tests/integration/test_comparison.py
@@ -33,18 +33,32 @@
 @pytest.mark.parametrize("threshold", [0.3, 0.5])
 @pytest.mark.parametrize("op_name, op_func", COMPARISON_OPS)
 def test_comparison_operation(N, threshold, op_name, op_func):
-    """Test element-wise comparison operations on non-zero entries of the matrix
+    """Test element-wise comparison operations on non-zero entries of the matrix.
+
+    This test verifies that comparison operations work correctly on sparse
+    matrices by comparing results with dense matrix operations.
 
     Parameters
     ----------
     N : int
-        Size of the test matrix
+        Size of the test matrix.
     threshold : float
-        Value to compare against
+        Value to compare against.
     op_name : str
-        Name of the comparison operation
+        Name of the comparison operation.
     op_func : callable
-        The comparison function to test
+        The comparison function to test.
+
+    Notes
+    -----
+    The test creates a sparse matrix and applies a comparison operation
+    against a threshold value. It then compares the number of True values
+    in the sparse result with the dense result (considering only non-zero
+    elements).
+
+    This verifies that sparse comparison operations produce the same
+    logical result as dense operations when applied to non-zero elements.
+
     """
     A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7)
 
@@ -58,12 +72,30 @@ def test_comparison_operation(N, threshold, op_name, op_func):
 def test_comparison_error_cases(op_name, op_func):
     """Test error cases for comparison operations.
 
+    This test verifies that comparison operations properly handle invalid
+    input types by raising appropriate exceptions.
+
     Parameters
     ----------
     op_name : str
-        Name of the comparison operation
+        Name of the comparison operation.
     op_func : callable
-        The comparison function to test
+        The comparison function to test.
+
+    Notes
+    -----
+    The test attempts to compare a sparse matrix with various invalid
+    types including:
+    - 1D arrays
+    - 2D arrays
+    - Strings
+    - Lists
+
+    All of these should raise AssertionError since sparse matrix
+    comparison operations only support scalar values.
+
+    This ensures that the implementation properly validates input
+    types and provides clear error messages for unsupported operations.
     """
     N = 8
     _, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7)
diff --git a/tests/integration/test_diagonal.py b/tests/integration/test_diagonal.py
index 8c53aa00..3ed7dd54 100644
--- a/tests/integration/test_diagonal.py
+++ b/tests/integration/test_diagonal.py
@@ -24,6 +24,36 @@
 @pytest.mark.parametrize("N", [7, 13])
 @pytest.mark.parametrize("with_zeros", [True, False])
 def test_csr_diagonal(N, with_zeros):
+    """Test diagonal extraction from CSR matrices.
+
+    This test verifies that the diagonal() method correctly extracts
+    the main diagonal from CSR matrices, comparing results with dense
+    matrix diagonal extraction.
+
+    Parameters
+    ----------
+    N : int
+        Size of the square matrix (N x N).
+    with_zeros : bool
+        Whether to include zeros on the diagonal (True) or ensure
+        non-zero diagonal elements (False).
+
+    Notes
+    -----
+    The test creates a random sparse matrix and optionally adds the
+    identity matrix to ensure non-zero diagonal elements. It then
+    extracts the diagonal using both the sparse matrix's diagonal()
+    method and numpy's diagonal() function on the dense version.
+
+    The test verifies that:
+    1. The diagonal elements are extracted correctly
+    2. The results match between sparse and dense implementations
+    3. The method works for both sparse and dense diagonals
+
+    This is important because diagonal extraction is a common operation
+    in linear algebra and should work consistently across different
+    matrix formats.
+    """
     M = N
     np.random.seed(0)
     A_dense, _, _ = simple_system_gen(N, M, None, tol=0.2)
diff --git a/tests/integration/test_gmres_solve.py b/tests/integration/test_gmres_solve.py
index 1acd8d68..37c8dd76 100644
--- a/tests/integration/test_gmres_solve.py
+++ b/tests/integration/test_gmres_solve.py
@@ -21,6 +21,28 @@
 
 
 def test_gmres_solve():
+    """Test GMRES solver with a positive definite matrix.
+
+    This test verifies that the GMRES solver correctly solves the linear
+    system Ax = b for a positive definite matrix A.
+
+    Notes
+    -----
+    The test creates a random sparse matrix A and ensures it is positive
+    definite by:
+    1. Making it symmetric: A = 0.5 * (A + A.T)
+    2. Adding a multiple of the identity: A = A + N * I
+
+    It then generates a random solution vector x and computes b = Ax.
+    The GMRES solver is used to solve Ax = b, and the result is verified
+    by checking that A * x_pred ≈ b.
+
+    The test uses:
+    - atol=1e-5: Absolute tolerance for convergence
+    - tol=1e-5: Relative tolerance (legacy parameter)
+    - maxiter=300: Maximum number of iterations
+    - atol=1e-8: Tolerance for final verification
+    """
     N, D = 1000, 1000
     seed = 471014
     A = sample_dense(N, D, 0.1, seed)
diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py
index 80d078ae..259c7996 100644
--- a/tests/integration/test_indexing.py
+++ b/tests/integration/test_indexing.py
@@ -20,15 +20,39 @@
 
 
 class TestIndexingSetItem:
+    """Test class for sparse matrix indexing and assignment operations.
+
+    This class contains tests for various indexing scenarios including
+    boolean masking, derived masks, and edge cases for sparse matrix
+    assignment operations.
+    """
+
     @pytest.mark.parametrize("N", [6, 9, 17])
     def test_incompatible_mask(self, N, create_matrix, create_mask):
-        """
+        """Test indexing with incompatible mask sparsity patterns.
+
         This test checks that the mask is applied correctly to the matrix when
-        the sparsity of mask is from that of the matrix.
+        the sparsity of mask is different from that of the matrix.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+        create_mask : fixture
+            Fixture to create boolean masks.
+
+        Notes
+        -----
         While SciPy will apply the mask to all entries, Legate Sparse will only
         apply the mask to the non-zero entries of the matrix, so we can't compare
-        to SciPy results for all entries. Instead, we check that the number of
-        non-zero entries are updated correctly and the values are updated correctly.
+        to SciPy results for all entries. Instead, we check that:
+        1. The number of non-zero entries are updated correctly
+        2. The values are updated correctly for masked positions
+
+        This test verifies that the sparse implementation correctly handles
+        cases where the mask has a different sparsity pattern than the matrix.
         """
         _, A = create_matrix(N)
         _, mask = create_mask(N)
@@ -54,11 +78,25 @@ def test_incompatible_mask(self, N, create_matrix, create_mask):
 
     @pytest.mark.parametrize("N", [8, 13, 24])
     def test_mask_derived_from_self(self, N, create_matrix):
-        """
+        """Test indexing with mask derived from the matrix itself.
+
         This test checks that the mask is applied correctly to the matrix when
-        the sparsity of mask is derived from the matrix. Our behavior
-        matches that of SciPy, so we can compare against SciPy
-        results for all entries.
+        the sparsity of mask is derived from the matrix.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+
+        Notes
+        -----
+        Our behavior matches that of SciPy when the mask is derived from
+        the matrix itself, so we can compare against SciPy results for all entries.
+
+        The test creates a mask based on a threshold comparison (A > threshold)
+        and verifies that both SciPy and Legate Sparse produce identical results.
         """
         A_scipy, A_sparse = create_matrix(N)
         threshold = 0.2
@@ -81,9 +119,23 @@ def test_mask_derived_from_self(self, N, create_matrix):
 
     @pytest.mark.parametrize("N", [8, 13, 24])
     def test_mask_all_true(self, N, create_matrix):
-        """
+        """Test indexing behavior with a mask that is all True.
+
         This test checks indexing behavior when using a mask that is all True.
         Every non-zero element should be updated to the new value.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+
+        Notes
+        -----
+        The test creates a mask with the same sparsity pattern as the matrix
+        but with all True values. This should result in all non-zero elements
+        being updated to the specified value.
         """
         _, A = create_matrix(N)
         value = 10.0
@@ -99,9 +151,24 @@ def test_mask_all_true(self, N, create_matrix):
 
     @pytest.mark.parametrize("N", [8, 13, 24])
     def test_mask_all_false(self, N, create_matrix, create_mask):
-        """
+        """Test indexing behavior with a mask that is all False.
+
         This test checks indexing behavior when using a mask that is all False.
         No elements should be modified.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+        create_mask : fixture
+            Fixture to create boolean masks.
+
+        Notes
+        -----
+        The test creates a mask with density=0 (all False values) and verifies
+        that the matrix remains unchanged after the assignment operation.
         """
         _, A = create_matrix(N)
         _, mask_all_false = create_mask(N, density=0)
@@ -114,7 +181,25 @@ def test_mask_all_false(self, N, create_matrix, create_mask):
         assert numpy.all(A_copy.get_data() == A.get_data())
 
     def test_random_column_order(self):
-        "The ordering of the matrix is random" ""
+        """Test indexing with randomly ordered column indices.
+
+        This test verifies that indexing works correctly even when the
+        column indices are not in sorted order within each row.
+
+        Notes
+        -----
+        The test creates a matrix with randomly ordered column indices
+        within rows. During instantiation, these indices get sorted to
+        ensure proper indexing behavior.
+
+        The test verifies that:
+        1. The matrix is created correctly despite random column ordering
+        2. Boolean indexing operations work correctly
+        3. The number of elements replaced matches the expected count
+
+        This is important because CSR format requires column indices to be
+        sorted within each row for efficient operations.
+        """
         row_indices = cupynumeric.array(
             [
                 2,
diff --git a/tests/integration/test_io.py b/tests/integration/test_io.py
index 8d237ba4..c307941c 100644
--- a/tests/integration/test_io.py
+++ b/tests/integration/test_io.py
@@ -25,6 +25,29 @@
 
 @pytest.fixture
 def test_mtx_files():
+    """Fixture providing paths to test Matrix Market files.
+
+    This fixture returns a list of paths to various Matrix Market (.mtx)
+    files that are used for testing the mmread functionality.
+
+    Returns
+    -------
+    list
+        List of file paths to test Matrix Market files.
+
+    Notes
+    -----
+    The fixture includes various types of matrices:
+    - test.mtx: Basic test matrix
+    - GlossGT.mtx: Graph theory matrix
+    - Ragusa18.mtx: Scientific computing matrix
+    - cage4.mtx: Graph matrix
+    - karate.mtx: Social network matrix
+
+    These files are located in the testdata directory and provide
+    different sparsity patterns and matrix properties for comprehensive
+    testing of the Matrix Market reader.
+    """
     mtx_files = [
         "test.mtx",
         "GlossGT.mtx",
@@ -36,6 +59,31 @@ def test_mtx_files():
 
 
 def test_mmread(test_mtx_files):
+    """Test Matrix Market file reading functionality.
+
+    This test verifies that the legate_sparse Matrix Market reader
+    produces the same results as SciPy's mmread function.
+
+    Parameters
+    ----------
+    test_mtx_files : list
+        List of Matrix Market file paths to test.
+
+    Notes
+    -----
+    The test reads each Matrix Market file using both legate_sparse.io.mmread
+    and scipy.io.mmread, then compares the results by converting both to
+    dense format and checking for equality.
+
+    This ensures that:
+    1. The Matrix Market format is parsed correctly
+    2. The sparse matrix structure is preserved
+    3. The numerical values are read accurately
+    4. The implementation is compatible with SciPy's reference implementation
+
+    The test covers various matrix types and sizes to ensure robust
+    parsing of the Matrix Market format.
+    """
     for mtx_file in test_mtx_files:
         arr = legate_io.mmread(mtx_file)
         s = sci_io.mmread(mtx_file)
diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py
index 7106281c..5954df79 100644
--- a/tests/integration/test_spgemm.py
+++ b/tests/integration/test_spgemm.py
@@ -25,6 +25,25 @@
 
 @pytest.mark.parametrize("N", [5, 29])
 def test_csr_spgemm(N):
+    """Test sparse matrix-matrix multiplication with CSR matrices.
+
+    This test verifies that sparse matrix-matrix multiplication works
+    correctly for different matrix sizes.
+
+    Parameters
+    ----------
+    N : int
+        Size of the square matrices (N x N).
+
+    Notes
+    -----
+    The test creates a random sparse matrix A and computes A @ A using
+    the sparse implementation. It then compares the result with the
+    dense matrix multiplication A_dense @ A_dense to verify correctness.
+
+    The test uses different matrix sizes to ensure the implementation
+    works correctly for both small and larger matrices.
+    """
     np.random.seed(0)
     A_dense, A, _ = simple_system_gen(N, N, sparse.csr_array)
 
@@ -38,6 +57,29 @@ def test_csr_spgemm(N):
 @pytest.mark.parametrize("N", [5, 29])
 @pytest.mark.parametrize("unsupported_dtype", ["int", "bool"])
 def test_csr_spgemm_unsupported_dtype(N, unsupported_dtype):
+    """Test that unsupported datatypes raise appropriate exceptions for SpGEMM.
+
+    This test verifies that sparse matrix-matrix multiplication
+    properly handles unsupported datatypes by raising NotImplementedError
+    when running on GPU.
+
+    Parameters
+    ----------
+    N : int
+        Size of the square matrices.
+    unsupported_dtype : str
+        Datatype that is not supported for SpGEMM operations.
+
+    Notes
+    -----
+    The test creates banded matrices with unsupported datatypes and
+    attempts to perform matrix-matrix multiplication. On GPU systems,
+    this should raise NotImplementedError since only floating-point
+    and complex datatypes are supported for SpGEMM.
+
+    Currently supported datatypes are float32, float64, complex64,
+    and complex128.
+    """
     np.random.seed(0)
 
     nnz_per_row = 3
diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py
index 0aca095e..0c3590df 100644
--- a/tests/integration/test_spmv.py
+++ b/tests/integration/test_spmv.py
@@ -27,6 +27,32 @@
 @pytest.mark.parametrize("M", [7, 17])
 @pytest.mark.parametrize("inline", [True, False])
 def test_csr_spmv(N, M, inline):
+    """Test sparse matrix-vector multiplication with CSR matrices.
+
+    This test verifies that sparse matrix-vector multiplication works
+    correctly for different matrix sizes and computation methods.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    M : int
+        Number of columns in the matrix.
+    inline : bool
+        Whether to use inline computation (A.dot(x, out=y)) or
+        standard multiplication (A @ x).
+
+    Notes
+    -----
+    The test creates a random sparse matrix and vector, then computes
+    the matrix-vector product using both the sparse implementation
+    and a dense reference. It verifies that the results are numerically
+    close.
+
+    The inline parameter tests two different computation methods:
+    - inline=True: Uses A.dot(x, out=y) with pre-allocated output
+    - inline=False: Uses A @ x with automatic output allocation
+    """
     np.random.seed(0)
     A_dense, A, x = simple_system_gen(N, M, sparse.csr_array)
 
@@ -43,6 +69,31 @@ def test_csr_spmv(N, M, inline):
 @pytest.mark.parametrize("nnz_per_row", [3, 9])
 @pytest.mark.parametrize("unsupported_dtype", ["int", "bool"])
 def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype):
+    """Test that unsupported datatypes raise appropriate exceptions.
+
+    This test verifies that sparse matrix-vector multiplication
+    properly handles unsupported datatypes by raising NotImplementedError
+    when running on GPU.
+
+    Parameters
+    ----------
+    N : int
+        Size of the square matrix.
+    nnz_per_row : int
+        Number of non-zeros per row in the banded matrix.
+    unsupported_dtype : str
+        Datatype that is not supported for SpMV operations.
+
+    Notes
+    -----
+    The test creates a banded matrix with an unsupported datatype
+    and attempts to perform matrix-vector multiplication. On GPU
+    systems, this should raise NotImplementedError since only
+    floating-point and complex datatypes are supported for SpMV.
+
+    Currently supported datatypes are float32, float64, complex64,
+    and complex128.
+    """
     np.random.seed(0)
 
     A = banded_matrix(N, nnz_per_row).astype(unsupported_dtype)
diff --git a/tests/integration/utils/banded_matrix.py b/tests/integration/utils/banded_matrix.py
index 4cff897a..fda5ef5f 100644
--- a/tests/integration/utils/banded_matrix.py
+++ b/tests/integration/utils/banded_matrix.py
@@ -24,22 +24,51 @@ def banded_matrix(
     init_with_ones: bool = True,
     verbose: bool = False,
 ):
-    """
+    """Create a banded sparse matrix for testing purposes.
+
     Parameters
     ----------
-    N: int
-        Size of the NxN sparse matrix
-    nnz_per_row: int
-        Number of non-zero elements per row (odd number)
-    from_diags: bool
-        use sparse.diags to generate the banded matrix (default = True)
-    init_with_ones: bool
-        Initialize the matrix with ones instead of arange
+    N : int
+        Size of the NxN sparse matrix.
+    nnz_per_row : int
+        Number of non-zero elements per row (must be odd).
+    from_diags : bool, optional
+        Use sparse.diags to generate the banded matrix. Default is True.
+    init_with_ones : bool, optional
+        Initialize the matrix with ones instead of arange. Default is True.
+    verbose : bool, optional
+        Print detailed information about the matrix construction. Default is False.
 
     Returns
     -------
-    csr_array:
-        Return a sparse matrix
+    csr_array
+        A banded sparse matrix in CSR format.
+
+    Raises
+    ------
+    AssertionError
+        If N <= nnz_per_row or nnz_per_row is not odd.
+
+    Notes
+    -----
+    This function creates a banded matrix with a specific sparsity pattern.
+    When from_diags=True, it uses the sparse.diags function which is simpler
+    but may be slower. When from_diags=False, it constructs the CSR matrix
+    directly for better performance.
+
+    The matrix has a banded structure with nnz_per_row non-zeros per row,
+    centered around the main diagonal. The function handles the boundary
+    conditions by masking out-of-bounds indices.
+
+    Examples
+    --------
+    >>> A = banded_matrix(5, 3, from_diags=True)
+    >>> print(A.toarray())
+    [[1. 1. 0. 0. 0.]
+     [1. 1. 1. 0. 0.]
+     [0. 1. 1. 1. 0.]
+     [0. 0. 1. 1. 1.]
+     [0. 0. 0. 1. 1.]]
     """
 
     if from_diags:
diff --git a/tests/integration/utils/sample.py b/tests/integration/utils/sample.py
index a201d6f2..e5444987 100644
--- a/tests/integration/utils/sample.py
+++ b/tests/integration/utils/sample.py
@@ -19,11 +19,62 @@
 
 
 class Normal(stats.rv_continuous):
+    """Custom normal distribution class for reproducible random sampling.
+
+    This class extends scipy.stats.rv_continuous to provide a custom
+    normal distribution that can be used with scipy.sparse.random for
+    generating sparse matrices with reproducible random values.
+
+    Notes
+    -----
+    The _rvs method generates standard normal random variates using
+    the provided random_state for reproducibility.
+    """
+
     def _rvs(self, *args, size=None, random_state=None):
+        """Generate standard normal random variates.
+
+        Parameters
+        ----------
+        size : int or tuple, optional
+            Number of random variates to generate.
+        random_state : numpy.random.RandomState, optional
+            Random state for reproducibility.
+
+        Returns
+        -------
+        numpy.ndarray
+            Array of standard normal random variates.
+        """
         return random_state.standard_normal(size)
 
 
 def sample(N: int, D: int, density: float, seed: int):
+    """Generate a sparse matrix with random values from a normal distribution.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    D : int
+        Number of columns in the matrix.
+    density : float
+        Density of non-zero elements (between 0 and 1).
+    seed : int
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    scipy.sparse.csr_matrix
+        A sparse matrix in CSR format with random normal values.
+
+    Notes
+    -----
+    This function uses scipy.sparse.random with a custom normal distribution
+    to generate sparse matrices with reproducible random values. The matrix
+    is returned in CSR format.
+
+    """
     NormalType = Normal(seed=seed)
     SeededNormal = NormalType()
     return scpy.random(
@@ -38,14 +89,91 @@ def sample(N: int, D: int, density: float, seed: int):
 
 
 def sample_dense(N: int, D: int, density: float, seed: int):
+    """Generate a dense matrix with random values from a normal distribution.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    D : int
+        Number of columns in the matrix.
+    density : float
+        Density of non-zero elements (between 0 and 1).
+    seed : int
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    numpy.ndarray
+        A dense matrix with random normal values.
+
+    Notes
+    -----
+    This function generates a sparse matrix using sample() and then
+    converts it to dense format. This is useful for creating test
+    matrices that can be compared with sparse implementations.
+
+    """
     return numpy.asarray(sample(N, D, density, seed).todense())
 
 
 def sample_dense_vector(N: int, density: float, seed: int):
+    """Generate a dense vector with random values from a normal distribution.
+
+    Parameters
+    ----------
+    N : int
+        Length of the vector.
+    density : float
+        Density of non-zero elements (between 0 and 1).
+    seed : int
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    numpy.ndarray
+        A dense vector with random normal values.
+
+    Notes
+    -----
+    This function generates a dense matrix with one column using
+    sample_dense() and then squeezes it to a 1D vector.
+
+    """
     return sample_dense(N, 1, density, seed).squeeze()
 
 
 def simple_system_gen(N, M, cls, tol=0.5):
+    """Generate a simple linear system for testing.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    M : int
+        Number of columns in the matrix.
+    cls : type or None
+        Class to use for creating the sparse matrix. If None, no sparse
+        matrix is created.
+    tol : float, optional
+        Threshold for sparsity. Values below this threshold are set to zero.
+        Default is 0.5.
+
+    Returns
+    -------
+    tuple
+        (a_dense, a_sparse, x) where:
+        - a_dense: Dense matrix
+        - a_sparse: Sparse matrix (or None if cls is None)
+        - x: Dense vector
+
+    Notes
+    -----
+    This function generates a random dense matrix and vector, then
+    applies a threshold to create sparsity. The sparse matrix is
+    created using the provided class if specified.
+
+    """
     a_dense = np.random.rand(N, M)
     x = np.random.rand(M)
     a_dense = np.where(a_dense < tol, a_dense, 0)

From 4515ccea859fff6eef75caca767d71ba8b740759 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Dec 2025 20:15:02 +0000
Subject: [PATCH 2/3] [pre-commit.ci] pre-commit autoupdate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/pre-commit/mirrors-mypy: v1.5.1 → v1.19.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.5.1...v1.19.1)
- https://github.com/psf/black → https://github.com/psf/black-pre-commit-mirror
- [github.com/psf/black-pre-commit-mirror: 23.9.1 → 25.12.0](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...25.12.0)
- [github.com/PyCQA/isort: 5.12.0 → 7.0.0](https://github.com/PyCQA/isort/compare/5.12.0...7.0.0)
- [github.com/PyCQA/flake8: 6.1.0 → 7.3.0](https://github.com/PyCQA/flake8/compare/6.1.0...7.3.0)
- [github.com/pre-commit/mirrors-clang-format: v16.0.6 → v21.1.8](https://github.com/pre-commit/mirrors-clang-format/compare/v16.0.6...v21.1.8)
---
 .pre-commit-config.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b24026ed..6384338f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,27 +1,27 @@
 repos:
     - repo: https://github.com/pre-commit/mirrors-mypy
-      rev: 'v1.5.1'
+      rev: 'v1.19.1'
       hooks:
             - id: mypy
               language: system
               pass_filenames: false
               args: ['legate_sparse']
-    - repo: https://github.com/psf/black
-      rev: 23.9.1
+    - repo: https://github.com/psf/black-pre-commit-mirror
+      rev: 25.12.0
       hooks:
             - id: black
     - repo: https://github.com/PyCQA/isort
-      rev: 5.12.0
+      rev: 7.0.0
       hooks:
             - id: isort
               args: ["--profile", "black"]
     - repo: https://github.com/PyCQA/flake8
-      rev: 6.1.0
+      rev: 7.3.0
       hooks:
             - id: flake8
               args: [--config=.flake8]
     - repo: https://github.com/pre-commit/mirrors-clang-format
-      rev: 'v16.0.6'  # Use the sha / tag you want to point at
+      rev: 'v21.1.8'  # Use the sha / tag you want to point at
       hooks:
         - id: clang-format
           files: \.(cu|cuh|h|cc|inl)$

From ac60270bdc1006ddb2c5738cb55d5c40e7859d99 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Dec 2025 20:15:11 +0000
Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 examples/common.py         | 3 +--
 examples/pde.py            | 4 +++-
 legate_sparse/coverage.py  | 3 +--
 legate_sparse/module.py    | 1 +
 scripts/memlog_analysis.py | 4 ++--
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/common.py b/examples/common.py
index 99174ed6..c7b7b493 100644
--- a/examples/common.py
+++ b/examples/common.py
@@ -171,8 +171,7 @@ class DummyScope:
     that may or may not use resource scoping.
     """
 
-    def __init__(self):
-        ...
+    def __init__(self): ...
 
     def __enter__(self):
         """Enter the context (no-op)."""
diff --git a/examples/pde.py b/examples/pde.py
index d9ca0095..9212baba 100644
--- a/examples/pde.py
+++ b/examples/pde.py
@@ -218,7 +218,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
         # If we're testing throughput, run only the prescribed number of iterations.
         if throughput:
             if use_legate:
-                p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters)
+                p_sol, iters = linalg.cg(
+                    A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters
+                )
             else:
                 p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters)
         else:
diff --git a/legate_sparse/coverage.py b/legate_sparse/coverage.py
index 8765044e..cfbf98aa 100644
--- a/legate_sparse/coverage.py
+++ b/legate_sparse/coverage.py
@@ -43,8 +43,7 @@ def should_wrap(obj: object) -> bool:
 
 
 class AnyCallable(Protocol):
-    def __call__(self, *args: Any, **kwargs: Any) -> Any:
-        ...
+    def __call__(self, *args: Any, **kwargs: Any) -> Any: ...
 
 
 def wrap(func: AnyCallable) -> Any:
diff --git a/legate_sparse/module.py b/legate_sparse/module.py
index 56f22fa1..e2004a60 100644
--- a/legate_sparse/module.py
+++ b/legate_sparse/module.py
@@ -59,6 +59,7 @@
 def _is_sparse_matrix(obj) -> bool:
     return any((isinstance(obj, csr_array), isinstance(obj, dia_array)))
 
+
 def isspmatrix(obj) -> bool:
     """Check if an object is a legate sparse matrix.
 
diff --git a/scripts/memlog_analysis.py b/scripts/memlog_analysis.py
index ee8bd3c6..e294c349 100644
--- a/scripts/memlog_analysis.py
+++ b/scripts/memlog_analysis.py
@@ -16,10 +16,10 @@
 
     # Parse the log file
     allocations = parse_memlog('memlog.txt')
-    
+
     # Export to CSV
     export_to_csv(allocations, 'memory_analysis.csv')
-    
+
     # Create visualizations (requires pandas, matplotlib, seaborn)
     visualize_allocations(allocations)
 """  # noqa: W293