From bfd1b1fbecefc6f3569e17b38934571c307fd309 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Tue, 15 Apr 2025 10:01:15 -0700 Subject: [PATCH 1/4] v25.03.00 --- CMakeLists.txt | 2 +- cmake/thirdparty/get_legate.cmake | 1 - cmake/versions.json | 10 +- conda/conda-build/build.sh | 57 ++ conda/conda-build/conda_build_config.yaml | 17 + conda/conda-build/meta.yaml | 174 +++++ examples/gmg.py | 20 +- examples/matrix_power.py | 235 ++++++ examples/pde.py | 31 +- examples/spgemm_microbenchmark.py | 12 +- examples/spmv_microbenchmark.py | 10 +- install.py | 10 + legate_sparse/config.py | 13 +- legate_sparse/csr.py | 680 +++++++++++++++--- legate_sparse/runtime.py | 3 - legate_sparse/settings.py | 12 - legate_sparse/utils.py | 55 ++ legate_sparse_cpp.cmake | 14 +- scripts/memlog_analysis.py | 420 +++++++++++ scripts/memlog_cli.py | 241 +++++++ scripts/memlog_parser.py | 297 ++++++++ setup.py | 2 +- .../array/conv/pos_to_coordinates.cc | 24 +- .../array/conv/pos_to_coordinates.cu | 44 +- .../array/conv/pos_to_coordinates.h | 9 +- .../array/conv/pos_to_coordinates_omp.cc | 25 +- .../conv/pos_to_coordinates_template.inl | 94 +-- src/legate_sparse/array/csr/indexing.cc | 99 +++ src/legate_sparse/array/csr/indexing.cu | 116 +++ src/legate_sparse/array/csr/indexing.h | 53 ++ src/legate_sparse/array/csr/indexing_omp.cc | 93 +++ .../array/csr/indexing_template.inl | 68 ++ .../array/csr/spgemm_csr_csr_csr.cc | 11 +- .../array/csr/spgemm_csr_csr_csr.cu | 48 +- .../array/csr/spgemm_csr_csr_csr_omp.cc | 11 +- .../array/csr/spgemm_csr_csr_csr_template.inl | 3 +- src/legate_sparse/array/csr/spmv.cu | 9 +- src/legate_sparse/cffi.h | 12 +- src/legate_sparse/io/mtx_to_coo.cc | 8 +- src/legate_sparse/mapper/mapper.cc | 22 +- src/legate_sparse/projections.cc | 79 -- src/legate_sparse/projections.h | 65 -- src/legate_sparse/sparse.cc | 11 +- src/legate_sparse/util/cusparse_utils.h | 16 +- src/legate_sparse/util/dispatch.h | 98 +++ src/legate_sparse/util/legate_utils.h | 32 + src/legate_sparse/util/logger.h | 1 + src/legate_sparse/util/thrust_allocator.h | 8 +- tests/integration/conftest.py | 59 ++ tests/integration/test_comparison.py | 87 +++ tests/integration/test_indexing.py | 167 +++++ tests/integration/test_manual_sorting.py | 34 + tests/integration/test_nonzero.py | 43 ++ tests/integration/test_spgemm.py | 8 +- tests/integration/test_spmv.py | 8 +- tests/integration/utils/sample.py | 8 +- 56 files changed, 3270 insertions(+), 519 deletions(-) create mode 100644 conda/conda-build/build.sh create mode 100644 conda/conda-build/conda_build_config.yaml create mode 100644 conda/conda-build/meta.yaml create mode 100644 examples/matrix_power.py create mode 100644 scripts/memlog_analysis.py create mode 100644 scripts/memlog_cli.py create mode 100644 scripts/memlog_parser.py create mode 100644 src/legate_sparse/array/csr/indexing.cc create mode 100644 src/legate_sparse/array/csr/indexing.cu create mode 100644 src/legate_sparse/array/csr/indexing.h create mode 100644 src/legate_sparse/array/csr/indexing_omp.cc create mode 100644 src/legate_sparse/array/csr/indexing_template.inl delete mode 100644 src/legate_sparse/projections.cc delete mode 100644 src/legate_sparse/projections.h create mode 100644 src/legate_sparse/util/logger.h create mode 100644 tests/integration/conftest.py create mode 100644 tests/integration/test_comparison.py create mode 100644 tests/integration/test_indexing.py create mode 100644 tests/integration/test_manual_sorting.py create mode 100644 tests/integration/test_nonzero.py diff --git 
a/CMakeLists.txt b/CMakeLists.txt
index ccc2eaf8..82e98886 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ include(rapids-find)
 ###################################
 # Project
-set(legate_sparse_version 25.01.00)
+set(legate_sparse_version 25.03.00)
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
 set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g")
diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake
index 727671fd..709577a1 100644
--- a/cmake/thirdparty/get_legate.cmake
+++ b/cmake/thirdparty/get_legate.cmake
@@ -24,7 +24,6 @@ function(find_or_configure_legate)
   include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
   rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all)
-  set(version ${PKG_VERSION})
   set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL})
   if(PKG_BRANCH)
     set(git_branch "${PKG_BRANCH}")
diff --git a/cmake/versions.json b/cmake/versions.json
index 4351e1d1..6c8498d4 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -3,20 +3,22 @@
     "legate" : {
       "repo": "legate.internal",
       "org": "nv-legate",
-      "version": "25.01.00",
+      "version": "25.03.02",
       "git_url" : "git@github.com:nv-legate/legate.internal.git",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "c9748050eed8f6087fc2ecb86ff7eb2b5f3edb45"
+      "git_tag" : "75dc0a92bbd2dfb79b6b680a0f37cbd0370d0181",
+      "anaconda_label": "main"
     },
     "cupynumeric" : {
       "repo": "cupynumeric.internal",
       "org": "nv-legate",
-      "version": "25.01.00",
+      "version": "25.03.02",
       "git_url" : "git@github.com:nv-legate/cupynumeric.internal",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "0464776f098f7a54c1341cb3c19b3aa5812f658f"
+      "git_tag" : "1fa45603c560068508c3be2e0df45aec62359019",
+      "anaconda_label": "experimental"
     }
   }
}
diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh
new file mode 100644
index 00000000..bda8ee16
--- /dev/null
+++ b/conda/conda-build/build.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+echo -e "\n\n--------------------- CONDA/CONDA-BUILD/BUILD.SH -----------------------\n"
+
+set -xeo pipefail;
+
+# If run through CI, BUILD_MARCH is set externally. If it is not set, try to set it.
+ARCH=$(uname -m)
+if [[ -z "${BUILD_MARCH}" ]]; then
+  if [[ "${ARCH}" = "aarch64" ]]; then
+    # Use the gcc march value used by aarch64 Ubuntu.
+    BUILD_MARCH=armv8-a
+  else
+    # Use uname -m otherwise
+    BUILD_MARCH=$(uname -m | tr '_' '-')
+  fi
+fi
+
+# Rewrite conda's -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY to
+# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+CMAKE_ARGS="$(echo "$CMAKE_ARGS" | sed -r "s@_INCLUDE=ONLY@_INCLUDE=BOTH@g")"
+
+# Add our options to conda's CMAKE_ARGS
+CMAKE_ARGS+="
+--log-level=VERBOSE
+-DBUILD_SHARED_LIBS=ON
+-DBUILD_MARCH=${BUILD_MARCH}
+-DCMAKE_BUILD_TYPE=Release
+-DCMAKE_VERBOSE_MAKEFILE=ON
+-DCMAKE_BUILD_PARALLEL_LEVEL=${JOBS:-$(nproc --ignore=1)}"
+if [ -z "$CPU_ONLY" ]; then
+  CMAKE_ARGS+=" -DCMAKE_CUDA_ARCHITECTURES=all-major"
+fi
+
+export CMAKE_GENERATOR=Ninja
+export CUDAHOSTCXX=${CXX}
+export OPENSSL_DIR="$PREFIX"
+
+echo "Environment"
+env
+
+echo "Build starting on $(date)"
+CUDAFLAGS="-isystem ${PREFIX}/include -L${PREFIX}/lib"
+export CUDAFLAGS
+
+SKBUILD_BUILD_OPTIONS=-j$CPU_COUNT \
+$PYTHON -m pip install \
+  --root / \
+  --no-deps \
+  --prefix "$PREFIX" \
+  --no-build-isolation \
+  --upgrade \
+  --cache-dir "$PIP_CACHE_DIR" \
+  --disable-pip-version-check \
+  . -vv
+
+echo "Build ending on $(date)"
diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml
new file mode 100644
index 00000000..79750a86
--- /dev/null
+++ b/conda/conda-build/conda_build_config.yaml
@@ -0,0 +1,17 @@
+gpu_enabled:
+  - true
+  - false
+
+upload_build:
+  - false
+
+python:
+  - 3.10
+  - 3.11
+  - 3.12
+
+numpy_version:
+  - ">=1.22,<2"
+
+cmake_version:
+  - ">=3.20.1,!=3.23.0"
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
new file mode 100644
index 00000000..03d1f9d3
--- /dev/null
+++ b/conda/conda-build/meta.yaml
@@ -0,0 +1,174 @@
+{% set name = "legate-sparse" %}
+{% if gpu_enabled == "true" %}
+  {% set gpu_enabled_bool = true %}
+{% elif gpu_enabled == "false" %}
+  {% set gpu_enabled_bool = false %}
+{% else %}
+  {# We need to have a default value for the initial pass over the recipe #}
+  {% set gpu_enabled_bool = false %}
+{% endif %}
+{% if upload_build == "true" %}
+  {% set upload_build_bool = true %}
+{% elif upload_build == "false" %}
+  {% set upload_build_bool = false %}
+{% else %}
+  {# We need to have a default value for the initial pass over the recipe #}
+  {% set upload_build_bool = false %}
+{% endif %}
+## The placeholder version exists strictly to make the two-pass conda build process work.
+## It should not be used for any other purpose, and this is not a default version.
+{% set placeholder_version = '0.0.0.dev' %}
+{% set default_cuda_version = '12.2.2' %}
+{% set cuda_version='.'.join(environ.get('CUDA', default_cuda_version).split('.')[:2]) %}
+{% set cuda_major=cuda_version.split('.')[0]|int %}
+{% set py_version=environ.get('CONDA_PY', '') %}
+
+{% if build_number is defined %}
+  {# do nothing if defined #}
+{% else %}
+  {% set build_number = environ.get('GIT_DESCRIBE_NUMBER', '0') %}
+{% endif %}
+{% if package_version is defined %}
+  {% set version = package_version %}
+## The tag must be in one of the two formats (where N is a digit):
+## vNN.NN.NN
+## vNN.NN.NN.dev
+
+## Note: default values are only given to make conda build work. They should not be necessary in principle.
+{% elif 'dev' in environ.get('GIT_DESCRIBE_TAG', placeholder_version) %}
+  {% set version = (environ.get('GIT_DESCRIBE_TAG', placeholder_version) ~ environ.get('GIT_DESCRIBE_NUMBER', '')).lstrip('v') %}
+{% else %}
+  {% set version = environ.get('GIT_DESCRIBE_TAG', placeholder_version).lstrip('v') %}
+{% endif %}
+
+package:
+  name: {{ name|lower }}
+  version: {{ version }}
+
+source:
+{% if package_tar is defined %}
+  url: {{ package_tar }}
+{% elif use_local_path is defined %}
+  path: ../../
+{% else %}
+  git_url: ../../
+{% endif %}
+
+build:
+  skip: true # [not linux]
+  number: {{ build_number }}
+  missing_dso_whitelist:
+    - '*libcuda.so*'
+{% if gpu_enabled_bool %}
+{% set cpu_gpu_tag='gpu' %}
+{% else %}
+{% set cpu_gpu_tag='cpu' %}
+{% endif %}
+{% set upload_tag='' %}
+
+## Create legate/cupynumeric version and build string
+{% set legate_version = os.environ.get("LEGATE_VERSION", "1.0.0") %}
+{% set legate_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("LEGATE_BUILDSTR", ""), cpu_gpu_tag]) %}
+{% set cupynumeric_version = os.environ.get("CUPYNUMERIC_VERSION", "1.0.0") %}
+{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("CUPYNUMERIC_BUILDSTR", ""), cpu_gpu_tag]) %}
+
+{% if use_local_path is not defined %}
+# use git hash
+  string: "cuda{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ PKG_BUILDNUM }}_{{ cpu_gpu_tag }}{{ upload_tag }}"
+{% else %}
+# do not use git hash
+  string: "cuda{{ cuda_major }}_py{{ py_version }}_{{ PKG_BUILDNUM }}_{{ cpu_gpu_tag }}{{ upload_tag }}"
+{% endif %}
+  script_env:
+    - SCCACHE_BUCKET
+    - SCCACHE_REGION
+    - SCCACHE_IDLE_TIMEOUT
+    - SCCACHE_S3_KEY_PREFIX
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    - BUILD_MARCH
+{% if not gpu_enabled_bool %}
+    - CPU_ONLY=1
+  # The CPU-only packages having more track_features than the GPU builds helps
+  # the solver to prefer the GPU builds when both are viable candidates.
+  # ref: https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html#track-features
+  track_features:
+    - cpu_only
+{% endif %}
+
+ignore_run_exports_from:
+  # scikit-build should really be a part of the build env, but then it installs its own Python. Conda build stacks
+  # the build environment on the host environment, and the build python takes over causing paths havoc. So, we put
+  # scikit-build into the host env, but we ignore any exports it may bring.
+  - scikit-build
+
+requirements:
+  build:
+    - make
+    - ninja
+    - cmake {{ cmake_version }}
+    - {{ compiler('c') }} =11.2
+    - {{ compiler('cxx') }} =11.2
+    # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported.
+    - cuda-nvcc
+    # cudart needed for CPU and GPU builds because of curand
+    - cuda-cudart-dev
+    - cuda-version ={{ cuda_version }}
+    - libcusparse-dev
+
+
+  host:
+    # List shared libraries *for the target platform* here
+    - python
+    - scikit-build
+    # libcurand is used both in CPU and GPU builds
+    #- libcurand-dev
+    - openblas =* =*openmp*
+    - llvm-openmp
+    - legate ={{ legate_version }}={{ legate_buildstr }}
+    - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }}
+{% if gpu_enabled_bool %}
+    # cupynumeric could be only in the run section and we could have just legate
+    # here, but we want to make sure that we get a legate version compatible
+    # with *some* cupynumeric. With the dev versions, if we pick the latest
+    # legate, there may not be a cupynumeric package that is compatible. So, we
+    # list cupynumeric here to get a pair of legate and cupynumeric that are
+    # compatible.
+    - cuda-cccl
+    - libcusparse
+    - cuda-version ={{ cuda_version }}
+    - cuda-cudart
+    - nccl
+{% endif %}
+
+  run:
+    - numpy {{ numpy_version }}
+    - scipy
+    - openblas =* =*openmp*
+    - legate ={{ legate_version }}={{ legate_buildstr }}
+    - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }}
+{% if gpu_enabled_bool %}
+    - libnvjitlink
+    - libcusparse
+    # Pin to all minor versions of CUDA newer than the one built against, within the same major version.
+    # cuda-version constrains the CUDA runtime version and ensures a compatible driver is available
+    - {{ pin_compatible('cuda-version', min_pin='x.x', max_pin='x') }}
+    - __cuda >={{ cuda_version }}
+    - cuda-cudart
+{% endif %}
+
+  run_constrained:
+    - __glibc >=2.17 # [linux]
+
+about:
+  home: https://github.com/nv-legate/legate-sparse
+  license: Apache-2.0
+  license_file: LICENSE
+  summary: 'Aspiring drop-in replacement for SciPy Sparse'
+  description: |
+    Legate Sparse is a Legate library that aims to provide
+    a distributed and accelerated drop-in replacement
+    for the SciPy Sparse API on top of the Legate runtime.
+  doc_url: https://github.com/nv-legate/legate-sparse
+  dev_url: https://github.com/nv-legate/legate-sparse
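The tag-to-version derivation that the recipe comments above describe can be sketched in plain Python roughly as follows. This is an illustration of the jinja branches only, not code from the recipe; GIT_DESCRIBE_TAG and GIT_DESCRIBE_NUMBER are the usual conda-build git-describe variables:

    import os

    placeholder_version = "0.0.0.dev"
    tag = os.environ.get("GIT_DESCRIBE_TAG", placeholder_version)
    number = os.environ.get("GIT_DESCRIBE_NUMBER", "")

    if "dev" in tag:
        # vNN.NN.NN.dev -> tag plus the commit count since the tag
        version = (tag + number).lstrip("v")
    else:
        # vNN.NN.NN -> just strip the leading "v"
        version = tag.lstrip("v")

    print(version)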
diff --git a/examples/gmg.py b/examples/gmg.py
index 8b6ba7c6..6491ef59 100644
--- a/examples/gmg.py
+++ b/examples/gmg.py
@@ -402,19 +402,13 @@ def callback(x):
     norm_res = np.linalg.norm(b - (A @ x))
 
     # Check convergence with relative tolerance
-    if norm_res <= norm_ini * tol:
-        print(
-            f"Converged in {iters} iterations, final residual relative norm:"
-            f" {norm_res/norm_ini}"  # noqa: E226
-        )
-    else:
-        print(
-            f"Failed to converge in {iters} iterations, final residual relative norm:"
-            f" {norm_res/norm_ini}"  # noqa: E226
-        )
-
-    print(f"Solve Time: {total} ms")
-    print(f"Iteration time: {total / iters} ms")
+    convergence_status = norm_res <= norm_ini * tol
+    print(f"Dimension of A : {A.shape}")
+    print(f"Did the solution converge : {convergence_status}")
+    print(f"Final relative residual norm : {norm_res / norm_ini}")
+    print(f"Number of iterations : {iters}")
+    print(f"Total elapsed time (ms) : {total}")
+    print(f"Time per iteration (ms) : {total / iters}")
 
 
 if __name__ == "__main__":
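The convergence report above boils down to a relative-residual test. A minimal NumPy sketch of the same check, with illustrative values standing in for the solver's output:

    import numpy as np

    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    x = np.linalg.solve(A, b)  # stand-in for the iterative solution
    tol = 1e-8

    norm_ini = np.linalg.norm(b)
    norm_res = np.linalg.norm(b - (A @ x))
    convergence_status = norm_res <= norm_ini * tol  # residual relative to ||b||
    print(convergence_status, norm_res / norm_ini)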
diff --git a/examples/matrix_power.py b/examples/matrix_power.py
new file mode 100644
index 00000000..a43249c0
--- /dev/null
+++ b/examples/matrix_power.py
@@ -0,0 +1,235 @@
+# Copyright 2022-2024 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This example performs matrix power by repeated multiplication. We assume
+# that the matrix is square, so the number of cols is the same as the number
+# of rows in the matrix
+
+import argparse
+from functools import reduce
+
+import numpy.typing as npt
+from common import get_arg_number, parse_common_args
+
+# global states random_seed, rng
+global random_seed, rng
+
+# ----------------------------
+# Matrix generation functions
+# ----------------------------
+
+
+def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None):
+    """Return a CSR matrix with a prescribed number of nonzeros in each row.
+
+    Args:
+    ----
+
+    nrows: int
+        Number of rows in the matrix. Number of columns is the same as the number of rows
+    nnz_per_row: int
+        Desired number of nonzero entries in each row
+    dtype: npt.DTypeLike
+        Datatype of the values. This should be one of the floating point datatypes
+    """
+    dtype = np.float32 if dtype is None else dtype
+    ncols = nrows
+    nnz = nnz_per_row * nrows
+    indptr = np.linspace(
+        start=0, stop=nnz, num=nrows + 1, endpoint=True, dtype=np.int64
+    )
+    cols = rng.integers(0, ncols, nnz).reshape(ncols, nnz_per_row)
+    cols = np.sort(cols, axis=1).flatten()
+    vals = np.ones(nnz, dtype=dtype)
+    matrix = sparse.csr_matrix((vals, cols, indptr), shape=(nrows, ncols))
+
+    return matrix
+
+
+def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None):
+    """Return a CSR matrix with a prescribed number of nonzeros in the matrix.
+
+    Args:
+    ----
+
+    nrows: int
+        Number of rows in the matrix. Number of columns is the same as the number of rows
+    nnz_total: int
+        Desired number of nonzero entries in the matrix with no expectation of
+        nonzeros in each row of the matrix
+    dtype: npt.DTypeLike
+        Datatype of the values. This should be one of the floating point datatypes
+    """
+    dtype = np.float32 if dtype is None else dtype
+    ncols = nrows
+    coo_rows = rng.integers(0, nrows, nnz_total)
+    coo_cols = rng.integers(0, ncols, nnz_total)
+    vals = np.ones(nnz_total, dtype=dtype)
+    matrix = sparse.csr_matrix((vals, (coo_rows, coo_cols)), shape=(nrows, ncols))
+
+    return matrix
+
+
+# ------------------------
+# Matrix Multiply routines
+# ------------------------
+
+
+def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4):
+    """Multiply matrix by self ntimes and print the time elapsed.
+    Args:
+    ----
+
+    A: csr_matrix
+        The input matrix
+    timer:
+        Instance of the timer class to measure elapsed time
+    ntimes:
+        Number of matrix multiplies or the exponent in A^n
+    nwarmups:
+        Number of warmup iterations before the timed multiplies begin
+    """
+
+    timer.start()
+    B = A.copy()
+    elapsed_time_init_copy = timer.stop()
+
+    for _ in range(nwarmups):
+        output = A @ B
+
+    elapsed_time_spgemm = [-1.0] * ntimes
+    elapsed_time_copy = [-1.0] * ntimes
+
+    for hop in range(ntimes):
+        timer.start()
+        output = A @ B
+        elapsed_time_spgemm[hop] = timer.stop()
+        timer.start()
+        B = output.copy()
+        elapsed_time_copy[hop] = timer.stop()
+
+    # TODO: Wrap all the timing information in a dataclass
+    nelems = reduce(lambda x, y: x * y, A.shape)
+    sparsity_output = (nelems - output.nnz) * 100.0 / (A.shape[0] ** 2)
+
+    print(f"Dimension of A : {A.shape}")
+    print(f"Output matrix shape : {output.shape}")
+    print(f"NNZ of A : {A.nnz}")
+    print(f"NNZ of output : {output.nnz}")
+    print(f"Sparsity of output (%) : {sparsity_output}")
+    print(f"Total number of hops : {ntimes}")
+    print(f"Elapsed time for copy in init (ms) : {elapsed_time_init_copy}")
+    for hop in range(ntimes):
+        print(
+            f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}"
+        )
+        print(f"Elapsed time for copy for hop {hop} (ms) : {elapsed_time_copy[hop]}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-n",
+        "--nrows",
+        type=str,
+        default="1k",
+        dest="nrows",
+        help="Number of rows in the generated matrix (accepts suffixes 'k', 'm', 'g')",
+    )
+
+    parser.add_argument(
+        "--nnz-per-row",
+        type=int,
+        default=3,
+        dest="nnz_per_row",
+        help="Number of nnz per row for generated matrix",
+    )
+
+    parser.add_argument(
+        "--nnz-total",
+        type=str,
+        default="-1",
+        dest="nnz_total",
+        help="Total number of nonzeros for the generated matrix. "
+        "If both --nnz-per-row and --nnz-total are given, "
+        "--nnz-total takes precedence",
+    )
+
+    parser.add_argument(
+        "--ntimes",
+        type=int,
+        default=4,
+        dest="ntimes",
+        help="Number of times A @ A is performed",
+    )
+
+    parser.add_argument(
+        "--nwarmups",
+        type=int,
+        default=2,
+        dest="nwarmups",
+        help="Number of warmup iterations before A @ A is timed",
+    )
+
+    parser.add_argument(
+        "--same-sparsity-for-cpu-and-gpu",
+        action="store_true",
+        help="Use NumPy to generate random numbers regardless of --package",
+    )
+
+    parser.add_argument(
+        "--random-seed",
+        type=int,
+        default=42,
+        help="Random number seed that influences the sparsity pattern",
+    )
+
+    args, _ = parser.parse_known_args()
+    _, timer, np, sparse, linalg, use_legate = parse_common_args()
+
+    nrows = get_arg_number(args.nrows)
+    nnz_total = get_arg_number(args.nnz_total)
+
+    # this is a global variable
+    global random_seed
+    global rng
+    random_seed = args.random_seed
+
+    if args.same_sparsity_for_cpu_and_gpu:
+        message = (
+            "Using NumPy to generate random numbers and "
+            "ensure sparsity pattern is the same across NumPy and "
+            "cuPyNumeric"
+        )
+        print(message)
+
+        import numpy
+
+        rng = numpy.random.default_rng(random_seed)
+    else:
+        rng = np.random.default_rng(random_seed)
+
+    timer.start()
+    if nnz_total > 0:
+        A = create_csr_with_nnz_total(nrows, nnz_total, np.float32)
+        print("Matrix created with total number of nonzeros")
+    elif nnz_total < 0 and args.nnz_per_row > 0:
+        A = create_csr_with_nnz_per_row(nrows, args.nnz_per_row, np.float32)
+        print("Matrix created with number of nonzeros per row")
+    elapsed_time_matrix_gen = timer.stop()
+
+    compute_matrix_multiply_ntimes(A, timer, int(args.nwarmups), int(args.ntimes))
+
+    print(f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}")
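As a sanity check of what matrix_power.py does, here is a self-contained SciPy sketch of its two main ingredients: the linspace-based indptr that yields a fixed nonzero count per row, and the repeated left-multiplication that computes the power. Plain NumPy/SciPy objects stand in for the cuPyNumeric/Legate ones used above; the sizes are illustrative:

    import numpy as np
    from scipy import sparse

    nrows, nnz_per_row = 8, 3
    rng = np.random.default_rng(42)
    nnz = nnz_per_row * nrows

    # nnz is a multiple of nrows, so linspace lands on exact integer steps
    indptr = np.linspace(0, nnz, nrows + 1, dtype=np.int64)
    assert (np.diff(indptr) == nnz_per_row).all()  # exactly nnz_per_row per row

    cols = np.sort(
        rng.integers(0, nrows, nnz).reshape(nrows, nnz_per_row), axis=1
    ).ravel()
    A = sparse.csr_matrix(
        (np.ones(nnz, dtype=np.float32), cols, indptr), shape=(nrows, nrows)
    )

    B = A.copy()
    for _ in range(4):  # four "hops": B ends up as A**5
        B = A @ B
    print(B.nnz)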
diff --git a/examples/pde.py b/examples/pde.py
index 52e169e2..2313f799 100644
--- a/examples/pde.py
+++ b/examples/pde.py
@@ -197,28 +197,27 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters,
     p_sol, iters = linalg.cg(A, bflat, rtol=tol)
     total = timer.stop()
 
+    print(f"Mesh resolution : ({nx}, {ny})")
+    print(f"Dimension of A : {A.shape}")
+    print(f"Number of rows in A : {A.shape[0]}")
+
     if throughput:
-        print(
-            f"CG Mesh: {nx}x{ny}, A numrows: {A.shape[0]} , ms / iter:"
-            f" { total / max_iters }"  # noqa: E201, E202
-        )
+        print(f"Total elapsed time (ms) : {total}")
+        print(f"Max number of iterations : {max_iters}")
+        print(f"Time per (max-)iteration (ms) : {total / max_iters}")
+        sys.exit(0)
     else:
         norm_ini = np.linalg.norm(bflat)
         norm_res = np.linalg.norm(bflat - (A @ p_sol))
+
         # Check convergence with relative tolerance
-        if norm_res <= norm_ini * tol:
-            print(
-                f"CG converged after {iters} iterations, final residual relative norm:"
-                f" {norm_res / norm_ini}"  # noqa: E201, E202
-            )
-        else:
-            print(
-                f"CG didn't converge after {iters} iterations, final residual relative"
-                f" norm: {norm_res / norm_ini}"
-            )
-
-        print(f"Total time: {total} ms")
+        convergence_status = norm_res <= norm_ini * tol
+        print(f"Did the solution converge : {convergence_status}")
+        print(f"Final relative residual norm : {norm_res / norm_ini}")
+        print(f"Number of iterations : {iters}")
+        print(f"Total elapsed time (ms) : {total}")
+        print(f"Time per iteration (ms) : {total / iters}")
 
 
 if __name__ == "__main__":
diff --git a/examples/spgemm_microbenchmark.py b/examples/spgemm_microbenchmark.py
index 34db35ef..3741f400 100644
--- a/examples/spgemm_microbenchmark.py
+++
b/examples/spgemm_microbenchmark.py @@ -73,10 +73,14 @@ def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): Cnnz = spgemm_dispatch(A, B).nnz - print( - f"SPGEMM {A.shape}x{B.shape} , nnz ({A.nnz})x({B.nnz})->({Cnnz}) : ms /" - f" iteration: {total / iters}" - ) + print(f"Dimension of A : {A.shape}") + print(f"Dimension of B : {B.shape}") + print(f"NNZ of A : {A.nnz}") + print(f"NNZ of B : {B.nnz}") + print(f"NNZ of C : {Cnnz}") + print(f"Number of iterations : {iters}") + print(f"Total time (ms) : {total}") + print(f"Time per iteration (ms) : {total / iters}") if __name__ == "__main__": diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py index 96145f1e..559ccbfc 100644 --- a/examples/spmv_microbenchmark.py +++ b/examples/spmv_microbenchmark.py @@ -49,7 +49,11 @@ def run_spmv(A, iters, repartition, timer): spmv_dispatch(A, x, y, i, repartition) total = timer.stop() - print(f"SPMV rows: {A.shape[0]}, nnz: {A.nnz} , ms / iter: {total / iters}") + print(f"Dimension of A : {A.shape}") + print(f"NNZ of A : {A.nnz}") + print(f"Number of iterations : {iters}") + print(f"Total elapsed time (ms) : {total}") + print(f"Time per iteration (ms) : {total / iters}") if __name__ == "__main__": @@ -116,8 +120,8 @@ def run_spmv(A, iters, repartition, timer): init_procs, bench_procs = get_phase_procs(use_legate) - print(f"Processor kind for initialization: {init_procs}") - print(f"Processor kind for computation : {bench_procs}") + print(f"Processor kind for initialization : {init_procs}") + print(f"Processor kind for computation : {bench_procs}") if args.fname != "": # Read file from matrix diff --git a/install.py b/install.py index c60a6f1b..c46e03a7 100755 --- a/install.py +++ b/install.py @@ -152,6 +152,7 @@ def install_legate_sparse( thrust_dir, unknown, verbose, + enable_buffer_logging, ): if len(networks) > 1: print( @@ -312,6 +313,7 @@ def validate_path(path): -DLegion_USE_LLVM={("ON" if llvm else "OFF")} -DLegion_NETWORKS={";".join(networks)} -DLegion_USE_HDF5={("ON" if hdf else "OFF")} +-DENABLE_BUFFER_LOGGING={("ON" if enable_buffer_logging else "OFF")} """.splitlines() if cuda_dir: @@ -582,6 +584,14 @@ def driver(): default=False, help="Enable verbose build output.", ) + parser.add_argument( + "--enable-buffer-logging", + dest="enable_buffer_logging", + action="store_true", + required=False, + default=False, + help="Enable logging deferred buffer allocations.", + ) args, unknown = parser.parse_known_args() install_legate_sparse(unknown=unknown, **vars(args)) diff --git a/legate_sparse/config.py b/legate_sparse/config.py index 08a7e915..3c9a3780 100644 --- a/legate_sparse/config.py +++ b/legate_sparse/config.py @@ -36,15 +36,14 @@ class _LegateSparseSharedLib: LEGATE_SPARSE_CSR_DIAGONAL: int + LEGATE_SPARSE_CSR_INDEXING_CSR: int + LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT: int LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ: int LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR: int LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU: int LEGATE_SPARSE_AXPBY: int - LEGATE_SPARSE_PROJ_FN_1D_TO_2D: int - LEGATE_SPARSE_LAST_PROJ_FN: int - def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: # Use an already-opened library handle, which cffi will convert to a @@ -136,6 +135,8 @@ class SparseOpCode(IntEnum): CSR_DIAGONAL = _sparse.LEGATE_SPARSE_CSR_DIAGONAL + CSR_INDEXING_CSR = _sparse.LEGATE_SPARSE_CSR_INDEXING_CSR + CSR_SPMV_ROW_SPLIT = _sparse.LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT SPGEMM_CSR_CSR_CSR_NNZ = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ @@ -143,11 +144,5 @@ class SparseOpCode(IntEnum): 
SPGEMM_CSR_CSR_CSR_GPU = _sparse.LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU -@unique -class SparseProjectionFunctor(IntEnum): - PROMOTE_1D_TO_2D = _sparse.LEGATE_SPARSE_PROJ_FN_1D_TO_2D - LAST_STATIC_PROJ_FN = _sparse.LEGATE_SPARSE_LAST_PROJ_FN - - # Register some types for us to use. rect1 = types.rect_type(1) diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py index 0a5a95d7..6b1d69a4 100644 --- a/legate_sparse/csr.py +++ b/legate_sparse/csr.py @@ -50,7 +50,15 @@ import cupynumeric import numpy import scipy # type: ignore -from legate.core import ImageComputationHint, Shape, align, broadcast, image, types +from legate.core import ( + ImageComputationHint, + Scalar, + Shape, + align, + broadcast, + image, + types, +) from .base import ( CompressedBase, @@ -74,6 +82,8 @@ get_storage_type, get_store_from_cupynumeric_array, is_dtype_supported, + is_scalar_like, + sort_by_rows_then_cols, store_from_store_or_array, store_to_cupynumeric_array, ) @@ -87,6 +97,9 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self.canonical_format = False super().__init__() + # Note that cupynumeric.dtype(None) returns float64, so make + # sure dtype is passed to csr_array if it is known apriori, + # especially when copying the matrix dtype = cupynumeric.dtype(dtype) # If from numpy.array - convert to cupynumeric array first @@ -152,112 +165,7 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self.canonical_format = arg.canonical_format elif isinstance(arg, tuple): - # Couple of options here - if len(arg) == 2: - # empty array ctor, see scipy.sparse - # csr_array((M, N), [dtype]) - if not isinstance(arg[1], tuple): - (M, N) = arg - if not isinstance(M, (int, numpy.integer)) or not isinstance( - N, (int, numpy.integer) - ): - NotImplementedError( - "Input tuple for empty CSR ctor should be it's shape" - ) - shape = arg - if dtype is None: - dtype = cupynumeric.float64 - else: - dtype = cupynumeric.dtype(dtype) - nnz_arr = cupynumeric.zeros(0, dtype=dtype) - ci_arr = cupynumeric.zeros(0, dtype=coord_ty) - rptr_arr = cupynumeric.zeros(M + 1, dtype=coord_ty) - # and pass this to next ctor - arg = (nnz_arr, ci_arr, rptr_arr) - - # Otherwise assume arg is COO data : (data, (row_ind, col_ind)) - else: - if shape is None: - raise AssertionError("Cannot infer shape in this case.") - - st_data, (st_row, st_col) = arg - - # if passed numpy arrays - convert them - if isinstance(st_data, numpy.ndarray): - st_data = cupynumeric.array(st_data) - if isinstance(st_row, numpy.ndarray): - st_row = cupynumeric.array(st_row) - if isinstance(st_col, numpy.ndarray): - st_col = cupynumeric.array(st_col) - - # we assume nothing is sorted (be we can pass this information to ctor) - # so sort by row indices: - row_array = array_from_store_or_array(st_row) - # if we would know that column indices are pre-sorted, - # then we can use kind='stable' and mark csr_array as - # with 'indices_sorted' - row_sort = cupynumeric.argsort(row_array, kind="stable") - - # sort data based on rows - new_data = array_from_store_or_array(st_data, copy=copy)[row_sort] - new_col_ind = array_from_store_or_array(st_col, copy=copy)[row_sort] - new_row_offsets = cupynumeric.append( - cupynumeric.array([0]), - cupynumeric.cumsum( - cupynumeric.bincount(row_array, minlength=shape[0]) - ), - ) - - # pass to next ctor - arg = (new_data, new_col_ind, new_row_offsets) - # we created copies already if necessary - copy = False - - # ctor from CSR arrays - # Tuple of (vals, col_ind, row_offsets) - if len(arg) == 3: - if shape is None or 
len(shape) != 2:
-                    raise AssertionError("Cannot infer shape in this case.")
-
-                (data, indices, indptr) = arg
-
-                # if passed numpy arrays - convert them
-                if isinstance(data, numpy.ndarray):
-                    data = cupynumeric.array(data)
-                if isinstance(indices, numpy.ndarray):
-                    indices = cupynumeric.array(indices).astype(coord_ty)
-                if isinstance(indptr, numpy.ndarray):
-                    indptr = cupynumeric.array(indptr).astype(coord_ty)
-
-                # checking that shape matches with expectations for row_offsets
-                if indptr.shape[0] == shape[0] + 1:
-                    indptr_storage = array_from_store_or_array(indptr, copy=False)
-                    los = indptr_storage[:-1]
-                    his = indptr_storage[1:]
-                    self.pos = pack_to_rect1_store(
-                        get_store_from_cupynumeric_array(los),
-                        get_store_from_cupynumeric_array(his),
-                    )
-                    # copy explicitly, just in case (there are paths that won't create temp object)
-                    # For crd we enforce our internal type
-                    self.crd = store_from_store_or_array(
-                        cast_arr(indices, coord_ty), copy
-                    )
-                    self.vals = store_from_store_or_array(cast_to_store(data), copy)
-
-                # Otherwise we assume that we are passing pos store from existing csr_array
-                # This is internal only functionality, and we assume here only Store or cupynumeric.array
-                elif indptr.shape[0] == shape[0]:
-                    self.pos = store_from_store_or_array(indptr, copy)
-                    self.crd = store_from_store_or_array(indices, copy)
-                    self.vals = store_from_store_or_array(data, copy)
-
-                else:
-                    raise AssertionError(
-                        "Can't understand tuple of inputs for csr_array constructor"
-                    )
-
-                dtype = get_storage_type(data)
+            dtype, shape = self._init_from_tuple_inputs(arg, dtype, shape, copy)
         else:
             raise NotImplementedError("Can't convert to CSR from the input")
@@ -280,6 +188,142 @@ def __init__(self, arg, shape=None, dtype=None, copy=False):
         # Saving the type
         self._dtype = dtype
 
+    def _init_from_tuple_inputs(self, arg, dtype, shape, copy):
+        def _get_empty_csr(dtype, nrows_plus_one):
+            return (
+                cupynumeric.zeros(0, dtype=dtype),
+                cupynumeric.zeros(0, dtype=coord_ty),
+                cupynumeric.zeros(nrows_plus_one, dtype=coord_ty),
+            )
+
+        # Couple of options here
+        if len(arg) == 2:
+            # empty array ctor, see scipy.sparse
+            # csr_array((M, N), [dtype])
+            if not isinstance(arg[1], tuple):
+                (M, N) = arg
+                if not isinstance(M, (int, numpy.integer)) or not isinstance(
+                    N, (int, numpy.integer)
+                ):
+                    raise NotImplementedError(
+                        "Input tuple for empty CSR ctor should be its shape"
+                    )
+                shape = arg
+                dtype = (
+                    cupynumeric.float64 if dtype is None else cupynumeric.dtype(dtype)
+                )
+
+                # and pass this to next ctor
+                arg = _get_empty_csr(dtype, M + 1)
+
+            # Otherwise assume arg is COO data : (data, (row_ind, col_ind))
+            else:
+                if shape is None:
+                    raise AssertionError("Cannot infer shape in this case.")
+
+                st_data, (st_row, st_col) = arg
+
+                # issue 209: handle the case where we have empty CSR array
+                if st_data.size == st_row.size == st_col.size == 0:
+                    arg = _get_empty_csr(dtype, shape[0] + 1)
+                    copy = False
+                else:
+                    # if passed numpy arrays - convert them
+                    if isinstance(st_row, numpy.ndarray):
+                        st_row = cupynumeric.array(st_row)
+                    if isinstance(st_col, numpy.ndarray):
+                        st_col = cupynumeric.array(st_col)
+                    if isinstance(st_data, numpy.ndarray):
+                        st_data = cupynumeric.array(st_data)
+
+                    if not self.indices_sorted:
+                        # NOTE that CSR format does not require sorting the data
+                        # by columns but in setitem, we assume that the data is
+                        # sorted by rows and then by columns, so we sort the data
+                        # by columns as well
+
+                        row_array = array_from_store_or_array(st_row, copy=copy)
+                        col_array =
array_from_store_or_array(st_col, copy=copy) + new_data = array_from_store_or_array(st_data, copy=copy) + + indices = sort_by_rows_then_cols(row_array, col_array) + + new_data = new_data[indices] + row_array = row_array[indices] + col_array = col_array[indices] + + row_offsets = cupynumeric.append( + cupynumeric.array([0]), + cupynumeric.cumsum( + cupynumeric.bincount(row_array, minlength=shape[0]) + ), + ) + + # pass to next ctor + arg = (new_data, col_array, row_offsets) + + self.indices_sorted = True + self.canonical_format = True + else: + # we need to convert row indices to row offsets/indptr + row_array = array_from_store_or_array(st_row) + row_offsets = cupynumeric.append( + cupynumeric.array([0]), + cupynumeric.cumsum( + cupynumeric.bincount(row_array, minlength=shape[0]) + ), + ) + if copy: + arg = (st_data.copy(), st_col.copy(), row_offsets) + else: + arg = (st_data, st_col, row_offsets) + + # ctor from CSR arrays + # Tuple of (vals, col_ind, row_offsets) + if len(arg) == 3: + if shape is None or len(shape) != 2: + raise AssertionError("Cannot infer shape in this case.") + + (data, indices, indptr) = arg + + # if passed numpy arrays - convert them + if isinstance(data, numpy.ndarray): + data = cupynumeric.array(data) + if isinstance(indices, numpy.ndarray): + indices = cupynumeric.array(indices).astype(coord_ty) + if isinstance(indptr, numpy.ndarray): + indptr = cupynumeric.array(indptr).astype(coord_ty) + + # checking that shape matches with expectations for row_offsets + if indptr.shape[0] == shape[0] + 1: + indptr_storage = array_from_store_or_array(indptr, copy=False) + los = indptr_storage[:-1] + his = indptr_storage[1:] + self.pos = pack_to_rect1_store( + get_store_from_cupynumeric_array(los), + get_store_from_cupynumeric_array(his), + ) + # copy explicitly, just in case (there are paths that won't create temp object) + # For crd we enforce our internal type + self.crd = store_from_store_or_array(cast_arr(indices, coord_ty), copy) + self.vals = store_from_store_or_array(cast_to_store(data), copy) + + # Otherwise we assume that we are passing pos store from existing csr_array + # This is internal only functionality, and we assume here only Store or cupynumeric.array + elif indptr.shape[0] == shape[0]: + self.pos = store_from_store_or_array(indptr, copy) + self.crd = store_from_store_or_array(indices, copy) + self.vals = store_from_store_or_array(data, copy) + + else: + raise AssertionError( + "Can't understand tuple of inputs for csr_array constructor" + ) + + dtype = get_storage_type(data) + + return dtype, shape + @property def dim(self): return self.ndim @@ -330,6 +374,21 @@ def get_indptr(self): # Disallow changing intptrs directly indptr = property(fget=get_indptr) + def _get_row_indices(self): + """Helper routine that converts pos to row indices""" + + # TODO: Add an option that caches the row_indices so that other binary + # operations don't have to recompute it. 
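+        # For intuition: with a dense NumPy indptr array, this expansion would
+        # amount to np.repeat(np.arange(len(indptr) - 1), np.diff(indptr)), a
+        # hypothetical single-node sketch; the task launched below performs the
+        # same pos -> row-index expansion on partitioned stores.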
+
+        row_indices = runtime.create_store(coord_ty, shape=self.crd.shape)
+        task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES)
+        src_part = task.add_input(self.pos)
+        dst_part = task.add_output(row_indices)
+        task.add_constraint(image(src_part, dst_part))
+
+        task.execute()
+        return store_to_cupynumeric_array(row_indices)
+
     def has_sorted_indices(self):
         return self.indices_sorted
@@ -411,18 +470,302 @@ def __rmatmul__(self, other):
     def __matmul__(self, other):
         return self.dot(other)
 
+    def _compare_scalar(self, other, op):
+        """Helper method for element-wise comparison operations with scalars.
+        This method returns a boolean CSR array with True values where
+        the comparison for op returns True.
+
+        Parameters
+        ----------
+        other : scalar
+            The scalar value to compare against
+        op : callable
+            The comparison operator to use (e.g. cupynumeric.greater)
+
+        Returns
+        -------
+        csr_array
+            A boolean CSR array with True values where the comparison is True
+        """
+        assert is_scalar_like(other)
+        mask = op(store_to_cupynumeric_array(self.vals), other)
+        col_indices = store_to_cupynumeric_array(self.crd)[mask]
+        row_indices = self._get_row_indices()[mask]
+        vals = cupynumeric.ones(row_indices.size, dtype=bool)
+
+        # NOTE:
+        # If the data was already sorted by rows and cols in self,
+        # then we don't have to sort again in the constructor of csr_array,
+        # but there's no clean way to pass to the class that the data
+        # is already sorted
+        return csr_array(
+            (vals, (row_indices, col_indices)),
+            shape=self.shape,
+            dtype=bool,
+        )
+
+    def __gt__(self, other):
+        """Element-wise greater than comparison with a scalar value.
+        This operates only on the existing non-zero elements of the matrix.
+
+        Parameters
+        ----------
+        other : scalar
+            The scalar value to compare against.
+
+        Returns
+        -------
+        csr_array
+            A boolean CSR array with True values where elements are greater
+            than the scalar.
+
+        Raises
+        ------
+        AssertionError
+            If the input is not scalar-like.
+
+        Examples
+        --------
+        >>> A = csr_array(...)
+        >>> mask = A > 0.5  # Returns boolean CSR array
+        """
+        return self._compare_scalar(other, cupynumeric.greater)
+
+    def __lt__(self, other):
+        """Element-wise less than comparison with a scalar value.
+        This operates only on the existing non-zero elements of the matrix.
+
+        Parameters
+        ----------
+        other : scalar
+            The scalar value to compare against.
+
+        Returns
+        -------
+        csr_array
+            A boolean CSR array with True values where elements are less
+            than the scalar.
+
+        Raises
+        ------
+        AssertionError
+            If the input is not scalar-like.
+
+        Examples
+        --------
+        >>> A = csr_array(...)
+        >>> mask = A < 0.5  # Returns boolean CSR array
+        """
+        return self._compare_scalar(other, cupynumeric.less)
+
+    def __ge__(self, other):
+        """Element-wise greater than or equal comparison with a scalar value.
+        This operates only on the existing non-zero elements of the matrix.
+
+        Parameters
+        ----------
+        other : scalar
+            The scalar value to compare against.
+
+        Returns
+        -------
+        csr_array
+            A boolean CSR array with True values where elements are greater
+            than or equal to the scalar.
+
+        Raises
+        ------
+        AssertionError
+            If the input is not scalar-like.
+
+        Examples
+        --------
+        >>> A = csr_array(...)
+        >>> mask = A >= 0.5  # Returns boolean CSR array
+        """
+        return self._compare_scalar(other, cupynumeric.greater_equal)
+
+    def __le__(self, other):
+        """Element-wise less than or equal comparison with a scalar value.
+        This operates only on the existing non-zero elements of the matrix.
+ + Parameters + ---------- + other : scalar + The scalar value to compare against. + + Returns + ------- + csr_array + A boolean CSR array with True values where elements are less + than or equal to the scalar. + + Raises + ------ + AssertionError + If the input is not a scalar or a zero-dimensional array. + + Examples + -------- + >>> A = csr_array(...) + >>> mask = A <= 0.5 # Returns boolean CSR array + """ + return self._compare_scalar(other, cupynumeric.less_equal) + + def __eq__(self, other): + """Element-wise equality comparison with a scalar value. + This operates only on the existing non-zero elements of the matrix. + + Parameters + ---------- + other : scalar + The scalar value to compare against. + + Returns + ------- + csr_array + A boolean CSR array with True values where elements are equal + to the scalar. + + Raises + ------ + AssertionError + If the input is not scalar-like. + + Examples + -------- + >>> A = csr_array(...) + >>> mask = A == 0.5 # Returns boolean CSR array + """ + return self._compare_scalar(other, cupynumeric.equal) + + def __ne__(self, other): + """Element-wise not equal comparison with a scalar value. + This operates only on the existing non-zero elements of the matrix. + + Parameters + ---------- + other : scalar + The scalar value to compare against. + + Returns + ------- + csr_array + A boolean CSR array with True values where elements are not equal + to the scalar. + + Raises + ------ + AssertionError + If the input is not scalar-like. + + Examples + -------- + >>> A = csr_array(...) + >>> mask = A != 0.5 # Returns boolean CSR array + """ + return self._compare_scalar(other, cupynumeric.not_equal) + + def __setitem__(self, key, value): + """Set values in the matrix using a boolean CSR mask. + + Parameters + ---------- + key : csr_array or csr_matrix + A boolean CSR matrix of the same shape as self that indicates which + elements to modify. Must have dtype=bool and same shape as the matrix + value : scalar + Value to assign at the positions indicated by key. Value gets + converted to the datatype of CSR matrix before assignment. + + Returns + ------- + csr_array + The modified matrix (self). + + Raises + ------ + NotImplementedError + If key is not a CSR matrix. + + Examples + -------- + >>> A = csr_array([[1, 2, 0], [3, 0, 4]]) + >>> mask = A > 2 # Create mask from A + >>> A[mask] = 10 + >>> A.todense() + array([[ 1, 2, 0], + [10, 0, 10]]) + + Notes + ----- + This operation only updates entries that are + non-zero in both the original matrix and the mask. Elements that are zero + in the original matrix will remain zero even if they are True in the mask. 
+        """
+        allowed_types = (csr_matrix, csr_array)
+        if not isinstance(key, allowed_types):
+            msg = "setting item is only supported for bool csr matrices"
+            raise NotImplementedError(msg)
+
+        assert key.shape == self.shape
+        assert key.dtype == bool
+
+        value_store = runtime.legate_runtime.create_store_from_scalar(Scalar(value))
+
+        # launch C++ task
+        task = runtime.create_auto_task(SparseOpCode.CSR_INDEXING_CSR)
+        A_vals_part = task.add_output(self.vals)
+        A_pos_part = task.add_input(self.pos)
+        A_crd_part = task.add_input(self.crd)
+        mask_pos_part = task.add_input(key.pos)
+        mask_crd_part = task.add_input(key.crd)
+        task.add_input(value_store)
+
+        # The elements that get updated are the ones where the mask
+        # and the current matrix have a non-zero value, so the coordinates
+        # that get updated in this operation are the same as those from
+        # an AND operation of the coordinates of mask and self/matrix
+
+        # add partitioning constraints
+        task.add_constraint(image(A_pos_part, A_crd_part))
+        task.add_constraint(image(A_pos_part, A_vals_part))
+        task.add_constraint(image(mask_pos_part, mask_crd_part))
+        task.add_constraint(align(A_pos_part, mask_pos_part))
+
+        task.execute()
+
+        return self
+
     def dot(self, other, out=None):
+        """Ordinary dot product.
+
+        Parameters
+        ----------
+        other : array_like
+            The object to compute dot product with
+        out : ndarray, optional
+            Output array for the result
+
+        Returns
+        -------
+        output : csr_array or cupynumeric.ndarray
+            Sparse matrix or dense array depending on input
+        """
         # If output specified - it should be cupynumeric array
         if out is not None:
             assert isinstance(out, cupynumeric.ndarray)
-        # only floating point operations are supported at the moment
-        if not is_dtype_supported(self.dtype) or not is_dtype_supported(other.dtype):
-            msg = (
-                "Only the following datatypes are currently supported:"
-                f" {SUPPORTED_DATATYPES}."
-            )
-            raise NotImplementedError(msg)
+        # only floating point operations are supported by cusparse at the moment
+        if runtime.num_gpus > 0:
+            if not is_dtype_supported(self.dtype) or not is_dtype_supported(
+                other.dtype
+            ):
+                msg = (
+                    "Only the following datatypes are currently supported:"
+                    f" {SUPPORTED_DATATYPES}."
+                )
+                raise NotImplementedError(msg)
 
         # If other.shape = (M,) then it's SpMV
         if len(other.shape) == 1 or (len(other.shape) == 2 and other.shape[1] == 1):
@@ -489,15 +832,44 @@ def dot(self, other, out=None):
     # Misc
     def _getpos(self):
+        """Helper method to get row start and end positions.
+
+        This internal method unpacks the compressed row storage format's position array
+        into start and end positions for each row.
+
+        Returns
+        -------
+        list of tuple
+            List of (start, end) position tuples for each row in the matrix
+        """
         row_start_st, row_end_st = unpack_rect1_store(self.pos)
         row_start = store_to_cupynumeric_array(row_start_st)
         row_end = store_to_cupynumeric_array(row_end_st)
         return [(i, j) for (i, j) in zip(row_start, row_end)]
 
     def copy(self):
-        return csr_array(self)
+        """Returns a copy of this matrix.
+
+        Returns
+        -------
+        csr_array
+            A copy of the matrix
+        """
+        return csr_array(self, dtype=self.dtype)
 
     def conj(self, copy=True):
+        """Element-wise complex conjugate.
+ + Parameters + ---------- + copy : bool, optional + Whether to create a new matrix or modify in-place + + Returns + ------- + csr_array + The conjugate matrix + """ if copy: return self.copy().conj(copy=False) return self._with_data( @@ -505,6 +877,20 @@ def conj(self, copy=True): ) def transpose(self, axes=None, copy=False): + """Reverses the dimensions of the sparse matrix. + + Parameters + ---------- + axes : None, optional + This argument is not supported + copy : bool, optional + Whether to create a copy (ignored - CSR transpose always creates copy) + + Returns + ------- + csr_array + Transposed matrix + """ if axes is not None: raise AssertionError("axes parameter should be None") @@ -539,16 +925,65 @@ def transpose(self, axes=None, copy=False): T = property(transpose) def asformat(seld, format, copy=False): + """Convert this matrix to a specified format. + + Parameters + ---------- + format : str + Desired sparse format ('csr' only) + copy : bool, optional + Whether to create a copy + + Returns + ------- + csr_array + Matrix in the requested format + """ if format == "csr": return self.copy() if copy else self else: raise NotImplementedError("Only CSR format is supported right now") def tocsr(self, copy=False): + """Convert this matrix to a CSR matrix. + + Parameters + ---------- + copy : bool, optional + Whether to create a copy + + Returns + ------- + csr_array + The converted CSR matrix + """ if copy: return self.copy().tocsr(copy=False) return self + def nonzero(self): + """Return the indices of the non-zero elements. + + Returns + ------- + (row, col) : tuple of cupynumeric.ndarrays + Row and column indices of non-zeros + """ + task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) + + row_indices = runtime.create_store(coord_ty, shape=self.crd.shape) + row_indices_part = task.add_output(row_indices) + pos_part = task.add_input(self.pos) + task.add_constraint(image(pos_part, row_indices_part)) + task.execute() + + row_indices = store_to_cupynumeric_array(row_indices) + col_indices = store_to_cupynumeric_array(self.crd) + vals_array = store_to_cupynumeric_array(self.vals) + mask = vals_array != 0.0 + + return (row_indices[mask], col_indices[mask]) + csr_matrix = csr_array @@ -591,6 +1026,21 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): # spgemm_csr_csr_csr computes C = A @ B when A and B and # both csr matrices, and returns the result C as a csr matrix. def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: + """ + Perform sparse matrix multiplication C = A @ B + + Parameters: + ----------- + A: csr_array + Input sparse matrix A + B: csr_array + Input sparse matrix B + + Returns: + -------- + csr_array + The result of the sparse matrix multiplication + """ # Due to limitations in cuSPARSE, we cannot use a uniform task # implementation for CSRxCSRxCSR SpGEMM across CPUs, OMPs and GPUs. # The GPU implementation will create a set of local CSR matrices diff --git a/legate_sparse/runtime.py b/legate_sparse/runtime.py index dad869fe..e7a3dc41 100644 --- a/legate_sparse/runtime.py +++ b/legate_sparse/runtime.py @@ -59,9 +59,6 @@ def __init__(self, sparse_library): self.legate_runtime = get_legate_runtime() self.legate_machine = get_machine() - self.dynamic_projection_functor_id = 1 - self.proj_fn_1d_to_2d_cache = {} - # Load all the necessary CUDA libraries if we have GPUs. 
        if self.num_gpus > 0:
            # TODO (rohany): Also handle destroying the cuda libraries when the
diff --git a/legate_sparse/settings.py b/legate_sparse/settings.py
index f08608b2..31e48a0c 100644
--- a/legate_sparse/settings.py
+++ b/legate_sparse/settings.py
@@ -20,18 +20,6 @@ class SparseRuntimeSettings(Settings):
-    precise_images: PrioritizedSetting[bool] = PrioritizedSetting(
-        "precise-images",
-        "LEGATE_SPARSE_PRECISE_IMAGES",
-        default=False,
-        convert=convert_bool,
-        help="""
-        Use precise images instead of approximate min-max boundary based
-        images. This can potentially reduce communication volume at the cost of
-        increasing startup time before application steady state.
-        """,
-    )
-
     fast_spgemm: PrioritizedSetting[bool] = PrioritizedSetting(
         "fast-spgemm",
         "LEGATE_SPARSE_FAST_SPGEMM",
diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py
index 5dbf0366..11daf9fd 100644
--- a/legate_sparse/utils.py
+++ b/legate_sparse/utils.py
@@ -200,3 +200,58 @@ def is_dtype_supported(dtype: numpy.dtype) -> bool:
     """
     return dtype in SUPPORTED_DATATYPES
+
+
+def is_dense(x) -> bool:
+    """
+    Is this object a dense cupynumeric array
+    """
+    return isinstance(x, cupynumeric.ndarray)
+
+
+def is_scalar_like(x) -> bool:
+    """
+    Is this object a scalar like type
+    """
+    if isinstance(x, str):
+        return False
+    return cupynumeric.isscalar(x) or (is_dense(x) and x.ndim == 0)
+
+
+def is_sparse(x) -> bool:
+    """
+    Is this object a legate sparse matrix
+    """
+    return legate_sparse.is_sparse_matrix(x)
+
+
+def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray):
+    """
+    This function is a quick and dirty hack that does what np.lexsort does
+    using argsort, but only for two keys.
+    This is primarily used to get the indices that we can use to sort data
+    first by rows and then by columns.
+
+    Parameters
+    ----------
+
+    rows: cupynumeric.ndarray
+        Indices of rows
+
+    cols: cupynumeric.ndarray
+        Indices of cols
+
+    Returns
+    -------
+    sorted_indices : cupynumeric.ndarray
+        Indices sorted by rows and then by columns, as given by numpy's lexsort
+    """
+    assert rows.size == cols.size
+
+    # note that the lexsort reverses the order of keys,
+    # so this would be equivalent to np.lexsort((cols, rows))
+
+    indices = cupynumeric.argsort(cols, kind="stable")
+    order = cupynumeric.argsort(rows[indices], kind="stable")
+
+    return indices[order]
diff --git a/legate_sparse_cpp.cmake b/legate_sparse_cpp.cmake
index bad92aec..6a90e3b3 100644
--- a/legate_sparse_cpp.cmake
+++ b/legate_sparse_cpp.cmake
@@ -19,6 +19,7 @@
 option(BUILD_SHARED_LIBS "Build legate sparse shared libraries" ON)
 option(legate_sparse_EXCLUDE_LEGATE_FROM_ALL
   "Exclude legate targets from Legate Sparse's 'all' target" OFF)
+option(ENABLE_BUFFER_LOGGING "Enable logging of deferred buffers and allocators" OFF)
 
##############################################################################
# - Project definition -------------------------------------------------------
@@ -123,8 +124,6 @@ set_cpu_arch_flags(legate_sparse_CXX_OPTIONS)
 list(APPEND legate_sparse_SOURCES
-  src/legate_sparse/projections.cc
-
   src/legate_sparse/mapper/mapper.cc
 
   src/legate_sparse/array/conv/dense_to_csr.cc
@@ -134,6 +133,7 @@ list(APPEND legate_sparse_SOURCES
   src/legate_sparse/array/csr/get_diagonal.cc
   src/legate_sparse/array/csr/spmv.cc
   src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
+  src/legate_sparse/array/csr/indexing.cc
 
   src/legate_sparse/array/util/unzip_rect.cc
   src/legate_sparse/array/util/zip_to_rect.cc
@@ -153,6 +153,7 @@ if(Legion_USE_OpenMP)
src/legate_sparse/array/csr/get_diagonal_omp.cc src/legate_sparse/array/csr/spmv_omp.cc src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc + src/legate_sparse/array/csr/indexing_omp.cc src/legate_sparse/array/util/unzip_rect_omp.cc src/legate_sparse/array/util/zip_to_rect_omp.cc @@ -172,6 +173,7 @@ if(Legion_USE_CUDA) src/legate_sparse/array/csr/get_diagonal.cu src/legate_sparse/array/csr/spmv.cu src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu + src/legate_sparse/array/csr/indexing.cu src/legate_sparse/array/util/unzip_rect.cu src/legate_sparse/array/util/zip_to_rect.cu @@ -218,6 +220,12 @@ elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin") set(platform_rpath_origin "@loader_path") endif () +if(ENABLE_BUFFER_LOGGING) + add_compile_definitions(ENABLE_BUFFER_LOGGING=1) +else() + add_compile_definitions(ENABLE_BUFFER_LOGGING=0) +endif() + set_target_properties(legate_sparse PROPERTIES BUILD_RPATH "${platform_rpath_origin}" INSTALL_RPATH "${platform_rpath_origin}" @@ -296,7 +304,7 @@ install( FILES src/legate_sparse/sparse_c.h #TODO: ? #${CMAKE_CURRENT_BINARY_DIR}/include/cupynumeric/version_config.hpp - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/legate_sprase) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/legate_sparse) ############################################################################## diff --git a/scripts/memlog_analysis.py b/scripts/memlog_analysis.py new file mode 100644 index 00000000..ee8bd3c6 --- /dev/null +++ b/scripts/memlog_analysis.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +""" +Memory Log Analysis Tools + +This module provides tools for analyzing and visualizing memory allocation data +from legate-sparse. It includes functions for exporting data to CSV/Excel and +creating visualizations of memory usage patterns. + +Optional Dependencies: +- pandas, matplotlib, seaborn: Required for visualizations +- openpyxl: Required for Excel export + +Usage: + from memlog_analysis import export_to_csv, visualize_allocations + from memlog_parser import parse_memlog + + # Parse the log file + allocations = parse_memlog('memlog.txt') + + # Export to CSV + export_to_csv(allocations, 'memory_analysis.csv') + + # Create visualizations (requires pandas, matplotlib, seaborn) + visualize_allocations(allocations) +""" # noqa: W293 + +import csv +from typing import List + +from memlog_parser import ( + TYPE_SIZES, + BufferAllocation, + are_similar_sizes, + group_by_description, + group_by_size, +) + +# Optional imports with error handling +try: + import pandas as pd + + PANDAS_AVAILABLE = True +except ImportError: + PANDAS_AVAILABLE = False + +try: + import matplotlib.pyplot as plt + + MATPLOTLIB_AVAILABLE = True +except ImportError: + MATPLOTLIB_AVAILABLE = False + +try: + import seaborn as sns + + SEABORN_AVAILABLE = True +except ImportError: + SEABORN_AVAILABLE = False + +try: + from openpyxl import Workbook + from openpyxl.styles import Font, PatternFill + from openpyxl.utils import get_column_letter + + OPENPYXL_AVAILABLE = True +except ImportError: + OPENPYXL_AVAILABLE = False + + +def export_to_csv( + allocations: List[BufferAllocation], + output_file: str, + group_by: str = "description", + unique_mb_only: bool = False, + threshold_percent: float = 5.0, +): + """ + Export memory allocation data to CSV file. 
+ + Args: + allocations: List of BufferAllocation objects + output_file: Path to output CSV file + group_by: Grouping method ('description' or 'size') + unique_mb_only: If True, only export unique memory sizes + threshold_percent: Percentage threshold for considering sizes similar + """ + if group_by == "description": + grouped_data = group_by_description(allocations) + else: + grouped_data = group_by_size(allocations) + + with open(output_file, "w", newline="") as f: + writer = csv.writer(f) + # Write header + writer.writerow( + [ + "Group", + "Size (elements)", + "Type", + "Size (bytes)", + "Memory (MB)", + "File", + "Line", + "Timestamp", + ] + ) + + for group, allocs in grouped_data.items(): + # Track seen entries and memory sizes for this group + seen_entries = set() + seen_mb_sizes = set() + + for alloc in allocs: + mb_size = alloc.total_mb() + + # If unique_mb_only is enabled, check for similar memory sizes + if unique_mb_only: + is_similar = any( + are_similar_sizes(mb_size, seen_size, threshold_percent) + for seen_size in seen_mb_sizes + ) + if is_similar: + continue + seen_mb_sizes.add(mb_size) + + # Create a unique key for this entry + entry_key = (alloc.size, alloc.type, alloc.file, alloc.line) + + # Skip if we've seen this exact entry before + if entry_key in seen_entries: + continue + seen_entries.add(entry_key) + + writer.writerow( + [ + group, + alloc.size, + alloc.type, + TYPE_SIZES.get(alloc.type, 1), + mb_size, + alloc.file, + alloc.line, + alloc.timestamp, + ] + ) + + +def export_to_excel(allocations: List[BufferAllocation], output_file: str) -> bool: + """ + Export memory allocation data to formatted Excel file. + + Args: + allocations: List of BufferAllocation objects + output_file: Path to output Excel file + + Returns: + bool: True if export was successful, False if openpyxl is not available + """ + if not OPENPYXL_AVAILABLE: + print( + "Error: Excel export requires openpyxl. 
Please install it with: pip install openpyxl" + ) + return False + + wb = Workbook() + + # Create description-based view + desc_sheet = wb.active + desc_sheet.title = "By Description" + _add_allocations_to_sheet(desc_sheet, allocations, group_by="description") + + # Create size-based view + size_sheet = wb.create_sheet("By Size") + _add_allocations_to_sheet(size_sheet, allocations, group_by="size") + + # Create summary sheet + summary_sheet = wb.create_sheet("Summary") + _add_summary_to_sheet(summary_sheet, allocations) + + wb.save(output_file) + return True + + +def _add_allocations_to_sheet( + sheet, allocations: List[BufferAllocation], group_by: str +): + """Helper function to add allocations to Excel sheet with formatting.""" + if not OPENPYXL_AVAILABLE: + return + + # Add headers + headers = [ + "Group", + "Size (elements)", + "Type", + "Size (bytes)", + "Memory (MB)", + "File", + "Line", + "Timestamp", + ] + for col, header in enumerate(headers, 1): + cell = sheet.cell(row=1, column=col) + cell.value = header + cell.font = Font(bold=True) + cell.fill = PatternFill( + start_color="CCCCCC", end_color="CCCCCC", fill_type="solid" + ) + + # Group data + if group_by == "description": + grouped_data = group_by_description(allocations) + else: + grouped_data = group_by_size(allocations) + + # Add data + row = 2 + for group, allocs in grouped_data.items(): + for alloc in allocs: + sheet.cell(row=row, column=1).value = group + sheet.cell(row=row, column=2).value = alloc.size + sheet.cell(row=row, column=3).value = alloc.type + sheet.cell(row=row, column=4).value = TYPE_SIZES.get(alloc.type, 1) + sheet.cell(row=row, column=5).value = alloc.total_mb() + sheet.cell(row=row, column=6).value = alloc.file + sheet.cell(row=row, column=7).value = alloc.line + sheet.cell(row=row, column=8).value = alloc.timestamp + row += 1 + + # Auto-adjust column widths + for col in range(1, len(headers) + 1): + sheet.column_dimensions[get_column_letter(col)].auto_size = True + + +def _add_summary_to_sheet(sheet, allocations: List[BufferAllocation]): + """Helper function to add summary statistics to Excel sheet.""" + if not OPENPYXL_AVAILABLE: + return + + # Calculate summary statistics + total_memory = sum(alloc.total_mb() for alloc in allocations) + by_type = {} + by_description = {} + + for alloc in allocations: + by_type[alloc.type] = by_type.get(alloc.type, 0) + alloc.total_mb() + by_description[alloc.description] = ( + by_description.get(alloc.description, 0) + alloc.total_mb() + ) + + # Add headers + sheet.cell(row=1, column=1).value = "Summary Statistics" + sheet.cell(row=1, column=1).font = Font(bold=True, size=14) + + # Add total memory + sheet.cell(row=3, column=1).value = "Total Memory Allocated:" + sheet.cell(row=3, column=2).value = f"{total_memory:.2f} MB" + + # Add memory by type + sheet.cell(row=5, column=1).value = "Memory by Type" + sheet.cell(row=5, column=1).font = Font(bold=True) + row = 6 + for type_name, memory in by_type.items(): + sheet.cell(row=row, column=1).value = type_name + sheet.cell(row=row, column=2).value = f"{memory:.2f} MB" + row += 1 + + # Add memory by description + sheet.cell(row=row + 1, column=1).value = "Memory by Description" + sheet.cell(row=row + 1, column=1).font = Font(bold=True) + row += 2 + for desc, memory in by_description.items(): + sheet.cell(row=row, column=1).value = desc + sheet.cell(row=row, column=2).value = f"{memory:.2f} MB" + row += 1 + + # Auto-adjust column widths + sheet.column_dimensions["A"].auto_size = True + sheet.column_dimensions["B"].auto_size = 
True + + +def visualize_allocations( + allocations: List[BufferAllocation], + output_dir: str = ".", + unique_mb_only: bool = False, + threshold_percent: float = 5.0, +) -> bool: + """ + Create visualizations of memory allocations. + + Args: + allocations: List of BufferAllocation objects + output_dir: Directory to save visualization files + unique_mb_only: If True, only show unique memory sizes + threshold_percent: Percentage threshold for considering sizes similar + + Returns: + bool: True if visualization was successful, False if required libraries are not available + """ + if not all([PANDAS_AVAILABLE, MATPLOTLIB_AVAILABLE, SEABORN_AVAILABLE]): + print("Error: Visualization requires pandas, matplotlib, and seaborn.") + print("Please install them with: pip install pandas matplotlib seaborn") + return False + + # Convert to DataFrame + data = [] + for alloc in allocations: + data.append( + { + "Description": alloc.description, + "Size_MB": alloc.total_mb(), + "Type": alloc.type, + "File": alloc.file, + "Timestamp": alloc.timestamp, + } + ) + df = pd.DataFrame(data) + + # Create figure + fig = plt.figure(figsize=(15, 10)) # noqa: F841 + + if unique_mb_only: + # Get unique descriptions + unique_descriptions = set() + for alloc in allocations: + unique_descriptions.add(alloc.description) + + # Get unique memory sizes for each description using our similarity check + unique_sizes = {} + for desc in sorted(unique_descriptions): + desc_df = df[df["Description"] == desc] + sizes = [] + seen_sizes = set() + + for _, row in desc_df.iterrows(): + size = row["Size_MB"] + if not any( + are_similar_sizes(size, seen_size, threshold_percent) + for seen_size in seen_sizes + ): + sizes.append(size) + seen_sizes.add(size) + + unique_sizes[desc] = sizes + + # Create a figure with subplots for each description + n_descriptions = len(unique_sizes) + n_cols = min(3, n_descriptions) # Maximum 3 columns + n_rows = (n_descriptions + n_cols - 1) // n_cols # Ceiling division + + for i, (desc, sizes) in enumerate(unique_sizes.items()): + plt.subplot(n_rows, n_cols, i + 1) + plt.bar(range(len(sizes)), sizes) + plt.title(f"{desc}\nUnique Memory Allocations") + plt.xlabel("Allocation Index") + plt.ylabel("Memory (MB)") + plt.xticks(range(len(sizes)), range(1, len(sizes) + 1)) + + plt.tight_layout() + else: + # Memory usage by description (top 10) + plt.subplot(2, 2, 1) + top_descriptions = df.groupby("Description")["Size_MB"].sum().nlargest(10) + sns.barplot(x=top_descriptions.values, y=top_descriptions.index) + plt.title("Top 10 Memory Usage by Description") + plt.xlabel("Memory (MB)") + + # Memory distribution by type + plt.subplot(2, 2, 2) + type_dist = df.groupby("Type")["Size_MB"].sum() + plt.pie(type_dist, labels=type_dist.index, autopct="%1.1f%%") + plt.title("Memory Distribution by Type") + + plt.tight_layout() + plt.savefig(f"{output_dir}/memory_analysis.png", dpi=300, bbox_inches="tight") + plt.close() + return True + + +def main(): + """Example usage of the analysis tools.""" + import argparse + + from memlog_parser import parse_memlog + + parser = argparse.ArgumentParser(description="Analyze memory allocation logs") + parser.add_argument("file", help="Path to the memory log file") + parser.add_argument( + "--output-dir", default=".", help="Directory to save output files" + ) + parser.add_argument( + "--format", + choices=["csv", "excel", "visualization", "all"], + default="all", + help="Output format", + ) + + args = parser.parse_args() + + # Parse the log file + allocations = parse_memlog(args.file) + + # 
Create output directory if it doesn't exist + import os + + os.makedirs(args.output_dir, exist_ok=True) + + # Generate requested outputs + if args.format in ["csv", "all"]: + export_to_csv(allocations, f"{args.output_dir}/memory_analysis.csv") + + if args.format in ["excel", "all"]: + export_to_excel(allocations, f"{args.output_dir}/memory_analysis.xlsx") + + if args.format in ["visualization", "all"]: + visualize_allocations(allocations, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/scripts/memlog_cli.py b/scripts/memlog_cli.py new file mode 100644 index 00000000..ef45a129 --- /dev/null +++ b/scripts/memlog_cli.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Memory Log Analysis Command Line Interface + +This script provides a unified command-line interface for parsing and analyzing +memory allocation logs from legate-sparse. + +Optional Dependencies: +- pandas, matplotlib, seaborn: Required for visualizations +- openpyxl: Required for Excel export +""" + +import argparse +import os +import sys + +from memlog_analysis import export_to_csv, export_to_excel, visualize_allocations +from memlog_parser import ( + filter_allocations, + parse_memlog, + print_description_group, + print_size_group, + update_type_sizes, +) + + +def check_dependencies(format: str) -> bool: + """ + Check if required dependencies are available for the requested format. + + Args: + format: Requested output format + + Returns: + bool: True if all required dependencies are available + """ + if format == "excel": + try: + import openpyxl # noqa: F401 + except ImportError: + print( + "Error: Excel export requires openpyxl. Please install it with: pip install openpyxl" + ) + return False + + if format == "visualization": + try: + import matplotlib # noqa: F401 + import pandas # noqa: F401 + import seaborn # noqa: F401 + except ImportError: + print("Error: Visualization requires pandas, matplotlib, and seaborn.") + print("Please install them with: pip install pandas matplotlib seaborn") + return False + + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Parse and analyze memory allocation logs from legate-sparse", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + Basic usage (print to screen): + python memlog_cli.py memlog.txt + + Group by size: + python memlog_cli.py memlog.txt --group-by size + + Ignore specific descriptions: + python memlog_cli.py memlog.txt --ignore-descriptions "ThrustAllocator::allocate" "buffer1" + + Set minimum memory size: + python memlog_cli.py memlog.txt --min-mb 1.0 + + Show only unique memory sizes within each group: + python memlog_cli.py memlog.txt --unique-mb-only + + Custom similarity threshold: + python memlog_cli.py memlog.txt --unique-mb-only --similarity-threshold 5.0 + + Configure type sizes: + python memlog_cli.py memlog.txt --index-ty-size 4 --val-ty-size 8 + + Export to CSV (in addition to printing): + python memlog_cli.py memlog.txt --format csv --output-dir ./analysis + + Export to Excel (requires openpyxl): + python memlog_cli.py memlog.txt --format excel --output-dir ./analysis + + Create visualizations (requires pandas, matplotlib, seaborn): + python memlog_cli.py memlog.txt --format visualization --output-dir ./analysis + + Combine multiple options: + python memlog_cli.py memlog.txt --unique-mb-only --similarity-threshold 1.0 --min-mb 1.0 --ignore-descriptions "ThrustAllocator::allocate" --format csv +""", + ) + + # Required arguments + parser.add_argument("file", help="Path to the memory 
log file") + + # Filtering options + parser.add_argument( + "--ignore-descriptions", + nargs="+", + default=[], + help="List of descriptions to ignore", + ) + parser.add_argument( + "--min-mb", + type=float, + default=0.0, + help="Minimum memory size in MB to include (default: 0.0)", + ) + parser.add_argument( + "--unique-mb-only", + action="store_true", + help="Only show unique memory sizes in MB", + ) + parser.add_argument( + "--similarity-threshold", + type=float, + default=5.0, + help="Percentage threshold for considering memory sizes similar (default: 5.0%%)", + ) + + # Type size configuration + parser.add_argument( + "--index-ty-size", + type=int, + default=8, + help="Size of INDEX_TY in bytes (default: 8)", + ) + parser.add_argument( + "--val-ty-size", + type=int, + default=8, + help="Size of VAL_TY in bytes (default: 8)", + ) + + # Output options + parser.add_argument( + "--group-by", + choices=["description", "size"], + default="description", + help="Group allocations by description or size (default: description)", + ) + parser.add_argument( + "--format", + choices=["csv", "excel", "visualization"], + help="Additional output format (optional)", + ) + parser.add_argument( + "--output-dir", + default=".", + help="Directory to save output files (default: current directory)", + ) + + args = parser.parse_args() + + try: + # Update type sizes based on command line arguments + update_type_sizes(args.index_ty_size, args.val_ty_size) + + # Parse the log file + allocations = parse_memlog(args.file) + + # Filter allocations based on criteria + filtered_allocations = filter_allocations( + allocations, + ignore_descriptions=set(args.ignore_descriptions), + min_mb=args.min_mb, + ) + + # Print results to screen + print("\nMemory Allocation Analysis:") + print("=" * 50) + print(f"Using INDEX_TY size: {args.index_ty_size} bytes") + print(f"Using VAL_TY size: {args.val_ty_size} bytes") + print("=" * 50) + + if args.group_by == "description": + print_description_group( + filtered_allocations, + unique_mb_only=args.unique_mb_only, + threshold_percent=args.similarity_threshold, + ) + else: + print_size_group( + filtered_allocations, + unique_mb_only=args.unique_mb_only, + threshold_percent=args.similarity_threshold, + ) + + # Handle additional output formats if requested + if args.format: + # Create output directory if it doesn't exist + os.makedirs(args.output_dir, exist_ok=True) + + # Check dependencies if needed + if args.format in ["excel", "visualization"]: + if not check_dependencies(args.format): + return 1 + + # Generate requested output + success = True + if args.format == "csv": + export_to_csv( + filtered_allocations, + f"{args.output_dir}/memory_analysis.csv", + group_by=args.group_by, + unique_mb_only=args.unique_mb_only, + threshold_percent=args.similarity_threshold, + ) + elif args.format == "excel": + if not export_to_excel( + filtered_allocations, + f"{args.output_dir}/memory_analysis.xlsx", + ): + success = False + elif args.format == "visualization": + if not visualize_allocations( + filtered_allocations, + args.output_dir, + unique_mb_only=args.unique_mb_only, + threshold_percent=args.similarity_threshold, + ): + success = False + + return 0 if success else 1 + + return 0 + + except FileNotFoundError: + print(f"Error: File '{args.file}' not found") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/memlog_parser.py b/scripts/memlog_parser.py new file mode 100644 index 00000000..0024b41d --- /dev/null +++ b/scripts/memlog_parser.py @@ -0,0 +1,297 @@ 
+#!/usr/bin/env python3 +""" +Memory Log Parser Core Module + +This module contains the core functionality for parsing memory allocation logs +from legate-sparse, including data structures and basic parsing functions. +""" + +import re +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Set + +# Dictionary mapping data types to their sizes in bytes +TYPE_SIZES = { + "char": 1, + "int32_t": 4, + "int64_t": 8, + "float": 4, + "double": 8, + "float32": 4, + "float64": 8, + "INDEX_TY": 8, # Default to int64_t + "VAL_TY": 8, # Default to double +} + + +def update_type_sizes(index_ty_size: int = 8, val_ty_size: int = 8) -> None: + """ + Update the sizes of INDEX_TY and VAL_TY in the TYPE_SIZES dictionary. + + Args: + index_ty_size: Size of INDEX_TY in bytes (default: 8) + val_ty_size: Size of VAL_TY in bytes (default: 8) + """ + TYPE_SIZES["INDEX_TY"] = index_ty_size + TYPE_SIZES["VAL_TY"] = val_ty_size + + +@dataclass +class BufferAllocation: + """ + Represents a single buffer allocation from the memory log. + + Attributes: + timestamp: Time of allocation in seconds + file: Source file where allocation occurred + line: Line number in the source file + size: Number of elements allocated + type: Data type of the elements + description: Description of the allocation + """ + + timestamp: float + file: str + line: int + size: int + type: str + description: str + + def total_bytes(self) -> int: + """Calculate total bytes allocated including data type size.""" + type_size = TYPE_SIZES.get(self.type, 1) # Default to 1 byte if type not found + return self.size * type_size + + def total_mb(self) -> float: + """Calculate total memory in MB.""" + return self.total_bytes() / (1024 * 1024) + + +def are_similar_sizes(size1: float, size2: float, threshold_percent: float) -> bool: + """ + Check if two sizes are similar within the given percentage threshold. + + Args: + size1: First size in MB + size2: Second size in MB + threshold_percent: Maximum allowed percentage difference + + Returns: + True if sizes are within threshold, False otherwise + """ + if size1 == 0 or size2 == 0: + return size1 == size2 + percent_diff = abs(size1 - size2) / min(size1, size2) * 100 + return percent_diff <= threshold_percent + + +def parse_memlog(file_path: str) -> List[BufferAllocation]: + """ + Parse the memory log file and extract buffer allocation information. 
+ + Args: + file_path: Path to the memory log file + + Returns: + List of BufferAllocation objects + + Raises: + FileNotFoundError: If the specified file doesn't exist + """ + pattern = r"\[.*?\]\s+(\d+\.\d+)\s+\{3\}\{legate-sparse\}:\s+Buffer allocation at ([^:]+):(\d+)\s+-\s+Size:\s+(\d+)\s+Type:\s+([^\s]+)\s+Description:\s+(.+)" + + allocations = [] + seen_entries = set() # To track unique entries + + with open(file_path, "r") as f: + for line in f: + match = re.match(pattern, line.strip()) + if match: + # Create a unique key for the entry + entry_key = ( + match.group(2), # file + match.group(3), # line + match.group(4), # size + match.group(5), # type + match.group(6), # description + ) + + # Only add if we haven't seen this exact entry before + if entry_key not in seen_entries: + seen_entries.add(entry_key) + allocation = BufferAllocation( + timestamp=float(match.group(1)), + file=match.group(2), + line=int(match.group(3)), + size=int(match.group(4)), + type=match.group(5), + description=match.group(6), + ) + allocations.append(allocation) + + return allocations + + +def group_by_description( + allocations: List[BufferAllocation], +) -> Dict[str, List[BufferAllocation]]: + """ + Group allocations by their description. + + Args: + allocations: List of BufferAllocation objects + + Returns: + Dictionary mapping descriptions to lists of allocations + """ + grouped = defaultdict(list) + for alloc in allocations: + grouped[alloc.description].append(alloc) + return dict(grouped) + + +def group_by_size( + allocations: List[BufferAllocation], +) -> Dict[int, List[BufferAllocation]]: + """ + Group allocations by their size in elements. + + Args: + allocations: List of BufferAllocation objects + + Returns: + Dictionary mapping sizes to lists of allocations + """ + grouped = defaultdict(list) + for alloc in allocations: + grouped[alloc.size].append(alloc) + return dict(grouped) + + +def filter_allocations( + allocations: List[BufferAllocation], + ignore_descriptions: Set[str] = None, + min_mb: float = 0.0, +) -> List[BufferAllocation]: + """ + Filter allocations based on description and minimum size criteria. + + Args: + allocations: List of BufferAllocation objects + ignore_descriptions: Set of descriptions to ignore + min_mb: Minimum memory size in MB to include + + Returns: + Filtered list of BufferAllocation objects + """ + if ignore_descriptions is None: + ignore_descriptions = set() + + filtered = [] + for alloc in allocations: + if alloc.description not in ignore_descriptions and alloc.total_mb() >= min_mb: + filtered.append(alloc) + return filtered + + +def print_description_group( + allocations: List[BufferAllocation], + unique_mb_only: bool = False, + threshold_percent: float = 5.0, +): + """ + Print allocations grouped by description. 
+ + Args: + allocations: List of BufferAllocation objects + unique_mb_only: If True, only show unique memory sizes + threshold_percent: Percentage threshold for considering sizes similar + """ + grouped_by_desc = group_by_description(allocations) + + for desc, allocs in grouped_by_desc.items(): + desc_total_bytes = sum(alloc.total_bytes() for alloc in allocs) + max_bytes = max(alloc.total_bytes() for alloc in allocs) + print(f"\n{desc}:") + print( + f" Total bytes (includes non-unique allocs): {desc_total_bytes / (1024*1024):.2f} MB" + ) + print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + + # Track seen entries for this description + seen_entries = set() + seen_mb_sizes = set() + + for alloc in allocs: + mb_size = alloc.total_mb() + + # If unique_mb_only is enabled, check for similar memory sizes + if unique_mb_only: + is_similar = any( + are_similar_sizes(mb_size, seen_size, threshold_percent) + for seen_size in seen_mb_sizes + ) + if is_similar: + continue + seen_mb_sizes.add(mb_size) + + # Create a unique key for this entry + entry_key = (alloc.size, alloc.type, alloc.file, alloc.line) + + # Skip if we've seen this exact entry before + if entry_key in seen_entries: + continue + seen_entries.add(entry_key) + + print( + f" - Size: {alloc.size} elements, Type: {alloc.type} ({TYPE_SIZES.get(alloc.type, 1)} bytes), " + f"Total: {mb_size:.2f} MB, " + f"File: {alloc.file}:{alloc.line}, Time: {alloc.timestamp}" + ) + + +def print_size_group( + allocations: List[BufferAllocation], + unique_mb_only: bool = False, + threshold_percent: float = 2.0, +): + """ + Print allocations grouped by size. + + Args: + allocations: List of BufferAllocation objects + unique_mb_only: If True, only show unique memory sizes + threshold_percent: Percentage threshold for considering sizes similar + """ + grouped_by_size_dict = group_by_size(allocations) + seen_mb_sizes = set() + + for size, allocs in sorted( + grouped_by_size_dict.items(), key=lambda x: x[0], reverse=True + ): + size_total_bytes = sum(alloc.total_bytes() for alloc in allocs) + max_bytes = max(alloc.total_bytes() for alloc in allocs) + + print(f"\nSize: {size} elements:") + print( + f" Total bytes (includes non-unique allocs): {size_total_bytes / (1024*1024):.2f} MB" + ) + print(f" Max bytes : {max_bytes / (1024*1024):.2f} MB") + + for alloc in allocs: + mb_size = alloc.total_mb() + if unique_mb_only: + # Check if this size is similar to any previously seen size + is_similar = any( + are_similar_sizes(mb_size, seen_size, threshold_percent) + for seen_size in seen_mb_sizes + ) + if is_similar: + continue + seen_mb_sizes.add(mb_size) + print( + f" - Type: {alloc.type} ({TYPE_SIZES.get(alloc.type, 1)} bytes), " + f"Description: {alloc.description}, " + f"File: {alloc.file}:{alloc.line}" + ) diff --git a/setup.py b/setup.py index e0b12193..daa17216 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="legate-sparse", - version="25.01.00", + version="25.03.00", description="An Aspiring Drop-In Replacement for SciPy Sparse module at Scale", author="NVIDIA Corporation", license="Apache 2.0", diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc index 7b5c1e99..6b781134 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc @@ -17,19 +17,29 @@ #include "legate_sparse/array/conv/pos_to_coordinates.h" #include "legate_sparse/array/conv/pos_to_coordinates_template.inl" -#include 
"legate_sparse/util/thrust_allocator.h" - namespace sparse { using namespace legate; +template +struct ExpandPosToCoordinatesImplBody { + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& pos, + const AccessorWO& row_indices, + const Rect<1>& rect) + { + for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + for (size_t j_pos = pos[row].lo; j_pos < pos[row].hi + 1; j_pos++) { + row_indices[j_pos] = row; + } + } + } +}; + /*static*/ void ExpandPosToCoordinates::cpu_variant(TaskContext context) { - Memory::Kind kind = find_memory_kind_for_executing_processor(); - ThrustAllocator alloc(kind); - auto policy = thrust::host(alloc); - - pos_to_coordinates_template(context, policy); + pos_to_coordinates_template(context); } namespace // unnamed diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cu b/src/legate_sparse/array/conv/pos_to_coordinates.cu index 07760dc9..c74a5c3f 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cu +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cu @@ -17,19 +17,51 @@ #include "legate_sparse/array/conv/pos_to_coordinates.h" #include "legate_sparse/array/conv/pos_to_coordinates_template.inl" #include "legate_sparse/util/cuda_help.h" -#include "legate_sparse/util/thrust_allocator.h" +#include "legate_sparse/util/cusparse_utils.h" +#include namespace sparse { using namespace legate; +template +__global__ void fill_row_indices(size_t rows, + size_t offset, + AccessorRO, 1> pos, + AccessorWO row_indices) +{ + const auto idx = global_tid_1d(); + + if (idx >= rows) { + return; + } + + size_t row = offset + idx; + for (size_t j_pos = pos[row].lo; j_pos < pos[row].hi + 1; j_pos++) { + row_indices[j_pos] = row; + } +} + +template +struct ExpandPosToCoordinatesImplBody { + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& pos, + const AccessorWO& row_indices, + const Rect<1>& rect) + { + auto stream = get_cached_stream(); + auto blocks = get_num_blocks_1d(rect.volume()); + size_t rows = rect.volume(); + + fill_row_indices<<>>(rows, rect.lo[0], pos, row_indices); + LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + } +}; + /*static*/ void ExpandPosToCoordinates::gpu_variant(TaskContext context) { - auto stream = get_cached_stream(); - ThrustAllocator alloc(Memory::GPU_FB_MEM); - auto policy = thrust::cuda::par(alloc).on(stream); - pos_to_coordinates_template(context, policy); - LEGATE_SPARSE_CHECK_CUDA_STREAM(stream); + pos_to_coordinates_template(context); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.h b/src/legate_sparse/array/conv/pos_to_coordinates.h index c22f2853..70e351a6 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.h +++ b/src/legate_sparse/array/conv/pos_to_coordinates.h @@ -23,7 +23,7 @@ namespace sparse { struct ExpandPosToCoordinatesArgs { - const legate::PhysicalStore result; + const legate::PhysicalStore row_indices; const legate::PhysicalStore pos; }; @@ -31,13 +31,6 @@ class ExpandPosToCoordinates : public SparseTask { public: static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES}; - static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS = - legate::VariantOptions{}.with_has_allocations(true); - static constexpr legate::VariantOptions OMP_VARIANT_OPTIONS = - legate::VariantOptions{}.with_has_allocations(true); - static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = - legate::VariantOptions{}.with_has_allocations(true); - public: static void cpu_variant(legate::TaskContext ctx); #ifdef 
LEGATE_USE_OPENMP diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc index 911ea1e8..80da99a5 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates_omp.cc @@ -17,19 +17,30 @@ #include "legate_sparse/array/conv/pos_to_coordinates.h" #include "legate_sparse/array/conv/pos_to_coordinates_template.inl" -#include "legate_sparse/util/thrust_allocator.h" - namespace sparse { using namespace legate; +template +struct ExpandPosToCoordinatesImplBody { + using INDEX_TY = type_of; + + void operator()(const AccessorRO, 1>& pos, + const AccessorWO& row_indices, + const Rect<1>& rect) + { +#pragma omp parallel for schedule(monotonic : dynamic, 128) + for (auto row = rect.lo[0]; row < rect.hi[0] + 1; row++) { + for (size_t j_pos = pos[row].lo; j_pos < pos[row].hi + 1; j_pos++) { + row_indices[j_pos] = row; + } + } + } +}; + /*static*/ void ExpandPosToCoordinates::omp_variant(TaskContext context) { - Memory::Kind kind = find_memory_kind_for_executing_processor(); - ThrustAllocator alloc(kind); - auto policy = thrust::omp::par(alloc); - - pos_to_coordinates_template(context, policy); + pos_to_coordinates_template(context); } } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl index 358600f8..39142e70 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates_template.inl +++ b/src/legate_sparse/array/conv/pos_to_coordinates_template.inl @@ -20,105 +20,39 @@ #include "legate_sparse/array/conv/pos_to_coordinates.h" #include "legate_sparse/util/dispatch.h" -#include -#include -#include -#include -#include -#include -#include - namespace sparse { using namespace legate; -template -struct volume : public thrust::unary_function { -#if defined(__CUDACC__) - __host__ __device__ -#endif - size_t - operator()(Legion::Rect<1> x) - { - return x.volume(); - } -}; +template +struct ExpandPosToCoordinatesImplBody; -template +template struct ExpandPosToCoordinatesImpl { - ExpandPosToCoordinatesImpl(const Policy& policy) : policy(policy) {} - template void operator()(ExpandPosToCoordinatesArgs& args) const { - using INDEX_TY = type_of; - auto pos = args.pos.read_accessor, 1>(); - auto result = args.result.write_accessor(); - auto pos_domain = args.pos.domain(); - auto result_domain = args.result.domain(); - auto src_size = pos_domain.get_volume(); - auto dst_size = result_domain.get_volume(); + using INDEX_TY = type_of; - // Return early if there isn't any work to do. Entering this code - // with an empty domain results in CUDA errors for the thrust backend. - if (pos_domain.empty() || result_domain.empty()) { + auto pos = args.pos.read_accessor, 1>(); + auto row_indices = args.row_indices.write_accessor(); + auto pos_domain = args.pos.domain(); + auto row_indices_domain = args.row_indices.domain(); + + if (pos_domain.empty() || row_indices_domain.empty()) { return; } - - // This implementation of expand was inspired from - // https://huggingface.co/spaces/ma-xu/LIVE/blob/main/thrust/examples/expand.cu. - auto volumes = create_buffer(src_size); - auto offsets = create_buffer(src_size); - - // Initialize all of our arrays. 
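(The loop bodies that replace the thrust expand pipeline being removed here compute the following mapping; a Python sketch with invented values:)

    pos = [(0, 1), (2, 4)]      # pos[row] is an inclusive [lo, hi] range into crd/vals
    row_indices = [0] * 5
    for row, (lo, hi) in enumerate(pos):
        for j in range(lo, hi + 1):
            row_indices[j] = row
    assert row_indices == [0, 0, 1, 1, 1]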
- thrust::fill(policy, volumes.ptr(0), volumes.ptr(0) + src_size, size_t(0)); - thrust::fill(policy, offsets.ptr(0), offsets.ptr(0) + src_size, size_t(0)); - thrust::fill(policy, - result.ptr(result_domain.lo()), - result.ptr(result_domain.lo()) + dst_size, - INDEX_TY(0)); - // Transform each pos rectangle into its volume. We have to make a - // temporary here because not all of the thrust functions accept a - // transform. - thrust::transform(policy, - pos.ptr(pos_domain.lo()), - pos.ptr(pos_domain.lo()) + src_size, - volumes.ptr(0), - volume>{}); - // Perform an exclusive scan to find the offsets to write coordinates into. - thrust::exclusive_scan(policy, volumes.ptr(0), volumes.ptr(0) + src_size, offsets.ptr(0)); - // Scatter the non-zero counts into their output indices. - thrust::scatter_if(policy, - thrust::counting_iterator(0), - thrust::counting_iterator(src_size), - offsets.ptr(0), - volumes.ptr(0), - result.ptr(result_domain.lo())); - // Compute a max-scan over the output indices, filling in holes. - thrust::inclusive_scan(policy, - result.ptr(result_domain.lo()), - result.ptr(result_domain.lo()) + dst_size, - result.ptr(result_domain.lo()), - thrust::maximum{}); - // Gather input values according to the computed indices. - thrust::gather(policy, - result.ptr(result_domain.lo()), - result.ptr(result_domain.lo()) + dst_size, - thrust::counting_iterator(pos_domain.lo()[0]), - result.ptr(result_domain.lo())); + ExpandPosToCoordinatesImplBody()(pos, row_indices, args.pos.shape<1>()); } - - private: - const Policy& policy; }; -template -static void pos_to_coordinates_template(TaskContext context, const Policy& policy) +template +static void pos_to_coordinates_template(TaskContext context) { ExpandPosToCoordinatesArgs args{ context.outputs()[0], context.inputs()[0], }; - index_type_dispatch(args.result.code(), ExpandPosToCoordinatesImpl(policy), args); + index_type_dispatch(args.row_indices.code(), ExpandPosToCoordinatesImpl(), args); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc new file mode 100644 index 00000000..8fc0c11b --- /dev/null +++ b/src/legate_sparse/array/csr/indexing.cc @@ -0,0 +1,99 @@ +/* Copyright 2022-2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ */
+
+#include "legate_sparse/array/csr/indexing.h"
+#include "legate_sparse/array/csr/indexing_template.inl"
+
+namespace sparse {
+
+using namespace legate;
+
+template
+struct CSRIndexingCSRImplBody {
+  using INDEX_TY = type_of;
+  using VAL_TY   = type_of;
+
+  void operator()(const AccessorRO, 1>& A_pos,
+                  const AccessorRO& A_crd,
+                  const AccessorRW& A_vals,
+                  const AccessorRO, 1>& mask_pos,
+                  const AccessorRO& mask_crd,
+                  const AccessorRO& value,
+                  const Rect<1>& rect)
+  {
+    for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) {
+      size_t j_pos_start = A_pos[row].lo;
+      size_t j_pos_end   = A_pos[row].hi + 1;
+
+      size_t m_pos_start = mask_pos[row].lo;
+      size_t m_pos_end   = mask_pos[row].hi + 1;
+
+      size_t m_pos = m_pos_start;
+      size_t j_pos = j_pos_start;
+
+      // When the if condition is satisfied, the (row, col) of A and
+      // mask match. Ideally, we would expect it to match for all
+      // elements, even though mask stores only the True elements,
+      // making its sparsity pattern differ from A.
+      // This would be the case if mask was derived from A.
+      // However, if mask has entries that are not present in A,
+      // then the else conditions will be hit.
+      // Note that we don't update the vals array in those cases,
+      // since updating vals would require changing its size
+      // a priori and hence the sparsity pattern of A, which is not
+      // supported in this task.
+
+      while (m_pos < m_pos_end && j_pos < j_pos_end) {
+        if (mask_crd[m_pos] == A_crd[j_pos]) {
+          A_vals[j_pos] = static_cast(value[0]);
+          j_pos++;
+          m_pos++;
+        } else if (mask_crd[m_pos] > A_crd[j_pos]) {
+          // this element in A is either not found in mask or is False
+          // in mask and thus not stored. This means the pointer for
+          // mask (m_pos) would have skipped ahead of the pointer
+          // for A (j_pos), so A needs to catch up; increment j_pos
+          j_pos++;
+        } else {  // mask_crd[m_pos] < A_crd[j_pos]
+          // In this case, A is ahead and mask is behind in this row,
+          // which means mask has an entry (r,c) that was not in A.
+          // Increment m_pos and let mask move ahead
+          m_pos++;
+        }
+        // when either one of the pointers reaches the end of the row,
+        // we are done, because we only update vals when (row, col)
+        // of mask and A match exactly, and if one of the pointers
+        // has reached the end of this row, the vals for this row
+        // can never be updated, so exit the loop.
+      }
+    }
+  }
+};
+
+/* static */ void CSRIndexingCSR::cpu_variant(legate::TaskContext context)
+{
+  csr_indexing_csr_template(context);
+}
+
+namespace  // unnamed
+{
+static void __attribute__((constructor)) register_tasks(void)
+{
+  CSRIndexingCSR::register_variants();
+}
+}  // namespace
+
+}  // namespace sparse
diff --git a/src/legate_sparse/array/csr/indexing.cu b/src/legate_sparse/array/csr/indexing.cu
new file mode 100644
index 00000000..25e96097
--- /dev/null
+++ b/src/legate_sparse/array/csr/indexing.cu
@@ -0,0 +1,116 @@
+/* Copyright 2022-2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "legate_sparse/array/csr/indexing.h"
+#include "legate_sparse/array/csr/indexing_template.inl"
+#include "legate_sparse/util/cuda_help.h"
+
+namespace sparse {
+
+using namespace legate;
+
+template
+__global__ void csr_indexing_csr_kernel(const size_t num_rows,
+                                        AccessorRO, 1> A_pos,
+                                        AccessorRO A_crd,
+                                        AccessorRW A_vals,
+                                        AccessorRO, 1> mask_pos,
+                                        AccessorRO mask_crd,
+                                        AccessorRO value)
+{
+  const auto idx = global_tid_1d();
+  if (idx >= num_rows) {
+    return;
+  }
+
+  size_t j_pos_start = A_pos[idx].lo;
+  size_t j_pos_end   = A_pos[idx].hi + 1;
+
+  size_t m_pos_start = mask_pos[idx].lo;
+  size_t m_pos_end   = mask_pos[idx].hi + 1;
+
+  size_t m_pos = m_pos_start;
+  size_t j_pos = j_pos_start;
+
+  // When the if condition is satisfied, the (row, col) of A and
+  // mask match. Ideally, we would expect it to match for all
+  // elements, even though mask stores only the True elements,
+  // making its sparsity pattern differ from A.
+  // This would be the case if mask was derived from A.
+  // However, if mask has entries that are not present in A,
+  // then the else conditions will be hit.
+  // Note that we don't update the vals array in those cases,
+  // since updating vals would require changing its size
+  // a priori and hence the sparsity pattern of A, which is not
+  // supported in this task.
+
+  while (m_pos < m_pos_end && j_pos < j_pos_end) {
+    if (mask_crd[m_pos] == A_crd[j_pos]) {
+      A_vals[j_pos] = static_cast(value[0]);
+      j_pos++;
+      m_pos++;
+    } else if (mask_crd[m_pos] > A_crd[j_pos]) {
+      // this element in A is either not found in mask or is False
+      // in mask and thus not stored. This means the pointer for
+      // mask (m_pos) would have skipped ahead of the pointer
+      // for A (j_pos), so A needs to catch up; increment j_pos
+      j_pos++;
+    } else {  // mask_crd[m_pos] < A_crd[j_pos]
+      // In this case, A is ahead and mask is behind in this row,
+      // which means mask has an entry (r,c) that was not in A.
+      // Increment m_pos and let mask move ahead
+      m_pos++;
+    }
+    // when either one of the pointers reaches the end of the row,
+    // we are done, because we only update vals when (row, col)
+    // of mask and A match exactly, and if one of the pointers
+    // has reached the end of this row, the vals for this row
+    // can never be updated, so exit the loop.
+  }
+}
+
+template
+struct CSRIndexingCSRImplBody {
+  using INDEX_TY = type_of;
+  using VAL_TY   = type_of;
+
+  void operator()(const AccessorRO, 1>& A_pos,
+                  const AccessorRO& A_crd,
+                  const AccessorRW& A_vals,
+                  const AccessorRO, 1>& mask_pos,
+                  const AccessorRO& mask_crd,
+                  const AccessorRO& value,
+                  const Rect<1>& rect)
+  {
+    // Get the number of rows in the matrix
+    size_t num_rows = rect.hi[0] - rect.lo[0] + 1;
+
+    auto stream = get_cached_stream();
+    auto blocks = get_num_blocks_1d(rect.volume());
+    csr_indexing_csr_kernel<<>>(
+      num_rows, A_pos, A_crd, A_vals, mask_pos, mask_crd, value);
+    LEGATE_SPARSE_CHECK_CUDA_STREAM(stream);
+  }
+};
+
+/* static */ void CSRIndexingCSR::gpu_variant(TaskContext context)
+{
+  csr_indexing_csr_template(context);
+}
+
+}  // namespace sparse
diff --git a/src/legate_sparse/array/csr/indexing.h b/src/legate_sparse/array/csr/indexing.h
new file mode 100644
index 00000000..7bd6240c
--- /dev/null
+++ b/src/legate_sparse/array/csr/indexing.h
@@ -0,0 +1,53 @@
+/* Copyright 2022-2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "legate_sparse/sparse.h"
+#include "legate_sparse/sparse_c.h"
+#include "legate.h"
+
+namespace sparse {
+
+struct CSRIndexingCSRArgs {
+  const legate::PhysicalStore& A_vals;
+  const legate::PhysicalStore& A_pos;
+  const legate::PhysicalStore& A_crd;
+  const legate::PhysicalStore& key_pos;
+  const legate::PhysicalStore& key_crd;
+  const legate::PhysicalStore& value;
+};
+
+class CSRIndexingCSR : public SparseTask {
+ public:
+  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR};
+
+  // TODO: The implementations of the three variants below are
+  // identical and hence should be templated (DRY)
+
+ public:
+  static void cpu_variant(legate::TaskContext context);
+
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext context);
+#endif
+
+#ifdef LEGATE_USE_CUDA
+  static void gpu_variant(legate::TaskContext context);
+#endif
+};
+
+}  // namespace sparse
diff --git a/src/legate_sparse/array/csr/indexing_omp.cc b/src/legate_sparse/array/csr/indexing_omp.cc
new file mode 100644
index 00000000..c429481f
--- /dev/null
+++ b/src/legate_sparse/array/csr/indexing_omp.cc
@@ -0,0 +1,93 @@
+/* Copyright 2022-2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "legate_sparse/array/csr/indexing.h"
+#include "legate_sparse/array/csr/indexing_template.inl"
+
+namespace sparse {
+
+using namespace legate;
+
+template
+struct CSRIndexingCSRImplBody {
+  using INDEX_TY = type_of;
+  using VAL_TY   = type_of;
+
+  void operator()(const AccessorRO, 1>& A_pos,
+                  const AccessorRO& A_crd,
+                  const AccessorRW& A_vals,
+                  const AccessorRO, 1>& mask_pos,
+                  const AccessorRO& mask_crd,
+                  const AccessorRO& value,
+                  const Rect<1>& rect)
+  {
+#pragma omp parallel for
+    for (size_t row = rect.lo[0]; row < rect.hi[0] + 1; row++) {
+      size_t j_pos_start = A_pos[row].lo;
+      size_t j_pos_end   = A_pos[row].hi + 1;
+
+      size_t m_pos_start = mask_pos[row].lo;
+      size_t m_pos_end   = mask_pos[row].hi + 1;
+
+      size_t m_pos = m_pos_start;
+      size_t j_pos = j_pos_start;
+
+      // When the if condition is satisfied, the (row, col) of A and
+      // mask match. Ideally, we would expect it to match for all
+      // elements, even though mask stores only the True elements,
+      // making its sparsity pattern differ from A.
+      // This would be the case if mask was derived from A.
+      // However, if mask has entries that are not present in A,
+      // then the else conditions will be hit.
+      // Note that we don't update the vals array in those cases,
+      // since updating vals would require changing its size
+      // a priori and hence the sparsity pattern of A, which is not
+      // supported in this task.
+
+      while (m_pos < m_pos_end && j_pos < j_pos_end) {
+        if (mask_crd[m_pos] == A_crd[j_pos]) {
+          A_vals[j_pos] = static_cast(value[0]);
+          j_pos++;
+          m_pos++;
+        } else if (mask_crd[m_pos] > A_crd[j_pos]) {
+          // this element in A is either not found in mask or is False
+          // in mask and thus not stored. This means the pointer for
+          // mask (m_pos) would have skipped ahead of the pointer
+          // for A (j_pos), so A needs to catch up; increment j_pos
+          j_pos++;
+        } else {  // mask_crd[m_pos] < A_crd[j_pos]
+          // In this case, A is ahead and mask is behind in this row,
+          // which means mask has an entry (r,c) that was not in A.
+          // Increment m_pos and let mask move ahead
+          m_pos++;
+        }
+        // when either one of the pointers reaches the end of the row,
+        // we are done, because we only update vals when (row, col)
+        // of mask and A match exactly, and if one of the pointers
+        // has reached the end of this row, the vals for this row
+        // can never be updated, so exit the loop.
+      }
+    }
+  }
+};
+
+/* static */ void CSRIndexingCSR::omp_variant(TaskContext context)
+{
+  csr_indexing_csr_template(context);
+}
+
+}  // namespace sparse
diff --git a/src/legate_sparse/array/csr/indexing_template.inl b/src/legate_sparse/array/csr/indexing_template.inl
new file mode 100644
index 00000000..381ca45b
--- /dev/null
+++ b/src/legate_sparse/array/csr/indexing_template.inl
@@ -0,0 +1,68 @@
+/* Copyright 2022-2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "legate_sparse/array/csr/indexing.h"
+#include "legate_sparse/util/dispatch.h"
+
+namespace sparse {
+
+using namespace legate;
+
+template
+struct CSRIndexingCSRImplBody;
+
+template
+struct CSRIndexingCSRImpl {
+  template
+  void operator()(const CSRIndexingCSRArgs& args)
+  {
+    using INDEX_TY = type_of;
+    using VAL_TY   = type_of;
+
+    auto A_pos  = args.A_pos.read_accessor, 1>();
+    auto A_crd  = args.A_crd.read_accessor();
+    auto A_vals = args.A_vals.read_write_accessor();
+
+    auto key_pos = args.key_pos.read_accessor, 1>();
+    auto key_crd = args.key_crd.read_accessor();
+
+    auto value = args.value.read_accessor();
+
+    // TODO: Rect is based on A_pos.shape, is that correct?
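+    // Illustration (hypothetical values): for a row where A stores
+    // columns [1, 4, 7] and the mask stores columns [4, 9], the
+    // two-pointer walk in the impl body updates A_vals only at
+    // column 4; column 9 exists in the mask but not in A, so it is
+    // skipped and A's sparsity pattern is left unchanged.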
+ CSRIndexingCSRImplBody()( + A_pos, A_crd, A_vals, key_pos, key_crd, value, args.A_pos.shape<1>()); + } +}; + +template +static void csr_indexing_csr_template(TaskContext context) +{ + CSRIndexingCSRArgs args{ + context.outputs()[0], + context.inputs()[0], + context.inputs()[1], + context.inputs()[2], + context.inputs()[3], + context.inputs()[4], // value + }; + + index_type_value_type_dispatch( + args.A_crd.code(), args.A_vals.code(), CSRIndexingCSRImpl(), args); +} + +} // namespace sparse diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc index fe73ddca..47ed6d34 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc @@ -17,6 +17,7 @@ #include "legate_sparse/array/csr/spgemm_csr_csr_csr.h" #include "legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl" #include "legate_sparse/util/thrust_allocator.h" +#include "legate_sparse/util/legate_utils.h" #include @@ -49,8 +50,8 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { // Next, initialize the deferred buffers ourselves, instead of using // Realm fills (which tend to be slower). - auto index_list_buf = legate::create_buffer(A2_dim, Memory::SYSTEM_MEM); - auto already_set_buf = legate::create_buffer(A2_dim, Memory::SYSTEM_MEM); + auto index_list_buf = CREATE_BUFFER(INDEX_TY, A2_dim, Memory::SYSTEM_MEM, "index_list_buf"); + auto already_set_buf = CREATE_BUFFER(bool, A2_dim, Memory::SYSTEM_MEM, "already_set_buf"); for (INDEX_TY i = 0; i < A2_dim; i++) { index_list_buf[i] = 0; already_set_buf[i] = false; @@ -121,9 +122,9 @@ struct SpGEMMCSRxCSRxCSRImplBody { // Next, initialize the deferred buffers ourselves, instead of using // Realm fills (which tend to be slower). - auto index_list_buf = legate::create_buffer(A2_dim, Memory::SYSTEM_MEM); - auto already_set_buf = legate::create_buffer(A2_dim, Memory::SYSTEM_MEM); - auto workspace_buf = legate::create_buffer(A2_dim, Memory::SYSTEM_MEM); + auto index_list_buf = CREATE_BUFFER(INDEX_TY, A2_dim, Memory::SYSTEM_MEM, "index_list_buf"); + auto already_set_buf = CREATE_BUFFER(bool, A2_dim, Memory::SYSTEM_MEM, "already_set_buf"); + auto workspace_buf = CREATE_BUFFER(VAL_TY, A2_dim, Memory::SYSTEM_MEM, "workspace_buf"); for (INDEX_TY i = 0; i < A2_dim; i++) { index_list_buf[i] = 0; already_set_buf[i] = false; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu index c26696f6..66827db6 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cu @@ -19,6 +19,7 @@ #include "legate_sparse/util/dispatch.h" #include "legate_sparse/util/legate_utils.h" #include "legate_sparse/util/thrust_allocator.h" +#include "legate_sparse/util/legate_utils.h" #include @@ -45,7 +46,7 @@ int64_t local_offset_from_nnz(ncclComm_t comm, coord_t task_id, coord_t task_num ThrustAllocator alloc(Memory::GPU_FB_MEM); auto stream = get_cached_stream(); auto policy = thrust::cuda::par(alloc).on(stream); - auto buf = legate::create_buffer(task_num, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(int64_t, task_num, Memory::GPU_FB_MEM, "nnz_reduce_buf"); auto nnz_reduce_buf = buf.ptr(0); // Pageable memory @@ -121,8 +122,9 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { } // Convert the pos arrays into local indptr arrays. 
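(For reference, the pos-to-indptr conversion performed by the hunk below amounts to the following; a Python sketch with invented values, and the real kernel presumably also rebases by the partition's starting offset:)

    pos = [(0, 2), (3, 5)]        # inclusive [lo, hi] ranges, one per row
    indptr = [0]
    for lo, hi in pos:
        indptr.append(indptr[-1] + (hi - lo + 1))
    assert indptr == [0, 3, 6]    # CSR row offsets expected by cuSPARSE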
- auto B_indptr = legate::create_buffer(B_rows + 1, Memory::GPU_FB_MEM); - auto C_indptr = legate::create_buffer(C_rows + 1, Memory::GPU_FB_MEM); + auto B_indptr = CREATE_BUFFER(int32_t, B_rows + 1, Memory::GPU_FB_MEM, "B_indptr"); + auto C_indptr = CREATE_BUFFER(int32_t, C_rows + 1, Memory::GPU_FB_MEM, "C_indptr"); + std::vector tmem(1000, 0); { auto blocks = get_num_blocks_1d(B_rows); @@ -136,12 +138,12 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { } auto B_crd_int = - legate::create_buffer(B_crd.domain().get_volume(), Memory::GPU_FB_MEM); + CREATE_BUFFER(int32_t, B_crd.domain().get_volume(), Memory::GPU_FB_MEM, "B_crd_int"); // Importantly, don't use the volume for C, as the image optimization // is being applied. Compute an upper bound on the volume directly. auto C_nnz = C_crd.domain().hi()[0] - C_crd.domain().lo()[0] + 1; - auto C_crd_int = legate::create_buffer(C_nnz, Memory::GPU_FB_MEM); + auto C_crd_int = CREATE_BUFFER(int32_t, C_nnz, Memory::GPU_FB_MEM, "C_crd_int"); { auto dom = B_crd.domain(); auto elems = dom.get_volume(); @@ -235,7 +237,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { nullptr)); void* buffer1 = nullptr; if (bufferSize1 > 0) { - auto buf = legate::create_buffer(bufferSize1, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(char, bufferSize1, Memory::GPU_FB_MEM, "buffer1"); buffer1 = buf.ptr(0); } CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, @@ -266,7 +268,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { nullptr)); void* buffer2 = nullptr; if (bufferSize2 > 0) { - auto buf = legate::create_buffer(bufferSize2, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(char, bufferSize2, Memory::GPU_FB_MEM, "buffer2"); buffer2 = buf.ptr(0); } CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, @@ -285,17 +287,21 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { // Allocate buffers for the 32-bit version of the A matrix. int64_t A_rows, A_cols, A_nnz; CHECK_CUSPARSE(cusparseSpMatGetSize(cusparse_A, &A_rows, &A_cols, &A_nnz)); - auto A_indptr = legate::create_buffer(A_rows + 1, Memory::GPU_FB_MEM); + auto A_indptr = CREATE_BUFFER(int32_t, A_rows + 1, Memory::GPU_FB_MEM, "A_indptr"); // Handle the creation of the A_crd buffer depending on whether the result // type is the type of data we are supposed to create. 
legate::Buffer A_crd_int; if constexpr (INDEX_CODE == Type::Code::INT32) { A_crd_int = A_crd.create_output_buffer(A_nnz, true /* return_buffer */); + LOG_BUFFER(INDEX_TY, A_nnz, "A matrix coordinates (create_output_buffer)"); } else { A_crd_int = legate::Buffer( create_1d_extents(0, A_nnz - 1), Memory::GPU_FB_MEM, NULL, BUFFER_DEFAULT_ALIGNMENT); + LOG_BUFFER(int32_t, A_nnz, "A matrix coordinates (create_output_buffer)"); } auto A_vals_acc = A_vals.create_output_buffer(A_nnz, true /* return_buffer */); + LOG_BUFFER(VAL_TY, A_nnz, "A matrix values (create_output_buffer)"); + CHECK_CUSPARSE( cusparseCsrSetPointers(cusparse_A, A_indptr.ptr(0), A_crd_int.ptr(0), A_vals_acc.ptr(0))); CHECK_CUSPARSE(cusparseSpGEMM_copy(handle, @@ -314,6 +320,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { if constexpr (INDEX_CODE != Type::Code::INT32) { auto blocks = get_num_blocks_1d(A_nnz); auto buf = A_crd.create_output_buffer(A_nnz, true /* return_buffer */); + LOG_BUFFER(INDEX_TY, A_nnz, "A matrix coordinates casting (output buffer)"); cast <<>>(A_nnz, buf.ptr(0), A_crd_int.ptr(0)); } @@ -356,7 +363,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { nullptr)); void* buffer1 = nullptr; if (bufferSize1 > 0) { - auto buf = legate::create_buffer(bufferSize1, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(char, bufferSize1, Memory::GPU_FB_MEM, "buffer1"); buffer1 = buf.ptr(0); } CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, @@ -390,7 +397,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { nullptr)); void* buffer3 = nullptr; if (bufferSize3 > 0) { - auto buf = legate::create_buffer(bufferSize3, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(char, bufferSize3, Memory::GPU_FB_MEM, "buffer3"); buffer3 = buf.ptr(0); } @@ -412,7 +419,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { void* buffer2 = nullptr; if (bufferSize2 > 0) { - auto buf = legate::create_buffer(bufferSize2, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(char, bufferSize2, Memory::GPU_FB_MEM, "buffer2"); buffer2 = buf.ptr(0); } @@ -431,17 +438,21 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { buffer2)); // Allocate buffers for the 32-bit version of the A matrix. CHECK_CUSPARSE(cusparseSpMatGetSize(cusparse_A, &A_rows, &A_cols, &A_nnz)); - auto A_indptr = legate::create_buffer(A_rows + 1, Memory::GPU_FB_MEM); + auto A_indptr = CREATE_BUFFER(int32_t, A_rows + 1, Memory::GPU_FB_MEM, "A_indptr"); // Handle the creation of the A_crd buffer depending on whether the result // type is the type of data we are supposed to create. 
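(The INT32 special-casing below reflects that this SpGEMM path drives cuSPARSE with 32-bit index buffers: when the task's index type is already int32, the result coordinates are written straight into the output buffer; otherwise they land in a temporary int32 buffer and are widened by the cast kernel further down, conceptually A_crd[i] = INDEX_TY(A_crd_int[i]) for all i < A_nnz.)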
legate::Buffer A_crd_int; if constexpr (INDEX_CODE == Type::Code::INT32) { A_crd_int = A_crd.create_output_buffer(A_nnz, true /* return_buffer */); + LOG_BUFFER(INDEX_TY, A_nnz, "A matrix coordinates (create_output_buffer)"); } else { A_crd_int = legate::Buffer( create_1d_extents(0, A_nnz - 1), Memory::GPU_FB_MEM, NULL, BUFFER_DEFAULT_ALIGNMENT); + LOG_BUFFER(int32_t, A_nnz, "A matrix coordinates (create_output_buffer)"); } auto A_vals_acc = A_vals.create_output_buffer(A_nnz, true /* return_buffer */); + LOG_BUFFER(VAL_TY, A_nnz, "A matrix values (create_output_buffer)"); + CHECK_CUSPARSE( cusparseCsrSetPointers(cusparse_A, A_indptr.ptr(0), A_crd_int.ptr(0), A_vals_acc.ptr(0))); CHECK_CUSPARSE(cusparseSpGEMM_copy(handle, @@ -478,6 +489,7 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { if constexpr (INDEX_CODE != Type::Code::INT32) { auto blocks = get_num_blocks_1d(A_nnz); auto buf = A_crd.create_output_buffer(A_nnz, true /* return_buffer */); + LOG_BUFFER(INDEX_TY, A_nnz, "A matrix coordinates casting (output buffer)"); cast <<>>(A_nnz, buf.ptr(0), A_crd_int.ptr(0)); } @@ -510,12 +522,12 @@ struct SpGEMMCSRxCSRxCSRGPUImpl { context.scalars()[1].value(), context.scalars()[2].value(), context.communicators()}; - index_type_value_type_dispatch(args.A_crd.code(), - args.A_vals.code(), - SpGEMMCSRxCSRxCSRGPUImpl{}, - args, - context.get_task_index()[0], - context.get_launch_domain().hi()[0]); + index_type_floating_point_value_type_dispatch(args.A_crd.code(), + args.A_vals.code(), + SpGEMMCSRxCSRxCSRGPUImpl{}, + args, + context.get_task_index()[0], + context.get_launch_domain().hi()[0]); } } // namespace sparse diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc index 23bcd619..e6ac4ef6 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_omp.cc @@ -17,6 +17,7 @@ #include "legate_sparse/array/csr/spgemm_csr_csr_csr.h" #include "legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl" #include "legate_sparse/util/thrust_allocator.h" +#include "legate_sparse/util/legate_utils.h" #include #include @@ -54,8 +55,8 @@ struct SpGEMMCSRxCSRxCSRNNZImplBody { // Next, initialize the deferred buffers ourselves, instead of using // Realm fills (which tend to be slower). auto size = A2_dim * num_threads; - auto index_list_all = legate::create_buffer(size, kind); - auto already_set_all = legate::create_buffer(size, kind); + auto index_list_all = CREATE_BUFFER(INDEX_TY, size, kind, "index_list_all"); + auto already_set_all = CREATE_BUFFER(bool, size, kind, "already_set_all"); #pragma omp parallel for schedule(static) for (INDEX_TY i = 0; i < A2_dim * num_threads; i++) { index_list_all[i] = 0; @@ -127,9 +128,9 @@ struct SpGEMMCSRxCSRxCSRImplBody { // Next, initialize the deferred buffers ourselves, instead of using // Realm fills (which tend to be slower). 
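(A note on the CREATE_BUFFER and LOG_BUFFER macros used throughout these hunks: judging from the ENABLE_BUFFER_LOGGING definition added in CMakeLists.txt and the line format that scripts/memlog_parser.py matches, they presumably wrap legate::create_buffer and, when logging is compiled in, emit a record like the following; processor id, timestamp, line number, and size are invented for illustration:)

    [0 - 7f2a9c0a0700]  12.345678 {3}{legate-sparse}: Buffer allocation at spgemm_csr_csr_csr_omp.cc:58 - Size: 4096 Type: INDEX_TY Description: index_list_all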
auto size = A2_dim * num_threads; - auto index_list_all = legate::create_buffer(size, kind); - auto already_set_all = legate::create_buffer(size, kind); - auto workspace_all = legate::create_buffer(size, kind); + auto index_list_all = CREATE_BUFFER(INDEX_TY, size, kind, "index_list_all"); + auto already_set_all = CREATE_BUFFER(bool, size, kind, "already_set_all"); + auto workspace_all = CREATE_BUFFER(VAL_TY, size, kind, "workspace_all"); #pragma omp parallel for schedule(static) for (INDEX_TY i = 0; i < A2_dim * num_threads; i++) { index_list_all[i] = 0; diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl index 7d912184..c958752a 100644 --- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl +++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr_template.inl @@ -114,8 +114,7 @@ static void spgemm_csr_csr_csr_template(TaskContext context) inputs[4], inputs[5], }; - - index_type_value_type_dispatch( + index_type_floating_point_value_type_dispatch( args.A_crd.code(), args.A_vals.code(), SpGEMMCSRxCSRxCSRImpl{}, args); } diff --git a/src/legate_sparse/array/csr/spmv.cu b/src/legate_sparse/array/csr/spmv.cu index 0be4d98c..f2c5f1a1 100644 --- a/src/legate_sparse/array/csr/spmv.cu +++ b/src/legate_sparse/array/csr/spmv.cu @@ -19,6 +19,7 @@ #include "legate_sparse/util/cusparse_utils.h" #include "legate_sparse/util/legate_utils.h" #include "legate_sparse/util/dispatch.h" +#include "legate_sparse/util/legate_utils.h" #ifndef CUSPARSE_MISALIGNMENT_FIX_VERSION #define CUSPARSE_MISALIGNMENT_FIX_VERSION 12500 @@ -131,7 +132,7 @@ struct CSRSpMVRowSplitImpl { // Allocate a buffer if we need to. void* workspacePtr = nullptr; if (bufSize > 0) { - auto buf = legate::create_buffer(bufSize, Memory::GPU_FB_MEM); + auto buf = CREATE_BUFFER(char, bufSize, Memory::GPU_FB_MEM, "workspace_buf"); workspacePtr = buf.ptr(0); } @@ -165,7 +166,11 @@ struct CSRSpMVRowSplitImpl { /*static*/ void CSRSpMVRowSplit::gpu_variant(TaskContext context) { - csr_spmv_row_split_template(context); + auto inputs = context.inputs(); + CSRSpMVRowSplitArgs args{context.outputs()[0], inputs[0], inputs[1], inputs[2], inputs[3]}; + + index_type_floating_point_value_type_dispatch( + args.A_crd.code(), args.y.code(), CSRSpMVRowSplitImpl{}, args); } } // namespace sparse diff --git a/src/legate_sparse/cffi.h b/src/legate_sparse/cffi.h index afcf9227..cdd53926 100644 --- a/src/legate_sparse/cffi.h +++ b/src/legate_sparse/cffi.h @@ -32,6 +32,9 @@ enum LegateSparseOpCode { // Operations on matrices that aren't quite tensor algebra related. LEGATE_SPARSE_CSR_DIAGONAL, + // Indexing a CSR matrix with another CSR matrix + LEGATE_SPARSE_CSR_INDEXING_CSR, + // Linear algebra operations LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT, LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ, @@ -42,6 +45,9 @@ enum LegateSparseOpCode { // like iterative linear solvers. LEGATE_SPARSE_AXPBY, + // nonzero API + LEGATE_SPARSE_NONZERO, + // Utility tasks. 
LEGATE_SPARSE_ZIP_TO_RECT_1, LEGATE_SPARSE_UNZIP_RECT_1, @@ -56,10 +62,4 @@ enum LegateSparseOpCode { LEGATE_SPARSE_LAST_TASK, // must be last }; -enum LegateSparseProjectionFunctors { - _LEGATE_SPARSE_PROJ_FN_BASE = 0, - LEGATE_SPARSE_PROJ_FN_1D_TO_2D, - LEGATE_SPARSE_LAST_PROJ_FN, // must be last -}; - #endif // __LEGATE_SPARSE_CFFI_H__ diff --git a/src/legate_sparse/io/mtx_to_coo.cc b/src/legate_sparse/io/mtx_to_coo.cc index 6bc52af6..d2c85667 100644 --- a/src/legate_sparse/io/mtx_to_coo.cc +++ b/src/legate_sparse/io/mtx_to_coo.cc @@ -15,6 +15,7 @@ */ #include "legate_sparse/io/mtx_to_coo.h" +#include "legate_sparse/util/legate_utils.h" #include @@ -106,9 +107,12 @@ using val_ty = double; bufSize *= 2; } - auto row_acc = rows.data().create_output_buffer(bufSize, true /* return_data */); - auto col_acc = cols.data().create_output_buffer(bufSize, true /* return_data */); + auto row_acc = rows.data().create_output_buffer(bufSize, true /* return_data */); + LOG_BUFFER(coord_ty, bufSize, "IO: rows"); + auto col_acc = cols.data().create_output_buffer(bufSize, true /* return_data */); + LOG_BUFFER(coord_ty, bufSize, "IO: cols"); auto vals_acc = vals.data().create_output_buffer(bufSize, true /* return_data */); + LOG_BUFFER(val_ty, bufSize, "IO: vals"); size_t idx = 0; while (std::getline(file, line)) { diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc index 0f729324..43ed8b14 100644 --- a/src/legate_sparse/mapper/mapper.cc +++ b/src/legate_sparse/mapper/mapper.cc @@ -54,11 +54,11 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& // GPU variant has two buffers with pre-determined size and // another one based on output from cuSparse // For the default spmv algorithm using csr format, cuSparse - // could allocate ceil(nnz / nthreads_per_block ) * sizeof(double) bytes - // of temporary memory. Since this expression could change in the future, - // we use this estimate and use a factor of safety to sheild us from - // mapper errors while noting that nthreads_per_block is 128 on newer - // GPUs and 32 on older ones. + // could allocate ceil(nnz / nthreads_per_block ) * sizeof(double) + // bytes of temporary memory. Since this expression could change in the + // future, we use this estimate and use a factor of safety to shield us + // from mapper errors while noting that nthreads_per_block is 128 on + // newer GPUs and 32 on older ones. auto pos = task.inputs()[0]; auto crd = task.inputs()[1]; @@ -84,7 +84,8 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& // Allocations done in the omp version: // (1) Three arrays of types bool, index_ty, val_ty and of // size: (max_col - min_col of c) * nthreads - // (2) Extra storage from thrust::minmax_element(). Use O(1) words, say, 2? + // (2) Extra storage from thrust::minmax_element(). Use O(1) words, say, + // 2? // For the first one, if we assume that datatype size is 17 bytes per word // (1 for bool, and 8 each for index and val types), @@ -111,15 +112,6 @@ std::optional LegateSparseMapper::allocation_pool_size(const Task& return std::nullopt; } - case LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES: { - // Two temp buffers explictly created but there are a number of thrust - // calls. Thrust could create temp buffer for these whose sizes can't be - // predicted.
We are going to use unbounded pool size until - // we can estimate the pool size better - - return std::nullopt; - } - case LEGATE_SPARSE_READ_MTX_TO_COO: { // Three output buffers created but size depends on the file, // so we cannot estimate upper bound diff --git a/src/legate_sparse/projections.cc b/src/legate_sparse/projections.cc deleted file mode 100644 index 9ddf418f..00000000 --- a/src/legate_sparse/projections.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright 2022-2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#include "projections.h" - -using namespace Legion; - -namespace sparse { - -Legion::LogicalRegion Promote1Dto2DFunctor::project(Legion::LogicalPartition upper_bound, - const Legion::DomainPoint& input, - const Legion::Domain& launch_domain) -{ - auto color_space = - this->runtime->get_index_partition_color_space(upper_bound.get_index_partition()); - assert(input.dim == 1 && launch_domain.dim == 1); - assert(color_space.dim == 2); - assert(color_space.lo()[0] == 0 && color_space.lo()[1] == 0); - int64_t jdim = color_space.hi()[1] + 1; - - // Project our input point onto the mxn grid. - int64_t idx = input[0]; - int64_t i = idx / jdim; - int64_t j = idx % jdim; - auto output = Point<2>{i, j}; - assert(color_space.contains(output)); - return runtime->get_logical_subregion_by_color(upper_bound, output); -} - -Functor1DToRowsImplicit2D::Functor1DToRowsImplicit2D(Legion::Runtime* rt, - int32_t gx, - int32_t gy, - bool rows) - : LegateSparseProjectionFunctor(rt), gx(gx), gy(gy), rows(rows) -{ -} - -Legion::LogicalRegion Functor1DToRowsImplicit2D::project(Legion::LogicalPartition upper_bound, - const Legion::DomainPoint& input, - const Legion::Domain& launch_domain) -{ - int64_t idx = input[0]; - int64_t i = idx / gy; - int64_t j = idx % gy; - // Assume we're just projecting onto the rows right now. - if (rows) { - return runtime->get_logical_subregion_by_color(upper_bound, i); - } else { - return runtime->get_logical_subregion_by_color(upper_bound, j); - } -} - -void register_legate_sparse_1d_to_2d_functor(legion_projection_id_t proj_id, - int32_t gx, - int32_t gy, - bool rows) -{ - auto rt = Legion::Runtime::get_runtime(); - rt->register_projection_functor( - proj_id, new sparse::Functor1DToRowsImplicit2D(rt, gx, gy, rows), true /* silence_warnings */); -} - -} // namespace sparse - -extern "C" { -} diff --git a/src/legate_sparse/projections.h b/src/legate_sparse/projections.h deleted file mode 100644 index 20a9ac23..00000000 --- a/src/legate_sparse/projections.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright 2022-2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -#pragma once - -#include "legate_sparse/sparse_c.h" -#include "legate.h" - -namespace sparse { - -// LegateSparseProjectionFunctor is a base class for projection functors. -class LegateSparseProjectionFunctor : public Legion::ProjectionFunctor { - public: - LegateSparseProjectionFunctor(Legion::Runtime* rt) : Legion::ProjectionFunctor(rt) {} - bool is_functional(void) const override { return true; } - bool is_exclusive(void) const override { return true; } - unsigned get_depth(void) const override { return 0; } -}; - -// Promote1Dto2DFunctor is a projection functor that up-casts an index -// into a point in a 2-D color space. -class Promote1Dto2DFunctor : public LegateSparseProjectionFunctor { - public: - Promote1Dto2DFunctor(Legion::Runtime* rt) : LegateSparseProjectionFunctor(rt) {} - - public: - Legion::LogicalRegion project(Legion::LogicalPartition upper_bound, - const Legion::DomainPoint& point, - const Legion::Domain& launch_domain) override; -}; - -// Functor1DToRowsImplicit2D is a functor that is similar to Promote1Dto2DFunctor -// but works around legate internal partitioning that internally translates -// multi-dimensional tiled colorings into tilings over 1-D color spaces. -class Functor1DToRowsImplicit2D : public LegateSparseProjectionFunctor { - public: - Functor1DToRowsImplicit2D(Legion::Runtime* rt, int32_t gx, int32_t gy, bool rows); - Legion::LogicalRegion project(Legion::LogicalPartition upper_bound, - const Legion::DomainPoint& point, - const Legion::Domain& launch_domain) override; - - private: - int32_t gx, gy; - bool rows = true; -}; - -void register_legate_sparse_1d_to_2d_functor(legion_projection_id_t proj_id, - int32_t gx, - int32_t gy, - bool rows); - -} // namespace sparse diff --git a/src/legate_sparse/sparse.cc b/src/legate_sparse/sparse.cc index d502db66..1651f94e 100644 --- a/src/legate_sparse/sparse.cc +++ b/src/legate_sparse/sparse.cc @@ -15,7 +15,6 @@ */ #include "legate_sparse/sparse.h" -#include "projections.h" #include "legate_sparse/mapper/mapper.h" @@ -40,10 +39,11 @@ void registration_callback() // to keep this in line with the Python import since there seems to be a // cyclic dependency. // config.max_tasks = LEGATE_SPARSE_LAST_TASK; - // config.max_projections = LEGATE_SPARSE_LAST_PROJ_FN; config.max_tasks = 100; + // SJ; Sat 11 Jan 2025 01:16:31 PM PST + // Do we need to set max_projections if we don't use any projection functors? // TODO (rohany): We're dynamically generating projections... How does cupynumeric handle this? 
- config.max_projections = 1000; + // config.max_projections = 1000; auto options = VariantOptions{}.with_has_allocations(false); auto ctx = Runtime::get_runtime()->create_library( @@ -53,11 +53,6 @@ void registration_callback() {{VariantCode::CPU, options}, {VariantCode::GPU, options}, {VariantCode::OMP, options}}); Sparse::get_registrar().register_all_tasks(ctx); - - auto runtime = Legion::Runtime::get_runtime(); - auto proj_id = ctx.get_projection_id(LEGATE_SPARSE_PROJ_FN_1D_TO_2D); - auto functor = new Promote1Dto2DFunctor(runtime); - runtime->register_projection_functor(proj_id, functor, true /*silence warnings*/); } } // namespace sparse diff --git a/src/legate_sparse/util/cusparse_utils.h b/src/legate_sparse/util/cusparse_utils.h index 0a0b0c90..6d496a3e 100644 --- a/src/legate_sparse/util/cusparse_utils.h +++ b/src/legate_sparse/util/cusparse_utils.h @@ -20,6 +20,8 @@ namespace sparse { +using namespace legate; + // All of our indices are 0 based. const cusparseIndexBase_t index_base = CUSPARSE_INDEX_BASE_ZERO; @@ -29,7 +31,7 @@ constexpr std::size_t BUFFER_DEFAULT_ALIGNMENT = 16; // convertGlobalPosToLocalIndPtr converts the global pos array used for // CSR and CSC matrices into a locally indexed indptr array. template -__global__ void convertGlobalPosToLocalIndPtr(size_t rows, const Legion::Rect<1>* pos, T* indptr) +__global__ void convertGlobalPosToLocalIndPtr(size_t rows, const Rect<1>* pos, T* indptr) { const auto idx = global_tid_1d(); if (idx >= rows) { @@ -48,7 +50,7 @@ __global__ void convertGlobalPosToLocalIndPtr(size_t rows, const Legion::Rect<1> // save number of nonzeros per row in the partition based on crd image template __global__ void convertImagePosToLocalIndPtr( - size_t image_crds, const TS* crds, const Legion::Rect<1>* pos, TS low_offset, TD* indptr) + size_t image_crds, const TS* crds, const Rect<1>* pos, TS low_offset, TD* indptr) { const auto idx = global_tid_1d(); if (idx >= image_crds) { @@ -139,9 +141,9 @@ cusparseSpMatDescr_t makeCuSparseCSR(const legate::PhysicalStore& pos, auto pos_domain = pos.domain(); auto crd_domain = crd.domain(); - auto pos_acc = pos.read_accessor, 1>(); + auto pos_acc = pos.read_accessor, 1>(); size_t rows = pos_domain.get_volume(); - auto indptr = legate::create_buffer(rows + 1, Legion::Memory::GPU_FB_MEM); + auto indptr = legate::create_buffer(rows + 1, legate::Memory::GPU_FB_MEM); auto blocks = get_num_blocks_1d(rows); convertGlobalPosToLocalIndPtr<<>>( rows, pos_acc.ptr(pos_domain.lo()), indptr.ptr(0)); @@ -175,9 +177,9 @@ cusparseSpMatDescr_t makeCuSparseCSC(const legate::PhysicalStore& pos, auto pos_domain = pos.domain(); auto crd_domain = crd.domain(); - auto pos_acc = pos.read_accessor, 1>(); + auto pos_acc = pos.read_accessor, 1>(); size_t cols = pos_domain.get_volume(); - auto indptr = legate::create_buffer(cols + 1, Legion::Memory::GPU_FB_MEM); + auto indptr = legate::create_buffer(cols + 1, legate::Memory::GPU_FB_MEM); auto blocks = get_num_blocks_1d(cols); convertGlobalPosToLocalIndPtr<<>>( cols, pos_acc.ptr(pos_domain.lo()), indptr.ptr(0)); @@ -279,7 +281,7 @@ __global__ void localIndptrToNnz(size_t rows, uint64_t* out, T* in) // localIndptrToPos is a utility kernel to cast an indptr array // into a legate.sparse pos array. 
template -__global__ void localIndptrToPos(size_t rows, Legion::Rect<1>* out, T* in, int64_t offset) +__global__ void localIndptrToPos(size_t rows, Rect<1>* out, T* in, int64_t offset) { const auto idx = global_tid_1d(); if (idx >= rows) { diff --git a/src/legate_sparse/util/dispatch.h b/src/legate_sparse/util/dispatch.h index 9102c5c8..822ac52d 100644 --- a/src/legate_sparse/util/dispatch.h +++ b/src/legate_sparse/util/dispatch.h @@ -23,6 +23,35 @@ namespace sparse { +template +constexpr decltype(auto) value_type_floating_point_dispatch_from_index( + legate::Type::Code value_type, Functor f, Fnargs&&... args) +{ + // Dispatch on the supported value types, conditioned on the index types. + switch (value_type) { + case legate::Type::Code::FLOAT32: { + return f.template operator()( + std::forward(args)...); + } + case legate::Type::Code::FLOAT64: { + return f.template operator()( + std::forward(args)...); + } + case legate::Type::Code::COMPLEX64: { + return f.template operator()( + std::forward(args)...); + } + case legate::Type::Code::COMPLEX128: { + return f.template operator()( + std::forward(args)...); + } + default: break; + } + assert(false); + return f.template operator()( + std::forward(args)...); +} + template constexpr decltype(auto) value_type_dispatch_from_index(legate::Type::Code value_type, Functor f, @@ -30,6 +59,18 @@ constexpr decltype(auto) value_type_dispatch_from_index(legate::Type::Code value { // Dispatch on the supported value types, conditioned on the index types. switch (value_type) { + case legate::Type::Code::BOOL: { + return f.template operator()( + std::forward(args)...); + } + case legate::Type::Code::INT32: { + return f.template operator()( + std::forward(args)...); + } + case legate::Type::Code::INT64: { + return f.template operator()( + std::forward(args)...); + } case legate::Type::Code::FLOAT32: { return f.template operator()( std::forward(args)...); @@ -53,6 +94,30 @@ constexpr decltype(auto) value_type_dispatch_from_index(legate::Type::Code value std::forward(args)...); } +template +constexpr decltype(auto) index_type_floating_point_value_type_dispatch( + legate::Type::Code index_type, legate::Type::Code value_type, Functor f, Fnargs&&... args) +{ + // First dispatch onto the index type. + switch (index_type) { + case legate::Type::Code::INT32: { + return value_type_floating_point_dispatch_from_index(value_type, f, args...); + } + case legate::Type::Code::INT64: { + return value_type_floating_point_dispatch_from_index(value_type, f, args...); + } + default: break; + } + assert(false); + return value_type_floating_point_dispatch_from_index(value_type, f, args...); +} + template constexpr decltype(auto) index_type_value_type_dispatch(legate::Type::Code index_type, legate::Type::Code value_type, @@ -98,6 +163,39 @@ template constexpr decltype(auto) value_type_dispatch(legate::Type::Code value_type, Functor f, Fnargs&&... 
args) +{ + switch (value_type) { + case legate::Type::Code::BOOL: { + return f.template operator()(std::forward(args)...); + } + case legate::Type::Code::INT32: { + return f.template operator()(std::forward(args)...); + } + case legate::Type::Code::INT64: { + return f.template operator()(std::forward(args)...); + } + case legate::Type::Code::FLOAT32: { + return f.template operator()(std::forward(args)...); + } + case legate::Type::Code::FLOAT64: { + return f.template operator()(std::forward(args)...); + } + case legate::Type::Code::COMPLEX64: { + return f.template operator()(std::forward(args)...); + } + case legate::Type::Code::COMPLEX128: { + return f.template operator()(std::forward(args)...); + } + default: break; + } + assert(false); + return f.template operator()(std::forward(args)...); +} + +template +constexpr decltype(auto) value_type_floating_point_dispatch(legate::Type::Code value_type, + Functor f, + Fnargs&&... args) { switch (value_type) { case legate::Type::Code::FLOAT32: { diff --git a/src/legate_sparse/util/legate_utils.h b/src/legate_sparse/util/legate_utils.h index 496befb6..c965dfdc 100644 --- a/src/legate_sparse/util/legate_utils.h +++ b/src/legate_sparse/util/legate_utils.h @@ -17,6 +17,8 @@ #pragma once #include "legate/utilities/typedefs.h" +#include "realm/logging.h" +#include namespace sparse { @@ -27,4 +29,34 @@ legate::Rect<1> create_1d_extents(const T& lo, const Q& hi) return legate::Rect<1>{legate::Point<1>{lo}, legate::Point<1>{hi}}; } +inline Realm::Logger& get_logger() +{ + static Realm::Logger logger("legate-sparse"); + return logger; +} + +// Remove the path and use only the filename +#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) + +// Macros for buffer allocation logging +#if ENABLE_BUFFER_LOGGING +#define CREATE_BUFFER(T, size, mem, desc) \ + [&]() { \ + auto buf = legate::create_buffer(size, mem); \ + get_logger().print() << "Buffer allocation at " << __FILENAME__ << ":" << __LINE__ \ + << " - Size: " << size << " Type: " << #T << " Description: " << desc; \ + return buf; \ + }() +#else +#define CREATE_BUFFER(T, size, mem, desc) legate::create_buffer(size, mem) +#endif + +#if ENABLE_BUFFER_LOGGING +#define LOG_BUFFER(T, size, desc) \ + get_logger().print() << "Buffer allocation at " << __FILENAME__ << ":" << __LINE__ \ + << " - Size: " << size << " Type: " << #T << " Description: " << desc +#else +#define LOG_BUFFER(T, size, desc) +#endif + } // namespace sparse diff --git a/src/legate_sparse/util/logger.h b/src/legate_sparse/util/logger.h new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/legate_sparse/util/logger.h @@ -0,0 +1 @@ + diff --git a/src/legate_sparse/util/thrust_allocator.h b/src/legate_sparse/util/thrust_allocator.h index fc274da5..eaa22fd8 100644 --- a/src/legate_sparse/util/thrust_allocator.h +++ b/src/legate_sparse/util/thrust_allocator.h @@ -17,21 +17,25 @@ #pragma once #include "legate.h" +#include "legate_sparse/util/legate_utils.h" namespace sparse { +using namespace legate; + class ThrustAllocator : public legate::ScopedAllocator { public: using value_type = char; - ThrustAllocator(Legion::Memory::Kind kind) : legate::ScopedAllocator(kind) {} + ThrustAllocator(Memory::Kind kind) : legate::ScopedAllocator(kind) {} char* allocate(size_t num_bytes) { + LOG_BUFFER(char, num_bytes, "ThrustAllocator::allocate"); return static_cast(ScopedAllocator::allocate(num_bytes)); } void deallocate(char* ptr, size_t n) { ScopedAllocator::deallocate(ptr); } }; -} // namespace sparse \ No 
newline at end of file +} // namespace sparse diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 00000000..8c7ffa2f --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,59 @@ +import numpy +import pytest +from scipy import sparse as scipy_sparse +from utils.sample import simple_system_gen + +import legate_sparse as sparse + + +@pytest.fixture +def create_mask(): + """ + Create a boolean mask matrix with a random sparsity pattern + """ + + def _create_mask(rows, density=0.3): + cols = rows + nnz = int(rows * cols * density) + + # SciPy + row_idx = numpy.random.randint(0, rows, size=nnz) + col_idx = numpy.random.randint(0, cols, size=nnz) + data = numpy.ones(nnz, dtype=bool) + A_scipy = scipy_sparse.csr_array((data, (row_idx, col_idx)), shape=(rows, cols)) + + # Sparse + A_sparse = sparse.csr_array(A_scipy.todense()) + + # Verify matrices are equivalent + A_scipy_dense = numpy.asarray(A_scipy.todense()) + A_sparse_dense = numpy.asarray(A_sparse.todense()) + assert numpy.all( + numpy.allclose(A_scipy_dense, A_sparse_dense, rtol=1e-5, atol=1e-6) + ) + + return A_scipy, A_sparse + + return _create_mask + + +@pytest.fixture +def create_matrix(): + """ + Create matrices in SciPy and Legate Sparse that are equivalent + """ + + def _create_matrix(N, tol=0.5): + _, A_scipy, _ = simple_system_gen(N, N, scipy_sparse.csr_array, tol=tol) + A_sparse = sparse.csr_array(A_scipy) + + # Verify matrices are equivalent + A_scipy_dense = numpy.asarray(A_scipy.todense()) + A_sparse_dense = numpy.asarray(A_sparse.todense()) + assert numpy.all( + numpy.allclose(A_scipy_dense, A_sparse_dense, rtol=1e-5, atol=1e-6) + ) + + return A_scipy, A_sparse + + return _create_matrix diff --git a/tests/integration/test_comparison.py b/tests/integration/test_comparison.py new file mode 100644 index 00000000..03c28128 --- /dev/null +++ b/tests/integration/test_comparison.py @@ -0,0 +1,87 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import cupynumeric as np + import pytest + from utils.sample import simple_system_gen + + import legate_sparse as sparse + + # TODO: Enable "eq" after #209 is fixed + COMPARISON_OPS = [ + ("gt", lambda x, y: x > y), + ("lt", lambda x, y: x < y), + ("ge", lambda x, y: x >= y), + ("le", lambda x, y: x <= y), + # ("eq", lambda x, y: x == y), + ("ne", lambda x, y: x != y), + ] + + + @pytest.mark.parametrize("N", [8, 13]) + @pytest.mark.parametrize("threshold", [0.3, 0.5]) + @pytest.mark.parametrize("op_name, op_func", COMPARISON_OPS) + def test_comparison_operation(N, threshold, op_name, op_func): + """Test element-wise comparison operations on non-zero entries of the matrix. + + Parameters + ---------- + N : int + Size of the test matrix + threshold : float + Value to compare against + op_name : str + Name of the comparison operation + op_func : callable + The comparison function to test + """ + A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7) + + sparse_result = op_func(A_sparse, threshold) + dense_result = op_func(A_dense[A_dense != 0], threshold) + + assert sparse_result.astype(int).sum() == dense_result.astype(int).sum() + + + @pytest.mark.parametrize("op_name, op_func", COMPARISON_OPS) + def test_comparison_error_cases(op_name, op_func): + """Test error cases for comparison operations. + + Parameters + ---------- + op_name : str + Name of the comparison operation + op_func : callable + The comparison function to test + """ + N = 8 + _, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7) + + # Test comparison with non-scalar values + invalid_comparisons = [ + np.array([1, 2, 3]), # 1D array + np.array([[1, 2], [3, 4]]), # 2D array + "string", # string + [1, 2, 3], # list + ] + + for invalid_value in invalid_comparisons: + with pytest.raises(AssertionError): + op_func(A_sparse, invalid_value) + + + if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py new file mode 100644 index 00000000..80d078ae --- /dev/null +++ b/tests/integration/test_indexing.py @@ -0,0 +1,167 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupynumeric +import numpy +import pytest + +from legate_sparse import csr_matrix + + +class TestIndexingSetItem: + @pytest.mark.parametrize("N", [6, 9, 17]) + def test_incompatible_mask(self, N, create_matrix, create_mask): + """ + This test checks that the mask is applied correctly to the matrix when + the sparsity of the mask differs from that of the matrix. + While SciPy will apply the mask to all entries, Legate Sparse will only + apply the mask to the non-zero entries of the matrix, so we can't compare + to SciPy results for all entries. Instead, we check that the number of + non-zero entries is updated correctly and that the values are updated correctly.
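+ A small dense analogue of the difference (the arrays here are + illustrative assumptions, not values from the fixtures): + + import numpy + dense = numpy.array([[1.0, 0.0], [0.0, 2.0]]) + mask = numpy.array([[True, True], [False, False]]) + dense[mask] = 10.0 # NumPy/SciPy write both (0, 0) and (0, 1) + # Legate Sparse would update only (0, 0), the masked position that is + # already a stored non-zero, leaving the sparsity pattern unchanged.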
+ """ + _, A = create_matrix(N) + _, mask = create_mask(N) + + mask_dense = numpy.asarray(mask.todense()) + A_dense = numpy.asarray(A.todense()) + + value = 10.0 + A[mask] = value + + vals = A.get_data() + num_nonzeros = numpy.count_nonzero(A_dense[mask_dense]) + + # make sure the number of entries are updated correctly + num_updated = (vals == value).astype(int).sum() + assert num_updated == num_nonzeros + + # make sure the values are updated correctly + A_dense = numpy.asarray(A.todense()) + assert numpy.allclose(A_dense[mask_dense].sum() / num_nonzeros, value) + + # TODO: Add a check/test for location of nonzeros as well + + @pytest.mark.parametrize("N", [8, 13, 24]) + def test_mask_derived_from_self(self, N, create_matrix): + """ + This test checks that the mask is applied correctly to the matrix when + the sparsity of mask is derived from the matrix. Our behavior + matches that of SciPy, so we can compare against SciPy + results for all entries. + """ + A_scipy, A_sparse = create_matrix(N) + threshold = 0.2 + value = 10.0 + + # Legate operations + mask_sparse = A_sparse > threshold + A_sparse[mask_sparse] = value + + # SciPy operations + mask_scipy = A_scipy > threshold + A_scipy[mask_scipy] = value + + # Make sure scipy and legate sparse matrices are the same + A_scipy_dense = numpy.asarray(A_scipy.todense()) + A_sparse_dense = numpy.asarray(A_sparse.todense()) + assert numpy.all( + numpy.allclose(A_scipy_dense, A_sparse_dense, rtol=1e-5, atol=1e-6) + ) + + @pytest.mark.parametrize("N", [8, 13, 24]) + def test_mask_all_true(self, N, create_matrix): + """ + This test checks indexing behavior when using a mask that is all True. + Every non-zero element should be updated to the new value. + """ + _, A = create_matrix(N) + value = 10.0 + + # Create mask with same sparsity pattern as A_sparse but all True values + mask_all_true = A.copy() + mask_all_true.data = numpy.ones(A.nnz, dtype=bool) + + A[mask_all_true] = value + + # All non-zero elements should be updated to value + assert numpy.all(A.get_data() == value) + + @pytest.mark.parametrize("N", [8, 13, 24]) + def test_mask_all_false(self, N, create_matrix, create_mask): + """ + This test checks indexing behavior when using a mask that is all False. + No elements should be modified. 
+ """ + _, A = create_matrix(N) + _, mask_all_false = create_mask(N, density=0) + A_copy = A.copy() + + value = 10.0 + A[mask_all_false] = value + + # # Matrix should remain unchanged + assert numpy.all(A_copy.get_data() == A.get_data()) + + def test_random_column_order(self): + "The ordering of the matrix is random" "" + row_indices = cupynumeric.array( + [ + 2, + 4, + 5, + 3, + 5, + 1, + 1, + 5, + 5, + ] + ) + col_indices = cupynumeric.array( + [ + 3, + 1, + 2, + 2, + 5, + 1, + 4, + 1, + 3, + ] + ) + data = cupynumeric.array([7.0, 9.0, 3.0, 4.0, 5.0, 19.0, 2.0, 99.0, 109.0]) + + # note that the data in row 5 is ordered (2, 5, 1, 3),which will get + # sorted to (1, 2, 5, 3) during instantiation, which is needed for indexing + # to work correctly + + A_sparse = csr_matrix((data, (row_indices, col_indices)), shape=(6, 6)) + + mask = A_sparse > 18.0 + A_sparse[mask] = 10.0 + + data_sparse = A_sparse.get_data() + num_replaced_sparse = (data_sparse == 10.0).sum() + num_replaced_numpy = (data > 18.0).sum() + + # make sure the number of elements that needed to be replaced + # in the data array gets replaced in the sparse matrix + assert num_replaced_sparse == num_replaced_numpy + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_manual_sorting.py b/tests/integration/test_manual_sorting.py new file mode 100644 index 00000000..7d3ed282 --- /dev/null +++ b/tests/integration/test_manual_sorting.py @@ -0,0 +1,34 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupynumeric as np +import numpy +import pytest +from legate_sparse.utils import sort_by_rows_then_cols + + +def test_manual_sorting(): + N = 5 + for _ in range(N): + rows = np.random.randint(0, 100, 100) + cols = np.random.randint(0, 100, 100) + assert numpy.allclose( + sort_by_rows_then_cols(rows, cols), numpy.lexsort((cols, rows)) + ) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_nonzero.py b/tests/integration/test_nonzero.py new file mode 100644 index 00000000..147dab4f --- /dev/null +++ b/tests/integration/test_nonzero.py @@ -0,0 +1,43 @@ +# Copyright 2022-2024 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cupynumeric as np +import numpy +import pytest +from utils.sample import simple_system_gen + +import legate_sparse as sparse + + +@pytest.mark.parametrize("N", [8, 13]) +def test_nonzero(N): + """ + This test checks that the nonzero method returns the correct indices for a sparse matrix. + """ + np.random.seed(0) + A_dense, _, _ = simple_system_gen(N, N, None, tol=0.2) + + r_numpy, c_numpy = numpy.nonzero(A_dense) + + A = sparse.csr_array(A_dense) + r_scipy, c_scipy = A.nonzero() + + assert np.all(r_numpy == r_scipy) + assert np.all(c_numpy == c_scipy) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py index 69598f27..7106281c 100644 --- a/tests/integration/test_spgemm.py +++ b/tests/integration/test_spgemm.py @@ -16,6 +16,7 @@ import cupynumeric as np import pytest +from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen @@ -43,9 +44,10 @@ def test_csr_spgemm_unsupported_dtype(N, unsupported_dtype): A = banded_matrix(N, nnz_per_row).astype(unsupported_dtype) B = banded_matrix(N, nnz_per_row).astype(unsupported_dtype) - expected_exp = NotImplementedError - with pytest.raises(expected_exp): - C = A @ B # noqa: F841 + if runtime.num_gpus > 0: + expected_exp = NotImplementedError + with pytest.raises(expected_exp): + C = A @ B # noqa: F841 if __name__ == "__main__": diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py index b8a3b904..0aca095e 100644 --- a/tests/integration/test_spmv.py +++ b/tests/integration/test_spmv.py @@ -16,6 +16,7 @@ import cupynumeric as np import pytest +from legate_sparse.runtime import runtime from utils.banded_matrix import banded_matrix from utils.sample import simple_system_gen @@ -47,9 +48,10 @@ def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype): A = banded_matrix(N, nnz_per_row).astype(unsupported_dtype) x = np.ndarray((N,)) - expected_exp = NotImplementedError - with pytest.raises(expected_exp): - y = A.dot(x) # noqa: F841 + if runtime.num_gpus > 0: + expected_exp = NotImplementedError + with pytest.raises(expected_exp): + y = A.dot(x) # noqa: F841 if __name__ == "__main__": diff --git a/tests/integration/utils/sample.py b/tests/integration/utils/sample.py index 94180b1b..a201d6f2 100644 --- a/tests/integration/utils/sample.py +++ b/tests/integration/utils/sample.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import cupynumeric +import cupynumeric as np import numpy import scipy.sparse as scpy import scipy.stats as stats @@ -46,9 +46,9 @@ def sample_dense_vector(N: int, density: float, seed: int): def simple_system_gen(N, M, cls, tol=0.5): - a_dense = cupynumeric.random.rand(N, M) - x = cupynumeric.random.rand(M) - a_dense = cupynumeric.where(a_dense < tol, a_dense, 0) + a_dense = np.random.rand(N, M) + x = np.random.rand(M) + a_dense = np.where(a_dense < tol, a_dense, 0) a_sparse = None if cls is None else cls(a_dense) From 886cb7e1bb562299f69f78c0a76e4b97e18fa887 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 30 Jul 2025 12:36:51 -0700 Subject: [PATCH 2/4] v25.07.00 --- CMakeLists.txt | 2 +- README.md | 3 +- cmake/thirdparty/get_legate.cmake | 1 + cmake/versions.json | 14 +- conda/conda-build/conda_build_config.yaml | 6 +- conda/conda-build/meta.yaml | 15 +- examples/common.py | 263 +++++++- examples/matrix_power.py | 145 +++-- examples/pde.py | 45 +- examples/spgemm_microbenchmark.py | 94 +++ examples/spmv_microbenchmark.py | 70 ++ legate_sparse/__init__.py | 1 + legate_sparse/base.py | 190 +++++- legate_sparse/config.py | 85 ++- legate_sparse/csr.py | 605 ++++++++++++++++-- legate_sparse/dia.py | 215 ++++++- legate_sparse/gallery.py | 49 +- legate_sparse/install_info.pyi | 16 + legate_sparse/io.py | 32 + legate_sparse/linalg.py | 290 +++++++-- legate_sparse/module.py | 66 +- legate_sparse/types.py | 11 + legate_sparse/utils.py | 330 ++++++++-- setup.py | 5 +- src/legate_sparse/array/conv/csr_to_dense.cc | 6 +- src/legate_sparse/array/conv/csr_to_dense.h | 3 +- src/legate_sparse/array/conv/dense_to_csr.cc | 7 +- src/legate_sparse/array/conv/dense_to_csr.h | 6 +- .../array/conv/pos_to_coordinates.cc | 6 +- .../array/conv/pos_to_coordinates.h | 3 +- src/legate_sparse/array/csr/get_diagonal.cc | 7 +- src/legate_sparse/array/csr/get_diagonal.h | 3 +- src/legate_sparse/array/csr/indexing.cc | 7 +- src/legate_sparse/array/csr/indexing.h | 3 +- .../array/csr/spgemm_csr_csr_csr.cc | 7 +- .../array/csr/spgemm_csr_csr_csr.h | 9 +- src/legate_sparse/array/csr/spmv.cc | 7 +- src/legate_sparse/array/csr/spmv.h | 3 +- src/legate_sparse/array/util/scale_rect.cc | 6 +- src/legate_sparse/array/util/scale_rect.h | 3 +- src/legate_sparse/array/util/unzip_rect.cc | 6 +- src/legate_sparse/array/util/unzip_rect.h | 3 +- src/legate_sparse/array/util/zip_to_rect.cc | 6 +- src/legate_sparse/array/util/zip_to_rect.h | 3 +- src/legate_sparse/cudalibs.cu | 12 +- src/legate_sparse/io/mtx_to_coo.cc | 20 +- src/legate_sparse/io/mtx_to_coo.h | 3 +- src/legate_sparse/linalg/axpby.cc | 6 +- src/legate_sparse/linalg/axpby.h | 3 +- src/legate_sparse/mapper/mapper.cc | 20 +- .../partition/fast_image_partition.cc | 8 +- .../partition/fast_image_partition.h | 3 +- src/legate_sparse/util/upcast_future.cc | 8 +- src/legate_sparse/util/upcast_future.h | 3 +- test.py | 3 +- tests/integration/conftest.py | 54 +- tests/integration/test_cg_solve.py | 74 ++- tests/integration/test_comparison.py | 46 +- tests/integration/test_diagonal.py | 30 + tests/integration/test_gmres_solve.py | 22 + tests/integration/test_indexing.py | 107 +++- tests/integration/test_io.py | 48 ++ tests/integration/test_spgemm.py | 42 ++ tests/integration/test_spmv.py | 51 ++ tests/integration/utils/banded_matrix.py | 51 +- tests/integration/utils/sample.py | 128 ++++ 66 files changed, 3035 insertions(+), 364 deletions(-) create mode 100644 legate_sparse/install_info.pyi diff --git a/CMakeLists.txt b/CMakeLists.txt index 82e98886..c32254c3 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ include(rapids-find) ################################### # Project -set(legate_sparse_version 25.03.00) +set(legate_sparse_version 25.07.00) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CUDA_FLAGS_DEBUG "-O0 -g") diff --git a/README.md b/README.md index 5eb4af78..b03ea8c6 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,7 @@ for [NumPy](https://numpy.org/doc/stable/reference/index.html#reference), to enable writing programs that operate on distributed dense and sparse arrays. Take a look at the `examples` directory for some applications that can use Legate Sparse. We have implemented -an explicit partial-differential equation (PDE) [solver](examples/pde.py) -and [Geometric multi-grid](examples/gmg.py) solver. +an explicit partial-differential equation (PDE) [solver](examples/pde.py). More complex and interesting applications are on the way -- stay tuned! Legate Sparse is currently in alpha and supports a subset of APIs diff --git a/cmake/thirdparty/get_legate.cmake b/cmake/thirdparty/get_legate.cmake index 709577a1..727671fd 100644 --- a/cmake/thirdparty/get_legate.cmake +++ b/cmake/thirdparty/get_legate.cmake @@ -24,6 +24,7 @@ function(find_or_configure_legate) include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") rapids_cpm_package_details(legate version git_repo git_branch shallow exclude_from_all) + set(version ${PKG_VERSION}) set(exclude_from_all ${PKG_EXCLUDE_FROM_ALL}) if(PKG_BRANCH) set(git_branch "${PKG_BRANCH}") diff --git a/cmake/versions.json b/cmake/versions.json index 6c8498d4..6c5440f4 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -3,22 +3,22 @@ "legate" : { "repo": "legate.internal", "org": "nv-legate", - "version": "25.03.02", - "git_url" : "git@github.com:nv-legate/legate.internal.git", + "version": "25.07.00", + "git_url" : "git@github.com:nv-legate/legate.git", "git_shallow": false, "always_download": false, - "git_tag" : "75dc0a92bbd2dfb79b6b680a0f37cbd0370d0181", + "git_tag" : "a46dc3d5b176ff9546bc831409c394c1bbc3b936", "anaconda_label": "main" }, "cupynumeric" : { "repo": "cupynumeric.internal", "org": "nv-legate", - "version": "25.03.02", - "git_url" : "git@github.com:nv-legate/cupynumeric.internal", + "version": "25.07.00", + "git_url" : "git@github.com:nv-legate/cupynumeric", "git_shallow": false, "always_download": false, - "git_tag" : "1fa45603c560068508c3be2e0df45aec62359019", - "anaconda_label": "experimental" + "git_tag" : "6132d8450049a7abd7786fb4d60444eb5b4e25db", + "anaconda_label": "main" } } } diff --git a/conda/conda-build/conda_build_config.yaml b/conda/conda-build/conda_build_config.yaml index 79750a86..ada8dda2 100644 --- a/conda/conda-build/conda_build_config.yaml +++ b/conda/conda-build/conda_build_config.yaml @@ -6,12 +6,14 @@ upload_build: - false python: - - 3.10 - 3.11 - 3.12 + - 3.13 numpy_version: - - ">=1.22,<2" + # Not 2.1.0 which segfaults on asarray() sometimes, see + # https://github.com/numpy/numpy/pull/27249 + - ">=1.22,!=2.1.0" cmake_version: - ">=3.20.1,!=3.23.0" diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 03d1f9d3..9bdad28f 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -7,14 +7,6 @@ {# We need to have a default value for the initial pass over the recipe #} {% set gpu_enabled_bool = false %} {% endif %} -{% if upload_build == "true" %} - {% set upload_build_bool = true %} -{% elif upload_build == "false" %} - {% set upload_build_bool = false %} -{% else %} - {# We need to have a 
default value for the initial pass over the recipe #} - {% set upload_build_bool = false %} -{% endif %} ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. {% set placeholder_version = '0.0.0.dev' %} @@ -68,9 +60,9 @@ build: ## Create legate/cupynumeric version and build string {% set legate_version = os.environ.get("LEGATE_VERSION", "1.0.0") %} -{% set legate_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("LEGATE_BUILDSTR", ""), cpu_gpu_tag]) %} +{% set legate_buildstr = "_".join(["py" ~ py_version, "*" ~ cpu_gpu_tag.strip('_'), os.environ.get("LEGATE_BUILDSTR", "") ]) %} {% set cupynumeric_version = os.environ.get("CUPYNUMERIC_VERSION", "1.0.0") %} -{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, os.environ.get("CUPYNUMERIC_BUILDSTR", ""), cpu_gpu_tag]) %} +{% set cupynumeric_buildstr = "_".join(["cuda" ~ cuda_major, "py" ~ py_version, cpu_gpu_tag, os.environ.get("CUPYNUMERIC_BUILDSTR", "")]) %} {% if use_local_path is not defined %} # use git hash @@ -126,7 +118,7 @@ requirements: #- libcurand-dev - openblas =* =*openmp* - llvm-openmp - - legate ={{ legate_version }}={{ legate_buildstr }} + - legate ={{ legate_version }}=*{{ legate_buildstr }} - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }} {% if gpu_enabled_bool %} # cupynumeric could be only in the run section and we could have just legate @@ -146,7 +138,6 @@ requirements: - numpy {{ numpy_version }} - scipy - openblas =* =*openmp* - - legate ={{ legate_version }}={{ legate_buildstr }} - cupynumeric ={{ cupynumeric_version }}={{ cupynumeric_buildstr }} {% if gpu_enabled_bool %} - libnvjitlink diff --git a/examples/common.py b/examples/common.py index b679e5b9..99174ed6 100644 --- a/examples/common.py +++ b/examples/common.py @@ -20,6 +20,30 @@ def get_arg_number(arg): + """Parse a string argument that may contain size suffixes. + + Parameters + ---------- + arg : str + String argument that may end with 'k', 'm', or 'g' for + kilobytes, megabytes, or gigabytes respectively. + + Returns + ------- + int + The parsed number with appropriate multiplier applied. + + Examples + -------- + >>> get_arg_number("1024") + 1024 + >>> get_arg_number("1k") + 1024 + >>> get_arg_number("1m") + 1048576 + >>> get_arg_number("1g") + 1073741824 + """ multiplier = 1 arg = arg.lower() if len(arg) == 0: @@ -38,28 +62,47 @@ def get_arg_number(arg): class Timer(Protocol): + """Protocol for timer implementations. + + This protocol defines the interface that timer classes must implement + for measuring execution time in the examples. + """ + def start(self): + """Start timing.""" ... def stop(self): - """ - Blocks execution until everything before it has completed. Returns the - duration since the last call to start(), in milliseconds. + """Stop timing and return duration. + + Blocks execution until everything before it has completed. + + Returns + ------- + float + Duration since the last call to start(), in milliseconds. """ ... class LegateTimer(Timer): + """Timer implementation using Legate's timing facilities. + + This timer uses Legate's internal timing mechanism for accurate + measurement of GPU operations. 
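+ + A short usage sketch (the `A @ x` line is a stand-in for whatever + work is being measured; any operation works): + + timer = LegateTimer() + timer.start() + y = A @ x + elapsed_ms = timer.stop() # milliseconds, per the Timer protocol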
+ """ + def __init__(self): self._start = None def start(self): + """Start timing using Legate's time function.""" from legate.timing import time self._start = time() - # returns time in milliseconds def stop(self): + """Stop timing and return duration in milliseconds.""" from legate.timing import time _end = time() @@ -67,16 +110,24 @@ def stop(self): class CuPyTimer(Timer): + """Timer implementation using CuPy's CUDA events. + + This timer uses CUDA events for accurate measurement of GPU operations + in CuPy applications. + """ + def __init__(self): self._start_event = None def start(self): + """Start timing using CUDA events.""" from cupy import cuda self._start_event = cuda.Event() self._start_event.record() def stop(self): + """Stop timing and return duration in milliseconds.""" from cupy import cuda end_event = cuda.Event() @@ -86,15 +137,23 @@ def stop(self): class NumPyTimer(Timer): + """Timer implementation using Python's high-resolution timer. + + This timer uses Python's perf_counter_ns for accurate measurement + of CPU operations in NumPy/SciPy applications. + """ + def __init__(self): self._start_time = None def start(self): + """Start timing using perf_counter_ns.""" from time import perf_counter_ns self._start_time = perf_counter_ns() / 1000.0 def stop(self): + """Stop timing and return duration in milliseconds.""" from time import perf_counter_ns end_time = perf_counter_ns() / 1000.0 @@ -105,27 +164,61 @@ def stop(self): # manager so that we can run both CuPy and SciPy # programs with resource scoping. class DummyScope: + """No-op context manager for resource scoping. + + This class provides a dummy context manager that does nothing, + allowing the same code to run with both CuPy and SciPy programs + that may or may not use resource scoping. + """ + def __init__(self): ... def __enter__(self): + """Enter the context (no-op).""" ... def __exit__(self, _, __, ___): + """Exit the context (no-op).""" ... def __getitem__(self, item): + """Return self for any indexing (no-op).""" return self def count(self, _): + """Return 1 for any count operation.""" return 1 @property def preferred_kind(self): + """Return None for preferred kind.""" return None def get_phase_procs(use_legate: bool): + """Get processor configurations for different phases of computation. + + Parameters + ---------- + use_legate : bool + Whether to use Legate-specific processor configuration. + + Returns + ------- + tuple + (build_procs, solve_procs) - processor configurations for + build and solve phases respectively. + + Notes + ----- + When use_legate is True, this function queries the available + processors and assigns them to different phases: + - Build phase: Prefers CPUs, then OpenMP processors, then GPUs + - Solve phase: Prefers GPUs, then OpenMP processors, then CPUs + + When use_legate is False, returns DummyScope objects. + """ if use_legate: from legate.core import TaskTarget, get_machine @@ -160,6 +253,27 @@ def get_phase_procs(use_legate: bool): def parse_common_args(): + """Parse common command line arguments for example scripts. 
+ + Returns + ------- + tuple + (package, timer, np, sparse, linalg, use_legate) where: + - package: str - the selected package ("legate", "cupy", or "scipy") + - timer: Timer - appropriate timer implementation + - np: module - numpy/cupy/cupynumeric module + - sparse: module - sparse matrix module + - linalg: module - linear algebra module + - use_legate: bool - whether Legate is being used + + Notes + ----- + This function sets up the global environment with the appropriate + modules based on the --package argument. It supports: + - "legate": Uses cupynumeric, legate_sparse, and legate_sparse.linalg + - "cupy": Uses cupy, cupyx.scipy.sparse, and cupyx.scipy.sparse.linalg + - "scipy": Uses numpy, scipy.sparse, and scipy.sparse.linalg + """ parser = argparse.ArgumentParser() parser.add_argument( "--package", @@ -204,6 +318,44 @@ def parse_common_args(): # `diags` constructs csr from a dia array, while when from_diags=False # we construct the csr array directly - might be slightly faster def banded_matrix(N, nnz_per_row, from_diags=False): + """Construct a banded matrix with 1.0 as values. + + Parameters + ---------- + N : int + Size of the square matrix (N x N). + nnz_per_row : int + Number of non-zeros per row. Must be odd. + from_diags : bool, optional + If True, construct using sparse.diags then convert to CSR. + If False, construct CSR array directly. Default is False. + + Returns + ------- + sparse matrix + A banded matrix in CSR format with 1.0 values. + + Raises + ------ + AssertionError + If N <= nnz_per_row or nnz_per_row is not odd. + + Notes + ----- + The matrix has a banded structure with nnz_per_row non-zeros per row, + centered around the main diagonal. The direct CSR construction method + (from_diags=False) may be slightly faster than the diags method. + + Examples + -------- + >>> A = banded_matrix(5, 3) + >>> print(A.toarray()) + [[1. 1. 0. 0. 0.] + [1. 1. 1. 0. 0.] + [0. 1. 1. 1. 0.] + [0. 0. 1. 1. 1.] + [0. 0. 0. 1. 1.]] + """ if from_diags: return sparse.diags( [1] * nnz_per_row, @@ -248,6 +400,43 @@ def banded_matrix(N, nnz_per_row, from_diags=False): def stencil_grid(S, grid, dtype=None, format=None): + """Construct a sparse matrix resulting from a stencil + discretization on rectilinear grids. + + Parameters + ---------- + S : array_like + The stencil array defining the pattern of connections. + grid : tuple + Grid dimensions (e.g., (N, N) for 2D grid). + dtype : dtype, optional + Data type of the matrix. If None, uses S.dtype. + format : str, optional + Output format. If None, returns CSR format. + + Returns + ------- + sparse matrix + A sparse matrix in CSR format representing the stencil on the grid. + + Notes + ----- + This function constructs a sparse matrix that represents the application + of a stencil operator on a regular grid. The stencil defines the pattern + of connections between grid points. + + The function handles: + - Boundary conditions by zeroing connections outside the grid + - Duplicate diagonals by summing their contributions + - Conversion to CSR format for efficient operations + + Examples + -------- + >>> # 5-point stencil for 2D grid + >>> S = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]]) + >>> A = stencil_grid(S, (3, 3)) + >>> print(A.toarray()) + """ N_v = int(numpy.prod(grid)) # number of vertices in the mesh N_s = int((S != 0).sum(dtype=int)) # number of nonzero stencil entries @@ -309,6 +498,41 @@ def stencil_grid(S, grid, dtype=None, format=None): def poisson2D(N): + """Construct the 2D Poisson matrix.
+ + Parameters + ---------- + N : int + Grid size (N x N grid). + + Returns + ------- + sparse matrix + The 2D Poisson matrix in CSR format. + + Notes + ----- + This constructs the standard 5-point stencil discretization of + the 2D Poisson equation -u_xx - u_yy = f on an N x N grid. + + The matrix has the following structure: + - Main diagonal: 4.0 + - Off-diagonals: -1.0 for horizontal and vertical connections + + Examples + -------- + >>> A = poisson2D(3) + >>> print(A.toarray()) + [[ 4. -1. 0. -1. 0. 0. 0. 0. 0.] + [-1. 4. -1. 0. -1. 0. 0. 0. 0.] + [ 0. -1. 4. 0. 0. -1. 0. 0. 0.] + [-1. 0. 0. 4. -1. 0. -1. 0. 0.] + [ 0. -1. 0. -1. 4. -1. 0. -1. 0.] + [ 0. 0. -1. 0. -1. 4. 0. 0. -1.] + [ 0. 0. 0. -1. 0. 0. 4. -1. 0.] + [ 0. 0. 0. 0. -1. 0. -1. 4. -1.] + [ 0. 0. 0. 0. 0. -1. 0. -1. 4.]] + """ diag_size = N * N - 1 first = np.full((N - 1), -1.0) chunks = np.concatenate([np.zeros(1), first]) @@ -326,6 +550,37 @@ def poisson2D(N): def diffusion2D(N, epsilon=1.0, theta=0.0): + """Construct a 2D diffusion matrix with anisotropy. + + Parameters + ---------- + N : int + Grid size (N x N grid). + epsilon : float, optional + Anisotropy parameter. Default is 1.0 (isotropic). + theta : float, optional + Rotation angle in radians. Default is 0.0. + + Returns + ------- + sparse matrix + The 2D diffusion matrix in CSR format. + + Notes + ----- + This constructs a 9-point stencil for the anisotropic diffusion equation: + -div(K * grad(u)) = f + + where K is a diffusion tensor that depends on epsilon and theta. + The stencil coefficients are computed based on the rotated diffusion tensor. + + Examples + -------- + >>> # Isotropic diffusion + >>> A = diffusion2D(3, epsilon=1.0, theta=0.0) + >>> # Anisotropic diffusion + >>> A = diffusion2D(3, epsilon=0.1, theta=np.pi/4) + """ eps = float(epsilon) # for brevity theta = float(theta) diff --git a/examples/matrix_power.py b/examples/matrix_power.py index a43249c0..cc52c08b 100644 --- a/examples/matrix_power.py +++ b/examples/matrix_power.py @@ -12,9 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This example performs matrix power by repetitively multiplication. We assume -# that the matrix is square, so the number of cols is same as the number of -# rows in the matrix +"""Sparse Matrix Power Microbenchmark. + +This script benchmarks sparse matrix power computation by performing repeated +matrix multiplication (A^n) and measuring performance at each step. It supports: + +- Matrix generation with specified non-zeros per row or total non-zeros +- Configurable number of matrix multiplications (power exponent) +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nrows: Matrix size (supports k, m, g suffixes) +--nnz-per-row: Number of non-zeros per row for generated matrix +--nnz-total: Total number of non-zeros for generated matrix +--k: Number of matrix multiplications to perform +--nwarmups: Number of warmup iterations before timing +--same-sparsity-for-cpu-and-gpu: Use NumPy for consistent sparsity patterns +--random-seed: Random number seed for sparsity pattern generation +--package: Backend to use (legate, cupy, scipy) +""" import argparse from functools import reduce @@ -31,17 +47,28 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = None): - """Return a CSR matrix with a prescribed number of nonzeros in each row. - - Args: - ---- - - nrows: int - Number of rows in the matrix. 
Number of columns is same as number of rows - nnz_per_row: int - Desired number of nonzero entries in each row - dtype: npt.DTypeLike - Datatype of the values. This should be one of floating point datatypes + """Create a CSR matrix with a prescribed number of nonzeros in each row. + + Parameters + ---------- + nrows : int + Number of rows in the matrix. Number of columns is same as number of rows. + nnz_per_row : int + Desired number of nonzero entries in each row. + dtype : npt.DTypeLike, optional + Datatype of the values. Should be one of floating point datatypes. + Default is np.float32. + + Returns + ------- + sparse matrix + A CSR matrix with the specified sparsity pattern. + + Notes + ----- + This function creates a square matrix where each row has exactly + nnz_per_row non-zero entries. The column indices are randomly + generated and sorted within each row. """ dtype = np.float32 if dtype is None else dtype ncols = nrows @@ -58,18 +85,28 @@ def create_csr_with_nnz_per_row(nrows, nnz_per_row: int, dtype: npt.DTypeLike = def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): - """Return a CSR matrix with a prescribed number of nonzeros in the matrix. - - Args: - ---- - - nrows: int - Number of rows in the matrix. Number of columns is same as number of rows - nnz_total: int - Desired number of nonzero entries in the matrix with no expectation of - nonzeros in each row of the matrix - dtype: npt.DTypeLike - Datatype of the values. This should be one of floating point datatypes + """Create a CSR matrix with a prescribed total number of nonzeros. + + Parameters + ---------- + nrows : int + Number of rows in the matrix. Number of columns is same as number of rows. + nnz_total : int + Desired total number of nonzero entries in the matrix. + dtype : npt.DTypeLike, optional + Datatype of the values. Should be one of floating point datatypes. + Default is np.float32. + + Returns + ------- + sparse matrix + A CSR matrix with the specified total number of non-zeros. + + Notes + ----- + This function creates a square matrix with exactly nnz_total non-zero + entries distributed randomly across the matrix. There is no guarantee + about the number of non-zeros per row. """ dtype = np.float32 if dtype is None else dtype ncols = nrows @@ -86,21 +123,30 @@ def create_csr_with_nnz_total(nrows, nnz_total, dtype: npt.DTypeLike = None): # ------------------------ -def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4): - """Multiply matrix by self ntimes and print the time elapsed. - Args: - ---- - - A: csr_matrix - The input matrix - timer: - Instance of the timer class to measure elapsed time - ntimes: - Number of matrix multiplies or the exponent in A^n - nwarmups: - Number of warmup iterations before +def compute_A_power_k(A, timer, nwarmups: int = 2, k: int = 4): + """Compute A^k and measure performance. + + Parameters + ---------- + A : sparse matrix + The input matrix to compute A^k. + timer : Timer + Timer instance to measure elapsed time. + nwarmups : int, optional + Number of warmup iterations before timing. Default is 2. + k : int, optional + Number of matrix multiplies or the exponent in A^k. Default is 4. + + Notes + ----- + This function computes A^k by repeated matrix multiplication + and measures the time for each step. 
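A minimal sketch of the fixed nonzeros-per-row construction described above, with scipy.sparse standing in for the actual backend (the argsort shuffle is our illustration, not necessarily the generator used internally):

    import numpy as np
    import scipy.sparse as sp

    rng = np.random.default_rng(42)
    nrows = ncols = 6
    nnz_per_row = 3
    cols = np.argsort(rng.random((nrows, ncols)), axis=1)[:, :nnz_per_row]
    indices = np.sort(cols, axis=1).ravel()  # column indices sorted within each row
    indptr = np.arange(0, nrows * nnz_per_row + 1, nnz_per_row)
    vals = rng.random(nrows * nnz_per_row).astype(np.float32)
    A = sp.csr_matrix((vals, indices, indptr), shape=(nrows, ncols))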
It prints detailed timing + information including: + - Matrix dimensions and sparsity + - Time for each multiplication step + - Time for copying intermediate results + - Overall sparsity of the final result """ - timer.start() B = A.copy() elapsed_time_init_copy = timer.stop() @@ -108,10 +154,10 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) for _ in range(nwarmups): output = A @ B - elapsed_time_spgemm = [-1.0] * ntimes - elapsed_time_copy = [-1.0] * ntimes + elapsed_time_spgemm = [-1.0] * k + elapsed_time_copy = [-1.0] * k - for hop in range(ntimes): + for hop in range(k): timer.start() output = A @ B elapsed_time_spgemm[hop] = timer.stop() @@ -128,9 +174,9 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) print(f"NNZ of A : {A.nnz}") print(f"NNZ of output : {output.nnz}") print(f"Sparsity of output (%) : {sparsity_output}") - print(f"Total number of hops : {ntimes}") + print(f"Total number of hops : {k}") print(f"Elapsed time for copy in init (ms) : {elapsed_time_init_copy}") - for hop in range(ntimes): + for hop in range(k): print( f"Elapsed time for spgemm for hop {hop} (ms) : {elapsed_time_spgemm[hop]}" ) @@ -168,10 +214,10 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) ) parser.add_argument( - "--ntimes", + "--k", type=int, default=4, - dest="ntimes", + dest="k", help="Number of times A @ A is performed", ) @@ -203,8 +249,7 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) nnz_total = get_arg_number(args.nnz_total) # this is a global variable - global random_seed - global rng + global random_seed, rng random_seed = args.random_seed if args.same_sparsity_for_cpu_and_gpu: @@ -230,6 +275,6 @@ def compute_matrix_multiply_ntimes(A, timer, nwarmups: int = 2, ntimes: int = 4) print("Matrix created with number of nonzeros per row") elapsed_time_matrix_gen = timer.stop() - compute_matrix_multiply_ntimes(A, timer, int(args.nwarmups), int(args.ntimes)) + compute_A_power_k(A, timer, int(args.nwarmups), int(args.k)) print(f"Elapsed time in matrix creation (ms) : {elapsed_time_matrix_gen}") diff --git a/examples/pde.py b/examples/pde.py index 2313f799..d9ca0095 100644 --- a/examples/pde.py +++ b/examples/pde.py @@ -12,6 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Partial Differential Equation (PDE) Solver Microbenchmark. + +This script benchmarks the solution of 2D Poisson equations using sparse +linear algebra operations. It implements a finite difference discretization +with Dirichlet boundary conditions and solves the resulting linear system +using conjugate gradient iteration. 
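The shape of the compute_A_power_k benchmark loop, sketched with scipy.sparse as a stand-in (sizes and density are illustrative):

    import scipy.sparse as sp

    A = sp.random(100, 100, density=0.05, format="csr", random_state=0)
    B = A.copy()
    for hop in range(4):      # k = 4 hops
        out = A @ B           # one SpGEMM per hop, timed individually
        B = out.copy()        # the copy back into B is timed separately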
It supports: + +- 2D Poisson equation with analytical right-hand side +- Configurable mesh resolution (nx, ny grid points) +- Performance measurement of linear solver iterations +- Throughput mode for measuring solve performance only +- Convergence analysis with relative residual norms +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nx: Number of grid points along X axis +--ny: Number of grid points along Y axis +--plot: Enable residual plotting +--plot_filename: Filename for plot output +--throughput: Measure only solve iterations (requires max_iters) +--tol: Convergence tolerance for linear solver +--max-iters: Maximum number of linear solver iterations +--warmup-iters: Number of warmup iterations (for throughput mode) +--package: Backend to use (legate, cupy, scipy) +""" + # This PDE solving application is derived from # https://aquaulb.github.io/book_solving_pde_mooc/solving_pde_mooc/notebooks/05_IterativeMethods/05_01_Iteration_and_2D.html. @@ -184,7 +210,6 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, _ = A.dot(np.ones((A.shape[1],))) if throughput: - assert max_iters > warmup_iters p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=warmup_iters) max_iters = max_iters - warmup_iters print(f"max_iters has been updated to: {max_iters}") @@ -192,7 +217,10 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, timer.start() # If we're testing throughput, run only the prescribed number of iterations. if throughput: - p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) + if use_legate: + p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters) + else: + p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) else: p_sol, iters = linalg.cg(A, bflat, rtol=tol) total = timer.stop() @@ -200,9 +228,10 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, print(f"Mesh resolution : ({nx}, {ny})") print(f"Dimension of A : {A.shape}") print(f"Number of rows in A : {A.shape[0]}") + print(f"Total elapsed time (ms) : {total}") if throughput: - print(f"Total elapsed time (ms) : {total}") + print(f"Number of warmup iterations : {warmup_iters}") print(f"Max number of iterations : {max_iters}") print(f"Time per (max-)iteration (ms) : {total / max_iters}") @@ -215,9 +244,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, convergence_status = True if norm_res <= norm_ini * tol else False print(f"Did the solution converge : {convergence_status}") print(f"Final relative residual norm : {norm_res / norm_ini}") - print(f"Number of iterations : {iters}") - print(f"Total elapsed time (ms) : {total}") - print(f"Time per iteration (ms) : {total / iters}") + if iters > 0: + print(f"Number of iterations : {iters}") + print(f"Time per iteration (ms) : {total / iters}") if __name__ == "__main__": @@ -294,8 +323,8 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, args, _ = parser.parse_known_args() _, timer, np, sparse, linalg, use_legate = parse_common_args() - if args.throughput and args.max_iters is None: - print("Must provide --max-iters when using -throughput.") + if args.throughput and (args.max_iters is None or args.warmup_iters is None): + print("Must provide --max-iters and --warmup-iters when using --throughput.") sys.exit(1) execute(**vars(args), timer=timer) diff --git a/examples/spgemm_microbenchmark.py b/examples/spgemm_microbenchmark.py index 3741f400..e30c05dd 
100644 --- a/examples/spgemm_microbenchmark.py +++ b/examples/spgemm_microbenchmark.py @@ -12,17 +12,80 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Sparse Matrix-Matrix Multiplication Microbenchmark. + +This script benchmarks sparse matrix-matrix multiplication performance +with configurable matrix sizes and generation methods. It supports: + +- Banded matrix generation with specified non-zeros per row +- Loading matrices from Matrix Market files +- Stable mode for partition caching vs. fresh matrix creation +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nrows: Matrix size (supports k, m, g suffixes) +--nnz-per-row: Number of non-zeros per row for banded matrices +--stable: Enable partition caching by reusing matrices +--filename1: Load first matrix from Matrix Market file +--filename2: Load second matrix from Matrix Market file +--iters: Number of benchmark iterations +--package: Backend to use (legate, cupy, scipy) +""" + import argparse from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args def spgemm_dispatch(A, B): + """Dispatch sparse matrix-matrix multiplication operation. + + Parameters + ---------- + A : sparse matrix + First sparse matrix operand. + B : sparse matrix + Second sparse matrix operand. + + Returns + ------- + sparse matrix + The result of A @ B. + + Notes + ----- + This function performs sparse matrix-matrix multiplication using + the @ operator, which is supported by all backends (Legate, CuPy, SciPy). + """ C = A @ B return C def get_matrices(N, nnz_per_row, fname1, fname2): + """Get matrices for SpGEMM benchmark. + + Parameters + ---------- + N : int + Matrix size (N x N) for generated matrices. + nnz_per_row : int + Number of non-zeros per row for banded matrices. + fname1 : str + Filename for first matrix (empty string to generate). + fname2 : str + Filename for second matrix (empty string to use first matrix). + + Returns + ------- + tuple + (A, B) - two sparse matrices for multiplication. + + Notes + ----- + If fname1 is provided, loads matrices from Matrix Market files. + If fname2 is empty, uses the same matrix for both A and B. + Otherwise, generates banded matrices with specified parameters. + """ if fname1 != "": # Read file from matrix A = sparse.mmread(fname1) @@ -38,6 +101,37 @@ def get_matrices(N, nnz_per_row, fname1, fname2): def run_spgemm(N, nnz_per_row, fname1, fname2, iters, stable, timer): + """Run sparse matrix-matrix multiplication benchmark. + + Parameters + ---------- + N : int + Matrix size for generated matrices. + nnz_per_row : int + Number of non-zeros per row for banded matrices. + fname1 : str + Filename for first matrix. + fname2 : str + Filename for second matrix. + iters : int + Number of benchmark iterations. + stable : bool + Whether to reuse matrices (allows partition caching). + timer : Timer + Timer object for measuring performance. + + Notes + ----- + This function runs a benchmark of sparse matrix-matrix multiplication. 
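A hedged sketch of the load-or-generate logic that get_matrices implements (scipy.io stands in for the backend's mmread; banded_matrix is the helper imported from common.py above):

    import scipy.io

    def get_pair(fname1, fname2, N, nnz_per_row):
        if fname1 != "":
            A = scipy.io.mmread(fname1).tocsr()
            B = scipy.io.mmread(fname2).tocsr() if fname2 != "" else A
        else:
            A = B = banded_matrix(N, nnz_per_row)  # from common.py
        return A, B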
+ It supports two modes: + - stable=True: Reuses matrices, allowing partition caching + - stable=False: Creates fresh matrices each iteration + + The function prints: + - Matrix dimensions and non-zero counts + - Number of iterations + - Total time and time per iteration + """ warmup_iterations = 5 if stable: diff --git a/examples/spmv_microbenchmark.py b/examples/spmv_microbenchmark.py index 559ccbfc..c6f11ff8 100644 --- a/examples/spmv_microbenchmark.py +++ b/examples/spmv_microbenchmark.py @@ -12,6 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Sparse Matrix-Vector Multiplication Microbenchmark. + +This script benchmarks sparse matrix-vector multiplication performance +across different matrix sizes and configurations. It supports: + +- Matrix size sweeps with configurable min/max sizes +- Banded matrix generation with specified non-zeros per row +- Loading matrices from Matrix Market files +- Optional repartitioning to simulate data updates +- Multiple backend support (Legate, CuPy, SciPy) + +Command line arguments: +--nmin: Minimum matrix size (supports k, m, g suffixes) +--nmax: Maximum matrix size (supports k, m, g suffixes) +--nnz-per-row: Number of non-zeros per row for banded matrices +--repartition: Enable alternating x/y vectors +--filename: Load matrix from Matrix Market file +--iters: Number of benchmark iterations +--from-diags: Use sparse.diags for matrix construction +--package: Backend to use (legate, cupy, scipy) +""" + import argparse from common import banded_matrix, get_arg_number, get_phase_procs, parse_common_args @@ -19,6 +41,30 @@ # Writing to pre-allocated array is preferred def spmv_dispatch(A, x, y, i, repartition): + """Dispatch sparse matrix-vector multiplication operation. + + Parameters + ---------- + A : sparse matrix + The sparse matrix to multiply with. + x : array_like + Input vector. + y : array_like + Output vector (pre-allocated). + i : int + Iteration index. + repartition : bool + Whether to alternate between y=A*x and x=A*y. + + Notes + ----- + This function performs sparse matrix-vector multiplication with optional + repartitioning. When repartition is True, it alternates between computing + y = A*x and x = A*y to simulate data updates. + + For Legate, it uses the dot method with pre-allocated output arrays. + For other backends, it uses the @ operator. + """ if use_legate: if repartition and i % 2: A.dot(y, out=x) @@ -32,6 +78,30 @@ def spmv_dispatch(A, x, y, i, repartition): def run_spmv(A, iters, repartition, timer): + """Run sparse matrix-vector multiplication benchmark. + + Parameters + ---------- + A : sparse matrix + The sparse matrix to benchmark. + iters : int + Number of iterations to run. + repartition : bool + Whether to use repartitioning (alternate x and y). + timer : Timer + Timer object for measuring performance. + + Notes + ----- + This function runs a benchmark of sparse matrix-vector multiplication. + It includes warm-up runs and measures the total time and time per iteration. 
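The two spmv_dispatch paths in miniature (numpy/scipy sketch; the Legate branch follows the docstring above):

    import numpy as np
    import scipy.sparse as sp

    A = sp.random(8, 8, density=0.3, format="csr", random_state=0)
    x, y = np.ones(8), np.zeros(8)
    y[:] = A @ x   # generic backends: compute, then assign into y
    # Legate path, per the docstring above: A.dot(x, out=y) writes y in place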
+ + The function prints: + - Matrix dimensions and number of non-zeros + - Number of iterations + - Total elapsed time + - Time per iteration + """ x = np.ones((A.shape[1],)) y = np.zeros((A.shape[0],)) diff --git a/legate_sparse/__init__.py b/legate_sparse/__init__.py index 8a6a077a..c8f44589 100644 --- a/legate_sparse/__init__.py +++ b/legate_sparse/__init__.py @@ -21,6 +21,7 @@ from .coverage import clone_module # noqa: F401 from .csr import csr_array, csr_matrix # noqa: F401 +from .dia import dia_array, dia_matrix # noqa: F401 from .module import * # noqa: F401 clone_module(_sp, globals()) diff --git a/legate_sparse/base.py b/legate_sparse/base.py index fa46fc8b..c9d99a31 100644 --- a/legate_sparse/base.py +++ b/legate_sparse/base.py @@ -62,8 +62,36 @@ # CompressedBase is a base class for several different kinds of sparse # matrices, such as CSR, CSC, COO and DIA. class CompressedBase: + """Base class for compressed sparse matrix formats. + + This class provides common functionality for compressed sparse matrix + formats like CSR, CSC, COO, and DIA. It handles the conversion from + non-zero counts to position arrays and provides common operations. + + Notes + ----- + This is an internal base class and should not be instantiated directly. + Use specific format classes like csr_array instead. + """ + @classmethod def nnz_to_pos_cls(cls, q_nnz: LogicalStore): + """Convert non-zero counts to position arrays. + + This class method converts an array of non-zero counts per row/column + into the position array used in compressed sparse formats. + + Parameters + ---------- + q_nnz : LogicalStore + Store containing the number of non-zeros per row/column. + + Returns + ------- + tuple + (pos, total_nnz) where pos is the position array and total_nnz + is the total number of non-zeros. + """ q_nnz_arr = store_to_cupynumeric_array(q_nnz) cs = cupynumeric.cumsum(q_nnz_arr) cs_shifted = cs - q_nnz_arr @@ -86,9 +114,43 @@ def nnz_to_pos_cls(cls, q_nnz: LogicalStore): return pos, cs[-1] def nnz_to_pos(self, q_nnz: LogicalStore): + """Convert non-zero counts to position arrays for this instance. + + Parameters + ---------- + q_nnz : LogicalStore + Store containing the number of non-zeros per row/column. + + Returns + ------- + tuple + (pos, total_nnz) where pos is the position array and total_nnz + is the total number of non-zeros. + """ return CompressedBase.nnz_to_pos_cls(q_nnz) def asformat(self, format, copy=False): + """Convert the matrix to a specified format. + + Parameters + ---------- + format : str + The desired format ('csr', 'csc', 'coo', etc.). + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + sparse matrix + Matrix in the requested format. + + Raises + ------ + ValueError + If the format is unknown. + NotImplementedError + If conversion to the requested format is not implemented. + """ if format is None or format == self.format: if copy: raise NotImplementedError @@ -108,35 +170,51 @@ def asformat(self, format, copy=False): # The implementation of sum is mostly lifted from scipy.sparse. def sum(self, axis=None, dtype=None, out=None): - """ - Sum the matrix elements over a given axis. + """Sum the matrix elements over a given axis. + Parameters ---------- - axis : {-2, -1, 0, 1, None} optional + axis : {-2, -1, 0, 1, None}, optional Axis along which the sum is computed. The default is to compute the sum of all the matrix elements, returning a scalar (i.e., `axis` = `None`). 
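The cumulative-sum conversion that nnz_to_pos_cls performs, in plain numpy (this mirrors the cs and cs_shifted computation in the code above):

    import numpy as np

    q_nnz = np.array([2, 0, 3, 1])
    cs = np.cumsum(q_nnz)    # row end offsets   -> [2, 2, 5, 6]
    starts = cs - q_nnz      # row start offsets -> [0, 2, 2, 5]
    # pos[i] spans [starts[i], cs[i]) in the packed data/indices arrays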
dtype : dtype, optional The type of the returned matrix and of the accumulator in which - the elements are summed. The dtype of `a` is used by default + the elements are summed. The dtype of `a` is used by default unless `a` has an integer dtype of less precision than the default - platform integer. In that case, if `a` is signed then the platform + platform integer. In that case, if `a` is signed then the platform integer is used while if `a` is unsigned then an unsigned integer of the same precision as the platform integer is used. - .. versionadded:: 0.18.0 - out : np.matrix, optional - Alternative output matrix in which to place the result. It must + out : cupynumeric.ndarray, optional + Alternative output array in which to place the result. It must have the same shape as the expected output, but the type of the output values will be cast if necessary. - .. versionadded:: 0.18.0 + Returns ------- - sum_along_axis : np.matrix + sum_along_axis : cupynumeric.ndarray or scalar A matrix with the same shape as `self`, with the specified - axis removed. + axis removed, or a scalar if axis=None. + + Raises + ------ + NotImplementedError + If axis=0 (sum over columns) is requested. + ValueError + If out is provided but has incompatible shape. + + Notes + ----- + The implementation uses multiplication by a matrix of ones to achieve + the sum. For some sparse matrix formats more efficient methods are + possible and should override this function. + + Currently, summing over columns (axis=0) is not implemented due to + the lack of right matrix multiplication support. + See Also -------- - numpy.matrix.sum : NumPy's implementation of 'sum' for matrices + cupynumeric.matrix.sum : NumPy's implementation of 'sum' for matrices """ # We use multiplication by a matrix of ones to achieve this. @@ -171,9 +249,27 @@ def sum(self, axis=None, dtype=None, out=None): # needed by _data_matrix def _with_data(self, data, copy=True): - """Returns a _different_ matrix object with the same sparsity structure as self, - but with different data. By default the structure arrays - (i.e. .indptr and .indices) are copied. 'data' parameter is never copied. + """Returns a matrix object with the same sparsity structure as self, + but with different data. + + Parameters + ---------- + data : array_like + The new data array. This parameter is never copied. + copy : bool, optional + Whether to copy the structure arrays (indptr and indices). + Default is True. + + Returns + ------- + sparse matrix + A new matrix with the same sparsity structure but different data. + + Notes + ----- + This method creates a new matrix object with the same sparsity pattern + but replaces the data array. The structure arrays (indptr and indices) + are copied by default to avoid modifying the original matrix. """ # For CSR and CSC compressed base we can just reuse compressed stores, @@ -253,12 +349,42 @@ def method(self): # format of {Dense, Sparse}. For our purposes, that means CSC and CSR # matrices. class DenseSparseBase: + """Base class for sparse matrices with dense-sparse format. + + This class provides functionality for sparse matrices that have a TACO + format of {Dense, Sparse}, which includes CSR and CSC matrices. + + Notes + ----- + This is an internal base class and should not be instantiated directly. + Use specific format classes like csr_array instead. 
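The multiplication-by-ones trick that sum() uses, as a short numpy/scipy illustration:

    import numpy as np
    import scipy.sparse as sp

    A = sp.csr_matrix(np.arange(6.0).reshape(2, 3))
    row_sums = A @ np.ones(A.shape[1])  # axis=1 sum as one SpMV with a ones vector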
+ """ + def __init__(self): + """Initialize the DenseSparseBase class.""" self._balanced_pos_partition = None # consider using _with_data() here @classmethod def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): + """Create a new matrix with the same non-zero structure as mat. + + Parameters + ---------- + mat : sparse matrix + The reference matrix whose structure to copy. + arg : array_like + The data for the new matrix. + shape : tuple, optional + The shape of the new matrix. If None, uses mat.shape. + dtype : dtype, optional + The data type of the new matrix. If None, uses mat.dtype. + + Returns + ------- + sparse matrix + A new matrix with the same structure as mat but with data from arg. + """ if shape is None: shape = mat.shape if dtype is None: @@ -269,6 +395,21 @@ def make_with_same_nnz_structure(cls, mat, arg, shape=None, dtype=None): # unpack_rect1_store unpacks a rect1 store into two int64 stores. def unpack_rect1_store(pos): + """Unpack a rect1 store into two int64 stores. + + This function unpacks the compressed position array used in CSR/CSC + formats into separate start and end position arrays. + + Parameters + ---------- + pos : LogicalStore + The rect1 store containing packed position information. + + Returns + ------- + tuple + (lo, hi) where lo contains start positions and hi contains end positions. + """ out1 = runtime.create_store(int64, shape=pos.shape) out2 = runtime.create_store(int64, shape=pos.shape) task = runtime.create_auto_task(SparseOpCode.UNZIP_RECT1) @@ -283,6 +424,25 @@ def unpack_rect1_store(pos): # pack_to_rect1_store packs two int64 stores into a rect1 store. def pack_to_rect1_store(lo, hi, output=None): + """Pack two int64 stores into a rect1 store. + + This function packs separate start and end position arrays into the + compressed rect1 format used in CSR/CSC formats. + + Parameters + ---------- + lo : LogicalStore + Store containing start positions. + hi : LogicalStore + Store containing end positions. + output : LogicalStore, optional + Output store for the packed result. If None, creates a new store. + + Returns + ------- + LogicalStore + The packed rect1 store. + """ if output is None: output = runtime.create_store(rect1, shape=(lo.shape[0],)) task = runtime.create_auto_task(SparseOpCode.ZIP_TO_RECT1) diff --git a/legate_sparse/config.py b/legate_sparse/config.py index 3c9a3780..8c601981 100644 --- a/legate_sparse/config.py +++ b/legate_sparse/config.py @@ -23,6 +23,12 @@ class _LegateSparseSharedLib: + """Internal class representing the shared library interface. + + This class defines the interface to the C++ shared library that + implements the core sparse matrix operations. + """ + LEGATE_SPARSE_DENSE_TO_CSR: int LEGATE_SPARSE_DENSE_TO_CSR_NNZ: int LEGATE_SPARSE_ZIP_TO_RECT_1: int @@ -46,6 +52,26 @@ class _LegateSparseSharedLib: def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: + """Load a shared library without automatic closing. + + Parameters + ---------- + ffi : Any + The CFFI interface object. + lib_path : str + Path to the shared library to load. + + Returns + ------- + Any + The loaded library object. + + Notes + ----- + This function loads a shared library using CDLL and converts it to + a CFFI object without automatic closing. This prevents issues with + symbol cleanup during shutdown. + """ # Use an already-opened library handle, which cffi will convert to a # regular FFI object (using the definitions previously added using # ffi.cdef), but will not automatically dlclose() on collection. 
@@ -55,8 +81,21 @@ def dlopen_no_autoclose(ffi: Any, lib_path: str) -> Any: # Load the LegateSparse library first so we have a shard object that # we can use to initialize all these configuration enumerations -class LegateSparseLib(Library): +class LegateSparseLib: + """Legate sparse matrix library loader. + + This class handles loading and registering the Legate sparse matrix + library with the Legate runtime. + """ + def __init__(self, name): + """Initialize the Legate sparse library. + + Parameters + ---------- + name : str + The name of the library to load. + """ self.name = name self.runtime = None self.shared_object = None @@ -78,24 +117,58 @@ def __init__(self, name): self.shared_object = cast(_LegateSparseSharedLib, shared_lib) def register(self) -> None: + """Register the library with the Legate runtime.""" callback = getattr(self.shared_object, "legate_sparse_perform_registration") callback() def get_shared_library(self) -> str: + """Get the path to the shared library. + + Returns + ------- + str + The full path to the shared library file. + """ from legate_sparse.install_info import libpath return os.path.join(libpath, "liblegate_sparse" + self.get_library_extension()) def get_legate_library(self) -> Library: + """Get the Legate library object. + + Returns + ------- + Library + The Legate library object. + """ return get_legate_runtime().find_library(self.name) def get_c_header(self) -> str: + """Get the C header for the library. + + Returns + ------- + str + The C header content. + """ from legate_sparse.install_info import header return header @staticmethod def get_library_extension() -> str: + """Get the appropriate library extension for the current platform. + + Returns + ------- + str + The library extension ('.so' for Linux, '.dylib' for macOS). + + Raises + ------ + RuntimeError + If the platform is not supported. + """ os_name = platform.system() if os_name == "Linux": return ".so" @@ -105,6 +178,8 @@ def get_library_extension() -> str: SPARSE_LIB_NAME = "legate.sparse" +"""Name of the Legate sparse library.""" + sparse_lib = LegateSparseLib(SPARSE_LIB_NAME) sparse_lib.register() _sparse = sparse_lib.shared_object @@ -115,6 +190,13 @@ def get_library_extension() -> str: # Match these to entries in sparse_c.h @unique class SparseOpCode(IntEnum): + """Enumeration of sparse matrix operation codes. + + These codes correspond to the operations implemented in the C++ + shared library and are used to dispatch tasks to the appropriate + kernels. + """ + LOAD_CUDALIBS = _sparse.LEGATE_SPARSE_LOAD_CUDALIBS UNLOAD_CUDALIBS = _sparse.LEGATE_SPARSE_UNLOAD_CUDALIBS @@ -146,3 +228,4 @@ class SparseOpCode(IntEnum): # Register some types for us to use. rect1 = types.rect_type(1) +"""1-dimensional rectangle type used for compressed storage formats.""" diff --git a/legate_sparse/csr.py b/legate_sparse/csr.py index 6b1d69a4..3008356e 100644 --- a/legate_sparse/csr.py +++ b/legate_sparse/csr.py @@ -91,7 +91,140 @@ @clone_scipy_arr_kind(scipy.sparse.csr_array) class csr_array(CompressedBase, DenseSparseBase): + """Compressed Sparse Row array. + + This can be instantiated in several ways: + csr_array(D) + where D is a 2-D ndarray or cupynumeric.ndarray + + csr_array(S) + with another sparse array or matrix S (equivalent to S.tocsr()) + + csr_array((M, N), [dtype]) + to construct an empty array with shape (M, N) + dtype is optional, defaulting to dtype='d'. 
+ + csr_array((data, (row_ind, col_ind)), [shape=(M, N)]) + where ``data``, ``row_ind`` and ``col_ind`` satisfy the + relationship ``a[row_ind[k], col_ind[k]] = data[k]``. + + csr_array((data, indices, indptr), [shape=(M, N)]) + is the standard CSR representation where the column indices for + row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their + corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``. + If the shape parameter is not supplied, the array dimensions + are inferred from the index arrays. + + Attributes + ---------- + dtype : dtype + Data type of the array + shape : 2-tuple + Shape of the array + ndim : int + Number of dimensions (this is always 2) + nnz : int + Number of stored values, including explicit zeros + data : cupynumeric.ndarray + CSR format data array of the array + indices : cupynumeric.ndarray + CSR format index array of the array + indptr : cupynumeric.ndarray + CSR format index pointer array of the array + has_sorted_indices : bool + Whether the indices are sorted + has_canonical_format : bool + Whether the matrix is in canonical format + T : csr_array + Transpose of the matrix + + Notes + ----- + Sparse arrays can be used in arithmetic operations: they support + addition, subtraction, multiplication, division, and matrix power. + + Advantages of the CSR format: + - fast matrix vector products + + Disadvantages of the CSR format: + - changes to the sparsity structure are expensive (consider LIL or DOK) + + Canonical Format: + - Within each row, indices are sorted by column. + - There are no duplicate entries. + + Differences from SciPy: + - Uses cupynumeric arrays instead of numpy arrays + - GPU acceleration via cuSPARSE when available + - Limited to supported datatypes on GPU: float32, float64, complex64, complex128 + - Some operations may create implicit copies due to transformed arrays + - Element-wise operations with scalars only operate on existing non-zero elements + - Indexing with boolean masks only updates existing non-zero elements + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import csr_array + >>> csr_array((3, 4), dtype=np.int8).todense() + array([[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]], dtype=int8) + + >>> row = np.array([0, 0, 1, 2, 2, 2]) + >>> col = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> csr_array((data, (row, col)), shape=(3, 3)).todense() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + + >>> indptr = np.array([0, 2, 3, 6]) + >>> indices = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> csr_array((data, indices, indptr), shape=(3, 3)).todense() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + """ + def __init__(self, arg, shape=None, dtype=None, copy=False): + """Initialize a CSR array. + + Parameters + ---------- + arg : array_like, tuple, or csr_array + The input data. Can be: + - A 2-D dense array (numpy.ndarray or cupynumeric.ndarray) + - A sparse array/matrix to convert to CSR format + - A tuple (M, N) for an empty array of shape (M, N) + - A tuple (data, (row_ind, col_ind)) for COO format data + - A tuple (data, indices, indptr) for CSR format data + shape : tuple, optional + Shape of the array (M, N). Required if not inferrable from input. + dtype : dtype, optional + Data type of the array. If None, inferred from input data. + Defaults to float64 if not specified. + copy : bool, optional + Whether to copy the input data. Default is False. 
+ + Raises + ------ + NotImplementedError + If the input type is not supported for conversion to CSR. + AssertionError + If shape cannot be inferred and is not provided. + ValueError + If input data is inconsistent or invalid. + + Notes + ----- + When converting from dense arrays, the implementation uses a two-pass + algorithm that first counts non-zeros per row, then fills them in. + This may not scale well on distributed systems due to alignment constraints. + + When converting from COO format, the data is automatically sorted by + rows and then by columns to ensure canonical format. + """ self.ndim = 2 self.indices_sorted = False self.canonical_format = False @@ -189,7 +322,39 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): self._dtype = dtype def _init_from_tuple_inputs(self, arg, dtype, shape, copy): + """Initialize CSR array from tuple inputs. + + This internal method handles the various tuple-based constructor formats: + - (M, N) for empty arrays + - (data, (row_ind, col_ind)) for COO format + - (data, indices, indptr) for CSR format + + Parameters + ---------- + arg : tuple + The input tuple in one of the supported formats. + dtype : dtype, optional + The desired data type. + shape : tuple, optional + The shape of the array. + copy : bool + Whether to copy the input data. + + Returns + ------- + tuple + (dtype, shape) for the constructed array. + + Raises + ------ + AssertionError + If shape cannot be inferred or input is invalid. + NotImplementedError + If the tuple format is not supported. + """ + def _get_empty_csr(dtype, nrows_plus_one): + """Helper function to create empty CSR arrays.""" return ( cupynumeric.zeros(0, dtype=dtype), cupynumeric.zeros(0, dtype=coord_ty), @@ -326,36 +491,96 @@ def _get_empty_csr(dtype, nrows_plus_one): @property def dim(self): + """Number of dimensions (always 2 for CSR arrays).""" return self.ndim @property def nnz(self): + """Number of stored values, including explicit zeros. + + Returns + ------- + int + The number of non-zero elements in the matrix. + """ return self.vals.shape[0] @property def dtype(self): + """Data type of the array. + + Returns + ------- + dtype + The data type of the array elements. + """ # We can just return self.vals.type, but bookkeep type separately now return self._dtype # Enable direct operation on the values array. def get_data(self): + """Get the data array of the CSR matrix. + + Returns + ------- + cupynumeric.ndarray + The data array containing the non-zero values. + """ return store_to_cupynumeric_array(self.vals) # From array, def set_data(self, data): + """Set the data array of the CSR matrix. + + Parameters + ---------- + data : cupynumeric.ndarray + The new data array. Must have the same length as the current data array. + + Raises + ------ + AssertionError + If data is not a cupynumeric.ndarray. + """ if isinstance(data, numpy.ndarray): data = cupynumeric.array(data) assert isinstance(data, cupynumeric.ndarray) self.vals = get_store_from_cupynumeric_array(data) self._dtype = data.dtype - data = property(fget=get_data, fset=set_data) + data = property( + fget=get_data, fset=set_data, doc="CSR format data array of the matrix" + ) # Enable direct operation on the indices array. def get_indices(self): + """Get the column indices array of the CSR matrix. + + Returns + ------- + cupynumeric.ndarray + The column indices array. + """ return store_to_cupynumeric_array(self.crd) def set_indices(self, indices): + """Set the column indices array of the CSR matrix. 
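The two-pass dense-to-CSR idea from the Notes above, in plain numpy:

    import numpy as np

    D = np.array([[1.0, 0.0, 2.0], [0.0, 0.0, 3.0]])
    nnz_per_row = (D != 0).sum(axis=1)                   # pass 1: count per row
    indptr = np.concatenate([[0], np.cumsum(nnz_per_row)])
    rows, cols = np.nonzero(D)                           # pass 2: fill
    data = D[rows, cols]                                 # -> [1., 2., 3.]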
+ + Parameters + ---------- + indices : cupynumeric.ndarray + The new column indices array. Must have the same length as the current indices array. + + Raises + ------ + AssertionError + If indices is not a cupynumeric.ndarray. + + Notes + ----- + Setting new indices will mark the matrix as not having sorted indices + and not being in canonical format. + """ if isinstance(indices, numpy.ndarray): indices = cupynumeric.array(indices) assert isinstance(indices, cupynumeric.ndarray) @@ -364,22 +589,46 @@ def set_indices(self, indices): self.canonical_format = False self.indices_sorted = False - indices = property(fget=get_indices, fset=set_indices) + indices = property( + fget=get_indices, fset=set_indices, doc="CSR format index array of the matrix" + ) def get_indptr(self): + """Get the index pointer array of the CSR matrix. + + Returns + ------- + cupynumeric.ndarray + The index pointer array. For row i, the column indices are stored in + indices[indptr[i]:indptr[i+1]] and their corresponding values are + stored in data[indptr[i]:indptr[i+1]]. + """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) return cupynumeric.append(row_start, [self.nnz]) # Disallow changing intptrs directly - indptr = property(fget=get_indptr) + indptr = property( + fget=get_indptr, doc="CSR format index pointer array of the matrix" + ) def _get_row_indices(self): - """Helper routine that converts pos to row indices""" + """Helper routine that converts pos to row indices. + + This internal method expands the compressed row storage format's position + array into explicit row indices for each non-zero element. - # TODO: Add an option that caches the row_indices so that other binary - # operations don't have to recompute it. + Returns + ------- + cupynumeric.ndarray + Array of row indices corresponding to each non-zero element. + Notes + ----- + This method is used internally by comparison operations and other + methods that need explicit row indices. The result could be cached + for performance, but currently is recomputed each time. + """ row_indices = runtime.create_store(coord_ty, shape=self.crd.shape) task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) src_part = task.add_input(self.pos) @@ -390,13 +639,56 @@ def _get_row_indices(self): return store_to_cupynumeric_array(row_indices) def has_sorted_indices(self): + """Determine whether the matrix has sorted indices. + + Returns + ------- + bool + True if the indices are sorted, False otherwise. + """ return self.indices_sorted def has_canonical_format(self): + """Determine whether the matrix is in canonical format. + + Returns + ------- + bool + True if the matrix is in canonical format, False otherwise. + + Notes + ----- + A matrix is in canonical format if: + - Within each row, indices are sorted by column + - There are no duplicate entries + """ return self.canonical_format # The rest of the methods def diagonal(self, k=0): + """Return the k-th diagonal of the matrix. + + Parameters + ---------- + k : int, optional + Which diagonal to retrieve. Default is 0 (main diagonal). + k > 0 for upper diagonals, k < 0 for lower diagonals. + + Returns + ------- + cupynumeric.ndarray + The k-th diagonal of the matrix. + + Raises + ------ + NotImplementedError + If k != 0 (only main diagonal is currently supported). + + Notes + ----- + Currently only supports k=0 (main diagonal). Other diagonals + are not implemented. 
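What the pos-to-row-coordinates expansion in _get_row_indices computes, reduced to a one-liner on an indptr-style array:

    import numpy as np

    indptr = np.array([0, 2, 2, 5])
    row_indices = np.repeat(np.arange(len(indptr) - 1), np.diff(indptr))
    # -> [0, 0, 2, 2, 2]: one row id per stored nonzero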
+ """ rows, cols = self.shape if k <= -rows or k >= cols: return cupynumeric.empty(0, dtype=self.dtype) @@ -422,6 +714,33 @@ def diagonal(self, k=0): return store_to_cupynumeric_array(output) def todense(self, order=None, out=None): + """Return a dense matrix representation of this matrix. + + Parameters + ---------- + order : str, optional + Not supported. Must be None. + out : cupynumeric.ndarray, optional + Output array for the result. Must have the same shape and dtype + as the expected output. + + Returns + ------- + cupynumeric.ndarray + A dense matrix with the same shape and dtype as this matrix. + + Raises + ------ + NotImplementedError + If order is not None. + ValueError + If out is provided but has incompatible dtype or shape. + + Notes + ----- + The order parameter is not supported and must be None. + If out is provided, it must have the correct shape and dtype. + """ if order is not None: raise NotImplementedError if out is not None: @@ -444,13 +763,63 @@ def todense(self, order=None, out=None): return store_to_cupynumeric_array(out) def multiply(self, other): + """Point-wise multiplication by another matrix, vector, or scalar. + + Parameters + ---------- + other : csr_array, cupynumeric.ndarray, or scalar + The object to multiply with. + + Returns + ------- + csr_array or cupynumeric.ndarray + The result of the multiplication. + + Notes + ----- + This is equivalent to the * operator. + """ return self * other def __rmul__(self, other): + """Right multiplication by a scalar. + + Parameters + ---------- + other : scalar + The scalar to multiply with. + + Returns + ------- + csr_array + The result of the multiplication. + """ return self * other # This is an element-wise operation now. def __mul__(self, other): + """Element-wise multiplication. + + Parameters + ---------- + other : scalar or array_like + The object to multiply with. + + Returns + ------- + csr_array + The result of the multiplication. + + Raises + ------ + NotImplementedError + If other is not a scalar. + + Notes + ----- + Currently only supports scalar multiplication. Array multiplication + is not implemented. + """ if isinstance(other, numpy.ndarray): other = cupynumeric.array(other) @@ -464,10 +833,48 @@ def __mul__(self, other): # rmatmul represents the operation other @ self. def __rmatmul__(self, other): + """Right matrix multiplication (other @ self). + + Parameters + ---------- + other : array_like + The left operand for matrix multiplication. + + Returns + ------- + cupynumeric.ndarray or csr_array + The result of the matrix multiplication. + + Raises + ------ + NotImplementedError + Currently not implemented. + + Notes + ----- + This method handles the case where a dense matrix is multiplied + with a CSR matrix from the left. Currently not implemented. + """ # Handle dense @ CSR raise NotImplementedError def __matmul__(self, other): + """Matrix multiplication (self @ other). + + Parameters + ---------- + other : array_like or csr_array + The right operand for matrix multiplication. + + Returns + ------- + cupynumeric.ndarray or csr_array + The result of the matrix multiplication. + + Notes + ----- + This is equivalent to the dot method. + """ return self.dot(other) def _compare_scalar(self, other, op): @@ -742,15 +1149,54 @@ def dot(self, other, out=None): Parameters ---------- - other : array_like - The object to compute dot product with - out : ndarray, optional - Output array for the result + other : array_like or csr_array + The object to compute dot product with. 
Can be: + - A dense vector (1-D array) for sparse matrix-vector multiplication (SpMV) + - A dense matrix (2-D array) for sparse matrix-matrix multiplication (SpMM) + - A CSR matrix for sparse matrix-sparse matrix multiplication (SpGEMM) + out : cupynumeric.ndarray, optional + Output array for the result. Only supported for SpMV operations. + Must have the correct shape and dtype. Returns ------- - output : csr_array or cupynumeric.ndarray - Sparse matrix or dense array depending on input + cupynumeric.ndarray or csr_array + The result of the dot product: + - For SpMV: dense vector + - For SpMM: dense matrix + - For SpGEMM: CSR matrix + + Raises + ------ + NotImplementedError + If the operation is not supported or datatypes are not supported on GPU. + ValueError + If out is provided for SpGEMM operations or has incompatible dtype/shape. + RuntimeWarning + If an implicit copy is created due to transformed input arrays. + + Notes + ----- + Supported operations: + - SpMV (sparse matrix-vector): A @ x where x is a dense vector + - SpGEMM (sparse-sparse): A @ B where B is a CSR matrix + + GPU limitations: + - Only floating point datatypes are supported: float32, float64, complex64, complex128 + - Some operations may create implicit copies due to transformed arrays + + The implementation automatically chooses the appropriate algorithm: + - For vectors: uses cuSPARSE SpMV when available + - For CSR matrices: uses cuSPARSE SpGEMM on GPU, custom implementation on CPU + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import csr_array + >>> A = csr_array([[1, 2, 0], [0, 0, 3], [4, 0, 5]]) + >>> v = np.array([1, 0, -1]) + >>> A.dot(v) + array([ 1, -3, -1]) """ # If output specified - it should be cupynumeric array if out is not None: @@ -840,7 +1286,9 @@ def _getpos(self): Returns ------- list of tuple - List of (start, end) position tuples for each row in the matrix + List of (start, end) position tuples for each row in the matrix. + For row i, the non-zero elements are stored in positions + [start, end) in the data and indices arrays. """ row_start_st, row_end_st = unpack_rect1_store(self.pos) row_start = store_to_cupynumeric_array(row_start_st) @@ -853,7 +1301,7 @@ def copy(self): Returns ------- csr_array - A copy of the matrix + A copy of the matrix with the same data and structure. """ return csr_array(self, dtype=self.dtype) @@ -863,12 +1311,17 @@ def conj(self, copy=True): Parameters ---------- copy : bool, optional - Whether to create a new matrix or modify in-place + Whether to create a new matrix or modify in-place. Default is True. Returns ------- csr_array - The conjugate matrix + The conjugate matrix. + + Notes + ----- + If copy=True, returns a new matrix. If copy=False, modifies the + current matrix in-place. """ if copy: return self.copy().conj(copy=False) @@ -882,14 +1335,25 @@ def transpose(self, axes=None, copy=False): Parameters ---------- axes : None, optional - This argument is not supported + This argument is not supported and must be None. copy : bool, optional - Whether to create a copy (ignored - CSR transpose always creates copy) + Whether to create a copy. Ignored - CSR transpose always creates a copy. Returns ------- csr_array - Transposed matrix + Transposed matrix with shape (N, M) where the original shape was (M, N). + + Raises + ------ + AssertionError + If axes is not None. + + Notes + ----- + The axes parameter is not supported and must be None. + CSR transpose always creates a copy due to the format conversion. 
+ The implementation sorts the data by columns to maintain canonical format. """ if axes is not None: raise AssertionError("axes parameter should be None") @@ -922,22 +1386,31 @@ def transpose(self, axes=None, copy=False): copy=False, ) - T = property(transpose) + T = property(transpose, doc="Transpose of the matrix") - def asformat(seld, format, copy=False): + def asformat(self, format, copy=False): """Convert this matrix to a specified format. Parameters ---------- format : str - Desired sparse format ('csr' only) + Desired sparse format. Currently only 'csr' is supported. copy : bool, optional - Whether to create a copy + Whether to create a copy. Default is False. Returns ------- csr_array - Matrix in the requested format + Matrix in the requested format. + + Raises + ------ + NotImplementedError + If format is not 'csr'. + + Notes + ----- + Currently only CSR format is supported. Other formats are not implemented. """ if format == "csr": return self.copy() if copy else self @@ -950,12 +1423,17 @@ def tocsr(self, copy=False): Parameters ---------- copy : bool, optional - Whether to create a copy + Whether to create a copy. Default is False. Returns ------- csr_array - The converted CSR matrix + The converted CSR matrix. + + Notes + ----- + Since this matrix is already in CSR format, this method simply + returns a copy if requested, or the matrix itself otherwise. """ if copy: return self.copy().tocsr(copy=False) @@ -967,7 +1445,13 @@ def nonzero(self): Returns ------- (row, col) : tuple of cupynumeric.ndarrays - Row and column indices of non-zeros + Row and column indices of non-zeros. Only returns indices + where the values are actually non-zero (not just stored). + + Notes + ----- + This method filters out explicit zeros that may be stored in the + sparse matrix structure. """ task = runtime.create_auto_task(SparseOpCode.EXPAND_POS_TO_COORDINATES) @@ -986,23 +1470,30 @@ def nonzero(self): csr_matrix = csr_array +"""Alias for csr_array for backward compatibility with SciPy naming conventions.""" # spmv computes y = A @ x. def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): + """Perform sparse matrix vector product y = A @ x. + + Parameters + ---------- + A : csr_array + Input sparse matrix of shape (M, N). + x : cupynumeric.ndarray + Dense vector of shape (N,) for the dot product. + y : cupynumeric.ndarray + Output array of shape (M,) to store the result. + + Notes + ----- + This function computes the sparse matrix-vector multiplication y = A @ x. + The implementation uses an auto-parallelized kernel that distributes + the computation across available processors. + + The function modifies y in-place to store the result. """ - Perform sparse matrix vector product y = A @ x - - Parameters: - ----------- - A: csr_array - Input sparse matrix - x: cupynumeric.ndarray - Dense vector for the dot product - y: cupynumeric.ndarray - Output array - """ - x_store = get_store_from_cupynumeric_array(x) y_store = get_store_from_cupynumeric_array(y) @@ -1026,20 +1517,34 @@ def spmv(A: csr_array, x: cupynumeric.ndarray, y: cupynumeric.ndarray): # spgemm_csr_csr_csr computes C = A @ B when A and B and # both csr matrices, and returns the result C as a csr matrix. def spgemm_csr_csr_csr(A: csr_array, B: csr_array) -> csr_array: - """ - Perform sparse matrix multiplication C = A @ B + """Perform sparse matrix multiplication C = A @ B. 
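The SpMV contract y = A @ x written out against the CSR triplet, as a small numpy check:

    import numpy as np

    # CSR triplet for [[1., 0., 2.], [0., 3., 0.]]
    data = np.array([1.0, 2.0, 3.0])
    indices = np.array([0, 2, 1])
    indptr = np.array([0, 2, 3])
    x = np.ones(3)
    y = np.array([data[s:e] @ x[indices[s:e]]
                  for s, e in zip(indptr[:-1], indptr[1:])])
    # -> [3., 3.]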
- Parameters: - ----------- - A: csr_array - Input sparse matrix A - B: csr_array - Input sparse matrix B + Parameters + ---------- + A : csr_array + Input sparse matrix A of shape (M, K). + B : csr_array + Input sparse matrix B of shape (K, N). - Returns: - -------- + Returns + ------- csr_array - The result of the sparse matrix multiplication + The result of the sparse matrix multiplication with shape (M, N). + + Notes + ----- + This function computes the sparse matrix-sparse matrix multiplication C = A @ B. + + The implementation differs based on the available hardware: + - On GPU: Uses cuSPARSE SpGEMM with local CSR matrices that are aggregated + - On CPU: Uses a custom implementation with two-pass algorithm + + The GPU implementation creates a set of local CSR matrices that are + aggregated into a global CSR matrix. The CPU implementation uses a + query phase to determine the number of non-zeros per row, followed + by the actual computation phase. + + Both implementations maintain the CSR format throughout the computation. """ # Due to limitations in cuSPARSE, we cannot use a uniform task # implementation for CSRxCSRxCSR SpGEMM across CPUs, OMPs and GPUs. diff --git a/legate_sparse/dia.py b/legate_sparse/dia.py index 0dd93735..20f2dc5c 100644 --- a/legate_sparse/dia.py +++ b/legate_sparse/dia.py @@ -63,7 +63,107 @@ # Temporary implementation for matrix generation in examples @clone_scipy_arr_kind(scipy.sparse.dia_array) class dia_array(CompressedBase): + """Sparse matrix with DIAgonal storage. + + This can be instantiated in several ways: + dia_array(D) + where D is a 2-D ndarray or cupynumeric.ndarray + + dia_array((data, offsets), shape=(M, N)) + where data is a 2-D array and offsets is a 1-D array of diagonal offsets + + dia_array((data, offset), shape=(M, N)) + where data is a 1-D array and offset is a single integer + + Attributes + ---------- + dtype : dtype + Data type of the array + shape : 2-tuple + Shape of the array + ndim : int + Number of dimensions (this is always 2) + nnz : int + Number of stored values, including explicit zeros + data : cupynumeric.ndarray + DIA format data array of the array + offsets : cupynumeric.ndarray + DIA format offset array of the array + T : dia_array + Transpose of the matrix + + Notes + ----- + The DIA (Diagonal) format stores a sparse matrix by diagonals. + The data array has shape (n_diagonals, max_diagonal_length) where + each row represents a diagonal. The offsets array contains the + diagonal offsets (k > 0 for upper diagonals, k < 0 for lower diagonals). + + Advantages of the DIA format: + - efficient for matrices with few diagonals + - fast matrix-vector products + - simple structure + + Disadvantages of the DIA format: + - inefficient for irregular sparsity patterns + - not suitable for general sparse matrices + - limited arithmetic operations + + Differences from SciPy: + - Uses cupynumeric arrays instead of numpy arrays + - Limited functionality (mainly for matrix generation in examples) + - Some operations may not be fully optimized + - Primarily used as an intermediate format for conversion to CSR + + Examples + -------- + >>> import cupynumeric as np + >>> from legate_sparse import dia_array + >>> data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> offsets = np.array([-1, 0, 1]) + >>> A = dia_array((data, offsets), shape=(3, 3)) + >>> A.todense() + array([[5, 2, 0], + [4, 8, 3], + [0, 7, 9]]) + """ + def __init__(self, arg, shape=None, dtype=None, copy=False): + """Initialize a DIA array. 
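A quick scipy-based cross-check of the SpGEMM contract described above (the query-then-fill two-pass structure is internal; here we only validate the result against a dense product):

    import numpy as np
    import scipy.sparse as sp

    A = sp.random(5, 4, density=0.4, format="csr", random_state=0)
    B = sp.random(4, 6, density=0.4, format="csr", random_state=1)
    C = A @ B
    assert np.allclose(C.toarray(), A.toarray() @ B.toarray())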
+ + Parameters + ---------- + arg : tuple + The input data. Must be a tuple (data, offsets) where: + - data is a 2-D array containing the diagonal values + - offsets is a 1-D array or integer specifying diagonal offsets + shape : tuple, optional + Shape of the array (M, N). Required if not inferrable from input. + dtype : dtype, optional + Data type of the array. If None, inferred from input data. + copy : bool, optional + Whether to copy the input data. Default is False. + + Raises + ------ + NotImplementedError + If shape is not provided (shape is required for DIA arrays). + AssertionError + If arg is not a tuple or has invalid format. + ValueError + If input data is inconsistent or invalid. + + Notes + ----- + The DIA format is primarily used for matrix generation in examples + and as an intermediate format for conversion to CSR. The shape + parameter is required as it cannot be inferred from the diagonal data. + + The offsets array specifies which diagonals are stored: + - k > 0: upper diagonal (kth diagonal above main diagonal) + - k = 0: main diagonal + - k < 0: lower diagonal (kth diagonal below main diagonal) + """ if shape is None: raise NotImplementedError assert isinstance(arg, tuple) @@ -89,6 +189,18 @@ def __init__(self, arg, shape=None, dtype=None, copy=False): @property def nnz(self): + """Number of stored values, including explicit zeros. + + Returns + ------- + int + The number of non-zero elements in the matrix. + + Notes + ----- + This property computes the number of non-zeros by iterating through + each diagonal and counting the valid elements within the matrix bounds. + """ M, N = self.shape nnz = 0 for k in self.offsets: @@ -100,18 +212,73 @@ def nnz(self): @property def data(self): + """Get the data array of the DIA matrix. + + Returns + ------- + cupynumeric.ndarray + The data array containing the diagonal values. Each row represents + a diagonal, with shape (n_diagonals, max_diagonal_length). + """ return store_to_cupynumeric_array(self._data) @property def offsets(self): + """Get the offsets array of the DIA matrix. + + Returns + ------- + cupynumeric.ndarray + The offsets array specifying which diagonals are stored. + Positive values indicate upper diagonals, negative values + indicate lower diagonals, and zero indicates the main diagonal. + """ return store_to_cupynumeric_array(self._offsets) def copy(self): + """Returns a copy of this matrix. + + Returns + ------- + dia_array + A copy of the matrix with the same data and structure. + """ data = cupynumeric.array(self.data) offsets = cupynumeric.array(self.offsets) return dia_array((data, offsets), shape=self.shape, dtype=self.dtype) def transpose(self, axes=None, copy=False): + """Reverses the dimensions of the sparse matrix. + + Parameters + ---------- + axes : None, optional + This argument is not supported and must be None. + copy : bool, optional + Whether to create a copy. Not supported - must be False. + + Returns + ------- + dia_array + Transposed matrix with shape (N, M) where the original shape was (M, N). + + Raises + ------ + ValueError + If axes is not None. + AssertionError + If copy is True (not supported). + + Notes + ----- + The axes parameter is not supported and must be None. + The copy parameter is not supported and must be False. + + Transposing a DIA matrix involves: + 1. Flipping the diagonal offsets (negating them) + 2. Re-aligning the data matrix to account for the new offsets + 3. 
Adjusting the shape from (M, N) to (N, M)
+        """
         if axes is not None:
             raise ValueError(
                 "Sparse matrices do not support "
@@ -147,9 +314,27 @@ def transpose(self, axes=None, copy=False):
             dtype=self.dtype,
         )
 
-    T = property(transpose)
+    T = property(transpose, doc="Transpose of the matrix")
 
     def tocsr(self, copy=False):
+        """Convert this matrix to a CSR matrix.
+
+        Parameters
+        ----------
+        copy : bool, optional
+            Whether to create a copy. Default is False.
+
+        Returns
+        -------
+        csr_array
+            The converted CSR matrix.
+
+        Notes
+        -----
+        The conversion to CSR is done by first transposing the matrix
+        and then converting the transposed matrix to CSR format.
+        This approach is used to simplify the conversion process.
+        """
         if copy:
             return self.copy().tocsr(copy=False)  # we don't need secondary copy
 
@@ -157,6 +342,33 @@ def tocsr(self, copy=False):
 
     # This routine is lifted from scipy.sparse's converter.
     def _tocsr_transposed(self, copy=False):
+        """Convert the transposed DIA matrix to CSR format.
+
+        This internal method converts a transposed DIA matrix to CSR format.
+        It is used by the tocsr method after transposing the original matrix.
+
+        Parameters
+        ----------
+        copy : bool, optional
+            Whether to create a copy. Default is False.
+
+        Returns
+        -------
+        csr_array
+            The CSR representation of the transposed matrix.
+
+        Notes
+        -----
+        This method is adapted from SciPy's DIA to CSR converter.
+        It handles the conversion by:
+        1. Creating masks for valid diagonal elements
+        2. Computing the indptr array using cumulative sums
+        3. Extracting indices and data for non-zero elements
+        4. Constructing the CSR matrix
+
+        The method ensures that only elements within the matrix bounds
+        and with non-zero values are included in the CSR representation.
+        """
         if self.nnz == 0:
             return csr_array(self.shape, self.dtype)
 
@@ -192,3 +404,4 @@ def _tocsr_transposed(self, copy=False):
 
 # Declare an alias for this type.
 dia_matrix = dia_array
+"""Alias for dia_array for backward compatibility with SciPy naming conventions."""
diff --git a/legate_sparse/gallery.py b/legate_sparse/gallery.py
index 56e71054..371a4c44 100644
--- a/legate_sparse/gallery.py
+++ b/legate_sparse/gallery.py
@@ -75,8 +75,8 @@
 def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
-    """
-    Construct a sparse matrix from diagonals.
+    """Construct a sparse matrix from diagonals.
+
     Parameters
     ----------
     diagonals : sequence of array_like
@@ -90,44 +90,69 @@ def diags(diagonals, offsets=0, shape=None, format=None, dtype=None):
     shape : tuple of int, optional
         Shape of the result. If omitted, a square matrix large
         enough to contain the diagonals is returned.
-    format : {"dia", "csr", "csc", "lil", ...}, optional
-        Matrix format of the result. By default (format=None) an
-        appropriate sparse matrix format is returned. This choice is
-        subject to change.
+    format : {"dia", "csr"}, optional
+        Matrix format of the result. By default (format=None) a DIA
+        matrix is returned. Currently only "dia" and "csr" are supported.
-    dtype : dtype, optional
-        Data type of the matrix.
+    dtype : dtype
+        Data type of the matrix. Unlike SciPy, this must be specified;
+        passing None raises NotImplementedError.
+
+    Returns
+    -------
+    sparse matrix
+        A sparse matrix in the specified format with the given diagonals.
+
+    Raises
+    ------
+    ValueError
+        If the number of diagonals and offsets don't match, or if
+        diagonal lengths don't agree with matrix size.
+    NotImplementedError
+        If dtype is not specified or format is not supported.
+
     See Also
     --------
     spdiags : construct matrix from diagonals
+
     Notes
     -----
     This function differs from `spdiags` in the way it handles
     off-diagonals.
+
     The result from `diags` is the sparse equivalent of::
+
         np.diag(diagonals[0], offsets[0])
         + ...
         + np.diag(diagonals[k], offsets[k])
+
     Repeated diagonal offsets are disallowed.
 
-    .. versionadded:: 0.11
+
+    Differences from SciPy:
+    - Uses cupynumeric arrays instead of numpy arrays
+    - dtype parameter is required (cannot be None)
+    - Limited format support (only "dia" and "csr")
+    - Primarily used for matrix generation in examples
+
     Examples
     --------
-    >>> from scipy.sparse import diags
+    >>> import cupynumeric as np
+    >>> from legate_sparse import diags
     >>> diagonals = [[1, 2, 3, 4], [1, 2, 3], [1, 2]]
-    >>> diags(diagonals, [0, -1, 2]).toarray()
-    array([[1, 0, 1, 0],
-           [1, 2, 0, 2],
-           [0, 2, 3, 0],
-           [0, 0, 3, 4]])
+    >>> diags(diagonals, [0, -1, 2], dtype=np.float64).todense()
+    array([[1., 0., 1., 0.],
+           [1., 2., 0., 2.],
+           [0., 2., 3., 0.],
+           [0., 0., 3., 4.]])
+
     Broadcasting of scalars is supported (but shape needs to be specified):
 
-    >>> diags([1, -2, 1], [-1, 0, 1], shape=(4, 4)).toarray()
+    >>> diags([1, -2, 1], [-1, 0, 1], shape=(4, 4), dtype=np.float64).todense()
     array([[-2.,  1.,  0.,  0.],
            [ 1., -2.,  1.,  0.],
            [ 0.,  1., -2.,  1.],
            [ 0.,  0.,  1., -2.]])
+
     If only one diagonal is wanted (as in `numpy.diag`), the following
     works as well:
 
-    >>> diags([1, 2, 3], 1).toarray()
+    >>> diags([1, 2, 3], 1, dtype=np.float64).todense()
     array([[ 0.,  1.,  0.,  0.],
            [ 0.,  0.,  2.,  0.],
            [ 0.,  0.,  0.,  3.],
diff --git a/legate_sparse/install_info.pyi b/legate_sparse/install_info.pyi
new file mode 100644
index 00000000..16574ccf
--- /dev/null
+++ b/legate_sparse/install_info.pyi
@@ -0,0 +1,16 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+#
+# See the LICENSE file for details.
+#
+
+# Stub file for install_info module to satisfy mypy
+# This module is generated at build time
+
+libpath: str
+header: str
diff --git a/legate_sparse/io.py b/legate_sparse/io.py
index 2b6a09e3..ecaf8e3c 100644
--- a/legate_sparse/io.py
+++ b/legate_sparse/io.py
@@ -24,6 +24,38 @@
 
 @track_provenance(runtime.sparse_library)
 def mmread(source):
+    """Read a sparse matrix from a Matrix Market (.mtx) file.
+
+    Parameters
+    ----------
+    source : str
+        The filename or path to the Matrix Market file to read.
+
+    Returns
+    -------
+    csr_array
+        A sparse matrix in CSR format loaded from the file.
+
+    Notes
+    -----
+    This function reads Matrix Market format files and converts them
+    to CSR format. The Matrix Market format is a standard format for
+    storing sparse matrices. For more information on the format, see
+    https://math.nist.gov/MatrixMarket/formats.html.
+
+    The function assumes that all nodes in the system can access the
+    file, so no special file distribution is needed.
+
+    The implementation reads the file in COO format and then converts
+    to CSR format for efficient storage and operations.
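+
+    As a rough sketch of that conversion step (illustrative only, not the
+    actual task implementation), the CSR ``indptr`` can be derived from
+    the COO row indices with a bincount and a cumulative sum::
+
+        counts = np.bincount(rows, minlength=num_rows)
+        indptr = np.zeros(num_rows + 1, dtype=np.int64)
+        indptr[1:] = np.cumsum(counts)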
+
+    Examples
+    --------
+    >>> from legate_sparse import mmread
+    >>> A = mmread("matrix.mtx")
+    >>> print(A.shape)
+    (1000, 1000)
+    """
     # TODO (rohany): We'll assume for now that all of the nodes in the system
     # can access the file passed in, so we don't need to worry about where this
     # task gets mapped to.
diff --git a/legate_sparse/linalg.py b/legate_sparse/linalg.py
index 6d515502..82aa0edb 100644
--- a/legate_sparse/linalg.py
+++ b/legate_sparse/linalg.py
@@ -66,6 +66,32 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
+"""
+Sparse linear algebra (:mod:`legate_sparse.linalg`)
+===================================================
+
+.. currentmodule:: legate_sparse.linalg
+
+Abstract linear operators
+-------------------------
+
+.. autosummary::
+   :toctree: generated/
+
+   LinearOperator -- abstract representation of a linear operator
+
+Solving linear problems
+-----------------------
+
+Iterative methods for linear equation systems:
+
+.. autosummary::
+   :toctree: generated/
+
+   cg -- Use Conjugate Gradient iteration to solve Ax = b
+   gmres -- Use Generalized Minimal RESidual iteration to solve Ax = b
+
+"""
 
 import inspect
 import warnings
@@ -414,6 +440,23 @@ def _rmatvec(self, x, out=None):
 
 def make_linear_operator(A):
+    """Convert a matrix to a LinearOperator.
+
+    Parameters
+    ----------
+    A : array_like, sparse matrix, or LinearOperator
+        The matrix to convert.
+
+    Returns
+    -------
+    LinearOperator
+        A LinearOperator representation of A.
+
+    Notes
+    -----
+    If A is already a LinearOperator, it is returned unchanged.
+    Otherwise, A is wrapped in a _SparseMatrixLinearOperator.
+    """
     if isinstance(A, LinearOperator):
         return A
     else:
@@ -431,6 +474,39 @@ def make_linear_operator(A):
 # allocating unnecessary futures.
 @track_provenance(nested=True)
 def cg_axpby(y, x, a, b, isalpha=True, negate=False):
+    """Perform a fused vector operation for CG solvers.
+
+    This function performs the operation y = alpha * x + beta * y, where
+    the coefficient selected by ``isalpha`` (alpha if True, beta otherwise)
+    is computed as a/b within the task. This avoids unnecessary future
+    operations and memory allocations.
+
+    Parameters
+    ----------
+    y : cupynumeric.ndarray
+        Output vector that will be modified in-place.
+    x : cupynumeric.ndarray
+        Input vector for the operation.
+    a : cupynumeric.ndarray
+        Numerator for computing alpha or beta.
+    b : cupynumeric.ndarray
+        Denominator for computing alpha or beta.
+    isalpha : bool, optional
+        If True, a/b is interpreted as alpha. If False, as beta.
+        Default is True.
+    negate : bool, optional
+        If True, negate the computed coefficient. Default is False.
+
+    Returns
+    -------
+    cupynumeric.ndarray
+        The modified y vector (same as input y).
+
+    Notes
+    -----
+    This is a specialized implementation for CG solvers that fuses
+    coefficient computation with vector operations to avoid unnecessary
+    memory allocations and future operations in the Legion runtime.
+    """
     y_store = get_store_from_cupynumeric_array(y)
     x_store = get_store_from_cupynumeric_array(x)
     task = runtime.create_auto_task(SparseOpCode.AXPBY)
@@ -451,6 +527,29 @@ def cg_axpby(y, x, a, b, isalpha=True, negate=False):
 
 def _get_atol_rtol(b_norm, tol=None, atol=0.0, rtol=1e-5):
+    """Compute absolute and relative tolerances for convergence.
+
+    Parameters
+    ----------
+    b_norm : float
+        Norm of the right-hand side vector.
+    tol : float, optional
+        Legacy tolerance parameter. If provided, overrides rtol.
+    atol : float, optional
+        Absolute tolerance. Default is 0.0.
+    rtol : float, optional
+        Relative tolerance. Default is 1e-5.
+
+    Returns
+    -------
+    tuple
+        (atol, rtol) - computed absolute and relative tolerances.
+
+    Notes
+    -----
+    If atol is None, it is set to rtol. The final atol is the maximum
+    of the provided atol and rtol * b_norm.
+    """
     rtol = float(tol) if tol is not None else rtol
 
     if atol is None:
@@ -473,6 +572,60 @@ def cg(
     rtol=1e-5,
     conv_test_iters=25,
 ):
+    """Solve a linear system using the Conjugate Gradient method.
+
+    Parameters
+    ----------
+    A : sparse matrix or LinearOperator
+        The coefficient matrix of the linear system.
+    b : cupynumeric.ndarray
+        Right-hand side of the linear system.
+    x0 : cupynumeric.ndarray, optional
+        Initial guess for the solution. If None, uses zero vector.
+    tol : float, optional
+        Legacy tolerance parameter. If provided, overrides rtol.
+    maxiter : int, optional
+        Maximum number of iterations. If None, uses 10 * n.
+    M : sparse matrix or LinearOperator, optional
+        Preconditioner for A. If None, uses identity.
+    callback : callable, optional
+        User-specified function called after each iteration.
+    atol : float, optional
+        Absolute tolerance for convergence. Default is 0.0.
+    rtol : float, optional
+        Relative tolerance for convergence. Default is 1e-5.
+    conv_test_iters : int, optional
+        Number of iterations between convergence tests. Default is 25.
+
+    Returns
+    -------
+    tuple
+        (x, info) where x is the solution and info is 0 if the solver
+        converged, otherwise the number of iterations performed.
+
+    Raises
+    ------
+    AssertionError
+        If b is not 1D or A is not square.
+
+    Notes
+    -----
+    This implementation follows SciPy's CG solver semantics closely.
+    The method uses fused vector operations to avoid unnecessary
+    memory allocations and improve performance.
+
+    Convergence is tested every conv_test_iters iterations to avoid
+    the overhead of computing the residual norm in every iteration.
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import csr_array, linalg
+    >>> A = csr_array([[4, 1], [1, 3]])
+    >>> b = np.array([1, 2])
+    >>> x, info = linalg.cg(A, b)
+    >>> print(f"Solution: {x}, Info: {info}")
+    """
     # We keep semantics as close as possible to scipy.cg.
     # https://github.com/scipy/scipy/blob/v1.9.0/scipy/sparse/linalg/_isolve/iterative.py#L298-L385
     assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1)
@@ -503,6 +656,7 @@ def cg(
 
     z = None
     q = None
+    converged = False
     while iters < maxiter:
         z = M.matvec(r, out=z)
         rho1 = rho
@@ -528,10 +682,15 @@ def cg(
         if (iters % conv_test_iters == 0 or iters == (maxiter - 1)) and np.linalg.norm(
             r
         ) < atol:
+            converged = True
             # Test convergence every conv_test_iters iterations.
             break
 
-    return x, iters
+    info = 0
+    if iters == maxiter and not converged:
+        info = iters
+
+    return x, info
 
 
 # This implementation of GMRES is lifted from the cupy implementation:
@@ -550,43 +709,77 @@ def gmres(
     callback_type=None,
     rtol=1e-5,
 ):
-    """Uses Generalized Minimal RESidual iteration to solve ``Ax = b``.
-    Args:
-        A (ndarray, spmatrix or LinearOperator): The real or complex
-            matrix of the linear system with shape ``(n, n)``. ``A`` must be
-            :class:`cupy.ndarray`, :class:`cupyx.scipy.sparse.spmatrix` or
-            :class:`cupyx.scipy.sparse.linalg.LinearOperator`.
-        b (cupy.ndarray): Right hand side of the linear system with shape
-            ``(n,)`` or ``(n, 1)``.
-        x0 (cupy.ndarray): Starting guess for the solution.
-        tol (float): Tolerance for convergence.
This argument is optional, - deprecated in favour of ``rtol``. - restart (int): Number of iterations between restarts. Larger values - increase iteration cost, but may be necessary for convergence. - maxiter (int): Maximum number of iterations. - M (ndarray, spmatrix or LinearOperator): Preconditioner for ``A``. - The preconditioner should approximate the inverse of ``A``. - ``M`` must be :class:`cupy.ndarray`, - :class:`cupyx.scipy.sparse.spmatrix` or - :class:`cupyx.scipy.sparse.linalg.LinearOperator`. - callback (function): User-specified function to call on every restart. - It is called as ``callback(arg)``, where ``arg`` is selected by - ``callback_type``. - callback_type (str): 'x' or 'pr_norm'. If 'x', the current solution - vector is used as an argument of callback function. if 'pr_norm', - relative (preconditioned) residual norm is used as an arugment. - atol, rtol (float): Tolerance for convergence. For convergence, - ``norm(b - A @ x) <= max(rtol*norm(b), atol)`` should be satisfied. - The default is ``atol=0.`` and ``rtol=1e-5``. - Returns: - tuple: - It returns ``x`` (cupy.ndarray) and ``info`` (int) where ``x`` is - the converged solution and ``info`` provides convergence - information. - Reference: - M. Wang, H. Klie, M. Parashar and H. Sudan, "Solving Sparse Linear - Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009). - .. seealso:: :func:`scipy.sparse.linalg.gmres` + """Solve a linear system using the Generalized Minimal Residual method. + + Parameters + ---------- + A : sparse matrix or LinearOperator + The coefficient matrix of the linear system. + b : cupynumeric.ndarray + Right-hand side of the linear system with shape (n,) or (n, 1). + x0 : cupynumeric.ndarray, optional + Starting guess for the solution. If None, uses zero vector. + tol : float, optional + Legacy tolerance parameter. If provided, overrides rtol. + restart : int, optional + Number of iterations between restarts. Larger values increase + iteration cost but may be necessary for convergence. Default is 20. + maxiter : int, optional + Maximum number of iterations. If None, uses 10 * n. + M : sparse matrix or LinearOperator, optional + Preconditioner for A. The preconditioner should approximate + the inverse of A. If None, uses identity. + callback : callable, optional + User-specified function called on every restart. + restrt : int, optional + Deprecated alias for restart parameter. + atol : float, optional + Absolute tolerance for convergence. Default is 0.0. + callback_type : str, optional + Type of callback argument: 'x' for current solution vector, + 'pr_norm' for relative preconditioned residual norm. Default is 'pr_norm'. + rtol : float, optional + Relative tolerance for convergence. Default is 1e-5. + + Returns + ------- + tuple + (x, info) where x is the converged solution and info provides + convergence information. + + Raises + ------ + AssertionError + If b is not 1D or A is not square. + ValueError + If callback_type is not 'x' or 'pr_norm'. + + Notes + ----- + This implementation is adapted from CuPy's GMRES solver. + The method uses Arnoldi iteration to build a Krylov subspace + and solves the least squares problem in that subspace. + + For convergence, the residual must satisfy: + norm(b - A @ x) <= max(rtol * norm(b), atol) + + The restart parameter controls the trade-off between memory usage + and convergence rate. Larger restart values may improve convergence + but require more memory. + + References + ---------- + M. Wang, H. Klie, M. Parashar and H. 
Sudan, "Solving Sparse Linear
+    Systems on NVIDIA Tesla GPUs", ICCS 2009 (2009).
+
+    Examples
+    --------
+    >>> import cupynumeric as np
+    >>> from legate_sparse import csr_array, linalg
+    >>> A = csr_array([[4, 1, 0], [1, 3, 1], [0, 1, 2]])
+    >>> b = np.array([1, 2, 3])
+    >>> x, info = linalg.gmres(A, b, restart=10)
+    >>> print(f"Solution: {x}, Info: {info}")
     """
     assert len(b.shape) == 1 or (len(b.shape) == 2 and b.shape[1] == 1)
     assert len(A.shape) == 2 and A.shape[0] == A.shape[1]
@@ -625,6 +818,27 @@ def gmres(
     e = np.zeros((restart + 1,), dtype=A.dtype)
 
     def compute_hu(u, j):
+        """Compute Householder transformation for Arnoldi iteration.
+
+        Parameters
+        ----------
+        u : cupynumeric.ndarray
+            Vector to be transformed.
+        j : int
+            Current iteration index.
+
+        Returns
+        -------
+        tuple
+            (h, u) where h contains the Householder coefficients and
+            u is the transformed vector.
+
+        Notes
+        -----
+        This function computes the Householder transformation that
+        orthogonalizes the current vector against the previous basis
+        vectors in the Arnoldi iteration.
+        """
         h = V[:, : j + 1].conj().T @ u
         u -= V[:, : j + 1] @ h
         return h, u
diff --git a/legate_sparse/module.py b/legate_sparse/module.py
index a84abb0f..56f22fa1 100644
--- a/legate_sparse/module.py
+++ b/legate_sparse/module.py
@@ -55,16 +55,66 @@
 from .types import coord_ty, nnz_ty  # noqa: F401
 
 
-# is_sparse_matrix returns whether or not an object is a legate
-# sparse created sparse matrix.
-def is_sparse_matrix(o):
-    return any((isinstance(o, csr_array),))
+# Returns whether or not an object is a legate sparse created sparse matrix.
+def _is_sparse_matrix(obj) -> bool:
+    return any((isinstance(obj, csr_array), isinstance(obj, dia_array)))
 
 
+def isspmatrix(obj) -> bool:
+    """Check if an object is a legate sparse matrix.
 
-issparse = is_sparse_matrix
-isspmatrix = is_sparse_matrix
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a legate sparse matrix, False otherwise.
+
+    Notes
+    -----
+    This function checks if the object is an instance of any supported
+    sparse matrix format in legate_sparse. Currently, only the
+    CSR and DIA formats are supported.
+    """
+    return _is_sparse_matrix(obj)
+
+
+def issparse(obj) -> bool:
+    """Check if an object is a legate sparse matrix.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a legate sparse matrix, False otherwise.
+
+    Notes
+    -----
+    This function checks if the object is an instance of any supported
+    sparse matrix format in legate_sparse. Currently, only the
+    CSR and DIA formats are supported.
+    """
+    return _is_sparse_matrix(obj)
 
 
 # Variants for each particular format type.
-def isspmatrix_csr(o):
-    return isinstance(o, csr_array)
+def isspmatrix_csr(obj):
+    """Check if an object is a CSR sparse matrix.
+
+    Parameters
+    ----------
+    obj : object
+        The object to check.
+
+    Returns
+    -------
+    bool
+        True if the object is a CSR sparse matrix, False otherwise.
+    """
+    return isinstance(obj, csr_array)
diff --git a/legate_sparse/types.py b/legate_sparse/types.py
index 376fe116..923767f2 100644
--- a/legate_sparse/types.py
+++ b/legate_sparse/types.py
@@ -18,8 +18,19 @@
 # progress in generalizing the compute kernels, we can
 # remove this code.
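+# Illustrative use of the aliases below (a hypothetical snippet, not part of
+# the library API): they are ordinary numpy dtypes, so they can be passed
+# anywhere a dtype is expected, e.g.
+#
+#   row_indices = numpy.zeros(nnz, dtype=coord_ty)
+#   row_counts  = numpy.zeros(nrows, dtype=nnz_ty)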
coord_ty = numpy.dtype(numpy.int64) +"""Data type for coordinate indices in sparse matrices (int64).""" + nnz_ty = numpy.dtype(numpy.uint64) +"""Data type for non-zero counts in sparse matrices (uint64).""" + float64 = numpy.dtype(numpy.float64) +"""64-bit floating point data type.""" + int32 = numpy.dtype(numpy.int32) +"""32-bit integer data type.""" + int64 = numpy.dtype(numpy.int64) +"""64-bit integer data type.""" + uint64 = numpy.dtype(numpy.uint64) +"""64-bit unsigned integer data type.""" diff --git a/legate_sparse/utils.py b/legate_sparse/utils.py index 11daf9fd..2c072f2b 100644 --- a/legate_sparse/utils.py +++ b/legate_sparse/utils.py @@ -31,11 +31,25 @@ numpy.complex64, numpy.complex128, ) +"""Supported datatypes for sparse matrix operations (SpMV and SpGEMM).""" # find_last_user_stacklevel gets the last stack frame index # within legate sparse. def find_last_user_stacklevel() -> int: + """Find the last stack frame index within legate sparse. + + Returns + ------- + int + The stack level of the last user code frame. + + Notes + ----- + This function walks the stack to find the first frame that is not + within the legate_sparse module, which is useful for determining + the appropriate stack level for warnings. + """ stacklevel = 1 for frame, _ in traceback.walk_stack(None): if not frame.f_globals["__name__"].startswith("sparse"): @@ -46,6 +60,18 @@ def find_last_user_stacklevel() -> int: # store_to_cupynumeric_array converts a store to a cuPyNumeric array. def store_to_cupynumeric_array(store: LogicalStore): + """Convert a LogicalStore to a cupynumeric array. + + Parameters + ---------- + store : LogicalStore + The store to convert. + + Returns + ------- + cupynumeric.ndarray + The cupynumeric array representation of the store. + """ return cupynumeric.asarray(store) @@ -54,6 +80,20 @@ def get_store_from_cupynumeric_array( arr: cupynumeric.ndarray, copy=False, ) -> LogicalStore: + """Extract a LogicalStore from a cupynumeric array. + + Parameters + ---------- + arr : cupynumeric.ndarray + The cupynumeric array to extract the store from. + copy : bool, optional + Whether to create a copy of the array first. Default is False. + + Returns + ------- + LogicalStore + The LogicalStore representation of the array. + """ if copy: # If requested to make a copy, do so. arr = cupynumeric.array(arr) @@ -67,6 +107,23 @@ def get_store_from_cupynumeric_array( # cast_to_store attempts to cast an arbitrary object into a store. def cast_to_store(arr): + """Cast an arbitrary object to a LogicalStore. + + Parameters + ---------- + arr : array_like or LogicalStore + The object to cast. + + Returns + ------- + LogicalStore + The LogicalStore representation of the input. + + Raises + ------ + NotImplementedError + If the object cannot be cast to a LogicalStore. + """ if isinstance(arr, LogicalStore): return arr if isinstance(arr, numpy.ndarray): @@ -79,6 +136,20 @@ def cast_to_store(arr): # cast_arr attempts to cast an arbitrary object into a cupynumeric # ndarray, with an optional desired type. def cast_arr(arr, dtype=None): + """Cast an arbitrary object to a cupynumeric array. + + Parameters + ---------- + arr : array_like or LogicalStore + The object to cast. + dtype : dtype, optional + The desired data type. If None, preserves the original type. + + Returns + ------- + cupynumeric.ndarray + The cupynumeric array representation of the input. 
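+
+    For example (an illustrative sketch; a list input is converted to a
+    cupynumeric array and, if dtype is given, cast to it)::
+
+        vals = cast_arr([1.0, 2.0, 3.0])
+        vals = cast_arr(vals, dtype=numpy.float32)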
+ """ if isinstance(arr, LogicalStore): arr = store_to_cupynumeric_array(arr) elif not isinstance(arr, cupynumeric.ndarray): @@ -88,14 +159,32 @@ def cast_arr(arr, dtype=None): return arr -# find_common_type performs a similar analysis to -# cupynumeric.ndarray.find_common_type to find a common type -# between all of the arguments. def find_common_type(*args): + """Find the common data type for a set of arrays. + + This function performs a similar analysis to cupynumeric.ndarray.find_common_type + to find a common type between all of the arguments. + + Parameters + ---------- + *args : array_like + Arrays to find the common type for. + + Returns + ------- + numpy.dtype + The common data type that can represent all input arrays. + + Notes + ----- + The function handles sparse matrices, dense arrays, and scalars. + For sparse matrices, it uses their dtype. For scalars (size == 1), + they are treated separately from arrays. + """ array_types = list() scalar_types = list() for array in args: - if legate_sparse.is_sparse_matrix(array): + if legate_sparse.isspmatrix(array): array_types.append(array.dtype) elif array.size == 1: scalar_types.append(array.dtype) @@ -104,18 +193,47 @@ def find_common_type(*args): return numpy.result_type(*array_types, *scalar_types) -# cast_to_common_type casts all arguments to the same common dtype. def cast_to_common_type(*args): - # Find a common type for all of the arguments. + """Cast all arguments to the same common data type. + + Parameters + ---------- + *args : array_like + Arrays to cast to a common type. + + Returns + ------- + tuple + Tuple of arrays, all cast to the same common data type. + + Notes + ----- + This function first finds the common type using find_common_type, + then casts each input to that type. If all arguments are already + the common type, this will be a no-op. + """ common_type = find_common_type(*args) - # Cast each input to the common type. Ideally, if all of the - # arguments are already the common type then this will - # be a no-op. return tuple(arg.astype(common_type, copy=False) for arg in args) -# factor_int decomposes an integer into a close to square grid. def factor_int(n): + """Split an integer into two close factors. + + Parameters + ---------- + n : int + The integer to factor. + + Returns + ------- + tuple + (val, val2) where val * val2 = n and val is close to sqrt(n). + + Notes + ----- + This function finds two factors of n such that their product equals n + and the first factor is close to the square root of n. + """ val = math.ceil(math.sqrt(n)) val2 = int(n / val) while val2 * val != float(n): @@ -124,9 +242,31 @@ def factor_int(n): return val, val2 -# broadcast_store broadcasts a store to the desired input shape, -# or throws an error if the broadcast is not possible. def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: + """Broadcast a LogicalStore to the desired shape. + + Parameters + ---------- + store : LogicalStore + The store to broadcast. + shape : tuple + The target shape to broadcast to. + + Returns + ------- + LogicalStore + The broadcasted store. + + Raises + ------ + ValueError + If the broadcast is not possible. + + Notes + ----- + This function handles both dimension promotion (adding new dimensions) + and broadcasting (expanding dimensions of size 1). 
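+
+    For example, to broadcast a store of shape (1, 4) to (3, 4), the
+    size-1 dimension is replaced by a promoted view; a sketch of the kind
+    of transform this function applies (assuming Legate's project/promote
+    store operations; illustrative only)::
+
+        store = store.project(0, 0).promote(0, 3)  # (1, 4) -> (3, 4)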
+ """ diff = len(shape) - store.ndim for dim in range(diff): store = store.promote(dim, shape[dim]) @@ -142,13 +282,43 @@ def broadcast_store(store: LogicalStore, shape: Any) -> LogicalStore: def copy_store(store: LogicalStore) -> LogicalStore: + """Create a copy of a LogicalStore. + + Parameters + ---------- + store : LogicalStore + The store to copy. + + Returns + ------- + LogicalStore + A new LogicalStore with the same data as the input. + """ res = runtime.create_store(store.type, store.shape) # type: ignore runtime.legate_runtime.issue_copy(res, store) return res def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore - "Get LogicalStore from a LogicalStore or array, potentially creating a copy" + """Get LogicalStore from a LogicalStore or array, potentially creating a copy. + + Parameters + ---------- + src : LogicalStore or cupynumeric.ndarray + The source object to convert. + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + LogicalStore + The LogicalStore representation of the input. + + Raises + ------ + AssertionError + If the input type is not supported. + """ if isinstance(src, cupynumeric.ndarray): return get_store_from_cupynumeric_array(src, copy) elif isinstance(src, LogicalStore): @@ -158,7 +328,25 @@ def store_from_store_or_array(src, copy=False) -> LogicalStore: # type: ignore def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: ignore - "Get array from a LogicalStore or array, potentially creating a copy" + """Get array from a LogicalStore or array, potentially creating a copy. + + Parameters + ---------- + src : LogicalStore or cupynumeric.ndarray + The source object to convert. + copy : bool, optional + Whether to create a copy. Default is False. + + Returns + ------- + cupynumeric.ndarray + The cupynumeric array representation of the input. + + Raises + ------ + AssertionError + If the input type is not supported. + """ if isinstance(src, cupynumeric.ndarray): return src.copy() if copy else src elif isinstance(src, LogicalStore): @@ -173,6 +361,23 @@ def array_from_store_or_array(src, copy=False) -> cupynumeric.ndarray: # type: def get_storage_type(src): + """Get the storage type of an object. + + Parameters + ---------- + src : LogicalStore or cupynumeric.ndarray + The object to get the storage type for. + + Returns + ------- + numpy.dtype + The data type of the object. + + Raises + ------ + AssertionError + If the input type is not supported. + """ if isinstance(src, cupynumeric.ndarray): return src.dtype elif isinstance(src, LogicalStore): @@ -185,33 +390,58 @@ def get_storage_type(src): def is_dtype_supported(dtype: numpy.dtype) -> bool: - """ - Does this datatype support spMV and spGEMM operations + """Check if a datatype supports SpMV and SpGEMM operations. Parameters ---------- - dtype: np.dtype - Input datatype to check if it supports spMV and spGEMM + dtype : numpy.dtype + Input datatype to check if it supports SpMV and SpGEMM. Returns ------- - valid: bool - True if dtype supports spMV and spGEMM - """ + bool + True if dtype supports SpMV and SpGEMM operations. + Notes + ----- + Currently supported datatypes are float32, float64, complex64, and complex128. + """ return dtype in SUPPORTED_DATATYPES def is_dense(x) -> bool: - """ - Is this object a dense cupynumeric array + """Check if an object is a dense cupynumeric array. + + Parameters + ---------- + x : object + The object to check. 
+ + Returns + ------- + bool + True if x is a cupynumeric.ndarray, False otherwise. """ return isinstance(x, cupynumeric.ndarray) def is_scalar_like(x) -> bool: - """ - Is this object a scalar like type + """Check if an object is a scalar-like type. + + Parameters + ---------- + x : object + The object to check. + + Returns + ------- + bool + True if x is a scalar or 0-dimensional array, False otherwise. + + Notes + ----- + This function returns False for strings, even though they are scalar-like + in some contexts, to avoid confusion with numeric scalars. """ if isinstance(x, str): return False @@ -219,32 +449,52 @@ def is_scalar_like(x) -> bool: def is_sparse(x) -> bool: + """Check if an object is a legate sparse matrix. + + Parameters + ---------- + x : object + The object to check. + + Returns + ------- + bool + True if x is a legate sparse matrix, False otherwise. """ - Is this object a legate sparse matrix - """ - return legate_sparse.is_sparse_matrix(x) + return legate_sparse.isspmatrix(x) def sort_by_rows_then_cols(rows: cupynumeric.ndarray, cols: cupynumeric.ndarray): - """ + """Sort indices by rows first, then by columns. + This function is a quick and dirty hack that does what np.lexsort does - using argsort, but only for two keys. - This is primarily used to to get the indices that we can use to sort data - first by rows and then by columns + using argsort, but only for two keys. This is primarily used to get + the indices that we can use to sort data first by rows and then by columns. Parameters ---------- - - rows: cupynumeric.ndarray - Indices of rows - - cols: cupynumeric.ndarray - Indices of cols + rows : cupynumeric.ndarray + Indices of rows. + cols : cupynumeric.ndarray + Indices of columns. Returns ------- - sorted_indices:cupynumeric.ndarray - Indices sorted by rows and then by columns, as given by numpy's lexsort + cupynumeric.ndarray + Indices sorted by rows and then by columns, as given by numpy's lexsort. + + Notes + ----- + This function is equivalent to np.lexsort((cols, rows)) but implemented + using stable sorting to ensure consistent results. 
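+
+    A minimal sketch of the two-key trick (illustrative, not the exact
+    implementation): argsort the secondary key first, then stably argsort
+    the primary key on top of it::
+
+        order = cols.argsort(kind="stable")
+        order = order[rows[order].argsort(kind="stable")]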
+ + Examples + -------- + >>> import cupynumeric as np + >>> rows = np.array([1, 0, 1, 0]) + >>> cols = np.array([2, 1, 1, 2]) + >>> indices = sort_by_rows_then_cols(rows, cols) + >>> print(indices) # [1, 3, 2, 0] - sorted by (row, col) """ assert rows.size == cols.size diff --git a/setup.py b/setup.py index daa17216..68efb75c 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ setup( name="legate-sparse", - version="25.03.00", + version="25.07.00", description="An Aspiring Drop-In Replacement for SciPy Sparse module at Scale", author="NVIDIA Corporation", license="Apache 2.0", @@ -48,8 +48,9 @@ "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ], packages=find_packages( where=".", diff --git a/src/legate_sparse/array/conv/csr_to_dense.cc b/src/legate_sparse/array/conv/csr_to_dense.cc index d865a805..de9a8958 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.cc +++ b/src/legate_sparse/array/conv/csr_to_dense.cc @@ -55,7 +55,11 @@ struct CSRToDenseImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { CSRToDense::register_variants(); } + +static const auto sparse_reg_task_ = []() -> char { + CSRToDense::register_variants(); + return 0; +}(); } // namespace diff --git a/src/legate_sparse/array/conv/csr_to_dense.h b/src/legate_sparse/array/conv/csr_to_dense.h index 7b5a947e..58ec4479 100644 --- a/src/legate_sparse/array/conv/csr_to_dense.h +++ b/src/legate_sparse/array/conv/csr_to_dense.h @@ -31,7 +31,8 @@ struct CSRToDenseArgs { class CSRToDense : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_TO_DENSE}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_TO_DENSE}}; public: static void cpu_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/conv/dense_to_csr.cc b/src/legate_sparse/array/conv/dense_to_csr.cc index 97a86fe7..3304b558 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.cc +++ b/src/legate_sparse/array/conv/dense_to_csr.cc @@ -77,11 +77,12 @@ struct DenseToCSRImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ + +static const auto sparse_reg_task_ = []() -> char { DenseToCSRNNZ::register_variants(); DenseToCSR::register_variants(); -} + return 0; +}(); } // namespace diff --git a/src/legate_sparse/array/conv/dense_to_csr.h b/src/legate_sparse/array/conv/dense_to_csr.h index a0ebddc0..c9cf504a 100644 --- a/src/legate_sparse/array/conv/dense_to_csr.h +++ b/src/legate_sparse/array/conv/dense_to_csr.h @@ -29,7 +29,8 @@ struct DenseToCSRNNZArgs { class DenseToCSRNNZ : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR_NNZ}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR_NNZ}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); @@ -48,7 +49,8 @@ struct DenseToCSRArgs { class DenseToCSR : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_DENSE_TO_CSR}}; static void 
cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.cc b/src/legate_sparse/array/conv/pos_to_coordinates.cc index 6b781134..7cadb10e 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.cc +++ b/src/legate_sparse/array/conv/pos_to_coordinates.cc @@ -44,10 +44,10 @@ struct ExpandPosToCoordinatesImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { ExpandPosToCoordinates::register_variants(); -} + return 0; +}(); } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/conv/pos_to_coordinates.h b/src/legate_sparse/array/conv/pos_to_coordinates.h index 70e351a6..ad21ff95 100644 --- a/src/legate_sparse/array/conv/pos_to_coordinates.h +++ b/src/legate_sparse/array/conv/pos_to_coordinates.h @@ -29,7 +29,8 @@ struct ExpandPosToCoordinatesArgs { class ExpandPosToCoordinates : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_EXPAND_POS_TO_COORDINATES}}; public: static void cpu_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/csr/get_diagonal.cc b/src/legate_sparse/array/csr/get_diagonal.cc index 1e39b82b..cace6438 100644 --- a/src/legate_sparse/array/csr/get_diagonal.cc +++ b/src/legate_sparse/array/csr/get_diagonal.cc @@ -50,10 +50,11 @@ struct GetCSRDiagonalImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { GetCSRDiagonal::register_variants(); -} + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/csr/get_diagonal.h b/src/legate_sparse/array/csr/get_diagonal.h index 6dd842bf..0c3d44a7 100644 --- a/src/legate_sparse/array/csr/get_diagonal.h +++ b/src/legate_sparse/array/csr/get_diagonal.h @@ -32,7 +32,8 @@ struct GetCSRDiagonalArgs { class GetCSRDiagonal : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_DIAGONAL}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_DIAGONAL}}; // TODO (rohany): We could rewrite this having each implementation just make // a call to thrust::transform, but the implementations are simple enough // anyway. 
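The GetCSRDiagonal task above backs the Python-level diagonal() method on
csr_array, which is exercised by tests/integration/test_diagonal.py later in
this patch. A minimal usage sketch (illustrative; mirrors the constructor
form used in the linalg.cg docstring example):

    from legate_sparse import csr_array

    A = csr_array([[4.0, 1.0], [1.0, 3.0]])
    d = A.diagonal()  # expected: array([4., 3.])
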
diff --git a/src/legate_sparse/array/csr/indexing.cc b/src/legate_sparse/array/csr/indexing.cc
index 8fc0c11b..f40c901b 100644
--- a/src/legate_sparse/array/csr/indexing.cc
+++ b/src/legate_sparse/array/csr/indexing.cc
@@ -90,10 +90,11 @@ struct CSRIndexingCSRImplBody {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   CSRIndexingCSR::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/csr/indexing.h b/src/legate_sparse/array/csr/indexing.h
index 7bd6240c..8962370c 100644
--- a/src/legate_sparse/array/csr/indexing.h
+++ b/src/legate_sparse/array/csr/indexing.h
@@ -33,7 +33,8 @@ struct CSRIndexingCSRArgs {
 
 class CSRIndexingCSR : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_INDEXING_CSR}};
 
   // TODO: The implementations of the three variants below are
   // identical and hence need to be templated (DRY)
diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
index 47ed6d34..6c4945de 100644
--- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
+++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.cc
@@ -177,12 +177,13 @@ struct SpGEMMCSRxCSRxCSRImplBody {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   SpGEMMCSRxCSRxCSRNNZ::register_variants();
   SpGEMMCSRxCSRxCSR::register_variants();
   SpGEMMCSRxCSRxCSRGPU::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h
index bf5d526d..c1b004ce 100644
--- a/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h
+++ b/src/legate_sparse/array/csr/spgemm_csr_csr_csr.h
@@ -32,7 +32,8 @@ struct SpGEMMCSRxCSRxCSRNNZArgs {
 
 class SpGEMMCSRxCSRxCSRNNZ : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_NNZ}};
 
   static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
@@ -60,7 +61,8 @@ struct SpGEMMCSRxCSRxCSRArgs {
 
 class SpGEMMCSRxCSRxCSR : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR}};
 
   static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
@@ -94,7 +96,8 @@ struct SpGEMMCSRxCSRxCSRGPUArgs {
 
 // we take a different approach than on CPUs and OMPs.
class SpGEMMCSRxCSRxCSRGPU : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SPGEMM_CSR_CSR_CSR_GPU}}; static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/legate_sparse/array/csr/spmv.cc b/src/legate_sparse/array/csr/spmv.cc index 63c305c6..d9efa4fd 100644 --- a/src/legate_sparse/array/csr/spmv.cc +++ b/src/legate_sparse/array/csr/spmv.cc @@ -51,10 +51,11 @@ struct CSRSpMVRowSplitImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { CSRSpMVRowSplit::register_variants(); -} + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/csr/spmv.h b/src/legate_sparse/array/csr/spmv.h index 8c46ba7f..7d718990 100644 --- a/src/legate_sparse/array/csr/spmv.h +++ b/src/legate_sparse/array/csr/spmv.h @@ -32,7 +32,8 @@ struct CSRSpMVRowSplitArgs { class CSRSpMVRowSplit : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_CSR_SPMV_ROW_SPLIT}}; static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS = legate::VariantOptions{}.with_has_allocations(true); diff --git a/src/legate_sparse/array/util/scale_rect.cc b/src/legate_sparse/array/util/scale_rect.cc index bad54157..c2d2df90 100644 --- a/src/legate_sparse/array/util/scale_rect.cc +++ b/src/legate_sparse/array/util/scale_rect.cc @@ -39,7 +39,11 @@ struct ScaleRect1ImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ScaleRect1::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + ScaleRect1::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/util/scale_rect.h b/src/legate_sparse/array/util/scale_rect.h index e9e7ffda..0b559036 100644 --- a/src/legate_sparse/array/util/scale_rect.h +++ b/src/legate_sparse/array/util/scale_rect.h @@ -29,7 +29,8 @@ struct ScaleRect1Args { class ScaleRect1 : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_SCALE_RECT_1}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_SCALE_RECT_1}}; static void cpu_variant(legate::TaskContext context); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext context); diff --git a/src/legate_sparse/array/util/unzip_rect.cc b/src/legate_sparse/array/util/unzip_rect.cc index 9a9e8708..1272e9cc 100644 --- a/src/legate_sparse/array/util/unzip_rect.cc +++ b/src/legate_sparse/array/util/unzip_rect.cc @@ -42,7 +42,11 @@ struct UnZipRect1ImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { UnZipRect1::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + UnZipRect1::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/util/unzip_rect.h b/src/legate_sparse/array/util/unzip_rect.h index e470c541..08293ef2 100644 --- a/src/legate_sparse/array/util/unzip_rect.h +++ b/src/legate_sparse/array/util/unzip_rect.h @@ -30,7 +30,8 @@ struct UnZipRect1Args { class UnZipRect1 : public SparseTask { public: 
- static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UNZIP_RECT_1}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UNZIP_RECT_1}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/array/util/zip_to_rect.cc b/src/legate_sparse/array/util/zip_to_rect.cc index 39634664..c8871583 100644 --- a/src/legate_sparse/array/util/zip_to_rect.cc +++ b/src/legate_sparse/array/util/zip_to_rect.cc @@ -41,7 +41,11 @@ struct ZipToRect1ImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { ZipToRect1::register_variants(); } +static const auto sparse_reg_task_ = []() -> char { + ZipToRect1::register_variants(); + return 0; +}(); + } // namespace } // namespace sparse diff --git a/src/legate_sparse/array/util/zip_to_rect.h b/src/legate_sparse/array/util/zip_to_rect.h index 3851a195..4bc5ac70 100644 --- a/src/legate_sparse/array/util/zip_to_rect.h +++ b/src/legate_sparse/array/util/zip_to_rect.h @@ -30,7 +30,8 @@ struct ZipToRect1Args { class ZipToRect1 : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_ZIP_TO_RECT_1}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_ZIP_TO_RECT_1}}; static void cpu_variant(legate::TaskContext ctx); #ifdef LEGATE_USE_OPENMP static void omp_variant(legate::TaskContext ctx); diff --git a/src/legate_sparse/cudalibs.cu b/src/legate_sparse/cudalibs.cu index 733ea8f1..6ec45bd5 100644 --- a/src/legate_sparse/cudalibs.cu +++ b/src/legate_sparse/cudalibs.cu @@ -77,7 +77,8 @@ cusparseHandle_t get_cusparse() class LoadCUDALibsTask : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_LOAD_CUDALIBS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_LOAD_CUDALIBS}}; public: static void gpu_variant(legate::TaskContext context) @@ -90,7 +91,8 @@ class LoadCUDALibsTask : public SparseTask { class UnloadCUDALibsTask : public SparseTask { public: - static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UNLOAD_CUDALIBS}; + static inline const auto TASK_CONFIG = + legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UNLOAD_CUDALIBS}}; public: static void gpu_variant(legate::TaskContext context) @@ -101,10 +103,10 @@ class UnloadCUDALibsTask : public SparseTask { } }; -static void __attribute__((constructor)) register_tasks(void) -{ +static const auto sparse_reg_task_ = []() -> char { LoadCUDALibsTask::register_variants(); UnloadCUDALibsTask::register_variants(); -} + return 0; +}(); } // namespace sparse diff --git a/src/legate_sparse/io/mtx_to_coo.cc b/src/legate_sparse/io/mtx_to_coo.cc index d2c85667..71afd22e 100644 --- a/src/legate_sparse/io/mtx_to_coo.cc +++ b/src/legate_sparse/io/mtx_to_coo.cc @@ -35,13 +35,13 @@ using val_ty = double; // within DISTAL. assert(ctx.is_single_task()); // Regardless of how inputs are added, scalar future return values are at the front. 
-  auto& m_store   = ctx.outputs()[0];
-  auto& n_store   = ctx.outputs()[1];
-  auto& nnz_store = ctx.outputs()[2];
-  auto& rows      = ctx.outputs()[3];
-  auto& cols      = ctx.outputs()[4];
-  auto& vals      = ctx.outputs()[5];
-  auto filename   = ctx.scalars()[0].value<std::string>();
+  auto m_store   = ctx.output(0);
+  auto n_store   = ctx.output(1);
+  auto nnz_store = ctx.output(2);
+  auto rows      = ctx.output(3);
+  auto cols      = ctx.output(4);
+  auto vals      = ctx.output(5);
+  auto filename  = ctx.scalar(0).value<std::string>();
 
   std::fstream file;
   file.open(filename, std::fstream::in);
@@ -148,7 +148,11 @@ using val_ty = double;
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { ReadMTXToCOO::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  ReadMTXToCOO::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/io/mtx_to_coo.h b/src/legate_sparse/io/mtx_to_coo.h
index 4dde28fa..c8e1d7ff 100644
--- a/src/legate_sparse/io/mtx_to_coo.h
+++ b/src/legate_sparse/io/mtx_to_coo.h
@@ -24,7 +24,8 @@ namespace sparse {
 
 class ReadMTXToCOO : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_READ_MTX_TO_COO};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_READ_MTX_TO_COO}};
 
   static constexpr legate::VariantOptions CPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/legate_sparse/linalg/axpby.cc b/src/legate_sparse/linalg/axpby.cc
index 1e61a1bf..547ad927 100644
--- a/src/legate_sparse/linalg/axpby.cc
+++ b/src/legate_sparse/linalg/axpby.cc
@@ -52,7 +52,11 @@ struct AXPBYImplBody {
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void) { AXPBY::register_variants(); }
+static const auto sparse_reg_task_ = []() -> char {
+  AXPBY::register_variants();
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/linalg/axpby.h b/src/legate_sparse/linalg/axpby.h
index 132c9f22..256e8070 100644
--- a/src/legate_sparse/linalg/axpby.h
+++ b/src/legate_sparse/linalg/axpby.h
@@ -33,7 +33,8 @@ struct AXPBYArgs {
 
 class AXPBY : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_AXPBY};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_AXPBY}};
 
   static void cpu_variant(legate::TaskContext ctx);
 #ifdef LEGATE_USE_OPENMP
   static void omp_variant(legate::TaskContext ctx);
diff --git a/src/legate_sparse/mapper/mapper.cc b/src/legate_sparse/mapper/mapper.cc
index 43ed8b14..6357d898 100644
--- a/src/legate_sparse/mapper/mapper.cc
+++ b/src/legate_sparse/mapper/mapper.cc
@@ -32,9 +32,9 @@ std::vector<StoreMapping> LegateSparseMapper::store_mappings(
   const Task& task, const std::vector<StoreTarget>& options)
 {
   const auto& inputs = task.inputs();
-  std::vector<StoreMapping> mappings(inputs.size());
+  std::vector<StoreMapping> mappings;
   for (size_t i = 0; i < inputs.size(); i++) {
-    mappings[i] = StoreMapping::default_mapping(inputs[i].data(), options.front());
+    mappings.push_back(StoreMapping::default_mapping(inputs[i].data(), options.front()));
   }
   return std::move(mappings);
 }
@@ -64,9 +64,10 @@ std::optional<std::size_t> LegateSparseMapper::allocation_pool_size(const Task&
       auto crd  = task.inputs()[1];
       auto vals = task.inputs()[2];
 
-      std::size_t nrows_plus_one   = pos.domain().get_volume() + 1;
-      std::size_t nnz              = vals.domain().get_volume();
-      std::size_t factor_of_safety = 1.15;  // make sure we don't fail; 1.15 is arbitrary
+      std::size_t nrows_plus_one = pos.domain().get_volume() + 1;
+      std::size_t nnz            = vals.domain().get_volume();
+      // make sure we don't fail; 1.15 is arbitrary (kept as a double so the
+      // multiplier is not truncated to 1 by integer conversion)
+      double factor_of_safety = 1.15;
 
       std::size_t cusparseSpMV_buffer_size =
         factor_of_safety * std::ceil(nnz / 32.0) * sizeof(double);
       std::size_t legate_buffer_size = nrows_plus_one * (vals.type().size() + crd.type().size());
@@ -124,10 +125,13 @@ std::optional<std::size_t> LegateSparseMapper::allocation_pool_size(const Task&
       // and then update the estimate here
       return std::nullopt;
     }
-  }
-  LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id));
-  return {};
+    default: {
+      // Handle any unhandled enum values
+      LEGATE_ABORT("Unsupported Legate Sparse task_id: " + std::to_string(task_id));
+      return {};
+    }
+  }
 }
 
 Scalar LegateSparseMapper::tunable_value(legate::TunableID tunable_id)
diff --git a/src/legate_sparse/partition/fast_image_partition.cc b/src/legate_sparse/partition/fast_image_partition.cc
index c9ee27ef..13801faa 100644
--- a/src/legate_sparse/partition/fast_image_partition.cc
+++ b/src/legate_sparse/partition/fast_image_partition.cc
@@ -23,10 +23,10 @@ using namespace legate;
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+static const auto sparse_reg_task_ = []() -> char {
   FastImageRange::register_variants();
-}
+  return 0;
+}();
 }  // namespace
 
-}  // namespace sparse
\ No newline at end of file
+}  // namespace sparse
diff --git a/src/legate_sparse/partition/fast_image_partition.h b/src/legate_sparse/partition/fast_image_partition.h
index 2d09ff08..992843c5 100644
--- a/src/legate_sparse/partition/fast_image_partition.h
+++ b/src/legate_sparse/partition/fast_image_partition.h
@@ -31,7 +31,8 @@ struct FastImageRangeArgs {
 
 // only for CSR SpGEMM on GPU right now
 class FastImageRange : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_FAST_IMAGE_RANGE};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_FAST_IMAGE_RANGE}};
 
   static constexpr legate::VariantOptions GPU_VARIANT_OPTIONS =
     legate::VariantOptions{}.with_has_allocations(true);
diff --git a/src/legate_sparse/util/upcast_future.cc b/src/legate_sparse/util/upcast_future.cc
index c7e8f4ec..ad66b755 100644
--- a/src/legate_sparse/util/upcast_future.cc
+++ b/src/legate_sparse/util/upcast_future.cc
@@ -82,10 +82,12 @@ void upcast_impl(legate::TaskContext ctx)
 
 namespace  // unnamed
 {
-static void __attribute__((constructor)) register_tasks(void)
-{
+
+static const auto sparse_reg_task_ = []() -> char {
   UpcastFutureToRegion::register_variants();
-}
+  return 0;
+}();
+
 }  // namespace
 
 }  // namespace sparse
diff --git a/src/legate_sparse/util/upcast_future.h b/src/legate_sparse/util/upcast_future.h
index 7c78df88..b38dbab9 100644
--- a/src/legate_sparse/util/upcast_future.h
+++ b/src/legate_sparse/util/upcast_future.h
@@ -24,7 +24,8 @@ namespace sparse {
 
 class UpcastFutureToRegion : public SparseTask {
  public:
-  static constexpr auto TASK_ID = legate::LocalTaskID{LEGATE_SPARSE_UPCAST_FUTURE_TO_REGION};
+  static inline const auto TASK_CONFIG =
+    legate::TaskConfig{legate::LocalTaskID{LEGATE_SPARSE_UPCAST_FUTURE_TO_REGION}};
 
   static void cpu_variant(legate::TaskContext ctx);
 
 private:
diff --git a/test.py b/test.py
index 581dac4b..72165ebf 100755
--- a/test.py
+++ b/test.py
@@ -44,5 +44,4 @@ def stage_env(self, feature: FeatureType) -> EnvDict:
 
     plan = TestPlan(config, system)
 
-    plan.execute()
-    sys.exit(0)
+    sys.exit(plan.execute())
diff --git
a/tests/integration/conftest.py b/tests/integration/conftest.py index 8c7ffa2f..a8b2f17e 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -8,8 +8,30 @@ @pytest.fixture def create_mask(): - """ - Create a boolean mask matrix with a random sparsity pattern + """Create a boolean mask matrix with a random sparsity pattern. + + This fixture creates equivalent boolean mask matrices in both SciPy and + Legate Sparse formats for testing purposes. + + Parameters + ---------- + rows : int + Number of rows (and columns) in the square matrix. + density : float, optional + Density of non-zero elements. Default is 0.3. + + Returns + ------- + tuple + (A_scipy, A_sparse) - Equivalent boolean matrices in SciPy and + Legate Sparse formats. + + Notes + ----- + The fixture ensures that both matrices have identical sparsity patterns + and values. It verifies equivalence by converting both to dense format + and checking that they are numerically close. + """ def _create_mask(rows, density=0.3): @@ -39,8 +61,32 @@ def _create_mask(rows, density=0.3): @pytest.fixture def create_matrix(): - """ - Create matrices in SciPy and Legate Sparse that are equivalent + """Create matrices in SciPy and Legate Sparse that are equivalent. + + This fixture creates equivalent sparse matrices in both SciPy and + Legate Sparse formats for testing purposes. + + Parameters + ---------- + N : int + Number of rows (and columns) in the square matrix. + tol : float, optional + Threshold for sparsity. Values below this threshold are set to zero. + Default is 0.5. + + Returns + ------- + tuple + (A_scipy, A_sparse) - Equivalent sparse matrices in SciPy and + Legate Sparse formats. + + Notes + ----- + The fixture uses simple_system_gen to create a dense matrix, then + converts it to sparse format in both libraries. It verifies equivalence + by converting both to dense format and checking that they are numerically + close. + """ def _create_matrix(N, tol=0.5): diff --git a/tests/integration/test_cg_solve.py b/tests/integration/test_cg_solve.py index fe8b3578..d8e046e3 100644 --- a/tests/integration/test_cg_solve.py +++ b/tests/integration/test_cg_solve.py @@ -21,8 +21,26 @@ def test_cg_solve(): - N, D = 1000, 1000 - seed = 471014 + """Test conjugate gradient solver with a positive definite matrix. + + This test verifies that the conjugate gradient solver correctly + solves the linear system Ax = b for a positive definite matrix A. + + Notes + ----- + The test creates a random sparse matrix A and ensures it is positive + definite by: + 1. Making it symmetric: A = 0.5 * (A + A.T) + 2. Adding a multiple of the identity: A = A + N * I + + It then generates a random solution vector x and computes b = Ax. + The CG solver is used to solve Ax = b, and the result is verified + by checking that A * x_pred ≈ b. + + The test uses a tolerance of 1e-8 for convergence and verification. + """ + N, D = 20, 20 + seed = 42 A = sample_dense(N, D, 0.1, seed) A = 0.5 * (A + A.T) A = A + N * np.eye(N) @@ -36,8 +54,28 @@ def test_cg_solve(): def test_cg_solve_with_callback(): - N, D = 1000, 1000 - seed = 471014 + """Test conjugate gradient solver with a callback function. + + This test verifies that the conjugate gradient solver correctly + handles callback functions during iteration. + + Notes + ----- + The test creates a positive definite matrix and solves the linear + system Ax = b using CG with a callback function. The callback + computes the residual at each iteration and stores it in a list. + + The test verifies that: + 1. 
The solver converges to the correct solution + 2. The callback function is called during iteration + 3. The residuals are computed correctly + + This ensures that the callback mechanism works properly and can + be used for monitoring convergence or implementing custom stopping + criteria. + """ + N, D = 20, 20 + seed = 42 A = sample_dense(N, D, 0.1, seed) A = 0.5 * (A + A.T) A = A + N * np.eye(N) @@ -59,8 +97,8 @@ def callback(x): # def test_cg_solve_with_identity_preconditioner(): -# N, D = 1000, 1000 -# seed = 471014 +# N, D = 20, 20 +# seed = 42 # A = sample_dense(N, D, 0.1, seed) # A = 0.5 * (A + A.T) # A = A + N * np.eye(N) @@ -75,8 +113,28 @@ def callback(x): def test_cg_solve_with_linear_operator(): - N, D = 1000, 1000 - seed = 471014 + """Test conjugate gradient solver with LinearOperator objects. + + This test verifies that the conjugate gradient solver correctly + works with LinearOperator objects that provide matrix-vector + multiplication functionality. + + Notes + ----- + The test creates a positive definite matrix A and wraps it in + a LinearOperator object. It then solves the linear system using + CG with the LinearOperator instead of the sparse matrix directly. + + The test verifies two different LinearOperator implementations: + 1. Using the @ operator: matvec(x) = A @ x + 2. Using the dot method: matvec(x, out=None) = A.dot(x, out=out) + + This ensures that the solver can work with any object that provides + the required matrix-vector multiplication interface, not just + sparse matrices. + """ + N, D = 20, 20 + seed = 42 A = sample_dense(N, D, 0.1, seed) A = 0.5 * (A + A.T) A = A + N * np.eye(N) diff --git a/tests/integration/test_comparison.py b/tests/integration/test_comparison.py index 03c28128..3e58d9c6 100644 --- a/tests/integration/test_comparison.py +++ b/tests/integration/test_comparison.py @@ -33,18 +33,32 @@ @pytest.mark.parametrize("threshold", [0.3, 0.5]) @pytest.mark.parametrize("op_name, op_func", COMPARISON_OPS) def test_comparison_operation(N, threshold, op_name, op_func): - """Test element-wise comparison operations on non-zero entries of the matrix + """Test element-wise comparison operations on non-zero entries of the matrix. + + This test verifies that comparison operations work correctly on sparse + matrices by comparing results with dense matrix operations. Parameters ---------- N : int - Size of the test matrix + Size of the test matrix. threshold : float - Value to compare against + Value to compare against. op_name : str - Name of the comparison operation + Name of the comparison operation. op_func : callable - The comparison function to test + The comparison function to test. + + Notes + ----- + The test creates a sparse matrix and applies a comparison operation + against a threshold value. It then compares the number of True values + in the sparse result with the dense result (considering only non-zero + elements). + + This verifies that sparse comparison operations produce the same + logical result as dense operations when applied to non-zero elements. + """ A_dense, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7) @@ -58,12 +72,30 @@ def test_comparison_operation(N, threshold, op_name, op_func): def test_comparison_error_cases(op_name, op_func): """Test error cases for comparison operations. + This test verifies that comparison operations properly handle invalid + input types by raising appropriate exceptions. + Parameters ---------- op_name : str - Name of the comparison operation + Name of the comparison operation. 
op_func : callable - The comparison function to test + The comparison function to test. + + Notes + ----- + The test attempts to compare a sparse matrix with various invalid + types including: + - 1D arrays + - 2D arrays + - Strings + - Lists + + All of these should raise AssertionError since sparse matrix + comparison operations only support scalar values. + + This ensures that the implementation properly validates input + types and provides clear error messages for unsupported operations. """ N = 8 _, A_sparse, _ = simple_system_gen(N, N, sparse.csr_array, tol=0.7) diff --git a/tests/integration/test_diagonal.py b/tests/integration/test_diagonal.py index 8c53aa00..3ed7dd54 100644 --- a/tests/integration/test_diagonal.py +++ b/tests/integration/test_diagonal.py @@ -24,6 +24,36 @@ @pytest.mark.parametrize("N", [7, 13]) @pytest.mark.parametrize("with_zeros", [True, False]) def test_csr_diagonal(N, with_zeros): + """Test diagonal extraction from CSR matrices. + + This test verifies that the diagonal() method correctly extracts + the main diagonal from CSR matrices, comparing results with dense + matrix diagonal extraction. + + Parameters + ---------- + N : int + Size of the square matrix (N x N). + with_zeros : bool + Whether to include zeros on the diagonal (True) or ensure + non-zero diagonal elements (False). + + Notes + ----- + The test creates a random sparse matrix and optionally adds the + identity matrix to ensure non-zero diagonal elements. It then + extracts the diagonal using both the sparse matrix's diagonal() + method and numpy's diagonal() function on the dense version. + + The test verifies that: + 1. The diagonal elements are extracted correctly + 2. The results match between sparse and dense implementations + 3. The method works for both sparse and dense diagonals + + This is important because diagonal extraction is a common operation + in linear algebra and should work consistently across different + matrix formats. + """ M = N np.random.seed(0) A_dense, _, _ = simple_system_gen(N, M, None, tol=0.2) diff --git a/tests/integration/test_gmres_solve.py b/tests/integration/test_gmres_solve.py index 1acd8d68..37c8dd76 100644 --- a/tests/integration/test_gmres_solve.py +++ b/tests/integration/test_gmres_solve.py @@ -21,6 +21,28 @@ def test_gmres_solve(): + """Test GMRES solver with a positive definite matrix. + + This test verifies that the GMRES solver correctly solves the linear + system Ax = b for a positive definite matrix A. + + Notes + ----- + The test creates a random sparse matrix A and ensures it is positive + definite by: + 1. Making it symmetric: A = 0.5 * (A + A.T) + 2. Adding a multiple of the identity: A = A + N * I + + It then generates a random solution vector x and computes b = Ax. + The GMRES solver is used to solve Ax = b, and the result is verified + by checking that A * x_pred ≈ b. + + The test uses: + - atol=1e-5: Absolute tolerance for convergence + - tol=1e-5: Relative tolerance (legacy parameter) + - maxiter=300: Maximum number of iterations + - atol=1e-8: Tolerance for final verification + """ N, D = 1000, 1000 seed = 471014 A = sample_dense(N, D, 0.1, seed) diff --git a/tests/integration/test_indexing.py b/tests/integration/test_indexing.py index 80d078ae..259c7996 100644 --- a/tests/integration/test_indexing.py +++ b/tests/integration/test_indexing.py @@ -20,15 +20,39 @@ class TestIndexingSetItem: + """Test class for sparse matrix indexing and assignment operations. 
+
+    This class contains tests for various indexing scenarios including
+    boolean masking, derived masks, and edge cases for sparse matrix
+    assignment operations.
+    """
+
     @pytest.mark.parametrize("N", [6, 9, 17])
     def test_incompatible_mask(self, N, create_matrix, create_mask):
-        """
+        """Test indexing with incompatible mask sparsity patterns.
+
         This test checks that the mask is applied correctly to the matrix when
-        the sparsity of mask is from that of the matrix.
+        the sparsity of the mask is different from that of the matrix.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+        create_mask : fixture
+            Fixture to create boolean masks.
+
+        Notes
+        -----
         While SciPy will apply the mask to all entries, Legate Sparse will only
         apply the mask to the non-zero entries of the matrix, so we can't compare
-        to SciPy results for all entries. Instead, we check that the number of
-        non-zero entries are updated correctly and the values are updated correctly.
+        to SciPy results for all entries. Instead, we check that:
+        1. The number of non-zero entries is updated correctly
+        2. The values are updated correctly for masked positions
+
+        This test verifies that the sparse implementation correctly handles
+        cases where the mask has a different sparsity pattern than the matrix.
         """
         _, A = create_matrix(N)
         _, mask = create_mask(N)
@@ -54,11 +78,25 @@ def test_incompatible_mask(self, N, create_matrix, create_mask):

     @pytest.mark.parametrize("N", [8, 13, 24])
     def test_mask_derived_from_self(self, N, create_matrix):
-        """
+        """Test indexing with mask derived from the matrix itself.
+
         This test checks that the mask is applied correctly to the matrix when
-        the sparsity of mask is derived from the matrix. Our behavior
-        matches that of SciPy, so we can compare against SciPy
-        results for all entries.
+        the sparsity of the mask is derived from the matrix itself.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+
+        Notes
+        -----
+        Our behavior matches that of SciPy when the mask is derived from
+        the matrix itself, so we can compare against SciPy results for all
+        entries.
+
+        The test creates a mask based on a threshold comparison (A > threshold)
+        and verifies that both SciPy and Legate Sparse produce identical results.
         """
         A_scipy, A_sparse = create_matrix(N)
         threshold = 0.2
@@ -81,9 +119,23 @@ def test_mask_derived_from_self(self, N, create_matrix):

     @pytest.mark.parametrize("N", [8, 13, 24])
     def test_mask_all_true(self, N, create_matrix):
-        """
+        """Test indexing behavior with a mask that is all True.
+
         This test checks indexing behavior when using a mask that is all True.
         Every non-zero element should be updated to the new value.
+
+        Parameters
+        ----------
+        N : int
+            Size of the square matrix.
+        create_matrix : fixture
+            Fixture to create test matrices.
+
+        Notes
+        -----
+        The test creates a mask with the same sparsity pattern as the matrix
+        but with all True values. This should result in all non-zero elements
+        being updated to the specified value.
         """
         _, A = create_matrix(N)
         value = 10.0
@@ -99,9 +151,24 @@ def test_mask_all_true(self, N, create_matrix):

     @pytest.mark.parametrize("N", [8, 13, 24])
     def test_mask_all_false(self, N, create_matrix, create_mask):
-        """
+        """Test indexing behavior with a mask that is all False.
+
         This test checks indexing behavior when using a mask that is all False.
         No elements should be modified.
+ + Parameters + ---------- + N : int + Size of the square matrix. + create_matrix : fixture + Fixture to create test matrices. + create_mask : fixture + Fixture to create boolean masks. + + Notes + ----- + The test creates a mask with density=0 (all False values) and verifies + that the matrix remains unchanged after the assignment operation. """ _, A = create_matrix(N) _, mask_all_false = create_mask(N, density=0) @@ -114,7 +181,25 @@ def test_mask_all_false(self, N, create_matrix, create_mask): assert numpy.all(A_copy.get_data() == A.get_data()) def test_random_column_order(self): - "The ordering of the matrix is random" "" + """Test indexing with randomly ordered column indices. + + This test verifies that indexing works correctly even when the + column indices are not in sorted order within each row. + + Notes + ----- + The test creates a matrix with randomly ordered column indices + within rows. During instantiation, these indices get sorted to + ensure proper indexing behavior. + + The test verifies that: + 1. The matrix is created correctly despite random column ordering + 2. Boolean indexing operations work correctly + 3. The number of elements replaced matches the expected count + + This is important because CSR format requires column indices to be + sorted within each row for efficient operations. + """ row_indices = cupynumeric.array( [ 2, diff --git a/tests/integration/test_io.py b/tests/integration/test_io.py index 8d237ba4..c307941c 100644 --- a/tests/integration/test_io.py +++ b/tests/integration/test_io.py @@ -25,6 +25,29 @@ @pytest.fixture def test_mtx_files(): + """Fixture providing paths to test Matrix Market files. + + This fixture returns a list of paths to various Matrix Market (.mtx) + files that are used for testing the mmread functionality. + + Returns + ------- + list + List of file paths to test Matrix Market files. + + Notes + ----- + The fixture includes various types of matrices: + - test.mtx: Basic test matrix + - GlossGT.mtx: Graph theory matrix + - Ragusa18.mtx: Scientific computing matrix + - cage4.mtx: Graph matrix + - karate.mtx: Social network matrix + + These files are located in the testdata directory and provide + different sparsity patterns and matrix properties for comprehensive + testing of the Matrix Market reader. + """ mtx_files = [ "test.mtx", "GlossGT.mtx", @@ -36,6 +59,31 @@ def test_mtx_files(): def test_mmread(test_mtx_files): + """Test Matrix Market file reading functionality. + + This test verifies that the legate_sparse Matrix Market reader + produces the same results as SciPy's mmread function. + + Parameters + ---------- + test_mtx_files : list + List of Matrix Market file paths to test. + + Notes + ----- + The test reads each Matrix Market file using both legate_sparse.io.mmread + and scipy.io.mmread, then compares the results by converting both to + dense format and checking for equality. + + This ensures that: + 1. The Matrix Market format is parsed correctly + 2. The sparse matrix structure is preserved + 3. The numerical values are read accurately + 4. The implementation is compatible with SciPy's reference implementation + + The test covers various matrix types and sizes to ensure robust + parsing of the Matrix Market format. 
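+
+    Examples
+    --------
+    A minimal sketch of the per-file check (illustrative only; "test.mtx"
+    stands in for any of the files listed above, and the object returned
+    by legate_sparse.io.mmread is assumed to support todense()):
+
+    >>> import numpy as np
+    >>> import scipy.io as sci_io
+    >>> import legate_sparse.io as legate_io
+    >>> arr = legate_io.mmread("test.mtx")
+    >>> s = sci_io.mmread("test.mtx")
+    >>> np.array_equal(np.asarray(arr.todense()), np.asarray(s.todense()))
+    True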
+ """ for mtx_file in test_mtx_files: arr = legate_io.mmread(mtx_file) s = sci_io.mmread(mtx_file) diff --git a/tests/integration/test_spgemm.py b/tests/integration/test_spgemm.py index 7106281c..5954df79 100644 --- a/tests/integration/test_spgemm.py +++ b/tests/integration/test_spgemm.py @@ -25,6 +25,25 @@ @pytest.mark.parametrize("N", [5, 29]) def test_csr_spgemm(N): + """Test sparse matrix-matrix multiplication with CSR matrices. + + This test verifies that sparse matrix-matrix multiplication works + correctly for different matrix sizes. + + Parameters + ---------- + N : int + Size of the square matrices (N x N). + + Notes + ----- + The test creates a random sparse matrix A and computes A @ A using + the sparse implementation. It then compares the result with the + dense matrix multiplication A_dense @ A_dense to verify correctness. + + The test uses different matrix sizes to ensure the implementation + works correctly for both small and larger matrices. + """ np.random.seed(0) A_dense, A, _ = simple_system_gen(N, N, sparse.csr_array) @@ -38,6 +57,29 @@ def test_csr_spgemm(N): @pytest.mark.parametrize("N", [5, 29]) @pytest.mark.parametrize("unsupported_dtype", ["int", "bool"]) def test_csr_spgemm_unsupported_dtype(N, unsupported_dtype): + """Test that unsupported datatypes raise appropriate exceptions for SpGEMM. + + This test verifies that sparse matrix-matrix multiplication + properly handles unsupported datatypes by raising NotImplementedError + when running on GPU. + + Parameters + ---------- + N : int + Size of the square matrices. + unsupported_dtype : str + Datatype that is not supported for SpGEMM operations. + + Notes + ----- + The test creates banded matrices with unsupported datatypes and + attempts to perform matrix-matrix multiplication. On GPU systems, + this should raise NotImplementedError since only floating-point + and complex datatypes are supported for SpGEMM. + + Currently supported datatypes are float32, float64, complex64, + and complex128. + """ np.random.seed(0) nnz_per_row = 3 diff --git a/tests/integration/test_spmv.py b/tests/integration/test_spmv.py index 0aca095e..0c3590df 100644 --- a/tests/integration/test_spmv.py +++ b/tests/integration/test_spmv.py @@ -27,6 +27,32 @@ @pytest.mark.parametrize("M", [7, 17]) @pytest.mark.parametrize("inline", [True, False]) def test_csr_spmv(N, M, inline): + """Test sparse matrix-vector multiplication with CSR matrices. + + This test verifies that sparse matrix-vector multiplication works + correctly for different matrix sizes and computation methods. + + Parameters + ---------- + N : int + Number of rows in the matrix. + M : int + Number of columns in the matrix. + inline : bool + Whether to use inline computation (A.dot(x, out=y)) or + standard multiplication (A @ x). + + Notes + ----- + The test creates a random sparse matrix and vector, then computes + the matrix-vector product using both the sparse implementation + and a dense reference. It verifies that the results are numerically + close. 
+
+    The inline parameter tests two different computation methods:
+    - inline=True: Uses A.dot(x, out=y) with pre-allocated output
+    - inline=False: Uses A @ x with automatic output allocation
+    """
     np.random.seed(0)

     A_dense, A, x = simple_system_gen(N, M, sparse.csr_array)
@@ -43,6 +69,31 @@

 @pytest.mark.parametrize("nnz_per_row", [3, 9])
 @pytest.mark.parametrize("unsupported_dtype", ["int", "bool"])
 def test_csr_spmv_unsupported_dtype(N, nnz_per_row, unsupported_dtype):
+    """Test that unsupported datatypes raise appropriate exceptions.
+
+    This test verifies that sparse matrix-vector multiplication
+    properly handles unsupported datatypes by raising NotImplementedError
+    when running on GPU.
+
+    Parameters
+    ----------
+    N : int
+        Size of the square matrix.
+    nnz_per_row : int
+        Number of non-zeros per row in the banded matrix.
+    unsupported_dtype : str
+        Datatype that is not supported for SpMV operations.
+
+    Notes
+    -----
+    The test creates a banded matrix with an unsupported datatype
+    and attempts to perform matrix-vector multiplication. On GPU
+    systems, this should raise NotImplementedError since only
+    floating-point and complex datatypes are supported for SpMV.
+
+    Currently supported datatypes are float32, float64, complex64,
+    and complex128.
+    """
     np.random.seed(0)

     A = banded_matrix(N, nnz_per_row).astype(unsupported_dtype)
diff --git a/tests/integration/utils/banded_matrix.py b/tests/integration/utils/banded_matrix.py
index 4cff897a..fda5ef5f 100644
--- a/tests/integration/utils/banded_matrix.py
+++ b/tests/integration/utils/banded_matrix.py
@@ -24,22 +24,51 @@ def banded_matrix(
     init_with_ones: bool = True,
     verbose: bool = False,
 ):
-    """
+    """Create a banded sparse matrix for testing purposes.
+
     Parameters
     ----------
-    N: int
-        Size of the NxN sparse matrix
-    nnz_per_row: int
-        Number of non-zero elements per row (odd number)
-    from_diags: bool
-        use sparse.diags to generate the banded matrix (default = True)
-    init_with_ones: bool
-        Initialize the matrix with ones instead of arange
+    N : int
+        Size of the NxN sparse matrix.
+    nnz_per_row : int
+        Number of non-zero elements per row (must be odd).
+    from_diags : bool, optional
+        Use sparse.diags to generate the banded matrix. Default is True.
+    init_with_ones : bool, optional
+        Initialize the matrix with ones instead of arange. Default is True.
+    verbose : bool, optional
+        Print detailed information about the matrix construction. Default is False.

     Returns
     -------
-    csr_array:
-        Return a sparse matrix
+    csr_array
+        A banded sparse matrix in CSR format.
+
+    Raises
+    ------
+    AssertionError
+        If N <= nnz_per_row or nnz_per_row is not odd.
+
+    Notes
+    -----
+    This function creates a banded matrix with a specific sparsity pattern.
+    When from_diags=True, it uses the sparse.diags function which is simpler
+    but may be slower. When from_diags=False, it constructs the CSR matrix
+    directly for better performance.
+
+    The matrix has a banded structure with nnz_per_row non-zeros per row,
+    centered around the main diagonal. The function handles the boundary
+    conditions by masking out-of-bounds indices.
+
+    Examples
+    --------
+    >>> A = banded_matrix(5, 3, from_diags=True)
+    >>> print(A.toarray())
+    [[1. 1. 0. 0. 0.]
+     [1. 1. 1. 0. 0.]
+     [0. 1. 1. 1. 0.]
+     [0. 0. 1. 1. 1.]
+     [0. 0. 0. 1. 1.]]
     """
     if from_diags:
diff --git a/tests/integration/utils/sample.py b/tests/integration/utils/sample.py
index a201d6f2..e5444987 100644
--- a/tests/integration/utils/sample.py
+++ b/tests/integration/utils/sample.py
@@ -19,11 +19,62 @@

 class Normal(stats.rv_continuous):
+    """Custom normal distribution class for reproducible random sampling.
+
+    This class extends scipy.stats.rv_continuous to provide a custom
+    normal distribution that can be used with scipy.sparse.random for
+    generating sparse matrices with reproducible random values.
+
+    Notes
+    -----
+    The _rvs method generates standard normal random variates using
+    the provided random_state for reproducibility.
+    """
+
     def _rvs(self, *args, size=None, random_state=None):
+        """Generate standard normal random variates.
+
+        Parameters
+        ----------
+        size : int or tuple, optional
+            Number of random variates to generate.
+        random_state : numpy.random.RandomState, optional
+            Random state for reproducibility.
+
+        Returns
+        -------
+        numpy.ndarray
+            Array of standard normal random variates.
+        """
         return random_state.standard_normal(size)

 def sample(N: int, D: int, density: float, seed: int):
+    """Generate a sparse matrix with random values from a normal distribution.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    D : int
+        Number of columns in the matrix.
+    density : float
+        Density of non-zero elements (between 0 and 1).
+    seed : int
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    scipy.sparse.csr_matrix
+        A sparse matrix in CSR format with random normal values.
+
+    Notes
+    -----
+    This function uses scipy.sparse.random with a custom normal distribution
+    to generate sparse matrices with reproducible random values. The matrix
+    is returned in CSR format.
+    """
     NormalType = Normal(seed=seed)
     SeededNormal = NormalType()
     return scpy.random(
@@ -38,14 +89,91 @@ def sample(N: int, D: int, density: float, seed: int):

 def sample_dense(N: int, D: int, density: float, seed: int):
+    """Generate a dense matrix with random values from a normal distribution.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    D : int
+        Number of columns in the matrix.
+    density : float
+        Density of non-zero elements (between 0 and 1).
+    seed : int
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    numpy.ndarray
+        A dense matrix with random normal values.
+
+    Notes
+    -----
+    This function generates a sparse matrix using sample() and then
+    converts it to dense format. This is useful for creating test
+    matrices that can be compared with sparse implementations.
+    """
     return numpy.asarray(sample(N, D, density, seed).todense())

 def sample_dense_vector(N: int, density: float, seed: int):
+    """Generate a dense vector with random values from a normal distribution.
+
+    Parameters
+    ----------
+    N : int
+        Length of the vector.
+    density : float
+        Density of non-zero elements (between 0 and 1).
+    seed : int
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    numpy.ndarray
+        A dense vector with random normal values.
+
+    Notes
+    -----
+    This function generates a dense matrix with one column using
+    sample_dense() and then squeezes it to a 1D vector.
+    """
     return sample_dense(N, 1, density, seed).squeeze()
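+
+# The CG/GMRES tests described earlier combine the helpers above to build
+# symmetric positive definite systems. The helper below is a hypothetical,
+# illustrative sketch of that recipe (it is not used by the test suite):
+def _example_spd_system(N=20, seed=42):
+    A = sample_dense(N, N, 0.1, seed)  # random dense matrix, ~10% mass
+    A = 0.5 * (A + A.T)                # symmetrize
+    A = A + N * numpy.eye(N)           # diagonal shift => positive definite
+    x = sample_dense_vector(N, 0.1, seed)
+    b = A @ x                          # right-hand side for Ax = b
+    return A, x, b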

 def simple_system_gen(N, M, cls, tol=0.5):
+    """Generate a simple linear system for testing.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in the matrix.
+    M : int
+        Number of columns in the matrix.
+    cls : type or None
+        Class to use for creating the sparse matrix. If None, no sparse
+        matrix is created.
+    tol : float, optional
+        Threshold for sparsity. Values at or above this threshold are set
+        to zero, so larger values of tol yield denser matrices.
+        Default is 0.5.
+
+    Returns
+    -------
+    tuple
+        (a_dense, a_sparse, x) where:
+        - a_dense: Dense matrix
+        - a_sparse: Sparse matrix (or None if cls is None)
+        - x: Dense vector
+
+    Notes
+    -----
+    This function generates a random dense matrix and vector, then
+    applies a threshold to create sparsity. The sparse matrix is
+    created using the provided class if specified.
+    """
     a_dense = np.random.rand(N, M)
     x = np.random.rand(M)
     a_dense = np.where(a_dense < tol, a_dense, 0)

From 4515ccea859fff6eef75caca767d71ba8b740759 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Dec 2025 20:15:02 +0000
Subject: [PATCH 3/4] [pre-commit.ci] pre-commit autoupdate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/pre-commit/mirrors-mypy: v1.5.1 → v1.19.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.5.1...v1.19.1)
- https://github.com/psf/black → https://github.com/psf/black-pre-commit-mirror
- [github.com/psf/black-pre-commit-mirror: 23.9.1 → 25.12.0](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...25.12.0)
- [github.com/PyCQA/isort: 5.12.0 → 7.0.0](https://github.com/PyCQA/isort/compare/5.12.0...7.0.0)
- [github.com/PyCQA/flake8: 6.1.0 → 7.3.0](https://github.com/PyCQA/flake8/compare/6.1.0...7.3.0)
- [github.com/pre-commit/mirrors-clang-format: v16.0.6 → v21.1.8](https://github.com/pre-commit/mirrors-clang-format/compare/v16.0.6...v21.1.8)
---
 .pre-commit-config.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b24026ed..6384338f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,27 +1,27 @@
 repos:
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.5.1'
+    rev: 'v1.19.1'
     hooks:
       - id: mypy
         language: system
         pass_filenames: false
        args: ['legate_sparse']
-  - repo: https://github.com/psf/black
-    rev: 23.9.1
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 25.12.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+    rev: 7.0.0
     hooks:
       - id: isort
        args: ["--profile", "black"]
   - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+    rev: 7.3.0
     hooks:
       - id: flake8
        args: [--config=.flake8]
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: 'v16.0.6' # Use the sha / tag you want to point at
+    rev: 'v21.1.8' # Use the sha / tag you want to point at
     hooks:
       - id: clang-format
        files: \.(cu|cuh|h|cc|inl)$

From ac60270bdc1006ddb2c5738cb55d5c40e7859d99 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Dec 2025 20:15:11 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 examples/common.py         | 3 +--
 examples/pde.py            | 4 +++-
 legate_sparse/coverage.py  | 3 +--
 legate_sparse/module.py    | 1 +
 scripts/memlog_analysis.py | 4 ++--
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/examples/common.py b/examples/common.py
index 99174ed6..c7b7b493 100644
--- a/examples/common.py
+++ b/examples/common.py
@@ -171,8 +171,7 @@ class DummyScope:
     that may or may not use resource scoping.
     """

-    def __init__(self):
-        ...
+    def __init__(self): ...
def __enter__(self): """Enter the context (no-op).""" diff --git a/examples/pde.py b/examples/pde.py index d9ca0095..9212baba 100644 --- a/examples/pde.py +++ b/examples/pde.py @@ -218,7 +218,9 @@ def execute(nx, ny, plot, plot_fname, throughput, tol, max_iters, warmup_iters, # If we're testing throughput, run only the prescribed number of iterations. if throughput: if use_legate: - p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters) + p_sol, iters = linalg.cg( + A, bflat, rtol=tol, maxiter=max_iters, conv_test_iters=max_iters + ) else: p_sol, iters = linalg.cg(A, bflat, rtol=tol, maxiter=max_iters) else: diff --git a/legate_sparse/coverage.py b/legate_sparse/coverage.py index 8765044e..cfbf98aa 100644 --- a/legate_sparse/coverage.py +++ b/legate_sparse/coverage.py @@ -43,8 +43,7 @@ def should_wrap(obj: object) -> bool: class AnyCallable(Protocol): - def __call__(self, *args: Any, **kwargs: Any) -> Any: - ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... def wrap(func: AnyCallable) -> Any: diff --git a/legate_sparse/module.py b/legate_sparse/module.py index 56f22fa1..e2004a60 100644 --- a/legate_sparse/module.py +++ b/legate_sparse/module.py @@ -59,6 +59,7 @@ def _is_sparse_matrix(obj) -> bool: return any((isinstance(obj, csr_array), isinstance(obj, dia_array))) + def isspmatrix(obj) -> bool: """Check if an object is a legate sparse matrix. diff --git a/scripts/memlog_analysis.py b/scripts/memlog_analysis.py index ee8bd3c6..e294c349 100644 --- a/scripts/memlog_analysis.py +++ b/scripts/memlog_analysis.py @@ -16,10 +16,10 @@ # Parse the log file allocations = parse_memlog('memlog.txt') - + # Export to CSV export_to_csv(allocations, 'memory_analysis.csv') - + # Create visualizations (requires pandas, matplotlib, seaborn) visualize_allocations(allocations) """ # noqa: W293
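
For reference, the workflow sketched in that module docstring can be driven
end to end as below (a minimal sketch; the exact import path for
parse_memlog, export_to_csv, and visualize_allocations is an assumption
based on the docstring, and memlog.txt is a hypothetical input file):

    # Parse a Legion memory log into allocation records, then export/plot.
    from memlog_analysis import parse_memlog, export_to_csv, visualize_allocations

    allocations = parse_memlog("memlog.txt")
    export_to_csv(allocations, "memory_analysis.csv")
    # Visualization is optional; it needs pandas, matplotlib, and seaborn.
    visualize_allocations(allocations)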