From bfb773e34cf81d493a07b9e2581e9ce9930bb0ca Mon Sep 17 00:00:00 2001
From: Lawson Darrow <lawson.darrow@gmail.com>
Date: Mon, 1 Jun 2026 11:07:56 -0400
Subject: [PATCH 1/2] perf: single-pass _sparse_nanmean (#1894)

_sparse_nanmean copied the matrix twice and did a sparse set-index +
eliminate_zeros. Replace with single-pass numba kernels over the compressed
buffers (one for within-slot reduction, one for the scatter across slots),
handling both axes and CSR/CSC. Implicit zeros still count as observed; only
stored NaNs are excluded, matching np.nanmean on the dense array.

Benchmarked on a 20k x 3k 5%-dense matrix: ~88x faster for axis=1, ~9.5x for
axis=0. Adds CSC regression tests.
---
 src/scanpy/tools/_score_genes.py | 73 ++++++++++++++++++++++++--------
 tests/test_score_genes.py        | 14 ++++++
 2 files changed, 70 insertions(+), 17 deletions(-)

diff --git a/src/scanpy/tools/_score_genes.py b/src/scanpy/tools/_score_genes.py
index 6ba7f51a4c..839e9b863d 100644
--- a/src/scanpy/tools/_score_genes.py
+++ b/src/scanpy/tools/_score_genes.py
@@ -4,8 +4,10 @@
 
 from typing import TYPE_CHECKING
 
+import numba
 import numpy as np
 import pandas as pd
+from fast_array_utils.numba import njit
 
 from .. import logging as logg
 from .._compat import CSBase
@@ -29,28 +31,65 @@
 type _GetSubset = Callable[[_StrIdx], np.ndarray | CSBase]
 
 
+@njit
+def _sparse_nanmean_within_slot(
+    data: NDArray, indptr: NDArray, n_out: int, divisor: int
+) -> NDArray[np.float64]:
+    """Reduce within each compressed slot (e.g. per row for CSR), ignoring NaNs."""
+    out = np.empty(n_out, dtype=np.float64)
+    for i in numba.prange(n_out):
+        total = np.float64(0)
+        n_nan = 0
+        for k in range(indptr[i], indptr[i + 1]):
+            v = data[k]
+            if np.isnan(v):
+                n_nan += 1
+            else:
+                total += v
+        out[i] = total / (divisor - n_nan)
+    return out
+
+
+@njit
+def _sparse_nanmean_across_slots(
+    data: NDArray, indices: NDArray, n_out: int, divisor: int
+) -> NDArray[np.float64]:
+    """Reduce across compressed slots (e.g. per column for CSR), ignoring NaNs."""
+    total = np.zeros(n_out, dtype=np.float64)
+    n_nan = np.zeros(n_out, dtype=np.int64)
+    for k in range(data.shape[0]):
+        v = data[k]
+        j = indices[k]
+        if np.isnan(v):
+            n_nan[j] += 1
+        else:
+            total[j] += v
+    out = np.empty(n_out, dtype=np.float64)
+    for j in numba.prange(n_out):
+        out[j] = total[j] / (divisor - n_nan[j])
+    return out
+
+
 def _sparse_nanmean(x: CSBase, /, axis: Literal[0, 1]) -> NDArray[np.float64]:
-    """np.nanmean equivalent for sparse matrices."""
+    """np.nanmean equivalent for sparse matrices.
+
+    Computed in a single pass over the stored values, avoiding the two matrix
+    copies and the sparse set-index/``eliminate_zeros`` the previous
+    implementation needed. Implicit (structural) zeros count as observed
+    values; only stored ``NaN`` entries are excluded, matching ``np.nanmean``
+    on the densified array.
+    """
     if not isinstance(x, CSBase):
         msg = "X must be a compressed sparse matrix"
         raise TypeError(msg)
 
-    # count the number of nan elements per row/column (dep. on axis)
-    z = x.copy()
-    z.data = np.isnan(z.data)
-    z.eliminate_zeros()
-    n_elements = z.shape[axis] - z.sum(axis)
-
-    # set the nans to 0, so that a normal .sum() works
-    y = x.copy()
-    y.data[np.isnan(y.data)] = 0
-    y.eliminate_zeros()
-
-    # the average
-    s = y.sum(axis, dtype="float64")  # float64 for score_genes function compatibility)
-    m = s / n_elements
-
-    return m
+    n_out = x.shape[1 - axis]  # one result per row (axis=1) or per column (axis=0)
+    divisor = x.shape[axis]  # observable positions along the reduced axis
+    # the compressed (major) axis stores rows for CSR and columns for CSC
+    reduce_within_slot = (x.format == "csr") == (axis == 1)
+    if reduce_within_slot:
+        return _sparse_nanmean_within_slot(x.data, x.indptr, n_out, divisor)
+    return _sparse_nanmean_across_slots(x.data, x.indices, n_out, divisor)
 
 
 @_doc_params(rng=doc_rng)
diff --git a/tests/test_score_genes.py b/tests/test_score_genes.py
index addd384f18..0bdbaf827c 100644
--- a/tests/test_score_genes.py
+++ b/tests/test_score_genes.py
@@ -153,6 +153,20 @@ def test_sparse_nanmean_on_dense_matrix():
         _sparse_nanmean(data, 0)
 
 
+@pytest.mark.parametrize("axis", [0, 1])
+def test_sparse_nanmean_csc(axis: Literal[0, 1]) -> None:
+    """_sparse_nanmean is equivalent to np.nanmean for CSC input too."""
+    from scanpy.tools._score_genes import _sparse_nanmean
+
+    arr = conv.to_dense(
+        _create_sparse_nan_matrix(60, 50, percent_zero=0.3, percent_nan=0.3)
+    )
+    mat = sparse.csc_matrix(arr)  # noqa: TID251
+    np.testing.assert_allclose(
+        np.nanmean(arr, axis), np.array(_sparse_nanmean(mat, axis)).flatten()
+    )
+
+
 def test_score_genes_sparse_vs_dense():
     """score_genes() should give the same result for dense and sparse matrices."""
     adata_sparse = _create_adata(100, 1000, p_zero=0.3, p_nan=0.3)

From 1be798ebd6a02b482056b76dc3f912070b3ccc8a Mon Sep 17 00:00:00 2001
From: Lawson Darrow <lawson.darrow@gmail.com>
Date: Mon, 1 Jun 2026 11:10:29 -0400
Subject: [PATCH 2/2] docs: add release note for #4141

---
 docs/release-notes/4141.perf.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 docs/release-notes/4141.perf.md

diff --git a/docs/release-notes/4141.perf.md b/docs/release-notes/4141.perf.md
new file mode 100644
index 0000000000..7838e7541b
--- /dev/null
+++ b/docs/release-notes/4141.perf.md
@@ -0,0 +1 @@
+Speed up {func}`~scanpy.tl.score_genes` by computing the sparse `nanmean` in a single pass instead of copying the matrix twice {smaller}`L Darrow`