diff --git a/docs/release-notes/0.15.2.md b/docs/release-notes/0.15.2.md index 342f9e0b..1a934840 100644 --- a/docs/release-notes/0.15.2.md +++ b/docs/release-notes/0.15.2.md @@ -3,3 +3,7 @@ ```{rubric} Features ``` * Add pseudobulk based distance metrics to {class}`~rapids_singlecell.ptg.Distance`: ``euclidean``, ``root_mean_squared_error``, ``mse``, ``mean_absolute_error``, ``pearson_distance``, ``cosine_distance``, ``r2_distance``. Matches ``pertpy.tl.Distance`` {pr}`676` {smaller}`S Dicks` + +```{rubric} Bug fixes +``` +* Fixes per-batch clip threshold in `pp.highly_variable_genes(flavor="pearson_residuals", batch_key=...)`: each batch now uses its own `sqrt(n_cells_in_batch)` instead of silently reusing the first batch's value {pr}`674` {smaller}`A Mikaeili` diff --git a/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py b/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py index 47c50e0f..79a83045 100644 --- a/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py +++ b/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py @@ -73,13 +73,15 @@ def _highly_variable_pearson_residuals( X_batch = X_batch[:, nonzero_genes] if clip is None: n = X_batch.shape[0] - clip = cp.sqrt(n, dtype=dtype) - if clip < 0: + clip_batch = cp.sqrt(n, dtype=dtype) + else: + clip_batch = clip + if clip_batch < 0: raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") n_cells = X_batch.shape[0] n_genes = X_batch.shape[1] - clip_val = float(clip) + clip_val = float(clip_batch) inv_theta = 1.0 / theta residual_gene_var = cp.zeros(n_genes, dtype=dtype, order="C") stream = cp.cuda.get_current_stream().ptr diff --git a/tests/test_hvg.py b/tests/test_hvg.py index a8d24e55..7821b3fd 100644 --- a/tests/test_hvg.py +++ b/tests/test_hvg.py @@ -397,6 +397,43 @@ def test_highly_variable_genes_pearson_residuals_batch(n_top_genes, dtype): assert len(cudata.var) == n_genes +def test_pearson_residuals_batch_order_invariant(): + """HVG ranking must not depend on alphabetical batch-label order.""" + rng = np.random.default_rng(0) + n_big, n_small, n_genes = 5000, 200, 200 + counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) + counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) + X = csr_matrix(counts.astype(np.float32)) + + a1 = AnnData(X=cpx.scipy.sparse.csr_matrix(X.copy())) + a1.obs["batch"] = np.array(["A"] * n_big + ["B"] * n_small) + a1.obs["batch"] = a1.obs["batch"].astype("category") + rsc.pp.highly_variable_genes( + a1, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, + ) + + a2 = AnnData(X=cpx.scipy.sparse.csr_matrix(X.copy())) + a2.obs["batch"] = np.array(["B"] * n_big + ["A"] * n_small) + a2.obs["batch"] = a2.obs["batch"].astype("category") + rsc.pp.highly_variable_genes( + a2, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, + ) + + np.testing.assert_allclose( + a1.var["residual_variances"].to_numpy(), + a2.var["residual_variances"].to_numpy(), + atol=1e-5, + ) + + @pytest.mark.parametrize("dtype", ["float32", "float64"]) @pytest.mark.parametrize("sparse", [True, False]) def test_poisson_gene_selection_compare_to_scvi(dtype, sparse):