5 changes: 5 additions & 0 deletions r/NAMESPACE
@@ -38,6 +38,9 @@ export(collect_features)
export(continuous_palette)
export(convert_matrix_type)
export(convert_to_fragments)
export(cor_dense)
export(cov_dense)
export(crossprod_dense)
export(discrete_palette)
export(draw_trackplot_grid)
export(expm1_slow)
@@ -62,6 +65,7 @@ export(match_gene_symbol)
export(matrix_stats)
export(matrix_type)
export(merge_cells)
export(merge_peaks_iterative)
export(min_by_col)
export(min_by_row)
export(min_scalar)
@@ -118,6 +122,7 @@ export(shift_fragments)
export(storage_order)
export(subset_lengths)
export(svds)
export(tcrossprod_dense)
export(tile_matrix)
export(trackplot_bulk)
export(trackplot_combine)
1 change: 1 addition & 0 deletions r/NEWS.md
@@ -3,6 +3,7 @@
## Features
- Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166)
- Add `get_demo_mat()`, `get_demo_frags()` and `remove_demo_data()` to retrieve a small test matrix/fragments object from the [PBMC 3k dataset from 10X Genomics](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0). (pull request #193)
- Add `cor_dense()`, `cov_dense()`, `crossprod_dense()`, `tcrossprod_dense()` which can accelerate all-by-all feature correlation and related statistics. Recommended for up to 10k features. (pull request #243)
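A hypothetical usage sketch for the new correlation helpers (illustration only; the demo-matrix orientation and cell-major storage are assumptions, while `get_demo_mat()` and `cor_dense()` come from the bullets above):

```r
library(BPCells)
mat <- get_demo_mat()                        # small feature x cell demo matrix (assumed cell-major storage)
feat_cor <- cor_dense(t(mat), threads = 2)   # all-by-all feature Pearson correlations, dense output
dim(feat_cor)                                # square: n_features x n_features
```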

## Improvements
- Speed up taking large subsets of large concatenated matrices, e.g. selecting 9M cells from a 10M cell matrix composed of ~100 concatenated pieces. (pull request #179)
4 changes: 4 additions & 0 deletions r/R/RcppExports.R
@@ -761,6 +761,10 @@ matrix_max_per_col_cpp <- function(matrix) {
.Call(`_BPCells_matrix_max_per_col_cpp`, matrix)
}

dense_transpose_multiply_cpp <- function(matrix, buffer_bytes, threads) {
.Call(`_BPCells_dense_transpose_multiply_cpp`, matrix, buffer_bytes, threads)
}

pseudobulk_matrix_cpp <- function(mat, cell_groups, method, transpose) {
.Call(`_BPCells_pseudobulk_matrix_cpp`, mat, cell_groups, method, transpose)
}
159 changes: 159 additions & 0 deletions r/R/matrix.R
@@ -717,6 +717,165 @@ rlang::on_load({
}
})

#' Matrix correlation, covariance, or cross-product with dense output
#' @rdname crossprod_dense
#'
#' @description
#' These functions provide optimized versions of `tcrossprod()`, `crossprod()`, `cov()`, and `cor()` that take
#' BPCells matrices as inputs and produce dense matrix outputs. These functions will run more quickly than
#' combining multiple BPCells operations to produce equivalent output.
#' They are mainly recommended for sparse matrices with ~10k features or fewer,
#' since compute and memory costs will rise with the square of the number of features (see details).
#'
#' Note that `cor_dense()` and `cov_dense()` calculate correlation/covariance of *columns* to match `cov()` and `cor()`.
#' To calculate feature correlations on a standard `feature x cell` matrix, it will be necessary to pass `t(mat)` as the
#' first argument.
#'
#' @param x (IterableMatrix) Input matrix. In general, disk-backed matrices should have cell-major storage ordering. (See details,
#' or `transpose_storage_order()`)
#' @param buffer_bytes (integer) In-memory chunk size to use during computations. Performance is best when this is slightly below
#' the largest CPU cache size on the computer running BPCells.

[Collaborator review comment]: Maybe a note that the buffer_bytes minimum scales linearly with how many features exist

#' @param threads (integer) Number of threads to use during execution. Thread counts higher than 4-8 may see diminishing speedups.
#' @param method (character) Must be `"pearson"`. Provided for compatibility with `cov()` and `cor()` but otherwise ignored.
#' @param ... Reserved for potential future argument additions
#'
#' @return Symmetric, square dense matrix. Results should match the base R `crossprod()`, `tcrossprod()`, `cov()`, and `cor()`.
#'
#' @details
#'
#' **Input storage order**
#'
#' The input matrix must be row-major for `crossprod_dense()`, `cor_dense()`, and `cov_dense()`.
[Collaborator review comment]: Should be col-major

#' For `tcrossprod_dense()`, the input must be col-major. Stated another way: when these functions are used to calculate
#' feature correlations, the cell axis should always be the major axis. The functions will raise an error if
#' the input has the wrong storage order; in that case, pass the matrix through `transpose_storage_order()` first to
#' switch to the required storage order.

[Collaborator review comment]: Feel like last sentence combines two sentences. Maybe "The functions will raise an error if the input is in the wrong storage order. The input matrix requires being passed through transpose_storage_order() to switch to the required storage order".

[Collaborator review comment]: On another note, we can probably just link to "efficiency tips"

#'
#' **Efficiency considerations**
#'
#' These functions require only 1 or 2 matrix passes, so they will run much faster than using `%*%` to perform
#' disk-backed sparse-sparse matrix multiplies.
#'
#' The memory required for the output matrix scales with the square of the feature count and the computational
#' work scales with the square of the number of non-zero entries per feature. So in typical cases running with
#' 20k features will take 100x the time and memory compared to 2k features. Correlation of 2k features from
#' 1.3M cells should take under 1 minute single-threaded, and under 15 seconds with 8 threads.
#'
#' @examples
#' set.seed(12345)
#' base_r_mat <- matrix(rpois(3*5, lambda=1), nrow=3, ncol=5)
#' rownames(base_r_mat) <- paste0("row", 1:3)
#' base_r_mat
#'
#' mat <- as(base_r_mat, "IterableMatrix")
#'
#' tcrossprod_dense(mat)
#'
#' all.equal(tcrossprod_dense(mat), tcrossprod(base_r_mat))
#' all.equal(crossprod_dense(t(mat)), crossprod(t(base_r_mat)))
#' all.equal(cov_dense(t(mat)), cov(t(base_r_mat)))
#' all.equal(cor_dense(t(mat)), cor(t(base_r_mat)))
#' @export
tcrossprod_dense <- function(x, ..., buffer_bytes=5e6, threads=1L) {
assert_is(x, "IterableMatrix")
if (x@transpose) rlang::abort(c("Input matrix must have col-major storage order.", "Please use transpose_storage_order()."))
assert_is_wholenumber(buffer_bytes)
assert_is_wholenumber(threads)
assert_true(buffer_bytes >= 48*nrow(x))
rlang::check_dots_empty()

# Specializations to speed up linear algebra offset transformations
# Notice that if X = A + R*t(C):
# X*t(X) = A*t(A) + A*C*t(R) + t(A*C*t(R)) + R*t(C)*C*t(R)
# Requires 1 extra matrix pass, and 2-4x the output memory depending on how aggressive garbage collection is
if (is(x, "TransformLinearResidual") || (is(x, "TransformScaleShift") && any(x@active_transforms[,"shift"]))) {
if (is(x, "TransformScaleShift") && any(x@active_transforms[,"shift"])) {
col_shift <- rep.int(0, ncol(x))
row_shift <- rep.int(0, nrow(x))
if (x@active_transforms["row", "shift"]) row_shift <- x@row_params[2,]
if (x@active_transforms["col", "shift"]) col_shift <- x@col_params[2,]
if (x@active_transforms["global", "shift"]) row_shift <- row_shift + x@global_params[2]
x@active_transforms[,"shift"] <- FALSE

R <- cbind(1, row_shift)
C <- cbind(col_shift, 1)
}
if (is(x, "TransformLinearResidual")) {
R <- -t(x@row_params)
C <- t(x@col_params)
x <- x@matrix
}
res <- tcrossprod_dense(x, buffer_bytes=buffer_bytes, threads=threads)
if (threads > 1) x <- parallel_split(x, threads=threads, chunks=4*threads)
# Run frequent garbage collection in fast (non-full) mode to hopefully avoid big matrix copies
# without slowing down execution too much
offset <- tcrossprod(x %*% C, R)
res <- res + offset
gc(verbose=FALSE, full=FALSE)
offset <- t(offset)
gc(verbose=FALSE, full=FALSE)
res <- res + offset
gc(verbose=FALSE, full=FALSE)
res <- res + R %*% crossprod(C) %*% t(R)
gc(verbose=FALSE, full=FALSE)
return(res)
}

buffer_bytes <- as.integer(buffer_bytes)
threads <- as.integer(threads)

res <- dense_transpose_multiply_cpp(iterate_matrix(x), buffer_bytes=buffer_bytes, threads=threads)
rownames(res) <- rownames(x)
colnames(res) <- rownames(x)

return(res)
}
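
For reference (not part of this PR), a small base-R check of the rank-update identity used in the specialization above; all matrix names and sizes here are illustrative:

```r
# Check: if X = A + R %*% t(C), then
# X %*% t(X) = A %*% t(A) + A %*% C %*% t(R) + t(A %*% C %*% t(R)) + R %*% crossprod(C) %*% t(R)
set.seed(1)
A <- matrix(rnorm(4 * 6), nrow = 4)   # 4 x 6 "base" matrix
R <- matrix(rnorm(4 * 2), nrow = 4)   # 4 x 2 row factors
C <- matrix(rnorm(6 * 2), nrow = 6)   # 6 x 2 column factors

X <- A + R %*% t(C)
offset <- tcrossprod(A %*% C, R)      # A %*% C %*% t(R)
lhs <- tcrossprod(X)
rhs <- tcrossprod(A) + offset + t(offset) + R %*% crossprod(C) %*% t(R)
all.equal(lhs, rhs)                   # TRUE (up to floating-point error)
```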

#' @rdname crossprod_dense
#' @export
crossprod_dense <- function(x, ..., buffer_bytes=5e6, threads=1L) {
assert_is(x, "IterableMatrix")
if (!x@transpose) rlang::abort(c("Input matrix must have row-major storage order.", "Please use transpose_storage_order()."))
assert_is_wholenumber(buffer_bytes)
assert_is_wholenumber(threads)
assert_true(buffer_bytes >= 48*ncol(x))
rlang::check_dots_empty()

return(tcrossprod_dense(t(x), buffer_bytes=buffer_bytes, threads=threads))
}

#' @rdname crossprod_dense
#' @export
cov_dense <- function(x, ..., method="pearson", buffer_bytes=5e6, threads=1L) {
assert_is(x, "IterableMatrix")
if (!x@transpose) rlang::abort(c("Input matrix must have row-major storage order.", "Please use transpose_storage_order()."))
assert_is_wholenumber(buffer_bytes)
assert_is_wholenumber(threads)
assert_true(buffer_bytes >= 48*ncol(x))
assert_true(method == "pearson")

col_means <- matrix_stats(x, threads=threads, col_stats="mean")$col_stats["mean",]
res <- crossprod_dense(add_cols(x, -col_means), buffer_bytes=buffer_bytes, threads=threads)
res <- res / (nrow(x) - 1)
return(res)
}

#' @rdname crossprod_dense
#' @export
cor_dense <- function(x, ..., method="pearson", buffer_bytes=5e6, threads=1L) {
assert_is(x, "IterableMatrix")
if (!x@transpose) rlang::abort(c("Input matrix must have row-major storage order.", "Please use transpose_storage_order()."))
assert_is_wholenumber(buffer_bytes)
assert_is_wholenumber(threads)
assert_true(buffer_bytes >= 48*ncol(x))
assert_true(method == "pearson")

res <- cov_dense(x, method=method, buffer_bytes=buffer_bytes, threads=threads)
std_dev_inv <- sqrt(1/diag(res))
res <- t(res * std_dev_inv) * std_dev_inv
return(res)
}
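
As an aside (illustration only, not part of the PR), the scaling in `cor_dense()` is the standard covariance-to-correlation normalization, which can be checked against `stats::cov2cor()` on a small dense matrix:

```r
# cor(X)[i, j] = cov(X)[i, j] / (sd_i * sd_j); the row/column scaling below
# mirrors the `t(res * std_dev_inv) * std_dev_inv` line in cor_dense().
set.seed(2)
X <- matrix(rnorm(20 * 4), nrow = 20)
cv <- cov(X)
std_dev_inv <- sqrt(1 / diag(cv))           # 1 / standard deviation of each column
cr <- t(cv * std_dev_inv) * std_dev_inv     # scale rows, then columns
all.equal(cr, cor(X))                        # TRUE
all.equal(cr, cov2cor(cv))                   # also matches stats::cov2cor()
```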


# Index subsetting
setClass("MatrixSubset",
contains = "IterableMatrix",
77 changes: 77 additions & 0 deletions r/man/crossprod_dense.Rd

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions r/pkgdown/_pkgdown.yml
@@ -139,6 +139,7 @@ reference:
- checksum
- apply_by_row
- regress_out
- crossprod_dense
- IterableMatrix-methods
- pseudobulk_matrix

1 change: 1 addition & 0 deletions r/src/Makevars.in
@@ -53,6 +53,7 @@ bpcells-cpp/matrixTransforms/Round.o \
bpcells-cpp/matrixTransforms/SCTransform.o \
bpcells-cpp/matrixTransforms/Scale.o \
bpcells-cpp/matrixTransforms/Shift.o \
bpcells-cpp/matrixUtils/DenseTransposeMultiply.o \
bpcells-cpp/matrixUtils/WilcoxonRankSum.o \
bpcells-cpp/matrixUtils/Pseudobulk.o \
bpcells-cpp/matrixUtils/Quantile.o \
14 changes: 14 additions & 0 deletions r/src/RcppExports.cpp
@@ -2510,6 +2510,19 @@
return rcpp_result_gen;
END_RCPP
}
// dense_transpose_multiply_cpp
Eigen::MatrixXd dense_transpose_multiply_cpp(SEXP matrix, size_t buffer_bytes, size_t threads);
RcppExport SEXP _BPCells_dense_transpose_multiply_cpp(SEXP matrixSEXP, SEXP buffer_bytesSEXP, SEXP threadsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type matrix(matrixSEXP);
Rcpp::traits::input_parameter< size_t >::type buffer_bytes(buffer_bytesSEXP);
Rcpp::traits::input_parameter< size_t >::type threads(threadsSEXP);
rcpp_result_gen = Rcpp::wrap(dense_transpose_multiply_cpp(matrix, buffer_bytes, threads));
return rcpp_result_gen;
END_RCPP
}
// pseudobulk_matrix_cpp
List pseudobulk_matrix_cpp(SEXP mat, std::vector<uint32_t> cell_groups, std::vector<std::string> method, bool transpose);
RcppExport SEXP _BPCells_pseudobulk_matrix_cpp(SEXP matSEXP, SEXP cell_groupsSEXP, SEXP methodSEXP, SEXP transposeSEXP) {
@@ -2742,6 +2755,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_BPCells_apply_matrix_double_cpp", (DL_FUNC) &_BPCells_apply_matrix_double_cpp, 3},
{"_BPCells_matrix_max_per_row_cpp", (DL_FUNC) &_BPCells_matrix_max_per_row_cpp, 1},
{"_BPCells_matrix_max_per_col_cpp", (DL_FUNC) &_BPCells_matrix_max_per_col_cpp, 1},
{"_BPCells_dense_transpose_multiply_cpp", (DL_FUNC) &_BPCells_dense_transpose_multiply_cpp, 3},
{"_BPCells_pseudobulk_matrix_cpp", (DL_FUNC) &_BPCells_pseudobulk_matrix_cpp, 4},
{"_BPCells_matrix_quantile_per_col_cpp", (DL_FUNC) &_BPCells_matrix_quantile_per_col_cpp, 4},
{"_BPCells_matrix_identical_uint32_t_cpp", (DL_FUNC) &_BPCells_matrix_identical_uint32_t_cpp, 2},