|
| 1 | +#' ComputeCellData |
| 2 | +#' |
| 3 | +#' A function computes cell status for a given pathway in single-cell RNA-seq data, |
| 4 | +#' based on the distance between genes in a specified pathway. The distance is computed |
| 5 | +#' for each batch of cells, and classical multidimensional scaling (MDS) is used to |
| 6 | +#' visualize the pathway expression across cells. |
| 7 | +#' |
| 8 | +#' @name ComputeCellData |
| 9 | +#' @import Seurat |
| 10 | +#' @importFrom matrixStats rowMins rowMaxs |
| 11 | +#' @importFrom stats dist cmdscale |
| 12 | +#' @importFrom dplyr %>% |
| 13 | +#' @importFrom purrr map |
| 14 | +#' @import tidyverse |
| 15 | +#' @import viridis |
| 16 | +#' |
| 17 | +#' @param x A `Seurat` object containing single-cell RNA sequencing data. |
| 18 | +#' @param pathway A `character` string specifying the pathway name. This should match a pathway used by `LoadPathway()`. |
| 19 | +#' @param distance.method A `character` string specifying the distance metric to use.Default is "manhattan". |
| 20 | +#' Options include: `"manhattan"`, `"euclidean"`, `"canberra"`, `"binary"`, `"minkowski"` |
| 21 | +#' @param batch.size An `integer` specifying the number of cells to process per batch. Default is 1000. |
| 22 | +#' @param scale.data A `logical` indicating whether to use scaled data (`scale.data = TRUE`) or normalized data. Default is `TRUE`. |
| 23 | +#' |
| 24 | +#' @return A data frame of MDS results with normalized values per cell, suitable for thresholding or visualization. |
| 25 | +#' |
| 26 | +#' @examples |
| 27 | +#' data(fake_test_object) |
| 28 | +#' ComputeCellData(fake_test_object, pathway = "Wnt", distance.method = "manhattan", batch.size = 2000) |
| 29 | +#' |
| 30 | +#' @export |
| 31 | +ComputeCellData <- function(x, pathway, distance.method, batch.size = batch.size, scale.data = TRUE){ |
| 32 | + |
| 33 | + # Get pathway data |
| 34 | + pathwaydata <- LoadPathway(pathway) |
| 35 | + names <- c(pathwaydata[[1]]) |
| 36 | + |
| 37 | + # Use only genes present in Seurat object |
| 38 | + valid_names <- intersect(names, rownames(x)) |
| 39 | + if (length(valid_names) == 0) { |
| 40 | + stop("No valid pathway genes found in the Seurat object.") |
| 41 | + } |
| 42 | + x <- ScaleData(x, features = valid_names) |
| 43 | + |
| 44 | + # Extract expression data from the desired slot |
| 45 | + slot_use <- if (scale.data) "scale.data" else "data" |
| 46 | + expr_data <- GetAssayData(x, assay = "RNA", slot = slot_use)[valid_names, , drop = FALSE] |
| 47 | + |
| 48 | + # Pathway max and min |
| 49 | + pathway.stat <- PathwayMaxMin(x, pathway) |
| 50 | + |
| 51 | + # Get cell indices |
| 52 | + cell_id <- colnames(expr_data) |
| 53 | + |
| 54 | + # Shuffle cell indices |
| 55 | + shuffled_cell_id <- sample(cell_id) |
| 56 | + |
| 57 | + # Split shuffled indices into batches |
| 58 | + # Check if batch.size is provided; if not, set default and message |
| 59 | + if (missing(batch.size) || is.null(batch.size)) { |
| 60 | + message("Parameter 'batch.size' is missing or NULL. Setting default batch size to 1000.") |
| 61 | + batch.size <- 1000 |
| 62 | + } |
| 63 | + |
| 64 | + # Define batch size |
| 65 | + batch_size <- batch.size |
| 66 | + |
| 67 | + batches <- split(shuffled_cell_id, ceiling(seq_along(shuffled_cell_id) / batch.size)) |
| 68 | + |
| 69 | + # Subset expression data into chunks based on sampled indices |
| 70 | + expr_chunks <- lapply(batches, function(cols) expr_data[, cols, drop = FALSE]) |
| 71 | + |
| 72 | + # For each expr_chunks, do distance measuring |
| 73 | + # Initialize list to store results |
| 74 | + batch_results <- list() |
| 75 | + |
| 76 | + # Loop through batches of 500 cells |
| 77 | + for (i in seq_len(length(batches))) { |
| 78 | + |
| 79 | + message("Processing batch ", i) |
| 80 | + |
| 81 | + # Extract and convert expression chunk |
| 82 | + expr_data <- expr_chunks[[i]] |
| 83 | + temp.data.batch <- as.data.frame(expr_data) |
| 84 | + |
| 85 | + # Merge along columns |
| 86 | + pathwaytempdata <- cbind(pathway.stat, temp.data.batch) |
| 87 | + |
| 88 | + # Check for enough cells (columns) |
| 89 | + if (ncol(pathwaytempdata) < 2) { |
| 90 | + warning("Batch ", i, " does not have enough cells for distance calculation. Skipping...") |
| 91 | + next |
| 92 | + } |
| 93 | + |
| 94 | + # Check if distance.method is provided; if not, set default and message |
| 95 | + if (missing(distance.method) || is.null(distance.method)) { |
| 96 | + message("Parameter 'distance.method' is missing or NULL. Setting default distance.method to 'manhattan'.") |
| 97 | + distance.method <- "manhattan" |
| 98 | + } |
| 99 | + |
| 100 | + # Distance calculation |
| 101 | + message("Computing distance...") |
| 102 | + d <- dist(t(pathwaytempdata), method = distance.method) |
| 103 | + # "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression) |
| 104 | + # "euclidean" is stratight-line distance, is useful for PCA clustering |
| 105 | + # "canberra" is weighted distance, is also good for sparse data and when values have very different scales |
| 106 | + # "binary" is distance based on presence/absence (0/1) |
| 107 | + # "minkowski" is generalization of euclidean & manhattan, tunable using p parameter |
| 108 | + # choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance |
| 109 | + |
| 110 | + # MDS |
| 111 | + message("Running MDS ...") |
| 112 | + fit <- cmdscale(d, eig = TRUE, k = 1) |
| 113 | + message("MDS finished") |
| 114 | + |
| 115 | + # Normalize the MDS values |
| 116 | + temp.data.mds <- as.data.frame(fit$points) |
| 117 | + colnames(temp.data.mds) <- "V1" |
| 118 | + V1_min <- min(temp.data.mds$V1, na.rm = TRUE) |
| 119 | + V1_max <- max(temp.data.mds$V1, na.rm = TRUE) |
| 120 | + |
| 121 | + if (V1_max == V1_min) { |
| 122 | + temp.data.mds$normalized <- 0 |
| 123 | + } else { |
| 124 | + temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min) |
| 125 | + } |
| 126 | + |
| 127 | + # Store result |
| 128 | + batch_results[[i]] <- temp.data.mds |
| 129 | + |
| 130 | + # Report |
| 131 | + cat("Batch", i, "processed with", ncol(expr_data), "cells\n") |
| 132 | + } |
| 133 | + |
| 134 | + final_mds <- do.call(rbind, batch_results) # Merge all batch MDS results |
| 135 | + |
| 136 | + return(final_mds) |
| 137 | +} |
| 138 | + |
| 139 | +# using sample |
| 140 | +# barcode list (randomization) |
| 141 | +# list of data chunk |
| 142 | +# make these list independent |
| 143 | +# short loop |
| 144 | +# lappy, sapply (list-wide operation) |
| 145 | +# https://www.r-bloggers.com/2022/03/complete-tutorial-on-using-apply-functions-in-r/ |
0 commit comments