PathwayEmbed/R/ComputeCellData.R at 9943482ef8e98f8873de2d1c69d89e032fc82e6b · RaredonLab/PathwayEmbed · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#' ComputeCellData
#'
#' A function computes cell status for a given pathway in single-cell RNA-seq data,
#' based on the distance between genes in a specified pathway. The distance is computed
#' for each batch of cells, and classical multidimensional scaling (MDS) is used to
#' visualize the pathway expression across cells.
#'
#' @name ComputeCellData
#' @import Seurat
#' @importFrom matrixStats rowMins rowMaxs
#' @importFrom stats dist cmdscale
#' @importFrom dplyr %>%
#' @importFrom purrr map
#' @import tidyverse
#' @import viridis
#'
#' @param x A `Seurat` object containing single-cell RNA sequencing data.
#' @param pathway A `character` string specifying the pathway name. This should match a pathway used by `LoadPathway()`.
#' @param distance.method A `character` string specifying the distance metric to use.Default is "manhattan".
#' Options include: `"manhattan"`, `"euclidean"`, `"canberra"`, `"binary"`, `"minkowski"`
#' @param batch.size An `integer` specifying the number of cells to process per batch. Default is 1000.
#' @param scale.data A `logical` indicating whether to use scaled data (`scale.data = TRUE`) or normalized data. Default is `TRUE`.
#'
#' @return A data frame of MDS results with normalized values per cell, suitable for thresholding or visualization.
#'
#' @examples
#' data(fake_test_object)
#' ComputeCellData(fake_test_object, pathway = "Wnt", distance.method = "manhattan", batch.size = 2000)
#'
#' @export
ComputeCellData <- function(x, pathway, distance.method, batch.size = batch.size, scale.data = TRUE){

  # Get pathway data
  pathwaydata <- LoadPathway(pathway)
  names <- c(pathwaydata[[1]])

  # Use only genes present in Seurat object
  valid_names <- intersect(names, rownames(x))
  if (length(valid_names) == 0) {
    stop("No valid pathway genes found in the Seurat object.")
  }
  x <- ScaleData(x, features = valid_names)

  # Extract expression data from the desired slot
  slot_use <- if (scale.data) "scale.data" else "data"
  expr_data <- GetAssayData(x, assay = "RNA", slot = slot_use)[valid_names, , drop = FALSE]

  # Pathway max and min
  pathway.stat <- PathwayMaxMin(x, pathway)

  # Get cell indices
  cell_id <- colnames(expr_data)

  # Shuffle cell indices
  shuffled_cell_id <- sample(cell_id)

  # Split shuffled indices into batches
  # Check if batch.size is provided; if not, set default and message
  if (missing(batch.size) || is.null(batch.size)) {
    message("Parameter 'batch.size' is missing or NULL. Setting default batch size to 1000.")
    batch.size <- 1000
  }

  # Define batch size
  batch_size <- batch.size

  batches <- split(shuffled_cell_id, ceiling(seq_along(shuffled_cell_id) / batch.size))

  # Subset expression data into chunks based on sampled indices
  expr_chunks <- lapply(batches, function(cols) expr_data[, cols, drop = FALSE])

  # For each expr_chunks, do distance measuring
  # Initialize list to store results
  batch_results <- list()

  # Loop through batches of 500 cells
  for (i in seq_len(length(batches))) {

    message("Processing batch ", i)

    # Extract and convert expression chunk
    expr_data <- expr_chunks[[i]]
    temp.data.batch <- as.data.frame(expr_data)

    # Merge along columns
    pathwaytempdata <- cbind(pathway.stat, temp.data.batch)

    # Check for enough cells (columns)
    if (ncol(pathwaytempdata) < 2) {
      warning("Batch ", i, " does not have enough cells for distance calculation. Skipping...")
      next
    }

    # Check if distance.method is provided; if not, set default and message
    if (missing(distance.method) || is.null(distance.method)) {
      message("Parameter 'distance.method' is missing or NULL. Setting default distance.method to 'manhattan'.")
      distance.method <- "manhattan"
    }

    # Distance calculation
    message("Computing distance...")
    d <- dist(t(pathwaytempdata), method = distance.method)
    # "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression)
    # "euclidean" is stratight-line distance, is useful for PCA clustering
    # "canberra" is weighted distance, is also good for sparse data and when values have very different scales
    # "binary" is distance based on presence/absence (0/1)
    # "minkowski" is generalization of euclidean & manhattan, tunable using p parameter
    # choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance

    # MDS
    message("Running MDS ...")
    fit <- cmdscale(d, eig = TRUE, k = 1)
    message("MDS finished")

    # Normalize the MDS values
    temp.data.mds <- as.data.frame(fit$points)
    colnames(temp.data.mds) <- "V1"
    V1_min <- min(temp.data.mds$V1, na.rm = TRUE)
    V1_max <- max(temp.data.mds$V1, na.rm = TRUE)

    if (V1_max == V1_min) {
      temp.data.mds$normalized <- 0
    } else {
      temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min)
    }

    # Store result
    batch_results[[i]] <- temp.data.mds

    # Report
    cat("Batch", i, "processed with", ncol(expr_data), "cells\n")
  }

  final_mds <- do.call(rbind, batch_results)  # Merge all batch MDS results

  return(final_mds)
}

# using sample
# barcode list (randomization)
# list of data chunk
# make these list independent
# short loop
# lappy, sapply (list-wide operation)
# https://www.r-bloggers.com/2022/03/complete-tutorial-on-using-apply-functions-in-r/