Skip to content

Commit 01278e7

Browse files
Merge pull request #9 from RaredonLab/dev
update main
2 parents 73e5d1c + 9943482 commit 01278e7

87 files changed

Lines changed: 11247 additions & 25 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.Rbuildignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
^.*\.Rproj$
2+
^\.Rproj\.user$
3+
^LICENSE\.md$
4+
^doc$
5+
^Meta$

.gitignore

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# DS Store
2+
.DS_Store
3+
14
# History files
25
.Rhistory
36
.Rapp.history
@@ -39,11 +42,10 @@ vignettes/*.pdf
3942
# R Environment Variables
4043
.Renviron
4144

42-
# pkgdown site
43-
docs/
44-
4545
# translation temp files
4646
po/*~
4747

4848
# RStudio Connect folder
4949
rsconnect/
50+
inst/doc
51+
/Meta/

DESCRIPTION

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
Package: PathwayEmbed
2+
Title: Tools for Pathway-Level Embedding and Visualization in Single-Cell Data
3+
Version: 0.0.0.9000
4+
Authors@R:
5+
person("Yaqing", "Huang", email = "yaqing.huang@yale.edu", role = c("aut", "cre"))
6+
Description: Provides tools for analyzing and visualizing pathway-level activity
7+
in single-cell RNA-seq data. Includes functions for computing cell-wise pathway scores,
8+
visualizing transduction states, calculating activation percentages,
9+
and integrating pathway data with Seurat objects.
10+
License: MIT + file LICENSE
11+
Encoding: UTF-8
12+
Roxygen: list(markdown = TRUE)
13+
RoxygenNote: 7.3.2
14+
Depends:
15+
R (>= 3.5)
16+
Imports:
17+
readxl,
18+
Seurat,
19+
RColorBrewer,
20+
ggplot2,
21+
cowplot,
22+
dplyr,
23+
matrixStats,
24+
viridis,
25+
stats,
26+
effsize,
27+
tidyverse,
28+
purrr
29+
Suggests:
30+
knitr,
31+
rmarkdown,
32+
testthat (>= 3.0.0)
33+
Config/testthat/edition: 3
34+
LazyData: true
35+
VignetteBuilder: knitr

LICENSE

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,2 @@
1-
MIT License
2-
3-
Copyright (c) 2024 Raredon Lab
4-
5-
Permission is hereby granted, free of charge, to any person obtaining a copy
6-
of this software and associated documentation files (the "Software"), to deal
7-
in the Software without restriction, including without limitation the rights
8-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9-
copies of the Software, and to permit persons to whom the Software is
10-
furnished to do so, subject to the following conditions:
11-
12-
The above copyright notice and this permission notice shall be included in all
13-
copies or substantial portions of the Software.
14-
15-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21-
SOFTWARE.
1+
YEAR: 2025
2+
COPYRIGHT HOLDER: Raredon Lab

LICENSE.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# MIT License
2+
3+
Copyright (c) 2025 Raredon Lab
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

NAMESPACE

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(CalculatePercentage)
4+
export(ComputeCellData)
5+
export(LoadPathway)
6+
export(PathwayMaxMin)
7+
export(PlotPathway)
8+
export(PreparePlotData)
9+
import(RColorBrewer)
10+
import(Seurat)
11+
import(cowplot)
12+
import(ggplot2)
13+
import(matrixStats)
14+
import(readxl)
15+
import(tidyverse)
16+
import(viridis)
17+
importFrom(dplyr,"%>%")
18+
importFrom(dplyr,bind_rows)
19+
importFrom(effsize,cohen.d)
20+
importFrom(matrixStats,rowMaxs)
21+
importFrom(matrixStats,rowMins)
22+
importFrom(purrr,map)
23+
importFrom(stats,cmdscale)
24+
importFrom(stats,dist)
25+
importFrom(stats,na.omit)

PathwayEmbed.Rproj

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
Version: 1.0
2+
ProjectId: 0c111876-39b0-460f-a888-db107bec1084
3+
4+
RestoreWorkspace: No
5+
SaveWorkspace: No
6+
AlwaysSaveHistory: Default
7+
8+
EnableCodeIndexing: Yes
9+
UseSpacesForTab: Yes
10+
NumSpacesForTab: 2
11+
Encoding: UTF-8
12+
13+
RnwWeave: Sweave
14+
LaTeX: pdfLaTeX
15+
16+
AutoAppendNewline: Yes
17+
StripTrailingWhitespace: Yes
18+
LineEndingConversion: Posix
19+
20+
BuildType: Package
21+
PackageUseDevtools: Yes
22+
PackageInstallArgs: --no-multiarch --with-keep.source
23+
PackageRoxygenize: rd,collate,namespace

R/CalculatePercentage.R

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#' CalculatePercentage
2+
#'
3+
#' This function calculates the percentage of cells in ON (scale > 0) and OFF (scale < 0)
4+
#' activation states within each group defined by `group_var`. If exactly two groups
5+
#' are provided, it also computes Cohen's d effect size between their activation values.
6+
#'
7+
#' @name CalculatePercentage
8+
#' @importFrom dplyr bind_rows
9+
#' @importFrom effsize cohen.d
10+
#' @importFrom stats na.omit
11+
#' @param to.plot A data frame containing at least a `scale` column and a grouping column.
12+
#' @param group_var A string specifying the grouping variable (e.g., "genotype", "treatment").
13+
#'
14+
#' @return A data frame with the percentage of ON/OFF cells and Cohen's d (if applicable).
15+
#' @examples
16+
#' data(fake_to_plot)
17+
#' CalculatePercentage(fake_to_plot, "genotype")
18+
#' @export
19+
CalculatePercentage <- function(to.plot, group_var){
20+
# Make sure there is scale data
21+
stopifnot("scale" %in% names(to.plot))
22+
23+
# Make sure no NA
24+
groups <- unique(na.omit(to.plot[[group_var]]))
25+
results <- list()
26+
27+
for (g in groups) {
28+
subset_data <- to.plot[to.plot[[group_var]] == g, ]
29+
total <- nrow(subset_data)
30+
31+
# Calculate how many cells are in on/off status
32+
on <- sum(subset_data[["scale"]] > 0, na.rm = TRUE)
33+
off <- sum(subset_data[["scale"]] < 0, na.rm = TRUE)
34+
35+
# Calculate percentages of on/off cells
36+
results[[as.character(g)]] <- list(
37+
percentage_on = round(100 * on / total, 2),
38+
percentage_off = round(100 * off / total, 2)
39+
)
40+
}
41+
42+
# When there are two groups in comparison, Cohen's d — a measure of effect size — will be applied for statistic purpose
43+
if (length(groups) == 2) {
44+
g1 <- groups[1]
45+
g2 <- groups[2]
46+
vec1 <- to.plot[to.plot[[group_var]] == g1, "scale"]
47+
vec2 <- to.plot[to.plot[[group_var]] == g2, "scale"]
48+
49+
# Computes Cohen's d between two numeric vectors (vec1 and vec2) and extracts the estimated value of the effect size.
50+
cohens_d_val <- cohen.d(vec1, vec2)$estimate
51+
# |d value|: 0 - 0.2, effect size is negligible
52+
# |d value|: 0.2 - 0.5: small effect
53+
# |d value|: 0.5 - 0.8: medium effect
54+
# |d value|: > 0.8: large effect
55+
56+
results[[as.character(g1)]]$cohens_d <- cohens_d_val
57+
results[[as.character(g2)]]$cohens_d <- cohens_d_val
58+
}
59+
60+
# Make a dataframe for the output
61+
df <- bind_rows(results, .id = "group")
62+
return(df)
63+
}

R/ComputeCellData.R

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#' ComputeCellData
2+
#'
3+
#' A function computes cell status for a given pathway in single-cell RNA-seq data,
4+
#' based on the distance between genes in a specified pathway. The distance is computed
5+
#' for each batch of cells, and classical multidimensional scaling (MDS) is used to
6+
#' visualize the pathway expression across cells.
7+
#'
8+
#' @name ComputeCellData
9+
#' @import Seurat
10+
#' @importFrom matrixStats rowMins rowMaxs
11+
#' @importFrom stats dist cmdscale
12+
#' @importFrom dplyr %>%
13+
#' @importFrom purrr map
14+
#' @import tidyverse
15+
#' @import viridis
16+
#'
17+
#' @param x A `Seurat` object containing single-cell RNA sequencing data.
18+
#' @param pathway A `character` string specifying the pathway name. This should match a pathway used by `LoadPathway()`.
19+
#' @param distance.method A `character` string specifying the distance metric to use.Default is "manhattan".
20+
#' Options include: `"manhattan"`, `"euclidean"`, `"canberra"`, `"binary"`, `"minkowski"`
21+
#' @param batch.size An `integer` specifying the number of cells to process per batch. Default is 1000.
22+
#' @param scale.data A `logical` indicating whether to use scaled data (`scale.data = TRUE`) or normalized data. Default is `TRUE`.
23+
#'
24+
#' @return A data frame of MDS results with normalized values per cell, suitable for thresholding or visualization.
25+
#'
26+
#' @examples
27+
#' data(fake_test_object)
28+
#' ComputeCellData(fake_test_object, pathway = "Wnt", distance.method = "manhattan", batch.size = 2000)
29+
#'
30+
#' @export
31+
ComputeCellData <- function(x, pathway, distance.method, batch.size = batch.size, scale.data = TRUE){
32+
33+
# Get pathway data
34+
pathwaydata <- LoadPathway(pathway)
35+
names <- c(pathwaydata[[1]])
36+
37+
# Use only genes present in Seurat object
38+
valid_names <- intersect(names, rownames(x))
39+
if (length(valid_names) == 0) {
40+
stop("No valid pathway genes found in the Seurat object.")
41+
}
42+
x <- ScaleData(x, features = valid_names)
43+
44+
# Extract expression data from the desired slot
45+
slot_use <- if (scale.data) "scale.data" else "data"
46+
expr_data <- GetAssayData(x, assay = "RNA", slot = slot_use)[valid_names, , drop = FALSE]
47+
48+
# Pathway max and min
49+
pathway.stat <- PathwayMaxMin(x, pathway)
50+
51+
# Get cell indices
52+
cell_id <- colnames(expr_data)
53+
54+
# Shuffle cell indices
55+
shuffled_cell_id <- sample(cell_id)
56+
57+
# Split shuffled indices into batches
58+
# Check if batch.size is provided; if not, set default and message
59+
if (missing(batch.size) || is.null(batch.size)) {
60+
message("Parameter 'batch.size' is missing or NULL. Setting default batch size to 1000.")
61+
batch.size <- 1000
62+
}
63+
64+
# Define batch size
65+
batch_size <- batch.size
66+
67+
batches <- split(shuffled_cell_id, ceiling(seq_along(shuffled_cell_id) / batch.size))
68+
69+
# Subset expression data into chunks based on sampled indices
70+
expr_chunks <- lapply(batches, function(cols) expr_data[, cols, drop = FALSE])
71+
72+
# For each expr_chunks, do distance measuring
73+
# Initialize list to store results
74+
batch_results <- list()
75+
76+
# Loop through batches of 500 cells
77+
for (i in seq_len(length(batches))) {
78+
79+
message("Processing batch ", i)
80+
81+
# Extract and convert expression chunk
82+
expr_data <- expr_chunks[[i]]
83+
temp.data.batch <- as.data.frame(expr_data)
84+
85+
# Merge along columns
86+
pathwaytempdata <- cbind(pathway.stat, temp.data.batch)
87+
88+
# Check for enough cells (columns)
89+
if (ncol(pathwaytempdata) < 2) {
90+
warning("Batch ", i, " does not have enough cells for distance calculation. Skipping...")
91+
next
92+
}
93+
94+
# Check if distance.method is provided; if not, set default and message
95+
if (missing(distance.method) || is.null(distance.method)) {
96+
message("Parameter 'distance.method' is missing or NULL. Setting default distance.method to 'manhattan'.")
97+
distance.method <- "manhattan"
98+
}
99+
100+
# Distance calculation
101+
message("Computing distance...")
102+
d <- dist(t(pathwaytempdata), method = distance.method)
103+
# "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression)
104+
# "euclidean" is stratight-line distance, is useful for PCA clustering
105+
# "canberra" is weighted distance, is also good for sparse data and when values have very different scales
106+
# "binary" is distance based on presence/absence (0/1)
107+
# "minkowski" is generalization of euclidean & manhattan, tunable using p parameter
108+
# choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance
109+
110+
# MDS
111+
message("Running MDS ...")
112+
fit <- cmdscale(d, eig = TRUE, k = 1)
113+
message("MDS finished")
114+
115+
# Normalize the MDS values
116+
temp.data.mds <- as.data.frame(fit$points)
117+
colnames(temp.data.mds) <- "V1"
118+
V1_min <- min(temp.data.mds$V1, na.rm = TRUE)
119+
V1_max <- max(temp.data.mds$V1, na.rm = TRUE)
120+
121+
if (V1_max == V1_min) {
122+
temp.data.mds$normalized <- 0
123+
} else {
124+
temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min)
125+
}
126+
127+
# Store result
128+
batch_results[[i]] <- temp.data.mds
129+
130+
# Report
131+
cat("Batch", i, "processed with", ncol(expr_data), "cells\n")
132+
}
133+
134+
final_mds <- do.call(rbind, batch_results) # Merge all batch MDS results
135+
136+
return(final_mds)
137+
}
138+
139+
# using sample
140+
# barcode list (randomization)
141+
# list of data chunk
142+
# make these list independent
143+
# short loop
144+
# lappy, sapply (list-wide operation)
145+
# https://www.r-bloggers.com/2022/03/complete-tutorial-on-using-apply-functions-in-r/

R/LoadPathway.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#' LoadPathway
2+
#'
3+
#' This function reads pathway data from the package's built-in Excel file.
4+
#'
5+
#' @name LoadPathway
6+
#' @param pathway A `character` string specifying the pathway name.
7+
#' @return A data frame with pathway data.
8+
#' @examples
9+
#' LoadPathway("Wnt")
10+
#' @import readxl
11+
#' @export
12+
LoadPathway <- function(pathway) {
13+
file_path <- system.file("extdata", "Pathway_Embedding.xlsx", package = "PathwayEmbed")
14+
15+
if (file_path == "") {
16+
stop("Pathway data file not found. Ensure the package is installed correctly.")
17+
}
18+
19+
# Read the specified sheet
20+
data <- readxl::read_excel(file_path, sheet = pathway)
21+
# extract the molecules in the pathway
22+
pathway.molecules <- c(data[["Molecules"]])
23+
# extract the coefficients of the molecules in the pathway
24+
pathway.coefficients <- as.numeric(c(data[["Coefficients"]]))
25+
26+
return(data)
27+
}

0 commit comments

Comments
 (0)