forked from MinjieSh/Cosbin
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCosbin_functions.R
More file actions
139 lines (117 loc) · 4 KB
/
Cosbin_functions.R
File metadata and controls
139 lines (117 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#' Normalize data (in super sample) using Cosbin algorithm
#'
#' @param data data (super sample: mean of each group).
#' @return a list contains normalized data and normalization factor.
#' @examples
#' cosbin(data)
cosbin <- function(data) {
result <- data
row.names(result) <- 1:nrow(result)
# Step 1: identify aDEG
threshold <- 0.728284 #cos(arccos(0.5773503) - arccos(0.98))
while (max(cos_iDEG(result)) >= threshold) {
temp_sDEG_ind <- which.max(cos_iDEG(result))
result <- result[-temp_sDEG_ind, ]
result <- totalcount(result)$data
}
# STEP 2: identify iCEG, normalize based on iCEG
# converge: # not change
while (length(which(cos_iCEG(result) >= 0.98)) != dim(result)[1]) {
temp_CEG_ind <- which(cos_iCEG(result) >= 0.98)
result <- result[temp_CEG_ind,]
result <- totalcount(result)$data
}
# Final step: normalization
ind <- as.numeric(row.names(result))
scalar <- colMeans(data[ind, ] / result)
for (i in 1:ncol(data)) {
data[, i] <- data[, i] / scalar[i]
}
return(list(data = data, norm_factor = scalar, CEG_index = ind))
}
#' convert the normalized data (in super sample) to the correct scale (with replicates)
#'
#' @param cosbin_out normalized data, output of the cosbin function
#' @param data2 original data (with replicates, not in super sample)
#' @return final normalization results
#' @examples
#' cosbin_convert(cosbin_out, original_data)
cosbin_convert <- function(cosbin_out, data2) {
CEG <- data2[cosbin_out$CEG_index, ]
CEG_norm <- totalcount(CEG)
factor <- CEG_norm$norm_factor
for (i in 1:dim(data2)[2]) {
data2[, i] <- data2[, i] / factor[i]
}
return(data2)
}
########################## helper functions
#' Low/high-expressed genes are filtered by their L2-norm ranks.
#' @param data Each row is a gene and each column is a sample.
#' @param thres.low The lower bound of percentage of genes to keep for Cosbin
#' with ranked norm. The value should be between 0 and 1.
#' The default is 0.05.
#' @param thres.high The higher bound of percentage of genes to keep for Cosbin
#' with ranked norm. The value should be between 0 and 1.
#' The default is 1.
data_cleaning <- function(data, thres.low = 0.05, thres.high = 1){
if (thres.low >= thres.high) {
stop("thres.low must be smaller than thres.high!")
}
sigNorm <- apply(data, 1, function(x) norm(matrix(x),"F") )
Valid <- sigNorm >= quantile(sigNorm, thres.low) &
sigNorm <= quantile(sigNorm, thres.high)
data <- data[Valid,]
return(data)
}
#' Calculate the cosine between vector A and vector B
mycosine <- function(A, B) {
return(sum(A * B) / sqrt(sum(A ^ 2) * sum(B ^ 2)))
}
#' Calculate the cosine between data and reference vector
cos_ref <- function(data, ref) {
gene <- dim(data)[1]
smp <- dim(data)[2]
res <- array(0, dim = gene)
for (i in 1:gene) {
res[i] <- mycosine(ref, data[i, ])
}
return(res)
}
#' Calculate the cosine between data and CEG reference, e.g. c(1, 1, 1)
cos_iCEG <- function(data) {
gene <- dim(data)[1]
smp <- dim(data)[2]
ref <- rep(1, smp)
res <- array(0, dim = gene)
for (i in 1:gene) {
res[i] <- mycosine(ref, data[i, ])
}
return(res)
}
#' Calculate the cosine between data and closest iDEG reference, e.g. c(1, 0, 0)
cos_iDEG <- function(data) {
gene <- dim(data)[1]
smp <- dim(data)[2]
ref <- vector("list", length = smp)
for (i in 1:smp) {
ref[[i]] <- rep(0, smp)
ref[[i]][i] <- 1
}
res <- array(0, dim = gene)
for (i in 1:gene) {
temp <- NULL
for (j in 1:smp) {
temp <- c(temp, mycosine(ref[[j]], data[i, ]))
}
res[i] <- max(temp)
}
return(res)
}
#' total count normalization
totalcount <- function(data) {
scalar <- colSums(data) / mean(colSums(data))
data <- apply(data, 2, function(x)
x / sum(x)) * mean(colSums(data))
return(list(data = data, norm_factor = scalar))
}