Skip to content

Commit d6434ad

Browse files
authored
Merge pull request #3 from iibadshah/main
protools2 v0.2.10
2 parents 4f85d2f + bcfce1a commit d6434ad

4 files changed

Lines changed: 429 additions & 406 deletions

File tree

DESCRIPTION

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: protools2
22
Type: Package
33
Title: A Set Of Tools For Proteomics And Phosphoproteomics Data Analysis
4-
Version: 0.2.9
5-
Date: 2023-06-22
4+
Version: 0.2.10
5+
Date: 2024-04-24
66
Author: Pedro Rodriguez Cutillas, Irbaz I. Badshah
77
Maintainer: Pedro Rodriguez Cutillas <p.cutillas@qmul.ac.uk>
88
Description: Reads and normalizes data from Pescal. Does statistical analysis using limma or t-test. It also incoroporates functions for KSEA, pathway analysis and network visualization tools.
@@ -47,5 +47,6 @@ Imports:
4747
parallel,
4848
foreach,
4949
doParallel,
50-
limma
50+
limma,
51+
ComplexHeatmap
5152
RoxygenNote: 7.2.3

R/normalize functions.R

Lines changed: 159 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -2,161 +2,229 @@
22
# Normalise output_areas
33
# Author & copyright: Pedro R. Cutillas
44

5-
normalize_areas_return_ppindex <- function(pescal_output_file, delta_score_cut_off=1){
5+
normalize_areas_return_ppindex <- function(pescal_output_file, delta_score_cut_off=1) {
66

77
# set delta_score_cut_off to low (say 1) for proteomics data,
88
# high (say 15) for phosphoproteomics data
99

10-
1110
library(foreach)
1211
library(doParallel)
1312

1413
suppressMessages(
15-
df.areas <- readxl::read_excel(pescal_output_file,"output_areas")
14+
df.areas <- readxl::read_excel(pescal_output_file, "output_areas")
1615
)
17-
colnames(df.areas) <- gsub("-",".",colnames(df.areas),fixed = T)
16+
colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T)
1817
suppressMessages(
19-
df.combi <- readxl::read_excel(pescal.output.file,"combiPeptData")
18+
df.combi <- readxl::read_excel(pescal_output_file, "combiPeptData") # Original `pescal.output.file` object is accessed from global scope
2019
)
21-
# select peptides above the delta_score_cut_off
22-
df.combi <- subset(df.combi,df.combi$max_delta_score>delta_score_cut_off)
23-
24-
peptides <- unique(unlist(df.combi[,25]))
25-
26-
27-
df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id,]
2820

21+
# select peptides above the delta_score_cut_off
22+
df.combi <- subset(df.combi, df.combi$max_delta_score > delta_score_cut_off)
23+
peptides <- unique(unlist(df.combi[, 25])) # 'sites'
24+
df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id, ]
2925
cols <- colnames(dplyr::select_if(df.areas, is.numeric))
30-
df.areas.n <- data.frame(ids=df.areas$db_id,
31-
scale(df.areas[,cols],center = F,
32-
scale = colSums(df.areas[,cols]) )
26+
df.areas.n <- data.frame(
27+
ids = df.areas$db_id,
28+
scale(df.areas[, cols], center = F, scale = colSums(df.areas[, cols]))
3329
)
3430

35-
#plot(unlist(df.areas[1,cols]), unlist(df.areas.n[1,cols]))
36-
#p <- peptides[3]
37-
cores=detectCores()
38-
cl <- makeCluster(cores[1]-1) #not to overload your computer
31+
cores = detectCores()
32+
cl <- makeCluster(cores[1] - 1) # not to overload your computer
3933
registerDoParallel(cl)
4034
t1 <- Sys.time()
41-
df <- foreach(p = peptides, .combine='rbind' )%dopar%{
42-
ids <- na.omit(df.combi[df.combi[,25] ==p,]$db_id)
43-
apply(df.areas.n[df.areas.n$ids %in% ids,cols],2,sum)
44-
45-
}
35+
df <- foreach(p = peptides, .combine = "rbind") %dopar%
36+
{
37+
ids <- na.omit(df.combi[df.combi[, 25] == p, ]$db_id)
38+
apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum)
39+
}
4640
stopCluster(cl)
47-
48-
df.norm <- data.frame(sites=peptides,df*1000000)
41+
df.norm <- data.frame(sites = peptides, df * 1e+06)
4942
rownames(df.norm) <- df.norm$sites
50-
df.norm[df.norm==0] <- NA
51-
df.norm.log2.centered <- data.frame(sites=peptides,scale(log2(df.norm[,cols]),scale = F))
52-
df.norm.log2.centered.scaled <- data.frame(sites=peptides,scale(log2(df.norm[,cols])))
43+
df.norm[df.norm == 0] <- NA
44+
45+
df.norm.log2.centered <- data.frame(
46+
sites = peptides,
47+
scale(log2(df.norm[, cols]), scale = F)
48+
)
5349

50+
df.norm.log2.centered.scaled <- data.frame(
51+
sites = peptides,
52+
scale(log2(df.norm[, cols]))
53+
)
5454

55+
# Alternative na imputation
56+
# Centred & scaled
5557
df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
56-
df.norm.log2.centered.scaled.na.imputed[is.na(df.norm.log2.centered.scaled.na.imputed)] <- min(df.norm.log2.centered.scaled.na.imputed[,cols], na.rm = T)/5
58+
df.norm.log2.centered.scaled.na.imputed1 <- df.norm.log2.centered.scaled
59+
df.norm.log2.centered.scaled.na.imputed2 <- df.norm.log2.centered.scaled
60+
61+
df.norm.log2.centered.scaled.na.imputed[cols] <- lapply(
62+
df.norm.log2.centered.scaled.na.imputed[cols], function(x){
63+
replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Correct NA imputation
64+
}
65+
)
66+
67+
df.norm.log2.centered.scaled.na.imputed1[
68+
is.na(df.norm.log2.centered.scaled.na.imputed1)
69+
] <- min(df.norm.log2.centered.scaled.na.imputed1[,cols], na.rm = T) / 5
70+
71+
df.norm.log2.centered.scaled.na.imputed2[cols] <- lapply(
72+
df.norm.log2.centered.scaled.na.imputed2[cols], function(x){
73+
replace(x, is.na(x), min(x, na.rm = TRUE)) # Correct NA imputation
74+
}
75+
)
76+
77+
# Scaled
78+
df.norm.log2.centered.na.imputed <- df.norm.log2.centered
79+
df.norm.log2.centered.na.imputed[cols] <- lapply(
80+
df.norm.log2.centered.na.imputed[cols], function(x){
81+
replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Correct NA imputation
82+
}
83+
)
5784

85+
# Previous na imputation
86+
# df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
87+
# df.norm.log2.centered.scaled.na.imputed[
88+
# is.na(df.norm.log2.centered.scaled.na.imputed)
89+
# ] <- min(df.norm.log2.centered.scaled.na.imputed[, cols], na.rm = T) / 5
5890

5991
rownames(df.norm.log2.centered) <- df.norm.log2.centered$sites
6092
rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$sites
61-
return(list(normalized.data=df.norm,
62-
normalized.plus.log2.cent.data=df.norm.log2.centered,
63-
normalized.plus.log2.cent.scaled.data=df.norm.log2.centered.scaled,
64-
df.norm.log2.centered.scaled.na.imputed=df.norm.log2.centered.scaled.na.imputed))
65-
}
6693

94+
return(list(
95+
normalized.data = df.norm,
96+
normalized.plus.log2.cent.data = df.norm.log2.centered,
97+
normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled,
98+
df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed,
99+
df.norm.log2.centered.na.imputed=df.norm.log2.centered.na.imputed,
100+
df.norm.log2.centered.scaled.na.imputed1=df.norm.log2.centered.scaled.na.imputed1,
101+
df.norm.log2.centered.scaled.na.imputed2=df.norm.log2.centered.scaled.na.imputed2
102+
))
103+
}
67104

68105

69-
normalize_areas_return_protein_groups <- function(pescal_output_file,
70-
mascot.score.cut.off=50,
71-
n.peptide.cut.off=1){
72106

107+
normalize_areas_return_protein_groups <- function(
108+
pescal_output_file,
109+
mascot.score.cut.off=50,
110+
n.peptide.cut.off=1
111+
) {
73112
suppressMessages(
74-
df.areas <- data.frame( readxl::read_excel(pescal_output_file,"output_areas"))
113+
df.areas <- data.frame(readxl::read_excel(pescal_output_file, "output_areas"))
75114
)
76-
colnames(df.areas) <- gsub("-",".",colnames(df.areas),fixed = T)
115+
colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T)
77116
suppressMessages(
78-
df.combi <- data.frame(readxl::read_excel(pescal.output.file,"combiPeptData"))
117+
df.combi <- data.frame(readxl::read_excel(pescal_output_file, "combiPeptData")) # Original `pescal.output.file` object is accessed from global scope
79118
)
80119

81-
82-
83120
# normalise areas
84121
cols <- colnames(dplyr::select_if(df.areas, is.numeric))
85-
df.areas.n <- data.frame(ids=df.areas$db_id,
86-
scale(df.areas[,cols],center = F,
87-
scale = colSums(df.areas[,cols]) )
122+
df.areas.n <- data.frame(
123+
ids = df.areas$db_id,
124+
scale(df.areas[, cols], center = F, scale = colSums(df.areas[, cols]))
88125
)
89126

90127
# find protein groups
91-
protein.groups <- na.omit(unique(unlist(df.combi[,29])))
128+
protein.groups <- na.omit(unique(unlist(df.combi[, 29]))) # 29 = genes
92129
n.protein.groups <- length(protein.groups)
93130

94131
# group peptides by protein group
95-
cores=detectCores()
96-
cl <- makeCluster(cores[1]-1) #not to overload your computer
132+
cores = detectCores()
133+
cl <- makeCluster(cores[1] - 1) # not to overload your computer
97134
registerDoParallel(cl)
98135
t1 <- Sys.time()
99-
df <- foreach(p = protein.groups, .combine='rbind' )%dopar%{
100-
101-
dfx <- df.combi[df.combi[,29] ==p,]
102-
103-
ids <- na.omit(dfx$db_id)
104-
best.mascot.score <- max(dfx$max_scr,na.rm = T)
105-
protein.name <- dfx$protein[1]
106-
acc <- na.omit(dfx$acc_no)[1]
107-
uniprot.id <- na.omit(dfx[1,30])[1]
108-
n.peptides <- length(ids)
109-
nPSMs <- na.omit(dfx[,"N_peptides"])
110-
c(protein.group=p,
111-
apply(df.areas.n[df.areas.n$ids %in% ids,cols],2,sum),
112-
best.mascot.score=best.mascot.score,
113-
n.peptides=n.peptides,
114-
n.psm=sum(nPSMs),
115-
acc=acc,
116-
uniprot.id=uniprot.id,
117-
protein.name=protein.name)
118-
119-
}
136+
df <- foreach(p = protein.groups, .combine = "rbind") %dopar%
137+
{
138+
dfx <- df.combi[df.combi[, 29] == p, ]
139+
ids <- na.omit(dfx$db_id)
140+
best.mascot.score <- max(dfx$max_scr, na.rm = T)
141+
protein.name <- dfx$protein[1]
142+
acc <- na.omit(dfx$acc_no)[1]
143+
uniprot.id <- na.omit(dfx[1, 30])[1]
144+
n.peptides <- length(ids)
145+
nPSMs <- na.omit(dfx[, "N_peptides"])
146+
c(
147+
protein.group = p,
148+
apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum),
149+
best.mascot.score = best.mascot.score,
150+
n.peptides = n.peptides,
151+
n.psm = sum(nPSMs),
152+
acc = acc,
153+
uniprot.id = uniprot.id,
154+
protein.name = protein.name
155+
)
156+
}
120157
stopCluster(cl)
121158

122-
write.csv(df,"temp.csv")
159+
write.csv(df, "temp.csv")
123160
x <- read.csv("temp.csv")
124-
x[x==0] <- NA
125-
df.norm <- data.frame(protein.group=x$protein.group, x[,cols]*1000000)
126-
161+
x[x == 0] <- NA
162+
df.norm <- data.frame(protein.group = x$protein.group, x[, cols] * 1e+06)
127163

128-
df.norm.log2.centered <- data.frame(protein.group=protein.groups,scale(log2(df.norm[,cols]),scale = F))
129-
df.norm.log2.centered.scaled <- data.frame(protein.group=protein.groups,scale(log2(df.norm[,cols])))
164+
df.norm.log2.centered <- data.frame(
165+
protein.group = protein.groups,
166+
scale(log2(df.norm[, cols]), scale = F)
167+
)
130168

169+
df.norm.log2.centered.scaled <- data.frame(
170+
protein.group = protein.groups,
171+
scale(log2(df.norm[, cols]))
172+
)
131173

132174
df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
133-
df.norm.log2.centered.scaled.na.imputed[is.na(df.norm.log2.centered.scaled.na.imputed)] <- min(df.norm.log2.centered.scaled.na.imputed[,cols], na.rm = T)/5
134175

176+
# df.norm.log2.centered.scaled.na.imputed[ # Previous na imputation
177+
# is.na(df.norm.log2.centered.scaled.na.imputed)
178+
# ] <- min(df.norm.log2.centered.scaled.na.imputed[, cols], na.rm = T) / 5
179+
180+
df.norm.log2.centered.scaled.na.imputed[cols] <- lapply( # New na imputation
181+
df.norm.log2.centered.scaled.na.imputed[cols], function(x){
182+
replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Correct NA imputation
183+
}
184+
)
135185

136186
rownames(df.norm.log2.centered) <- df.norm.log2.centered$protein.group
137187
rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$protein.group
138188
rownames(df.norm) <- df.norm.log2.centered$protein.group
139189
rownames(df.norm.log2.centered.scaled.na.imputed) <- df.norm.log2.centered.scaled.na.imputed$protein.group
140190

141-
142-
xx <- x[x$best.mascot.score>mascot.score.cut.off & x$n.peptides>n.peptide.cut.off,]
191+
xx <- x[
192+
x$best.mascot.score > mascot.score.cut.off &
193+
x$n.peptides > n.peptide.cut.off, ]
143194

144195
selected.prot.groups <- xx$protein.group
145196

146-
cc <- c('protein.group', 'best.mascot.score','n.peptides',"n.psm" ,'acc','uniprot.id','protein.name')
147-
148-
df.norm <- merge.data.frame(df.norm,x[,cc],by="protein.group")
149-
df.norm.log2.centered <- merge.data.frame(df.norm.log2.centered,x[,cc],by="protein.group")
150-
df.norm.log2.centered.scaled <- merge.data.frame(df.norm.log2.centered.scaled,x[,cc],by="protein.group")
151-
df.norm.log2.centered.scaled.na.imputed <- merge.data.frame(df.norm.log2.centered.scaled.na.imputed,x[,cc],by="protein.group")
152-
153-
197+
cc <- c(
198+
"protein.group", "best.mascot.score", "n.peptides",
199+
"n.psm", "acc", "uniprot.id", "protein.name"
200+
)
154201

155-
return(list(normalized.data=df.norm[df.norm$protein.group %in% selected.prot.groups,],
156-
normalized.plus.log2.cent.data=df.norm.log2.centered[df.norm.log2.centered$protein.group %in% selected.prot.groups,],
157-
normalized.plus.log2.cent.scaled.data=df.norm.log2.centered.scaled[df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups,],
158-
df.norm.log2.centered.scaled.na.imputed=df.norm.log2.centered.scaled.na.imputed[df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups,]))
202+
df.norm <- merge.data.frame(df.norm, x[, cc], by = "protein.group")
203+
df.norm.log2.centered <- merge.data.frame(
204+
df.norm.log2.centered,
205+
x[, cc],
206+
by = "protein.group"
207+
)
208+
df.norm.log2.centered.scaled <- merge.data.frame(
209+
df.norm.log2.centered.scaled,
210+
x[, cc],
211+
by = "protein.group"
212+
)
213+
df.norm.log2.centered.scaled.na.imputed <- merge.data.frame(
214+
df.norm.log2.centered.scaled.na.imputed,
215+
x[, cc],
216+
by = "protein.group"
217+
)
159218

219+
return(list(
220+
normalized.data = df.norm[df.norm$protein.group %in% selected.prot.groups, ],
221+
normalized.plus.log2.cent.data = df.norm.log2.centered[
222+
df.norm.log2.centered$protein.group %in% selected.prot.groups, ],
223+
normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled[
224+
df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups, ],
225+
df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed[
226+
df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups, ]
227+
))
160228
}
161229

162230

0 commit comments

Comments
 (0)