|
2 | 2 | # Normalise output_areas |
3 | 3 | # Author & copyright: Pedro R. Cutillas |
4 | 4 |
|
5 | | -normalize_areas_return_ppindex <- function(pescal_output_file, delta_score_cut_off=1){ |
| 5 | +normalize_areas_return_ppindex <- function(pescal_output_file, delta_score_cut_off=1) { |
6 | 6 |
|
7 | 7 | # set delta_score_cut_off to low (say 1) for proteomics data, |
8 | 8 | # high (say 15) for phosphoproteomics data |
9 | 9 |
|
10 | | - |
11 | 10 | library(foreach) |
12 | 11 | library(doParallel) |
13 | 12 |
|
14 | 13 | suppressMessages( |
15 | | - df.areas <- readxl::read_excel(pescal_output_file,"output_areas") |
| 14 | + df.areas <- readxl::read_excel(pescal_output_file, "output_areas") |
16 | 15 | ) |
17 | | - colnames(df.areas) <- gsub("-",".",colnames(df.areas),fixed = T) |
| 16 | + colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T) |
18 | 17 | suppressMessages( |
19 | | - df.combi <- readxl::read_excel(pescal.output.file,"combiPeptData") |
| 18 | + df.combi <- readxl::read_excel(pescal_output_file, "combiPeptData") # Original `pescal.output.file` object is accessed from global scope |
20 | 19 | ) |
21 | | - # select peptides above the delta_score_cut_off |
22 | | - df.combi <- subset(df.combi,df.combi$max_delta_score>delta_score_cut_off) |
23 | | - |
24 | | - peptides <- unique(unlist(df.combi[,25])) |
25 | | - |
26 | | - |
27 | | - df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id,] |
28 | 20 |
|
| 21 | + # select peptides above the delta_score_cut_off |
| 22 | + df.combi <- subset(df.combi, df.combi$max_delta_score > delta_score_cut_off) |
| 23 | + peptides <- unique(unlist(df.combi[, 25])) # 'sites' |
| 24 | + df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id, ] |
29 | 25 | cols <- colnames(dplyr::select_if(df.areas, is.numeric)) |
30 | | - df.areas.n <- data.frame(ids=df.areas$db_id, |
31 | | - scale(df.areas[,cols],center = F, |
32 | | - scale = colSums(df.areas[,cols]) ) |
| 26 | + df.areas.n <- data.frame( |
| 27 | + ids = df.areas$db_id, |
| 28 | + scale(df.areas[, cols], center = F, scale = colSums(df.areas[, cols])) |
33 | 29 | ) |
34 | 30 |
|
35 | | - #plot(unlist(df.areas[1,cols]), unlist(df.areas.n[1,cols])) |
36 | | - #p <- peptides[3] |
37 | | - cores=detectCores() |
38 | | - cl <- makeCluster(cores[1]-1) #not to overload your computer |
| 31 | + cores = detectCores() |
| 32 | + cl <- makeCluster(cores[1] - 1) # not to overload your computer |
39 | 33 | registerDoParallel(cl) |
40 | 34 | t1 <- Sys.time() |
41 | | - df <- foreach(p = peptides, .combine='rbind' )%dopar%{ |
42 | | - ids <- na.omit(df.combi[df.combi[,25] ==p,]$db_id) |
43 | | - apply(df.areas.n[df.areas.n$ids %in% ids,cols],2,sum) |
44 | | - |
45 | | - } |
| 35 | + df <- foreach(p = peptides, .combine = "rbind") %dopar% |
| 36 | + { |
| 37 | + ids <- na.omit(df.combi[df.combi[, 25] == p, ]$db_id) |
| 38 | + apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum) |
| 39 | + } |
46 | 40 | stopCluster(cl) |
47 | | - |
48 | | - df.norm <- data.frame(sites=peptides,df*1000000) |
| 41 | + df.norm <- data.frame(sites = peptides, df * 1e+06) |
49 | 42 | rownames(df.norm) <- df.norm$sites |
50 | | - df.norm[df.norm==0] <- NA |
51 | | - df.norm.log2.centered <- data.frame(sites=peptides,scale(log2(df.norm[,cols]),scale = F)) |
52 | | - df.norm.log2.centered.scaled <- data.frame(sites=peptides,scale(log2(df.norm[,cols]))) |
| 43 | + df.norm[df.norm == 0] <- NA |
| 44 | + |
| 45 | + df.norm.log2.centered <- data.frame( |
| 46 | + sites = peptides, |
| 47 | + scale(log2(df.norm[, cols]), scale = F) |
| 48 | + ) |
53 | 49 |
|
| 50 | + df.norm.log2.centered.scaled <- data.frame( |
| 51 | + sites = peptides, |
| 52 | + scale(log2(df.norm[, cols])) |
| 53 | + ) |
54 | 54 |
|
| 55 | + # Alternative na imputation |
| 56 | + # Centred & scaled |
55 | 57 | df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled |
56 | | - df.norm.log2.centered.scaled.na.imputed[is.na(df.norm.log2.centered.scaled.na.imputed)] <- min(df.norm.log2.centered.scaled.na.imputed[,cols], na.rm = T)/5 |
| 58 | + df.norm.log2.centered.scaled.na.imputed1 <- df.norm.log2.centered.scaled |
| 59 | + df.norm.log2.centered.scaled.na.imputed2 <- df.norm.log2.centered.scaled |
| 60 | + |
| 61 | + df.norm.log2.centered.scaled.na.imputed[cols] <- lapply( |
| 62 | + df.norm.log2.centered.scaled.na.imputed[cols], function(x){ |
| 63 | + replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Correct NA imputation |
| 64 | + } |
| 65 | + ) |
| 66 | + |
| 67 | + df.norm.log2.centered.scaled.na.imputed1[ |
| 68 | + is.na(df.norm.log2.centered.scaled.na.imputed1) |
| 69 | + ] <- min(df.norm.log2.centered.scaled.na.imputed1[,cols], na.rm = T) / 5 |
| 70 | + |
| 71 | + df.norm.log2.centered.scaled.na.imputed2[cols] <- lapply( |
| 72 | + df.norm.log2.centered.scaled.na.imputed2[cols], function(x){ |
| 73 | + replace(x, is.na(x), min(x, na.rm = TRUE)) # Correct NA imputation |
| 74 | + } |
| 75 | + ) |
| 76 | + |
| 77 | + # Scaled |
| 78 | + df.norm.log2.centered.na.imputed <- df.norm.log2.centered |
| 79 | + df.norm.log2.centered.na.imputed[cols] <- lapply( |
| 80 | + df.norm.log2.centered.na.imputed[cols], function(x){ |
| 81 | + replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Correct NA imputation |
| 82 | + } |
| 83 | + ) |
57 | 84 |
|
| 85 | + # Previous na imputation |
| 86 | + # df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled |
| 87 | + # df.norm.log2.centered.scaled.na.imputed[ |
| 88 | + # is.na(df.norm.log2.centered.scaled.na.imputed) |
| 89 | + # ] <- min(df.norm.log2.centered.scaled.na.imputed[, cols], na.rm = T) / 5 |
58 | 90 |
|
59 | 91 | rownames(df.norm.log2.centered) <- df.norm.log2.centered$sites |
60 | 92 | rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$sites |
61 | | - return(list(normalized.data=df.norm, |
62 | | - normalized.plus.log2.cent.data=df.norm.log2.centered, |
63 | | - normalized.plus.log2.cent.scaled.data=df.norm.log2.centered.scaled, |
64 | | - df.norm.log2.centered.scaled.na.imputed=df.norm.log2.centered.scaled.na.imputed)) |
65 | | -} |
66 | 93 |
|
| 94 | + return(list( |
| 95 | + normalized.data = df.norm, |
| 96 | + normalized.plus.log2.cent.data = df.norm.log2.centered, |
| 97 | + normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled, |
| 98 | + df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed, |
| 99 | + df.norm.log2.centered.na.imputed=df.norm.log2.centered.na.imputed, |
| 100 | + df.norm.log2.centered.scaled.na.imputed1=df.norm.log2.centered.scaled.na.imputed1, |
| 101 | + df.norm.log2.centered.scaled.na.imputed2=df.norm.log2.centered.scaled.na.imputed2 |
| 102 | + )) |
| 103 | +} |
67 | 104 |
|
68 | 105 |
|
69 | | -normalize_areas_return_protein_groups <- function(pescal_output_file, |
70 | | - mascot.score.cut.off=50, |
71 | | - n.peptide.cut.off=1){ |
72 | 106 |
|
| 107 | +normalize_areas_return_protein_groups <- function( |
| 108 | + pescal_output_file, |
| 109 | + mascot.score.cut.off=50, |
| 110 | + n.peptide.cut.off=1 |
| 111 | + ) { |
73 | 112 | suppressMessages( |
74 | | - df.areas <- data.frame( readxl::read_excel(pescal_output_file,"output_areas")) |
| 113 | + df.areas <- data.frame(readxl::read_excel(pescal_output_file, "output_areas")) |
75 | 114 | ) |
76 | | - colnames(df.areas) <- gsub("-",".",colnames(df.areas),fixed = T) |
| 115 | + colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T) |
77 | 116 | suppressMessages( |
78 | | - df.combi <- data.frame(readxl::read_excel(pescal.output.file,"combiPeptData")) |
| 117 | + df.combi <- data.frame(readxl::read_excel(pescal_output_file, "combiPeptData")) # Original `pescal.output.file` object is accessed from global scope |
79 | 118 | ) |
80 | 119 |
|
81 | | - |
82 | | - |
83 | 120 | # normalise areas |
84 | 121 | cols <- colnames(dplyr::select_if(df.areas, is.numeric)) |
85 | | - df.areas.n <- data.frame(ids=df.areas$db_id, |
86 | | - scale(df.areas[,cols],center = F, |
87 | | - scale = colSums(df.areas[,cols]) ) |
| 122 | + df.areas.n <- data.frame( |
| 123 | + ids = df.areas$db_id, |
| 124 | + scale(df.areas[, cols], center = F, scale = colSums(df.areas[, cols])) |
88 | 125 | ) |
89 | 126 |
|
90 | 127 | # find protein groups |
91 | | - protein.groups <- na.omit(unique(unlist(df.combi[,29]))) |
| 128 | + protein.groups <- na.omit(unique(unlist(df.combi[, 29]))) # 29 = genes |
92 | 129 | n.protein.groups <- length(protein.groups) |
93 | 130 |
|
94 | 131 | # group peptides by protein group |
95 | | - cores=detectCores() |
96 | | - cl <- makeCluster(cores[1]-1) #not to overload your computer |
| 132 | + cores = detectCores() |
| 133 | + cl <- makeCluster(cores[1] - 1) # not to overload your computer |
97 | 134 | registerDoParallel(cl) |
98 | 135 | t1 <- Sys.time() |
99 | | - df <- foreach(p = protein.groups, .combine='rbind' )%dopar%{ |
100 | | - |
101 | | - dfx <- df.combi[df.combi[,29] ==p,] |
102 | | - |
103 | | - ids <- na.omit(dfx$db_id) |
104 | | - best.mascot.score <- max(dfx$max_scr,na.rm = T) |
105 | | - protein.name <- dfx$protein[1] |
106 | | - acc <- na.omit(dfx$acc_no)[1] |
107 | | - uniprot.id <- na.omit(dfx[1,30])[1] |
108 | | - n.peptides <- length(ids) |
109 | | - nPSMs <- na.omit(dfx[,"N_peptides"]) |
110 | | - c(protein.group=p, |
111 | | - apply(df.areas.n[df.areas.n$ids %in% ids,cols],2,sum), |
112 | | - best.mascot.score=best.mascot.score, |
113 | | - n.peptides=n.peptides, |
114 | | - n.psm=sum(nPSMs), |
115 | | - acc=acc, |
116 | | - uniprot.id=uniprot.id, |
117 | | - protein.name=protein.name) |
118 | | - |
119 | | - } |
| 136 | + df <- foreach(p = protein.groups, .combine = "rbind") %dopar% |
| 137 | + { |
| 138 | + dfx <- df.combi[df.combi[, 29] == p, ] |
| 139 | + ids <- na.omit(dfx$db_id) |
| 140 | + best.mascot.score <- max(dfx$max_scr, na.rm = T) |
| 141 | + protein.name <- dfx$protein[1] |
| 142 | + acc <- na.omit(dfx$acc_no)[1] |
| 143 | + uniprot.id <- na.omit(dfx[1, 30])[1] |
| 144 | + n.peptides <- length(ids) |
| 145 | + nPSMs <- na.omit(dfx[, "N_peptides"]) |
| 146 | + c( |
| 147 | + protein.group = p, |
| 148 | + apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum), |
| 149 | + best.mascot.score = best.mascot.score, |
| 150 | + n.peptides = n.peptides, |
| 151 | + n.psm = sum(nPSMs), |
| 152 | + acc = acc, |
| 153 | + uniprot.id = uniprot.id, |
| 154 | + protein.name = protein.name |
| 155 | + ) |
| 156 | + } |
120 | 157 | stopCluster(cl) |
121 | 158 |
|
122 | | - write.csv(df,"temp.csv") |
| 159 | + write.csv(df, "temp.csv") |
123 | 160 | x <- read.csv("temp.csv") |
124 | | - x[x==0] <- NA |
125 | | - df.norm <- data.frame(protein.group=x$protein.group, x[,cols]*1000000) |
126 | | - |
| 161 | + x[x == 0] <- NA |
| 162 | + df.norm <- data.frame(protein.group = x$protein.group, x[, cols] * 1e+06) |
127 | 163 |
|
128 | | - df.norm.log2.centered <- data.frame(protein.group=protein.groups,scale(log2(df.norm[,cols]),scale = F)) |
129 | | - df.norm.log2.centered.scaled <- data.frame(protein.group=protein.groups,scale(log2(df.norm[,cols]))) |
| 164 | + df.norm.log2.centered <- data.frame( |
| 165 | + protein.group = protein.groups, |
| 166 | + scale(log2(df.norm[, cols]), scale = F) |
| 167 | + ) |
130 | 168 |
|
| 169 | + df.norm.log2.centered.scaled <- data.frame( |
| 170 | + protein.group = protein.groups, |
| 171 | + scale(log2(df.norm[, cols])) |
| 172 | + ) |
131 | 173 |
|
132 | 174 | df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled |
133 | | - df.norm.log2.centered.scaled.na.imputed[is.na(df.norm.log2.centered.scaled.na.imputed)] <- min(df.norm.log2.centered.scaled.na.imputed[,cols], na.rm = T)/5 |
134 | 175 |
|
| 176 | + # df.norm.log2.centered.scaled.na.imputed[ # Previous na imputation |
| 177 | + # is.na(df.norm.log2.centered.scaled.na.imputed) |
| 178 | + # ] <- min(df.norm.log2.centered.scaled.na.imputed[, cols], na.rm = T) / 5 |
| 179 | + |
| 180 | + df.norm.log2.centered.scaled.na.imputed[cols] <- lapply( # New na imputation |
| 181 | + df.norm.log2.centered.scaled.na.imputed[cols], function(x){ |
| 182 | + replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Correct NA imputation |
| 183 | + } |
| 184 | + ) |
135 | 185 |
|
136 | 186 | rownames(df.norm.log2.centered) <- df.norm.log2.centered$protein.group |
137 | 187 | rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$protein.group |
138 | 188 | rownames(df.norm) <- df.norm.log2.centered$protein.group |
139 | 189 | rownames(df.norm.log2.centered.scaled.na.imputed) <- df.norm.log2.centered.scaled.na.imputed$protein.group |
140 | 190 |
|
141 | | - |
142 | | - xx <- x[x$best.mascot.score>mascot.score.cut.off & x$n.peptides>n.peptide.cut.off,] |
| 191 | + xx <- x[ |
| 192 | + x$best.mascot.score > mascot.score.cut.off & |
| 193 | + x$n.peptides > n.peptide.cut.off, ] |
143 | 194 |
|
144 | 195 | selected.prot.groups <- xx$protein.group |
145 | 196 |
|
146 | | - cc <- c('protein.group', 'best.mascot.score','n.peptides',"n.psm" ,'acc','uniprot.id','protein.name') |
147 | | - |
148 | | - df.norm <- merge.data.frame(df.norm,x[,cc],by="protein.group") |
149 | | - df.norm.log2.centered <- merge.data.frame(df.norm.log2.centered,x[,cc],by="protein.group") |
150 | | - df.norm.log2.centered.scaled <- merge.data.frame(df.norm.log2.centered.scaled,x[,cc],by="protein.group") |
151 | | - df.norm.log2.centered.scaled.na.imputed <- merge.data.frame(df.norm.log2.centered.scaled.na.imputed,x[,cc],by="protein.group") |
152 | | - |
153 | | - |
| 197 | + cc <- c( |
| 198 | + "protein.group", "best.mascot.score", "n.peptides", |
| 199 | + "n.psm", "acc", "uniprot.id", "protein.name" |
| 200 | + ) |
154 | 201 |
|
155 | | - return(list(normalized.data=df.norm[df.norm$protein.group %in% selected.prot.groups,], |
156 | | - normalized.plus.log2.cent.data=df.norm.log2.centered[df.norm.log2.centered$protein.group %in% selected.prot.groups,], |
157 | | - normalized.plus.log2.cent.scaled.data=df.norm.log2.centered.scaled[df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups,], |
158 | | - df.norm.log2.centered.scaled.na.imputed=df.norm.log2.centered.scaled.na.imputed[df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups,])) |
| 202 | + df.norm <- merge.data.frame(df.norm, x[, cc], by = "protein.group") |
| 203 | + df.norm.log2.centered <- merge.data.frame( |
| 204 | + df.norm.log2.centered, |
| 205 | + x[, cc], |
| 206 | + by = "protein.group" |
| 207 | + ) |
| 208 | + df.norm.log2.centered.scaled <- merge.data.frame( |
| 209 | + df.norm.log2.centered.scaled, |
| 210 | + x[, cc], |
| 211 | + by = "protein.group" |
| 212 | + ) |
| 213 | + df.norm.log2.centered.scaled.na.imputed <- merge.data.frame( |
| 214 | + df.norm.log2.centered.scaled.na.imputed, |
| 215 | + x[, cc], |
| 216 | + by = "protein.group" |
| 217 | + ) |
159 | 218 |
|
| 219 | + return(list( |
| 220 | + normalized.data = df.norm[df.norm$protein.group %in% selected.prot.groups, ], |
| 221 | + normalized.plus.log2.cent.data = df.norm.log2.centered[ |
| 222 | + df.norm.log2.centered$protein.group %in% selected.prot.groups, ], |
| 223 | + normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled[ |
| 224 | + df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups, ], |
| 225 | + df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed[ |
| 226 | + df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups, ] |
| 227 | + )) |
160 | 228 | } |
161 | 229 |
|
162 | 230 |
|
|
0 commit comments