Skip to content

Commit ea8a527

Browse files
authored
Merge pull request #9 from iibadshah/main
protools2 v0.2.14: Fix impute_na() so the global minimum is computed once, outside the loop. Add impute_na_knn(), an alternative NA imputation for the normalisation functions based on VIM::kNN.
2 parents 0e59b90 + 689c593 commit ea8a527

3 files changed

Lines changed: 108 additions & 35 deletions

File tree

DESCRIPTION

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: protools2
22
Type: Package
33
Title: A Set Of Tools For Proteomics And Phosphoproteomics Data Analysis
4-
Version: 0.2.13
4+
Version: 0.2.14
55
Date: 2025-04-15
66
Authors@R: c(
77
person("Pedro", "Cutillas", , "p.cutillas@qmul.ac.uk", role = "aut",
@@ -35,6 +35,7 @@ Imports:
3535
plotly,
3636
heatmaply,
3737
umap,
38+
VIM,
3839
openxlsx,
3940
readxl,
4041
tools,

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import(viridisLite)
2525
import(plotly)
2626
import(heatmaply)
2727
import(umap)
28+
import(VIM)
2829
import(openxlsx)
2930
import(readxl)
3031
import(tools)

R/script_helpers.R

Lines changed: 105 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -220,11 +220,16 @@ swap_dash_for_dot_in_col <- function(df, column_name) {
220220

221221
# Impute missing values ----
222222
# NOTE: Function uses vectorised operations for speed, loops only on columns not rows.
223-
#
224223
impute_na <- function(df.design, df.areas) {
225224
# Get unique conditions
226225
conditions <- unique(df.design$condition)
227226

227+
# Select only numeric columns
228+
numeric_columns <- df.areas[, sapply(df.areas, is.numeric)]
229+
230+
# Apply the min function to the dataset's numeric columns
231+
min_value <- min(numeric_columns, na.rm = TRUE)
232+
228233
# Loop over each unique condition
229234
for (condition in conditions) {
230235
# Get the columns in df.areas that belong to the current condition
@@ -242,12 +247,6 @@ impute_na <- function(df.design, df.areas) {
242247
all_na <- is.na(row_means)
243248
# df.areas[all_na, heading] <- min(df.areas[, heading], na.rm = TRUE) - 1 # min of column (or min - 1)
244249

245-
# Select only numeric columns
246-
numeric_columns <- df.areas[, sapply(df.areas, is.numeric)]
247-
248-
# Apply the min function to the dataset's numeric columns
249-
min_value <- min(numeric_columns, na.rm = TRUE)
250-
251250
# Assign the minimum value minus 1 to the specified location
252251
df.areas[all_na, heading] <- min_value - 1
253252

@@ -258,13 +257,37 @@ impute_na <- function(df.design, df.areas) {
258257
}
259258

260259

260+
# Impute missing values with k-nearest neighbours ----
# Alternative to `impute_na()`: for every condition listed in `df.design`, the
# columns of `df.areas` that belong to that condition are imputed together
# with `VIM::kNN()`.
#
# Args:
#   df.design: data frame with a `condition` column and a `heading` column;
#     `heading` holds the names of the `df.areas` columns of each condition.
#   df.areas: data frame containing the columns named in `df.design$heading`.
#   k: number of nearest neighbours passed to `VIM::kNN()` (default 5).
#
# Returns:
#   `df.areas` with the columns of each condition replaced by their imputed
#   versions; columns not named in `df.design$heading` are left untouched.
impute_na_knn <- function(df.design, df.areas, k = 5) {
  # Get unique conditions
  conditions <- unique(df.design$condition)

  # Loop over each unique condition
  for (condition in conditions) {
    # Get the columns in df.areas that belong to the current condition.
    # as.character() makes sure a factor `heading` selects columns by name
    # and not by its underlying integer codes.
    cols <- as.character(df.design$heading[df.design$condition == condition])

    # Subset the data for the current condition.
    # drop = FALSE keeps a data frame even when the condition has a single
    # column; without it the subset collapses to a bare vector, which
    # VIM::kNN() cannot handle.
    df.condition <- df.areas[, cols, drop = FALSE]

    # Apply KNN imputation (imp_var = FALSE: no extra "_imp" indicator columns)
    df.condition.imputed <- VIM::kNN(df.condition, k = k, imp_var = FALSE)

    # Replace the original columns with the imputed columns
    df.areas[, cols] <- df.condition.imputed
  }

  df.areas
}
281+
282+
261283
# Edited `protools2::normalize_areas_return_ppindex()` ----
262284
# Phosphoproteomics.Rmd:
263285
# The original function mistakenly referred to an object in the global environment
264286
# instead of its own parameter, so `df.combi` was read from global scope rather than from the argument.
265287
normalize_areas_return_ppindex_edit <- function(
266288
pescal_output_file,
267-
delta_score_cut_off = 0 # if (fragpipe) {0} else {5}
289+
delta_score_cut_off = 0, # if (fragpipe) {0} else {5}
290+
k_NN = NULL
268291
) {
269292
# Set delta_score_cut_off to low (say 1) for proteomics data,
270293
# High (say 15) for phosphoproteomics data
@@ -305,10 +328,18 @@ normalize_areas_return_ppindex_edit <- function(
305328
df.norm[df.norm == 0] <- NA
306329

307330
# Normalised NA imputation (no log2, no centre, no scale) ----
308-
df.norm.na.imputed <- impute_na(
309-
df.design = df.design,
310-
df.areas = df.norm
311-
)
331+
if (is.null(k_NN)) {
332+
df.norm.na.imputed <- impute_na(
333+
df.design = df.design,
334+
df.areas = df.norm
335+
)
336+
} else {
337+
df.norm.na.imputed <- impute_na_knn(
338+
df.design = df.design,
339+
df.areas = df.norm,
340+
k = k_NN
341+
)
342+
}
312343

313344
# Normalised NA imputation (no log2, no scale, no centre) ----
314345
df.norm.na.imputed.no.log2.no.scale.no.centre <- df.norm
@@ -352,10 +383,18 @@ normalize_areas_return_ppindex_edit <- function(
352383
)
353384

354385
# New improved NA imputation (log2, centred, scaled) ----
355-
df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
356-
df.design = df.design,
357-
df.areas = df.norm.log2.centered.scaled
358-
)
386+
if (is.null(k_NN)) {
387+
df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
388+
df.design = df.design,
389+
df.areas = df.norm.log2.centered.scaled
390+
)
391+
} else {
392+
df.norm.log2.centered.scaled.na.imputed.new <- impute_na_knn(
393+
df.design = df.design,
394+
df.areas = df.norm.log2.centered.scaled,
395+
k = k_NN
396+
)
397+
}
359398

360399
# Centred ----
361400
df.norm.log2.centered.na.imputed <- df.norm.log2.centered
@@ -367,10 +406,18 @@ normalize_areas_return_ppindex_edit <- function(
367406
)
368407

369408
# New improved NA imputation (log2, centred) ----
370-
df.norm.log2.centered.na.imputed.new <- impute_na(
371-
df.design = df.design,
372-
df.areas = df.norm.log2.centered
373-
)
409+
if (is.null(k_NN)) {
410+
df.norm.log2.centered.na.imputed.new <- impute_na(
411+
df.design = df.design,
412+
df.areas = df.norm.log2.centered
413+
)
414+
} else {
415+
df.norm.log2.centered.na.imputed.new <- impute_na_knn(
416+
df.design = df.design,
417+
df.areas = df.norm.log2.centered,
418+
k = k_NN
419+
)
420+
}
374421

375422
# Previous NA imputation
376423
# df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
@@ -406,7 +453,8 @@ normalize_areas_return_ppindex <- normalize_areas_return_ppindex_edit
406453
normalize_areas_return_protein_groups_edit <- function(
407454
pescal_output_file,
408455
mascot.score.cut.off = 50, # if (fragpipe) {0} else {40}
409-
n.peptide.cut.off = 1 # if (fragpipe) {0} else {1}
456+
n.peptide.cut.off = 1, # if (fragpipe) {0} else {1}
457+
k_NN = NULL
410458
) {
411459
suppressMessages(
412460
df.areas <- data.frame(readxl::read_excel(pescal_output_file, "output_areas"))
@@ -463,10 +511,18 @@ normalize_areas_return_protein_groups_edit <- function(
463511
df.norm <- data.frame(protein.group = x$protein.group, x[, cols] * 1e+06)
464512

465513
# New normalised NA imputation (no log2, no centre, no scale) ----
466-
df.norm.na.imputed <- impute_na(
467-
df.design = df.design,
468-
df.areas = df.norm
469-
)
514+
if (is.null(k_NN)) {
515+
df.norm.na.imputed <- impute_na(
516+
df.design = df.design,
517+
df.areas = df.norm
518+
)
519+
} else {
520+
df.norm.na.imputed <- impute_na_knn(
521+
df.design = df.design,
522+
df.areas = df.norm,
523+
k = k_NN
524+
)
525+
}
470526

471527
# Normalised NA imputation (no log2, no scale, no centre) ----
472528
df.norm.na.imputed.no.log2.no.scale.no.centre <- df.norm
@@ -489,10 +545,18 @@ normalize_areas_return_protein_groups_edit <- function(
489545

490546
# Centred
491547
# New improved NA imputation (log2, centred) ----
492-
df.norm.log2.centered.na.imputed.new <- impute_na(
493-
df.design = df.design,
494-
df.areas = df.norm.log2.centered
495-
)
548+
if (is.null(k_NN)) {
549+
df.norm.log2.centered.na.imputed.new <- impute_na(
550+
df.design = df.design,
551+
df.areas = df.norm.log2.centered
552+
)
553+
} else {
554+
df.norm.log2.centered.na.imputed.new <- impute_na_knn(
555+
df.design = df.design,
556+
df.areas = df.norm.log2.centered,
557+
k = k_NN
558+
)
559+
}
496560

497561
# Centred + scaled
498562
df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
@@ -509,10 +573,18 @@ normalize_areas_return_protein_groups_edit <- function(
509573
)
510574

511575
# New improved NA imputation (log2, centred, scaled) ----
512-
df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
513-
df.design = df.design,
514-
df.areas = df.norm.log2.centered.scaled
515-
)
576+
if (is.null(k_NN)) {
577+
df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
578+
df.design = df.design,
579+
df.areas = df.norm.log2.centered.scaled
580+
)
581+
} else {
582+
df.norm.log2.centered.scaled.na.imputed.new <- impute_na_knn(
583+
df.design = df.design,
584+
df.areas = df.norm.log2.centered.scaled,
585+
k = k_NN
586+
)
587+
}
516588

517589
rownames(df.norm) <- df.norm.log2.centered$protein.group
518590
rownames(df.norm.na.imputed) <- df.norm.log2.centered$protein.group
@@ -1273,7 +1345,6 @@ pathway_enrichment_edit <- function(
12731345
registerDoParallel(cl)
12741346
t1 <- Sys.time()
12751347
enrich.combined <- foreach(db = prot_dbs, .combine = rbind, .packages = "protools2") %dopar% {
1276-
# browser() # DEBUG
12771348
e <- protools2::enrichment.from.list(
12781349
list.of.peptides = c(increased.peptides, decreased.peptides),
12791350
background.list,

0 commit comments

Comments
 (0)