@@ -220,11 +220,16 @@ swap_dash_for_dot_in_col <- function(df, column_name) {
220220
221221# Impute missing values ----
222222# NOTE: Function uses vectorised operations for speed, loops only on columns not rows.
223- #
224223impute_na <- function (df.design , df.areas ) {
225224 # Get unique conditions
226225 conditions <- unique(df.design $ condition )
227226
227+ # Select only numeric columns
228+ numeric_columns <- df.areas [, sapply(df.areas , is.numeric )]
229+
230+ # Apply the min function to the dataset's numeric columns
231+ min_value <- min(numeric_columns , na.rm = TRUE )
232+
228233 # Loop over each unique condition
229234 for (condition in conditions ) {
230235 # Get the columns in df.areas that belong to the current condition
@@ -242,12 +247,6 @@ impute_na <- function(df.design, df.areas) {
242247 all_na <- is.na(row_means )
243248 # df.areas[all_na, heading] <- min(df.areas[, heading], na.rm = TRUE) - 1 # min of column (or min - 1)
244249
245- # Select only numeric columns
246- numeric_columns <- df.areas [, sapply(df.areas , is.numeric )]
247-
248- # Apply the min function to the dataset's numeric columns
249- min_value <- min(numeric_columns , na.rm = TRUE )
250-
251250 # Assign the minimum value minus 1 to the specified location
252251 df.areas [all_na , heading ] <- min_value - 1
253252
@@ -258,13 +257,37 @@ impute_na <- function(df.design, df.areas) {
258257}
259258
260259
#' Impute missing values per condition using k-nearest neighbours
#'
#' For every unique value of `df.design$condition`, the columns of `df.areas`
#' named by the matching `df.design$heading` entries are passed to
#' `VIM::kNN()` and the imputed columns are written back into `df.areas`.
#' Columns of `df.areas` that are not listed for any condition are left
#' untouched.
#'
#' @param df.design Data frame with (at least) the columns `heading` (column
#'   names of `df.areas`) and `condition` (group label for each heading).
#' @param df.areas Data frame holding the values to impute.
#' @param k Number of neighbours forwarded to `VIM::kNN()`.
#' @return `df.areas` with the per-condition columns replaced by their
#'   imputed versions.
impute_na_knn <- function(df.design, df.areas, k = 5) {
  # Get unique conditions
  conditions <- unique(df.design$condition)

  # Loop over each unique condition
  for (condition in conditions) {
    # Columns in df.areas that belong to the current condition.
    # which() discards NA comparisons so an NA condition cannot inject NA into
    # the column selector; as.character() makes factor headings select columns
    # by name instead of by their integer codes.
    in_condition <- which(df.design$condition == condition)
    cols <- as.character(df.design$heading[in_condition])

    # Nothing mapped to this condition (e.g. the condition itself is NA)
    if (length(cols) == 0) {
      next
    }

    # drop = FALSE keeps a data frame even when the condition has one column,
    # so VIM::kNN() is always handed a data frame rather than a bare vector
    df.condition <- df.areas[, cols, drop = FALSE]

    # Skip the kNN call entirely when there is nothing to impute
    if (!any(is.na(df.condition))) {
      next
    }

    # Checked lazily so data without missing values does not require VIM
    if (!requireNamespace("VIM", quietly = TRUE)) {
      stop(
        "Package 'VIM' is required for kNN imputation; please install it.",
        call. = FALSE
      )
    }

    # Apply KNN imputation (imp_var = FALSE: no extra indicator columns)
    df.condition.imputed <- VIM::kNN(df.condition, k = k, imp_var = FALSE)

    # Replace the original columns with the imputed columns
    df.areas[, cols] <- df.condition.imputed
  }

  return(df.areas)
}
281+
282+
261283# Edited `protools2::normalize_areas_return_ppindex()` ----
262284# Phosphoproteomics.Rmd:
263285# The original function mistakenly referenced an object in the global environment
264286# instead of its own parameter, so `df.combi` was read from global scope rather than from the argument.
265287normalize_areas_return_ppindex_edit <- function (
266288 pescal_output_file ,
267- delta_score_cut_off = 0 # if (fragpipe) {0} else {5}
289+ delta_score_cut_off = 0 , # if (fragpipe) {0} else {5}
290+ k_NN = NULL
268291) {
269292 # Set delta_score_cut_off to low (say 1) for proteomics data,
270293 # High (say 15) for phosphoproteomics data
@@ -305,10 +328,18 @@ normalize_areas_return_ppindex_edit <- function(
305328 df.norm [df.norm == 0 ] <- NA
306329
307330 # Normalised NA imputation (no log2, no centre, no scale) ----
308- df.norm.na.imputed <- impute_na(
309- df.design = df.design ,
310- df.areas = df.norm
311- )
331+ if (is.null(k_NN )) {
332+ df.norm.na.imputed <- impute_na(
333+ df.design = df.design ,
334+ df.areas = df.norm
335+ )
336+ } else {
337+ df.norm.na.imputed <- impute_na_knn(
338+ df.design = df.design ,
339+ df.areas = df.norm ,
340+ k = k_NN
341+ )
342+ }
312343
313344 # Normalised NA imputation (no log2, no scale, no centre) ----
314345 df.norm.na.imputed.no.log2.no.scale.no.centre <- df.norm
@@ -352,10 +383,18 @@ normalize_areas_return_ppindex_edit <- function(
352383 )
353384
354385 # New improved NA imputation (log2, centred, scaled) ----
355- df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
356- df.design = df.design ,
357- df.areas = df.norm.log2.centered.scaled
358- )
386+ if (is.null(k_NN )) {
387+ df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
388+ df.design = df.design ,
389+ df.areas = df.norm.log2.centered.scaled
390+ )
391+ } else {
392+ df.norm.log2.centered.scaled.na.imputed.new <- impute_na_knn(
393+ df.design = df.design ,
394+ df.areas = df.norm.log2.centered.scaled ,
395+ k = k_NN
396+ )
397+ }
359398
360399 # Centred ----
361400 df.norm.log2.centered.na.imputed <- df.norm.log2.centered
@@ -367,10 +406,18 @@ normalize_areas_return_ppindex_edit <- function(
367406 )
368407
369408 # New improved NA imputation (log2, centred) ----
370- df.norm.log2.centered.na.imputed.new <- impute_na(
371- df.design = df.design ,
372- df.areas = df.norm.log2.centered
373- )
409+ if (is.null(k_NN )) {
410+ df.norm.log2.centered.na.imputed.new <- impute_na(
411+ df.design = df.design ,
412+ df.areas = df.norm.log2.centered
413+ )
414+ } else {
415+ df.norm.log2.centered.na.imputed.new <- impute_na_knn(
416+ df.design = df.design ,
417+ df.areas = df.norm.log2.centered ,
418+ k = k_NN
419+ )
420+ }
374421
375422 # Previous NA imputation
376423 # df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
@@ -406,7 +453,8 @@ normalize_areas_return_ppindex <- normalize_areas_return_ppindex_edit
406453normalize_areas_return_protein_groups_edit <- function (
407454 pescal_output_file ,
408455 mascot.score.cut.off = 50 , # if (fragpipe) {0} else {40}
409- n.peptide.cut.off = 1 # if (fragpipe) {0} else {1}
456+ n.peptide.cut.off = 1 , # if (fragpipe) {0} else {1}
457+ k_NN = NULL
410458) {
411459 suppressMessages(
412460 df.areas <- data.frame (readxl :: read_excel(pescal_output_file , " output_areas" ))
@@ -463,10 +511,18 @@ normalize_areas_return_protein_groups_edit <- function(
463511 df.norm <- data.frame (protein.group = x $ protein.group , x [, cols ] * 1e+06 )
464512
465513 # New normalised NA imputation (no log2, no centre, no scale) ----
466- df.norm.na.imputed <- impute_na(
467- df.design = df.design ,
468- df.areas = df.norm
469- )
514+ if (is.null(k_NN )) {
515+ df.norm.na.imputed <- impute_na(
516+ df.design = df.design ,
517+ df.areas = df.norm
518+ )
519+ } else {
520+ df.norm.na.imputed <- impute_na_knn(
521+ df.design = df.design ,
522+ df.areas = df.norm ,
523+ k = k_NN
524+ )
525+ }
470526
471527 # Normalised NA imputation (no log2, no scale, no centre) ----
472528 df.norm.na.imputed.no.log2.no.scale.no.centre <- df.norm
@@ -489,10 +545,18 @@ normalize_areas_return_protein_groups_edit <- function(
489545
490546 # Centred
491547 # New improved NA imputation (log2, centred) ----
492- df.norm.log2.centered.na.imputed.new <- impute_na(
493- df.design = df.design ,
494- df.areas = df.norm.log2.centered
495- )
548+ if (is.null(k_NN )) {
549+ df.norm.log2.centered.na.imputed.new <- impute_na(
550+ df.design = df.design ,
551+ df.areas = df.norm.log2.centered
552+ )
553+ } else {
554+ df.norm.log2.centered.na.imputed.new <- impute_na_knn(
555+ df.design = df.design ,
556+ df.areas = df.norm.log2.centered ,
557+ k = k_NN
558+ )
559+ }
496560
497561 # Centred + scaled
498562 df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
@@ -509,10 +573,18 @@ normalize_areas_return_protein_groups_edit <- function(
509573 )
510574
511575 # New improved NA imputation (log2, centred, scaled) ----
512- df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
513- df.design = df.design ,
514- df.areas = df.norm.log2.centered.scaled
515- )
576+ if (is.null(k_NN )) {
577+ df.norm.log2.centered.scaled.na.imputed.new <- impute_na(
578+ df.design = df.design ,
579+ df.areas = df.norm.log2.centered.scaled
580+ )
581+ } else {
582+ df.norm.log2.centered.scaled.na.imputed.new <- impute_na_knn(
583+ df.design = df.design ,
584+ df.areas = df.norm.log2.centered.scaled ,
585+ k = k_NN
586+ )
587+ }
516588
517589 rownames(df.norm ) <- df.norm.log2.centered $ protein.group
518590 rownames(df.norm.na.imputed ) <- df.norm.log2.centered $ protein.group
@@ -1273,7 +1345,6 @@ pathway_enrichment_edit <- function(
12731345 registerDoParallel(cl )
12741346 t1 <- Sys.time()
12751347 enrich.combined <- foreach(db = prot_dbs , .combine = rbind , .packages = " protools2" ) %dopar % {
1276- # browser() # DEBUG
12771348 e <- protools2 :: enrichment.from.list(
12781349 list.of.peptides = c(increased.peptides , decreased.peptides ),
12791350 background.list ,
0 commit comments