CutillasLab
diff --git a/DESCRIPTION b/DESCRIPTION
Lines changed: 4 additions & 3 deletions
diff --git a/R/normalize functions.R b/R/normalize functions.R
Lines changed: 159 additions & 91 deletions
@@ -1,8 +1,8 @@
 Package: protools2
 Type: Package
 Title: A Set Of Tools For Proteomics And Phosphoproteomics Data Analysis
-Version: 0.2.9
-Date: 2023-06-22
+Version: 0.2.10
+Date: 2024-04-24
 Author: Pedro Rodriguez Cutillas, Irbaz I. Badshah
 Maintainer: Pedro Rodriguez Cutillas <p.cutillas@qmul.ac.uk>
 Description: Reads and normalizes data from Pescal. Does statistical analysis using limma or t-test. It also incorporates functions for KSEA, pathway analysis and network visualization tools.
@@ -47,5 +47,6 @@ Imports:
     parallel,
     foreach,
     doParallel,
-    limma
+    limma,
+    ComplexHeatmap
 RoxygenNote: 7.2.3
@@ -2,161 +2,229 @@
 # Normalise output_areas
 # Author & copyright: Pedro R. Cutillas
 
-normalize_areas_return_ppindex <- function(pescal_output_file, delta_score_cut_off=1){
+normalize_areas_return_ppindex <- function(pescal_output_file, delta_score_cut_off=1) {
 
   # set delta_score_cut_off to low (say 1) for proteomics data,
   # high (say 15) for phosphoproteomics data
 
-
   library(foreach)
   library(doParallel)
 
   suppressMessages(
-    df.areas <- readxl::read_excel(pescal_output_file,"output_areas")
+    df.areas <- readxl::read_excel(pescal_output_file, "output_areas")
   )
-  colnames(df.areas) <- gsub("-",".",colnames(df.areas),fixed = T)
+  colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T)
   suppressMessages(
-    df.combi <- readxl::read_excel(pescal.output.file,"combiPeptData")
+    df.combi <- readxl::read_excel(pescal_output_file, "combiPeptData")  # Fix: the original read `pescal.output.file`, which is not an argument of this function and was only resolved from the global scope
   )
-  # select peptides above the delta_score_cut_off
-  df.combi <- subset(df.combi,df.combi$max_delta_score>delta_score_cut_off)
-
-  peptides <- unique(unlist(df.combi[,25]))
-
-
-  df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id,]
 
+  # select peptides above the delta_score_cut_off
+  df.combi <- subset(df.combi, df.combi$max_delta_score > delta_score_cut_off)
+  peptides <- unique(unlist(df.combi[, 25]))  # 'sites'
+  df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id, ]
   cols <- colnames(dplyr::select_if(df.areas, is.numeric))
-  df.areas.n <- data.frame(ids=df.areas$db_id,
-                           scale(df.areas[,cols],center = F,
-                                 scale =  colSums(df.areas[,cols]) )
+  df.areas.n <- data.frame(
+    ids = df.areas$db_id,
+    scale(df.areas[, cols], center = F, scale = colSums(df.areas[, cols]))
   )
 
-  #plot(unlist(df.areas[1,cols]), unlist(df.areas.n[1,cols]))
-  #p <- peptides[3]
-  cores=detectCores()
-  cl <- makeCluster(cores[1]-1) #not to overload your computer
+  cores = detectCores()
+  cl <- makeCluster(cores[1] - 1)  # not to overload your computer
   registerDoParallel(cl)
   t1 <- Sys.time()
-  df <- foreach(p = peptides, .combine='rbind' )%dopar%{
-    ids <- na.omit(df.combi[df.combi[,25] ==p,]$db_id)
-    apply(df.areas.n[df.areas.n$ids %in% ids,cols],2,sum)
-
-  }
+  df <- foreach(p = peptides, .combine = "rbind") %dopar%
+    {
+      ids <- na.omit(df.combi[df.combi[, 25] == p, ]$db_id)
+      apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum)
+    }
   stopCluster(cl)
-
-  df.norm <- data.frame(sites=peptides,df*1000000)
+  df.norm <- data.frame(sites = peptides, df * 1e+06)
   rownames(df.norm) <- df.norm$sites
-  df.norm[df.norm==0] <- NA
-  df.norm.log2.centered <- data.frame(sites=peptides,scale(log2(df.norm[,cols]),scale = F))
-  df.norm.log2.centered.scaled <- data.frame(sites=peptides,scale(log2(df.norm[,cols])))
+  df.norm[df.norm == 0] <- NA
+
+  df.norm.log2.centered <- data.frame(
+    sites = peptides,
+    scale(log2(df.norm[, cols]), scale = F)
+  )
 
+  df.norm.log2.centered.scaled <- data.frame(
+    sites = peptides,
+    scale(log2(df.norm[, cols]))
+  )
 
+  # Alternative na imputation
+  # Centred & scaled
   df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
-  df.norm.log2.centered.scaled.na.imputed[is.na(df.norm.log2.centered.scaled.na.imputed)] <- min(df.norm.log2.centered.scaled.na.imputed[,cols], na.rm = T)/5
+  df.norm.log2.centered.scaled.na.imputed1 <- df.norm.log2.centered.scaled
+  df.norm.log2.centered.scaled.na.imputed2 <- df.norm.log2.centered.scaled
+
+  df.norm.log2.centered.scaled.na.imputed[cols] <- lapply(
+    df.norm.log2.centered.scaled.na.imputed[cols], function(x){
+      replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Impute NAs per column with that column's minimum minus 1
+    }
+  )
+
+  df.norm.log2.centered.scaled.na.imputed1[
+    is.na(df.norm.log2.centered.scaled.na.imputed1)
+  ] <- min(df.norm.log2.centered.scaled.na.imputed1[,cols], na.rm = T) / 5
+
+  df.norm.log2.centered.scaled.na.imputed2[cols] <- lapply(
+    df.norm.log2.centered.scaled.na.imputed2[cols], function(x){
+      replace(x, is.na(x), min(x, na.rm = TRUE)) # Impute NAs per column with that column's minimum
+    }
+  )
+
+  # Centred only (not scaled)
+  df.norm.log2.centered.na.imputed <- df.norm.log2.centered
+  df.norm.log2.centered.na.imputed[cols] <- lapply(
+    df.norm.log2.centered.na.imputed[cols], function(x){
+      replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Impute NAs per column with that column's minimum minus 1
+    }
+  )
 
+  # Previous na imputation
+  # df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
+  # df.norm.log2.centered.scaled.na.imputed[
+  #   is.na(df.norm.log2.centered.scaled.na.imputed)
+  # ] <- min(df.norm.log2.centered.scaled.na.imputed[, cols], na.rm = T) / 5
 
   rownames(df.norm.log2.centered) <- df.norm.log2.centered$sites
   rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$sites
-  return(list(normalized.data=df.norm,
-              normalized.plus.log2.cent.data=df.norm.log2.centered,
-              normalized.plus.log2.cent.scaled.data=df.norm.log2.centered.scaled,
-              df.norm.log2.centered.scaled.na.imputed=df.norm.log2.centered.scaled.na.imputed))
-}
 
+  return(list(
+    normalized.data = df.norm,
+    normalized.plus.log2.cent.data = df.norm.log2.centered,
+    normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled,
+    df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed,
+    df.norm.log2.centered.na.imputed=df.norm.log2.centered.na.imputed,
+    df.norm.log2.centered.scaled.na.imputed1=df.norm.log2.centered.scaled.na.imputed1,
+    df.norm.log2.centered.scaled.na.imputed2=df.norm.log2.centered.scaled.na.imputed2
+  ))
+}
 
 
-normalize_areas_return_protein_groups <- function(pescal_output_file,
-                                                  mascot.score.cut.off=50,
-                                                  n.peptide.cut.off=1){
 
+normalize_areas_return_protein_groups <- function(
+    pescal_output_file,
+    mascot.score.cut.off=50,
+    n.peptide.cut.off=1
+  ) {
   suppressMessages(
-    df.areas <- data.frame( readxl::read_excel(pescal_output_file,"output_areas"))
+    df.areas <- data.frame(readxl::read_excel(pescal_output_file, "output_areas"))
   )
-  colnames(df.areas) <- gsub("-",".",colnames(df.areas),fixed = T)
+  colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T)
   suppressMessages(
-    df.combi <- data.frame(readxl::read_excel(pescal.output.file,"combiPeptData"))
+    df.combi <- data.frame(readxl::read_excel(pescal_output_file, "combiPeptData"))  # Fix: the original read `pescal.output.file`, which is not an argument of this function and was only resolved from the global scope
   )
 
-
-
   # normalise areas
   cols <- colnames(dplyr::select_if(df.areas, is.numeric))
-  df.areas.n <- data.frame(ids=df.areas$db_id,
-                           scale(df.areas[,cols],center = F,
-                                 scale =  colSums(df.areas[,cols]) )
+  df.areas.n <- data.frame(
+    ids = df.areas$db_id,
+    scale(df.areas[, cols], center = F, scale = colSums(df.areas[, cols]))
   )
 
   # find protein groups
-  protein.groups <- na.omit(unique(unlist(df.combi[,29])))
+  protein.groups <- na.omit(unique(unlist(df.combi[, 29])))  # 29 = genes
   n.protein.groups <- length(protein.groups)
 
   # group peptides by protein group
-  cores=detectCores()
-  cl <- makeCluster(cores[1]-1) #not to overload your computer
+  cores = detectCores()
+  cl <- makeCluster(cores[1] - 1)  # not to overload your computer
   registerDoParallel(cl)
   t1 <- Sys.time()
-  df <- foreach(p = protein.groups, .combine='rbind' )%dopar%{
-
-    dfx <- df.combi[df.combi[,29] ==p,]
-
-    ids <- na.omit(dfx$db_id)
-    best.mascot.score <- max(dfx$max_scr,na.rm = T)
-    protein.name <- dfx$protein[1]
-    acc <- na.omit(dfx$acc_no)[1]
-    uniprot.id <- na.omit(dfx[1,30])[1]
-    n.peptides <- length(ids)
-    nPSMs <- na.omit(dfx[,"N_peptides"])
-    c(protein.group=p,
-      apply(df.areas.n[df.areas.n$ids %in% ids,cols],2,sum),
-      best.mascot.score=best.mascot.score,
-              n.peptides=n.peptides,
-              n.psm=sum(nPSMs),
-               acc=acc,
-              uniprot.id=uniprot.id,
-               protein.name=protein.name)
-
-  }
+  df <- foreach(p = protein.groups, .combine = "rbind") %dopar%
+    {
+      dfx <- df.combi[df.combi[, 29] == p, ]
+      ids <- na.omit(dfx$db_id)
+      best.mascot.score <- max(dfx$max_scr, na.rm = T)
+      protein.name <- dfx$protein[1]
+      acc <- na.omit(dfx$acc_no)[1]
+      uniprot.id <- na.omit(dfx[1, 30])[1]
+      n.peptides <- length(ids)
+      nPSMs <- na.omit(dfx[, "N_peptides"])
+      c(
+        protein.group = p,
+        apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum),
+        best.mascot.score = best.mascot.score,
+        n.peptides = n.peptides,
+        n.psm = sum(nPSMs),
+        acc = acc,
+        uniprot.id = uniprot.id,
+        protein.name = protein.name
+      )
+    }
   stopCluster(cl)
 
-  write.csv(df,"temp.csv")
+  write.csv(df, "temp.csv")
   x <- read.csv("temp.csv")
-  x[x==0] <- NA
-  df.norm <- data.frame(protein.group=x$protein.group, x[,cols]*1000000)
-
+  x[x == 0] <- NA
+  df.norm <- data.frame(protein.group = x$protein.group, x[, cols] * 1e+06)
 
-  df.norm.log2.centered <- data.frame(protein.group=protein.groups,scale(log2(df.norm[,cols]),scale = F))
-  df.norm.log2.centered.scaled <- data.frame(protein.group=protein.groups,scale(log2(df.norm[,cols])))
+  df.norm.log2.centered <- data.frame(
+    protein.group = protein.groups,
+    scale(log2(df.norm[, cols]), scale = F)
+  )
 
+  df.norm.log2.centered.scaled <- data.frame(
+    protein.group = protein.groups,
+    scale(log2(df.norm[, cols]))
+  )
 
   df.norm.log2.centered.scaled.na.imputed <- df.norm.log2.centered.scaled
-  df.norm.log2.centered.scaled.na.imputed[is.na(df.norm.log2.centered.scaled.na.imputed)] <- min(df.norm.log2.centered.scaled.na.imputed[,cols], na.rm = T)/5
 
+  # df.norm.log2.centered.scaled.na.imputed[  # Previous na imputation
+  #   is.na(df.norm.log2.centered.scaled.na.imputed)
+  # ] <- min(df.norm.log2.centered.scaled.na.imputed[, cols], na.rm = T) / 5
+
+  df.norm.log2.centered.scaled.na.imputed[cols] <- lapply(  # New na imputation
+    df.norm.log2.centered.scaled.na.imputed[cols], function(x){
+      replace(x, is.na(x), min(x, na.rm = TRUE) -1) # Impute NAs per column with that column's minimum minus 1
+    }
+  )
 
   rownames(df.norm.log2.centered) <- df.norm.log2.centered$protein.group
   rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$protein.group
   rownames(df.norm) <- df.norm.log2.centered$protein.group
   rownames(df.norm.log2.centered.scaled.na.imputed) <- df.norm.log2.centered.scaled.na.imputed$protein.group
 
-
-  xx <- x[x$best.mascot.score>mascot.score.cut.off & x$n.peptides>n.peptide.cut.off,]
+  xx <- x[
+    x$best.mascot.score > mascot.score.cut.off &
+      x$n.peptides > n.peptide.cut.off, ]
 
   selected.prot.groups <- xx$protein.group
 
-  cc <- c('protein.group', 'best.mascot.score','n.peptides',"n.psm" ,'acc','uniprot.id','protein.name')
-
-  df.norm <- merge.data.frame(df.norm,x[,cc],by="protein.group")
-  df.norm.log2.centered <- merge.data.frame(df.norm.log2.centered,x[,cc],by="protein.group")
-  df.norm.log2.centered.scaled <- merge.data.frame(df.norm.log2.centered.scaled,x[,cc],by="protein.group")
-  df.norm.log2.centered.scaled.na.imputed <- merge.data.frame(df.norm.log2.centered.scaled.na.imputed,x[,cc],by="protein.group")
-
-
+  cc <- c(
+    "protein.group", "best.mascot.score", "n.peptides",
+    "n.psm", "acc", "uniprot.id", "protein.name"
+  )
 
-  return(list(normalized.data=df.norm[df.norm$protein.group %in% selected.prot.groups,],
-              normalized.plus.log2.cent.data=df.norm.log2.centered[df.norm.log2.centered$protein.group %in% selected.prot.groups,],
-              normalized.plus.log2.cent.scaled.data=df.norm.log2.centered.scaled[df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups,],
-              df.norm.log2.centered.scaled.na.imputed=df.norm.log2.centered.scaled.na.imputed[df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups,]))
+  df.norm <- merge.data.frame(df.norm, x[, cc], by = "protein.group")
+  df.norm.log2.centered <- merge.data.frame(
+    df.norm.log2.centered,
+    x[, cc],
+    by = "protein.group"
+  )
+  df.norm.log2.centered.scaled <- merge.data.frame(
+    df.norm.log2.centered.scaled,
+    x[, cc],
+    by = "protein.group"
+  )
+  df.norm.log2.centered.scaled.na.imputed <- merge.data.frame(
+    df.norm.log2.centered.scaled.na.imputed,
+    x[, cc],
+    by = "protein.group"
+  )
 
+  return(list(
+    normalized.data = df.norm[df.norm$protein.group %in% selected.prot.groups, ],
+    normalized.plus.log2.cent.data = df.norm.log2.centered[
+      df.norm.log2.centered$protein.group %in% selected.prot.groups, ],
+    normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled[
+      df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups, ],
+    df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed[
+      df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups, ]
+  ))
 }