000a_ms_project_specific_formatting.R

xrydbh 2022-02-04

library(readxl)
library(tidyverse)
library(rlist)
#library(ggpubr)

# testing

# If script is running separately move up one step with setwd(".."). Only needed once (for first script run separately).


# specify file paths
input_path <- "./data/ms.xlsx"
missing_values_allowed_for_prot <- 0 #3 for with_at_least_5_vals (_in_high_or_low) or 
filter_missing_val_both_groups_or_either_or <- "either_or" #"either_or" # or "both"

# read ms file
mass_spec.0.1 <- readxl::read_excel(input_path ,
                                    sheet = "USE normalized abundances",
                                    col_names = TRUE,
                                    na = "NaN",
                                    skip = 6) 
# make syntactically valid names
names(mass_spec.0.1) <- 
  mass_spec.0.1 %>% 
  names() %>% 
  make.names()


# rename and reorder columns
mass_spec.0.2 <- mass_spec.0.1 %>% 
  dplyr::select(UniprotID="Accession",
                Gene.synonyms="Gene.Symbol",
                contains("SC.Donor"),
                Coverage="Coverage....",
                No.peptides="X..Peptides",
                PSMs="X..PSMs",
                Unique.Peptides="X..Unique.Peptides",
                MW.kDa="MW..kDa.") %>% 
  #separate_rows(Gene.symbol, sep = ";") %>% # put each gene symbol on its onw row
  mutate(Gene.synonyms=str_replace_all(Gene.synonyms, " ", "")) %>%  #remove spaces (replace spaces with nothing)
  mutate(Gene.symbol=word(Gene.synonyms, start = 1,sep = "\\;")) %>% #Create Gene.symbol from Gene.synonyms by extracting first word in sentence separated by semicolon
  relocate(Gene.symbol,.before = Gene.synonyms) # Put added Gene.symbol next to missing Gene.synonyms

# find ID with Gene.synonyms
symbols_with_synonyms <- mass_spec.0.2 %>% 
  pluck("Gene.synonyms") %>% 
  str_subset(";") %>% # Keep strings matching a ;
  word(start = 1,sep = "\\;")  #Create Gene.symbol from Gene.synonyms by extracting first word in sentence separated by semicolon

# See that Genes with multiple synonyms have been annotated in Gene.synonyms
mass_spec.0.2 %>% 
  select(UniprotID,
         Gene.symbol,
         Gene.synonyms) %>% 
  filter(Gene.symbol %in% symbols_with_synonyms)

## # A tibble: 49 × 3
##    UniprotID Gene.symbol Gene.synonyms                                          
##    <chr>     <chr>       <chr>                                                  
##  1 P0DMV8    HSPA1B      HSPA1B;HSPA1A                                          
##  2 O76011    KRT34       KRT34;LOC100653049                                     
##  3 O95810    SDPR        SDPR;CAVIN2                                            
##  4 O15117    FYB         FYB;FYB1                                               
##  5 P69905    HBA2        HBA2;HBA1                                              
##  6 P49908    SEPP1       SEPP1;SELENOP                                          
##  7 Q92520    FAM3C       FAM3C;WNT16                                            
##  8 P04745    AMY1A       AMY1A;AMY1C;AMY1B                                      
##  9 P0DPA2    VSIG8       VSIG8;C1orf204;SNHG28                                  
## 10 P62805    HIST1H4A    HIST1H4A;HIST1H4F;HIST1H4D;HIST1H4J;HIST2H4A;HIST2H4B;…
## # … with 39 more rows

##################################
# Check if Uniprot is unique
mass_spec.0.2$UniprotID[mass_spec.0.2$UniprotID %>% duplicated()]

## character(0)

##################################

# shorten column names
names(mass_spec.0.2) <- str_replace(names(mass_spec.0.2), ".SC.Donor","_")
names(mass_spec.0.2)

##  [1] "UniprotID"       "Gene.symbol"     "Gene.synonyms"   "High_1"         
##  [5] "High_2"          "High_3"          "High_4"          "High_5"         
##  [9] "High_6"          "High_7"          "High_8"          "Low_1"          
## [13] "Low_2"           "Low_3"           "Low_4"           "Low_5"          
## [17] "Low_6"           "Low_7"           "Low_8"           "Coverage"       
## [21] "No.peptides"     "PSMs"            "Unique.Peptides" "MW.kDa"

# make longer by collecting SC_donor
mass_spec.0.3 <- mass_spec.0.2 %>% 
  pivot_longer(cols=contains("_"),
               names_to=c("SC_Donor"),
               values_to = "Norm_abundance") %>% 
  mutate(Norm_abundance=as.numeric(Norm_abundance)) # make abundances numeric

#####################################
# Fill in missing gene symbols
#####################################
# Get vector of Uniprot Ids with no matching Gene Symbol
UpIDsWithNoGS.0.0 <- mass_spec.0.3$UniprotID[mass_spec.0.3$Gene.symbol %>% is.na()] %>% # Which Uniport has missing values?
  unique()

######## Get uniprot annotation
####
# Define function uniprot_mapping
uniprot_mapping <- function(ids) {
  uri <- 'http://www.uniprot.org/uniprot/?query='
  idStr <- paste(ids, collapse="+or+")
  format <- '&format=tab'
  fullUri <- paste0(uri,idStr,format)
  dat <- read.delim(fullUri)
  dat
}

# Full anno table
Uniprot_anno.0.0 <- uniprot_mapping(UpIDsWithNoGS.0.0) %>% 
  as_tibble() # get Uniprot annotation corresponding to UPIDs

# For some reason, output is has too many Uniprot IDs??#Restrict rows
Uniprot_anno.0.0.1 <- Uniprot_anno.0.0 %>%  
  filter(Entry %in% UpIDsWithNoGS.0.0) #%>% 
#filter(Entry != "P0DOX7") # P0DOX7 Has no annotation

# Restricted anno table (to two cols)
Uniprot_anno.0.1 <- Uniprot_anno.0.0.1 %>% 
  dplyr::select(UniprotID=Entry,
                Gene.synonyms.add=Gene.names) %>% # restrict annotation to only UPIDs and GS
  #separate_rows(Gene.symbol, sep = " ") # put each GS on its own row
  mutate(Gene.synonyms.add=str_replace_all(Gene.synonyms.add,pattern = " ",replacement = ";")) %>% 
  mutate(Gene.symbol.add=word(Gene.synonyms.add,1,sep = ";")) %>% # Create Gene.symbol with only one selected Gene name from Gene.synonyms
  relocate(Gene.symbol.add,.before = Gene.synonyms.add) # Put added GS next to missing GS
## If gene name is missing (NA), replace with fake gene name "no.gene.symbol"
Uniprot_anno.0.1$Gene.symbol.add[Uniprot_anno.0.1$Gene.symbol.add==""] <- "no.gene.symbol"
Uniprot_anno.0.1$Gene.synonyms.add [Uniprot_anno.0.1$Gene.synonyms.add==""] <- "no.gene.symbol"

###############

# Merge dataframe of proteins with no GS with With Uniprot Anno
mass_spec.0.3.1 <- full_join(mass_spec.0.3,Uniprot_anno.0.1,by="UniprotID") %>% 
  relocate(Gene.symbol.add,.after = Gene.symbol) %>%  # Put added GS next to missing GS
  relocate(Gene.synonyms.add,.after = Gene.synonyms) # Put added GS next to missing GS

# When Gene.symbol is NA replace with Gene.symbol.add
mass_spec.0.3.2 <- mass_spec.0.3.1 %>% 
  mutate(Gene.symbol = case_when(is.na(Gene.symbol) ~ Gene.symbol.add,
                                 TRUE ~ Gene.symbol)) %>% 
  mutate(Gene.synonyms = case_when(is.na(Gene.synonyms) ~ Gene.symbol, # When there are no synonyms jus put Gene.symbol in synonyms
                                   TRUE ~ Gene.synonyms))


mass_spec.0.3.3 <- mass_spec.0.3.2 %>% 
  dplyr::select(-c(Gene.symbol.add,Gene.synonyms.add)) # remove Gene.symbol.y

######## Get raw data, no NA filt into wide

ms.raw.0.0 <- mass_spec.0.3.3 %>% 
  dplyr::select(UniprotID,
                Gene.symbol,
                Gene.synonyms,
                SC_Donor,
                Norm_abundance, # Here the unique column is selected
                No.peptides,
                PSMs,
                Unique.Peptides,
                MW.kDa,
                
  ) %>% 
  distinct() %>% 
  pivot_wider(names_from = SC_Donor,values_from = "Norm_abundance") %>% # make wider
  mutate_at(vars(-group_cols()), as.character) %>%  #make all columns as characters
  ungroup()

# Save raw, with some uniprot IDs filled in
# This is now saved after filtering out Keratins instead
 # fn = paste("./data/001_ms_raw.xlsx",sep="")
 # ms.raw.0.0 %>%
 #   writexl::write_xlsx(path = fn)

# initiate list
ms_numb_unique_prot_IDs <- list()
##############################
# Collect number of IDS info into list
##############################
### collect prot ID count for NPX.0.0
ms_numb_unique_prot_IDs[["input"]][["Gene.symbol"]] <- mass_spec.0.1$Gene.Symbol %>% 
  unique() %>% 
  length()
ms_numb_unique_prot_IDs[["input"]][["Uniprot"]] <- mass_spec.0.1$Accession %>% 
  unique() %>% 
  length()


### collect prot ID counts for NPX.0.3
after_proj_spec_format <- list()

after_proj_spec_format[["Gene.symbol"]] <- mass_spec.0.3.3$Gene.symbol %>% 
  unique() %>% 
  length()
after_proj_spec_format[["Uniprot"]] <- mass_spec.0.3.3$UniprotID %>% 
  unique() %>% 
  length()


ms_numb_unique_prot_IDs <- ms_numb_unique_prot_IDs %>% 
  list.append(after_proj_spec_format=after_proj_spec_format)

save(ms_numb_unique_prot_IDs,ms.raw.0.0,file = "./RData/000_ms_numb_unique_prot_IDs.RData")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

000a_ms_project_specific_formatting.R

FilesExpand file tree

000a_ms_project_specific_formatting.md

Latest commit

History

000a_ms_project_specific_formatting.md

File metadata and controls

000a_ms_project_specific_formatting.R