This repository was archived by the owner on Aug 28, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathaddAnnotation.R
More file actions
65 lines (51 loc) · 2.99 KB
/
addAnnotation.R
File metadata and controls
65 lines (51 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(tidyr))
option_list = list(
make_option(c("-g", "--gene"), action="store", default=NA, type='character',
help="intron bed file with genename for each intron e.g. introns.bed"),
make_option(c("-t", "--type"), action="store", default=NA, type='character',
help="File with the intron types i.e. U2/U12 e.g. intron_type.xls"),
make_option(c("-i", "--geneinfo"), action="store", default=NA, type='character',
help="File with the information for the genes e.g. geneinfo.tsv"),
make_option(c("-d", "--deltamsi"), action="store", default=NA, type='character',
help="File giving delta msi values"),
make_option(c("-o", "--out"), action="store", default=NA, type='character',
help="File to store the merged annotations")
)
opt = parse_args(OptionParser(option_list=option_list))
processGenes <- function(geneFile, geneInfoFile){
gene.df <- readr::read_tsv(geneFile, col_names = c("Chrom", "Start", "End", "GeneNames", "Score", "Strand"))
gene.df.mutated <- gene.df %>%
dplyr::mutate(ID = paste(Chrom, Start, End, Strand, sep = "|")) %>% # Adding ID for join
dplyr::select(ID, GeneNames) %>%
tidyr::separate_rows(GeneNames, sep = ",") %>% # Split by comma so indivudial genes can be mapped to names
dplyr::mutate(GeneNames = gsub("\\..*", "", GeneNames)) # Remove version numbers makes the mapping easier
gene.to.name.map <- readr::read_tsv(geneInfoFile) %>%
dplyr::mutate(Ens_id = gsub("\\..*", "", Ens_id)) %>% # Remove version numbers makes the mapping easier
dplyr::select(Ens_id, GeneName)
gene.df.name <- gene.df.mutated %>%
dplyr::left_join(gene.to.name.map, by = c("GeneNames" = "Ens_id")) %>% # map id to gene name
dplyr::select(ID, GeneName) %>%
dplyr::group_by(ID) %>%
dplyr::summarise(Gene = paste(GeneName, collapse = ",")) # Collapse back to the way it was earlier
return(gene.df.name)
}
processType <- function(typeFile){
type.df <- readr::read_tsv(typeFile)
type.df.mutate <- type.df %>%
dplyr::mutate(ID = paste(Chrom, Start, End, Strand, sep = "|")) %>%
dplyr::select(ID, Type, SubType, Confidence)
return (type.df.mutate)
}
deltaMSI.df <- readr::read_tsv(opt$deltamsi)
deltaMSI.df.mutate <- deltaMSI.df %>%
dplyr::mutate(ID = paste(Chrom, Start, End, Strand, sep = "|"))
out.df <- Reduce(function(x, y) merge(x, y, by = "ID"), list(deltaMSI.df.mutate, processGenes(opt$gene, opt$geneinfo), processType(opt$type)))
out.df <- out.df %>%
dplyr::select(-ID) %>%
dplyr::select(Chrom, Start, End, Strand, DeltaMSI, pval, padj , Gene, Type, SubType, Confidence, everything()) %>%
dplyr::mutate(Start = Start + 1) %>% # Adjusting the start value as we have been using the bed based start values
dplyr::arrange(padj)
write.table(x=out.df, file=opt$out, sep = "\t", row.names = FALSE, quote = FALSE)