-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpostGenerate.R
More file actions
121 lines (100 loc) · 6.31 KB
/
postGenerate.R
File metadata and controls
121 lines (100 loc) · 6.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# DESCRIPTION ----------------------------------------------------------------------------------
# Author: Elizabeth Hutchins
# Date: July 19th, 2022
#Purpose: To create a gene annotation and transcript annotation text files with biotype information, etc.
#Imports: GENCODE40 and LNCipedia5.2 (high confidence) gtf files
# list of ens IDs in intersection
#Exports:
#gene annotations
#transcript annotations
# import packages -------------------------------------------------------------------------------
library(rtracklayer) #for importing gtf
library(GenomicFeatures) #for creating TxDB
library(tidyverse)
#library(gridExtra)
# import combined annotation ---------------------------------------------
combinedAnno.gtf <- "gencode_v40.lncipedia_v5_2_hc.annotation.gtf"
combinedAnno <- rtracklayer::import(combinedAnno.gtf, format = "gtf")
combinedAnno.df <- as.data.frame(combinedAnno)
# import GENCODE40 annotation -------------------------------------------------------------------
gencode.v40.gtf <- "gencode.v40.primary_assembly.annotation.gtf"
gencode40 <- rtracklayer::import(gencode.v40.gtf, format = "gtf")
gencode40.df <- as.data.frame(gencode40)
gencode40.genes <- subset(gencode40.df, type == "gene") #list genes only in gencode annotation
gencode40.genes.anno <- gencode40.genes %>% dplyr::select(gene_id, gene_type, gene_name) %>%
transmute(ensembl_id = gene_id, ensembl_gene_name = gene_name, ensembl_gene_type = gene_type)
#make tx2gene ---------------------------------------------
txdb <- makeTxDbFromGFF(combinedAnno.gtf, format = "gtf")
saveDb(x=txdb, file = "gencode_v40.lncipedia_v5_2_hc.annotation.TxDb")
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- select(txdb, k, "GENEID", "TXNAME")
head(tx2gene)
dim(tx2gene)
write_csv(tx2gene, "tx2gene.gencode_v40.lncipedia_v5_2_hc.csv")
#make .txt files with annotation features (gene, transcript, biotype, etc) ---------------------------------------------
combinedAnno.genes <- combinedAnno.df %>% dplyr::filter(type == "gene")
transcripts.anno <- combinedAnno.df %>% dplyr::filter(type == "transcript") %>%
mutate(transcript_name = coalesce(transcript_name, transcript_id)) %>%
mutate(transcript_type = replace_na(transcript_type, "lncipedia_lncRNA"))
genes.anno.tmp <- combinedAnno.genes %>% dplyr::select(gene_id, gene_type, gene_name) %>%
mutate(gene_name = coalesce(gene_name, gene_id)) %>%
mutate(gene_type = replace_na(gene_type, "lncipedia_lncRNA"))
#add in ensembl ID for matching lncipedia IDs -------------------------------------------------------------------
# import 92 to 106 table, remove retired IDs
ens106 <- read.csv("ensembl106GeneUpdate.csv", header = TRUE)
geneIDs.106 <- ens106[grep("retired", ens106$New.stable.ID, invert = TRUE),] %>%
transmute(lncipediaGeneID = lncipediaGeneID,
matching_ens_gene_id = gsub(" ", "", New.stable.ID),
source = "lncipedia_map")
#import intersection/overlapping genes
intersection.map <- read_tsv("intersection.genes.nopc.txt", col_names = FALSE) %>%
dplyr::select(X9, X18) %>%
transmute(lncipediaGeneID = sapply(str_split(X18, "\\\""), "[", 2),
matching_ens_gene_id = sapply(str_split(X9, "\\\""), "[", 2),
source = "intersection"
)
conversionTable <- rbind(geneIDs.106, intersection.map)
genes.anno <- left_join(genes.anno.tmp, conversionTable, by = c("gene_id" = "lncipediaGeneID")) %>%
mutate(ensembl_id = if_else(!gene_type == "lncipedia_lncRNA", gene_id, matching_ens_gene_id)) %>%
left_join(gencode40.genes.anno)
#tmp <- genes.anno %>% filter(gene_type == "lncipedia_lncRNA")
#table(tmp$ensembl_gene_type)
#add in biotype ----------------------------------------------------------------------------
# biotype classes
#NOTE: see the following link
#https://uswest.ensembl.org/Help/Faq?id=468
#http://vega.archive.ensembl.org/info/about/gene_and_transcript_types.html
pcBiotypes <- c("protein_coding", "IG_V_gene", "IG_C_gene", "IG_J_gene",
"TR_C_gene", "TR_J_gene", "TR_V_gene", "TR_D_gene",
"IG_D_gene", "nonsense_mediated_decay", "non_stop_decay")
lncBiotypes <- c("lncRNA", "lncipedia_lncRNA", "processed_transcript", "retained_intron")
short.ncBiotypes <- c("snRNA", "snoRNA", "misc_RNA", "miRNA", "rRNA",
"Mt_tRNA", "Mt_rRNA", "ribozyme", "scaRNA",
"sRNA", "scRNA", "vault_RNA")
pseudogeneBiotypes <- c("processed_pseudogene", "transcribed_unprocessed_pseudogene",
"unprocessed_pseudogene", "transcribed_processed_pseudogene",
"transcribed_unitary_pseudogene", "rRNA_pseudogene",
"unitary_pseudogene", "polymorphic_pseudogene",
"pseudogene", "IG_V_pseudogene",
"translated_processed_pseudogene", "translated_unprocessed_pseudogene",
"TR_V_pseudogene",
"IG_C_pseudogene", "TR_J_pseudogene",
"IG_J_pseudogene", "IG_pseudogene")
TEC.Biotypes <- c("TEC")
genes.anno <- genes.anno %>% mutate(biotype_class = "unknown")
genes.anno$biotype_class[genes.anno$gene_type %in% pcBiotypes] <- "protein_coding"
genes.anno$biotype_class[genes.anno$gene_type %in% lncBiotypes] <- "long_noncoding"
genes.anno$biotype_class[genes.anno$gene_type %in% short.ncBiotypes] <- "short_noncoding"
genes.anno$biotype_class[genes.anno$gene_type %in% pseudogeneBiotypes] <- "pseudogene"
genes.anno$biotype_class[genes.anno$gene_type %in% TEC.Biotypes] <- "TEC"
unique(genes.anno$biotype_class) #checking
transcripts.anno <- transcripts.anno %>% mutate(biotype_class = "unknown")
transcripts.anno$biotype_class[transcripts.anno$transcript_type %in% pcBiotypes] <- "protein_coding"
transcripts.anno$biotype_class[transcripts.anno$transcript_type %in% lncBiotypes] <- "long_noncoding"
transcripts.anno$biotype_class[transcripts.anno$transcript_type %in% short.ncBiotypes] <- "short_noncoding"
transcripts.anno$biotype_class[transcripts.anno$transcript_type %in% pseudogeneBiotypes] <- "pseudogene"
transcripts.anno$biotype_class[transcripts.anno$transcript_type %in% TEC.Biotypes] <- "TEC"
unique(transcripts.anno$biotype_class) #checking
#output annotation description files ----------------------------------------------------------------
write_fst(genes.anno, "GRCh38_GENCODE40LNCipedia_geneAnnotations.fst")
write_fst(transcripts.anno, "GRCh38_GENCODE40LNCipedia_transcriptInfo.fst")