-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01_ReadGTExData.R
More file actions
136 lines (105 loc) · 5.5 KB
/
01_ReadGTExData.R
File metadata and controls
136 lines (105 loc) · 5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Read GTEx v8 all samples, save as DDS, then batch correct with ComBAT.
library(tidyverse)
library(sva)
library(DESeq2)
in_dir <- "../data/gtex"
out_dir <- "01_out"
if (!dir.exists(out_dir)) {
dir.create(out_dir)
}
# Read the read count files
gtex <- readr::read_tsv(file.path(in_dir, "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct"), skip=2)
# Make distinct gene name
gtex_matrix <- gtex %>% dplyr::select(-Name) %>% dplyr::distinct(Description, .keep_all=TRUE) %>% tibble::column_to_rownames(var="Description") %>% as.matrix()
# Read the annotation file
annot <- readr::read_tsv(file.path(in_dir, "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"), guess_max=15000)
annot$SUBJID <- gsub("(?<!GTEX)-.*$", "", annot$SAMPID, perl=T)
# Read the subject demographics file
subdemo <- readr::read_tsv(file.path(in_dir, "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"))
# Get the metadata file for all RNA-seq data
i. <- annot %>% dplyr::filter(SMAFRZE == "RNASEQ")
i. <- i. %>% dplyr::left_join(subdemo)
i.$rowname <- i.$SAMPID
i. <- i. %>% column_to_rownames("rowname")
i. <- i.[colnames(gtex_matrix),]
# Set up dds object
dds <- DESeqDataSetFromMatrix(countData = gtex_matrix,
colData = i.,
design= ~ SMTSD)
# Normalize with vst
vsd <- DESeq2::vst(dds)
mat_vsd <- assay(vsd)
saveRDS(vsd, file.path(out_dir, "gtexv8_vst_dds.Rds"))
# https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2855-9
# I think we should first correct for technical batches, then correct for biological characteristics
# 1. Extraction Batch (SMNABTCH)
# 2. Sequencing Batch (SMGEBTCH)
# 3. Tissue Ischemic time (bin into 300 min intervals) (SMTSISCH) cut(i..$SMTSISCH, breaks=c(-Inf, 0, 300, 600, 900, 1200, Inf)) %>% table()
# 4. Death type (violent and quick vs. slow) (DTHHRDY)
# 5. 2023-02-28, add AGE for crosstalk.
##
## Correct 1 - Batch correct with extraction batch then sequencing batch
##
correct_technical <- i. %>% dplyr::select(SUBJID, SMNABTCH, SMGEBTCH)
correct_technical <- correct_technical[colnames(mat_vsd),]
# table(correct_technical$SMNABTCH)
# table(correct_technical$SMGEBTCH)
correct_technical$SMGEBTCH[is.na(correct_technical$SMGEBTCH)] <- "NoBatch"
mat_vsd_tec <- sva::ComBat(dat=mat_vsd, batch = correct_technical$SMNABTCH)
mat_vsd_tec <- sva::ComBat(dat=mat_vsd_tec, batch = correct_technical$SMGEBTCH)
vsd_tec <- vsd
assay(vsd_tec) <- mat_vsd_tec
saveRDS(vsd_tec, file.path(out_dir, "gtexv8_vst_combat1_dds.Rds"))
# Histogram of VSD
#h._vsd_tech[,1] %>% hist()
# Plot of pre-and-post correction
#plot(h._vsd_tech[,1], h._vsd[,1])
##
## Correct 2 - Batch correct with Tissue biological characteristics (ischemia time, potentially add fixative and autolysis score)
##
correct_tissue <- i. %>% dplyr::select(SUBJID, SMTSISCH, SMTSPAX, SMATSSCR, SMTSD)
correct_tissue <- correct_tissue[colnames(mat_vsd),]
correct_tissue$ischemic_batch <- cut(correct_tissue$SMTSISCH, breaks=c(-Inf, 300, 600, 900, 1200, Inf))
#correct_tissue$fixative_batch <- cut(correct_tissue$SMTSPAX, breaks=c(300, 600, 900, 1200, Inf))
#correct_tissue$autolysis_batch <- cut(correct_tissue$SMATSSCR, breaks=c(300, 600, 900, 1200, Inf))
# Check if ischemic time correlated with organ
# ggplot(data=correct_tissue, aes(x=SMTSD, y=SMTSISCH)) + geom_boxplot()
# Check that we have a sensible distribution of the batches
# table(correct_tissue$ischemic_batch)
# If no information, encode as no specific ischemia time.
correct_tissue$ischemic_batch[is.na(correct_tissue$ischemic_batch)] <- "(-Inf,300]"
mat_vsd_tec_tis <- sva::ComBat(dat=mat_vsd_tec, batch = correct_tissue$ischemic_batch)
# Histogram of VSD
# mat_vsd_tec_tis[,1] %>% hist()
# Plot of pre-and-post correction
# plot(h._vsd_tech[,1], h._vsd_tech_tiss[,1])
vsd_tec_tis <- vsd
assay(vsd_tec_tis) <- mat_vsd_tec_tis
saveRDS(vsd_tec_tis, file.path(out_dir, "gtexv8_vst_combat2_dds.Rds"))
# Correct 3 - Batch correct with donor death type
correct_donor <- tibble::enframe(colnames(mat_vsd_tec_tis)) %>% dplyr::select(SAMPID = value)
correct_donor$SUBJID <- gsub("(?<!GTEX)-.*$", "", correct_donor$SAMPID, perl=T)
correct_donor <- correct_donor %>% dplyr::left_join(subdemo) %>% tibble::column_to_rownames("SAMPID")
correct_donor <- correct_donor[colnames(mat_vsd_tec_tis),]
# Check that we have a sensible distribution of the batches
# table(correct_donor$DTHHRDY)
# table(correct_donor$AGE)
correct_donor$DTHHRDY[is.na(correct_donor$DTHHRDY)] <- 0
mat_vsd_tec_tis_bio <- sva::ComBat(dat=mat_vsd_tec_tis, batch = correct_donor$DTHHRDY)
# Histogram of VSD
# mat_vsd_tec_tis_bio[,1] %>% hist()
# Plot of pre-and-post correction
# plot( mat_vsd_tec_tis[,1], mat_vsd_tec_tis_bio[,1])
# Check whether there are genes with almost 0 variances (disable comment to run since it takes a long time, but last I check there was no nearZeroVar genes)
# dim(mat_vsd_tec_tis_bio)
# caret::nearZeroVar(t(h._vsd_tech_tiss_biol))
vsd_tec_tis_bio <- vsd_tec_tis
assay(vsd_tec_tis_bio) <- mat_vsd_tec_tis_bio
saveRDS(vsd_tec_tis_bio, file.path(out_dir, "gtexv8_vst_combat3_dds.Rds"))
# NEW 2023-02-28 - We are going to try to correct for age for the R03 tissue crosstalk project
correct_donor$AgeGroup <- as.integer(gsub("-.*$", "", correct_donor$AGE, ))
mat_vsd_tec_tis_bio_age <- sva::ComBat(dat=mat_vsd_tec_tis_bio, batch = correct_donor$AgeGroup)
vsd_tec_tis_bio_age <- vsd_tec_tis_bio
assay(vsd_tec_tis_bio_age) <- mat_vsd_tec_tis_bio_age
saveRDS(vsd_tec_tis_bio_age, file.path(out_dir, "gtexv8_vst_combat4_dds.Rds"))
print("complete")