forked from JamesPhippsTan/EnvGen_ExpVar
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1_LowExpFiltering.R
More file actions
105 lines (84 loc) · 5.04 KB
/
1_LowExpFiltering.R
File metadata and controls
105 lines (84 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Filtering out low expression genes and quality checking
# Original title:
# DEC 29th 2020
# GETTING GENE EXPRESSION MATRIX FROM HEAD that includes both conditions HS and CTRL
# for GxE mapping
# Last Updated: 6/6/25
#################################
##### Packages and Setup ########
#################################
rm(list = ls(all = T))
library(dplyr)
######################
##### Dataset ########
######################
#Head
setwd("C:\\Users\\jtanshengyi\\Desktop\\Projects\\veQTL Netherlands Normal vs High Sugar Adult\\Data")
head.info <- read.table("Info_RawCounts_CPM1_head_hsctrl_Jul9.20.txt",h=T)
head.info$plate <- gsub("b", "", head.info$plate)
#head.counts <- read.table("UnprocessedCountMatrix.txt",h=T,check.names = F)
dim(head.counts) #17727 genes, 2136 samples
# FIRST: FILTER OUT LOW COUNT GENES BASED ON MEAN CPM <1
{
#Transform to CPM to adjust for library size before doing any filtering, do this with both diets together
#Head
library(edgeR)
head.cpm <- DGEList(counts = head.counts)
head.cpm <- calcNormFactors(object = head.cpm)
head.cpm <- cpm(head.cpm, log = F)
# Look at the library sizes
library_sizes <- head.cpm$samples$lib.size*head.cpm$samples$norm.factors
# Get control samples (n=989)
head.c <- head.counts[,colnames(head.counts) %in% head.info$id[head.info$treatment==1]]
head.cpm.c <- head.cpm[ , colnames(head.cpm) %in% colnames(head.c)] ; dim(head.cpm.c)
# Get hs samples (n=1147)
head.hs <- head.counts[,colnames(head.counts) %in% head.info$id[head.info$treatment==2]]
head.cpm.hs <- head.cpm[ , colnames(head.cpm) %in% colnames(head.hs)]; dim(head.cpm.hs)
#keep genes with average CPM > 1 and CPM>1 in at least 20% of the individuals, per treatment
#(keeps = 8800)
phenotyping.rate.c <- apply(head.cpm.c, 1, function(x) sum(x>1)/length(x))
head.cpm1.c <- subset(head.cpm.c, (rowSums(head.cpm.c)/ncol(head.cpm.c))>1 & phenotyping.rate.c >0.2) ; dim(head.cpm.c) ; dim(head.cpm1.c)
phenotyping.rate1.c <- apply(head.cpm1.c, 1, function(x) sum(x>1)/length(x))
#(keeps = 9707) #quite a diff with control
phenotyping.rate.hs <- apply(head.cpm.hs, 1, function(x) sum(x>1)/length(x))
head.cpm1.hs <- subset(head.cpm.hs, (rowSums(head.cpm.hs)/ncol(head.cpm.hs))>1 & phenotyping.rate.hs >0.2) ; dim(head.cpm.hs) ; dim(head.cpm1.hs)
phenotyping.rate1.hs <- apply(head.cpm1.hs, 1, function(x) sum(x>1)/length(x))
#plots
phenotyping.rate=phenotyping.rate.hs
phenotyping.rate1=phenotyping.rate1.hs
head.cpm1.tmp=head.cpm1.hs
head.cpm.tmp=head.cpm.hs
head.counts.tmp=head.hs #put this back
par(mfrow=c(2,2))
hist(phenotyping.rate, main="%individuals CPM>1 per SNP", col="grey"); hist(phenotyping.rate1, col=adjustcolor("red",0.5), add=T)
samplesize.site <- apply(head.cpm1.tmp, 1, function(x) sum(x>1)) #min # ind per snp in ctrl 198, en hs also 230
hist(samplesize.site, co="grey", main="Sample size per site")
plot(log2(rowSums(head.cpm.tmp)/ncol(head.cpm.tmp)) ~ phenotyping.rate, ylab="average CPM per site (log2)", xlab="phenotyping rate") ; abline(h=0, lty=2)
points(log2(rowSums(head.cpm1.tmp)/ncol(head.cpm1.tmp)) ~ phenotyping.rate1, col="salmon")
c <- rowSums(head.cpm.tmp)/ncol(head.cpm.tmp)
co <- rowSums(head.counts.tmp)/ncol(head.counts.tmp)
plot (log2(c) ~ log2(co), xlab ="Average gene counts (log2)", ylab="Average CPM per site (log2)"); abline(h=0, lty=2) ; abline(v=2, lty=2)
hist(log2(c), main="average CPM per gene before filtering", col="grey", breaks=50)
hist(log2(rowSums(head.cpm1.tmp)/ncol(head.cpm1.tmp)), breaks=50,col=adjustcolor("salmon",0.4),add=T, main="average CPM per gene after removing low count genes")
missdata.indiv <- apply(head.cpm.tmp, 2, function(x) sum(x>1)/length(x))
missdata.indiv1 <- apply(head.cpm1.tmp, 2, function(x) sum(x>1)/length(x))
hist(missdata.indiv, col="grey", main="% sites CPM >1 per individual before filter"); hist(missdata.indiv1, col="grey", add=F, main="% sites CPM >1 per individual after filter")
}
# SECOND: OVERLAP BETWEEN GENES EXPRESSED IN THE hs AND IN ctrl
{
# 8786 genes overlap between the two datasets
# 14 genes are only expressed in ctrl
# 921 genes are only expressed in hs
both <- intersect(rownames(head.cpm1.c), rownames(head.cpm1.hs)) ; length(both)
length(setdiff(rownames(head.cpm1.c), rownames(head.cpm1.hs)))
length(setdiff(rownames(head.cpm1.hs), rownames(head.cpm1.c)))
}
# WRITE THE FROZEN VERSION OF THE FILTERED RAW GENE COUNT MATRIX - USE THIS FOR ANY FURTHER ANALYSIS
# 8786 genes expressed in both diets after filters above
{
#Raw counts, but genes with CPM < 1 have been removed
head.counts <- subset(head.counts, rownames(head.counts) %in% both)
all.equal(colnames(head.counts), as.character(head.info$id))
write.table(head.counts, "RawCounts_CPM1_head_hsctrl_Dec29.20.txt", col.names = T, row.names = T, sep="\t", quote=F)
write.table(head.info, "Info_RawCounts_CPM1_head_hsctrl_Jul9.20.txt", col.names = T, row.names = F, sep="\t", quote=F)
}