# WingShapeBSA/Code/MAFfig.R at master · DworkinLab/WingShapeBSA · GitHub

## Look at the allele frequencies in ds and emc, estimated from the selected lines.

#DO NOT DO
#library(pegas)
#ds.vcf <- read.vcf("../Data/parents2L.vcf.gz", which.loci = 240013:714983)

#library(vcfR)

#chr2L <- read.vcfR("../Data/parents2L.vcf.gz")
library(tidyr)
library(data.table)


# DGRP VCF file (3L), already subset to the 30 lines used in this experiment.
chr3L <- fread("../Data/parents3L.vcf")

# Trim to the window used for emc: drop records below POS 749400 first, then
# records above POS 753492. Both end points are retained.
crap <- chr3L[POS >= 749400]
emconly <- crap[POS <= 753492]

# Recode the genotype calls in columns 10 to 39 as numbers:
#   "1/1" -> 1   (alternate allele)
#   "0/0" -> 0   (reference allele)
#   anything else -> NA ("./." means no data; any other value, e.g. a mixed
#   call such as "0/1", is also treated as missing)
#
# Calls are looked up as exact strings in one pass instead of using regex
# substitution: in a regex the pattern "./." matches ANY character on either
# side of the slash, so it would silently rewrite every "x/y" call to "NA".
geno_codes <- c("0/0" = 0, "1/1" = 1)

for (i in 10:39) {
  # Indexing the named vector by the call string yields NA for unknown calls.
  emconly[[i]] <- unname(geno_codes[as.character(emconly[[i]])])
}


# Alternate-allele frequency at each site.

# Count of lines carrying the alternate allele (missing calls are ignored).
emconly$aallele <- rowSums(emconly[, 10:39], na.rm = TRUE)
# Count of lines with no data available at the site.
emconly$na.count <- rowSums(is.na(emconly[, 10:39]))

# Frequency among the lines (out of 30) that do have a call.
emconly$freq <- with(emconly, aallele / (30 - na.count))

# This is the frequency of the alternate allele relative to the reference
# genome. MAF is also odd here because variants were called on the whole data
# set and the lines were subset afterwards.
hist(emconly$freq)

# Nucleotide diversity (pi) for the emc window:
#   h_j = n / (n - 1) * (1 - sum_i(p_i^2))
#         heterozygosity at site j, where p_i is the frequency of the i-th
#         allele at that site and n is the number of lines with a call
#   pi  = sum_j(h_j) / L
#         where L is the number of sites in the window

# Keep only sites where the alternate allele is present (freq > 0).
emc.var1 <- emconly[emconly$freq > 0,]
# There also seem to be indels in here, so keep only rows whose ID contains "SNP".
emc.var <- emc.var1[grep("SNP", emc.var1$ID),]

# Compute h at each site piece by piece.

# Number of lines (out of the 30 in the sample) with a call at the site.
emc.var$n <- 30 - emc.var$na.count

# Squared frequencies of the alternate (p1) and reference (p2) alleles.
emc.var$p1 <- emc.var$freq^2
emc.var$p2 <- (1 - emc.var$freq)^2

emc.var$h <- (emc.var$n / (emc.var$n - 1)) * (1 - (emc.var$p1 + emc.var$p2))
hist(emc.var$h)

# The POS filter keeps both end points of the window, so the number of sites
# is end - start + 1.
emclength <- 753492 - 749400 + 1

# A site called in a single line has n / (n - 1) = Inf and h = NaN; such sites
# carry no heterozygosity information and are left out of the sum so they do
# not turn the whole estimate into NaN.
# ~0.0051 (value recorded from an earlier run of this script)
emc.pi <- sum(emc.var$h[emc.var$n > 1]) / emclength


# DGRP VCF file (2L), already subset to the 30 lines used in this experiment.
chr2L <- fread("../Data/parents2L.vcf")

# Trim to the window used for ds: drop records below POS 240013 first, then
# records above POS 714983. Both end points are retained.
crap <- chr2L[POS >= 240013]
dsonly <- crap[POS <= 714983]

# Recode the genotype calls in columns 10 to 39 as numbers:
#   "1/1" -> 1   (alternate allele)
#   "0/0" -> 0   (reference allele)
#   anything else -> NA ("./." means no data; any other value, e.g. a mixed
#   call such as "0/1", is also treated as missing)
#
# Calls are looked up as exact strings in one pass instead of using regex
# substitution: in a regex the pattern "./." matches ANY character on either
# side of the slash, so it would silently rewrite every "x/y" call to "NA".
geno_codes <- c("0/0" = 0, "1/1" = 1)

for (i in 10:39) {
  # Indexing the named vector by the call string yields NA for unknown calls.
  dsonly[[i]] <- unname(geno_codes[as.character(dsonly[[i]])])
}

# Quick sanity check on the last genotype column: its type and which entries
# are missing.
class(dsonly[[39]])
is.na(dsonly[[39]])

# Alternate-allele frequency at each site.

# Count of lines carrying the alternate allele (missing calls are ignored).
dsonly$aallele <- rowSums(dsonly[, 10:39], na.rm = TRUE)
# Count of lines with no data available at the site.
dsonly$na.count <- rowSums(is.na(dsonly[, 10:39]))

# Frequency among the lines (out of 30) that do have a call.
dsonly$freq <- with(dsonly, aallele / (30 - na.count))

# This is the frequency of the alternate allele relative to the reference
# genome. MAF is also odd here because variants were called on the whole data
# set and the lines were subset afterwards.
hist(dsonly$freq)


# Nucleotide diversity (pi) for the ds window:
#   h_j = n / (n - 1) * (1 - sum_i(p_i^2))
#         heterozygosity at site j, where p_i is the frequency of the i-th
#         allele at that site and n is the number of lines with a call
#   pi  = sum_j(h_j) / L
#         where L is the number of sites in the window

# Keep only sites where the alternate allele is present (freq > 0).
ds.var1 <- dsonly[dsonly$freq > 0,]
# There also seem to be indels in here, so keep only rows whose ID contains "SNP".
ds.var <- ds.var1[grep("SNP", ds.var1$ID),]

# Compute h at each site piece by piece.

# Number of lines (out of the 30 in the sample) with a call at the site.
ds.var$n <- 30 - ds.var$na.count

# Squared frequencies of the alternate (p1) and reference (p2) alleles.
ds.var$p1 <- ds.var$freq^2
ds.var$p2 <- (1 - ds.var$freq)^2

ds.var$h <- (ds.var$n / (ds.var$n - 1)) * (1 - (ds.var$p1 + ds.var$p2))
hist(ds.var$h)

# The POS filter keeps both end points of the window, so the number of sites
# is end - start + 1.
dslength <- 714983 - 240013 + 1

# A site called in a single line has n / (n - 1) = Inf and h = NaN; such sites
# carry no heterozygosity information and are left out of the sum so they do
# not turn the whole estimate into NaN.
# ~0.0039273 (value recorded from an earlier run of this script)
ds.pi <- sum(ds.var$h[ds.var$n > 1]) / dslength
ds.pi

# The two histograms below use different y-axis ranges because there are more
# alleles in ds than in emc; that probably does not matter.

# Same histograms as above, this time drawn with ggplot.

library(ggplot2)

# NOTE(review): with fixed scale limits ggplot2 discards bars that fall outside
# them (it warns about removed rows) - confirm no bin of interest is lost.
ggplot(dsonly, aes(x = freq)) +
  geom_histogram() +
  lims(x = c(0, 1), y = c(0, 3000))


ggplot(emconly, aes(x = freq)) +
  geom_histogram() +
  lims(x = c(0, 1), y = c(0, 30))

# Plot both genes together: build one long table with a `freq` column and a
# `gene` label, then draw a single histogram filled by gene.

emcfreq <- data.frame(freq = emconly$freq, gene = "emc",
                      stringsAsFactors = FALSE)

dsfreq <- data.frame(freq = dsonly$freq, gene = "ds",
                     stringsAsFactors = FALSE)

# Base rbind() is used to stack the two tables: bind_rows() belongs to dplyr,
# which this script never attaches, so calling it fails with
# "could not find function".
genefreq <- rbind(emcfreq, dsfreq)

png("../Figures/AltAlleleFreq_plottedTogether.png")
# print() makes sure the plot is actually drawn on the png device even when the
# script is run through source(), where top-level values are not auto-printed.
print(
  ggplot(genefreq, aes(x = freq, fill = gene)) +
    geom_histogram() +
    xlab("Alt Allele Freq") +
    theme_classic()
)
dev.off()

library(cowplot)

# One histogram per gene, each with its own y-axis range, placed side by side.
dsplot <- ggplot(dsfreq, aes(x = freq)) +
  geom_histogram() +
  xlab("Alt Allele Freq") +
  theme_classic() +
  xlim(0, 1) +
  ylim(0, 3000)


emcplot <- ggplot(emcfreq, aes(x = freq)) +
  geom_histogram() +
  xlab("Alt Allele Freq") +
  theme_classic() +
  xlim(0, 1) +
  ylim(0, 30)

# Two-panel figure labelled with the gene names.
allPlot <- plot_grid(dsplot, emcplot, labels = c("ds", "emc"), label_size = 10)

png("../Figures/AltAlleleFreq_plottedSeperate.png")
# print() makes sure the figure is actually drawn on the png device even when
# the script is run through source(), where top-level values are not
# auto-printed.
print(allPlot)
dev.off()