R/Final_R_assignment.Rmd at main · razan18/R · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
---
title: "Razan R draft"
author: "Razan Alsayed Omar"
date: '2022-03-11'
output: html_document
---

---
title: "R assignment draft"
author: "Razan Alsayed Omar"
date: '2022-03-11'
output: html_document
---

```{r}
library(tidyverse)
library(ggplot2)
library(data.table)
```


# Loading files
```{r}
# Read the two tab-delimited input tables from the working directory.
# header = TRUE: the first line of each file supplies the column names.
# (TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.)
genotypes <- read.table("fang_et_al_genotypes.txt", sep = "\t", header = TRUE)
snps <- read.table("snp_position.txt", sep = "\t", header = TRUE)
```
## Data inspection
# To look at rows and columns of the files (rows,columns)
```{r}
# dim() returns c(number of rows, number of columns) for each table;
# both results are printed in the knitted output.
dim(genotypes)
dim(snps)
```
# To look at the first 10 lines and first 10 headers
```{r}
# Print the top-left corner (rows 1-10, columns 1-10) of each table.
genotypes[1:10,1:10]
snps[1:10, 1:10]
```
# To look at the file format
```{r}
# class() reports the R object type that each loaded table is stored as.
class(genotypes)
class(snps)
```
## Data processing
# Filter the genotypes table on the Group column (ZMM* groups for maize, ZMP* groups for teosinte) and store each subset in a new data frame
```{r}
# Keep only the samples whose Group code belongs to the maize set.
maize_genotypes <- genotypes %>%
  filter(Group %in% c("ZMMIL", "ZMMLR", "ZMMMR"))

# Keep only the samples whose Group code belongs to the teosinte set.
teosinte_genotypes <- genotypes %>%
  filter(Group %in% c("ZMPBA", "ZMPIL", "ZMPJA"))
```

# Prepare the snps file by extracting the columns we need (SNP_ID, chromosome, position); sorting by SNP position is done in a later step
```{r}
# Print the first rows of snps so the column order can be checked by eye.
head(snps) # to check which columns correspond to the ones we want
# Keep columns 1, 3 and 4 only (selected by position, not by name).
# NOTE(review): presumably these are SNP_ID, Chromosome and Position, as the
# heading states -- confirm against the head() output above.
cutcol_snps <- snps [,c(1,3,4)] # store columns 1, 3 and 4 in a new data frame

```
# Remove the first 3 columns from the maize and teosinte tables, then transpose them
```{r}
# Drop the first three columns of each table so that only the columns to be
# transposed remain.
maize_genotypes_only <- maize_genotypes[, -seq_len(3)]
teosinte_genotypes_only <- teosinte_genotypes[, -seq_len(3)]

# Transpose so that each former column becomes a row. t() returns a matrix,
# so the result is converted back to a data frame.
transposed_maize <- maize_genotypes_only %>%
  t() %>%
  as.data.frame()
transposed_teosinte <- teosinte_genotypes_only %>%
  t() %>%
  as.data.frame()
```

# To join maize genotypes with snps, and teosinte genotypes with snps, based on SNP_ID
```{r}
# Attach the SNP annotation columns (cutcol_snps) to the transposed genotype
# tables, aligning rows by SNP identifier rather than by position.
#
# The row names of the transposed tables are the original genotype column
# names; they are matched against the first column of cutcol_snps.
# NOTE(review): assumes column 1 of cutcol_snps holds the SNP identifier.
#
# A plain cbind() pairs rows purely by position, so any difference in order
# between the two inputs would silently attach the wrong genotypes to a SNP.
# Matching on the identifier keeps the row order of cutcol_snps and fails
# loudly when the two inputs do not describe the same set of SNPs.
maize_rows <- match(cutcol_snps[[1]], rownames(transposed_maize))
teosinte_rows <- match(cutcol_snps[[1]], rownames(transposed_teosinte))

if (anyNA(maize_rows) || anyNA(teosinte_rows) ||
      nrow(transposed_maize) != nrow(cutcol_snps) ||
      nrow(transposed_teosinte) != nrow(cutcol_snps)) {
  stop(
    "SNP IDs in cutcol_snps do not match the rows of the transposed ",
    "genotype tables",
    call. = FALSE
  )
}

join_maize <- cbind(cutcol_snps, transposed_maize[maize_rows, , drop = FALSE])
join_teosinte <- cbind(
  cutcol_snps,
  transposed_teosinte[teosinte_rows, , drop = FALSE]
)

# Reset row names to the default 1..n sequence.
rownames(join_maize) <- NULL
rownames(join_teosinte) <- NULL
```
# To sort based on increasing snp position
```{r}
# Write one tab-separated file per chromosome, with SNPs ordered by
# increasing position, for maize and then for teosinte.
#
# Changes from the naive version:
#   * The sort key is the numeric value of Position. If the column was read
#     as text, sorting it directly would order "1000" before "200".
#     Non-numeric positions become NA in the key only (the column itself is
#     written unchanged) and arrange() places them last; the coercion
#     warning for those values is expected and therefore suppressed.
#   * The loop runs over the chromosome labels actually present instead of
#     assuming they are exactly 1..n.
#   * paste0() builds the file name; paste() inserted a space before the
#     chromosome number ("Maize_chr_sorted_ 1").
#   * showWarnings = FALSE keeps re-running the chunk quiet when the folder
#     already exists.

dir.create("./Maize_data", showWarnings = FALSE) # folder for all maize files
chr_maize <- filter(join_maize, Chromosome != "multiple" & Chromosome != "unknown")
for (chr_label in unique(chr_maize$Chromosome)) {
  chrm <- chr_maize %>%
    filter(Chromosome == chr_label) %>%
    arrange(suppressWarnings(as.numeric(as.character(Position))))
  write.table(
    chrm,
    file = paste0("./Maize_data/Maize_chr_sorted_", chr_label),
    quote = FALSE,
    sep = "\t"
  )
} # maize genotypes sorted by increasing SNP position

dir.create("./Teosinte_data", showWarnings = FALSE) # folder for all teosinte files
chr_teosinte <- filter(join_teosinte, Chromosome != "multiple" & Chromosome != "unknown")
for (chr_label in unique(chr_teosinte$Chromosome)) {
  chrt <- chr_teosinte %>%
    filter(Chromosome == chr_label) %>%
    arrange(suppressWarnings(as.numeric(as.character(Position))))
  write.table(
    chrt,
    file = paste0("./Teosinte_data/Teosinte_chr_sorted_", chr_label),
    quote = FALSE,
    sep = "\t"
  )
} # teosinte genotypes sorted by increasing SNP position
```
# To sort in decreasing snp position
```{r}
# Write one tab-separated file per chromosome, with SNPs ordered by
# decreasing position, for maize and then for teosinte.
#
# Changes from the naive version:
#   * The sort key is the numeric value of Position. If the column was read
#     as text, sorting it directly would be lexicographic. Non-numeric
#     positions become NA in the key only (the column itself is written
#     unchanged) and arrange() places them last; the coercion warning for
#     those values is expected and therefore suppressed.
#   * The loop runs over the chromosome labels actually present instead of
#     assuming they are exactly 1..n.
#   * paste0() builds the file name; paste() inserted a space before the
#     chromosome number ("Maize_chr_decrease_ 1").

chr_maize <- filter(join_maize, Chromosome != "multiple" & Chromosome != "unknown")
for (chr_label in unique(chr_maize$Chromosome)) {
  chrm <- chr_maize %>%
    filter(Chromosome == chr_label) %>%
    arrange(desc(suppressWarnings(as.numeric(as.character(Position)))))
  write.table(
    chrm,
    file = paste0("./Maize_data/Maize_chr_decrease_", chr_label),
    quote = FALSE,
    sep = "\t"
  )
} # maize genotypes sorted by decreasing SNP position

chr_teosinte <- filter(join_teosinte, Chromosome != "multiple" & Chromosome != "unknown")
for (chr_label in unique(chr_teosinte$Chromosome)) {
  chrt <- chr_teosinte %>%
    filter(Chromosome == chr_label) %>%
    arrange(desc(suppressWarnings(as.numeric(as.character(Position)))))
  write.table(
    chrt,
    file = paste0("./Teosinte_data/Teosinte_chr_decrease_", chr_label),
    quote = FALSE,
    sep = "\t"
  )
} # teosinte genotypes sorted by decreasing SNP position
```
# Replacing "?" with "-" in decreasing maize and teosinte
```{r}
# Replace every literal "?" with "-" in all columns, then write one file per
# chromosome with SNPs ordered by decreasing position.
#
# Fixes relative to the previous version:
#   * The teosinte loop wrote `chrm_replaced` (the maize table left over from
#     the first loop) into the teosinte files; it now writes `chrt_replaced`.
#   * gsub() applied through lapply() returns character vectors, so Position
#     is text here and desc(Position) sorted lexicographically. The sort key
#     is now the numeric value; non-numeric positions become NA in the key
#     only and are placed last (the coercion warning is expected and
#     suppressed).
#   * The loop runs over the chromosome labels actually present instead of
#     assuming they are exactly 1..n.
#   * paste0() builds the file name; paste() inserted a space before the
#     chromosome number.

# fixed = TRUE makes "?" a literal character instead of a regex quantifier.
replaced_Maize <- as_tibble(lapply(join_maize, gsub, pattern = "?", replacement = "-", fixed = TRUE))
replaced_Chr_Maize <- filter(replaced_Maize, Chromosome != "multiple" & Chromosome != "unknown")
for (chr_label in unique(replaced_Chr_Maize$Chromosome)) {
  chrm_replaced <- replaced_Chr_Maize %>%
    filter(Chromosome == chr_label) %>%
    arrange(desc(suppressWarnings(as.numeric(Position))))
  write.table(
    chrm_replaced,
    file = paste0("./Maize_data/Maize_replaced_", chr_label),
    quote = FALSE,
    sep = "\t"
  )
} # maize data

replaced_Teosinte <- as_tibble(lapply(join_teosinte, gsub, pattern = "?", replacement = "-", fixed = TRUE))
replaced_Chr_Teosinte <- filter(replaced_Teosinte, Chromosome != "multiple" & Chromosome != "unknown")
for (chr_label in unique(replaced_Chr_Teosinte$Chromosome)) {
  chrt_replaced <- replaced_Chr_Teosinte %>%
    filter(Chromosome == chr_label) %>%
    arrange(desc(suppressWarnings(as.numeric(Position))))
  write.table(
    chrt_replaced,
    file = paste0("./Teosinte_data/Teosinte_replaced_", chr_label),
    quote = FALSE,
    sep = "\t"
  )
} # teosinte data
```

## Data visualization

# SNPs per chromosome
# There were more SNP positions in maize individuals (ZMM*). Chromosome 1 has the greatest number of SNPs, and chromosome 10 has the fewest.
```{r}
library(dplyr)
library(reshape)
library(plyr)
# Transpose the full genotypes table so that its column names become row
# names, then merge with the SNP annotation on those names. Only rows whose
# name matches a SNP_ID survive the merge.
genotypest <- as.data.frame(t(genotypes))
genotypesm.gg <- merge(genotypest, snps, by.x = "row.names", by.y = "SNP_ID")

# Order the chromosomes 1..10 on the axis; any other Chromosome value
# becomes NA because it is not among the listed levels.
genotypesm.gg$Chromosome <- factor(genotypesm.gg$Chromosome, levels = as.character(1:10))

# Bar chart of the number of merged rows (SNPs) per chromosome.
# The column is referenced by bare name inside aes(): ggplot2 looks it up in
# `data`, and using `genotypesm.gg$Chromosome` there triggers a warning.
ggplot(data = genotypesm.gg) +
  geom_bar(aes(x = Chromosome)) +
  xlab("Chromosome Number") +
  ylab("SNPs")

# Bar chart with one bar per Group value; bar height is the number of rows
# of `genotypes` in that group.
# NOTE(review): the y label says "SNPs" but the bars count rows of
# `genotypes` -- confirm the intended label.
ggplot(data = genotypes, aes(x = Group)) +
  geom_bar() +
  xlab("Groups") +
  ylab("SNPs")

```
# Missing data and amount of heterozygosity

```{r}
# Per-sample counts of homozygous, heterozygous and missing genotype calls,
# drawn as a stacked bar chart.
#
# melt/arrange/summarise are called through their package namespaces
# (reshape, plyr) because data.table and dplyr export functions with the
# same names and the one that wins otherwise depends on load order.

# Every column after the first three is treated as a measured column.
Names <- colnames(genotypes)[-c(1:3)]

# Long format: one row per (sample, measured column) pair.
genotypes.melt <- reshape::melt(genotypes, measure.vars = Names)
colnames(genotypes.melt)[c(3, 4, 5)] <- c("Group", "SNP_ID", "Allele")

# Ho is TRUE for homozygous calls, FALSE for heterozygous calls and NA for
# missing calls. A call is treated as missing when it is NA or contains "?".
# Previously missing calls were FALSE, so they were counted as heterozygous
# and the `missing` total was always zero.
is_missing <- is.na(genotypes.melt$Allele) |
  grepl("?", genotypes.melt$Allele, fixed = TRUE)
genotypes.melt$Ho <- genotypes.melt$Allele %in% c("A/A", "C/C", "G/G", "T/T")
genotypes.melt$Ho[is_missing] <- NA

sortedmelt <- plyr::arrange(genotypes.melt, Sample_ID, Group)

# One row per sample with the three totals.
summarize.ID <- plyr::ddply(
  sortedmelt,
  c("Sample_ID"),
  plyr::summarise,
  total_ho = sum(Ho, na.rm = TRUE),
  total_het = sum(!Ho, na.rm = TRUE),
  missing = sum(is.na(Ho))
)

# Long format again so the three totals can be stacked in one bar per sample.
summarize.melt <- reshape::melt(summarize.ID, measure.vars = c("total_ho", "total_het", "missing"))
ggplot(summarize.melt, aes(x = Sample_ID, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack")
attributes(summarize.melt)
```

# My own visualization

```{r}
# Count SNPs per chromosome, split by the value of the candidate.random
# column.
# reshape::melt is called explicitly because data.table also exports a
# `melt`, and which one is found otherwise depends on load order.
chrpos.melt <- reshape::melt(snps, id.vars = "Chromosome", measure.vars = "candidate.random")

# Chromosome is stored as text, so ggplot would order the axis "1", "10",
# "2", ... Turn it into a factor whose levels list the numeric labels in
# numeric order, followed by any non-numeric labels alphabetically.
chr_labels <- unique(as.character(chrpos.melt$Chromosome))
chr_rank <- suppressWarnings(as.numeric(chr_labels)) # NA for non-numeric labels
chrpos.melt$Chromosome <- factor(
  chrpos.melt$Chromosome,
  levels = chr_labels[order(chr_rank, chr_labels)]
)

ggplot(chrpos.melt) +
  geom_bar(aes(x = Chromosome, fill = value)) +
  ggtitle("candidate versus random snps per chromosome") +
  labs(x = "chromosome", y = "snp_count")
```