You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
library(tidyverse) #For data maniupluation
library(dplyr)
install.packages("janitor")
install.packages("data.table")
library(janitor)
library(data.table)
Upload raw data
geno <- read.delim("https://raw.githubusercontent.com/chamberwin/BCB546-Spring2022/main/assignments/UNIX_Assignment/fang_et_al_genotypes.txt", header = TRUE, sep = "\t") #Download raw files from class repository for genotypes and SNPs
snp_pos <- read.delim("https://raw.githubusercontent.com/chamberwin/BCB546-Spring2022/main/assignments/UNIX_Assignment/snp_position.txt", header = TRUE, sep = "\t")
Inspect data
dim(geno) #Dimensions of rows (2781) and columns (986)
length(geno) #Number of columns is 986
object.size(geno) #Size of file is 22681376 bytes
summary(geno) #Because of large number of columns, not super information
head(geno) #A little better than summary at seeing structure
dim(snp_pos) #Dimensions of rows (983) and columns (15)
length(snp_pos) #Number of columns is 15
object.size(snp_pos)
summary(snp_pos) #Size of file is 327392 bytes
head(snp_pos) #Look at data structure
Prepare SNP position file for join
snp_cut <- snp_pos %>% #Create prepared file for join
select(SNP_ID, Chromosome, Position) %>% #Select only these columns
arrange(SNP_ID) #Arrange by SNP_ID
Maize Join
geno_maize <- geno %>%
filter(Group %in% c("ZMMIL", "ZMMLR", "ZMMMR", "Sample_ID")) %>% # filter by maize groups only, including sample ID
t() %>% # transpose - note it's added automatic column names (V1, V2, etc. in new header)
as.data.frame() %>% # make sure it's a dataframe
setDT(keep.rownames = TRUE) %>% #gives column 1 a placeholder name as well
row_to_names(row_number = 1) %>% # sets first row as column names
rename(SNP_ID = Sample_ID) # rename first column as SNP_ID so we can join on like column names
join_maize <- snp_cut %>%
inner_join(geno_maize, by = "SNP_ID") # joins both files - inner_join makes sure we take only SNP_IDs in both files
Teosinte Join
geno_teosinte <- geno %>%
filter(Group %in% c("ZMPBA", "ZMPIL", "ZMPJA")) %>% # filter by maize groups only, including sample ID
t() %>% # transpose - note it's added automatic column names (V1, V2, etc. in new header)
as.data.frame() %>% # make sure it's a dataframe
setDT(keep.rownames = TRUE) %>% #gives column 1 a placeholder name as well
row_to_names(row_number = 1) %>% # sets first row as column names
rename(SNP_ID = Sample_ID) # rename first column as SNP_ID so we can join on like column names
join_teosinte <- snp_cut %>%
inner_join(geno_teosinte, by = "SNP_ID") # joins both files - inner_join makes sure we take only SNP_IDs in both files
library(ggplot) #reviewer needed this to view visuals
library(ggplot2) #load visualization library
install.packages("ggpubr") #Package for better visualization
library(ggpubr) #load package
SNP Distribution for Maize Across Chromosomes
sort_maize_bin <- sort_maize %>% #Create binned object for sorted maize
filter(Position != "unknown") %>% #Filter out unknown positions
filter(Position != "multiple") %>% #Filter out multiple positions
mutate(position_binned = cut(as.numeric(Position), 10)) %>% #Cut chromosomes into 10 bins and make own category
filter(Chromosome != "multiple") %>% #Filter out multiple chromosomes
filter(Chromosome != "unknown") %>% #Filter out unknown chromosomes
mutate(Chromosome = fct_relevel(Chromosome, c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))) #Use levels to put visuals in chromosome order
maize <- ggplot(data = sort_maize_bin) + #Name visual maize
geom_bar(mapping = aes(x = position_binned, fill = Chromosome)) + #Create bar plot with x as binned position and fill as chromosome #
xlab("Position (basepairs)") + #Update x label to Position (basepairs)
ylab("SNP Distribution") + #Update y label to "SNP Distribution"
theme(axis.text.x = element_text(angle = 90)) + #Make it pretty
facet_grid(~as.numeric(Chromosome)) #Add facet to view chromosome as numeric
maize #Check work
SNP Distribution for Teosinte Across Chromosomes
sort_teosinte_bin <- sort_teosinte %>% #Create binned object for sorted teosinte
filter(Position != "unknown") %>% #Filter out unknown positions
filter(Position != "multiple") %>% #Filter out multiple positions
mutate(position_binned = cut(as.numeric(Position), 10)) %>% #Cut chromosomes into 10 bins and make own category
filter(Chromosome != "multiple") %>% #Filter out multiple chromosomes
filter(Chromosome != "unknown") %>% #Filter out unknown chromosomes
mutate(Chromosome = fct_relevel(Chromosome, c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))) #Use levels to put visuals in chromosome order
teosinte <- ggplot(data = sort_teosinte_bin) + #Name visual teosinte
geom_bar(mapping = aes(x = position_binned, fill = Chromosome)) + #Create bar plot with x as binned position and fill as chromosome #
xlab("Position (basepairs)") + #Update x label to Positions (basepairs)
ylab("SNP Distribution") + #Update y label to SNP Distribution
theme(axis.text.x = element_text(angle = 90)) + #Make it pretty
facet_grid(~as.numeric(Chromosome)) #Add facet to view chromosome as numeric
teosinte #Check work
Combine Maize and Teosinte SNP Distributions Across Chromosomes
SNP_figure <- ggarrange(maize, teosinte,
labels = c("maize", "teosinte"),
ncol = 1, nrow = 2) #Create visual called snp figure that combines maize and teositne
SNP_figure
#View figure of combined maize and teosine SNP distributions per chromsome
ggsave("SNP_figure.pdf", plot = SNP_figure, width = 12, height = 15, units = "in", dpi = 300) #save figure as PDF
ggsave("SNP_figure.png", plot = SNP_figure, width = 12, height = 15, units = "in", dpi = 300) #save figure as png
#There are more SNPs in chromosome 1
Add Multiple and Unknown Chromsomes with SNP Positions
unknown_maize <- sort_maize %>% #Show unknown on each chromosome for maize
filter(Chromosome == "unknown")
multiple_maize <- sort_maize %>% #Show multiple possibilities for each chromosome
filter(Chromosome == "multiple")
view(multiple_maize) #Check work
mchr1 <- nrow(chr1_maize) #Create object for chromosome 1
mchr2 <- nrow(chr2_maize) #Repeat for all chromosomes
mchr3 <- nrow(chr3_maize)
mchr4 <- nrow(chr4_maize)
mchr5 <- nrow(chr5_maize)
mchr6 <- nrow(chr6_maize)
mchr7 <- nrow(chr7_maize)
mchr8 <- nrow(chr8_maize)
mchr9 <- nrow(chr9_maize)
mchr10 <- nrow(chr10_maize)
mchrmult <- nrow(multiple_maize)
mchrunkn <- nrow(unknown_maize)
unknown_teosinte <- sort_teosinte %>% #Show unknown on each chromosome for teosinte
filter(Chromosome == "unknown")
multiple_teosinte <- sort_teosinte %>% #Show multiple possibiliites for each chromosome
filter(Chromosome == "multiple")
tchr1 <- nrow(chr1_teosinte)#Create object for chromosome 1
tchr2 <- nrow(chr2_teosinte) #Repeat for all chromosomes
tchr3 <- nrow(chr3_teosinte)
tchr4 <- nrow(chr4_teosinte)
tchr5 <- nrow(chr5_teosinte)
tchr6 <- nrow(chr6_teosinte)
tchr7 <- nrow(chr7_teosinte)
tchr8 <- nrow(chr8_teosinte)
tchr9 <- nrow(chr9_teosinte)
tchr10 <- nrow(chr10_teosinte)
tchrmult <- nrow(multiple_teosinte)
tchrunkn <- nrow(unknown_teosinte)
SNP_pos_df <- data.frame(chromosome = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "multiple", "unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "multiple", "unknown"),
species = c("maize", "maize", "maize", "maize", "maize", "maize", "maize", "maize", "maize", "maize", "maize", "maize", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte", "teosinte"),
SNP_positions = c(mchr1, mchr2, mchr3, mchr4, mchr5, mchr6, mchr7, mchr8, mchr9, mchr10, mchrmult, mchrunkn, tchr1, tchr2, tchr3, tchr4, tchr5, tchr6, tchr7, tchr8, tchr9, tchr10, tchrmult, tchrunkn)) #Create a dataframe with Chromsome position, species, and SNP Position columns
SNP_pos_df <- SNP_pos_df %>%
mutate(chromosome = fct_relevel(chromosome, c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "multiple", "unknown"))) #Create levels for chromosome numbers to be in order
SNP_pos_bar <- ggplot(data = SNP_pos_df, aes(x = species, y = SNP_positions, fill = species)) + #Create bar plot with species on x, SNP position on y, and fill as species
geom_bar(stat = "identity", position = position_dodge()) + #Make pretty
theme(axis.text.x = element_text(angle = 90)) +
xlab("Species") + #Relabel x axis as Species
ylab("Number of SNP Positions") + #Relabel y axis as Number of SNP Positions
facet_grid(~chromosome)
SNP_pos_bar #Check work
ggsave("SNP_pos_bar.pdf", plot = SNP_pos_bar, width = 10, height = 5, units = "in", dpi = 300) # print to pdf
ggsave("SNP_pos_bar.png", plot = SNP_pos_bar, width = 10, height = 5, units = "in", dpi = 300) # print to png
#SNP distrubtion is equal across maize and teosinte
Zygosity Visualization
geno2 <- read.delim("https://raw.githubusercontent.com/EEOB-BioData/BCB546-Spring2022/main/assignments/UNIX_Assignment/fang_et_al_genotypes.txt", header = TRUE, sep = "\t") #Pull new genotype file to mutate
geno2$homozygous <- apply(geno2, 1, function(x) length(which(x == ("A/A") | x == ("T/T") | x == ("C/C") | x == ("G/G")))) #Search for all homozygous nucleotides
geno2 <- geno2 %>%
mutate(homozygous = (homoA + homoT + homoC + homoG)) #Add columns at end for each homozygous type and total count
geno2$heterozygous <- apply(geno2, 1, function(x) length(which(x == ("A/C") | x == ("A/T") | x == ("A/G") | x == ("C/A") | x == ("C/T") | x == ("C/G") | x == ("G/A") | x == ("G/C") | x == ("G/T") | x == ("T/A") | x == ("T/C") | x == ("T/G")))) #Search for all heterozygous
geno2$missing <- apply(geno2, 1, function(x) length(which(x == ("?/?")))) #Find all missing data
geno_totals <- geno2 %>% #Create new object for geno totals
filter(Group %in% c("ZMMIL", "ZMMLR", "ZMMMR", "ZMPBA", "ZMPIL", "ZMPJA")) %>% #Filter by maize and teosinte groups
mutate(species = fct_recode(Group,
"maize" = "ZMMIL",
"maize" = "ZMMLR",
"maize" = "ZMMMR",
"teosinte" = "ZMPBA",
"teosinte" = "ZMPIL",
"teosinte" = "ZMPJA")) #Create facet to display groups per species
tapply(geno_totals$homozygous, geno_totals$Group, FUN=sum) #prints homozygous totals per group
tapply(geno_totals$heterozygous, geno_totals$Group, FUN=sum) #prints heterozygous total per group
tapply(geno_totals$missing, geno_totals$Group, FUN=sum) #prints missing totals per group
homozygous <- aggregate(geno_totals$homozygous, by = list(Group = geno_totals$Group), FUN = sum) #Aggregates homozygous by group
heterozygous <- aggregate(geno_totals$heterozygous, by = list(Group = geno_totals$Group), FUN = sum) #Aggregates heterozygous by group
missing <- aggregate(geno_totals$missing, by = list(Group = geno_totals$Group), FUN = sum) #Aggregates missing by group
list_zygosity_sums <- c(homozygous, heterozygous, missing) #Creates list of zygosity/missing sums
zygosity_sums <- as.data.frame(list_zygosity_sums) #Makes list a data frame
zygo_sums <- zygosity_sums %>% #Rename
rename(homozygous = x) %>% #Orders homozygous in 2nd column
rename(heterozygous = x.1) %>% #Orders heterozygous in 4th column
rename(missing = x.2) #Orders missing in last column
zygo_sums2 <- zygo_sums %>% #Creates new object
select(Group, homozygous, heterozygous, missing) %>% #Selects just the wanted columns
mutate(species = fct_recode(Group,
"maize" = "ZMMIL",
"maize" = "ZMMLR",
"maize" = "ZMMMR",
"teosinte" = "ZMPBA",
"teosinte" = "ZMPIL",
"teosinte" = "ZMPJA")) #Creates facets for group
zygo_pivot <- zygo_sums2 %>% #Creates object to pivot data
select(Group, species, homozygous, heterozygous, missing) %>% #Select wanted data
pivot_longer(., cols = c(homozygous, heterozygous, missing), names_to = "Zygosity", values_to = "Percent") #Pivots to make columns Group, Species, Zygosity, and Percent
zygo_species <- ggplot(zygo_pivot, aes(fill = Zygosity, y = Percent, x = species)) + #Make bar plot with x as species, y as percent and zyogisty for fill
geom_bar(position="fill", stat="identity") + #Make it pretty
xlab("Species") + #Rename x label as species
ylab("Proportion") #Rename y label as proportion of zygosity
zygo_species #Check work of zygosity per species
ggsave("Zygo_Species.pdf", plot = zygo_species, width = 10, height = 5, units = "in", dpi = 300) # print to pdf
ggsave("Zygo_Species.png", plot = zygo_species, width = 10, height = 5, units = "in", dpi = 300) # print to png
zygo_groups <- ggplot(zygo_pivot, aes(fill = Zygosity, y = Percent, x = Group)) + #Create bar graph based on groups
geom_bar(position="fill", stat="identity") +
xlab("Group") +
ylab("Proportion")
zygo_groups #Check work of zygosity per group
ggsave("Zygo_Groups.pdf", plot = zygo_groups, width = 10, height = 5, units = "in", dpi = 300) # print to pdf
ggsave("Zygo_Groups.png", plot = zygo_groups, width = 10, height = 5, units = "in", dpi = 300) # print to png
Freestyle Visual, GC Content
geno5 <- read.delim("https://raw.githubusercontent.com/EEOB-BioData/BCB546-Spring2022/main/assignments/UNIX_Assignment/fang_et_al_genotypes.txt", header = TRUE, sep = "\t") #Pull new genotype file to mutate
geno5$GC <- apply(geno5, 1, function(x) length(which(x == ("G/C") | x == ("C/G")| x == ("G/G") | x == ("C/C")))) #Filters anything that is certainly a G or a C
geno5$not_GC <- apply(geno5, 1, function(x) length(which(x == ("?/?") | x == ("A/A") | x == ("T/T") | x == ("A/C") | x == ("A/T") | x == ("A/G") | x == ("C/A") | x == ("C/T") | x == ("G/A") | x == ("G/T") | x == ("T/A") | x == ("T/C") | x == ("T/G")))) #Filters anything that is not for sure a G or C
geno_gc <- geno5 %>%
filter(Group %in% c("ZMMIL", "ZMMLR", "ZMMMR", "ZMPBA", "ZMPIL", "ZMPJA")) %>%
mutate(species = fct_recode(Group,
"maize" = "ZMMIL",
"maize" = "ZMMLR",
"maize" = "ZMMMR",
"teosinte" = "ZMPBA",
"teosinte" = "ZMPIL",
"teosinte" = "ZMPJA")) #By filtering for group, we organize by species.
gc_pivot <- geno_gc %>%
select(Group, species, GC, not_GC) %>%
pivot_longer(., cols = c(GC, not_GC), names_to = "GC_content", values_to = "count") #Creates column just for GC content
gc_group_plot <- ggplot(gc_pivot, aes(fill = GC_content, y = count, x = Group)) + #Creates bar plot where group is on the x and count is on the y
xlab("Group") + #Relabel x axis to Group
ylab("Proportion") + #Relabel y axis to Proportion
geom_bar(position = "fill", stat = "identity") #Make Pretty
gc_group_plot #Check work
#There is a similar amount of GC content than non GC per group
gc_species_plot <- ggplot(gc_pivot, aes(fill = GC_content, y = count, x = species)) + #Changes x axis to species
xlab("Species") +
ylab("Proportion") +
geom_bar(position = "fill", stat = "identity")
gc_species_plot #Check work
#GC content is similar among both species
ggsave("GC_Content_Groups.pdf", plot = gc_group_plot, width = 10, height = 5, units = "in", dpi = 300) # print to pdf
ggsave("GC_Content_Groups.png", plot = gc_group_plot, width = 10, height = 5, units = "in", dpi = 300) # print to png
ggsave("GC_Content_Species.pdf", plot = gc_species_plot, width = 10, height = 5, units = "in", dpi = 300) # print to pdf
ggsave("GC_Content_Species.png", plot = gc_species_plot, width = 10, height = 5, units = "in", dpi = 300) # print to png