Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .RData
Binary file not shown.
447 changes: 447 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

Binary file added CMA Outlier Check.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added CMA and TMA Side by Side Comparison.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Missing Value Clusters.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
61 changes: 61 additions & 0 deletions Missing Values Plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
####################
## Load Libraries ##
####################

library(ggplot2)
library(dplyr)
library(reshape2)

#####################
## Load Data Frame ##
#####################

## Work from the assignment folder so the relative file name below resolves.
setwd("~/GitHub/Feature-Engineering-Project/Viz Assignment")

df <- read.csv("Tidy_Data_fff_2014j_Dataset.csv")

############################################
## Check for missing values in data frame ##
############################################

## Count the NAs column by column. vapply() walks the data frame's columns
## directly (apply() would first coerce the whole frame to a matrix) and
## returns a named integer vector, one entry per column.
vapply(df, function(column) sum(is.na(column)), integer(1))

#######################################################################
## Create new Data frame with only features that have missing values ##
#######################################################################

## Drop the listed columns; every remaining column is kept for the
## missingness plots.
df1 <- subset(
  df,
  select = -c(
    id_student, homepage, gender, region, highest_education, imd_band,
    age_band, num_of_prev_attempts, studied_credits, disability,
    final_result, date_registration.x, date_registration.y,
    date_unregistration.x, date_unregistration.y
  )
)

###################################
## Create missing value function ##
###################################

#' Plot which cells of a data set are missing
#'
#' Converts `x` to a logical missingness matrix with `is.na()`, melts it to
#' long format (columns `Var1` = row, `Var2` = variable, `value` = is
#' missing) and draws it as a raster, one tile per cell.
#'
#' @param x A data frame or matrix to inspect.
#' @return A ggplot object; print it to draw the plot.
ggplot_missing <- function(x) {
  x %>%
    is.na() %>%
    melt() %>%
    ggplot(
      data = .,
      aes(x = Var2, y = Var1)
    ) +
    geom_raster(aes(fill = value)) +
    scale_fill_grey(
      name = "",
      # Name the labels by the value they describe. Positional labels are
      # matched to whatever levels happen to be present, so data with no
      # missing values (or only missing values) would be mislabelled.
      labels = c("FALSE" = "Present", "TRUE" = "Missing")
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1)) +
    labs(
      x = "Variables in Dataset",
      y = "Number of Observations"
    )
}

#########################
## Plot Missing Values ##
#########################

## Raster of every cell in df1, shaded by whether the value is missing, to
## show how severe the missingness is for each feature.

ggplot_missing(df1)

## heatmap() clusters rows and columns; here the input is the 0/1
## missingness indicator, so the clustering is by missingness pattern.

heatmap(1 * is.na(df1), scale = "none")

Binary file added Missing Values.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
53 changes: 53 additions & 0 deletions Plot Average TMA and CMA Scores.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
####################
## Load Libraries ##
####################

library(ggplot2)
library(dplyr)
library(reshape2)

#####################
## Load Data Frame ##
#####################

## Work from the assignment folder so the relative file name below resolves.
setwd("~/GitHub/Feature-Engineering-Project/Viz Assignment")

df <- read.csv("Tidy_Data_fff_2014j_Dataset.csv")

##################################################
## Create Data Frame of only CMA and TMA Scores ##
##################################################

## Keep the row index column X together with the two score columns.
df1 <- df[c("X", "TMA", "CMA")]

## Drop every row with a missing value, then relabel X as the student
## identifier used on the plot axes.
df2 <- setNames(na.omit(df1), c("Student_ID", "TMA", "CMA"))

########################################
## Plot average scores on assessments ##
########################################

## Scatter of TMA scores per student with a smoothed trend line.
tma_only <- ggplot(df2, aes(x = Student_ID, y = TMA)) +
  geom_point() +
  geom_smooth(fill = "blue", colour = "darkblue", size = 1) +
  labs(
    title = "Tutor Measured Assessments",
    x = "Student ID",
    y = "Score"
  )

## Same layout for the CMA scores, drawn in red.
cma_only <- ggplot(df2, aes(x = Student_ID, y = CMA)) +
  geom_point() +
  geom_smooth(fill = "red", colour = "red", size = 1) +
  labs(
    title = "Computer Measured Assessments",
    x = "Student ID",
    y = "Score"
  )

########################################
## Plot both assessments on one image ##
########################################

## Build one long data frame (one row per student per assessment) so both
## assessments can be drawn on a single plot.
TMA <- data.frame(X = df1$X, Score = df1$TMA)
CMA <- data.frame(X = df1$X, Score = df1$CMA)

visuals <- rbind(TMA, CMA)

## rbind() stacks all TMA rows above all CMA rows, so each label must be
## repeated once per row of its source frame. A bare c("TMA", "CMA") would
## be recycled alternately and mislabel half of every group.
visuals$Assessment <- rep(c("TMA", "CMA"), times = c(nrow(TMA), nrow(CMA)))

cma_and_tma <- ggplot(
  visuals,
  aes(X, Score, group = Assessment, col = Assessment)
) +
  geom_point() +
  geom_smooth()


5 changes: 5 additions & 0 deletions Plot TMA and CMA Side by Side.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
## Compare each assessment with and without outliers: one row per
## assessment, original plot on the left and outlier-free plot on the right.
## The four plot objects must already exist in the session.

library(gridExtra)

grid.arrange(
  cma_only, cma_only_no_outliers,
  tma_only, tma_only_no_outliers,
  ncol = 2
)
103 changes: 103 additions & 0 deletions Remove Outlisers and Plot Average TMA and CMA Scores.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
####################
## Load Libraries ##
####################

library(ggplot2)
library(dplyr)
library(reshape2)

#####################
## Load Data Frame ##
#####################

## Work from the assignment folder so the relative file name below resolves.
setwd("~/GitHub/Feature-Engineering-Project/Viz Assignment")

df <- read.csv("Tidy_Data_fff_2014j_Dataset.csv")

##################################################
## Create Data Frame of only CMA and TMA Scores ##
##################################################

## Keep the row index column X together with the two score columns.
df1 <- df[c("X", "TMA", "CMA")]

## Working copy that the outlier functions below may overwrite.
df3 <- df1

##############################
## Plot and Remove Outliers ##
##############################

#' Inspect one column for boxplot outliers and optionally blank them out
#'
#' Draws a 2 x 2 panel (boxplot and histogram, before and after removing the
#' values flagged by `boxplot.stats()`), prints summary figures, and then
#' asks on the console whether the outliers should be replaced with NA.
#'
#' @param df3 A data frame. Pass it as a plain variable name: on confirmation
#'   the modified data frame is also written to a global variable of that
#'   name.
#' @param var Unquoted name of the column of `df3` to check.
#' @return Invisibly, the modified data frame when the answer is "y" or
#'   "yes"; otherwise the column as a vector with the outliers set to NA
#'   (the data frame itself is left untouched).
outlierTMA <- function(df3, var) {
  # Capture the caller's expressions before `df3` is modified locally.
  column_expr <- substitute(var)
  frame_expr <- as.list(match.call())$df3

  scores <- eval(column_expr, eval(df3))
  na_before <- sum(is.na(scores))
  mean_before <- mean(scores, na.rm = TRUE)

  # Switch to a 2 x 2 layout with room for an outer title, and put the
  # previous graphics settings back when the function exits.
  previous_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(previous_par), add = TRUE)

  boxplot(scores, main = "With outliers")
  hist(scores, main = "With outliers", xlab = NA, ylab = NA)

  outlier <- boxplot.stats(scores)$out
  mean_outliers <- mean(outlier)
  scores[scores %in% outlier] <- NA

  boxplot(scores, main = "Without outliers")
  hist(scores, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  na_after <- sum(is.na(scores))
  removed <- na_after - na_before
  cat("Outliers identified:", removed, "\n")
  # The denominator is the number of non-missing values left after removal.
  cat("Propotion (%) of outliers:",
      round(removed / sum(!is.na(scores)) * 100, 1), "\n")
  cat("Mean of the outliers:", round(mean_outliers, 2), "\n")
  mean_after <- mean(scores, na.rm = TRUE)
  cat("Mean without removing outliers:", round(mean_before, 2), "\n")
  cat("Mean if we remove outliers:", round(mean_after, 2), "\n")

  response <- readline(
    prompt = "Do you want to remove outliers and to replace with NA? [yes/no]: "
  )

  if (response %in% c("y", "yes")) {
    df3[as.character(column_expr)] <- scores
    # Write back to the caller's global variable only when the argument was
    # a plain name; an arbitrary expression has no variable to overwrite.
    if (is.name(frame_expr)) {
      assign(as.character(frame_expr), df3, envir = .GlobalEnv)
    }
    cat("Outliers successfully removed", "\n")
    return(invisible(df3))
  }

  cat("Nothing changed", "\n")
  invisible(scores)
}

#' Inspect one column for boxplot outliers and optionally blank them out
#'
#' Draws a 2 x 2 panel (boxplot and histogram, before and after removing the
#' values flagged by `boxplot.stats()`), prints summary figures, and then
#' asks on the console whether the outliers should be replaced with NA.
#'
#' @param df3 A data frame. Pass it as a plain variable name: on confirmation
#'   the modified data frame is also written to a global variable of that
#'   name.
#' @param var Unquoted name of the column of `df3` to check.
#' @return Invisibly, the modified data frame when the answer is "y" or
#'   "yes"; otherwise the column as a vector with the outliers set to NA
#'   (the data frame itself is left untouched).
outlierCMA <- function(df3, var) {
  # Capture the caller's expressions before `df3` is modified locally.
  column_expr <- substitute(var)
  frame_expr <- as.list(match.call())$df3

  scores <- eval(column_expr, eval(df3))
  na_before <- sum(is.na(scores))
  mean_before <- mean(scores, na.rm = TRUE)

  # Switch to a 2 x 2 layout with room for an outer title, and put the
  # previous graphics settings back when the function exits.
  previous_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
  on.exit(par(previous_par), add = TRUE)

  boxplot(scores, main = "With outliers")
  hist(scores, main = "With outliers", xlab = NA, ylab = NA)

  outlier <- boxplot.stats(scores)$out
  mean_outliers <- mean(outlier)
  scores[scores %in% outlier] <- NA

  boxplot(scores, main = "Without outliers")
  hist(scores, main = "Without outliers", xlab = NA, ylab = NA)
  title("Outlier Check", outer = TRUE)

  na_after <- sum(is.na(scores))
  removed <- na_after - na_before
  cat("Outliers identified:", removed, "\n")
  # The denominator is the number of non-missing values left after removal.
  cat("Propotion (%) of outliers:",
      round(removed / sum(!is.na(scores)) * 100, 1), "\n")
  cat("Mean of the outliers:", round(mean_outliers, 2), "\n")
  mean_after <- mean(scores, na.rm = TRUE)
  cat("Mean without removing outliers:", round(mean_before, 2), "\n")
  cat("Mean if we remove outliers:", round(mean_after, 2), "\n")

  response <- readline(
    prompt = "Do you want to remove outliers and to replace with NA? [yes/no]: "
  )

  if (response %in% c("y", "yes")) {
    df3[as.character(column_expr)] <- scores
    # Write back to the caller's global variable only when the argument was
    # a plain name; an arbitrary expression has no variable to overwrite.
    if (is.name(frame_expr)) {
      assign(as.character(frame_expr), df3, envir = .GlobalEnv)
    }
    cat("Outliers successfully removed", "\n")
    return(invisible(df3))
  }

  cat("Nothing changed", "\n")
  invisible(scores)
}

## Each call draws its diagnostic plots and then prompts on the console.
## Answer "y" or "yes" at the prompt to overwrite the global df3 with the
## outliers replaced by NA. Run the calls one at a time in an interactive
## session so each prompt is answered from the keyboard; the answer must not
## be written into the script, where it would be evaluated as an object name.
outlierTMA(df3, TMA)
outlierCMA(df3, CMA)

########################################
## Plot average scores on assessments ##
########################################

## Scatter of TMA scores from df3 with a smoothed trend line.
tma_only_no_outliers <- ggplot(df3, aes(x = X, y = TMA)) +
  geom_point() +
  geom_smooth(fill = "blue", colour = "darkblue", size = 1) +
  labs(
    title = "Tutor Measured Assessments - No Outliers",
    x = "Student ID",
    y = "Score"
  )

## Same layout for the CMA scores, drawn in red.
cma_only_no_outliers <- ggplot(df3, aes(x = X, y = CMA)) +
  geom_point() +
  geom_smooth(fill = "red", colour = "red", size = 1) +
  labs(
    title = "Computer Measured Assessments - No Outliers",
    x = "Student ID",
    y = "Score"
  )

Loading