feature-engineering-studio · cjc2238 · Oct 13, 2016
diff --git a/.RData b/.RData
diff --git a/.Rhistory b/.Rhistory
diff --git a/CMA Outlier Check.png b/CMA Outlier Check.png
diff --git a/CMA and TMA Side by Side Comparison.png b/CMA and TMA Side by Side Comparison.png
diff --git a/Missing Value Clusters.png b/Missing Value Clusters.png
diff --git a/Missing Values Plot.R b/Missing Values Plot.R
@@ -0,0 +1,73 @@
+####################
+## Load Libraries ##
+####################
+
+library(ggplot2)
+library(dplyr)
+library(reshape2)
+
+#####################
+## Load Data Frame ##
+#####################
+
+## The CSV below is looked up relative to this working directory.
+setwd("~/GitHub/Feature-Engineering-Project/Viz Assignment")
+
+df <- read.csv("Tidy_Data_fff_2014j_Dataset.csv")
+
+############################################
+## Check for missing values in data frame ##
+############################################
+
+## Print the number of NA entries found in each column.
+vapply(df, function(column) sum(is.na(column)), integer(1))
+
+#######################################################################
+## Create new Data frame with only features that have missing values ##
+#######################################################################
+
+## Drop the listed columns; every remaining column is kept for the plots.
+df1 <- subset(
+  df,
+  select = -c(
+    id_student, homepage, gender, region, highest_education, imd_band,
+    age_band, num_of_prev_attempts, studied_credits, disability,
+    final_result, date_registration.x, date_registration.y,
+    date_unregistration.x, date_unregistration.y
+  )
+)
+
+###################################
+## Create missing value function ##
+###################################
+
+## Draw a raster map of missingness for `x`: variables run along the x axis,
+## observations along the y axis, and each cell is shaded by whether it is NA.
+##   x: the data frame (or matrix) to inspect.
+## Returns the ggplot object.
+ggplot_missing <- function(x) {
+  ## melt() turns the logical is.na() matrix into long form; the plot reads
+  ## its Var1 (row), Var2 (column) and value (TRUE when missing) columns.
+  na_long <- melt(is.na(x))
+
+  ggplot(data = na_long, aes(x = Var2, y = Var1)) +
+    geom_raster(aes(fill = value)) +
+    scale_fill_grey(name = "", labels = c("Present", "Missing")) +
+    theme_minimal() +
+    theme(axis.text.x = element_text(angle = 45, vjust = 1)) +
+    labs(x = "Variables in Dataset", y = "Number of Observations")
+}
+
+#########################
+## Plot Missing Values ##
+#########################
+
+## This function shows the severity of missing values for each feature.
+
+ggplot_missing(df1)
+
+## The heatmap function clusters rows and columns, in this case by missingness.
+
+## Multiplying by 1 turns the logical NA matrix into 0/1 values for heatmap().
+heatmap(is.na(df1) * 1, scale = "none")
+
diff --git a/Missing Values.png b/Missing Values.png
diff --git a/Plot Average TMA and CMA Scores.R b/Plot Average TMA and CMA Scores.R
@@ -0,0 +1,74 @@
+####################
+## Load Libraries ##
+####################
+
+library(ggplot2)
+library(dplyr)
+library(reshape2)
+
+#####################
+## Load Data Frame ##
+#####################
+
+setwd("~/GitHub/Feature-Engineering-Project/Viz Assignment")
+
+df <- read.csv("Tidy_Data_fff_2014j_Dataset.csv")
+
+##################################################
+## Create Data Frame of only CMA and TMA Scores ##
+##################################################
+
+## X is used as the x-axis position of each row in the plots below.
+df1 <- subset(df, select = c(X, TMA, CMA))
+
+## Remove N/A Values ##
+
+## na.omit() drops every row that has an NA in any of the three columns.
+df2 <- na.omit(df1)
+
+colnames(df2) <- c("Student_ID", "TMA", "CMA")
+
+########################################
+## Plot average scores on assessments ##
+########################################
+
+## One scatter plot per assessment, each with a smoothed trend line.
+tma_only <- ggplot(df2, aes(Student_ID, TMA)) +
+  geom_point() +
+  geom_smooth(fill = "blue", colour = "darkblue", size = 1) +
+  ggtitle("Tutor Measured Assessments") +
+  labs(x = "Student ID", y = "Score")
+
+cma_only <- ggplot(df2, aes(Student_ID, CMA)) +
+  geom_point() +
+  geom_smooth(fill = "red", colour = "red", size = 1) +
+  ggtitle("Computer Measured Assessments") +
+  labs(x = "Student ID", y = "Score")
+
+########################################
+## Plot both assessments on one image ##
+########################################
+
+## Stack the two score columns into long form (built from df1, so rows with
+## missing scores are still included here).
+TMA <- data.frame(df1$X, df1$TMA)
+colnames(TMA) <- c("X", "Score")
+
+CMA <- data.frame(df1$X, df1$CMA)
+colnames(CMA) <- c("X", "Score")
+
+visuals <- rbind(TMA, CMA)
+
+## rbind() puts all TMA rows first and all CMA rows second, so each label must
+## be repeated once per row of its block. A bare c("TMA", "CMA") would be
+## recycled in alternation and mislabel half of the rows.
+visuals$Assessment <- rep(c("TMA", "CMA"), times = c(nrow(TMA), nrow(CMA)))
+
+cma_and_tma <- ggplot(
+  visuals,
+  aes(X, Score, group = Assessment, col = Assessment)
+) +
+  geom_point() +
+  geom_smooth()
+
+
diff --git a/Plot TMA and CMA Side by Side.R b/Plot TMA and CMA Side by Side.R
@@ -0,0 +1,13 @@
+## Plot images to compare outliers and non outliers ##
+
+library(gridExtra)
+
+## The four plot objects are created by the other scripts, which must be run
+## first: cma_only and tma_only by "Plot Average TMA and CMA Scores.R", and
+## the *_no_outliers pair by "Remove Outlisers and Plot Average TMA and CMA
+## Scores.R".
+grid.arrange(
+  cma_only, cma_only_no_outliers,
+  tma_only, tma_only_no_outliers,
+  ncol = 2
+)
diff --git a/Remove Outlisers and Plot Average TMA and CMA Scores.R b/Remove Outlisers and Plot Average TMA and CMA Scores.R
@@ -0,0 +1,121 @@
+####################
+## Load Libraries ##
+####################
+
+library(ggplot2)
+library(dplyr)
+library(reshape2)
+
+#####################
+## Load Data Frame ##
+#####################
+
+setwd("~/GitHub/Feature-Engineering-Project/Viz Assignment")
+
+df <- read.csv("Tidy_Data_fff_2014j_Dataset.csv")
+
+##################################################
+## Create Data Frame of only CMA and TMA Scores ##
+##################################################
+
+df1 <- subset(df, select = c(X, TMA, CMA))
+
+## Work on a copy so df1 keeps the original scores.
+df3 <- df1
+
+##############################
+## Plot and Remove Outliers ##
+##############################
+
+## Interactive outlier check for one column of a data frame.
+##
+## Draws a 2 x 2 panel (boxplot and histogram, with and without outliers),
+## prints summary figures, then asks whether the outliers should be replaced
+## with NA. Outliers are the values reported by boxplot.stats()$out.
+##
+##   df3: data frame holding the column. Pass it as a plain variable name:
+##        on "y"/"yes" the modified copy is assigned to that name in the
+##        global environment.
+##   var: unquoted name of the column to check.
+##
+## Returns (invisibly) the modified data frame when the answer is "y" or
+## "yes"; otherwise the column with its outliers set to NA, leaving the data
+## frame untouched.
+outlierTMA <- function(df3, var) {
+  var_expr <- substitute(var)
+  values <- eval(var_expr, df3)
+  na_before <- sum(is.na(values))
+  mean_before <- mean(values, na.rm = TRUE)
+
+  ## Restore the caller's graphics layout when the function exits.
+  old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 3, 0))
+  on.exit(par(old_par), add = TRUE)
+
+  boxplot(values, main = "With outliers")
+  hist(values, main = "With outliers", xlab = NA, ylab = NA)
+
+  outlier <- boxplot.stats(values)$out
+  mean_outliers <- mean(outlier)
+  values[values %in% outlier] <- NA
+
+  boxplot(values, main = "Without outliers")
+  hist(values, main = "Without outliers", xlab = NA, ylab = NA)
+  title("Outlier Check", outer = TRUE)
+
+  na_after <- sum(is.na(values))
+  n_outliers <- na_after - na_before
+  mean_after <- mean(values, na.rm = TRUE)
+
+  ## "\n" ends each message with a line break so the figures print one per
+  ## line.
+  cat("Outliers identified:", n_outliers, "\n")
+  ## The percentage is relative to the values left after masking.
+  cat("Proportion (%) of outliers:",
+      round(n_outliers / sum(!is.na(values)) * 100, 1), "\n")
+  cat("Mean of the outliers:", round(mean_outliers, 2), "\n")
+  cat("Mean without removing outliers:", round(mean_before, 2), "\n")
+  cat("Mean if we remove outliers:", round(mean_after, 2), "\n")
+
+  response <- readline(
+    prompt = "Do you want to remove outliers and to replace with NA? [yes/no]: "
+  )
+  if (!response %in% c("y", "yes")) {
+    cat("Nothing changed", "\n")
+    return(invisible(values))
+  }
+
+  df3[as.character(var_expr)] <- values
+  ## Write the result back only when the caller passed a plain variable name;
+  ## any other expression has no single name to assign to.
+  df_expr <- as.list(match.call())$df3
+  if (is.symbol(df_expr)) {
+    assign(as.character(df_expr), df3, envir = .GlobalEnv)
+  }
+  cat("Outliers successfully removed", "\n")
+  invisible(df3)
+}
+
+## The CMA check is the same procedure; the column is chosen by `var`.
+outlierCMA <- outlierTMA
+
+## Answer "y" (or "yes") at each prompt to have the outliers in df3 replaced
+## with NA; any other answer leaves df3 unchanged.
+outlierTMA(df3, TMA)
+outlierCMA(df3, CMA)
+
+########################################
+## Plot average scores on assessments ##
+########################################
+
+tma_only_no_outliers <- ggplot(df3, aes(X, TMA)) +
+  geom_point() +
+  geom_smooth(fill = "blue", colour = "darkblue", size = 1) +
+  ggtitle("Tutor Measured Assessments - No Outliers") +
+  labs(x = "Student ID", y = "Score")
+
+cma_only_no_outliers <- ggplot(df3, aes(X, CMA)) +
+  geom_point() +
+  geom_smooth(fill = "red", colour = "red", size = 1) +
+  ggtitle("Computer Measured Assessments - No Outliers") +
+  labs(x = "Student ID", y = "Score")
+