# Team Number : 22
# Name : بيشوي سدرة صابر سدرة ID : 2021170130
# Name : يوسف يسري رضا حليم ID : 2021170655
# Name : فيلوباتير جورج زكريا بسطوروس ID : 2021170397
# Name : اندرو عماد سامي ID : 2021170098
# Name : يوسف رامي يوسف ID : 2021170631
library(caret) # for createDataPartition (train/test splitting)
library(rpart) # for the decision tree model
file <- read.csv("StudentsPerformance.csv")
summary(file)
# count nulls for every column
colSums(is.na(file))
# START PREPROCESSING
# (1) Replacement 'M' -> 'Male' / 'F' -> 'Female'
file$sex[file$sex=="F"]="Female"
file$sex[file$sex=="f"]="Female"
file$sex[file$sex=="M"]="Male"
file$sex[file$sex=="m"]="Male"
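## Quick check that the cleanup left only the two intended levels:
table(file$sex)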
# (2) Handle outliers: cap values outside the IQR fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
handle_outliers <- function(column) {
  Q1 <- quantile(column, 0.25, na.rm = TRUE)
  Q3 <- quantile(column, 0.75, na.rm = TRUE)
  IQR_value <- Q3 - Q1
  outlier_threshold <- 1.5 * IQR_value
  # cap low and high outliers at the fences instead of dropping rows
  column[column < (Q1 - outlier_threshold)] <- Q1 - outlier_threshold
  column[column > (Q3 + outlier_threshold)] <- Q3 + outlier_threshold
  return(column)
}
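## Quick sanity check on a toy vector (illustrative values, not from the dataset):
## here Q1 = 2, Q3 = 4, IQR = 2, so the fences are -1 and 7 and 100 is capped at 7.
handle_outliers(c(1, 2, 3, 4, 100)) # returns 1 2 3 4 7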
file$age <- handle_outliers(file$age)
file$absences <- handle_outliers(file$absences)
file$goout <- handle_outliers(file$goout)
file$studytime <- handle_outliers(file$studytime)
file$failures <- handle_outliers(file$failures)
file$health <- handle_outliers(file$health)
# (3) replace missing values in internet column with the mode
file$internet <- ifelse(file$internet == "", names(sort(table(file$internet), decreasing = TRUE)[1]), file$internet)
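## The one-liner above picks the most frequent level (the mode). An equivalent,
## more readable helper (hypothetical name, a sketch of the same computation):
## get_mode <- function(x) names(which.max(table(x)))
## file$internet[file$internet == ""] <- get_mode(file$internet)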
# (4) Scale values of columns G1, G2, G3
file[, c("G1", "G2", "G3")] <- scale(file[, c("G1", "G2", "G3")])
# (5) feature engineering
file$Total_average <- (file$G1 + file$G2 + file$G3)/3
# (6) feature selection
# drop G1, G2, G3 (replaced by Total_average), the X index column, and failures
# (after capping outliers, failures holds the same value in every record)
columns_to_delete <- c("G1", "G2", "G3", "X", "failures")
file <- file[, !names(file) %in% columns_to_delete]
# (7) Encode categorical variables
file$school <- as.factor(file$school)
file$sex <- as.factor(file$sex)
file$Fjob <- as.factor(file$Fjob)
file$Mjob <- as.factor(file$Mjob)
file$internet <- as.factor(file$internet)
file$romantic <- as.factor(file$romantic)
## Create dummy variables for the factor columns only. Running model.matrix on the
## whole frame would also copy the numeric columns, so cbind would duplicate them
## (including Total_average, leaking the target into the predictors).
factor_cols <- c("school", "sex", "Fjob", "Mjob", "internet", "romantic")
file_dummies <- model.matrix(~ . - 1, data = file[, factor_cols])
## Combine the dummy variables with the remaining columns and drop the originals
file <- cbind(file[, !names(file) %in% factor_cols], file_dummies)
# (8) delete duplicates
file <- unique(file)
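## Simple audit of the dedup step:
cat("Rows remaining after removing duplicates:", nrow(file), "\n")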
# END PREPROCESSING
# display the statistics of the modified dataset
summary(file)
# ============================================================
# START visualization
# Scatterplot matrix
pairs(file[, c("age", "studytime", "absences", "goout", "health", "Total_average")], main = "Scatterplot Matrix")
# plot
plot(file$studytime, file$Total_average, main = "PERFORMANCE [Study Time-Total Average]", xlab = "Study Time", ylab = "Total Average")
plot(file$absences, file$Total_average, main = "PERFORMANCE [Absences-Total Average]", xlab = "Absences", ylab = "Total Average")
plot(file$age, file$Total_average, main = "PERFORMANCE [Age-Total Average]", xlab = "Age", ylab = "Total Average")
plot(file$goout, file$Total_average, main = "PERFORMANCE [Going out-Total Average]", xlab = "Going out", ylab = "Total Average")
plot(file$health, file$Total_average, main = "PERFORMANCE [Health-Total Average]", xlab = "Health", ylab = "Total Average")
# histogram
hist(file$studytime, col="lightblue", xlab = "Study Time", main = "Study Time Histogram")
hist(file$absences, col="lightblue", xlab = "Absences", main = "Absences Histogram")
hist(file$age, col="lightblue", xlab = "Age", main = "Age Histogram")
hist(file$goout, col="lightblue", xlab = "Going out", main = "Going out Histogram")
hist(file$Total_average, col="lightblue", xlab = "Total Average", main = "Total Average Histogram")
# boxplot
boxplot(file$Total_average, col="lightblue", ylab = "Total Average", main = "Total Average Boxplot")
boxplot(file$absences, col="lightblue", ylab = "Absences", main = "Absences Boxplot")
boxplot(file$age, col="lightblue", ylab = "Age", main = "Age Boxplot")
boxplot(file$goout, col="lightblue", ylab = "Going out", main = "Going out Boxplot")
# End Visualization
# ==============================================================
# START Models
# Start Linear Regression
# Prepare data for linear regression modeling
## set a seed so the random train/test split is reproducible across runs
set.seed(123)
## splitting into train and test data
splitIndex <- createDataPartition(file$Total_average, p = 0.7, list = FALSE)
train_data <- file[splitIndex, ]
test_data <- file[-splitIndex, ]
## Shuffle the row order of the training set (the split above is already random,
## and row order does not affect lm, so this step is cosmetic)
shuffled_train_data <- train_data[sample(nrow(train_data)), ]
## Train linear regression model
model <- lm(Total_average ~ ., data = shuffled_train_data)
## Print model summary
summary(model)
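## Standard base-R diagnostics for the fitted lm (residuals vs fitted, Q-Q plot,
## scale-location, leverage); a quick visual check rather than a formal test.
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))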
## Make predictions on the test set
predictions <- predict(model, newdata = test_data)
## Evaluate model performance
### NOTE: The lower the RMSE and MSE, the better the model and its predictions.
### RMSE
linear_rmse <- sqrt(mean((test_data$Total_average - predictions)^2))
### MSE
linear_mse <- mean((test_data$Total_average - predictions)^2)
cat("Root Mean Squared Error (RMSE) for linear regression model:", linear_rmse, "\n")
cat("Mean Squared Error (MSE) for linear regression model:", linear_mse, "\n")
# End Linear Regression
#==========================================================
# Start Decision Tree
## Assuming 'Total_average' is your target variable
target_class_label <- "Total_average"
## Split the data into training and testing sets. Note: this is a fresh random
## split (via sample) rather than the createDataPartition split used above, so
## the two models are not evaluated on identical test rows.
## set a seed so the split is reproducible across runs
set.seed(123)
## splitting into train and test data
train_indices <- sample(seq_len(nrow(file)), floor(0.7 * nrow(file)))
train_data <- file[train_indices, ]
shuffled_train_data <- train_data[sample(nrow(train_data)), ]
test_data <- file[-train_indices, ]
## Shuffle test_data
shuffled_test_data <- test_data[sample(nrow(test_data)), ]
## Train the decision tree model
model <- rpart(formula = as.formula(paste(target_class_label, "~ .")), data = shuffled_train_data)
## Make predictions on the test set
predictions <- predict(model, newdata = shuffled_test_data)
## Evaluate the model (you can use different metrics)
### RMSE
tree_rmse <- sqrt(mean((shuffled_test_data$Total_average - predictions)^2))
### MSE
tree_mse <- mean((shuffled_test_data$Total_average - predictions)^2)
cat("Root Mean Squared Error (RMSE) for decision tree model:", tree_rmse, "\n")
cat("Mean Squared Error (MSE) for decision tree model:", tree_mse, "\n")
## Visualize the decision tree
plot(model)
text(model, cex = 0.5)
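## Optional: a more readable rendering if the rpart.plot package is installed
## (an extra dependency, not required elsewhere in the script).
if (requireNamespace("rpart.plot", quietly = TRUE)) {
  rpart.plot::rpart.plot(model)
}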
# End Decision Tree