-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTutoring-Statistical-Analysis.Rmd
More file actions
398 lines (292 loc) · 27.1 KB
/
Tutoring-Statistical-Analysis.Rmd
File metadata and controls
398 lines (292 loc) · 27.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
```{r setup, message = FALSE, warning = FALSE}
library(tidyverse)
library(dplyr)
library(Rmisc)
library(Hmisc)
library(emmeans)
library(gridExtra)
#install.packages('car')
library(car)
options(width = 100)
```
# **Section 1**
## Data Preparation
```{r Uploading Data Tutoring, message = FALSE, warning = FALSE}
# Read the raw student records from the comma-separated text file
tutoring_data <- readr::read_csv(file = "tutoring_test_data.txt")
```
```{r Data Preparation Tutoring, message = FALSE, warning = FALSE, fig.align = 'center', fig.width = 10}
# Inspect the structure and the per-column summary of the raw data
str(tutoring_data)
summary(tutoring_data)
# Check the raw test scores for their distributions and any outliers
grid.arrange(
  ggplot(tutoring_data, aes(score.t1, ..density..)) +
    geom_histogram(binwidth = 1) +
    geom_density() +
    labs(x = "Score of Test 1", y = "Density"),
  ggplot(tutoring_data, aes(score.t2, ..density..)) +
    geom_histogram(binwidth = 1) +
    geom_density() +
    labs(x = "Score of Test 2", y = "Density"),
  nrow = 2
)
# Build the absence distribution plot of the raw data; it is stored rather than printed here
absence_distribution_initial_plot <- ggplot(tutoring_data, aes(absences, ..density..)) +
  geom_histogram(binwidth = 1) +
  geom_density() +
  labs(x = "Proportion of Absences to the Total Class Time", y = "Density")
# Convert the "tutoring" column into a factor with readable labels
tutoring_data$tutoring <- factor(
  tutoring_data$tutoring,
  levels = c("FALSE", "TRUE"),
  labels = c("Non-Tutored", "Tutored")
)
# Keep only rows with absences <= 25, a non-missing test 2 score and both test scores <= 100,
# then keep the first row of every student_ID so duplicate student entries are removed.
# distinct() returns an ungrouped tibble: the earlier group_by(student_ID) + filter() left the
# data grouped per student, which makes any later dplyr summarise()/mutate() run per student.
tutoring_data_new <- tutoring_data %>%
  dplyr::filter(absences <= 25 & !is.na(score.t2) & score.t1 <= 100 & score.t2 <= 100) %>%
  dplyr::distinct(student_ID, .keep_all = TRUE)
# Check the summary again after the clean-up
summary(tutoring_data_new)
# Check the test score distributions after removing implausible and duplicate rows
grid.arrange(
  ggplot(tutoring_data_new, aes(score.t1, ..density..)) +
    geom_histogram(binwidth = 1) +
    geom_density() +
    labs(x = "Score of Test 1", y = "Density"),
  ggplot(tutoring_data_new, aes(score.t2, ..density..)) +
    geom_histogram(binwidth = 1) +
    geom_density() +
    labs(x = "Score of Test 2", y = "Density"),
  nrow = 2
)
# Check the absence distribution after the clean-up
ggplot(tutoring_data_new, aes(absences, ..density..)) +
  geom_histogram(binwidth = 1) +
  geom_density() +
  labs(x = "Absence Rate", y = "Density")
```
```{r Summary Statistics the Tutoring Data, message = FALSE, warning = FALSE, fig.align = 'center', fig.width = 10}
# Means and standard deviations of the test scores and the absence rates per tutoring group
( tutoring_data_summary_by_tutoring <- tutoring_data_new %>%
  dplyr::group_by(tutoring) %>%
  dplyr::summarise(
    mean.t1.score = mean(score.t1),
    mean.t2.score = mean(score.t2),
    mean.absence = mean(absences),
    sd.t1.score = sd(score.t1),
    sd.t2.score = sd(score.t2),
    sd.absence = sd(absences)
  ) )
# The same statistics over all students (one row). ungroup() plus the explicit dplyr:: prefix
# make this independent of any grouping left on the data and of which attached package
# currently masks summarise().
( tutoring_data_summary_overall <- tutoring_data_new %>%
  dplyr::ungroup() %>%
  dplyr::summarise(
    mean.t1.score = mean(score.t1),
    mean.t2.score = mean(score.t2),
    mean.absence = mean(absences),
    sd.t1.score = sd(score.t1),
    sd.t2.score = sd(score.t2),
    sd.absence = sd(absences)
  ) )
# Compare each variable with a normal curve built from the overall mean and sd;
# the vertical line marks the overall mean, and panels are split by tutoring group
grid.arrange(
  ggplot(tutoring_data_new) +
    geom_histogram(aes(x = score.t1, y = ..density..)) +
    geom_vline(xintercept = tutoring_data_summary_overall$mean.t1.score) +
    stat_function(fun = function(x) {
      dnorm(x, mean = tutoring_data_summary_overall$mean.t1.score, sd = tutoring_data_summary_overall$sd.t1.score)
    }) +
    labs(x = "Score of Test 1", y = "Density") +
    facet_wrap(~ tutoring),
  ggplot(tutoring_data_new) +
    geom_histogram(aes(x = score.t2, y = ..density..)) +
    geom_vline(xintercept = tutoring_data_summary_overall$mean.t2.score) +
    stat_function(fun = function(x) {
      dnorm(x, mean = tutoring_data_summary_overall$mean.t2.score, sd = tutoring_data_summary_overall$sd.t2.score)
    }) +
    labs(x = "Score of Test 2", y = "Density") +
    facet_wrap(~ tutoring),
  ggplot(tutoring_data_new) +
    geom_histogram(binwidth = 1.3, aes(x = absences, y = ..density..)) +
    geom_vline(xintercept = tutoring_data_summary_overall$mean.absence) +
    stat_function(fun = function(x) {
      dnorm(x, mean = tutoring_data_summary_overall$mean.absence, sd = tutoring_data_summary_overall$sd.absence)
    }) +
    labs(x = "Absence Rate", y = "Density") +
    facet_wrap(~ tutoring)
)
```
## Checking whether the students allocated to the tutored and non-tutored groups had similar or different average test scores before the tutoring scheme began
```{r Before the Buddying Scheme, message = FALSE, warning = FALSE, fig.align = 'center', fig.width = 10}
# NHST: two-sample t-test comparing test 1 scores between the tutoring groups
( scores.before.t.test <- t.test(score.t1 ~ tutoring, tutoring_data_new) )
### The mean test 1 scores of the two groups are not significantly different from each other, Welch t(196.54) = -1.05, p = 0.30.
# Estimation of test 1 scores by tutoring using linear modelling
scores.before.lm <- lm(score.t1 ~ tutoring, tutoring_data_new)
scores.before.emm <- emmeans(scores.before.lm, ~ tutoring) # Estimated group means of test 1 scores with lower.CL and upper.CL
scores.before.contrast <- confint(pairs(scores.before.emm, reverse = TRUE)) # CI of the pairwise difference between the two groups
### The mean score for Non-tutored students is 52.9 95% CI [50.4–55.4] before the scheme. The mean score for Tutored students is 54.8 95% CI [52.3–57.3] before the scheme. The mean score is 1.88 95% CI [-1.67–5.43] smaller for Non-tutored students compared to Tutored students.
knitr::kable(scores.before.emm, caption = "Mean of Test Scores of The Students Before The Buddying Scheme") # To see the overall more clearly
# Visualising the mean of scores before the buddying scheme and the difference in scores before the buddying scheme
grid.before <- grid.arrange(
  # emmean is mapped to x and tutoring to y, so x must be labelled "Scores" and y "Tutoring";
  # coord_flip() then draws the x aesthetic (scores) on the vertical axis
  ggplot(summary(scores.before.emm), aes(x = emmean, y = tutoring, xmin = lower.CL, xmax = upper.CL)) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    labs(x = "Scores", y = "Tutoring", title = "Mean of Scores Before the Scheme", subtitle = "Error bars are 95% CIs") +
    xlim(-10, 63) +
    coord_flip(),
  ggplot(scores.before.contrast, aes(x = estimate, y = contrast, xmin = lower.CL, xmax = upper.CL)) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    labs(x = "Difference in Scores of Test 1", y = "Contrast", title = "Difference in Scores Before the Scheme", subtitle = "Error bars are 95% CIs") +
    geom_vline(xintercept = 0, lty = 2) +
    xlim(-10, 63) +
    coord_flip(),
  ncol = 2
)
### The Difference in Scores Before the Scheme graph shows that the difference between the groups is not significant since the 95% CI crosses 0
```
## Checking whether the tutored and non-tutored students have similar or different rates of absences on average
```{r Visualising, message = FALSE, warning = FALSE, fig.align = 'center', fig.width = 10}
# Overlaid density histograms of the absence rates, one fill colour per tutoring group,
# with a vertical line and a rotated text label at each group's mean absence rate
ggplot(data = tutoring_data_new, mapping = aes(x = absences, y = ..density.., fill = tutoring)) +
  geom_histogram(alpha = 0.5, position = "identity", binwidth = 1.3) +
  geom_vline(
    mapping = aes(xintercept = mean.absence),
    data = tutoring_data_summary_by_tutoring,
    size = 1.5,
    colour = c("coral4", "deepskyblue4")
  ) +
  geom_text(
    data = tutoring_data_summary_by_tutoring,
    mapping = aes(x = mean.absence, y = 0, label = round(mean.absence, 2), hjust = -2, vjust = 0, angle = 90),
    size = 7
  ) +
  labs(
    title = "Distribution of Absence Rates by Tutoring Groups",
    x = "Absence Rate (%)",
    y = "Density",
    fill = "tutoring"
  )
```
**NHST**
```{r Absences On Average, message = FALSE, warning = FALSE}
# NHST: two-sample t-test comparing the absence proportions of the two tutoring groups
( absences.t.test <- t.test(absences ~ tutoring, data = tutoring_data_new) )
### The mean proportion of absent class time for Non-tutored students is not significantly different from that of Tutored students, Welch t(198) = -0.99, p = 0.33.
```
**Estimation**
```{r Absences On Average 2, message = FALSE, warning = FALSE, fig.align = 'center', fig.width = 10}
# Baseline model: absence proportion explained by tutoring group only
absences.lm <- lm(absences ~ tutoring, data = tutoring_data_new)
# Estimated group means of the absence rates with lower.CL and upper.CL
absences.emm <- emmeans(absences.lm, specs = ~ tutoring)
# CI of the pairwise difference between Tutored and Non-tutored students
absences.contrast <- confint(pairs(absences.emm, reverse = TRUE))
### The mean of the proportion of absent class time for Non-tutored students is 6.31 95% CI [5.63–6.99]. The mean of the proportion of absent class time for Tutored students is 6.79 95% CI [6.11–7.47]. The mean is 0.48 95% CI [-0.48–1.44] smaller for Non-tutored students compared to Tutored students.
# Table of the estimated means
knitr::kable(absences.emm, caption = "Mean of Absences of The Students")
# To see if the linear model can be improved, the following richer models are examined
# Full model: absence proportion from tutoring, score of test 1, and score of test 2
absences.test.scores.lm <- lm(absences ~ tutoring + score.t1 + score.t2, data = tutoring_data_new)
# Variance inflation factors: each predictor's shared variance with the other predictors
vif(absences.test.scores.lm)
### Since VIF score is larger than 5, we should perform anova tests to decide whether it is justified keeping all of the predictors in the model
# Reduced model: absence proportion from tutoring and score of test 1
absences.test.score.t1.lm <- lm(absences ~ tutoring + score.t1, data = tutoring_data_new)
# Estimated group means of the absence rates from the model that also contains test score 1
absences.test.score.t1.emm <- emmeans(absences.test.score.t1.lm, specs = ~ tutoring)
# CI of the pairwise difference between Tutored and Non-tutored students in that model
absences.test.score.t1.contrast <- confint(pairs(absences.test.score.t1.emm, reverse = TRUE))
# Does adding score of test 2 on top of tutoring + score of test 1 improve the fit?
anova(absences.test.scores.lm, absences.test.score.t1.lm)
### Model comparison shows that a regression model including tutoring, score of test 1, and score of test 2 does not result in a significantly better overall fit than a model only including tutoring and score of test 1 F(1,196) = 0.60, p = 0.44.
# Does adding score of test 1 on top of tutoring improve the fit?
anova(absences.lm, absences.test.score.t1.lm)
### Model comparison shows that a regression model including tutoring and score of test 1 results in a significantly better overall fit than a model including only tutoring F(1,198) = 27.45 p<.0001.
# Table of the estimated means from the model with test score 1
knitr::kable(absences.test.score.t1.emm, caption = "Mean of Absence Rates of The Students with 95% CIs By Test Scores")
# Tutoring-only model: estimated mean absence rate per group (left) and the group difference (right)
grid.absences <- grid.arrange(
  ggplot(
    data = summary(absences.emm),
    mapping = aes(x = tutoring, y = emmean, ymin = lower.CL, ymax = upper.CL)
  ) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    ylim(-2, 8) +
    labs(title = "Mean of Absence Rates", subtitle = "Error bars are 95% CIs", x = "Tutoring", y = "Absence Rates"),
  ggplot(
    data = absences.contrast,
    mapping = aes(x = contrast, y = estimate, ymin = lower.CL, ymax = upper.CL)
  ) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    geom_hline(yintercept = 0, lty = 2) +
    ylim(-2, 8) +
    labs(title = "Difference in Absence Rates", subtitle = "Error bars are 95% CIs", x = "Contrast", y = "Difference in Absence Rates"),
  ncol = 2
)
### The Difference in Absence Rates graph shows that the effect of tutoring is not significant on its own since the 95% CI crosses 0
# Model with tutoring and test score 1: the same two panels
grid.absences.main.effect.score.t1 <- grid.arrange(
  ggplot(
    data = summary(absences.test.score.t1.emm),
    mapping = aes(x = tutoring, y = emmean, ymin = lower.CL, ymax = upper.CL)
  ) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    ylim(-2, 8) +
    labs(title = "Mean of Absence Rates By Test Score 1", subtitle = "Error bars are 95% CIs", x = "Tutoring", y = "Absence Rates"),
  ggplot(
    data = absences.test.score.t1.contrast,
    mapping = aes(x = contrast, y = estimate, ymin = lower.CL, ymax = upper.CL)
  ) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    geom_hline(yintercept = 0, lty = 2) +
    ylim(-2, 8) +
    labs(title = "Difference in Absence Rates By Test Score 1", subtitle = "Error bars are 95% CIs", x = "Contrast", y = "Difference in Absence Rates"),
  ncol = 2
)
### Even though taking test score 1 into consideration improved the model, the Difference in Absence Rates By Test Score 1 graph shows that the tutoring difference is still not significant since the 95% CI crosses 0
# Stack both pairs of panels into one figure
grid.arrange(grid.absences, grid.absences.main.effect.score.t1, nrow = 2)
```
## Checking whether the tutored students show an increase in their scores compared to the students who did not receive tutoring
```{r Comparison of Test Scores After the Buddying Scheme, message = FALSE, warning = FALSE}
# Store each student's change in test score (test 2 minus test 1) in a new column called score_change
tutoring_data_new$score_change <- tutoring_data_new$score.t2 - tutoring_data_new$score.t1
```
**NHST**
```{r Comparison of Test Scores After the Buddying Scheme 2, message = FALSE, warning = FALSE}
# NHST: two-sample t-test comparing the score change (test 2 minus test 1) between the tutoring groups
( score.difference.t <- t.test(score_change ~ tutoring, data = tutoring_data_new) )
### The mean score change is significantly larger for Tutored students than for Non-tutored students, Welch t(194) = -5.08, p < .0001.
```
**Estimation**
```{r Comparison of Test Scores After the Buddying Scheme 3, message = FALSE, warning = FALSE, fig.align = 'center', fig.width = 10}
# Estimation of the score change (test 2 minus test 1) by tutoring using linear modelling
score.difference.lm <- lm(score_change ~ tutoring, tutoring_data_new)
score.difference.emm <- emmeans(score.difference.lm, ~ tutoring) # Estimated group means of the score change with lower.CL and upper.CL
score.difference.contrast <- confint(pairs(score.difference.emm, reverse = TRUE)) # CI of the pairwise difference between the two groups
### The mean of the score differences for Non-tutored students is -0.44 95% CI [-1.59–0.71]. The mean of the score differences for Tutored students is 3.77 95% CI [2.61–4.92]. The mean is 4.21 95% CI [2.57–5.84] smaller for Non-tutored students compared to Tutored students.
# Visualization of the mean score change per group and of the difference between the groups
score.difference.plot <- grid.arrange(
  # The plotted value is score.t2 - score.t1, so the axis is labelled as a change between
  # the two tests rather than as a score on test 1
  ggplot(summary(score.difference.emm), aes(x = tutoring, y = emmean, ymin = lower.CL, ymax = upper.CL)) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    labs(y = "Score Change (Test 2 - Test 1)", x = "Tutoring", subtitle = "Error bars are 95% CIs", title = "Mean of Score Differences") +
    ylim(-2, 8),
  ggplot(score.difference.contrast, aes(x = contrast, y = estimate, ymin = lower.CL, ymax = upper.CL)) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    labs(x = "Contrast", y = "Difference in Scores", title = "Difference in Scores", subtitle = "Error bars are 95% CIs") +
    geom_hline(yintercept = 0, lty = 2) +
    ylim(-2, 8),
  ncol = 2
)
### The Difference in Scores graph shows that the effect of tutoring is significant on its own since the 95% CI does not cross 0
```
## Checking whether there are any effects of absences on the change in scores, and whether these have any interaction with the effect of tutoring
```{r Effect of Absences on the Change in Test Scores, message = FALSE, warning = FALSE}
# NHST on whether absences affect the change in test scores, using linear modelling
# Main-effects model: score change from tutoring and absences (no interaction term)
scoreVSabsence.lm <- lm(score_change ~ tutoring + absences, tutoring_data_new)
( nhst.scoreVSabsence <- summary(scoreVSabsence.lm) )
### The results of the regression show that there is a significant main effect of tutoring upon score differences (b = 4.26, t(197) = 5.137, p < 0.0001) but there is not a significant main effect of absence rate upon score difference (b = -0.12, t(197) = -0.984, p = 0.33).
# Interaction model: tutoring, absences and their interaction
scoreVSabsence.lm.interaction <- lm(score_change ~ tutoring * absences, tutoring_data_new)
( scoreVSabsence.interaction <- (summary(scoreVSabsence.lm.interaction)) )
### The results of the regression show that there is not a significant effect of absences upon score difference (b = -0.14, t(196) = -0.782, p = 0.44) but there is a significant effect of tutoring upon score difference (b = 4.04, t(196) = 2.25, p = 0.03). There was no significant interaction effect between absences and tutoring (b = 0.03, t(196) = 0.143, p = 0.89).
# Nested model comparisons. Each result is wrapped in parentheses so the table is printed,
# and the smaller model is always passed first.
# Tutoring only vs. tutoring + absences
( anova.score.diff.scoreVSabsence <- anova(score.difference.lm, scoreVSabsence.lm) )
### Model comparison shows that a regression model including tutoring and absence rate does not result in a significantly better overall fit than a model only including tutoring F(1,197) = 0.97, p = 0.33.
# Tutoring only vs. tutoring * absences (two extra terms, hence 2 numerator df)
( anova.score.diff.scoreVSabsence.interaction <- anova(score.difference.lm, scoreVSabsence.lm.interaction) )
### Model comparison shows that a regression model including tutoring and absence rate with interactions does not result in a significantly better overall fit than a model only including tutoring F(2,196) = 0.49, p = 0.61
# Tutoring + absences vs. tutoring * absences. Stored under its own name so it no longer
# overwrites anova.score.diff.scoreVSabsence.
( anova.scoreVSabsence.main.vs.interaction <- anova(scoreVSabsence.lm, scoreVSabsence.lm.interaction) )
### Model comparison shows that a regression model including tutoring and absence rate with interactions does not result in a significantly better overall fit than a model including tutoring and absence rate without the interaction F(1,196) = 0.02, p = 0.89.
```
---
# **Section 2 - Report**
The findings in this report are based on the examination of the relationship of the test scores and absence proportions of students that had a tutor and did not have a tutor, which will be referred to as "Tutored" and "Non-Tutored", respectively throughout the report.
In this study, among 202 participants, 101 were Tutored and 101 were Non-Tutored students. The data used in this study were student IDs, tutoring information, absence proportions to the total class time, test scores before the Buddying Scheme was implemented (score.t1), and test scores after the Buddying Scheme was implemented (score.t2) of the students.
Prior to any calculations, data were examined for outliers. The second test score had a maximum value of 200 which was omitted since the highest possible score a student can receive is 100. In addition, the maximum value for absence rates was 100 which meant the student did not attend any of the classes which is a highly unlikely situation, hence the distribution of the absence rate was plotted in order to find a plausible limit.
```{r echo = FALSE, include = TRUE, fig.align = 'center', fig.width = 10}
# Show the absence distribution plot that was built from the raw data during data preparation
absence_distribution_initial_plot
```
After examining the graph, the absence variable was limited to 25 and the outliers were removed from the dataset. Once the data were checked again, no other implausible values were found.
## Relationship between being tutored and average test scores prior to the Buddying Scheme
Following the data cleansing, the relationship between being tutored and average test scores prior to the Buddying Scheme was checked as well as the difference in the scores in order to have a better understanding of the student profiles.
```{r echo = FALSE, include = TRUE, fig.align = 'center', fig.width = 10}
# Mean test 1 score per tutoring group (left) and the difference between the groups (right)
grid.arrange(
  # emmean is mapped to x and tutoring to y, so x must be labelled "Scores" and y "Tutoring";
  # coord_flip() then draws the x aesthetic (scores) on the vertical axis
  ggplot(summary(scores.before.emm), aes(x = emmean, y = tutoring, xmin = lower.CL, xmax = upper.CL)) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    labs(x = "Scores", y = "Tutoring", title = "Mean of Scores Before the Scheme", subtitle = "Error bars are 95% CIs") +
    xlim(-10, 63) +
    coord_flip(),
  # The estimate is a difference in test 1 scores, not a difference in score changes
  ggplot(scores.before.contrast, aes(x = estimate, y = contrast, xmin = lower.CL, xmax = upper.CL)) +
    geom_point(colour = "steelblue") +
    geom_linerange() +
    labs(x = "Difference in Scores of Test 1", y = "Contrast", title = "Difference in Scores Before the Scheme", subtitle = "Error bars are 95% CIs") +
    geom_vline(xintercept = 0, lty = 2) +
    xlim(-10, 63) +
    coord_flip(),
  ncol = 2
)
```
A t-test and linear modelling were used. The mean test 1 scores of the two groups are not significantly different from each other, Welch t(196.54) = -1.05, p = 0.30. The mean score for Non-tutored students is 52.9 95% CI [50.4–55.4] before the scheme. The mean score for Tutored students is 54.8 95% CI [52.3–57.3] before the scheme. The mean score is 1.88 95% CI [-1.67–5.43] smaller for Non-tutored students compared to Tutored students.
## Relationship between mean absence rates and tutoring
First, a t-test was conducted.
```{r echo = FALSE, include = TRUE}
# Print the stored t-test of absences by tutoring group
absences.t.test
```
The mean of the proportion of absent class time for Non-tutored students is not significantly different from Tutored students', Welch t(198) = -0.99, p = 0.33.
In order to examine the different CIs, the estimation method was also applied.
```{r echo = FALSE, include = TRUE}
# Print the stored confidence interval of the pairwise difference in absences between the groups
absences.contrast
```
The mean of the absent class time proportion for Non-Tutored students is 6.31 95% CI [5.63–6.99]. The mean of the absent class time proportion for Tutored students is 6.79 95% CI [6.11–7.47]. The mean is 0.48 95% CI [-0.48–1.44] smaller for Non-tutored students compared to Tutored students.
Initially, a linear model was created to predict the absence proportion from tutoring, the score of test 1, and the score of test 2. However, after checking the VIF scores to measure each variable’s shared variance with all other variables, it was observed that some ANOVA tests had to be performed to decide whether it can be justified to keep all of the predictors in the model.
```{r echo = FALSE, include = TRUE}
# Variance inflation factors of the model with tutoring, score.t1 and score.t2 as predictors
vif(absences.test.scores.lm)
```
The models where both test scores are included and where only the first test score is included were compared in order to see if a more complex model is more accurate overall.
```{r echo = FALSE, include = TRUE}
# Compare the model with both test scores against the model with only test score 1
anova(absences.test.scores.lm, absences.test.score.t1.lm)
```
The comparison shows that a regression model including tutoring, the score of test 1, and the score of test 2 does not result in a significantly better overall fit than a model only including tutoring and score of test 1 F(1,196) = 0.60, p = 0.44.
Then, the models where none of the test scores are included and where only the first test score is included were compared in order to see if a more complex model is more accurate overall.
```{r echo = FALSE, include = TRUE}
# Compare the tutoring-only model against the model that adds test score 1
anova(absences.lm, absences.test.score.t1.lm)
```
The comparison shows that a regression model including tutoring and the score of test 1 results in a significantly better overall fit than a model including only tutoring, F(1,198) = 27.45, p < .0001.
The improved model gives the following information.
```{r echo = FALSE, include = TRUE}
# Table of the estimated mean absence rates per tutoring group from the model that includes test score 1
knitr::kable(absences.test.score.t1.emm, caption = "Mean of Absence Rates of The Students with 95% CIs By Test Scores") # To see the overall more clearly
```
```{r echo = FALSE, include = TRUE, fig.align = 'center', fig.width = 10}
# grid.absences.main.effect.score.t1 is the gtable returned by grid.arrange(); auto-printing a
# gtable only writes a text summary of its layout, so draw it explicitly on a fresh page
grid::grid.newpage()
grid::grid.draw(grid.absences.main.effect.score.t1)
```
Even though taking test score 1 into consideration proved to improve the model, the difference in absence rates is not significant.
## Relationship between tutoring and test score differences
Following examining the absence and test score relationships by tutoring information, whether the tutored students show an increase in their scores compared to the students who did not receive tutoring was inspected. Prior to any further analysis, the score difference for each student was calculated.
In order to analyze the relationship between tutoring and test score differences, t-test and linear modelling were used as statistical tools.
```{r echo = FALSE, include = TRUE}
# Print the stored t-test of the score change by tutoring group
score.difference.t
```
```{r echo = FALSE, include = TRUE}
# Print the stored confidence interval of the pairwise difference in score change between the groups
score.difference.contrast
```
The mean score change is significantly larger for Tutored students than for Non-tutored students, Welch t(194) = -5.08, p < .0001. The mean of the score differences for Non-tutored students is -0.44 95% CI [-1.59–0.71]. The mean of the score differences for Tutored students is 3.77 95% CI [2.61–4.92]. The mean is 4.21 95% CI [2.57–5.84] smaller for Non-Tutored students compared to Tutored students.
```{r echo = FALSE, include = TRUE}
# score.difference.plot is the gtable returned by grid.arrange(); auto-printing a gtable only
# writes a text summary of its layout, so draw it explicitly on a fresh page
grid::grid.newpage()
grid::grid.draw(score.difference.plot)
```
## Relationship between absences and test score difference and investigating the interaction with tutoring
Finally, whether there are any effects of absences on the change in scores, and whether these have any interaction with the effect of tutoring were inspected.
```{r echo = FALSE, include = TRUE}
# Print the stored summary of the regression of score change on tutoring and absences
nhst.scoreVSabsence
```
The results of the regression show that there is a significant main effect of tutoring upon score differences (b = 4.26, t(197) = 5.137, p < .0001), but there is not a significant main effect of absence rate upon score differences (b = -0.12, t(197) = -0.984, p = 0.33).
Then, the interaction of tutoring and absences was checked.
```{r echo = FALSE, include = TRUE}
# Print the stored summary of the regression of score change on tutoring, absences and their interaction
scoreVSabsence.interaction
```
The results of the regression show that there is not a significant effect of absences upon score difference (b = -0.14, t(196) = -0.782, p = 0.44) but there is a significant effect of tutoring upon score difference (b = 4.04, t(196) = 2.25, p = 0.03). There was also no significant interaction effect between absences and tutoring (b = 0.03, t(196) = 0.143, p = 0.89), so there is no evidence that the effect of tutoring changes with the absence rate.
After reaching these conclusions, some ANOVA tests were performed to decide whether it is justified to keep all of the predictors in the model. The tests showed that there is no significant relationship between the change in test scores and the increase in the absence rates. Similarly, their interaction effect was found to be not significant and that inclusion of it does not improve the model.