This repository was archived by the owner on Jan 13, 2020. It is now read-only.
forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPA1_template.Rmd
More file actions
131 lines (101 loc) · 5.51 KB
/
PA1_template.Rmd
File metadata and controls
131 lines (101 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
---
title: "Reproducible Research: Peer Assessment 1"
output:
html_document:
keep_md: true
---
## Loading and preprocessing the data
```{r}
library(ggplot2)
library(scales)
library(dplyr)
# Read the raw step-count measurements from the CSV file under data/.
activity <- read.csv("data/activity.csv")
# Show the first six rows as a quick check that the file loaded as expected.
head(activity, n = 6L)
```
## What is mean total number of steps taken per day?
```{r message=FALSE}
# Preprocess: keep only rows with a recorded step count and parse the date
# column so that ggplot2 can place it on a date scale.
activity_by_day <- activity[!is.na(activity$steps), ]
activity_by_day$date <- as.Date(activity_by_day$date)

# Bar chart of the total number of steps per day: stat_summary() sums all
# rows that share the same date.
# `fun` replaces the deprecated `fun.y` argument (requires ggplot2 >= 3.3.0).
plot <- ggplot(
  data = activity_by_day,
  aes(x = date, y = steps, width = 0.5, color = "steps")
) +
  stat_summary(fun = sum, geom = "bar") +
  scale_x_date(labels = date_format("%Y-%m-%d"), breaks = "1 week") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  ylab("steps (sum)")
plot

# Mean and median of the per-day step totals.
activity_steps_by_day <- activity_by_day %>%
  group_by(date) %>%
  summarize(sum = sum(steps))
mean_steps_per_day <- mean(activity_steps_by_day$sum)
median_steps_per_day <- median(activity_steps_by_day$sum)
print("Median and Mean stats for overall dataset")
print(c("Mean", mean_steps_per_day))
print(c("Median", median_steps_per_day))
```
## What is the average daily activity pattern?
```{r message=FALSE}
# Preprocess: keep only rows with a recorded step count. The date is parsed
# from this frame's own column, so the chunk does not rely on another data
# frame having the same rows in the same order.
activity_by_interval <- activity[!is.na(activity$steps), ]
activity_by_interval$date <- as.Date(activity_by_interval$date)
activity_by_interval <- activity_by_interval %>% group_by(interval)

# Line plot of the mean number of steps for each 5-minute interval, averaged
# over all days.
# `fun` replaces the deprecated `fun.y` argument (requires ggplot2 >= 3.3.0).
plot <- ggplot(
  data = activity_by_interval,
  aes(x = interval, y = steps, width = 0.5)
) +
  stat_summary(fun = mean, geom = "line", aes(color = "mean/5 min")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  ylab("steps (mean)")
plot

# Per-interval mean, then the row with the largest mean. The column keeps the
# name `max` so the printed result has the same header as before.
activity_steps_max_by_interval <- activity_by_interval %>%
  group_by(interval) %>%
  summarize(max = mean(steps))
highest <- activity_steps_max_by_interval[
  which.max(activity_steps_max_by_interval$max),
]
print("The 5-minute interval that, on average, contains the maximum number of steps")
print(highest)
```
## Imputing missing values
```{r message=FALSE}
# Report the number of rows in the original dataset whose step count is NA.
activity_incomplete <- is.na(activity$steps)
sum_activity_incomplete <- sum(activity_incomplete)
print(c("Number of records without a value: ", sum_activity_incomplete))

# Lookup table mapping each 5-minute interval to its mean step count,
# computed from the rows that do have a value.
activity_by_interval_and_mean <- activity_by_interval %>%
  group_by(interval) %>%
  summarize(mean = mean(steps))

# Build the imputed dataset: every missing step count is replaced with the
# mean of its interval. match() finds, for each missing row, the lookup-table
# row with the same interval, so the replacement is done in one vectorized
# assignment instead of a row-by-row loop.
activity_imputed <- activity
activity_imputed$date <- as.Date(activity_imputed$date)
needs_value <- is.na(activity_imputed$steps)
mean_row <- match(
  activity_imputed$interval[needs_value],
  activity_by_interval_and_mean$interval
)
activity_imputed$steps[needs_value] <-
  activity_by_interval_and_mean$mean[mean_row]

# Bar chart of the total number of steps per day, now including imputed rows.
# `fun` replaces the deprecated `fun.y` argument (requires ggplot2 >= 3.3.0).
plot <- ggplot(
  data = activity_imputed,
  aes(x = date, y = steps, width = 0.5, color = "steps")
) +
  stat_summary(fun = sum, geom = "bar") +
  scale_x_date(labels = date_format("%Y-%m-%d"), breaks = "1 week") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  ylab("steps (sum)")
plot

# Mean and median of the per-day step totals for the imputed dataset.
activity_imputed_by_day <- activity_imputed %>%
  group_by(date) %>%
  summarize(sum = sum(steps))
mean_steps_per_day_imputed <- mean(activity_imputed_by_day$sum)
median_steps_per_day_imputed <- median(activity_imputed_by_day$sum)
print("Median and Mean stats for overall dataset (imputed)")
print(c("Mean", mean_steps_per_day_imputed))
print(c("Median", median_steps_per_day_imputed))
print("In contrast to unimputed data there are values available for all days now. Means and Medians are not really affected. That might be due to the overall low amount of missing data (2304 of 17,568)")
```
## Are there differences in activity patterns between weekdays and weekends?
```{r message=FALSE}
# Add a factor marking each row as "Weekday" or "Weekend".
# The day of week is taken from as.POSIXlt()$wday (0 = Sunday, 6 = Saturday)
# rather than from weekdays(), whose day names depend on the session locale
# and would never match "Saturday"/"Sunday" on a non-English system.
activity_imputed_with_weekend_factor <- activity_imputed %>%
  mutate(
    is_weekend = as.factor(
      ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L), "Weekend", "Weekday")
    )
  )

# Split the data into the weekday rows and the weekend rows.
is_weekend_row <- activity_imputed_with_weekend_factor$is_weekend == "Weekend"
activity_imputed_by_weekday <-
  activity_imputed_with_weekend_factor[!is_weekend_row, ]
activity_imputed_by_weekend <-
  activity_imputed_with_weekend_factor[is_weekend_row, ]

# Plot the mean number of steps per interval for weekdays and weekends.
# Both lines are drawn in a single panel to make them easier to compare.
# `fun` replaces the deprecated `fun.y` argument (requires ggplot2 >= 3.3.0).
plot <- ggplot(
  data = activity_imputed_with_weekend_factor,
  aes(x = interval, y = steps, width = 0.5)
) +
  stat_summary(
    data = activity_imputed_by_weekday,
    fun = mean,
    geom = "line",
    aes(color = "mean/weekday")
  ) +
  stat_summary(
    data = activity_imputed_by_weekend,
    fun = mean,
    geom = "line",
    aes(color = "mean/weekend")
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  ylab("steps (mean)")
plot
```