-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbasic-plotting-statistics.Rmd
More file actions
321 lines (294 loc) · 12.1 KB
/
basic-plotting-statistics.Rmd
File metadata and controls
321 lines (294 loc) · 12.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
```{r ttest_example}
# Create example data for comparing two groups
set.seed(123) # Ensure reproducibility
control <- rnorm(30, # Generate control group data
mean = 100, # Set control group mean
sd = 15) # Set control group variation
treatment <- rnorm(30, # Generate treatment group data
mean = 115, # Set treatment group mean
sd = 15) # Match control group variation
# Perform statistical analysis
t_results <- t.test(control, treatment) # Compare groups using t-test
# Create organized results table
tibble(
Measure = c("Mean Difference", # Key statistical measures
"t-statistic",
"p-value",
"95% CI"),
Value = c(
round(diff(c(mean(control), # Calculate mean difference
mean(treatment))), 2),
round(t_results$statistic, 2), # Extract t-statistic
format.pval(t_results$p.value, 3), # Format p-value nicely
paste(round(t_results$conf.int, 2), # Format confidence interval
collapse = " to ")
)
) %>%
knitr::kable(caption = "Statistical Test Results")
# Prepare data for visualization
test_data <- tibble(
value = c(control, treatment), # Combine all measurements
group = rep(c("Control", "Treatment"), # Label each group
each = 30)
)
# Create informative visualizations
# 1. Density plot to show distribution shapes
density_plot <- ggplot(test_data,
aes(x = value,
fill = group)) +
geom_density(alpha = 0.5) + # Semi-transparent density curves
labs(
title = "Group Comparison: Density Plot",
subtitle = paste("p-value =", # Show statistical significance
format.pval(t_results$p.value, 3)),
x = "Measurement Value",
y = "Density"
) +
theme_minimal()
# 2. Box plot with data points for detailed view
box_plot <- ggplot(test_data,
aes(x = group,
y = value,
fill = group)) +
geom_boxplot(alpha = 0.5) + # Show distribution summary
geom_jitter(width = 0.2, # Add individual points
alpha = 0.5) + # Semi-transparent points
labs(
title = "Group Comparison: Box Plot",
subtitle = paste("Mean Difference =", # Show effect size
round(mean(treatment) - mean(control), 2)),
x = "Group",
y = "Measurement Value"
) +
theme_minimal()
# Display both plots side by side
grid.arrange(density_plot, box_plot, ncol = 2)
# Check statistical assumptions
# 1. Normality check using Q-Q plots
qq_plots <- map(list(Control = control, # Create Q-Q plot for each group
Treatment = treatment),
~{
ggplot(data.frame(x = .x),
aes(sample = x)) +
stat_qq() + # Add Q-Q points
stat_qq_line() + # Add reference line
labs(title = paste("Q-Q Plot:",
deparse(substitute(.x)))) +
theme_minimal()
})
# Display Q-Q plots side by side
grid.arrange(grobs = qq_plots, ncol = 2)
# 2. Variance equality check
var_test <- var.test(control, treatment) # Test for equal variances
# Create variance test results table
tibble(
Measure = c("F statistic", # Key measures for variance test
"p-value",
"Ratio of variances"),
Value = c(
round(var_test$statistic, 3), # F-statistic
format.pval(var_test$p.value, 3), # p-value
round(var(treatment)/var(control), 3) # Actual ratio of variances
)
) %>%
knitr::kable(caption = "Variance Test Results")
# 3. Independence check
independence_plot <- tibble(
Order = 1:30, # Observation order
Control = control, # Control group values
Treatment = treatment # Treatment group values
) %>%
pivot_longer(cols = c(Control, Treatment),
names_to = "Group",
values_to = "Value") %>%
ggplot(aes(x = Order,
y = Value,
color = Group)) +
geom_point() + # Show individual measurements
geom_line(alpha = 0.5) + # Connect points to show patterns
facet_wrap(~Group) + # Separate plots by group
labs(
title = "Independence Check",
subtitle = "Looking for patterns in measurement order",
x = "Observation Order",
y = "Measurement Value"
) +
theme_minimal()
# Display independence check plot
independence_plot
```{r scatter_plot}
# Analyze relationships in iris dataset
iris_analysis <- iris %>%
# Calculate correlation between sepal and petal length
summarise(
correlation = cor(Sepal.Length, Petal.Length), # Pearson correlation
p_value = cor.test(Sepal.Length, # Test significance
Petal.Length)$p.value
)
# Create comprehensive scatter plot
ggplot(iris,
aes(x = Sepal.Length, # X-axis variable
y = Petal.Length, # Y-axis variable
color = Species)) + # Color by species
# Add individual data points
geom_point(alpha = 0.7) + # Semi-transparent points
# Add regression lines for each species
geom_smooth(method = "lm", # Linear regression
se = TRUE, # Show confidence interval
alpha = 0.2) + # Transparent confidence bands
# Add informative labels
labs(
title = "Relationship between Sepal and Petal Length",
subtitle = paste("Overall correlation =", # Show correlation
round(iris_analysis$correlation, 3)),
x = "Sepal Length (cm)",
y = "Petal Length (cm)"
) +
# Add regression equations
stat_smooth(method = "lm", # Add equation for each species
geom = "text",
aes(label = paste("R² =",
round(stat(r.squared), 3))),
hjust = 0,
vjust = -1) +
theme_minimal() +
theme(legend.position = "bottom") # Move legend to bottom
## Practice Exercise Solutions
### Solution 1: Basic Data Exploration
```{r solution1}
# Comprehensive summary of mtcars dataset
car_summary <- mtcars %>%
summarise(across(
everything(), # Apply to all variables
list(
mean = ~mean(.x), # Calculate mean
sd = ~sd(.x), # Calculate std deviation
min = ~min(.x), # Find minimum
max = ~max(.x) # Find maximum
),
.names = "{.col}_{.fn}" # Name format: var_stat
)) %>%
pivot_longer(everything(), # Convert to long format
names_to = c("variable", "stat"), # Split names
names_sep = "_") %>%
pivot_wider(names_from = stat, # Reshape for presentation
values_from = value)
# Display formatted summary table
knitr::kable(car_summary,
caption = "Summary Statistics for Car Features",
digits = 2) # Round to 2 decimals
# Create distribution visualization
ggplot(mtcars, aes(x = wt)) +
# Add histogram with density scaling
geom_histogram(aes(y = after_stat(density)), # Modern syntax for density
fill = "skyblue",
color = "white",
bins = 30) +
# Add density curve
geom_density(color = "red", # Add smooth curve
linewidth = 1) +
# Add mean line
geom_vline(aes(xintercept = mean(wt)), # Show average
color = "darkred",
linetype = "dashed") +
# Add informative labels
labs(
title = "Distribution of Car Weights",
subtitle = paste("Mean =", round(mean(mtcars$wt), 2),
"±", round(sd(mtcars$wt), 2), "thousand lbs"),
x = "Weight (thousand lbs)",
y = "Density"
) +
theme_minimal()
```
### Solution 2: Group Comparisons
```{r solution2}
# Prepare data for analysis
cylinder_stats <- mtcars %>%
group_by(cyl) %>% # Group by cylinder count
summarise(
n = n(), # Count cars per group
mean_mpg = mean(mpg), # Calculate mean MPG
sd_mpg = sd(mpg), # Calculate std deviation
se_mpg = sd_mpg/sqrt(n) # Calculate std error
)
# Create comprehensive visualization
ggplot(mtcars, aes(x = factor(cyl), # Convert cylinders to factor
y = mpg,
fill = factor(cyl))) +
# Add violin plot for distribution shape
geom_violin(alpha = 0.3) + # Show distribution shape
# Add box plot for summary statistics
geom_boxplot(width = 0.2, # Narrow box width
alpha = 0.7) + # Semi-transparent
# Add individual points
geom_jitter(width = 0.1, # Minimal horizontal jitter
alpha = 0.5) + # Semi-transparent points
# Add mean ± SE
geom_pointrange(data = cylinder_stats, # Add error bars
aes(y = mean_mpg,
ymin = mean_mpg - se_mpg,
ymax = mean_mpg + se_mpg),
color = "darkred") +
# Add informative labels
labs(
title = "Car Mileage by Number of Cylinders",
subtitle = "Showing distribution, box plot, and mean ± SE",
x = "Number of Cylinders",
y = "Miles Per Gallon",
fill = "Cylinders"
) +
theme_minimal()
# Perform statistical analysis
# 1. ANOVA test
anova_result <- aov(mpg ~ factor(cyl), data = mtcars)
# 2. Tukey's HSD for pairwise comparisons
tukey_result <- TukeyHSD(anova_result)
# Display ANOVA results
knitr::kable(broom::tidy(anova_result),
caption = "ANOVA Results",
digits = 3)
# Display Tukey results
knitr::kable(broom::tidy(tukey_result),
caption = "Tukey's HSD Pairwise Comparisons",
digits = 3)
```
### Solution 3: Relationship Analysis
```{r solution3}
# Perform correlation analysis
cor_result <- cor.test(mtcars$wt, mtcars$mpg) # Test correlation
# Create comprehensive scatter plot
ggplot(mtcars, aes(x = wt, y = mpg)) +
# Add individual points
geom_point(aes(size = disp), # Size points by displacement
alpha = 0.6) + # Semi-transparent
# Add regression line with confidence interval
geom_smooth(method = "lm", # Linear regression
color = "red",
alpha = 0.2) + # Transparent confidence band
# Add correlation information
annotate("text",
x = min(mtcars$wt), # Position at left
y = max(mtcars$mpg), # Position at top
label = paste("r =", # Show correlation
round(cor_result$estimate, 3),
"\np <", format.pval(cor_result$p.value, 3)),
hjust = 0, # Left-align text
vjust = 1) + # Top-align text
# Add informative labels
labs(
title = "Relationship between Weight and Mileage",
subtitle = "Point size indicates engine displacement",
x = "Weight (thousand lbs)",
y = "Miles Per Gallon",
size = "Displacement"
) +
theme_minimal()
# Create summary statistics table
model <- lm(mpg ~ wt, data = mtcars) # Fit linear model
model_summary <- broom::tidy(model) # Extract model statistics
# Display regression results
knitr::kable(model_summary,
caption = "Linear Regression Results",
digits = 3)
```