Stats-via-Rstudio-Spring-2026/Class4_script.R at main · pezantn/Stats-via-Rstudio-Spring-2026 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
### let's get some data
### we will use this data for a lot of examples
### so let's take a good look at it.
flowers=iris

names(flowers)
summary(flowers)
flowers[c(1:4),]

hist(flowers$Sepal.Length[which(flowers$Species=="setosa")])
hist(flowers$Sepal.Length[which(flowers$Species=="versicolor")])
hist(flowers$Sepal.Length[which(flowers$Species=="virginica")])

par(mar=c(2,2,2,2))
par(mfrow=c(3,1))
hist(flowers$Sepal.Length[which(flowers$Species=="setosa")],xlim=c(min(flowers$Sepal.Length),max(flowers$Sepal.Length)))
hist(flowers$Sepal.Length[which(flowers$Species=="versicolor")],xlim=c(min(flowers$Sepal.Length),max(flowers$Sepal.Length)))
hist(flowers$Sepal.Length[which(flowers$Species=="virginica")],xlim=c(min(flowers$Sepal.Length),max(flowers$Sepal.Length)))
dev.off()

### Let's check out those assumptions we talked about
plot(flowers$Sepal.Length[which(flowers$Species=="setosa")])
plot(flowers$Sepal.Length[which(flowers$Species=="versicolor")])
plot(flowers$Sepal.Length[which(flowers$Species=="virginica")])

### let's check for randomness and independence
library(lawstat) # this package contains the runs.test function
runs.test(flowers$Sepal.Length[which(flowers$Species=="setosa")],plot.it = T)
runs.test(flowers$Sepal.Length[which(flowers$Species=="versicolor")],plot.it = T)
runs.test(flowers$Sepal.Length[which(flowers$Species=="virginica")],plot.it = T)

  # p<0.05 means our assumption of randomness IS VIOLATED. We want p>0.05.

#Let's check normality
shapiro.test(flowers$Sepal.Length[which(flowers$Species=="setosa")])
shapiro.test(flowers$Sepal.Length[which(flowers$Species=="versicolor")])
shapiro.test(flowers$Sepal.Length[which(flowers$Species=="virginica")])

  # p<0.05 means out assumption of normality IS VIOLATED. We want p>0.05.

# Let's check homogeneity of variance:
library(car) # this has the levene's test for homogeneity of variance

leveneTest(flowers$Sepal.Length~flowers$Species,data=flowers)


# our data look like a random sample from a normally distributed population
# although the populations have different variances.

### proportion tests
# How does the proportion of setosa sepal lengths greater than 5 compare to 50%
setosa=flowers[which(flowers$Species=="setosa"),]
setosa$gt5=NA
setosa$gt5=setosa$Sepal.Length>5

prop.test(x=sum(setosa$gt5),n=length(setosa$gt5),p=0.5)

# How do the proportions of sepal lengths greater than 5
## compare between setosa and virginica

prop.test(x=c(sum(flowers$Species=="setosa" & flowers$Sepal.Length>5),sum(flowers$Species=="virginica" & flowers$Sepal.Length>5)),
          n=c(sum(flowers$Species=="setosa"),sum(flowers$Species=="virginica")))

### or
virginica=flowers[which(flowers$Species=="virginica"),]
virginica$gt5=NA
virginica$gt5=virginica$Sepal.Length>5

x=c(sum(setosa$gt5),sum(virginica$gt5))
n=c(length(setosa$gt5),length(virginica$gt5))

### how many and which tails are we considering

# is there a difference at all??
prop.test(x,n,alternative = "two.sided")

# is setosa < virginica
prop.test(x,n,alternative = "less")

# is setosa > virginica
prop.test(x,n,alternative = "greater")


### NOTE: Base R does not have a z-test function
### only prop.test()

### What about the sepal lengths themselves
### rather than the proportion greater than 5?

versicolor=flowers[which(flowers$Species=="versicolor"),]

### one sample. compare sample mean to a fixed value.
t.test(versicolor$Sepal.Length,mu=5)
t.test(versicolor$Sepal.Length,mu=5,alternative="less")
t.test(versicolor$Sepal.Length,mu=5,alternative="greater")

### two.sample. compare sample mean A to sample mean B
t.test(setosa$Sepal.Length,versicolor$Sepal.Length,alternative = "two.sided")
t.test(setosa$Sepal.Length,versicolor$Sepal.Length,alternative = "less")
t.test(setosa$Sepal.Length,versicolor$Sepal.Length,alternative = "greater")


### Now let's talk about paired data
x.before=rnorm(30,0,1)
x.after=rnorm(30,2,1)
y.before=rnorm(30,1,1)
y.after=rnorm(30,3,1)

t.test(x.after,y.after)
t.test(x.before,y.before)

x.diff=x.after-x.before
y.diff=y.after-y.before

t.test(x.diff,y.diff)

### let's look at some real data
library(PairedData)
newdata=anorexia
summary(newdata)
newdata$Diff=newdata$Postwt-newdata$Prewt

CBT=newdata[which(newdata$Treat=="CBT"),]
Cont=newdata[which(newdata$Treat=="Cont"),]
FT=newdata[which(newdata$Treat=="FT"),]

### We can check if any of the groups changed weight
t.test(Cont$Diff,mu=0,alternative = "two.sided")
t.test(CBT$Diff,mu=0,alternative = "two.sided")
t.test(FT$Diff,mu=0,alternative = "two.sided")

### We can also check if FT is a more effective treatment that CBT
t.test(CBT$Diff,FT$Diff,alternative = "less")


#### What if our assumptions were WILDLY violated?
### We can use the non-parametric wilcoxon rank sum test.

wilcox.test(x=Cont$Diff,mu=0)
wilcox.test(x=CBT$Diff,y=FT$Diff)