-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstats_module_4.py
More file actions
309 lines (258 loc) · 9.38 KB
/
stats_module_4.py
File metadata and controls
309 lines (258 loc) · 9.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import stats_module as s
import stats_module_2 as s2
import stats_module_3 as s3
import q_dist
import f_dist
import scipy
##########ONE WAY ANOVA##########
def one_way_ANOVA_crd(msc, mse):
'''
mse mean of squares of columns
mse mean of squares of error terms
returns the f value for a completely randomized experimental design
'''
return round(msc / mse, 4)
def ssc_formula(column_means, grand_mean, ns):
'''
column_means list of column_mean
column mean number
ns list of column sizes
PRECONDITION: len(ns) == len(column_means)
PRECONDITION: if rbd design, then len(ns[i]) == len(ns[j]) for any i, j
returns the sum of squares of columns for any experimental design
'''
ssc = 0
i = 0
for i in range(len(column_means)):
ssc += (ns[i] * ((column_means[i] - grand_mean) ** 2))
return round(ssc, 6)
def ssr_formula_rbd(row_means, grand_mean, C):
'''
column_means list of column_mean
column mean number
PRECONDITION: len(ns) == len(column_means)
returns the sum of squares of columns for any experimental design
'''
ssr = 0
i = 0
for i in range(len(row_means)):
ssr += ((row_means[i] - grand_mean) ** 2)
return round(C * ssr, 6)
def sse_formula_using_ANOVA(std_devs, ns):
sse = 0
N = len(ns)
for i in range(N):
sse += (ns[i] - 1) * (std_devs[i] ** 2)
return sse
def msc_formula(ssc, dfc):
'''
ssc sum of squares of columns
dfc degrees of freedom for columns
returns the column mean square for any experimental design
'''
return round(ssc / dfc, 4)
def mse_formula_crd(sse, dfe):
'''
sse sum of squares of error terms
dfc degrees of freedom for error terms
returns the mean squared error for a completely randomized experimental
design
'''
return round(sse / dfe, 4)
def msr_formula_rbd(ssr, dfr):
'''
ssr sum of squares of blocks
dfr degrees of freedom for blocks
returns the row mean squared for randomized block experimental
design
'''
msr = ssr / dfr
return round(msr, 6)
def mse_formula_rbd(sse, dfe):
'''
sse sum of squares of error terms
dfc degrees of freedom for error terms
returns the mean squared error for a randomized block experimental
design
'''
return round(sse / dfe, 6)
def one_way_ANOVA_rbd(msc, mse, msr):
'''
treatment list of treatment_level (Columns)
treatment_level list of member
member number
PRECONDITION: treatment_levels are equal length
returns (f value for treatments, f value for error) for a randomized block
experimental design
'''
f_treatments = msc / mse
f_blocks = msr / mse
return (round(f_treatments, 6), round(f_blocks, 6))
def find_dfc(C):
return C - 1
def find_dfc_tukey(C):
return C
def find_dfe_crd(N, C):
return N - C
def find_dfe_rbd(N, n, C):
return N - n - C + 1
def find_dfr_rbd(n):
return n - 1
def find_df_for_CI_for_y(n):
return n - 2
def HT_checker(actual, inequality, expected):
'''
inequality String "=" <=> H_a: actual/parameter = estimation/value
"<" <=> H_a: actual/parameter < estimation/value
">" <=> H_a: actual/parameter > estimation/value
test Number hypothesis Test Statistic
test_c Number Critical Test Statistic (Test Statistic of
alpha)
returns a String indicating whether the Null Hypothesis was rejected. Compatible
with many kinds of hypothesis tests (z, t, F).
'''
test_c = expected
test = actual
if inequality == ">":
return s2.HT_result_interpreter(test > test_c)
elif inequality == "<":
return s2.HT_result_interpreter(test < test_c)
elif inequality == "=":
return s2.HT_result_interpreter((test < test_c[0]) or (test > test_c[1]))
else:
print("Error: value of inequality must be one of '=', '>', or '<'")
##########MULTIPLE REGRESSION##########
def CI_estimating_y_given_x_0(lsrl, x_0, x_bar, alpha, s_e, n, ss_xx, P=None):
'''
lsrl tuple of numbers (y_intercept, slope)
x_0 number a given x value
x_bar number the overall mean
alpha number significance level
n number sample size
s_e number standard error of the estimate
ss_xx number sum of squares of x
P anything converts to a Prediction Interval if P != None
returns the expected value of y at x_0 at alpha significance level
'''
if P == None:
P = 0
else:
P = 1
b_0 = lsrl[0]
b_1 = lsrl[1]
y_hat = b_0 + (b_1 * x_0)
df = n - 2
t_a = s.alpha_to_t(alpha, df, 2)
pe = y_hat
me = t_a * s_e * s.sqrt(P + (1 / n) + (((x_0 - x_bar) ** 2) / ss_xx))
return (round(pe, 6), round(me, 6))
##########CATEGORICAL DATA ANALYSIS##########
def X_2_goodness_of_fit_test(observations, expectations):
'''
observations List of observed frequencies
expectations List of expected frequencies
frequency Number
returns test statistic to hypothesize whether (observed) multinomial
distribution of categorical variable frequencies matches data from an
(expectated) particular distribution
'''
X_2 = 0
for i in range(len(observations)):
f_o = observations[i]
f_e = expectations[i]
X_2 += ((f_o - f_e) ** 2) / f_e
return X_2
def get_df_X_2_goodness_of_fit_test(k, distribution="uniform"):
'''
k Number number of categories
distribution String "uniform" (default)
"poisson"
"normal"
'''
if distribution == "uniform":
c = 0
elif distribution == "poisson":
c = 1
elif distribution == "normal":
c = 2
return k - 1 - c
def get_expectations_uniform(observations):
n = len(observations)
N = sum(observations)
expected_value = N / n
expectations = []
for i in range(n):
expectations.append(expected_value)
return expectations
def get_expectations_proportions(observations, compared_proportions):
N = sum(observations)
expectations = []
for proportion in compared_proportions:
expectations.append(proportion * N)
return expectations
def X_2_independance_test(contingency_table):
'''
contingency_table List of column
column List of frequency
frequency Number
'''
X_2 = 0
N = get_N_contingency_table(contingency_table)
num_cols = get_row_len(contingency_table)
num_rows = get_col_len(contingency_table)
expected_freqs = get_expected_freqs(contingency_table)
expected_freq_rows = expected_freqs[0]
expected_freq_cols = expected_freqs[1]
for j in range(num_cols): #j = column
for i in range(num_rows): #i = row
f_o = contingency_table[j][i]
n_i = expected_freq_rows[i]
n_j = expected_freq_cols[j]
f_e = expected_freq_value(n_i, n_j, N)
X_2 += ((f_o - f_e) ** 2) / f_e
return round(X_2, 4)
def get_N_contingency_table(contingency_table):
N = 0
for column in contingency_table:
N += sum(column)
return N
def get_expected_freqs(contingency_table):
expected_freq_rows = []
expected_freq_cols = []
num_cols = get_row_len(contingency_table)
num_rows = get_col_len(contingency_table)
row = []
for j in range(num_cols):
expected_freq_cols.append(sum(contingency_table[j]))
transposed = [list(i) for i in zip(*contingency_table)]
for i in range(num_rows):
expected_freq_rows.append(sum(transposed[i]))
return (expected_freq_rows, expected_freq_cols)
def expected_freq_value(n_i, n_j, N):
'''
n_i Number total frequency of row i contingency table elements
n_j Number total frequency of column j contingency table elements
N Number total frequency of contingency table elements
PRECONDITION: the variable in n_row and n_col of the contingency_table
represents independant variables
'''
return round((n_i * n_j) / N, 4)
def get_df_X_2_independance_test(c, r):
return (c - 1) * (r - 1)
def get_row_len(contingency_table):
return len(contingency_table)
def get_col_len(contingency_table):
return len(contingency_table[0])
def how_much_cell_contributes_to_X_2_statistic(contingency_table, row, col):
'''
i Number row index of target cell (minus 1)
j Number column index of target cell (minus 1)
contingency_table List of List of Number
'''
f_o = contingency_table[col][row]
expected_freqs = get_expected_freqs(contingency_table)
n_i = expected_freqs[0][row]
n_j = expected_freqs[1][col]
N = get_N_contingency_table(contingency_table)
f_e = expected_freq_value(n_i, n_j, N)
return ((f_o - f_e) ** 2) / f_e