intro_stats/stats_module_4.py at main · andyman409/intro_stats · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import stats_module as s
import stats_module_2 as s2
import stats_module_3 as s3
import q_dist
import f_dist
import scipy

##########ONE WAY ANOVA##########

def one_way_ANOVA_crd(msc, mse):
    '''
    mse      mean of squares of columns
    mse      mean of squares of error terms

    returns the f value for a completely randomized experimental design
    '''
    return round(msc / mse, 4)

def ssc_formula(column_means, grand_mean, ns):
    '''
    column_means        list of column_mean
    column mean         number
    ns                  list of column sizes

    PRECONDITION: len(ns) == len(column_means)
    PRECONDITION: if rbd design, then len(ns[i]) == len(ns[j]) for any i, j

    returns the sum of squares of columns for any experimental design
    '''
    ssc = 0
    i = 0
    for i in range(len(column_means)):
        ssc += (ns[i] * ((column_means[i] - grand_mean) ** 2))
    return round(ssc, 6)

def ssr_formula_rbd(row_means, grand_mean, C):
    '''
    column_means        list of column_mean
    column mean         number

    PRECONDITION: len(ns) == len(column_means)

    returns the sum of squares of columns for any experimental design
    '''
    ssr = 0
    i = 0
    for i in range(len(row_means)):
        ssr += ((row_means[i] - grand_mean) ** 2)
    return round(C * ssr, 6)

def sse_formula_using_ANOVA(std_devs, ns):
    sse = 0
    N = len(ns)
    for i in range(N):
        sse += (ns[i] - 1) * (std_devs[i] ** 2)
    return sse

def msc_formula(ssc, dfc):
    '''
    ssc     sum of squares of columns
    dfc     degrees of freedom for columns

    returns the column mean square for any experimental design
    '''
    return round(ssc / dfc, 4)

def mse_formula_crd(sse, dfe):
    '''
    sse     sum of squares of error terms
    dfc     degrees of freedom for error terms

    returns the mean squared error for a completely randomized experimental
    design
    '''
    return round(sse / dfe, 4)

def msr_formula_rbd(ssr, dfr):
    '''
    ssr     sum of squares of blocks
    dfr     degrees of freedom for blocks

    returns the row mean squared for randomized block experimental
    design
    '''
    msr = ssr / dfr
    return round(msr, 6)

def mse_formula_rbd(sse, dfe):
    '''
    sse     sum of squares of error terms
    dfc     degrees of freedom for error terms

    returns the mean squared error for a randomized block experimental
    design
    '''
    return round(sse / dfe, 6)

def one_way_ANOVA_rbd(msc, mse, msr):
    '''
    treatment           list of treatment_level (Columns)
    treatment_level     list of member
    member              number

    PRECONDITION: treatment_levels are equal length

    returns (f value for treatments, f value for error) for a randomized block
    experimental design
    '''
    f_treatments = msc / mse
    f_blocks = msr / mse
    return (round(f_treatments, 6), round(f_blocks, 6))

def find_dfc(C):
    return C - 1

def find_dfc_tukey(C):
    return C

def find_dfe_crd(N, C):
    return N - C

def find_dfe_rbd(N, n, C):
    return N - n - C + 1

def find_dfr_rbd(n):
    return n - 1

def find_df_for_CI_for_y(n):
    return n - 2

def HT_checker(actual, inequality, expected):
    '''
    inequality String     "=" <=> H_a: actual/parameter = estimation/value
                          "<" <=> H_a: actual/parameter < estimation/value
                          ">" <=> H_a: actual/parameter > estimation/value
    test       Number     hypothesis Test Statistic
    test_c     Number     Critical Test Statistic (Test Statistic of
                          alpha)

    returns a String indicating whether the Null Hypothesis was rejected. Compatible
    with many kinds of hypothesis tests (z, t, F).
    '''
    test_c = expected
    test = actual
    if inequality == ">":
        return s2.HT_result_interpreter(test > test_c)
    elif inequality == "<":
        return s2.HT_result_interpreter(test < test_c)
    elif inequality == "=":
        return s2.HT_result_interpreter((test < test_c[0]) or (test > test_c[1]))
    else:
        print("Error: value of inequality must be one of '=', '>', or '<'")


##########MULTIPLE REGRESSION##########


def CI_estimating_y_given_x_0(lsrl, x_0, x_bar, alpha, s_e, n, ss_xx, P=None):
    '''
    lsrl           tuple of numbers (y_intercept, slope)
    x_0            number           a given x value
    x_bar          number           the overall mean
    alpha          number           significance level
    n              number           sample size
    s_e            number           standard error of the estimate
    ss_xx          number           sum of squares of x
    P              anything         converts to a Prediction Interval if P != None

    returns the expected value of y at x_0 at alpha significance level
    '''
    if P == None:
        P = 0
    else:
        P = 1
    b_0 = lsrl[0]
    b_1 = lsrl[1]
    y_hat = b_0 + (b_1 * x_0)
    df = n - 2
    t_a = s.alpha_to_t(alpha, df, 2)
    pe = y_hat
    me = t_a * s_e * s.sqrt(P + (1 / n) + (((x_0 - x_bar) ** 2) / ss_xx))
    return (round(pe, 6), round(me, 6))


##########CATEGORICAL DATA ANALYSIS##########


def X_2_goodness_of_fit_test(observations, expectations):
    '''
    observations    List of observed frequencies
    expectations    List of expected frequencies
    frequency       Number

    returns test statistic to hypothesize whether (observed) multinomial
    distribution of categorical variable frequencies matches data from an
    (expectated) particular distribution
    '''
    X_2 = 0
    for i in range(len(observations)):
        f_o = observations[i]
        f_e = expectations[i]
        X_2 += ((f_o - f_e) ** 2) / f_e
    return X_2

def get_df_X_2_goodness_of_fit_test(k, distribution="uniform"):
    '''
    k            Number     number of categories
    distribution String     "uniform" (default)
                            "poisson"
                            "normal"
    '''
    if distribution == "uniform":
        c = 0
    elif distribution == "poisson":
        c = 1
    elif distribution == "normal":
        c = 2
    return k - 1 - c

def get_expectations_uniform(observations):
    n = len(observations)
    N = sum(observations)
    expected_value = N / n
    expectations = []
    for i in range(n):
        expectations.append(expected_value)
    return expectations

def get_expectations_proportions(observations, compared_proportions):
    N = sum(observations)
    expectations = []
    for proportion in compared_proportions:
        expectations.append(proportion * N)
    return expectations

def X_2_independance_test(contingency_table):
    '''
    contingency_table    List of column
    column               List of frequency
    frequency            Number
    '''
    X_2 = 0
    N = get_N_contingency_table(contingency_table)
    num_cols = get_row_len(contingency_table)
    num_rows = get_col_len(contingency_table)
    expected_freqs = get_expected_freqs(contingency_table)
    expected_freq_rows = expected_freqs[0]
    expected_freq_cols = expected_freqs[1]
    for j in range(num_cols): #j = column
        for i in range(num_rows): #i = row
            f_o = contingency_table[j][i]
            n_i = expected_freq_rows[i]
            n_j = expected_freq_cols[j]
            f_e = expected_freq_value(n_i, n_j, N)
            X_2 += ((f_o - f_e) ** 2) / f_e
    return round(X_2, 4)

def get_N_contingency_table(contingency_table):
    N = 0
    for column in contingency_table:
        N += sum(column)
    return N

def get_expected_freqs(contingency_table):
    expected_freq_rows = []
    expected_freq_cols = []
    num_cols = get_row_len(contingency_table)
    num_rows = get_col_len(contingency_table)
    row = []
    for j in range(num_cols):
        expected_freq_cols.append(sum(contingency_table[j]))
    transposed = [list(i) for i in zip(*contingency_table)]
    for i in range(num_rows):
        expected_freq_rows.append(sum(transposed[i]))
    return (expected_freq_rows, expected_freq_cols)

def expected_freq_value(n_i, n_j, N):
    '''
    n_i   Number    total frequency of row i contingency table elements
    n_j   Number    total frequency of column j contingency table elements
    N     Number    total frequency of contingency table elements
    PRECONDITION: the variable in n_row and n_col of the contingency_table
    represents independant variables
    '''
    return round((n_i * n_j) / N, 4)

def get_df_X_2_independance_test(c, r):
    return (c - 1) * (r - 1)

def get_row_len(contingency_table):
    return len(contingency_table)

def get_col_len(contingency_table):
    return len(contingency_table[0])

def how_much_cell_contributes_to_X_2_statistic(contingency_table, row, col):
    '''
    i                    Number                  row index of target cell (minus 1)
    j                    Number                  column index of target cell (minus 1)
    contingency_table    List of List of Number
    '''
    f_o = contingency_table[col][row]
    expected_freqs = get_expected_freqs(contingency_table)
    n_i = expected_freqs[0][row]
    n_j = expected_freqs[1][col]
    N = get_N_contingency_table(contingency_table)
    f_e = expected_freq_value(n_i, n_j, N)
    return ((f_o - f_e) ** 2) / f_e