-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathhelpers.py
More file actions
170 lines (132 loc) · 5.1 KB
/
helpers.py
File metadata and controls
170 lines (132 loc) · 5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
"""some helper functions."""
import numpy as np
def compute_gradient(y, tx, w):
""" Linear regression using gradient descent. """
e = y - tx.dot(w)
n = len(y)
return -np.dot(tx.T, e) / n
def sigmoid(t):
""" Apply sigmoid function on t. """
return np.exp(-np.logaddexp(0, -t))
def standardize(x, mean_x=None, std_x=None):
""" Standardize the original data set. """
if mean_x is None:
mean_x = np.mean(x, axis=0)
x = x - mean_x
if std_x is None:
std_x = np.std(x, axis=0)
x[:, std_x > 0] = x[:, std_x > 0] / std_x[std_x > 0]
return x, mean_x, std_x
def batch_iter(y, tx, batch_size, num_batches=None, shuffle=True):
"""
Generate a minibatch iterator for a dataset.
Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
Example of use :
for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
<DO-SOMETHING>
"""
data_size = len(y)
num_batches_max = int(np.ceil(data_size/batch_size))
if num_batches is None:
num_batches = num_batches_max
else:
num_batches = min(num_batches, num_batches_max)
if shuffle:
shuffle_indices = np.random.permutation(np.arange(data_size))
shuffled_y = y[shuffle_indices]
shuffled_tx = tx[shuffle_indices]
else:
shuffled_y = y
shuffled_tx = tx
for batch_num in range(num_batches):
start_index = batch_num * batch_size
end_index = min((batch_num + 1) * batch_size, data_size)
if start_index != end_index:
yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]
def build_poly(x, degree):
""" Apply a polynomial basis to all the X features. """
# First, we find the combinations of columns for which we have to
# compute the product
m, n = x.shape
combinations = {}
# Add combinations of same column power
for i in range(n * degree):
if i < n:
combinations[i] = [i]
else:
col_number = i - n
cpt = 2
while col_number >= n:
col_number -= n
cpt += 1
combinations[i] = [col_number] * cpt
# Add combinations of products between columns
cpt = i + 1
for i in range(n):
for j in range(i + 1, n):
combinations[cpt] = [i, j]
cpt = cpt + 1
# Now we can fill a new matrix with the column combinations
eval_poly = np.zeros(
shape=(m, n + len(combinations))
)
for i, c in combinations.items():
eval_poly[:, i] = x[:, c].prod(1)
# Add square root
for i in range(0, n):
eval_poly[:, len(combinations) + i] = np.abs(x[:, i]) ** 0.5
return eval_poly
def add_constant_column(x):
""" Prepend a column of 1 to the matrix. """
return np.hstack((np.ones((x.shape[0], 1)), x))
def na(x):
""" Identifies missing values. """
return np.any(x == -999)
def impute_data(x_train, x_test):
""" Replace missing values (NA) by the most frequent value of the column. """
for i in range(x_train.shape[1]):
# If NA values in column
if na(x_train[:, i]):
msk_train = (x_train[:, i] != -999.)
msk_test = (x_test[:, i] != -999.)
# Replace NA values with most frequent value
values, counts = np.unique(x_train[msk_train, i], return_counts=True)
# If there are values different from NA
if (len(values) > 1):
x_train[~msk_train, i] = values[np.argmax(counts)]
x_test[~msk_test, i] = values[np.argmax(counts)]
else:
x_train[~msk_train, i] = 0
x_test[~msk_test, i] = 0
return x_train, x_test
def process_data(x_train, x_test, add_constant_col=True):
"""
Impute missing data and compute inverse log values of positive columns
"""
# Impute missing data
x_train, x_test = impute_data(x_train, x_test)
inv_log_cols = [0, 2, 5, 7, 9, 10, 13, 16, 19, 21, 23, 26]
# Create inverse log values of features which are positive in value.
x_train_inv_log_cols = np.log(1 / (1 + x_train[:, inv_log_cols]))
x_train = np.hstack((x_train, x_train_inv_log_cols))
x_test_inv_log_cols = np.log(1 / (1 + x_test[:, inv_log_cols]))
x_test = np.hstack((x_test, x_test_inv_log_cols))
x_train, mean_x_train, std_x_train = standardize(x_train)
x_test, mean_x_test, std_x_test = standardize(x_test, mean_x_train, std_x_train)
if add_constant_col is True:
x_train = add_constant_column(x_train)
x_test = add_constant_column(x_test)
return x_train, x_test
def get_jet_masks(x):
"""
Returns 4 masks corresponding to the rows of x with a jet value
of 0, 1, 2 and 3 respectively.
"""
return {
0: x[:, 22] == 0,
1: x[:, 22] == 1,
2: np.logical_or(x[:, 22] == 2, x[:, 22] == 3)
}