-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathML_Model.py
More file actions
359 lines (246 loc) · 10.3 KB
/
ML_Model.py
File metadata and controls
359 lines (246 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
"""Test result log analyser
This script allows the user to train or update the model on script execution logs.
This script requires that `sklean, imb-learn,numpy and pandas` be installed within the Python
environment you are running this script in.
This file can also be imported as a module and contains the following
functions:
* train_model - trains model from scratch
* main - the main function of the script
* update_model - updates an existing model
"""
import datetime
import os
from collections import Counter
import time
# external libraries
import joblib
import numpy as np
import seaborn as sn
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks
class Model:
""" Encapsulates classifier related tasks and helps loading and saving model in one go """
def __init__(self):
self.time_stamp = datetime.datetime.now().strftime("%Y_%b_%d_%H_%M")
print('Model Stamp:' + self.time_stamp)
self.clf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, criterion='gini',
n_estimators=30, warm_start=True)
self.vector = HashingVectorizer(n_features=2 ** 22, alternate_sign=False, analyzer='word',
decode_error='ignore', token_pattern=r'\b\w{1,}[^\d\W]+\b',
ngram_range=(2, 2))
# Samplers are not needed during testing
self.samplers = [
TomekLinks(random_state=11, sampling_strategy='majority', n_jobs=-1),
EditedNearestNeighbours(random_state=7, sampling_strategy='majority', n_jobs=-1)
]
def load_model(self, time_stamp):
"""loads the model with provided timestamp
Parameters:
timestamp: time_stamp of the model to be loaded
"""
file_path = os.path.join('models', time_stamp + '_modl.joblib')
if not os.path.exists(file_path):
print('No model file found with stamp: ' + time_stamp)
return
mdl = load_joblib(file_path)
# Uses current time stamp for loaded model
self.clf = mdl.clf
self.vector = mdl.vector
print('Model Loaded: ' + time_stamp)
def fit_transform(self, text_train):
""" Fit and transform text strings to frequency matrix.
Parameters: array of training texts
Returns: matrix of training data
"""
add_to_log('Transforming..')
s_time = time.time()
x_train = self.vector.fit_transform(text_train)
add_to_log('Transformation Time: ' + str(time.time() - s_time))
# Save vector and matrix
self.save_vector()
self.save_matrix(x_train)
return x_train
def under_sample_data(self, matrix, y_train):
"""Remove samples from majority class to address bias in data.
Reduces rows in x_train and y_train.
Parameters:
samples: x_train, labels: y_train
Returns:
updated samples: x_train, labels: y_train
"""
add_to_log('Under Sampling')
add_to_log('Original distribution %s' % Counter(y_train))
s_time = time.time()
x_res = matrix
y_res = y_train
for sampler in self.samplers:
# clean proximity samples using TomeKLinks
x_res, y_res = sampler.fit_resample(x_res, y_res)
add_to_log('Adjusted distribution %s' % Counter(y_res))
add_to_log('Under sampling time: ' + str(time.time() - s_time))
return x_res, y_res
def train_classifier(self, x_train, y_train):
"""Train and save a classifier.
Parameters:
samples: x_train, labels: y_train, classifier: RandomForest
Returns:
relative path to the classifier file inside models directory
"""
add_to_log('Training Model..')
s_time = time.time()
self.clf.fit(x_train, y_train)
add_to_log('Model Trained')
add_to_log('Training Time: ' + str(time.time() - s_time))
clf_path = self.save_classifier()
return clf_path
def update_classifier(self, x_train, y_train):
"""Update and save a classifier.
Parameters:
samples: x_train, labels: y_train, classifier: RandomForest
Returns:
relative path to the classifier file inside models directory
"""
add_to_log('Training Model..')
s_time = time.time()
# Add a new decision tree per 300 samples
new_estimators = len(y_train) // 300
self.clf.n_estimators += new_estimators
add_to_log('New estimators added: ' + str(new_estimators))
self.clf.fit(x_train, y_train)
add_to_log('Classifier Updated')
add_to_log('Training Time: ' + str(time.time() - s_time))
clf_path = self.save_classifier()
return clf_path
def get_predict_prob(self, text):
"""Return class conditional probability for a single sample
Parameters: sample as text
Returns: array of class probabilities
"""
print('Vectorizing..')
x_test = self.vector.fit_transform([text])
y_preds = self.clf.predict_proba(x_test)
print(y_preds[0])
return y_preds[0]
def score_accuracy(self, x_test, y_expec):
"""Scores accuracy of the model.
Parameters: x_test, testing matrix, y_expec: expected labels
"""
add_to_log('Scoring Model..')
y_preds = self.clf.predict(x_test)
acc = np.mean(y_preds == y_expec)
add_to_log('accurary: ' + str(acc))
self.print_confusion_matrix(y_expec, y_preds)
def print_confusion_matrix(self, y_expec, y_preds):
"""Saves confusion matrix in png format.
Parameters: Expected labels and Predicted labels.
"""
clf_type = str(type(self.clf))
clf_name = clf_type.split("'")[1].split('.')[-1]
dir_path = os.path.join(os.getcwd(), 'cnf_mtrx')
file_path = os.path.join(dir_path, self.time_stamp + clf_name)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
conf_mat = confusion_matrix(y_true=y_expec, y_pred=y_preds)
conf_mat_pr = []
for row in conf_mat:
conf_mat_pr.append((row / sum(row)))
add_to_log(conf_mat)
acc = np.mean(y_preds == y_expec)
# The order of labels is important
labels = ['Hardware', 'Other', 'Script', 'Software', 'Tools']
df_cm = DataFrame(conf_mat, index=labels, columns=labels)
df_prec = DataFrame(conf_mat_pr, index=labels, columns=labels)
sns_plot = sn.heatmap(df_cm, annot=True, cmap='Blues', fmt='g')
sns_plot.set_title("Acc: " + str(acc))
plt.savefig(file_path)
plt.figure()
sns_plot = sn.heatmap(df_prec, annot=True, cmap='Blues', fmt='.2%')
sns_plot.set_title("Acc: " + str(acc))
plt.savefig(file_path + '_pr')
plt.figure()
def save_vector(self):
"""Saves vector inside the vectors folder.
Parameters:
model.time_stamp and vector object example: TF-IDF Vectorizer, Hashing Vectorizer etc.
Returns:
relative path to the file inside vectors directory
"""
dir_path = os.path.join(os.getcwd(), 'vectors')
file_path = os.path.join(dir_path, self.time_stamp + '_vctr.joblib')
if not os.path.exists(dir_path):
os.makedirs(dir_path)
joblib.dump(self.vector, file_path)
# Print vector attributes
add_to_log('Vector Saved ' + file_path)
if hasattr(self.vector, 'n_features'):
add_to_log(self.vector.n_features)
else:
add_to_log(len(self.vector.get_feature_names()))
add_to_log(self.vector.token_pattern)
add_to_log(self.vector.ngram_range)
if self.vector.stop_words is not None:
add_to_log('Total Stop Words: ' + str(len(self.vector.stop_words)))
else:
add_to_log('No Stop Words')
return file_path
def save_matrix(self, x_train):
"""Saves matrix inside the matrix folder.s
Parameters:
model.time_stamp and matrix object example: X_train
"""
dir_path = os.path.join(os.getcwd(), 'matrices')
file_path = os.path.join(dir_path, self.time_stamp + '_mtrx.joblib')
if not os.path.exists(dir_path):
os.makedirs(dir_path)
joblib.dump(x_train, file_path)
add_to_log('Matrix Saved ' + file_path)
def save_classifier(self):
"""Saves classifier inside the models folder.s
Parameters:
classifier: classifier object
Returns:
relative path to the file inside models directory
"""
dir_path = os.path.join(os.getcwd(), 'classifiers')
file_path = os.path.join(dir_path, self.time_stamp + '_clfr.joblib')
if not os.path.exists(dir_path):
os.makedirs(dir_path)
joblib.dump(self.clf, file_path)
add_to_log('Classifier Saved ' + file_path)
return file_path
def save_model(self):
""" Saves model in self under model folder """
dir_path = os.path.join(os.getcwd(), 'models')
file_path = os.path.join(dir_path, self.time_stamp + '_modl.joblib')
if not os.path.exists(dir_path):
os.makedirs(dir_path)
joblib.dump(self, file_path)
add_to_log('Model Saved ' + file_path)
def load_joblib(file_path):
"""Loads the joblib file specified by file_path.
Parameters:
file_path (int): path to the file to load
Returns:
Object of the file, example: classifier, selector, vector
"""
obj_file = joblib.load(file_path)
add_to_log('File loaded ' + file_path)
return obj_file
def add_to_log(line):
"""Appends the input to execution_log.txt file and prints as well.
Parameters:
line (string): String to be appended to log.
"""
line = str(line)
with open('execution_log.txt', 'a') as log:
log.write(line)
log.write('\n')
if line == 'Done':
log.write('-' * 50)
log.write('\n')
print(line)