-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathHelper.py
More file actions
98 lines (87 loc) · 3.11 KB
/
Helper.py
File metadata and controls
98 lines (87 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class FeatureItem(object):
    """Thin container that keeps one raw input line.

    The value handed to the constructor is stored unchanged on ``self.arg``.
    """

    def __init__(self, line):
        # Cooperative call up the MRO first, then remember the raw line.
        super(FeatureItem, self).__init__()
        self.arg = line
def fitSklearnAll(X, y, model, multi=False):
    """
    Fit ``model`` on the complete data set and predict on that same data.

    Parameters:
        X: features passed to ``model.fit`` and ``model.predict_proba``.
        y: targets passed to ``model.fit``.
        model: object exposing ``fit`` and ``predict_proba``.
        multi: when true, every column returned by ``predict_proba`` is kept;
            otherwise only column 1 is kept.

    Returns:
        dict holding the predictions under "pred" and the constant 0 under
        "index".
    """
    model.fit(X, y)
    probabilities = model.predict_proba(X)
    if not multi:
        # Binary case: keep only the second probability column.
        probabilities = probabilities[:, 1]
    return {"pred": probabilities, "index": 0}
def fitSklearnCV(X, y, cv, i, model, multi=False):
    """
    Fit ``model`` on one cross-validation fold and predict its held-out rows.

    Kept as a standalone function so that each fold can be dispatched as its
    own job when cross-validation runs in parallel.

    Parameters:
        X: features, indexed positionally through ``.iloc``.
        y: targets, indexed positionally through ``.iloc``.
        cv: indexable collection of folds; ``cv[i][0]`` holds the training
            positions and ``cv[i][1]`` the validation positions.
        i: position of the fold to run.
        model: object exposing ``fit`` and ``predict_proba``.
        multi: when true, every column returned by ``predict_proba`` is kept;
            otherwise only column 1 is kept.

    Returns:
        dict holding the validation predictions under "pred" and the
        validation positions under "index".
    """
    fold = cv[i]
    train_rows = fold[0]
    valid_rows = fold[1]

    model.fit(X.iloc[train_rows], y.iloc[train_rows])
    fold_pred = model.predict_proba(X.iloc[valid_rows])
    if not multi:
        # Binary case: keep only the second probability column.
        fold_pred = fold_pred[:, 1]
    return {"pred": fold_pred, "index": valid_rows}
def trainSklearn(model, grid, train, target, cv, refit=True, n_jobs=5, multi=False):
    """
    Grid-search a sklearn pipeline or model and keep the best parameter set.

    Every combination produced by ``ParameterGrid(grid)`` is applied to
    ``model`` through ``set_params`` and scored: on out-of-fold predictions
    when ``cv`` is truthy, otherwise on predictions for the training data
    itself.  Accuracy is the score when ``multi`` is true, ROC AUC otherwise.

    Parameters:
        model: estimator exposing ``set_params``, ``fit`` and
            ``predict_proba``.
        grid: parameter grid accepted by sklearn's ``ParameterGrid``.
        train: training features; needs ``.shape`` and, when cross-validating,
            positional ``.iloc`` indexing.
        target: training labels.  With ``multi`` the number of classes is read
            from column ``0`` of a two-dimensional target, or from the target
            itself when it is one-dimensional.
        cv: falsy to skip cross-validation.  Otherwise either an iterable of
            ``(train_positions, validation_positions)`` pairs or a splitter
            object exposing ``split(X, y)``.
        refit: when true, the best parameters are set on ``model`` and it is
            fitted on the full training data before returning.  When false the
            model is returned as the last grid iteration left it.
        n_jobs: number of joblib workers used to run the folds.
        multi: true for multi-class targets, false for binary targets.

    Returns:
        tuple ``(best_pred, model)`` where ``best_pred`` holds the predictions
        obtained with the best-scoring parameter combination.

    Raises:
        ValueError: if ``grid`` yields no parameter combination.
    """
    from joblib import Parallel, delayed
    from numpy import zeros
    try:
        # Current location; ``sklearn.grid_search`` only exists in old releases.
        from sklearn.model_selection import ParameterGrid
    except ImportError:
        from sklearn.grid_search import ParameterGrid

    if multi:
        from sklearn.metrics import accuracy_score
        labels = target[0] if getattr(target, "ndim", 1) == 2 else target
        pred = zeros((train.shape[0], labels.unique().shape[0]))
        score_func = accuracy_score
    else:
        from sklearn.metrics import roc_auc_score
        score_func = roc_auc_score
        pred = zeros(train.shape[0])

    # Materialize the folds once so every grid point is scored on exactly the
    # same splits and the splitter is not re-iterated for every single fold.
    use_cv = bool(cv)
    folds = None
    if use_cv:
        if hasattr(cv, "split"):
            folds = list(cv.split(train, target))
        else:
            folds = list(cv)

    best_score = None
    best_pred = None
    best_grid = None
    for g in ParameterGrid(grid):
        model.set_params(**g)
        if use_cv:
            fold_ids = range(len(folds))
            if any("nthread" in name for name in g):
                # The grid sets a thread-count parameter on the model, so the
                # folds run one after another instead of in parallel workers.
                results = [
                    fitSklearnCV(train, target, folds, i, model, multi)
                    for i in fold_ids]
            else:
                results = Parallel(n_jobs=n_jobs)(
                    delayed(fitSklearnCV)(train, target, folds, i, model, multi)
                    for i in fold_ids)
            # Scatter each fold's predictions back to its validation rows.
            for fold in results:
                if multi:
                    pred[fold['index'], :] = fold['pred']
                else:
                    pred[fold['index']] = fold['pred']
        else:
            # No cross-validation: keep the in-sample predictions so they can
            # be scored and returned for both the binary and multi-class case.
            pred = fitSklearnAll(train, target, model, multi)['pred']

        if multi:
            score = score_func(target, pred.argmax(1))
        else:
            score = score_func(target, pred)

        # The first combination is always recorded, so a best grid exists even
        # when no score is above zero.
        if best_score is None or score > best_score:
            best_score = score
            best_pred = pred.copy()
            best_grid = g

    if best_grid is None:
        raise ValueError("grid produced no parameter combinations")

    # Single-argument form prints the same text on Python 2 and Python 3.
    print("Best Score: %0.5f" % best_score)
    print("Best Grid %s" % (best_grid,))
    if refit:
        model.set_params(**best_grid)
        model.fit(train, target)
    return best_pred, model
def loadTrainSet(dir='csv.l'):
    """
    Build the training set from a CSV file.

    The file at ``dir`` is parsed with ``pandas.read_csv``.  Its last column
    is label-encoded to form the target, and the features are every column
    except the first and the last.

    Returns:
        tuple ``(X, y, encoder)``: the feature DataFrame, a one-column
        DataFrame of encoded labels, and the fitted ``LabelEncoder``.
    """
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    raw = pd.read_csv(dir)
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(raw.iloc[:, -1])
    y = pd.DataFrame(encoded_labels)
    # Drop the first column and the label column from the features.
    X = pd.DataFrame(raw.iloc[:, 1:-1])
    return X, y, encoder
def loadTestSet(dir='../data/test.json'):
    """
    Placeholder for loading the test set; not implemented yet.

    The ``dir`` argument is accepted but ignored, and ``None`` is returned.
    """
    return None