-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathTrainClassifier.py
More file actions
73 lines (53 loc) · 2.13 KB
/
TrainClassifier.py
File metadata and controls
73 lines (53 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from sklearn import neural_network, externals, model_selection, metrics, feature_extraction
import csv
import numpy as np
import math
import pickle
class TrainClassifier:
    """Thin wrapper around a scikit-learn MLP classifier: fit on demand, predict on demand."""

    def __init__(self):
        # Placeholder slot for a fitted model; train()/test() do not populate it.
        self.classifier = None

    def train(self, data, labels):
        """Fit a fresh MLPClassifier (max_iter=500) on (data, labels) and return it."""
        mlp = neural_network.MLPClassifier(max_iter=500)
        mlp.fit(data, labels)
        return mlp

    def test(self, data, clf):
        """Return clf's predicted labels for data."""
        predictions = clf.predict(data)
        return predictions
def main():
    """Train and persist an EULA text classifier.

    Reads labelled rows from ``traindata.csv`` (column 0: text, column 1:
    integer label), fits a TF-IDF vectorizer over the whole corpus, runs
    5-fold cross-validation of an MLP classifier printing a confusion
    matrix, accuracy, and micro-F1 per fold, then pickles the fitted
    vectorizer to ``vectorizer.pkl`` and the per-fold classifiers to
    ``EULA_Classifier.pkl``.
    """
    cls = TrainClassifier()
    rawdata = []
    labels = []
    with open('traindata.csv', encoding="utf8") as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            text = str(row[0])
            # Normalize non-breaking spaces so the tokenizer sees plain spaces.
            text = text.replace(u'\xa0', u' ')
            label = int(row[1])
            rawdata.append(text)
            labels.append(label)
    rawdata = np.array(rawdata)
    labels = np.array(labels)
    vectorizer = feature_extraction.text.TfidfVectorizer()
    vect = vectorizer.fit(rawdata)
    # Persist the fitted vectorizer so inference can rebuild identical features.
    with open("vectorizer.pkl", "wb") as v:
        pickle.dump(vect, v)
    data = vectorizer.transform(rawdata)
    kf = model_selection.KFold(n_splits=5)
    classifiers = []
    for trainI, testI in kf.split(data, labels):
        dataTrain, dataTest = data[trainI], data[testI]
        labelTrain, labelTest = labels[trainI], labels[testI]
        clf = cls.train(dataTrain, labelTrain)
        predictLabels = cls.test(dataTest, clf)
        classifiers.append(clf)
        print("Confusion Matrix:\n", metrics.confusion_matrix(labelTest, predictLabels))
        print("Accuracy: ", metrics.accuracy_score(labelTest, predictLabels))
        print("F1 score: ", metrics.f1_score(labelTest, predictLabels, average='micro'))
    # sklearn.externals.joblib was removed in scikit-learn 0.23; dump with
    # pickle directly, consistent with how the vectorizer is saved above.
    with open("EULA_Classifier.pkl", "wb") as out:
        pickle.dump(classifiers, out)


if __name__ == "__main__":
    main()