-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmulticlass_classifier.py
More file actions
138 lines (112 loc) · 4.67 KB
/
multiclass_classifier.py
File metadata and controls
138 lines (112 loc) · 4.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
__author__ = 'Federico'
# Multiclass Naive-Bayes classifier for categorization of WoN e-mail dataset
# It uses MultinomialNB classifier
from collections import Counter

from numpy import *
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from tools.tensor_utils import read_input_tensor, SparseTensor
# Get the input from a folder in C:
def get_example_data():
header_file = 'C:/Users/Federico/Desktop/test/evaluation/tensor_content_NEW/headers.txt'
data_file_prefix = 'C:/Users/Federico/Desktop/test/evaluation/tensor_content_NEW'
data_files = [data_file_prefix + "/connection.mtx",
data_file_prefix + "/needtype.mtx",
data_file_prefix + "/subject.mtx",
data_file_prefix + "/content.mtx",
data_file_prefix + "/category.mtx"]
slices = [SparseTensor.CONNECTION_SLICE, SparseTensor.NEED_TYPE_SLICE, SparseTensor.ATTR_SUBJECT_SLICE,
SparseTensor.ATTR_CONTENT_SLICE, SparseTensor.CATEGORY_SLICE]
tensor = read_input_tensor(header_file, data_files, slices, False)
data = []
target = []
# Store the chosen input into lists.
# The "if" statement is meant to include only samples with a single category (No multilabel)
for need_index in tensor.getNeedIndices():
content = ""
categories = tensor.getAttributesForNeed(need_index, SparseTensor.CATEGORY_SLICE)
numCategories = len(categories)
if numCategories >= 1:
category_index = tensor.getSliceMatrix(SparseTensor.CATEGORY_SLICE)[need_index,].nonzero()[1][0]
target.append(category_index)
for word in tensor.getAttributesForNeed(need_index, SparseTensor.ATTR_SUBJECT_SLICE):
content += word + " "
data.append(content)
# Include only few of all the categories (e.g. with samples > n)
newdata = []
newtarget = []
for i in range(len(target)):
if target.count(target[i]) > 50:
newtarget.append(target[i])
newdata.append(data[i])
data = newdata
target = newtarget
# Print out the input, just a check:
target_names = tensor.getHeaders()
print("test")
print data
print target_names
print target
return data, target, target_names
# Call for the input
my_data, my_target, my_targetname = get_example_data()
# A little information about dimensions and format of the input:
print type(my_data), type(my_target), # format of data and targets
print len(my_data) # number of samples
print len(my_target)
# Let's build the training and testing datasets:
# NOTE(review): this is a plain ordered slice with no shuffling, so the class
# balance of train vs. test depends entirely on the order in which
# get_example_data() returned the samples.
SPLIT_PERC = 0.80 # 80% goes into training, 20% into test
split_size = int(len(my_data)*SPLIT_PERC)
X_train = my_data[:split_size]
X_test = my_data[split_size:]
y_train = my_target[:split_size]
y_test = my_target[split_size:]
# Training, prediction and evaluation of the classifier(s):
def train_and_evaluate(clf, X_train, X_test, y_train, y_test, y_name):
# Training
clf.fit(X_train, y_train)
# Prediction of testing sets
y_pred = clf.predict(X_test)
# Precision, recall and support (i.e. nr. of samples used for the testing)
print "Classification Report:"
print metrics.classification_report(y_test, y_pred)
# Confusion Matrix
print "Confusion Matrix:"
print metrics.confusion_matrix(y_test, y_pred)
# Visualization of Categories / Assigned / Data
print "Tested data => assigned category, data:"
for i in range(len(X_test)):
print str(i) + ") Real category: " + str(y_name[y_test[i]]) + ", Assigned category: " + \
str(y_name[y_pred[i]]) + ", Data: " + str(X_test[i])
# Assign names to the categories (defined by numbers)
print "\n Categories: \n"
categories = set()
for cat in y_pred:
categories.add(cat)
categories = sorted(categories)
for cat in categories:
print str(cat) + " " + y_name[cat]
# Introducing stop words
stopset = set(stopwords.words('english'))
# Two different classifiers: Count and Tfidf vectors
clf_count = Pipeline([
('vect', CountVectorizer(
stop_words=stopset,
token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
('clf', MultinomialNB(alpha=1)),
])
clf_tfidf = Pipeline([
('vect', TfidfVectorizer(
stop_words=stopset,
token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
('clf', MultinomialNB(alpha=1)),
])
# List of classifiers
clfs = [clf_count, clf_tfidf]
# Run the evaluation/classification
for clf in clfs:
train_and_evaluate(clf, X_train, X_test, y_train, y_test, my_targetname)