-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmulticlass_classifier.py
More file actions
138 lines (112 loc) · 4.67 KB
/
multiclass_classifier.py
File metadata and controls
138 lines (112 loc) · 4.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
__author__ = 'Federico'
# Multiclass Naive-Bayes classifier for categorization of WoN e-mail dataset
# It uses MultinomialNB classifier
from collections import Counter

from numpy import *
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from tools.tensor_utils import read_input_tensor, SparseTensor
# Get the input from a folder in C:
def get_example_data():
header_file = 'C:/Users/Federico/Desktop/test/evaluation/tensor_content_NEW/headers.txt'
data_file_prefix = 'C:/Users/Federico/Desktop/test/evaluation/tensor_content_NEW'
data_files = [data_file_prefix + "/connection.mtx",
data_file_prefix + "/needtype.mtx",
data_file_prefix + "/subject.mtx",
data_file_prefix + "/content.mtx",
data_file_prefix + "/category.mtx"]
slices = [SparseTensor.CONNECTION_SLICE, SparseTensor.NEED_TYPE_SLICE, SparseTensor.ATTR_SUBJECT_SLICE,
SparseTensor.ATTR_CONTENT_SLICE, SparseTensor.CATEGORY_SLICE]
tensor = read_input_tensor(header_file, data_files, slices, False)
data = []
target = []
# Store the chosen input into lists.
# The "if" statement is meant to include only samples with a single category (No multilabel)
for need_index in tensor.getNeedIndices():
content = ""
categories = tensor.getAttributesForNeed(need_index, SparseTensor.CATEGORY_SLICE)
numCategories = len(categories)
if numCategories >= 1:
category_index = tensor.getSliceMatrix(SparseTensor.CATEGORY_SLICE)[need_index,].nonzero()[1][0]
target.append(category_index)
for word in tensor.getAttributesForNeed(need_index, SparseTensor.ATTR_SUBJECT_SLICE):
content += word + " "
data.append(content)
# Include only few of all the categories (e.g. with samples > n)
newdata = []
newtarget = []
for i in range(len(target)):
if target.count(target[i]) > 50:
newtarget.append(target[i])
newdata.append(data[i])
data = newdata
target = newtarget
# Print out the input, just a check:
target_names = tensor.getHeaders()
print("test")
print data
print target_names
print target
return data, target, target_names
# Call for the input
my_data, my_target, my_targetname = get_example_data()
# A little information about dimensions and format of the input:
print type(my_data), type(my_target), # format of data and targets
print len(my_data) # number of samples
print len(my_target)
# Let's build the training and testing datasets:
# NOTE(review): this is a plain ordered slice with no shuffling, so the class
# balance of train vs. test depends entirely on the order in which
# get_example_data() returned the samples.
SPLIT_PERC = 0.80 # 80% goes into training, 20% into test
split_size = int(len(my_data)*SPLIT_PERC)
X_train = my_data[:split_size]
X_test = my_data[split_size:]
y_train = my_target[:split_size]
y_test = my_target[split_size:]
# Training, prediction and evaluation of the classifier(s):
def train_and_evaluate(clf, X_train, X_test, y_train, y_test, y_name):
# Training
clf.fit(X_train, y_train)
# Prediction of testing sets
y_pred = clf.predict(X_test)
# Precision, recall and support (i.e. nr. of samples used for the testing)
print "Classification Report:"
print metrics.classification_report(y_test, y_pred)
# Confusion Matrix
print "Confusion Matrix:"
print metrics.confusion_matrix(y_test, y_pred)
# Visualization of Categories / Assigned / Data
print "Tested data => assigned category, data:"
for i in range(len(X_test)):
print str(i) + ") Real category: " + str(y_name[y_test[i]]) + ", Assigned category: " + \
str(y_name[y_pred[i]]) + ", Data: " + str(X_test[i])
# Assign names to the categories (defined by numbers)
print "\n Categories: \n"
categories = set()
for cat in y_pred:
categories.add(cat)
categories = sorted(categories)
for cat in categories:
print str(cat) + " " + y_name[cat]
# Introducing stop words
stopset = set(stopwords.words('english'))
# Two different classifiers: Count and Tfidf vectors
clf_count = Pipeline([
('vect', CountVectorizer(
stop_words=stopset,
token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
('clf', MultinomialNB(alpha=1)),
])
clf_tfidf = Pipeline([
('vect', TfidfVectorizer(
stop_words=stopset,
token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
)),
('clf', MultinomialNB(alpha=1)),
])
# List of classifiers
clfs = [clf_count, clf_tfidf]
# Run the evaluation/classification
for clf in clfs:
train_and_evaluate(clf, X_train, X_test, y_train, y_test, my_targetname)