FalseFailureAnalyzer/ml_analyzer.py at master · rmitra34/FalseFailureAnalyzer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""Test result log analyser

This script allows the user to train or update the model on script execution logs.

This script requires that `sklean, imb-learn,numpy and pandas` be installed within the Python
environment you are running this script in.

This file can also be imported as a module and contains the following
functions:

    * train_model - trains model from scratch
    * main - the main function of the script
    * update_model - updates an existing model
"""

import sys

# external libraries
import numpy as np

from sklearn.datasets import load_files
from ML_Model import Model


def add_to_log(line):

    """Appends the input to execution_log.txt file and prints as well.

        Parameters:
        line (string): String to be appended to log.
    """

    line = str(line)
    with open('execution_log.txt', 'a') as log:
        log.write(line)
        log.write('\n')
        if line == 'Done':
            log.write('-' * 50)
            log.write('\n')

    print(line)


def get_training_data(path):

    """Loads the training data specified by path.

    Parameters:
    path (int): path to the dir to load, directory must have buckets as sub directories

    Returns:
    Array of text in training samples and class labels

   """

    logs_train = load_files(path)
    text_train, y_train = logs_train.data, logs_train.target
    add_to_log('Got Training Data')
    print('Classes', np.unique(y_train))

    return text_train, y_train


def train_model(train_data_path):
    """Trains the model from scratch.

    Parameters:
    path: Training data path. The target directory must have samples separated in class buckets.

   """
    add_to_log(train_data_path + ' Training')

    text_train, y_train = get_training_data(train_data_path)

    model = Model()

    # Transform the text into nd array
    x_train = model.fit_transform(text_train)

    # Data Synthesis
    x_train, y_train = model.under_sample_data(x_train, y_train)

    model.train_classifier(x_train, y_train)

    # Save the trained model as a joblib file
    model.save_model()


def update_model(time_stamp, train_data_path):

    """Updates an existing model.

    Parameters:
    time_stamp: time_stamp of the model example: 2019_Jul_19_12_13
    path: Training data path. The source directory must have samples separated in class buckets.

   """

    add_to_log(train_data_path + ' Training')

    text_train, y_train = get_training_data(train_data_path)

    model = Model()
    model.load_model(time_stamp)
    # Transform the text into nd array
    x_train = model.fit_transform(text_train)

    # Test the accuracy of current month on previously trained model.
    test_model(x_train, y_train, model)

    # Data Synthesis
    x_train, y_train = model.under_sample_data(x_train, y_train)

    model.update_classifier(x_train, y_train)

    # Save the trained model as a joblib file
    model.save_model()


def test_model(x_test, y_test, model):
    """Updates an existing model.

    Parameters:
    time_stamp: time_stamp of the model
    path: Training data path. The source directory must have samples separated in class buckets.

   """
    # Feature Selection
    # x_test = model.selector_transform(x_test)

    model.score_accuracy(x_test, y_test)
    add_to_log('Done')


def main():
    """Trains or Updates the model depending on attribute supplied.

    Parameters:
    path: Training data path. The target directory must have samples separated in class buckets.
    task: -t for training -u to update existing model.
    time stamp: time stamp to the model to be updated when using -u.
   """

    if len(sys.argv) > 1 and len(sys.argv) >= 3:

        task = sys.argv[1]
        data_path = sys.argv[2]

        if task == '-t':
            train_model(data_path)
        elif task == '-u':
            if len(sys.argv) == 4:
                time_stamp = sys.argv[3]
                update_model(time_stamp, data_path)
            else:
                print('Takes 3 arguments: task to perform, '
                      'data path and time stamp in case of update')

        else:
            print('Argument not recognized, use -t to train'
                  ' and -u to update.')

    else:
        print('Takes 2 or 3 arguments: task to perform and data path respectively'
              ',follow by time stamp in case of update')

    print('Done')


if __name__ == '__main__':
    main()