ML-F23-Project/plot_results.py at main · HVKHCM/ML-F23-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv

def plot_train_and_test_errors(model_name, model_attributes_dict, training_errors, testing_errors):
    '''
    Plots the train and test errors of a model against each tested hyperparameter and saves one
    figure per hyperparameter.

    Inputs:
        model_name: the name of the model (used in the x-axis label and in the output file names)
        model_attributes_dict: a dictionary of (key, value) pairs where the key is the name of a model hyperparameter
        and the value is a list of the values tested for that hyperparameter
        training_errors: a list of training errors corresponding to the order of the hyperparameter values in model_attributes_dict
        testing_errors: a list of testing errors corresponding to the order of the hyperparameter values in model_attributes_dict

        For every hyperparameter in model_attributes_dict a seaborn point plot of the train and test errors
        is written to "plots/<model name with spaces replaced by underscores>_<hyperparameter>.jpg" and ".svg".
        The "plots" directory is created if it does not exist. Nothing is plotted when
        model_attributes_dict is empty.
    '''
    import os

    if not model_attributes_dict:
        return  # no hyperparameters supplied, so there is nothing to plot

    # The length of the first hyperparameter list decides how many "train" and "test" labels are
    # generated; pandas raises if the other columns do not have a matching length.
    num_models = len(next(iter(model_attributes_dict.values())))

    # Long-format data: all train rows first, then all test rows.
    errors = list(training_errors) + list(testing_errors)
    error_types = ["train"] * num_models + ["test"] * num_models

    os.makedirs("plots", exist_ok=True)

    for model_attribute, attribute_vals in model_attributes_dict.items():

        x_axis_name = model_name + " " + model_attribute + " Value"

        df = pd.DataFrame({
            x_axis_name: list(attribute_vals) * 2,  # repeated once for train and once for test
            "Error": errors,
            "Error Type": error_types
        })

        fig = plt.figure()
        try:
            sns.pointplot(data=df, x=x_axis_name, y="Error", hue="Error Type")
            output_base = "plots/" + model_name.replace(" ", "_") + "_" + model_attribute
            fig.savefig(output_base + ".jpg")
            fig.savefig(output_base + ".svg")
        finally:
            # Close the figure so repeated calls do not accumulate open figures in memory.
            plt.close(fig)

def plot_decision_tree_errors(decision_trees_csv_path, dataset_name:str):
    '''
    Plots the train and test errors of the Decision Tree classifiers listed in a results csv file.

    Inputs:
        decision_trees_csv_path: the path to the csv file with the information (performance metrics, hyperparameters) of
        each Decision Tree classifier tested in bias_variance_evaluation.py
        dataset_name: the name of the dataset the models were run on.

        The first row of the csv file is skipped as a header. For every other row, columns 6 and 7 are read as the
        train and test errors (floats) and columns 8, 9 and 10 as the max depth, min samples split and
        min samples leaf (ints). The collected values are passed to plot_train_and_test_errors to get figures
        showing the train and test errors with respect to the values of each hyperparameter.
    '''

    max_depths = []
    min_samples_splits = []
    min_samples_leafs = []

    training_errors = []
    testing_errors = []

    # The context manager closes the file even when a row fails to parse.
    with open(decision_trees_csv_path, newline='') as decision_trees_csv_file:
        csvreader = csv.reader(decision_trees_csv_file)
        next(csvreader, None)  # skip the first row since it contains names of attributes

        for row in csvreader:
            training_errors.append(float(row[6]))
            testing_errors.append(float(row[7]))

            max_depths.append(int(row[8]))
            min_samples_splits.append(int(row[9]))
            min_samples_leafs.append(int(row[10]))

    model_attributes_dict = {
        "Max Depth": max_depths,
        "Min Samples Split": min_samples_splits,
        "Min Samples Leaf": min_samples_leafs,
    }

    plot_train_and_test_errors("Decision Tree " + dataset_name, model_attributes_dict, training_errors, testing_errors)


def plot_boosting_errors(boosting_csv_path, dataset_name:str):
    '''
    Plots the train and test errors of the ensemble classifiers listed in a results csv file.

    Inputs:
        boosting_csv_path: the path to the csv file with the information (performance metrics, hyperparameters) of
        each classifier tested in bias_variance_evaluation.py
        dataset_name: the name of the dataset the models were run on.

        The first row of the csv file is skipped as a header. For every other row, columns 6 and 7 are read as the
        train and test errors and columns 8 and 9 as N_estimators and Max_samples (all parsed as floats).
        The collected values are passed to plot_train_and_test_errors to get figures showing the train and
        test errors with respect to the values of each hyperparameter.

        NOTE(review): the function is named for boosting, but the previous documentation described a Bagging
        classifier -- confirm which model the csv file actually holds.
    '''

    n_estimators_list = []
    max_samples_list = []

    training_errors = []
    testing_errors = []

    # The context manager closes the file even when a row fails to parse.
    with open(boosting_csv_path, newline='') as boosting_csv_file:
        csvreader = csv.reader(boosting_csv_file)
        next(csvreader, None)  # skip the first row since it contains names of attributes

        for row in csvreader:
            training_errors.append(float(row[6]))
            testing_errors.append(float(row[7]))

            n_estimators_list.append(float(row[8]))
            max_samples_list.append(float(row[9]))

    model_attributes_dict = {
        "N_estimators": n_estimators_list,
        "Max_samples": max_samples_list,
    }

    # A space separates the model name from the dataset name, as in the other plot_*_errors functions;
    # without it the axis label and file name read "BoostingDataset 1".
    plot_train_and_test_errors("Boosting " + dataset_name, model_attributes_dict, training_errors, testing_errors)


def plot_logistic_regression_errors(logistic_regression_csv_path, dataset_name:str):
    '''
    Plots the train and test errors of the Logistic Regression classifiers listed in a results csv file.

    Inputs:
        logistic_regression_csv_path: the path to the csv file with the information (performance metrics, hyperparameters) of
        each Logistic Regression classifier tested in bias_variance_evaluation.py
        dataset_name: the name of the dataset the models were run on.

        The first row of the csv file is skipped as a header. For every other row, columns 6 and 7 are read as the
        train and test errors (floats), column 8 as the penalty (kept as a string) and column 9 as the
        max iter value (int). The collected values are passed to plot_train_and_test_errors to get figures
        showing the train and test errors with respect to the values of each hyperparameter.
    '''

    penalties = []
    max_iters = []

    training_errors = []
    testing_errors = []

    # The context manager closes the file even when a row fails to parse.
    with open(logistic_regression_csv_path, newline='') as logistic_regression_csv_file:
        csvreader = csv.reader(logistic_regression_csv_file)
        next(csvreader, None)  # skip the first row since it contains names of attributes

        for row in csvreader:
            training_errors.append(float(row[6]))
            testing_errors.append(float(row[7]))

            penalties.append(row[8])
            max_iters.append(int(row[9]))

    model_attributes_dict = {
        "Penalty": penalties,
        "Max Iter": max_iters,
    }

    plot_train_and_test_errors("Logistic Regression " + dataset_name, model_attributes_dict, training_errors, testing_errors)

def plot_svm_errors(svm_csv_path, dataset_name:str):
    '''
    Plots the train and test errors of the SVM classifiers listed in a results csv file.

    Inputs:
        svm_csv_path: the path to the csv file with the information (performance metrics, hyperparameters) of
        each SVM classifier tested in bias_variance_evaluation.py
        dataset_name: the name of the dataset the models were run on.

        The first row of the csv file is skipped as a header. For every other row, columns 6 and 7 are read as the
        train and test errors (floats), column 8 as C (float), column 9 as the kernel (kept as a string) and
        column 10 as gamma (float). The collected values are passed to plot_train_and_test_errors to get
        figures showing the train and test errors with respect to the values of each hyperparameter.
    '''

    c_values = []
    kernels = []
    gammas = []

    training_errors = []
    testing_errors = []

    # The context manager closes the file even when a row fails to parse.
    with open(svm_csv_path, newline='') as svm_csv_file:
        csvreader = csv.reader(svm_csv_file)
        next(csvreader, None)  # skip the first row since it contains names of attributes

        for row in csvreader:
            training_errors.append(float(row[6]))
            testing_errors.append(float(row[7]))

            c_values.append(float(row[8]))
            kernels.append(row[9])  # csv.reader already yields strings
            gammas.append(float(row[10]))

    model_attributes_dict = {
        "C": c_values,
        "Kernel": kernels,
        "Gamma": gammas,
    }

    plot_train_and_test_errors("SVM " + dataset_name, model_attributes_dict, training_errors, testing_errors)


def plot_kfold_knn_errors(kfold_knn_csv_path, dataset_name:str):
    '''
    Plots the train and test errors of the KNN classifiers listed in a results csv file.

    Inputs:
        kfold_knn_csv_path: the path to the csv file with the information (performance metrics, hyperparameters) of
        each KNN classifier tested in bias_variance_evaluation.py
        dataset_name: the name of the dataset the models were run on.

        The first row of the csv file is skipped as a header. For every other row, columns 6 and 7 are read as the
        train and test errors (floats) and column 8 as the number of neighbors (int). Once all rows have been
        read, the collected values are passed to plot_train_and_test_errors a single time to get a figure
        showing the train and test errors with respect to the number of neighbors. Nothing is plotted when
        the file has no data rows.
    '''

    n_neighbors_list = []

    training_errors = []
    testing_errors = []

    # The context manager closes the file even when a row fails to parse.
    with open(kfold_knn_csv_path, newline='') as kfold_knn_csv_file:
        csvreader = csv.reader(kfold_knn_csv_file)
        next(csvreader, None)  # skip the first row since it contains names of attributes

        for row in csvreader:
            training_errors.append(float(row[6]))
            testing_errors.append(float(row[7]))

            n_neighbors_list.append(int(row[8]))

    if not n_neighbors_list:
        return  # no data rows were read, so there is nothing to plot

    model_attributes_dict = {"Number of Neighbors": n_neighbors_list}

    # Plot once after the whole file has been read, rather than re-plotting after every row.
    plot_train_and_test_errors("K-Fold KNN " + dataset_name, model_attributes_dict, training_errors, testing_errors)


def plot_kfold_random_forest_errors(kfold_random_forest_csv_path, dataset_name:str):
    '''
    Plots the train and test errors of the Random Forest classifiers listed in a results csv file.

    Inputs:
        kfold_random_forest_csv_path: the path to the csv file with the information (performance metrics, hyperparameters) of
        each Random Forest classifier tested in bias_variance_evaluation.py
        dataset_name: the name of the dataset the models were run on.

        The first row of the csv file is skipped as a header. For every other row, columns 6 and 7 are read as the
        train and test errors (floats) and columns 8 and 9 as the number of estimators and the max depth (ints).
        The collected values are passed to plot_train_and_test_errors to get figures showing the train and
        test errors with respect to the values of each hyperparameter.
    '''

    n_estimators_list = []
    max_depths = []

    training_errors = []
    testing_errors = []

    # The context manager closes the file even when a row fails to parse.
    with open(kfold_random_forest_csv_path, newline='') as kfold_random_forest_csv_file:
        csvreader = csv.reader(kfold_random_forest_csv_file)
        next(csvreader, None)  # skip the first row since it contains names of attributes

        for row in csvreader:
            training_errors.append(float(row[6]))
            testing_errors.append(float(row[7]))

            n_estimators_list.append(int(row[8]))
            max_depths.append(int(row[9]))

    model_attributes_dict = {
        "Number of Estimators": n_estimators_list,
        "Max Depth": max_depths,
    }

    plot_train_and_test_errors("K-fold Random Forest " + dataset_name, model_attributes_dict, training_errors, testing_errors)


# decision_tree_dataset1_csv = "bias_variance_comparisons/decision_tree_dataset1.csv"
# plot_decision_tree_errors(decision_tree_dataset1_csv, "Dataset 1")

# decision_tree_dataset2_csv = "bias_variance_comparisons/decision_tree_dataset2.csv"
# plot_decision_tree_errors(decision_tree_dataset2_csv, "Dataset 2")

# boosting_dataset1_csv = "bias_variance_comparisons/best_boosting_dataset1.csv"
# plot_boosting_errors(boosting_dataset1_csv, "Dataset 1")

# boosting_dataset2_csv = "bias_variance_comparisons/best_boosting_dataset2.csv"
# plot_boosting_errors(boosting_dataset2_csv, "Dataset 2")

# logistic_regression_dataset1_csv = "bias_variance_comparisons/logistic_regression_dataset1.csv"
# plot_logistic_regression_errors(logistic_regression_dataset1_csv, "Dataset 1")

# logistic_regression_dataset2_csv = "bias_variance_comparisons/logistic_regression_dataset2.csv"
# plot_logistic_regression_errors(logistic_regression_dataset2_csv, "Dataset 2")

# svm_dataset1_csv = "bias_variance_comparisons/svm_dataset1.csv"
# plot_svm_errors(svm_dataset1_csv, "Dataset 1")

# svm_dataset2_csv = "bias_variance_comparisons/svm_dataset2.csv"
# plot_svm_errors(svm_dataset2_csv, "Dataset 2")

# knn_dataset1_csv = "bias_variance_comparisons/kfold_knn_dataset1.csv"
# knn_dataset2_csv = "bias_variance_comparisons/kfold_knn_dataset2.csv"

# plot_kfold_knn_errors(knn_dataset1_csv, "Dataset 1")
# plot_kfold_knn_errors(knn_dataset2_csv, "Dataset 2")

# kfold_random_forest_dataset1_csv = "bias_variance_comparisons/kfold_random_forest_dataset1.csv"
# plot_kfold_random_forest_errors(kfold_random_forest_dataset1_csv, "Dataset 1")

# kfold_random_forest_dataset2_csv = "bias_variance_comparisons/kfold_random_forest_dataset2.csv"
# plot_kfold_random_forest_errors(kfold_random_forest_dataset2_csv, "Dataset 2")