#Machine-Learning/A1_Forte_Chris.py at main · cforte11/Machine-Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#Name: Chris Forte
#Date: 04/03/2023
#Course: COMPSCI 711
#Task: Assignment I

#This program features two functions.

#The first is decision_tree_parameter_check(dataset_id), which takes an OpenML
#dataset identification number as input and produces a graph as output. This
#function tries 5 different values of the min_samples_leaf parameter and measures
#training and test roc_auc scores on a 10-fold cross-validation.

#The second is decision_tree_parameter_gridsearch(dataset_id), which takes an
#OpenML dataset identification number as input and produces a graph and numerical
#value as output. This function uses GridSearchCV to search for the best parameter
#and generate the results of 10-fold cross-validation.

#Referenced resources include https://www.geeksforgeeks.org, https://www.openml.org,
#"Fundamentals of Machine Learning for Predictive Data Analytics: Algorithms, Worked
#Examples, and Case Studies", and https://docs.python.org/.


import openml
import matplotlib.pyplot as mpl

from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score


def decision_tree_parameter_check(dataset_id):
    """Plot training vs. held-out ROC AUC of a decision tree over min_samples_leaf.

    Loads the OpenML dataset ``dataset_id`` and, for each candidate
    ``min_samples_leaf`` value, runs a 10-fold cross-validation of an
    entropy-criterion ``DecisionTreeClassifier``. The mean ROC AUC on the
    training folds and on the held-out folds are recorded from the same
    splits, drawn as two curves on one figure, and the figure is shown.

    Args:
        dataset_id: OpenML dataset identification number.

    Returns:
        None. The result is the figure displayed by ``mpl.show()``.
    """
    #Load OpenML dataset.
    dst = openml.datasets.get_dataset(dataset_id)

    #Fetch the data with the dataset's default target split off.
    #NOTE(review): the container type of the result (NumPy array vs. pandas
    #object) depends on the installed openml version - confirm if it matters.
    data = dst.get_data(target=dst.default_target_attribute)

    #The first two items of the result are used as input features and target.
    X = data[0]
    y = data[1]

    #List of values to vary min_samples_leaf
    min_samples_leaf_values = [1, 3, 5, 7, 9]

    #Mean training-fold and held-out-fold roc_auc scores, one per value.
    training = []
    testing = []

    #Loop through the values of min_samples_leaf.
    for min_samples_leaf in min_samples_leaf_values:

        #Create a DecisionTreeClassifier via min_samples_leaf.
        mytree = tree.DecisionTreeClassifier(criterion="entropy", min_samples_leaf=min_samples_leaf)

        #One 10-fold cross-validation yields both scores. Previously the
        #held-out scores from cross_val_score were stored as "training" and a
        #fit-and-score on the full dataset was stored as "testing", so the
        #two curves were labelled the wrong way round.
        #NOTE(review): the 'roc_auc' scorer is meant for binary targets -
        #confirm the chosen dataset qualifies.
        cv_scores = cross_validate(
            mytree, X, y, cv=10, scoring='roc_auc', return_train_score=True
        )
        training.append(cv_scores['train_score'].mean())
        testing.append(cv_scores['test_score'].mean())

    #Plot results on graph.
    mpl.plot(min_samples_leaf_values, training, label='train')
    mpl.plot(min_samples_leaf_values, testing, label='test')
    mpl.xlabel('min_samples_leaf')
    mpl.ylabel('roc_auc_score')
    mpl.title(f"DTC Dataset ID: {dataset_id} Graph")
    mpl.legend()

    #Tint the area under the training curve and caption it "Underfitting".
    mpl.fill_between(min_samples_leaf_values, 0, training, alpha=0.1, color='blue')
    mpl.text(min_samples_leaf_values[0], 0.6, 'Underfitting', rotation=90, color='blue', ha='center', va='center')

    #Tint the area above the held-out curve and caption it "Overfitting".
    mpl.fill_between(min_samples_leaf_values, testing, 1, alpha=0.1, color='red')
    mpl.text(min_samples_leaf_values[0], 0.9, 'Overfitting', rotation=90, color='red', ha='center', va='center')

    #Display the graph (the function itself returns nothing).
    mpl.show()


def decision_tree_parameter_gridsearch(dataset_id):
    """Grid-search min_samples_leaf for a decision tree on an OpenML dataset.

    Loads the OpenML dataset ``dataset_id``, runs ``GridSearchCV`` (10 folds,
    'roc_auc' scoring) over five ``min_samples_leaf`` candidates for an
    entropy-criterion ``DecisionTreeClassifier``, prints the best candidate,
    and shows a plot of the mean cross-validated score per candidate.

    Args:
        dataset_id: OpenML dataset identification number.

    Returns:
        None. Output is the printed best parameter and the displayed figure.
    """
    #Candidate min_samples_leaf values; they also serve as the plot's x-axis.
    leaf_candidates = [1, 3, 5, 7, 9]

    #Fetch the dataset and split off its default target attribute.
    #NOTE(review): the container type of the result depends on the installed
    #openml version - confirm if it matters.
    dataset = openml.datasets.get_dataset(dataset_id)
    loaded = dataset.get_data(target=dataset.default_target_attribute)

    #Only the first two items (input features, target variable) are used.
    features, target = loaded[0], loaded[1]

    #Exhaustive search over the candidates with 10-fold cross-validation.
    search = GridSearchCV(
        tree.DecisionTreeClassifier(criterion="entropy"),
        param_grid={'min_samples_leaf': leaf_candidates},
        cv=10,
        scoring='roc_auc',
    )
    search.fit(features, target)

    #Report the winning value in the shell.
    best_leaf = search.best_params_['min_samples_leaf']
    print(f"Best parameter: {best_leaf}")

    #Mean held-out score for each candidate, as reported by the search.
    mean_scores = search.cv_results_['mean_test_score']

    #Draw the score curve; the title also carries the best parameter.
    mpl.plot(leaf_candidates, mean_scores)
    mpl.xlabel('min_samples_leaf')
    mpl.ylabel('mean roc_auc_score')
    mpl.title(f"GridSearchCV Dataset ID: {dataset_id}\nBest Parameter Value: {best_leaf}\n")

    #Show graph.
    mpl.show()