-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
194 lines (127 loc) · 7.88 KB
/
main.py
File metadata and controls
194 lines (127 loc) · 7.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import pandas as pd
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_process_funcs import normalize_features, drop_categorical_variables, drop_binary_variables
from feature_analysis_funcs import kruskal_wallis, generate_cov_matrix, remove_worst_features, analyse_pca, analyse_lda, rank_features_by_auc
from crossvalidation import cross_validate_classifier, calculate_specificity
from classifiers.euclidean_MDC import Euclidean_MDC
from classifiers.mahalanobis_MDC import Mahalanobis_MDC
from classifiers.LDA_fisher_MDC import LDA_Fisher_MDC
from classifiers.KNN_classifier import KnnClassifier
from classifiers.SVM_classifier import SvmClassifier
from classifiers.BayesianClass import BayesianGaussianClassifier
import csv
# Default log file; used whenever callers don't pass an explicit path.
log_file_path = "main.log"


def print_log(message, path=None):
    """Print a message to stdout and append it to a log file.

    Parameters:
        message (str): The message to print and append.
        path (str | None): Log file path. Defaults to the module-level
            ``log_file_path`` when omitted (backward-compatible with the
            original single-argument calls throughout this script).
    """
    print(message)
    # Explicit encoding so log output is stable across platforms/locales.
    with open(log_file_path if path is None else path, 'a', encoding='utf-8') as log_file:
        log_file.write(message + '\n')
#________________________________________________________________________________
#0. Read the dataset: X = all feature columns except "label", Y = the target.
url_dataset = pd.read_csv("dataset.csv")
X = url_dataset.drop(columns=["label"])
# NOTE(review): Y is taken positionally as the LAST column; this assumes "label"
# is the final column of dataset.csv — confirm, otherwise Y != the dropped target.
Y = url_dataset.iloc[:,-1]
print_log("\n\n#########################################################################\n\n")
print_log(f"Shape of original X: {X.shape}")
print_log("\n\n#########################################################################\n\n")
#_________________________________________________________________________________
#1. Remove categorical and binary features (the rankings below apply to the rest).
X = drop_categorical_variables(X)
print_log(f"Shape of X after dropping categorical variables: {X.shape}")
print_log("\n\n#########################################################################\n\n")
X = drop_binary_variables(X)
print_log(f"Shape of X after dropping binary variables: {X.shape}")
print_log("\n\n#########################################################################\n\n")
#_________________________________________________________________________________
#2. Cleaning the dataset.
# Available information about the dataset guarantees that there are no missing
# values, so no imputation/row-dropping step is performed here.
#___________________________________________________________________________________
#3. Covariance matrix: also yields the pairs of highly correlated features.
covm, high_corr_pairs = generate_cov_matrix(X, show_img=False)
#____________________________________________________________________________________
#4. With the information from the covariance matrix, we remove the following features.
# NOTE(review): these three columns were picked manually from the high-correlation
# pairs; "NoOfDegitsInURL" is the dataset's actual (misspelled) column name — do
# not "correct" it, or drop() will raise KeyError.
X = X.drop(columns=["DomainTitleMatchScore", "NoOfLettersInURL", "NoOfDegitsInURL"])
print_log(f"Shape of X after dropping highly correlated variables: {X.shape}")
print_log("\n\n#########################################################################\n\n")
#____________________________________________________________________________________
#5. Feature ranking with the Kruskal-Wallis test.
print_log("Performing Kruskal Wallis Ranking...")
kw_ranking = kruskal_wallis(X, Y)
#____________________________________________________________________________________
#6. Feature ranking with ROC AUC.
print_log("\n\n#########################################################################\n\n")
print_log("Performing AUC ranking...")
auc_ranking = rank_features_by_auc(X,Y)
#____________________________________________________________________________________
#7. Remove the 20% worst features under each ranking, producing two candidate
#   datasets (one per ranking method) for the experiment loop below.
kw_X = remove_worst_features(X, kw_ranking, percentage=0.2)
auc_X = remove_worst_features(X, auc_ranking, percentage=0.2)
print_log("\n\n#########################################################################\n\n")
print_log(f"Shape for dataset after KW ranking selection: {kw_X.shape}")
print_log(f"Shape for dataset after AUC ranking selection: {auc_X.shape}")
print_log("\n\n#########################################################################\n\n")
feature_selection_Xs = {"KW": kw_X, "AUC": auc_X}
#____________________________________________________________________________________
# Accumulates one header row plus one row of metrics per (ranking, processing,
# classifier, run) combination; written to results.csv at the end.
rows = [["Selection Ranking", "Processing", "Classifier", "Run", "Accuracy", "Specificity", "F1-Score", "Sensitivity", "Auc"]]
for feature_selection_type, fs_X in feature_selection_Xs.items():
    print_log("\n\n#########################################################################\n\n")
    print_log(f"Performing PCA for {feature_selection_type}...")
    #8. PCA and LDA on the selected features to create new feature spaces.
    X_pca = analyse_pca(fs_X, feature_selection_type, show_img=False)
    print_log(f"Shape of dataset after PCA for {feature_selection_type} is: {X_pca.shape}")
    print_log("\n\n#########################################################################\n\n")
    print_log(f"Performing LDA for {feature_selection_type}...")
    X_lda = analyse_lda(fs_X, Y)
    print_log(f"Shape of dataset after LDA is {feature_selection_type}: {X_lda.shape}")
    print_log("\n\n#########################################################################\n\n")
    data_processing_Xs = {"Natural": fs_X, "PCA": X_pca, "LDA": X_lda}
    #9. Experiment loop: feature selection x data processing x classifier.
    for data_processing_type, processed_X in data_processing_Xs.items():
        data_info_string = feature_selection_type + "_" + data_processing_type
        print_log(f"TESTING CLASSIFIERS FOR DATASET {data_info_string}...")
        #____________________________________________________________________________________
        #10. Search for the best K for the KNN classifier on this dataset variant.
        print_log("TESTING KNN...")
        knn_classifier = KnnClassifier(data_info = data_info_string)
        knn_training_results = knn_classifier.knn_analysis(processed_X, Y, n_runs=3)
        # Save K-search results (assumes knn_training/ directory exists).
        df = pd.DataFrame(zip(*knn_training_results))
        df.columns = ['K', 'Average Error', 'STD Error']
        df.to_csv(f"knn_training/err_{data_info_string}.csv", index=False)
        #____________________________________________________________________________________
        #11. Search for the best C value for the RBF-kernel SVM classifier.
        print_log("TESTING SVM...")
        svm_classifier = SvmClassifier(data_info = data_info_string, kernel_function="rbf")
        svm_training_results = svm_classifier.svm_analysis(processed_X, Y, n_runs=3)
        # Save C-search results (assumes svm_training/ directory exists).
        df = pd.DataFrame(zip(*svm_training_results))
        df.columns = ['C', 'Average Error', 'STD Error']
        df.to_csv(f"svm_training/err_{data_info_string}.csv", index=False)
        #12. Select classifiers for this dataset: LDA-Fisher is skipped on the
        # LDA-projected data (it would re-apply the same projection).
        if data_processing_type != "LDA":
            classifiers = [Euclidean_MDC(data_info_string), Mahalanobis_MDC(data_info_string), LDA_Fisher_MDC(data_info_string), BayesianGaussianClassifier(data_info_string), knn_classifier, svm_classifier]
        else:
            classifiers = [Euclidean_MDC(data_info_string), Mahalanobis_MDC(data_info_string), BayesianGaussianClassifier(data_info_string), knn_classifier, svm_classifier]
        try:
            #13. For each classifier run 5 rounds of 5-fold cross-validation
            # and record the metrics row for results.csv.
            for classifier in classifiers:
                for run in range(5):
                    metrics = cross_validate_classifier(processed_X, Y, classifier, run+1, 5, view=False)
                    results = [feature_selection_type, data_processing_type, classifier.classifier_label, run+1] + metrics
                    rows.append(results)
        except Exception as e:
            # Report the failure (the original swallowed it silently) and flush
            # the partial results so completed runs are not lost; the loop then
            # continues with the next dataset variant.
            print_log(f"Cross-validation failed for {data_info_string}: {e}")
            with open('results.csv', mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerows(rows)  # Write all rows collected so far
# Final write of every accumulated result row to results.csv.
with open('results.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)  # Write all rows