-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
194 lines (127 loc) · 7.88 KB
/
main.py
File metadata and controls
194 lines (127 loc) · 7.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import pandas as pd
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_process_funcs import normalize_features, drop_categorical_variables, drop_binary_variables
from feature_analysis_funcs import kruskal_wallis, generate_cov_matrix, remove_worst_features, analyse_pca, analyse_lda, rank_features_by_auc
from crossvalidation import cross_validate_classifier, calculate_specificity
from classifiers.euclidean_MDC import Euclidean_MDC
from classifiers.mahalanobis_MDC import Mahalanobis_MDC
from classifiers.LDA_fisher_MDC import LDA_Fisher_MDC
from classifiers.KNN_classifier import KnnClassifier
from classifiers.SVM_classifier import SvmClassifier
from classifiers.BayesianClass import BayesianGaussianClassifier
import csv
# Default log file; used whenever callers don't pass an explicit path.
log_file_path = "main.log"


def print_log(message, path=None):
    """Print a message to stdout and append it to a log file.

    Parameters:
        message (str): The message to print and append.
        path (str | None): Log file path. Defaults to the module-level
            ``log_file_path`` when omitted (backward-compatible with the
            original single-argument calls throughout this script).
    """
    print(message)
    # Explicit encoding so log output is stable across platforms/locales.
    with open(log_file_path if path is None else path, 'a', encoding='utf-8') as log_file:
        log_file.write(message + '\n')
#________________________________________________________________________________
#0. Read the dataset: X = all feature columns except "label", Y = the target.
url_dataset = pd.read_csv("dataset.csv")
X = url_dataset.drop(columns=["label"])
# NOTE(review): Y is taken positionally as the LAST column; this assumes "label"
# is the final column of dataset.csv — confirm, otherwise Y != the dropped target.
Y = url_dataset.iloc[:,-1]
print_log("\n\n#########################################################################\n\n")
print_log(f"Shape of original X: {X.shape}")
print_log("\n\n#########################################################################\n\n")
#_________________________________________________________________________________
#1. Remove categorical and binary features (the rankings below apply to the rest).
X = drop_categorical_variables(X)
print_log(f"Shape of X after dropping categorical variables: {X.shape}")
print_log("\n\n#########################################################################\n\n")
X = drop_binary_variables(X)
print_log(f"Shape of X after dropping binary variables: {X.shape}")
print_log("\n\n#########################################################################\n\n")
#_________________________________________________________________________________
#2. Cleaning the dataset.
# Available information about the dataset guarantees that there are no missing
# values, so no imputation/row-dropping step is performed here.
#___________________________________________________________________________________
#3. Covariance matrix: also yields the pairs of highly correlated features.
covm, high_corr_pairs = generate_cov_matrix(X, show_img=False)
#____________________________________________________________________________________
#4. With the information from the covariance matrix, we remove the following features.
# NOTE(review): these three columns were picked manually from the high-correlation
# pairs; "NoOfDegitsInURL" is the dataset's actual (misspelled) column name — do
# not "correct" it, or drop() will raise KeyError.
X = X.drop(columns=["DomainTitleMatchScore", "NoOfLettersInURL", "NoOfDegitsInURL"])
print_log(f"Shape of X after dropping highly correlated variables: {X.shape}")
print_log("\n\n#########################################################################\n\n")
#____________________________________________________________________________________
#5. Feature ranking with the Kruskal-Wallis test.
print_log("Performing Kruskal Wallis Ranking...")
kw_ranking = kruskal_wallis(X, Y)
#____________________________________________________________________________________
#6. Feature ranking with ROC AUC.
print_log("\n\n#########################################################################\n\n")
print_log("Performing AUC ranking...")
auc_ranking = rank_features_by_auc(X,Y)
#____________________________________________________________________________________
#7. Remove the 20% worst features under each ranking, producing two candidate
#   datasets (one per ranking method) for the experiment loop below.
kw_X = remove_worst_features(X, kw_ranking, percentage=0.2)
auc_X = remove_worst_features(X, auc_ranking, percentage=0.2)
print_log("\n\n#########################################################################\n\n")
print_log(f"Shape for dataset after KW ranking selection: {kw_X.shape}")
print_log(f"Shape for dataset after AUC ranking selection: {auc_X.shape}")
print_log("\n\n#########################################################################\n\n")
feature_selection_Xs = {"KW": kw_X, "AUC": auc_X}
#____________________________________________________________________________________
# Accumulates one header row plus one row of metrics per (ranking, processing,
# classifier, run) combination; written to results.csv at the end.
rows = [["Selection Ranking", "Processing", "Classifier", "Run", "Accuracy", "Specificity", "F1-Score", "Sensitivity", "Auc"]]
for feature_selection_type, fs_X in feature_selection_Xs.items():
    print_log("\n\n#########################################################################\n\n")
    print_log(f"Performing PCA for {feature_selection_type}...")
    #8. PCA and LDA on the selected features to create new feature spaces.
    X_pca = analyse_pca(fs_X, feature_selection_type, show_img=False)
    print_log(f"Shape of dataset after PCA for {feature_selection_type} is: {X_pca.shape}")
    print_log("\n\n#########################################################################\n\n")
    print_log(f"Performing LDA for {feature_selection_type}...")
    X_lda = analyse_lda(fs_X, Y)
    print_log(f"Shape of dataset after LDA is {feature_selection_type}: {X_lda.shape}")
    print_log("\n\n#########################################################################\n\n")
    data_processing_Xs = {"Natural": fs_X, "PCA": X_pca, "LDA": X_lda}
    #9. Experiment loop: feature selection x data processing x classifier.
    for data_processing_type, processed_X in data_processing_Xs.items():
        data_info_string = feature_selection_type + "_" + data_processing_type
        print_log(f"TESTING CLASSIFIERS FOR DATASET {data_info_string}...")
        #____________________________________________________________________________________
        #10. Search for the best K for the KNN classifier on this dataset variant.
        print_log("TESTING KNN...")
        knn_classifier = KnnClassifier(data_info = data_info_string)
        knn_training_results = knn_classifier.knn_analysis(processed_X, Y, n_runs=3)
        # Save K-search results (assumes knn_training/ directory exists).
        df = pd.DataFrame(zip(*knn_training_results))
        df.columns = ['K', 'Average Error', 'STD Error']
        df.to_csv(f"knn_training/err_{data_info_string}.csv", index=False)
        #____________________________________________________________________________________
        #11. Search for the best C value for the RBF-kernel SVM classifier.
        print_log("TESTING SVM...")
        svm_classifier = SvmClassifier(data_info = data_info_string, kernel_function="rbf")
        svm_training_results = svm_classifier.svm_analysis(processed_X, Y, n_runs=3)
        # Save C-search results (assumes svm_training/ directory exists).
        df = pd.DataFrame(zip(*svm_training_results))
        df.columns = ['C', 'Average Error', 'STD Error']
        df.to_csv(f"svm_training/err_{data_info_string}.csv", index=False)
        #12. Select classifiers for this dataset: LDA-Fisher is skipped on the
        # LDA-projected data (it would re-apply the same projection).
        if data_processing_type != "LDA":
            classifiers = [Euclidean_MDC(data_info_string), Mahalanobis_MDC(data_info_string), LDA_Fisher_MDC(data_info_string), BayesianGaussianClassifier(data_info_string), knn_classifier, svm_classifier]
        else:
            classifiers = [Euclidean_MDC(data_info_string), Mahalanobis_MDC(data_info_string), BayesianGaussianClassifier(data_info_string), knn_classifier, svm_classifier]
        try:
            #13. For each classifier run 5 rounds of 5-fold cross-validation
            # and record the metrics row for results.csv.
            for classifier in classifiers:
                for run in range(5):
                    metrics = cross_validate_classifier(processed_X, Y, classifier, run+1, 5, view=False)
                    results = [feature_selection_type, data_processing_type, classifier.classifier_label, run+1] + metrics
                    rows.append(results)
        except Exception as e:
            # Report the failure (the original swallowed it silently) and flush
            # the partial results so completed runs are not lost; the loop then
            # continues with the next dataset variant.
            print_log(f"Cross-validation failed for {data_info_string}: {e}")
            with open('results.csv', mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerows(rows)  # Write all rows collected so far
# Final write of every accumulated result row to results.csv.
with open('results.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)  # Write all rows