-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathfinal_model_hyperparameter.py
More file actions
111 lines (95 loc) · 4.68 KB
/
final_model_hyperparameter.py
File metadata and controls
111 lines (95 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Multi-label classification deep-learning model for RNA-seq cancer data (Machine Learning project)
# Based on https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
# - How many layers: the amount of RNA expressed is the input; part of it is (among other things) translated
#   into proteins that then act on a cell (here, a group of cells), which may indicate the cancer type.
#   Hence roughly 2 layers (3 in the tutorial, where the output is also a dense layer).
#Parameter BRCA COAD KIPAN
#Activation function {Rectifier, Tanh, Maxout}
#Number of hidden layers {2, 3, 4}
#Number of units per layer [10, 200]
#L1 regularization [0.001, 0.1]
#L2 regularization [0.001, 0.1]
#Input dropout ratio [0.001, 0.1]
#Hidden dropout ratios [0.001, 0.1]
# import packages
import pandas as pandas
import numpy as np
import imblearn
from numpy import mean
from numpy import std
np.random.seed(123) # for reproducibility
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical
from keras.utils import np_utils
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
# define the model
def create_model(hidden_layers=1, activation='relu', neurons=1,
                 n_classes=5, input_dim=20531):
    """Build a dense multi-class classifier for RNA-seq expression data.

    Parameters
    ----------
    hidden_layers : int
        Number of additional hidden Dense layers after the first one.
    activation : str
        Activation function used by the input and hidden layers.
    neurons : int
        Number of units in each additional hidden layer.
    n_classes : int
        Number of output classes (default 5, matching the one-hot labels).
    input_dim : int
        Number of input features (default 20531 expression values).

    Returns
    -------
    A compiled keras Sequential model.
    """
    model = Sequential()
    # First hidden layer also defines the input dimensionality.
    model.add(Dense(80, input_dim=input_dim, activation=activation))
    for _ in range(hidden_layers):
        model.add(Dense(neurons, activation=activation))
    # Bug fix: the original model had no output layer, so it ended on a
    # hidden layer of `neurons` units while being trained with
    # categorical_crossentropy against one-hot labels. A softmax layer with
    # one unit per class is required for multi-class classification.
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
# Load expression data and one-hot encoded labels (samples as rows;
# the first column of each file is the sample index).
data = pandas.read_csv("data.csv", index_col=0)  # default header = True
labels = pandas.read_csv("one_hot-labels.csv", sep=";", index_col=0)  # default header = True

# Make data compatible for converting to tensors.
data_as_array = np.asarray(data).astype('float32')
labels_as_array = np.asarray(labels).astype('float32')

# Outer CV: split the data into 5 train/test folds.
# NOTE(review): shuffle=True without random_state relies on the global
# np.random.seed(123) set at import time for reproducibility.
cv_outer = KFold(n_splits=5, shuffle=True)
for train_ix, test_ix in cv_outer.split(data_as_array):
    # Split data for this outer fold.
    X_train, X_test = data_as_array[train_ix, :], data_as_array[test_ix, :]
    y_train, y_test = labels_as_array[train_ix], labels_as_array[test_ix]
    # NOTE(review): X_test / y_test are never evaluated below — the outer
    # loop currently only re-runs the inner grid search; confirm intent.

    # Balance the training set (cf.
    # http://glemaitre.github.io/imbalanced-learn/generated/imblearn.over_sampling.RandomOverSampler.html):
    # SMOTE-oversample classes below the mean class count up to the mean,
    # then undersample the remaining classes down to the mean.
    n_classes = y_train.shape[1]
    class_counts = np.count_nonzero(y_train, axis=0)  # per-class counts from one-hot columns
    average_samples = int(mean(class_counts))
    # Target per-class count for oversampling: raise small classes to the
    # mean, leave larger classes unchanged (same as the original if/else loop).
    targets = [max(int(count), average_samples) for count in class_counts]
    # Robustness fix: imblearn's handling of a 2-D one-hot target matrix
    # varies across versions, and the sampling_strategy dicts are keyed by
    # integer class labels — so resample on explicit 1-D integer labels and
    # re-encode to one-hot afterwards.
    y_train_int = np.argmax(y_train, axis=1)
    ratio_over = {label: target for label, target in enumerate(targets)}
    over = SMOTE(sampling_strategy=ratio_over, random_state=314)
    X_train, y_train_int = over.fit_resample(X_train, y_train_int)
    ratio_under = {label: average_samples for label in range(n_classes)}
    under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
    X_train, y_train_int = under.fit_resample(X_train, y_train_int)
    y_train = to_categorical(y_train_int, num_classes=n_classes).astype('float32')

    # Configure the inner cross-validation for hyper-parameter tuning.
    # NOTE(review): LeaveOneOut across the full 3*3*3*3*4 = 324-combination
    # grid trains an enormous number of models; a small KFold would be far
    # cheaper — confirm this is intended.
    cv_inner = LeaveOneOut()

    # Hyper-parameter search space.
    batch_size = [8, 16, 32]
    neurons = [30, 40, 50]
    hidden_layers = [1, 2, 3]
    epochs = [10, 50, 100]
    activation = ['relu', 'tanh', 'sigmoid', 'linear']
    param_grid = dict(batch_size=batch_size, neurons=neurons,
                      hidden_layers=hidden_layers, epochs=epochs,
                      activation=activation)
    estimator = KerasClassifier(build_fn=create_model, verbose=0)
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                        n_jobs=-2, cv=cv_inner)
    resultgridsearch = grid.fit(X_train, y_train)
    print("Best grid search score: %f using the following parameters: %s" % (resultgridsearch.best_score_, resultgridsearch.best_params_))