Turbulence-Modeling-with-Machine-Learning/functions.py at master · Lagyamfi/Turbulence-Modeling-with-Machine-Learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
"""
Collection of classes and functions that are used for making tests
"""

#import libraries
import matplotlib.pyplot as plt
plt.style.use('seaborn-pastel')
import numpy as np
import pandas as pd
import clean_dataframe as cd
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms


# sklearn libraries
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib

# xgboost libraries
import xgboost as xgb

class Make_test(object):
    """
    Class for making test using a saved model.
    """
    def __init__(self, model, test_file):
        self.model = model
        self.test_file = cd.clean_dataframe(test_file)
        self.x_actual, self.y_actual = cd.X_Y_split(self.test_file)
        self.y_actual = self.y_actual.values.ravel()
        self.predictions = self.make_predictions()
        self.test_file["predictions"] = self.predictions

    def __call__(self):
        return self.test_file

    def __str__(self):
        pass

    def __repr(self):
        pass

    def make_predictions(self):
        self.predictions = self.model.predict(self.x_actual)
        return self.predictions

    def get_full_dataframe(self):
        self.full_dataframe = self.test_file.copy()
        self.full_dataframe["predictions"] = self.predictions
        return self.full_dataframe

    def compare_values(self,return_values=False, threshold=0, plot=False):
        """
        Compare predicted values with actual response values.
        Specify threshold to get the rows with the maximum difference between
        both values equal or greater than threshold
        Specify plot to get plot of comparison
        """
        comparison = pd.DataFrame(dict(ACTUALS=self.y_actual, PREDICTIONS=self.predictions))
        if threshold:
            comparison = comparison[np.abs(comparison.ACTUALS - comparison.PREDICTIONS >= threshold)]
        if plot:
            plot_predictions(comparison, colormap=True)
        if return_values:
            return comparison

    def score(self, metric="rmse", plot_residuals=False):
        """
        Get score of model with test data depending on which metric.
        RMSE is the default.
        Specify plot_residuals to show the distribution of the residuals and
        the fitted values
        """
        error = self.y_actual - self.predictions
        error_squared = np.square(error)
        mae = np.mean(error)
        mse = np.mean(error_squared)
        rmse = np.sqrt(mse)
        r2 = r2_score(self.predictions, self.y_actual)
        if metric == "rmse":
            self.test_score = rmse
        elif metric == "mse":
            self.test_score = mse
        elif metric == "mae":
            self.test_score = mae
        elif metric == "r2":
            self.test_score = r2
        if plot_residuals:
            fig, ax = plt.subplots()
            ax.scatter(self.predictions, self.predictions-self.y_actual,
                        c="b", s=40, alpha=0.5)
            ax.axhline(lw=2, color="red")
            ax.set(title="Residuals of errors on Test data")
            ax.set_ylabel("Residuals")
            ax.set_xlabel("Predicted Values")
        return float(self.test_score)

    def make_plot(self, title="Test", save=False):
        self.full_dataframe = self.test_file
        grid = int(np.cbrt(len(self.test_file)) // 2)
        plot_wall_test(self.full_dataframe, title ,half_channel_grids=grid, prediction=True, save=False)

#********* CLASS FOR ONLY XGBOOST TESTING ************************************
class XGB_test(Make_test):
    """
    Create an xgb_boost test object. inherited from main testing class (Make
    test)
    """
    import xgboost as xgb
    def __init__(self, model, test_file):
        Make_test.__init__(self, model, test_file)


    def make_predictions(self):
        self.test_matrix = xgb.DMatrix(data=self.x_actual, feature_names=self.x_actual.columns)
        try:
            self.predictions = self.model.predict(self.test_matrix)
        except TypeError as exc:
            # None
            print("Type Error Detected, using already instantiated x_actual")
            self.predictions = self.model.predict(self.x_actual)
        return self.predictions

    # plot importance of features contribution in model, using the "gain"
    # metric
    def plot_importance(self, importance_type="gain", **kwargs):
        msg = "Importance can be specified only when base learner in XGBoost is'gbtree'!, Current model uses %s" %self.model.booster
        assert self.model.booster == "gbtree", msg
        xgb.plot_importance(self.model, importance_type=importance_type,
                title="Feature Importance by %s" %importance_type)

#*******************************************************************************
def plot_wall_test(dataframe, title, half_channel_grids, prediction=None,
        save=False,):
    """
    Function to make plot of test, shows plot of prediction if present in
    dataframe.
    Title: Title of Plot
    half_channel_grids: This is half of the grid sizes used in the simulation
    """
    means_u = return_average(dataframe, "u_plus")
    means_y = return_average(dataframe, "y_plus")

    with plt.style.context("seaborn-pastel"):
        fig, ax = plt.subplots()
        ax.plot(means_y[:half_channel_grids], means_u[:half_channel_grids],linestyle="-", label="Actual", color="red")
        if prediction:
            means_pred = return_average(dataframe, "predictions")
            ax.plot(means_y[:half_channel_grids],
                    means_pred[:half_channel_grids],linestyle="--",label="Predictions", color="blue")

    ax.set_xscale("log")
    ax.set_ylabel(r'$U^+$')
    ax.set_xlabel(r'$y^+$')
    ax.grid(True)
    ax.legend()
    ax.set_title(title)
    fig.tight_layout()
    if save:
        fig.savefig("%s.png" %title)
    else:
        plt.show()

def return_average(dataframe, column_name):
    return dataframe.groupby("Points:1")[column_name].mean().values

# Function to plot predictions against actual response values
def plot_predictions(dataframe, colormap=False):
    """
    Input: Dataframe with predictions and actual values"
    Output: Plot of predictions against actual response values, optional
    (colormap of difference)
    If colormap set to "True", difference between predictions and actual
    response values are plotted as well. (Takes longer for this operation)
    """

    # check that dataframe has columns labelled as required
    msg = "Column names should be PREDICTIONS and ACTUALS respectively!"
    assert "PREDICTIONS" and "ACTUALS" in dataframe.columns, msg
    fig, ax = plt.subplots()
    if colormap:
        dataframe['delta'] = np.abs(dataframe.ACTUALS - dataframe.PREDICTIONS)
        dataframe.plot.scatter(ax=ax , x="ACTUALS", y="PREDICTIONS", alpha=0.5,
                c='delta',colormap='jet')
    else:
        dataframe.plot.scatter(ax=ax , x="ACTUALS", y="PREDICTIONS", alpha=0.25)

    #add diagonal line on plot
    line = mlines.Line2D([0, 1], [0, 1], color='red')
    transform = ax.transAxes
    line.set_transform(transform)
    ax.add_line(line)
    ax.set_xlabel(r'$U^+$ (actual)')
    ax.set_ylabel(r'$U^+$ (predicted)')
    ax.set_title('Predictions vs Actual Values');


#**************************************************************************************

class XGBoost_Model(object):
    """
    Class for building a model using xgboost algorithm
    Model can be tuned as well
    """
    def __init__(self, params, dataframe):
        self.params = params
        self.dataframe = dataframe
        self.model = xgb.XGBRegressor(**params)
        self.train_data, self.validation_data = train_test_split(self.dataframe, test_size=0.3, random_state=100)
        train_x, train_y = cd.X_Y_split(self.train_data)
        validation_x, validation_y = cd.X_Y_split(self.validation_data)
        self.dtrain = xgb.DMatrix(data=train_x, label=train_y, feature_names=train_x.columns)
        self.dvalidation = xgb.DMatrix(data=validation_x, label=validation_y, feature_names=validation_x.columns)
        self.eval_matrix  = [(self.dtrain,"train"),(self.dvalidation,"validation")]
        self.eval_set = [(train_x,train_y),(validation_x,validation_y)]

    def __call__(self):
        return self.model

    def train_model(self, num_rounds=50, parameters=None, plot=True):
        if parameters != None:
            self.params.update(parameters)
        evals_result = {}
        self.model = xgb.train(params =self.params, dtrain=self.dtrain,
                                       num_boost_round=num_rounds, early_stopping_rounds=50, evals=self.eval_matrix, verbose_eval=5, evals_result=evals_result)
        #if plot:
           # plot_fit(self)


    def add_evalset(self,dataframe):
        """
        Function to add additional dataset for validation during training.
        dataframe must be cleaned before with clean_dataframe function.
        """
        dataframe = cd.clean_dataframe(dataframe)
        new_val_x, new_val_y = cd.X_Y_split(dataframe)
        new_val_mat = xgb.DMatrix(data=new_val_x, label=new_val_y, feature_names=new_val_x.columns)
        self.eval_matrix.append((new_val_mat, "validation_2"))
        self.eval_set.append((new_val_x, new_val_y))

    def predict(self, test_X):
        dtest = xgb.DMatrix(test_X, feature_names=test_X.columns)
        return self.model.predict(dtest)

    def fit(self, x=None, y=None, n_estimators=100, plot=True, save_plot=False,save_path=None):
        if x != None:
            train_x, train_y = (x, y)
        else:
            train_x, train_y = self.eval_set[0]
        self.model.set_params(**{"n_estimators":n_estimators})
        #call_backs =[ xgb.callback.print_evaluation(period=2)]
        self.fitted_model=self.model.fit(train_x, train_y,
                eval_set=self.eval_set , eval_metric="rmse",
                early_stopping_rounds=50)
        if plot:
            plot_fit(self.model, save=save_plot, save_path=save_path)
        return self.fitted_model

    def get_params(self):
        return self.params

    def set_params(self, parameters):
        """
        Parameters should be a list of parameters to update model with.
        """
        self.params.update(parameters)
        """
    def get_tuned_model(self,fit_data, parameters):
        tune_results = tune_parameter(self.model, fit_data, parameters)
        self.tuned_model = tune_results.best_estimator_
        best_params = tune_results.best_params_
        return self.tuned_model
        """

    def tune_model_parameter(self, parameter,  param_range, save_plot=False, save_path=None, randomized=True, n_iter = None, n_splits=5, data_size=0.1, fit_param=False):
        if data_size:
            tune_data = self.train_data.sample(frac=data_size)
        results = tune_parameter(tune_data, parameter,param_range, save_plot=save_plot, save_path=save_path, randomized=randomized, n_iter=n_iter, n_splits=n_splits, estimator=self.model)
        if fit_param:
            self.set_params(results.best_params_)
            self.model.set_params(**results.best_params_)
        return results

    def tune_all_parameters(self, param_distribution, n_iter=10, cv=5, data_size=0.1):
        tune_data = self.train_data.sample(frac=data_size)
        results = tune_all(tune_data,self.model,param_distribution, n_iter=n_iter, n_splits=cv)


# Function for tuning model using some parameters
"""
def tune_parameter(model, data, parameters, n_splits=5):
    train_x = data[0]
    train_y = data[1]
    #param_grid = dict(parameter=parameter_range)
    kfold = KFold(n_splits=n_splits, random_state=10)
    grid_search = GridSearchCV(model, parameters, verbose=0,
                              n_jobs=-1, cv=kfold, scoring="neg_mean_squared_error")
    grid_result = grid_search.fit(train_x, train_y, verbose=0)
    return grid_result
"""

def tune_parameter(data, parameter, param_range, save_plot=False, randomized=False, save_path=None, n_iter = None, n_splits=5,estimator=None):
    """
    Function to tune a parameter using either gridsearch or randomized search with possibility of cross validation.
    Input:
        - data = dataset to be used tuning, usually the training dataset.
        - parameter = string of parameter to be tuned. (works with XGBoost for now)
        - param_range = parameter search space
        - estimator = model to be tuned if existing already, if not a new default XGBRegressor model wil be created
    """

    train_x, train_y = cd.X_Y_split(data)
    param_grid = {parameter : list(param_range)}
    kfold = KFold(n_splits=n_splits, random_state=7)
    if not estimator:
        estimator = xgb.XGBRegressor(objective="reg:squarederror", )
    if randomized:
        assert n_iter != None, "Missing number of iterations"
        param_search = RandomizedSearchCV(estimator,param_grid,n_iter=n_iter,scoring="neg_mean_squared_error",cv=kfold)
    else:
        param_search = GridSearchCV(estimator, param_grid, verbose=0, cv=kfold, scoring="neg_mean_squared_error")
    grid_result = param_search.fit(train_x, train_y, verbose=0)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']

    #for mean, sdev, param in zip(means, stds, params):
        #print("%f (%f) with: %r" % (mean, stdev, param))

    if randomized:
        param_range = [list(i.values())[0] for i in params]


    fig, ax = plt.subplots()
    ax.errorbar(param_range,-1*means, yerr=stds)
    ax.set_title("XGBoost %s vs RMSE" %parameter)
    ax.set_xlabel('%s' %parameter)
    ax.set_ylabel('RMSE')
    if save_plot:
        if save_path:
            fig.savefig("%s/%s.png" %(save_path,parameter))
        else:
            fig.savefig("%s.png" %parameter)
    return grid_result

def plot_fit(model,train_results=None, save=False, save_path=None):
    """
    Function to plot train and test(validation) results of a training or a model fitting
    Input:
        - Results of fit or training
    """
    # plot evolution of training and validation results
    if train_results:
        results =train_results
    else:
        results = model.evals_result()
    epochs = len(results['validation_0']['rmse'])
    x_axis = range(0, epochs)

    # plot RMSE
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0']['rmse'], label='Train', c="b")
    ax.plot(x_axis, results['validation_1']['rmse'], label='Validation', c="r")
    ax.set(title=" Train and Validation RMSE")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("RMSE")
    ax.legend()
    if save:
        if save_path:
            fig.savefig("%s/training_evolution.png"%save_path)
        else:
            fig.savefig("model_evolution.png")


def tune_all(data, estimator, param_grid, n_iter=10,n_splits=5):
    train_x, train_y = cd.X_Y_split(data)
    kfold = KFold(n_splits=n_splits)
    param_search = RandomizedSearchCV(estimator,param_grid,n_iter=n_iter,
                                          scoring="neg_mean_squared_error",cv=kfold)
    grid_result = param_search.fit(train_x, train_y, verbose=0)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result