Cookbook/EDA&Preprocessing.py at master · JeffMacaluso/Cookbook · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
import sys
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

print(time.strftime('%Y/%m/%d %H:%M'))
print('OS:', sys.platform)
print('CPU Cores:', os.cpu_count())
print('Python:', sys.version)
print('NumPy:', np.__version__)
print('Pandas:', pd.__version__)

# Formatting for seaborn plots
sns.set_context('notebook', font_scale=1.1)
sns.set_style('ticks')

# Displays all dataframe columns
pd.set_option('display.max_columns', None)

%matplotlib inline

#################################################################################################################
##### Exploratory Data Analysis

# Quick EDA report on dataframe
import pandas_profiling
profile = pandas_profiling.ProfileReport(df)
profile.get_rejected_variables(threshold=0.9)  # Rejected variables w/ high correlation
profile.to_file(outputfile='/tmp/myoutputfile.html')  # Saving report as a file

#################################################################################################################
##### Missing Values

# Printing the percentage of missing values per column
def percent_missing(dataframe: pd.DataFrame) -> None:
    '''
    Prints the percentage of missing values for each column in a dataframe
    '''
    # Summing the number of missing values per column and then dividing by the total
    sumMissing = dataframe.isnull().values.sum(axis=0)
    pctMissing = sumMissing / dataframe.shape[0]

    if sumMissing.sum() == 0:
        print('No missing values')
    else:
        # Looping through and printing out each columns missing value percentage
        print('Percent Missing Values:', '\n')
        for idx, col in enumerate(dataframe.columns):
            if sumMissing[idx] > 0:
                print('{0}: {1:.2f}%'.format(col, pctMissing[idx] * 100))


# Plotting missing values
import missingno as msno  # Visualizes missing values
msno.matrix(df)
msno.heatmap(df)  # Co-occurrence of missing values

# Drop missing values
df.dropna(how='any', thresh=None, inplace=True)  # Also 'all' for how, and thresh is an int

# Filling missing values with columnar means
df.fillna(value=df.mean(), inplace=True)

# Filling missing values with interpolation
df.fillna(method='ffill', inplace=True)  #'backfill' for interpolating the other direction

# Filling missing values with a predictive model
def predict_missing_values(data, column, correlationThresh: float = 0.5, cross_validations: int = 3):
    '''
    Fills missing values using a random forest model on highly correlated columns
    Returns a series of the column with missing values filled

    TODO: - Add the option to specify columns to use for predictions
           - Look into other options for handling missing predictors
    '''
    from sklearn.model_selection import cross_val_score
    from sklearn import ensemble

    # Printing number of percentage values missing
    pctMissing = data[column].isnull().values.sum() / data.shape[0]
    print('Predicting missing values for {0}\n'.format(column))
    print('Percentage missing: {0:.2f}%'.format(pctMissing*100))

    # Multi-threading if the dataset is a size where doing so is beneficial
    if data.shape[0] < 100000:
        num_cores = 1  # Single-thread
    else:
        num_cores = -1  # All available cores

    # Instantiating the model
    # Picking a classification model if the number of unique labels are 25 or under
    num_unique_values = len(np.unique(data[column]))
    if num_unique_values > 25 or data[column].dtype != 'category':
        print('Variable is continuous')
        rfImputer = ensemble.RandomForestRegressor(n_estimators=100, n_jobs=num_cores)
    else:
        print('Variable is categorical with {0} classes').format(num_unique_values)
        rfImputer = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=num_cores)

    # Calculating the highly correlated columns to use for the model
    highlyCorrelated = abs(data.corr()[column]) >= correlationThresh

    # Exiting the function if there are not any highly correlated columns found
    if highlyCorrelated.sum() < 2:  # Will always be 1 because of correlation with self
        print('Error: No correlated variables found. Re-try with less a lower correlation threshold')
        return  # Exits the function
    highlyCorrelated = data[data.columns[highlyCorrelated]]
    highlyCorrelated = highlyCorrelated.dropna(how='any')  # Drops any missing records
    print('Using {0} highly correlated features for predictions\n'.format(highlyCorrelated.shape[1]))

    # Creating the X/y objects to use for the
    y = highlyCorrelated[column]
    X = highlyCorrelated.drop(column, axis=1)

    # Evaluating the effectiveness of the model
    cvScore = np.mean(cross_val_score(rfImputer, X, y, cv=cross_validations, n_jobs=num_cores))
    print('Cross Validation Score:', cvScore)

    # Fitting the model for predictions and displaying initial results
    rfImputer.fit(X, y)
    if num_unique_values > 25 or data[column].dtype.name != 'category':
        print('R^2:', rfImputer.score(X, y))
    else:
        print('Accuracy:', rfImputer.score(X, y))

    # Re-filtering the dataset down to highly correlated columns
    # Filling NA predictors w/ columnar mean instead of removing
    X_missing = data[highlyCorrelated.columns]
    X_missing = X_missing.drop(column, axis=1)
    X_missing = X_missing.fillna(X_missing.mean())

    # Filtering to rows with missing values before generating predictions
    missingIndexes = data[data[column].isnull()].index
    X_missing = X_missing.iloc[missingIndexes]

    # Predicting the missing values
    predictions = rfImputer.predict(X_missing)

    # Preventing overwriting of original dataframe
    data = data.copy()

    # Looping through the missing values and replacing with predictions
    for i, idx in enumerate(missingIndexes):
        data.set_value(idx, column, predictions[i])

    return data[column]


df[colName] = predict_missing_values(df, colName)

#################################################################################################################
##### Outliers

# TODO: - Add docstrings to functions
#       - Add other functions (GESD, local outlier factor, isolation forests, etc.)

# Detecting outliers with Interquartile Range (IQR)
# Note: The function in its current form is taken from Chris Albon's Machine Learning with Python Cookbook
def iqr_indices_of_outliers(X: np.ndarray) -> np.ndarray:
    '''
    Detects outliers using the interquartile range (IQR) method

    Input: An array of a variable to detect outliers for
    Output: An array with indices of detected outliers
    '''
    q1, q3 = np.percentile(X, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    outlier_indices = np.where((X > upper_bound) | (X < lower_bound))
    return outlier_indices


# Detecting outliers with Z scores
def z_score_indices_of_outliers(X: np.ndarray, threshold: float = 3) -> np.ndarray:
    '''
    Detects outliers using the Z score method method

    Input: - X: An array of a variable to detect outliers for
           - threshold: The number of standard deviations from the mean
                        to be considered an outlier

    Output: An array with indices of detected outliers
    '''
    X_mean = np.mean(X)
    X_stdev = np.std(X)
    z_scores = [(y - X_mean) / X_stdev for y in X]
    outlier_indices = np.where(np.abs(z_scores) > threshold)
    return outlier_indices


# Detecting outliers with the Elliptical Envelope method
def ellipses_indices_of_outliers(X: np.ndarray, contamination: float = 0.1) -> np.ndarray:
    '''
    Detects outliers using the elliptical envelope method

    Input: An array of all variables to detect outliers for
    Output: An array with indices of detected outliers
    '''
    from sklearn.covariance import EllipticEnvelope

    # Copying to prevent changes to the input array
    X = X.copy()

    # Dropping categorical columns
    non_categorical = []
    for feature in range(X.shape[1]):
        num_unique_values = len(np.unique(X[:, feature]))
        if num_unique_values > 30:
            non_categorical.append(feature)
    X = X[:, non_categorical]  # Subsetting to columns without categorical indexes

    # Testing if there are an adequate number of features
    if X.shape[0] < X.shape[1] ** 2.:
        print('Will not perform well. Reduce the dimensionality and try again.')
        return

    # Creating and fitting the detector
    outlier_detector = EllipticEnvelope(contamination=contamination)
    outlier_detector.fit(X)

    # Predicting outliers and outputting an array with 1 if it is an outlier
    outliers = outlier_detector.predict(X)
    outlier_indices = np.where(outliers == -1)
    return outlier_indices


# Detecting outliers with the Isolation Forest method
def isolation_forest_indices_of_outliers(X, contamination='auto', n_estimators=100):
    '''
    Detects outliers using the isolation forest method

    Inputs:
        - X (array or data frame): Non-categorical variables to detect outliers for
        - Contamination (float or 'auto'): The percentage of outliers
        - n_estimators (int): The number of treess to use in the isolation forest
    Output: An array with indices of detected outliers
    '''
    from sklearn.ensemble import IsolationForest

    # Copying to prevent changes to the input array
    X = X.copy()

    # Creating and fitting the detector
    outlier_detector = IsolationForest(contamination=contamination,
                                       n_estimators=n_estimators,
                                       behaviour='new',
                                       n_jobs=-1)
    outlier_detector.fit(X)

    # Predicting outliers and outputting an array with 1 if it is an outlier
    outliers = outlier_detector.predict(X)
    outlier_indices = np.where(outliers == -1)
    return outlier_indices

outlier_indexes_forest = helper.isolation_forest_indices_of_outliers(X.select_dtypes(exclude='category'),
                                                              contamination='auto')
print('Outliers detected: {0}'.format(len(outlier_indexes_forest[0])))


# Detecting outliers with the One Class SVM method
def one_class_svm_indices_of_outliers(X):
    '''
    Detects outliers using the one class SVM method

    Input: An array of all variables to detect outliers for
    Output: An array with indices of detected outliers
    '''
    from sklearn.svm import OneClassSVM

    # Copying to prevent changes to the input array
    X = X.copy()

    # Dropping categorical columns
    non_categorical = []
    for feature in range(X.shape[1]):
        num_unique_values = len(np.unique(X[:, feature]))
        if num_unique_values > 30:
            non_categorical.append(feature)
    X = X[:, non_categorical]  # Subsetting to columns without categorical indexes

    # Testing if there are an adequate number of features
    if X.shape[0] < X.shape[1] ** 2.:
        print('Will not perform well. Reduce the dimensionality and try again.')
        return

    # Creating and fitting the detector
    outlier_detector = OneClassSVM()
    outlier_detector.fit(X)

    # Predicting outliers and outputting an array with 1 if it is an outlier
    outliers = outlier_detector.predict(X)
    outlier_indices = np.where(outliers == -1)
    return outlier_indices


outlier_report(df)['feature']['Outlier type']  # Returns array of indices for outliers
# or
outlier_report(df)['Multiple feature outlier type']  # Returns array of indices for outliers

#################################################################################################################
##### Preprocessing

# One-hot encoding multiple columns
df_encoded = pd.get_dummies(df, columns=['a', 'b', 'c'], drop_first=True)

# Converting a categorical column to numbers
df['TargetVariable'].astype('category').cat.codes

# Scaling from 0 to 1
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)  # Scaling from 0 to 1 across columns

# Normalizing w/ standard scaling
from sklearn import preprocessing
normalizer = preprocessing.StandardScaler()
X_normalized = normalizer.fit_transform(X)  # Normalizing across columns

# Grouping by multiple levels and getting the percentage of the second level by the first level
df.groupby(['level_1', 'level_2']).size().groupby(level='level_1').apply(lambda x: x / x.sum())

# Principal Component Analysis (PCA)
def fit_PCA(X: np.ndarray, num_components: float = 0.99) -> np.ndarray:
    '''
    Performs min-max normalization and PCA transformation on the input data array

    Inputs:
        - X: An array of values to perform PCA on
        - num_components: If >1, the number of principal components desired
                          If <1, the percentage of variance explained desired

    Outputs:
        - An array of the principal components

    TODO: Add check if data is already normalized
    '''
    from sklearn import preprocessing
    from sklearn.decomposition import PCA

    # Checking if the input is a numpy array and converting it if not
    if type(X) != np.ndarray:
        X = np.array(X)

    # Normalizing data before PCA
    min_max_scaler = preprocessing.MinMaxScaler()
    X_norm = min_max_scaler.fit_transform(X)

    # Performing PCA
    pca = PCA(n_components=num_components)
    pca.fit(X_norm)

    # Reporting explained variance
    explained_variance = pca.explained_variance_ratio_ * 100
    print('Total variance % explained:', sum(explained_variance))
    print()
    print('Variance % explained by principal component:')
    for principal_component in range(len(explained_variance)):
        print(principal_component, ':', explained_variance[principal_component])

    # Transforming the data before returning
    principal_components = pca.transform(X_norm)
    return principal_components


# Oversampling
def oversample_binary_label(dataframe: pd.DataFrame, label_column: str) -> pd.DataFrame:
    '''
    Oversamples a dataframe with a binary label to have an equal proportion in classes. Dynamically
    determines the label with the lower proportion.

    Inputs:
        - dataframe: A dataframe containing the label
        - label_column: A string of the column containing the label
    Output: A dataframe with the lower proportion label oversampled

    TODO: Update this to oversample the training set and return both the training and testing sets
    '''

    # Counting the classes
    class_0_count, class_1_count = dataframe[label_column].value_counts()

    # Creating two dataframes for each class
    dataframe_class_0 = dataframe[dataframe[label_column] == dataframe[label_column].unique()[0]]
    dataframe_class_1 = dataframe[dataframe[label_column] == dataframe[label_column].unique()[1]]

    # Determining the smaller class
    smaller_label = dataframe[label_column].value_counts().idxmin()

    # Oversampling
    if smaller_label == 0:
        dataframe_class_0_oversampled = dataframe_class_0.sample(class_1_count, replace=True)
        dataframe_oversampled = pd.concat([dataframe_class_1, dataframe_class_0_oversampled], axis=0)
    else:
        dataframe_class_1_oversampled = dataframe_class_1.sample(class_0_count, replace=True)
        dataframe_oversampled = pd.concat([dataframe_class_0, dataframe_class_1_oversampled], axis=0)

    # Printing results
    print('Initial number of observations in each class:')
    print(dataframe[label_column].value_counts())
    print()

    print('Oversampled number of observations in each class:')
    print(dataframe_oversampled[label_column].value_counts())

    return dataframe_oversampled

# Oversampling with SMOTE
def oversample_smote(training_features, training_labels, is_dataframe=True):
    '''
    Convenience function for oversampling with SMOTE. This generates synthetic samples via interpolation.
    Automatically encodes categorical columns if a dataframe is provided with categorical columns properly marked.

    Input: The training features and labels. is_dataframe is for checking for categorical columns.
    Output: The oversampled training features and labels
    '''
    from imblearn import over_sampling

    if is_dataframe == True:
        # Testing if there are any categorical columns
        # Note: These must have the "category" datatype
        categorical_variable_list = training_features.select_dtypes(exclude=['number', 'bool_', 'object_']).columns
        if categorical_variable_list.shape[0] > 0:
            categorical_variable_list = list(categorical_variable_list)
            categorical_variable_indexes = training_features.columns.get_indexer(categorical_variable_list)
            smote = over_sampling.SMOTENC(categorical_features=categorical_variable_indexes, random_state=46, n_jobs=-1)
        else:
            smote = over_sampling.SMOTE(random_state=46, n_jobs=-1)
    else:
        smote = over_sampling.SMOTE(random_state=46, n_jobs=-1)

    # Performing oversampling
    training_features_oversampled, training_labels_oversampled = smote.fit_sample(training_features, training_labels)

    # Rounding discrete variables for appropriate cutoffs
    # This is becuase SMOTE NC only deals with binary categorical variables, not discrete variables
    if is_dataframe == True:
        discrete_variable_list = training_features.select_dtypes(include=['int', 'int32', 'int64']).columns
        if discrete_variable_list.shape[0] > 0:
            discrete_variable_indexes = training_features.columns.get_indexer(discrete_variable_list)
            for discrete_variable_index in discrete_variable_indexes:
                training_features_oversampled[:, discrete_variable_index] = np.round(training_features_oversampled[:, discrete_variable_index].astype(float)).astype(int)

    print('Previous training size:', len(training_labels))
    print('Oversampled training size', len(training_labels_oversampled), '\n')
    print('Previous label mean:', training_labels.astype(int).mean())
    print('Oversampled label mean:', training_labels_oversampled.mean())

    return training_features_oversampled, training_labels_oversampled

X_train_oversampled, y_train_oversampled = oversample_smote(X_train, y_train)


# Target mean encoding
def target_encode(train_variable, test_variable, train_label, smoothing=1, min_samples_leaf=1, noise_level=0):
    '''
    Mean target encoding using Daniele Micci-Barreca's technique from the following paper:
    http://helios.mm.di.uoa.gr/~rouvas/ssi/sigkdd/sigkdd.vol3.1/barreca.pdf

    This function heavily borrows code from Olivier's Kaggle post:
    https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features

    Inputs:
        - train_variable (Series): Variable in the training set to perform the encoding on.
        - test_variable (Series): Variable in the testing set to be transformed.
        - train_label (Series): The label in the training set to use for performing the encoding.
        - smoothing (int): Balances the categorical average vs. the prior.
        - min_samples_leaf (int): The minimum number of samples to take the category averagesinto account.
        - noise_level (int): Amount of Gaussian noise to add in order to help prevent overfitting.
    '''

    def add_noise(series, noise_level):
        '''
        Adds Gaussian noise to the data
        '''
        return series * (1 + noise_level * np.random.randn(len(series)))

    assert len(train_variable) == len(train_label)
    assert train_variable.name == test_variable.name

    # Creating a data frame out of the training variable and label in order to get the averages of the label
    # for the training variable
    temp = pd.concat([train_variable, train_label], axis=1)

    # Computing the target mean
    averages = temp.groupby(train_variable.name)[train_label.name].agg(['mean', 'count'])

    # Computing the smoothing
    smoothing = 1 / (1 + np.exp(-(averages['count'] - min_samples_leaf) / smoothing))

    # Calculating the prior before adding the smoothing
    prior = train_label.mean()

    # Adding the smoothing to the prior to get the posterior
    # Larger samples will take the average into account less
    averages[train_label.name] = prior * (1 - smoothing) + averages['mean'] * smoothing

    # Applying the averages to the training variable
    fitted_train_variable = pd.merge(
        train_variable.to_frame(train_variable.name),
        averages.reset_index().rename(columns={'index': train_label.name, train_label.name: 'average'}),
        on=train_variable.name, how='left')
    fitted_train_variable = fitted_train_variable['average'].rename(train_variable.name + '_mean').fillna(prior)
    fitted_train_variable.index = train_variable.index  # Restoring the index lost in pd.merge

    # Applying the averages to the testing variable
    fitted_test_variable = pd.merge(
        test_variable.to_frame(test_variable.name),
        averages.reset_index().rename(columns={'index': train_label.name, train_label.name: 'average'}),
        on=test_variable.name, how='left')
    fitted_test_variable = fitted_test_variable['average'].rename(test_variable.name + '_mean').fillna(prior)
    fitted_test_variable.index = fitted_test_variable.index  # Restoring the index lost in pd.merge

    # Adding the noise if there is any
    if noise_level != 0:
        fitted_train_variable = add_noise(fitted_train_variable, noise_level)
        fitted_test_variable = add_noise(fitted_test_variable, noise_level)
    return fitted_train_variable, fitted_test_variable


# Percentage of total within group
df['numbers'] / df.groupby('group')['numbers'].transform('sum')

# Groupby two levels and get the percentage of rows for the second level within groups of the first level
df.groupby(['first_level', 'second_level']).size().groupby(level=0).apply(lambda x:100 * x / float(x.sum()))