-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmedical_appointment.py
More file actions
113 lines (93 loc) · 4.86 KB
/
medical_appointment.py
File metadata and controls
113 lines (93 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#! python 3
# medical_appointment.py -
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
# read csv file into list in python
path = '../data/KaggleV2-May-2016.csv'
data = pd.read_csv(path)
# plot features as bar charts
def features_plots(vars):
plt.figure(figsize=(15, 24.5))
for i, dv in enumerate(vars):
plt.subplot(7, 2, i + 3)
data[dv].value_counts().plot(kind='bar', title=dv)
plt.ylabel('Frequency')
plt.show()
def model_performance(model_name, y_test, y_pred):
print('Model name: %s' % model_name)
print('Test accuracy (Accuracy Score): %f' % metrics.accuracy_score(y_test, y_pred))
print('Test accuracy (ROC AUC Score): %f' % metrics.roc_auc_score(y_test, y_pred))
fpr, tpr, thresholds = metrics.precision_recall_curve(y_test, y_pred)
print('Area Under the Precision-Recall Curve: %f' % metrics.auc(fpr, tpr))
print('\n')
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, y_pred)
roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# vars = ['Gender', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show']
# features_plots(vars)
data = data[data['Age'] >= 0] # Removing Observations with Negative Age Values
del data['Handcap'] # Removing Variable Named ‘Handicap’ from the Dataset
# vars = ['Gender', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'SMS_received', 'No-show']
# features_plots(vars)
# Breaking Date Features(ScheduledDay, AppointmentDay) into Date Components
for col in ['ScheduledDay', 'AppointmentDay']:
for i, component in enumerate(['year', 'month', 'day']):
data['%s_%s' % (col, component)] = data[col].apply(lambda x: int(x.split('T')[0].split('-')[i]))
# Breaking ScheduledDay into Time Components
for j, component in enumerate(['hour', 'min', 'sec']):
data['%s_%s' % ('Scheduled', component)] = \
data['ScheduledDay'].apply(lambda x: int(x.split('T')[1][:-1].split(':')[j]))
# Features in Gender change to integers(F-0, M-1)
data['Gender'] = data['Gender'].apply(lambda x: int(0.0) if(x == 'F') else int(1.0))
# Features in No-show change to integers(no-0, yes-1)
data['Label'] = data['No-show'].apply(lambda x: int(0.0) if(x == 'No') else int(1.0))
selected_features = ['Gender', 'Age',
'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'SMS_received',
'ScheduledDay_year', 'ScheduledDay_month', 'ScheduledDay_day',
'AppointmentDay_year', 'AppointmentDay_month', 'AppointmentDay_day',
'Scheduled_hour', 'Scheduled_min', 'Scheduled_sec']
features = np.array(data[selected_features])
labels = np.array(data['Label'])
featTrain, featTest = np.array_split(features, [int(features.shape[0] * 0.7)])
labelTrain, labelTest = np.array_split(labels, [int(features.shape[0] * 0.7)])
# Train the model by applying decision tree
dt = DecisionTreeClassifier()
dt.fit(featTrain, labelTrain)
dtPred = dt.predict(featTest)
model_performance('Decision Tree Classifier', labelTest, dtPred)
# Train the model by applying Kernel Approximation with SGD Classify
rbf_feature = RBFSampler(gamma=1, random_state=1) #The RBFSampler constructs an approximate mapping for the radial basis function kernel
featTrainSGD = rbf_feature.fit_transform(featTrain)
sgd = SGDClassifier(max_iter= 1000, tol= 0.001)
sgd.fit(featTrainSGD, labelTrain)
featTestSGD = rbf_feature.fit_transform(featTest)
sgdPred = sgd.predict(featTestSGD)
model_performance('SGD Classifier', labelTest, sgdPred)
# Training RandomForest Classifier on Training Dataset
rf = RandomForestClassifier()
rf.fit(featTrain, labelTrain)
rdPred = rf.predict(featTest)
model_performance('Random Forest Classifier', labelTest, rdPred)
# Training the Model by Applying Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=10, learning_rate=0.1, n_estimators=200, max_depth=5, max_features=10)
gb.fit(featTrain, labelTrain)
gbPred = gb.predict(featTest)
model_performance('Gradient Boosting Classifier', labelTest, gbPred)
# Printing Features’ Weight as Assigned by Gradient Boosting Classifier
for feature, score in zip(selected_features, list(gb.feature_importances_)):
print('%s\t%f'%(feature, score))