-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbus_nuke_ml.py
More file actions
128 lines (125 loc) · 6.41 KB
/
bus_nuke_ml.py
File metadata and controls
128 lines (125 loc) · 6.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Configuration shared by route_prediction and its helpers.
_N_LAGS = 18                 # lagged month-over-month differences fed to the LSTM
_N_OLS_LAGS = 17             # the OLS pre-screen uses lag_1 .. lag_17 only
_HOLDOUT_MONTHS = 6          # trailing months that are held out and forecast
_MIN_ADJ_R2 = 0.6            # routes whose OLS adjusted R^2 is below this are skipped
_MIN_ACCURACY = 85           # routes whose mean hold-out accuracy (%) is below this are skipped
# NOTE(review): the 1.05 divisor is applied to every forecast; it looks like an
# empirical calibration factor -- confirm with the model owner.
_CALIBRATION_DIVISOR = 1.05
# NOTE(review): month labels are fixed, i.e. the data is assumed to end in
# December -- confirm against the input.
_HOLDOUT_MONTH_NAMES = ["July", "August", "September", "October", "November", "December"]
_OUTPUT_CSV = 'final_bus.csv'


def _lag_name(k):
    """Return the column name used for the k-th lag of ``diff``."""
    return 'lag_' + str(k)


def _monthly_totals(route_rows):
    """Aggregate one route to a row per (year, month) with ``prev`` and ``diff``."""
    totals = route_rows.groupby(['year', 'month'], as_index=False)['MonthTotal'].sum()
    totals['prev'] = totals['MonthTotal'].shift(1)
    # The first month has no predecessor, so it cannot carry a difference.
    totals = totals.dropna().reset_index(drop=True)
    totals['diff'] = totals['MonthTotal'] - totals['prev']
    return totals


def _lagged_diffs(totals):
    """Build the supervised frame: ``diff`` first, then lag_1 .. lag_18."""
    lagged = totals[['diff']].copy()
    for k in range(1, _N_LAGS + 1):
        lagged[_lag_name(k)] = lagged['diff'].shift(k)
    return lagged.dropna().reset_index(drop=True)


def _ols_adjusted_r2(lagged):
    """Return the adjusted R^2 of an OLS fit of ``diff`` on lag_1 .. lag_17."""
    import statsmodels.formula.api as smf

    regressors = ' + '.join(_lag_name(k) for k in range(1, _N_OLS_LAGS + 1))
    return smf.ols(formula='diff ~ ' + regressors, data=lagged).fit().rsquared_adj


def _lstm_forecast_diffs(lagged):
    """Train the LSTM on all but the last six rows and predict their ``diff``."""
    from sklearn.preprocessing import MinMaxScaler
    from keras.models import Sequential
    from keras.layers import LSTM, Dense

    # Select columns by name so the target is guaranteed to sit in column 0;
    # everything below slices the array positionally.
    columns = ['diff'] + [_lag_name(k) for k in range(1, _N_LAGS + 1)]
    values = lagged[columns].values
    train_set, test_set = values[:-_HOLDOUT_MONTHS], values[-_HOLDOUT_MONTHS:]

    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_set)
    train_scaled = scaler.transform(train_set)
    test_scaled = scaler.transform(test_set)

    # The LSTM receives (samples, 1 time step, lag features).
    X_train, y_train = train_scaled[:, 1:], train_scaled[:, 0:1]
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = test_scaled[:, 1:]
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    model = Sequential()
    model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1, shuffle=False)
    y_pred = model.predict(X_test, batch_size=1)

    # The scaler was fitted on [diff, lags...] rows, so rebuild rows of that
    # shape around the predictions before undoing the scaling.
    scaled_rows = np.concatenate([y_pred, test_scaled[:, 1:]], axis=1)
    return scaler.inverse_transform(scaled_rows)[:, 0]


def _forecast_route(route_rows):
    """Forecast the last six months of one route.

    Returns ``None`` when the route is skipped (too little history, OLS
    adjusted R^2 below 0.6, or mean accuracy below 85), otherwise a tuple
    ``(predictions, per_month_accuracy, totals_twelve_months_earlier)`` of
    three six-element lists.
    """
    totals = _monthly_totals(route_rows)
    lagged = _lagged_diffs(totals)
    if len(lagged) <= _HOLDOUT_MONTHS:
        # Nothing would be left to train on after holding out six months.
        return None

    adj_r2 = _ols_adjusted_r2(lagged)
    if np.isnan(adj_r2) or adj_r2 < _MIN_ADJ_R2:
        return None

    predicted_diffs = _lstm_forecast_diffs(lagged)
    holdout = totals[-_HOLDOUT_MONTHS:]
    actual = holdout['MonthTotal'].values.astype(float)
    # diff is defined as MonthTotal - prev, so a forecast total is the
    # previous month's total plus the predicted difference.
    previous = holdout['prev'].values.astype(float)
    predicted = np.array([
        int(diff + base) / _CALIBRATION_DIVISOR
        for diff, base in zip(predicted_diffs, previous)
    ])

    pct_error = np.abs(actual - predicted) / actual * 100
    mean_accuracy = 100 - pct_error.mean()
    if not mean_accuracy >= _MIN_ACCURACY:  # also rejects NaN
        return None

    # Same six calendar months, one year before the hold-out window.
    year_before = list(totals[-(_HOLDOUT_MONTHS + 12):-12].MonthTotal)
    return list(predicted), list(100 - pct_error), year_before


def _classify_change(ratio):
    """Label a predicted/previous-year ridership ratio."""
    if ratio < .8:
        return "Heavy Decrease"
    if ratio < .9:
        return "Slight Decrease"
    if ratio <= 1.1:
        return "Almost Same"
    if ratio <= 1.2:
        return "Slight Increase"
    if ratio > 1.2:
        return "Heavy Increase"
    return np.nan  # NaN ratio: leave the row unlabelled


def route_prediction(monthly):
    """Forecast the last six months of ridership for every route.

    For each unique value of ``monthly.route`` the rows are summed to monthly
    totals, screened with an OLS fit on lagged differences, and, if the
    screen passes, forecast with a small stateful LSTM. Routes whose mean
    hold-out accuracy is below 85 are discarded.

    Parameters
    ----------
    monthly : pandas.DataFrame
        Must provide the columns ``route``, ``year``, ``month`` and
        ``MonthTotal``.

    Returns
    -------
    pandas.DataFrame
        Six rows per accepted route with the columns ``Month``, ``route``,
        ``Predicted_Ridership_2019``, ``Ridership_2018``, ``Classification``
        and ``Accuracy_of_Prediction``. The same table is written to
        ``final_bus.csv`` in the current working directory.
    """
    route_names = []
    predictions = []
    accuracies = []
    previous_year = []

    # Iterate over whatever routes are present instead of a fixed count.
    for route_id in monthly.route.unique():
        outcome = _forecast_route(monthly.loc[monthly.route == route_id])
        if outcome is None:
            continue
        predicted, per_month_accuracy, year_before = outcome
        route_names.extend([route_id] * len(predicted))
        predictions.extend(predicted)
        accuracies.extend(per_month_accuracy)
        previous_year.extend(year_before)

    final = pd.DataFrame({
        'route': route_names,
        'pred19': predictions,
        'act18': previous_year,
        'accuracy': accuracies,
    })

    # Classify on the un-truncated forecast, then store it as an integer.
    ratios = final['pred19'].astype(float) / final['act18'].astype(float)
    final['class'] = [_classify_change(ratio) for ratio in ratios]
    final['pred19'] = final['pred19'].astype(int)
    final['month'] = _HOLDOUT_MONTH_NAMES * (len(final) // _HOLDOUT_MONTHS)

    final = final[["month", "route", "pred19", "act18", "class", "accuracy"]]
    final = final.rename(columns={
        'month': 'Month',
        'pred19': 'Predicted_Ridership_2019',
        'act18': 'Ridership_2018',
        'class': 'Classification',
        'accuracy': 'Accuracy_of_Prediction',
    })
    final.to_csv(_OUTPUT_CSV, index=False)
    return final