-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathlinear_regression.py
More file actions
100 lines (80 loc) · 3.65 KB
/
linear_regression.py
File metadata and controls
100 lines (80 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, median_absolute_error
class Linear_Regression:
    """Pick predictors of the ``meantempm`` column and fit a linear model.

    The intended workflow, as far as the methods show, is:

    1. ``get_suitable_predictors`` - keep columns whose correlation with
       ``meantempm`` reaches the configured threshold.
    2. ``further_filtering_the_predictors`` - backward elimination on OLS
       p-values.
    3. ``print_results`` - train/test split, fit, and print error metrics.

    Attributes:
        df: The data frame passed to the constructor; it must contain a
            ``meantempm`` column.
        correlation_value: Minimum correlation with ``meantempm`` a column
            needs in order to be kept as a predictor.
        predictors: Names of the currently selected predictor columns.
    """

    # Threshold used when ``correlation_value`` is None.
    _DEFAULT_CORRELATION = 0.6
    # Columns that are never kept as predictors, whatever their correlation.
    _EXCLUDED_COLUMNS = ('maxtempm', 'mintempm', 'meantempm')
    # Number of subplot columns used by ``plot_graph``.
    _PLOT_COLUMNS = 3

    def __init__(self, df, correlation_value):
        """Store the data frame and the correlation threshold.

        Args:
            df: Data frame holding ``meantempm`` and candidate predictors.
            correlation_value: Minimum correlation with ``meantempm`` for a
                column to be selected. ``None`` falls back to 0.6.
        """
        self.predictors = []
        self.df = df
        self.correlation_value = correlation_value

    def correlation_assessment(self):
        """Return each column's correlation with ``meantempm``, ascending.

        Returns:
            A one-column data frame (``meantempm``) indexed by column name
            and sorted from the lowest to the highest correlation.
        """
        return self.df.corr()[['meantempm']].sort_values('meantempm')

    def print_df_corr(self):
        """Print the full correlation matrix of the data frame."""
        print(self.df.corr())

    def get_suitable_predictors(self):
        """Select the columns that correlate strongly enough with ``meantempm``.

        ``self.predictors`` is rebuilt from scratch on every call, so calling
        the method repeatedly does not accumulate duplicates. ``maxtempm``,
        ``mintempm`` and ``meantempm`` itself are always left out.

        NOTE(review): only positive correlations pass the ``>=`` test, so a
        strongly *negative* correlation is not selected - confirm that this
        is intended.

        Returns:
            A data frame with ``meantempm`` followed by the selected
            predictors when more than three predictors were found, otherwise
            ``False``.
        """
        threshold = (
            self._DEFAULT_CORRELATION
            if self.correlation_value is None
            else self.correlation_value
        )
        target_corr = self.df.corr()['meantempm']

        # Echo every candidate column name, as the method has always done.
        for name in target_corr.index:
            print(name)

        self.predictors = [
            name
            for name, value in target_corr.items()
            if value >= threshold and name not in self._EXCLUDED_COLUMNS
        ]

        df2 = self.df[['meantempm'] + self.predictors]
        if len(self.predictors) > 3:
            return df2
        return False

    def plot_graph(self, df2):
        """Scatter-plot ``meantempm`` against every selected predictor.

        The grid is three plots wide and as tall as needed, so any number of
        predictors can be shown (twelve predictors give a 4 x 3 grid on an
        11 x 15 inch figure). Does nothing when no predictor is selected.

        Args:
            df2: Data frame containing ``meantempm`` and every column named
                in ``self.predictors``.
        """
        feature_count = len(self.predictors)
        if feature_count == 0:
            return

        ncols = self._PLOT_COLUMNS
        nrows = -(-feature_count // ncols)  # ceiling division
        # squeeze=False keeps ``axes`` two-dimensional even for a single row.
        _, axes = plt.subplots(
            nrows=nrows,
            ncols=ncols,
            sharey=True,
            squeeze=False,
            figsize=(11, 3.75 * nrows),
        )
        flat_axes = axes.ravel()

        for position, feature in enumerate(self.predictors):
            axis = flat_axes[position]
            axis.scatter(df2[feature], df2['meantempm'])
            if position % ncols == 0:
                # Only the first plot of each row carries the shared y label.
                axis.set(xlabel=feature, ylabel='meantempm')
            else:
                axis.set(xlabel=feature)

        # Hide the unused cells of the last row.
        for axis in flat_axes[feature_count:]:
            axis.set_visible(False)
        plt.show()

    def further_filtering_the_predictors(self, alpha, df2):
        """Drop predictors by backward elimination on OLS p-values.

        An OLS model of ``meantempm`` on the predictors (plus a constant) is
        refitted repeatedly; each round removes the predictor with the
        largest p-value as long as that p-value exceeds ``alpha``. The
        constant column is never a removal candidate. ``self.predictors`` is
        updated to the surviving predictors.

        Args:
            alpha: Significance level; predictors whose p-value is above it
                are eliminated.
            df2: Data frame containing ``meantempm`` and every column named
                in ``self.predictors``.

        Returns:
            The design matrix (constant column plus surviving predictors).
        """
        x = sm.add_constant(df2[self.predictors])
        y = df2['meantempm']

        while True:
            candidates = [column for column in x.columns if column != 'const']
            if not candidates:
                break
            # Look the p-values up by column name so the constant term and
            # earlier removals cannot shift positions.
            pvalues = sm.OLS(y, x).fit().pvalues[candidates].dropna()
            if pvalues.empty:
                break
            weakest = pvalues.idxmax()
            if pvalues[weakest] <= alpha:
                break
            x = x.drop(weakest, axis=1)

        self.predictors = [column for column in x.columns if column != 'const']
        return x

    def print_results(self, x, y):
        """Fit a linear regression on an 80/20 split and print its metrics.

        Args:
            x: Predictor data frame; a ``const`` column is dropped if present.
            y: Target values aligned with ``x``.
        """
        x = x.drop('const', axis=1, errors='ignore')
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=12
        )
        # Build the model by fitting the regressor to the training data.
        regressor = LinearRegression()
        regressor.fit(x_train, y_train)
        # Predict on the held-out test set.
        prediction = regressor.predict(x_test)
        # Report the accuracy of the predictions; ``score`` is the model's
        # own test-set score.
        print("The Explained Variance: %.2f" % regressor.score(x_test, y_test))
        print("The Mean Absolute Error: %.2f degrees celsius" % mean_absolute_error(y_test, prediction))
        print("The Median Absolute Error: %.2f degrees celsius" % median_absolute_error(y_test, prediction))