-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjupiter.py
More file actions
134 lines (98 loc) · 5.54 KB
/
jupiter.py
File metadata and controls
134 lines (98 loc) · 5.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import sqlite3
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
class Moons:
    """Load the ``moons`` table of a SQLite database and analyse it.

    The table is read into ``self.data`` (a pandas DataFrame) on
    construction.  The methods below use the columns ``moon``, ``group``,
    ``period_days`` and ``distance_km``.
    """

    # Physical constants used by linear_regression_model().
    GRAVITATIONAL_CONSTANT = 6.67e-11  # m^3 kg^-1 s^-2
    JUPITER_MASS_KG = 1.899e+27  # literature value for Jupiter's mass, kg

    def __init__(self, db: str) -> None:
        """Build the database URI for *db* and load the moons table.

        Args:
            db: Path of the SQLite database file (e.g. ``jupiter.db``).
        """
        database_service = "sqlite"
        self.connectable = f"{database_service}:///{db}"
        self.load_data()

    def load_data(self) -> None:
        """Read every row of the ``moons`` table into ``self.data``."""
        query = "SELECT * FROM moons"
        # A string connectable is interpreted by pandas as a database URI.
        self.data = pd.read_sql(query, self.connectable)

    def stats(self, decimal_places: int = 2) -> pd.DataFrame:
        """Return ``self.data.describe()`` rounded to *decimal_places*."""
        return self.data.describe().round(decimal_places)

    def correlations(self, decimal_places: int = 2) -> pd.DataFrame:
        """Return pairwise correlation coefficients, rounded.

        Only numeric/boolean columns take part: text columns such as
        ``moon`` and ``group`` cannot be correlated and make
        ``DataFrame.corr`` raise on recent pandas releases.
        """
        numeric = self.data.select_dtypes(include=["number", "bool"])
        return numeric.corr().round(decimal_places)

    def plot_hist(self) -> None:
        """Show a 20-bin histogram of ``distance_km``."""
        plt.hist(self.data['distance_km'], bins=20, color='skyblue',
                 edgecolor='black')
        plt.title('Distribution of Moon Distances from Jupiter')
        plt.xlabel('Distance (km)')
        plt.ylabel('Frequency')
        plt.show()

    def group_plot(self) -> None:
        """Show period-vs-distance by group, then one histogram per group."""
        # One scatter series per moon group so each gets its own colour and
        # legend entry.
        for group in self.data['group'].unique():
            group_data = self.data[self.data['group'] == group]
            plt.scatter(group_data['period_days'], group_data['distance_km'],
                        label=group)
        plt.xlabel('Period Days')
        plt.ylabel('Distance km')
        plt.legend(title='Groups', loc='lower right')
        plt.title('Scatter Plot of Period Days vs Distance km by Group')
        plt.show()

        # Faceted distance histograms, one column per group.
        sns.displot(data=self.data, x="distance_km", hue="group", col="group")
        plt.show()

    def moon(self, moon_name: str) -> pd.DataFrame:
        """Return the rows whose ``moon`` column equals *moon_name*.

        The result is an empty DataFrame when no moon has that name.
        """
        return self.data[self.data['moon'] == moon_name]

    def linear_regression_model(self) -> None:
        """Fit T^2 against a^3 and estimate Jupiter's mass from the slope.

        Kepler's third law gives ``T^2 = (4 * pi^2 / (G * M)) * a^3``, so a
        straight line through the origin fitted to T^2 (s^2) against a^3
        (m^3) has gradient ``4 * pi^2 / (G * M)`` and therefore
        ``M = 4 * pi^2 / (G * gradient)``.

        Side effects: adds the columns ``T_seconds``, ``T2``, ``a_m`` and
        ``a3`` to ``self.data``, shows four figures and prints the fit
        metrics and the mass estimate.
        """
        from sklearn.metrics import mean_squared_error, r2_score

        # Convert to SI units: period in seconds, distance in metres.
        self.data['T_seconds'] = self.data['period_days'] * 24 * 60 * 60
        self.data['T2'] = self.data['T_seconds'] ** 2
        self.data['a_m'] = self.data['distance_km'] * 1000
        self.data['a3'] = self.data['a_m'] ** 3

        Y = self.data['T2'].values
        # scikit-learn expects a 2-D feature matrix.
        X = self.data['a3'].values.reshape(-1, 1)

        # Exploratory plots on the full data set.
        sns.regplot(data=self.data, y='T2', x='a3', scatter_kws={'s': 15},
                    line_kws={'linewidth': 1})
        plt.title("Scatter Plot")
        plt.show()
        sns.residplot(data=self.data, y='T2', x='a3')
        plt.title("Residual Plot")
        plt.show()

        # Hold out 30% of the data for testing; train on the rest.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.3, random_state=42)

        # fit_intercept=False forces the line through the origin, because
        # Kepler's third law has no constant term.
        model = LinearRegression(fit_intercept=False)
        model.fit(x_train, y_train)
        pred = model.predict(x_test)

        f, a0 = plt.subplots(figsize=(15, 15))
        a0.scatter(self.data["a3"], self.data["T2"], label="Actual")
        # Draw the fitted line left-to-right rather than in shuffled order.
        order = np.argsort(x_test[:, 0])
        a0.plot(x_test[order, 0], pred[order], 'r--', label="Predicted")
        plt.title("Scatter Plot with Predicted Model")
        plt.legend()
        a0.set_xlabel("a3")
        a0.set_ylabel("T2")
        plt.show()

        fig, ax = plt.subplots()
        ax.plot(x_test, y_test - pred, '.')
        # Horizontal line at zero to guide the eye.
        ax.axhline(0, color='k', linestyle='dashed')
        # The x values plotted here are the a^3 test features.
        ax.set_xlabel("a3")
        ax.set_ylabel("Residuals")
        ax.set_title("Residual Plot with Predicted Model")
        plt.show()

        mse = mean_squared_error(y_test, pred)
        # Take the square root explicitly: the ``squared=False`` shortcut is
        # not accepted by newer scikit-learn releases.
        rmse = np.sqrt(mse)

        print(f"unweighted model r2_score: {r2_score(y_test,pred)}")
        print(f"unweighted model mean squared error: {mse}")
        gradient = model.coef_[0]
        print("gradient from model: ", gradient)
        print("intercept from model:", model.intercept_)
        print(f"root mean squared error: {rmse}")
        print(f"4π²/GM is = {gradient}")

        G = self.GRAVITATIONAL_CONSTANT
        M = (4 * np.pi ** 2) / (G * gradient)
        print(f"The predicted mass of jupiter from my model is: {M}kg")
        actm = self.JUPITER_MASS_KG
        print("The actual mass of jupiter is 1.899e+27 kg")
        print(f"So the actual mass of jupiter is {((actm)/(M))} times heavier than my estimation")