-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdb.py
More file actions
94 lines (79 loc) · 3.03 KB
/
db.py
File metadata and controls
94 lines (79 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegressionModel
class Database:
    """
    Main database class.

    Keeps the players and teams tables as Pandas dataframes and the
    model predictions as a Spark dataframe, and exposes lookup helpers
    over them.
    """

    def __init__(self, spark):
        """
        Load the players/teams CSV files, the pickled prediction input
        and the saved regression model from the ``data/`` directory.

        {spark} must expose ``spark_ctx`` (used for ``pickleFile``) and
        ``spark`` (used for ``createDataFrame``) -- presumably a
        SparkContext and a SparkSession; confirm against the caller.
        """
        print("Loading players dataset")
        self.players = pd.read_csv("data/players.csv")
        print(self._format_columns(self.players))

        print("Loading teams dataset")
        self.teams = pd.read_csv("data/teams.csv")
        print(self._format_columns(self.teams))

        print("Loading predictions pickle...")
        # NOTE: pickleFile unpickles the file contents, so data/df.pkl
        # must come from a trusted source.
        pred_rows = spark.spark_ctx.pickleFile("data/df.pkl").collect()
        self.pred = spark.spark.createDataFrame(pred_rows)
        self.model = LinearRegressionModel.load("data/model")
        self.predictions = self.model.transform(self.pred)
        self._normalize_predictions_df()
        print("Database ready")

    @staticmethod
    def _format_columns(frame):
        """
        Returns a one-line, human readable listing of the columns
        of the given Pandas dataframe {frame}
        """
        # Join the labels explicitly: "text" + frame.columns would be
        # broadcast over the Index and prefix every single label instead
        # of producing one message.
        return "Columns: " + ", ".join(str(label) for label in frame.columns)

    def get_player_by_name(self, name, season):
        """
        Returns a Pandas dataframe object containing
        the player characterized by the given {name} and {season}

        {name} is compared with the "short_name" column and {season}
        with the "season" column; the result is empty when no row matches.
        """
        matches = (self.players["short_name"] == name) & (
            self.players["season"] == season
        )
        return self.players.loc[matches]

    def get_players_in_team(self, name, season, compact=False):
        """
        Returns a Pandas dataframe object containing
        the players that played with the club {name} in {season}

        When {compact} is true only the "short_name" column is returned
        (as a Pandas series) instead of the full rows.
        """
        matches = (self.players["club_name"] == name) & (
            self.players["season"] == season
        )
        players = self.players["short_name"] if compact else self.players
        return players.loc[matches]

    def get_team_by_name(self, name, season):
        """
        Returns a Pandas dataframe object containing
        the team characterized by the given {name} and {season}

        {name} is compared with the "club_name" column and {season}
        with the "year" column of the teams table.
        """
        matches = (self.teams["club_name"] == name) & (
            self.teams["year"] == season
        )
        return self.teams.loc[matches]

    def get_table(self, league, season, compact=False):
        """
        Returns a Pandas dataframe object containing
        the table for the given {league} and {season},
        sorted by ascending "place"

        When {compact} is true only the "club_name_ext", "points" and
        "place" columns are kept.
        """
        matches = (self.teams["league"] == league) & (
            self.teams["year"] == season
        )
        if compact:
            teams = self.teams[["club_name_ext", "points", "place"]]
        else:
            teams = self.teams
        # The mask is built on the full table; it still applies to the
        # compact view because both share the same row index.
        return teams.loc[matches].sort_values(by=["place"], ascending=True)

    def get_prediction(self, league, season):
        """
        Returns a Pandas dataframe object containing
        the predicted table for the given {league} and {season},
        ordered by descending "prediction"
        """
        selected = self.predictions.select(
            "season", "club_name", "league", "points", "prediction"
        )
        filtered = selected.where(
            (col("league") == league) & (col("season") == season)
        )
        return filtered.orderBy("prediction", ascending=False).toPandas()

    def _normalize_predictions_df(self):
        """
        Renames the model output column of the predictions
        dataframe to the shorter "prediction"
        """
        self.predictions = self.predictions.withColumnRenamed(
            "attempt_4_regression_linear_regression_predictions",
            "prediction",
        )