Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/site_info.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"MHD": {
"name": "Mace Head, Ireland",
"coords": [53.3267, -9.9046],
"training_period": [2018, 2018],
"training_period": [2016, 2018],
"validation_period": [2019, 2019],
"testing_period": [2020, 2023]
},
Expand Down
21 changes: 11 additions & 10 deletions ml_baselines/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,13 +333,15 @@ def preprocess_all_features_arco_era5(force=False):
def open_features(site,
start_year=1978,
end_year=2024,
time_shift_hours=[6],
features_dir=""):
"""Opens the preprocessed features for a given site.

Args:
site (str): Site code.
start_year (int): Start year to retrieve data (inclusive).
end_year (int): End year to retrieve data (inclusive).
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.
features_dir (str): Directory where the features files are located if not in location specified in config.
Mainly used for testing purposes. If empty, uses default path.

Expand Down Expand Up @@ -396,19 +398,18 @@ def open_features(site,

df = pd.concat(dfs, axis=0)

# Create a shifted copy: subtract 6 hours from time by equivalently shifting the index forward by 6 hours.
# For each record at time T, the _past columns will come from time T – 6 hours.
df_past = df.copy()
df_past.index = df_past.index + pd.Timedelta(hours=6)
df_past = df_past.add_suffix("_6h")

# Merge the current and past dataframes on their time index
df_final = pd.merge(df, df_past, left_index=True, right_index=True, how="left")
# Create time-shifted features, if specified
if time_shift_hours:
shifted_dfs = [
df.shift(shift, freq="h").add_suffix(f"_{shift}h")
for shift in time_shift_hours
]
df = df.join(shifted_dfs, how="left")

# Add hour of day column
df_final["hour_of_day"] = df_final.index.hour
df["hour_of_day"] = df.index.hour

return df_final
return df


if __name__ == "__main__":
Expand Down
60 changes: 46 additions & 14 deletions ml_baselines/modelling/train.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
import joblib
import numpy as np
import xarray as xr
import pandas as pd
from pathlib import Path
import pickle

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit
Expand All @@ -21,6 +17,7 @@
def get_train_test_data(site, test_train,
balance=-1,
undersample=0,
time_shift_hours=[6],
return_dataframe=False,
balance_method="random"):
""" Get the training, testing or validation data for a given site.
Expand All @@ -32,6 +29,7 @@ def get_train_test_data(site, test_train,
NOTE: This is only applied to training data (ignored for test or validation).
undersample (float or bool): If a float between 0 and 1, randomly undersample the dataset to this fraction.
NOTE: This is only applied to training data (ignored for test or validation).
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.
return_dataframe (bool): If True, return the data as a DataFrame. If False, return the features and target separately.
balance_method (str): The method to use for balancing the dataset. Must be one of 'random' or 'deterministic'.

Expand All @@ -57,7 +55,8 @@ def get_train_test_data(site, test_train,

df_features = open_features(site,
start_year = start_year,
end_year = end_year)
end_year = end_year,
time_shift_hours = time_shift_hours)
df_intem = read_intem(site,
start_year = start_year,
end_year = end_year)
Expand Down Expand Up @@ -192,12 +191,26 @@ def train_mlp(site,
balance=0.5,
balance_method="random",
undersample=0,
time_shift_hours=[6],
mlp_params=None,):
""" Train a MLP model for a given site.

Args:
site (str): The site for which to train the model.
balance (float): The target ratio of baseline to non-baseline
values in the training data. Must be between 0 and 1. Only applied to training data.
balance_method (str): The method to use for balancing the dataset. Must be one of 'random' or 'deterministic'. Only applied to training data.
undersample (float): If a float between 0 and 1, randomly undersample the training dataset to this fraction. Only applied to training data.
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.
mlp_params (dict): A dictionary of hyperparameters to pass to the MLPClassifier. If None, default parameters will be used.
Returns:
MLPClassifier: The trained MLP model.
"""

# Get the training data
print(f"Training MLP model for site: {site}")
X, y = get_train_test_data(site, "train", balance=balance, balance_method=balance_method,
undersample=undersample)
time_shift_hours=time_shift_hours, undersample=undersample)

print(f"Number of training points: {len(y)}")
print(f"... number of baseline points: {sum(y == 1)} ({sum(y == 1) / len(y):.1%})")
Expand All @@ -210,42 +223,56 @@ def train_mlp(site,

# Validation
X_val, y_val = get_train_test_data(site, "validation",
balance=-1,
undersample=0)
balance=balance,
balance_method=balance_method,
time_shift_hours=time_shift_hours,
undersample=undersample)

# Testing
#TODO: Testing on unbalanced data?
X_test, y_test = get_train_test_data(site, "test",
balance=-1,
undersample=0)
time_shift_hours=time_shift_hours,
)

print("... predicting")
y_pred_val = nn_model.predict(X_val)
y_pred_train = nn_model.predict(X)
y_pred_test = nn_model.predict(X_test)

# calculating scores
precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y, y_pred_train)
precision_test = precision_score(y_test, y_pred_test)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y, y_pred_train)
recall_test = recall_score(y_test, y_pred_test)

f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Precision on Test Set = {precision_test:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"Recall on Test Set = {recall_test:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")
print(f"F1 Score on Test Set = {f1_test:.3f}")

return nn_model, X, y


def train_mlp_grid_search(site, param_grid=None):
def train_mlp_grid_search(site,
param_grid=None,
time_shift_hours=[6]):
""" Train a MLP model using grid search for hyperparameter tuning.

Args:
site (str): The site for which to train the model.
param_grid (dict, optional): A dictionary containing the hyperparameters to tune. If None, default values are used.
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.

Returns:
GridSearchCV: The trained model with the best hyperparameters.
Expand All @@ -271,15 +298,19 @@ def train_mlp_grid_search(site, param_grid=None):
# 'early_stopping': [True, False]
}

X_val, y_val = get_train_test_data(site, "validation")
X_val, y_val = get_train_test_data(site, "validation", time_shift_hours=time_shift_hours)

best_params = []
best_scores = []
best_balances = []

for balance in np.arange(0.2, 0.9, 0.1):

#TODO: Could also do undersampling instead of balancing, or both?
# and could also do different balance methods (random vs deterministic)
X_train, y_train = get_train_test_data(site, "train", balance=balance,
balance_method="deterministic")
balance_method="deterministic",
time_shift_hours=time_shift_hours)

# Combine your training and validation sets
X_all = pd.concat([X_train, X_val])
Expand Down Expand Up @@ -319,7 +350,8 @@ def train_mlp_grid_search(site, param_grid=None):

# Train final model with the best balance and parameters
X_train_final, y_train_final = get_train_test_data(site, "train", balance=best_balance,
balance_method="deterministic")
balance_method="deterministic",
time_shift_hours=time_shift_hours)

best_model = MLPClassifier(random_state=42, **best_best_params)
best_model.fit(X_train_final, y_train_final)
Expand Down
40 changes: 22 additions & 18 deletions notebooks/training.ipynb

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,29 @@ def test_open_features():
end_year=2020,
features_dir=temp_dir)

df_6_24 = open_features("MHD",
start_year=2020,
end_year=2020,
time_shift_hours=[6, 24],
features_dir=temp_dir)

columns = []
for key in cfg.met_variables.keys():
for i in range(17):
columns.append(f"{key}_{i}")

# Check that the hour_of_day column is correctly computed
hour_of_day = df.index.hour.values
assert np.all(df["hour_of_day"].values == hour_of_day), "hour_of_day column is not correctly computed."

### TEST LAGS
columns_6h = []
columns_24h = []
for column in columns:
columns_6h.append(f"{column}_6h")
columns_24h.append(f"{column}_24h")

### Firstly, just test the 6-hour lagged features

expected_columns = columns + columns_6h

Expand All @@ -98,3 +113,21 @@ def test_open_features():
# Check that the hour_of_day column is correctly computed
hour_of_day = df.index.hour.values
assert np.all(df["hour_of_day"].values == hour_of_day), "hour_of_day column is not correctly computed."

### Now test the 6 and 24-hour lagged features

expected_columns = columns + columns_6h + columns_24h

# Check that all expected met. columns are present
assert all([col in df_6_24.columns for col in expected_columns]), "Not all expected columns are present in the opened features DataFrame."

# Check that all _6h and _24h columns are indeed lagged 6 and 24 hours, respectively, behind their original columns
now_columns = [col for col in expected_columns if not col.endswith("_6h") and not col.endswith("_24h")]
lagged_columns_6h = [col for col in expected_columns if col.endswith("_6h")]
lagged_columns_24h = [col for col in expected_columns if col.endswith("_24h")]

future_rows_6h = df_6_24[df_6_24.index >= df_6_24.index[0] + pd.Timedelta(hours=6)]
future_rows_24h = df_6_24[df_6_24.index >= df_6_24.index[0] + pd.Timedelta(hours=24)]

assert np.all(df_6_24[now_columns].iloc[:len(future_rows_6h)].values == future_rows_6h[lagged_columns_6h].values), "Lagged columns do not match the expected 6-hour lag."
assert np.all(df_6_24[now_columns].iloc[:len(future_rows_24h)].values == future_rows_24h[lagged_columns_24h].values), "Lagged columns do not match the expected 24-hour lag."