Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/site_info.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"MHD": {
"name": "Mace Head, Ireland",
"coords": [53.3267, -9.9046],
"training_period": [2018, 2018],
"training_period": [2016, 2018],
"validation_period": [2019, 2019],
"testing_period": [2020, 2023]
},
Expand Down
21 changes: 11 additions & 10 deletions ml_baselines/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,13 +333,15 @@ def preprocess_all_features_arco_era5(force=False):
def open_features(site,
start_year=1978,
end_year=2024,
time_shift_hours=[6],
features_dir=""):
"""Opens the preprocessed features for a given site.

Args:
site (str): Site code.
start_year (int): Start year to retrieve data (inclusive).
end_year (int): End year to retrieve data (inclusive).
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.
features_dir (str): Directory where the features files are located if not in location specified in config.
Mainly used for testing purposes. If empty, uses default path.

Expand Down Expand Up @@ -396,19 +398,18 @@ def open_features(site,

df = pd.concat(dfs, axis=0)

# Create a shifted copy: subtract 6 hours from time by equivalently shifting the index forward by 6 hours.
# For each record at time T, the _past columns will come from time T – 6 hours.
df_past = df.copy()
df_past.index = df_past.index + pd.Timedelta(hours=6)
df_past = df_past.add_suffix("_6h")

# Merge the current and past dataframes on their time index
df_final = pd.merge(df, df_past, left_index=True, right_index=True, how="left")
# Create time-shifted features, if specified
if time_shift_hours:
shifted_dfs = [
df.shift(shift, freq="h").add_suffix(f"_{shift}h")
for shift in time_shift_hours
]
df = df.join(shifted_dfs, how="left")

# Add hour of day column
df_final["hour_of_day"] = df_final.index.hour
df["hour_of_day"] = df.index.hour

return df_final
return df


if __name__ == "__main__":
Expand Down
60 changes: 46 additions & 14 deletions ml_baselines/modelling/train.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
import joblib
import numpy as np
import xarray as xr
import pandas as pd
from pathlib import Path
import pickle

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit
Expand All @@ -21,6 +17,7 @@
def get_train_test_data(site, test_train,
balance=-1,
undersample=0,
time_shift_hours=[6],
return_dataframe=False,
balance_method="random"):
""" Get the training, testing or validation data for a given site.
Expand All @@ -32,6 +29,7 @@ def get_train_test_data(site, test_train,
NOTE: This is only applied to training data (ignored for test or validation).
undersample (float or bool): If a float between 0 and 1, randomly undersample the dataset to this fraction.
NOTE: This is only applied to training data (ignored for test or validation).
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.
return_dataframe (bool): If True, return the data as a DataFrame. If False, return the features and target separately.
balance_method (str): The method to use for balancing the dataset. Must be one of 'random' or 'deterministic'.

Expand All @@ -57,7 +55,8 @@ def get_train_test_data(site, test_train,

df_features = open_features(site,
start_year = start_year,
end_year = end_year)
end_year = end_year,
time_shift_hours = time_shift_hours)
df_intem = read_intem(site,
start_year = start_year,
end_year = end_year)
Expand Down Expand Up @@ -192,12 +191,26 @@ def train_mlp(site,
balance=0.5,
balance_method="random",
undersample=0,
time_shift_hours=[6],
mlp_params=None,):
""" Train a MLP model for a given site.

Args:
site (str): The site for which to train the model.
balance (float): The target ratio of baseline to non-baseline
values in the training data. Must be between 0 and 1. Only applied to training data.
balance_method (str): The method to use for balancing the dataset. Must be one of 'random' or 'deterministic'. Only applied to training data.
undersample (float): If a float between 0 and 1, randomly undersample the training dataset to this fraction. Only applied to training data.
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.
mlp_params (dict): A dictionary of hyperparameters to pass to the MLPClassifier. If None, default parameters will be used.
Returns:
MLPClassifier: The trained MLP model.
"""

# Get the training data
print(f"Training MLP model for site: {site}")
X, y = get_train_test_data(site, "train", balance=balance, balance_method=balance_method,
undersample=undersample)
time_shift_hours=time_shift_hours, undersample=undersample)

print(f"Number of training points: {len(y)}")
print(f"... number of baseline points: {sum(y == 1)} ({sum(y == 1) / len(y):.1%})")
Expand All @@ -210,42 +223,56 @@ def train_mlp(site,

# Validation
X_val, y_val = get_train_test_data(site, "validation",
balance=-1,
undersample=0)
balance=balance,
balance_method=balance_method,
time_shift_hours=time_shift_hours,
undersample=undersample)

# Testing
#TODO: Testing on unbalanced data?
X_test, y_test = get_train_test_data(site, "test",
balance=-1,
undersample=0)
time_shift_hours=time_shift_hours,
)

print("... predicting")
y_pred_val = nn_model.predict(X_val)
y_pred_train = nn_model.predict(X)
y_pred_test = nn_model.predict(X_test)

# calculating scores
precision_val = precision_score(y_val, y_pred_val)
precision_train = precision_score(y, y_pred_train)
precision_test = precision_score(y_test, y_pred_test)
recall_val = recall_score(y_val, y_pred_val)
recall_train = recall_score(y, y_pred_train)
recall_test = recall_score(y_test, y_pred_test)

f1_val = f1_score(y_val, y_pred_val)
f1_train = f1_score(y, y_pred_train)
f1_test = f1_score(y_test, y_pred_test)

print(f"Precision on Training Set = {precision_train:.3f}")
print(f"Precision on Validation Set = {precision_val:.3f}")
print(f"Precision on Test Set = {precision_test:.3f}")
print(f"Recall on Training Set = {recall_train:.3f}")
print(f"Recall on Validation Set = {recall_val:.3f}")
print(f"Recall on Test Set = {recall_test:.3f}")
print(f"F1 Score on Training Set = {f1_train:.3f}")
print(f"F1 Score on Validation Set = {f1_val:.3f}")
print(f"F1 Score on Test Set = {f1_test:.3f}")

return nn_model, X, y


def train_mlp_grid_search(site, param_grid=None):
def train_mlp_grid_search(site,
param_grid=None,
time_shift_hours=[6]):
""" Train a MLP model using grid search for hyperparameter tuning.

Args:
site (str): The site for which to train the model.
param_grid (dict, optional): A dictionary containing the hyperparameters to tune. If None, default values are used.
time_shift_hours (list of int): List of time shifts in hours to create lagged features for. For example, [6, 24] will create features shifted by 6 and 24 hours.

Returns:
GridSearchCV: The trained model with the best hyperparameters.
Expand All @@ -271,15 +298,19 @@ def train_mlp_grid_search(site, param_grid=None):
# 'early_stopping': [True, False]
}

X_val, y_val = get_train_test_data(site, "validation")
X_val, y_val = get_train_test_data(site, "validation", time_shift_hours=time_shift_hours)

best_params = []
best_scores = []
best_balances = []

for balance in np.arange(0.2, 0.9, 0.1):

#TODO: Could also do undersampling instead of balancing, or both?
# and could also do different balance methods (random vs deterministic)
X_train, y_train = get_train_test_data(site, "train", balance=balance,
balance_method="deterministic")
balance_method="deterministic",
time_shift_hours=time_shift_hours)

# Combine your training and validation sets
X_all = pd.concat([X_train, X_val])
Expand Down Expand Up @@ -319,7 +350,8 @@ def train_mlp_grid_search(site, param_grid=None):

# Train final model with the best balance and parameters
X_train_final, y_train_final = get_train_test_data(site, "train", balance=best_balance,
balance_method="deterministic")
balance_method="deterministic",
time_shift_hours=time_shift_hours)

best_model = MLPClassifier(random_state=42, **best_best_params)
best_model.fit(X_train_final, y_train_final)
Expand Down
40 changes: 22 additions & 18 deletions notebooks/training.ipynb

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,29 @@ def test_open_features():
end_year=2020,
features_dir=temp_dir)

df_6_24 = open_features("MHD",
start_year=2020,
end_year=2020,
time_shift_hours=[6, 24],
features_dir=temp_dir)

columns = []
for key in cfg.met_variables.keys():
for i in range(17):
columns.append(f"{key}_{i}")

# Check that the hour_of_day column is correctly computed
hour_of_day = df.index.hour.values
assert np.all(df["hour_of_day"].values == hour_of_day), "hour_of_day column is not correctly computed."

### TEST LAGS
columns_6h = []
columns_24h = []
for column in columns:
columns_6h.append(f"{column}_6h")
columns_24h.append(f"{column}_24h")

### Firstly, just test the 6-hour lagged features

expected_columns = columns + columns_6h

Expand All @@ -98,3 +113,21 @@ def test_open_features():
# Check that the hour_of_day column is correctly computed
hour_of_day = df.index.hour.values
assert np.all(df["hour_of_day"].values == hour_of_day), "hour_of_day column is not correctly computed."

### Now test the 6 and 24-hour lagged features

expected_columns = columns + columns_6h + columns_24h

# Check that all expected met. columns are present
assert all([col in df_6_24.columns for col in expected_columns]), "Not all expected columns are present in the opened features DataFrame."

# Check that all _6h and _24h columns are indeed lagged 6 and 24 hours, respectively, behind their original columns
now_columns = [col for col in expected_columns if not col.endswith("_6h") and not col.endswith("_24h")]
lagged_columns_6h = [col for col in expected_columns if col.endswith("_6h")]
lagged_columns_24h = [col for col in expected_columns if col.endswith("_24h")]

future_rows_6h = df_6_24[df_6_24.index >= df_6_24.index[0] + pd.Timedelta(hours=6)]
future_rows_24h = df_6_24[df_6_24.index >= df_6_24.index[0] + pd.Timedelta(hours=24)]

assert np.all(df_6_24[now_columns].iloc[:len(future_rows_6h)].values == future_rows_6h[lagged_columns_6h].values), "Lagged columns do not match the expected 6-hour lag."
assert np.all(df_6_24[now_columns].iloc[:len(future_rows_24h)].values == future_rows_24h[lagged_columns_24h].values), "Lagged columns do not match the expected 24-hour lag."