From ed8ba0fa61ee0623cde0365b8e2c4eeee9c98474 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 30 Jan 2026 08:14:10 -0800 Subject: [PATCH 1/4] Separate normalization and split into exp/sim data --- ml/train_model.py | 99 ++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/ml/train_model.py b/ml/train_model.py index 0970a9d5..f6eb3ec5 100644 --- a/ml/train_model.py +++ b/ml/train_model.py @@ -106,24 +106,7 @@ def normalize(df, input_names, input_transform, output_names, output_transform): norm_df = df.copy() norm_df[input_names] = input_transform(torch.tensor(df[input_names].values)) norm_df[output_names] = output_transform(torch.tensor(df[output_names].values)) - - norm_exp_inputs = torch.tensor( - norm_df[norm_df.experiment_flag == 1][input_names].values, - dtype=torch.float, - ) - norm_exp_outputs = torch.tensor( - norm_df[norm_df.experiment_flag == 1][output_names].values, - dtype=torch.float, - ) - norm_sim_inputs = torch.tensor( - norm_df[norm_df.experiment_flag == 0][input_names].values, - dtype=torch.float, - ) - norm_sim_outputs = torch.tensor( - norm_df[norm_df.experiment_flag == 0][output_names].values, - dtype=torch.float, - ) - return norm_df, norm_exp_inputs, norm_exp_outputs, norm_sim_inputs, norm_sim_outputs + return norm_df def split_data(df_exp, df_sim, variables, model_type): @@ -162,18 +145,48 @@ def build_transforms(n_inputs, X_train, n_outputs, y_train): def train_nn_ensemble( model_type, - n_inputs, - n_outputs, - sim_X_train, - sim_y_train, - exp_X_train, - exp_y_train, - sim_X_val, - sim_y_val, - exp_X_val, - exp_y_val, + norm_df_train, + norm_df_val, + input_names, + output_names, device, ): + n_inputs = len(input_names) + n_outputs = len(output_names) + + exp_X_train = torch.tensor( + norm_df_train[norm_df_train.experiment_flag == 1][input_names].values, + dtype=torch.float, + ) + exp_y_train = torch.tensor( + norm_df_train[norm_df_train.experiment_flag == 1][output_names].values, + dtype=torch.float, + ) + sim_X_train = torch.tensor( + norm_df_train[norm_df_train.experiment_flag == 0][input_names].values, + dtype=torch.float, + ) + sim_y_train = torch.tensor( + norm_df_train[norm_df_train.experiment_flag == 0][output_names].values, + dtype=torch.float, + ) + exp_X_val = torch.tensor( + norm_df_val[norm_df_val.experiment_flag == 1][input_names].values, + dtype=torch.float, + ) + exp_y_val = torch.tensor( + norm_df_val[norm_df_val.experiment_flag == 1][output_names].values, + dtype=torch.float, + ) + sim_X_val = torch.tensor( + norm_df_val[norm_df_val.experiment_flag == 0][input_names].values, + dtype=torch.float, + ) + sim_y_val = torch.tensor( + norm_df_val[norm_df_val.experiment_flag == 0][output_names].values, + dtype=torch.float, + ) + if model_type == "NN": num_models = 1 elif model_type == "ensemble_NN": @@ -456,13 +469,7 @@ def write_model(model, model_type, experiment, db): input_transform, output_transform = build_transforms( len(input_names), X_train, len(output_names), y_train ) - ( - norm_df_train, - norm_expt_inputs_train, - norm_expt_outputs_train, - norm_sim_inputs_train, - norm_sim_outputs_train, - ) = normalize( + norm_df_train = normalize( df_train, input_names, input_transform, output_names, output_transform ) @@ -471,29 +478,17 @@ def write_model(model, model_type, experiment, db): # Neural Net and Ensemble Creation and training ###################################################### if model_type != "GP": - ( - norm_df_val, - norm_expt_inputs_val, - norm_expt_outputs_val, - norm_sim_inputs_val, - norm_sim_outputs_val, - ) = normalize( + norm_df_val = normalize( df_val, input_names, input_transform, output_names, output_transform ) print("training started") NN_start_time = time.time() ensemble = train_nn_ensemble( model_type, - len(input_names), - len(output_names), - norm_sim_inputs_train.to(device), - norm_sim_outputs_train.to(device), - norm_expt_inputs_train.to(device), - norm_expt_outputs_train.to(device), - norm_sim_inputs_val.to(device), - norm_sim_outputs_val.to(device), - norm_expt_inputs_val.to(device), - norm_expt_outputs_val.to(device), + norm_df_train, + norm_df_val, + input_names, + output_names, device, ) print("training ended") From 4771c0cb7963e30428fed3622a8a887fc6deb322 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 30 Jan 2026 08:16:08 -0800 Subject: [PATCH 2/4] Fix for GPU --- ml/train_model.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ml/train_model.py b/ml/train_model.py index f6eb3ec5..429c662b 100644 --- a/ml/train_model.py +++ b/ml/train_model.py @@ -157,35 +157,35 @@ def train_nn_ensemble( exp_X_train = torch.tensor( norm_df_train[norm_df_train.experiment_flag == 1][input_names].values, dtype=torch.float, - ) + ).to(device) exp_y_train = torch.tensor( norm_df_train[norm_df_train.experiment_flag == 1][output_names].values, dtype=torch.float, - ) + ).to(device) sim_X_train = torch.tensor( norm_df_train[norm_df_train.experiment_flag == 0][input_names].values, dtype=torch.float, - ) + ).to(device) sim_y_train = torch.tensor( norm_df_train[norm_df_train.experiment_flag == 0][output_names].values, dtype=torch.float, - ) + ).to(device) exp_X_val = torch.tensor( norm_df_val[norm_df_val.experiment_flag == 1][input_names].values, dtype=torch.float, - ) + ).to(device) exp_y_val = torch.tensor( norm_df_val[norm_df_val.experiment_flag == 1][output_names].values, dtype=torch.float, - ) + ).to(device) sim_X_val = torch.tensor( norm_df_val[norm_df_val.experiment_flag == 0][input_names].values, dtype=torch.float, - ) + ).to(device) sim_y_val = torch.tensor( norm_df_val[norm_df_val.experiment_flag == 0][output_names].values, dtype=torch.float, - ) + ).to(device) if model_type == "NN": num_models = 1 From 2441744d544a5566668a049e073ffc7b31bbe5ff Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 30 Jan 2026 08:36:20 -0800 Subject: [PATCH 3/4] Ensure that each model in an ensemble is trained with a different split --- ml/train_model.py | 136 ++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 72 deletions(-) diff --git a/ml/train_model.py b/ml/train_model.py index 429c662b..ef059a8d 100644 --- a/ml/train_model.py +++ b/ml/train_model.py @@ -110,26 +110,11 @@ def normalize(df, input_names, input_transform, output_names, output_transform): def split_data(df_exp, df_sim, variables, model_type): - if model_type == "GP": - if len(df_exp) > 0: - return (pd.concat((df_exp[variables], df_sim[variables])), None) - else: - return df_sim[variables] + # Concatenate experimental and simulation data into a single dataframe + if len(df_exp) > 0: + return pd.concat((df_exp[variables], df_sim[variables])) else: - # Split exp and sim data into training and validation data with 80:20 ratio, selected randomly - sim_train_df, sim_val_df = train_test_split( - df_sim, test_size=0.2, random_state=None, shuffle=True - ) # random_state will ensure the seed is different everytime, data will be shuffled randomly before splitting - if len(df_exp) > 0: - exp_train_df, exp_val_df = train_test_split( - df_exp, test_size=0.2, random_state=None, shuffle=True - ) # 20% of the data will go in validation test, no fixing the - return ( - pd.concat((exp_train_df[variables], sim_train_df[variables])), - pd.concat((exp_val_df[variables], sim_val_df[variables])), - ) - else: - return (sim_train_df[variables], sim_val_df[variables]) + return df_sim[variables] def build_transforms(n_inputs, X_train, n_outputs, y_train): @@ -145,8 +130,7 @@ def build_transforms(n_inputs, X_train, n_outputs, y_train): def train_nn_ensemble( model_type, - norm_df_train, - norm_df_val, + norm_df, input_names, output_names, device, @@ -154,39 +138,6 @@ def train_nn_ensemble( n_inputs = len(input_names) n_outputs = len(output_names) - exp_X_train = torch.tensor( - norm_df_train[norm_df_train.experiment_flag == 1][input_names].values, - dtype=torch.float, - ).to(device) - exp_y_train = torch.tensor( - norm_df_train[norm_df_train.experiment_flag == 1][output_names].values, - dtype=torch.float, - ).to(device) - sim_X_train = torch.tensor( - norm_df_train[norm_df_train.experiment_flag == 0][input_names].values, - dtype=torch.float, - ).to(device) - sim_y_train = torch.tensor( - norm_df_train[norm_df_train.experiment_flag == 0][output_names].values, - dtype=torch.float, - ).to(device) - exp_X_val = torch.tensor( - norm_df_val[norm_df_val.experiment_flag == 1][input_names].values, - dtype=torch.float, - ).to(device) - exp_y_val = torch.tensor( - norm_df_val[norm_df_val.experiment_flag == 1][output_names].values, - dtype=torch.float, - ).to(device) - sim_X_val = torch.tensor( - norm_df_val[norm_df_val.experiment_flag == 0][input_names].values, - dtype=torch.float, - ).to(device) - sim_y_val = torch.tensor( - norm_df_val[norm_df_val.experiment_flag == 0][output_names].values, - dtype=torch.float, - ).to(device) - if model_type == "NN": num_models = 1 elif model_type == "ensemble_NN": @@ -194,6 +145,51 @@ def train_nn_ensemble( ensemble = [] for i in range(num_models): + # Split data into train and validation for this model + # First split by experiment_flag, then split each into train/val + df_exp = norm_df[norm_df.experiment_flag == 1] + df_sim = norm_df[norm_df.experiment_flag == 0] + + # Split experimental data into train and validation + if len(df_exp) > 0: + exp_train_df, exp_val_df = train_test_split( + df_exp, test_size=0.2, random_state=None, shuffle=True + ) + else: + exp_train_df = pd.DataFrame(columns=norm_df.columns) + exp_val_df = pd.DataFrame(columns=norm_df.columns) + + # Split simulation data into train and validation + sim_train_df, sim_val_df = train_test_split( + df_sim, test_size=0.2, random_state=None, shuffle=True + ) + + # Extract tensors for training and validation + exp_X_train = torch.tensor( + exp_train_df[input_names].values, dtype=torch.float + ).to(device) if len(exp_train_df) > 0 else torch.empty((0, n_inputs), dtype=torch.float).to(device) + exp_y_train = torch.tensor( + exp_train_df[output_names].values, dtype=torch.float + ).to(device) if len(exp_train_df) > 0 else torch.empty((0, n_outputs), dtype=torch.float).to(device) + sim_X_train = torch.tensor( + sim_train_df[input_names].values, dtype=torch.float + ).to(device) + sim_y_train = torch.tensor( + sim_train_df[output_names].values, dtype=torch.float + ).to(device) + exp_X_val = torch.tensor( + exp_val_df[input_names].values, dtype=torch.float + ).to(device) if len(exp_val_df) > 0 else torch.empty((0, n_inputs), dtype=torch.float).to(device) + exp_y_val = torch.tensor( + exp_val_df[output_names].values, dtype=torch.float + ).to(device) if len(exp_val_df) > 0 else torch.empty((0, n_outputs), dtype=torch.float).to(device) + sim_X_val = torch.tensor( + sim_val_df[input_names].values, dtype=torch.float + ).to(device) + sim_y_val = torch.tensor( + sim_val_df[output_names].values, dtype=torch.float + ).to(device) + model = CombinedNN(n_inputs, n_outputs, learning_rate=0.0001) model.to(device) # moving to GPU NNmodel_start_time = time.time() @@ -273,7 +269,7 @@ def build_torch_model_from_nn( def train_gp( - norm_df_train, input_names, output_names, input_transform, output_transform, device + norm_df, input_names, output_names, input_transform, output_transform, device ): # Create separate GP models for each output to handle NaN values in the training data gp_models = [] @@ -282,14 +278,14 @@ def train_gp( print(f"Processing output {i + 1}/{len(output_names)}: {output_name}") # Get data where this output is not NaN - output_data = norm_df_train[output_name].values + output_data = norm_df[output_name].values valid_mask = torch.logical_not(torch.isnan(torch.tensor(output_data))) n_valid = torch.sum(valid_mask).item() print(f"Output {output_name}: {n_valid}/{len(output_data)} valid data points") # Prepare input and output data for this output X_valid = torch.tensor( - norm_df_train[input_names].values[valid_mask], dtype=torch.float64 + norm_df[input_names].values[valid_mask], dtype=torch.float64 ) y_valid = torch.tensor(output_data[valid_mask], dtype=torch.float64).unsqueeze( -1 @@ -301,7 +297,7 @@ def train_gp( ): # len(df_exp) > 0: # Temporarily deactivate MultiTaskGP for simplicity # MultiTaskGP for experimental vs simulation data exp_flag_valid = torch.tensor( - norm_df_train[["experiment_flag"]].values[valid_mask], + norm_df[["experiment_flag"]].values[valid_mask], dtype=torch.float64, ) X_with_task = torch.cat([exp_flag_valid, X_valid], dim=1) @@ -459,18 +455,18 @@ def write_model(model, model_type, experiment, db): exp_name = value["depends_on"] df_sim[exp_name] = df_sim[sim_name] / value["alpha_guess"] + value["beta_guess"] - # Concatenate experimental and simulation data for training and validation + # Concatenate experimental and simulation data variables = input_names + output_names + ["experiment_flag"] - df_train, df_val = split_data(df_exp, df_sim, variables, model_type) + df = split_data(df_exp, df_sim, variables, model_type) - # Apply normalization to the training data - X_train = torch.tensor(df_train[input_names].values, dtype=torch.float) - y_train = torch.tensor(df_train[output_names].values, dtype=torch.float) + # Apply normalization to the data + X = torch.tensor(df[input_names].values, dtype=torch.float) + y = torch.tensor(df[output_names].values, dtype=torch.float) input_transform, output_transform = build_transforms( - len(input_names), X_train, len(output_names), y_train + len(input_names), X, len(output_names), y ) - norm_df_train = normalize( - df_train, input_names, input_transform, output_names, output_transform + norm_df = normalize( + df, input_names, input_transform, output_names, output_transform ) model = None @@ -478,15 +474,11 @@ def write_model(model, model_type, experiment, db): # Neural Net and Ensemble Creation and training ###################################################### if model_type != "GP": - norm_df_val = normalize( - df_val, input_names, input_transform, output_names, output_transform - ) print("training started") NN_start_time = time.time() ensemble = train_nn_ensemble( model_type, - norm_df_train, - norm_df_val, + norm_df, input_names, output_names, device, @@ -516,7 +508,7 @@ def write_model(model, model_type, experiment, db): ############################################################### else: model = train_gp( - norm_df_train, + norm_df, input_names, output_names, input_transform, From a62c039dffd0a2b1e851d9161614a4cfbedddf86 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 30 Jan 2026 16:37:55 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ml/train_model.py | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/ml/train_model.py b/ml/train_model.py index ef059a8d..edf3a580 100644 --- a/ml/train_model.py +++ b/ml/train_model.py @@ -165,30 +165,40 @@ def train_nn_ensemble( ) # Extract tensors for training and validation - exp_X_train = torch.tensor( - exp_train_df[input_names].values, dtype=torch.float - ).to(device) if len(exp_train_df) > 0 else torch.empty((0, n_inputs), dtype=torch.float).to(device) - exp_y_train = torch.tensor( - exp_train_df[output_names].values, dtype=torch.float - ).to(device) if len(exp_train_df) > 0 else torch.empty((0, n_outputs), dtype=torch.float).to(device) + exp_X_train = ( + torch.tensor(exp_train_df[input_names].values, dtype=torch.float).to(device) + if len(exp_train_df) > 0 + else torch.empty((0, n_inputs), dtype=torch.float).to(device) + ) + exp_y_train = ( + torch.tensor(exp_train_df[output_names].values, dtype=torch.float).to( + device + ) + if len(exp_train_df) > 0 + else torch.empty((0, n_outputs), dtype=torch.float).to(device) + ) sim_X_train = torch.tensor( sim_train_df[input_names].values, dtype=torch.float ).to(device) sim_y_train = torch.tensor( sim_train_df[output_names].values, dtype=torch.float ).to(device) - exp_X_val = torch.tensor( - exp_val_df[input_names].values, dtype=torch.float - ).to(device) if len(exp_val_df) > 0 else torch.empty((0, n_inputs), dtype=torch.float).to(device) - exp_y_val = torch.tensor( - exp_val_df[output_names].values, dtype=torch.float - ).to(device) if len(exp_val_df) > 0 else torch.empty((0, n_outputs), dtype=torch.float).to(device) - sim_X_val = torch.tensor( - sim_val_df[input_names].values, dtype=torch.float - ).to(device) - sim_y_val = torch.tensor( - sim_val_df[output_names].values, dtype=torch.float - ).to(device) + exp_X_val = ( + torch.tensor(exp_val_df[input_names].values, dtype=torch.float).to(device) + if len(exp_val_df) > 0 + else torch.empty((0, n_inputs), dtype=torch.float).to(device) + ) + exp_y_val = ( + torch.tensor(exp_val_df[output_names].values, dtype=torch.float).to(device) + if len(exp_val_df) > 0 + else torch.empty((0, n_outputs), dtype=torch.float).to(device) + ) + sim_X_val = torch.tensor(sim_val_df[input_names].values, dtype=torch.float).to( + device + ) + sim_y_val = torch.tensor(sim_val_df[output_names].values, dtype=torch.float).to( + device + ) model = CombinedNN(n_inputs, n_outputs, learning_rate=0.0001) model.to(device) # moving to GPU