diff --git a/autoxai4omics/models/tabauto/keras_model.py b/autoxai4omics/models/tabauto/keras_model.py index c978fdc..bee6747 100755 --- a/autoxai4omics/models/tabauto/keras_model.py +++ b/autoxai4omics/models/tabauto/keras_model.py @@ -18,6 +18,8 @@ from tensorflow.keras.models import load_model from tensorflow.keras.layers import Dense, Flatten, Conv1D from tensorflow.keras import losses +import numpy as np +import pandas as pd import tensorflow.keras.optimizers.legacy @@ -374,6 +376,11 @@ def fit_data_fx(self, trainX, trainY, testX, testY, input_list=None): self.model.feature_importances_ = feature_importances def fit_data(self, trainX, trainY, testX=None, testY=None, input_list=None): + # Ensure numpy arrays + trainX = trainX.to_numpy() if hasattr(trainX, "to_numpy") else trainX + testX = testX.to_numpy() if hasattr(testX, "to_numpy") else testX + trainY = trainY.to_numpy() if hasattr(trainY, "to_numpy") else trainY + testY = testY.to_numpy() if hasattr(testY, "to_numpy") else testY if self.method == "train_dnn_keras": return self.fit_data_fx(trainX, trainY, testX, testY, input_list) elif self.method == "train_dnn_autokeras": @@ -381,6 +388,7 @@ def fit_data(self, trainX, trainY, testX=None, testY=None, input_list=None): def predict(self, x): print("predicting values ...") + x = x.to_numpy() if isinstance(x, pd.DataFrame) else x if self.conv1d: x = x.reshape((x.shape[0], x.shape[1], 1)) @@ -397,6 +405,7 @@ def predict(self, x): def predict_proba(self, x): print("predicting probs ...") + x = x.to_numpy() if isinstance(x, pd.DataFrame) else x if self.conv1d: x = x.reshape((x.shape[0], x.shape[1], 1)) diff --git a/autoxai4omics/models/tabauto/lgbm_model.py b/autoxai4omics/models/tabauto/lgbm_model.py index 06e4dda..dcc6c04 100755 --- a/autoxai4omics/models/tabauto/lgbm_model.py +++ b/autoxai4omics/models/tabauto/lgbm_model.py @@ -20,7 +20,7 @@ import lightgbm as lgb_core import numpy as np import optuna - +import pandas as pd def to_matrix(data, n): return [data[i : i + n] for i in range(0, len(data), n)] @@ -74,11 +74,26 @@ def __call__(self, trial): scores = [] for train_index, test_index in kf.split(train_x): lgb_model = lgb_core.LGBMClassifier(**param) - lgb_model.fit(train_x[train_index], train_y[train_index]) - predictions = lgb_model.predict(train_x[test_index]) + + # Handle DataFrame vs ndarray + if isinstance(train_x, pd.DataFrame): + X_train_fold = train_x.iloc[train_index] + X_test_fold = train_x.iloc[test_index] + else: + X_train_fold = train_x[train_index] + X_test_fold = train_x[test_index] + + if isinstance(train_y, (pd.Series, pd.DataFrame)): + y_train_fold = train_y.iloc[train_index] + y_test_fold = train_y.iloc[test_index] + else: + y_train_fold = train_y[train_index] + y_test_fold = train_y[test_index] + + lgb_model.fit(X_train_fold, y_train_fold) + predictions = lgb_model.predict(X_test_fold) predictions = np.rint(predictions) - actuals = train_y[test_index] - s = accuracy_score(actuals, predictions) + s = accuracy_score(y_test_fold, predictions) scores.append(s) else: @@ -101,8 +116,8 @@ def __call__(self, trial): param["metric"] = "l1" lgb_model = lgb_core.LGBMRegressor(**param) - lgb_model.fit(train_x[train_index], train_y[train_index]) - predictions = lgb_model.predict(train_x[test_index]) + lgb_model.fit(train_x.iloc[train_index], train_y[train_index]) + predictions = lgb_model.predict(train_x.iloc[test_index]) actuals = train_y[test_index] s = mean_absolute_error(actuals, predictions) print(s) diff --git a/autoxai4omics/models/tabauto/xgboost_model.py b/autoxai4omics/models/tabauto/xgboost_model.py index 568f298..6113093 100755 --- a/autoxai4omics/models/tabauto/xgboost_model.py +++ b/autoxai4omics/models/tabauto/xgboost_model.py @@ -90,8 +90,8 @@ def __call__(self, trial): random_state=self.random_state, ) xgb_model.set_params(**param) - xgb_model.fit(train_x[train_index], train_y[train_index]) - predictions = xgb_model.predict(train_x[test_index]) + xgb_model.fit(train_x.iloc[train_index], train_y[train_index]) + predictions = xgb_model.predict(train_x.iloc[test_index]) predictions = np.rint(predictions) actuals = train_y[test_index] s = accuracy_score(actuals, predictions) @@ -112,8 +112,8 @@ def __call__(self, trial): random_state=self.random_state, ) xgb_model.set_params(**param) - xgb_model.fit(train_x[train_index], train_y[train_index]) - predictions = xgb_model.predict(train_x[test_index]) + xgb_model.fit(train_x.iloc[train_index], train_y[train_index]) + predictions = xgb_model.predict(train_x.iloc[test_index]) actuals = train_y[test_index] s = mean_absolute_error(actuals, predictions) print(s) diff --git a/autoxai4omics/omics/tabular.py b/autoxai4omics/omics/tabular.py index 15ac116..d32171c 100755 --- a/autoxai4omics/omics/tabular.py +++ b/autoxai4omics/omics/tabular.py @@ -89,7 +89,7 @@ def get_data_tabular( mask = metadata.index.isin(filtered_data.index) filtered_metadata = metadata.loc[mask] filtered_metadata.to_csv(metout_file) - y = filtered_metadata[config_dict["data"]["target"]].values + y = filtered_metadata[config_dict["data"]["target"]] # keep as Series with SampleID index else: file = "file_path" + ("_holdout_data" if holdout else "") @@ -100,7 +100,7 @@ def get_data_tabular( # Filter y mask = target_y.index.isin(filtered_data.index) filtered_target_y = target_y.loc[mask] - y = filtered_target_y.values + y = filtered_target_y # keep index feature_names = filtered_data.columns.to_list() @@ -153,7 +153,7 @@ def get_data_tabular_trained( mask = metadata.index.isin(filtered_data.index) filtered_metadata = metadata.loc[mask] filtered_metadata.to_csv(metout_file) - y = filtered_metadata[config_dict["data"]["target"]].values + y = filtered_metadata[config_dict["data"]["target"]] else: file = "file_path" + ("_holdout_data" if holdout else "") @@ -164,7 +164,7 @@ def get_data_tabular_trained( # Filter y mask = target_y.index.isin(filtered_data.index) filtered_target_y = target_y.loc[mask] - y = filtered_target_y.values + y = filtered_target_y else: y = None diff --git a/autoxai4omics/utils/ml/class_balancing.py b/autoxai4omics/utils/ml/class_balancing.py index 290cb6e..db293ee 100755 --- a/autoxai4omics/utils/ml/class_balancing.py +++ b/autoxai4omics/utils/ml/class_balancing.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from numpy import ndarray -from pandas.core.frame import DataFrame +import numpy as np +import pandas as pd from typing import Union import imblearn import logging @@ -22,25 +22,25 @@ def oversample_data( - x_train: Union[ndarray, DataFrame], - y_train: Union[ndarray, DataFrame], + x_train: Union[np.ndarray, pd.DataFrame], + y_train: Union[np.ndarray, pd.DataFrame], seed: int = 29292, -) -> tuple[ndarray, ndarray, ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Given the training set it has a class imbalance problem, this will over sample the training data to balance out the classes Parameters ---------- - x_train : Union[ndarray, DataFrame] + x_train : Union[np.ndarray, pd.DataFrame]] The training data that needs to be re-sampled - y_train : Union[ndarray, DataFrame] + y_train : Union[np.ndarray, pd.DataFrame, pd.Series] The train labels to be re-sampled seed : int, optional The seed to control the random sampling, by default 29292 Returns ------- - tuple[ndarray,ndarray,ndarray] + tuple[np.ndarray,np.ndarray,np.ndarray] A tuple containing the re-sampled training data, labels plus and the indicies of what original samples have been used @@ -49,21 +49,21 @@ def oversample_data( TypeError is raised if the seed is not an int TypeError - is raised if x_train or y_train is not an ndarray or a pandas DataFrame + is raised if x_train or y_train is not an np.ndarray or a pandas pd.DataFrame ValueError is raised if x_train and y_train dont have the same number of rows """ if not isinstance(seed, int): raise TypeError(f"seed must be an int, recieved {type(seed)}") - if not isinstance(x_train, (ndarray, DataFrame)): + if not isinstance(x_train, (np.ndarray, pd.DataFrame)): raise TypeError( - f"x_train must be either a ndarray or a DataFrame. Recieved: {type(x_train)}" + f"x_train must be either a np.ndarray or a pd.DataFrame. Recieved: {type(x_train)}" ) - if not isinstance(y_train, (ndarray, DataFrame)): + if not isinstance(y_train, (np.ndarray, pd.DataFrame, pd.Series)): raise TypeError( - f"y_train must be either a ndarray or a DataFrame. Recieved: {type(y_train)}" + f"y_train must be either a np.ndarray, a pd.DataFrame or pd.Series. Recieved: {type(y_train)}" ) if x_train.shape[0] != y_train.shape[0]: @@ -86,25 +86,25 @@ def oversample_data( def undersample_data( - x_train: Union[ndarray, DataFrame], - y_train: Union[ndarray, DataFrame], + x_train: Union[np.ndarray, pd.DataFrame], + y_train: Union[np.ndarray, pd.DataFrame], seed: int = 29292, -) -> tuple[ndarray, ndarray, ndarray]: +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Given the training set it has a class imbalance problem, this will under sample the training data to balance out theclasses Parameters ---------- - x_train : Union[ndarray, DataFrame] + x_train : Union[np.ndarray, pd.DataFrame] The training data that needs to be re-sampled - y_train : Union[ndarray, DataFrame] + y_train : Union[np.ndarray, pd.DataFrame, pd.Series] The train labels to be re-sampled seed : int, optional The seed to control the random sampling, by default 29292 Returns ------- - tuple[ndarray,ndarray,ndarray] + tuple[np.ndarray,np.ndarray,np.ndarray] A tuple containing the re-sampled training data, labels plus and the indicies of what original samples have been used @@ -113,21 +113,21 @@ def undersample_data( TypeError is raised if the seed is not an int TypeError - is raised if x_train or y_train is not an ndarray or a pandas DataFrame + is raised if x_train or y_train is not an np.ndarray or a pandas pd.DataFrame ValueError is raised if x_train and y_train dont have the same number of rows """ if not isinstance(seed, int): raise TypeError(f"seed must be an int, recieved {type(seed)}") - if not isinstance(x_train, (ndarray, DataFrame)): + if not isinstance(x_train, (np.ndarray, pd.DataFrame)): raise TypeError( - f"x_train must be either a ndarray or a DataFrame. Recieved: {type(x_train)}" + f"x_train must be either a np.ndarray or a pd.DataFrame.. Recieved: {type(x_train)}" ) - if not isinstance(y_train, (ndarray, DataFrame)): + if not isinstance(y_train, (np.ndarray, pd.DataFrame, pd.Series)): raise TypeError( - f"x_train must be either a ndarray or a DataFrame. Recieved: {type(x_train)}" + f"y_train must be either a np.ndarray, a pd.DataFrame or pd.Series. Recieved: {type(y_train)}" ) if x_train.shape[0] != y_train.shape[0]: diff --git a/autoxai4omics/utils/ml/data_split.py b/autoxai4omics/utils/ml/data_split.py index a7102c1..5df1f00 100755 --- a/autoxai4omics/utils/ml/data_split.py +++ b/autoxai4omics/utils/ml/data_split.py @@ -115,9 +115,9 @@ def strat_split( f"x must be either a ndarray or a DataFrame. Recieved: {type(x)}" ) - if not isinstance(y, (ndarray, DataFrame)): + if not isinstance(y, (ndarray, DataFrame, pd.Series)): raise TypeError( - f"x must be either a ndarray or a DataFrame. Recieved: {type(y)}" + f"y must be either a ndarray or a DataFrame. Recieved: {type(y)}" ) if x.shape[0] != y.shape[0]: @@ -146,6 +146,12 @@ def strat_split( raise TypeError(f"group_name must be a str, provided: {type(group_name)}") metadata = pd.read_csv(meta_file, index_col=0) + + # Align metadata to X (same order, drop extras) + metadata = metadata.reindex(x.index) + + if isinstance(y, (pd.Series, pd.DataFrame)): + y = y.reindex(x.index) if group_name not in metadata.columns: raise ValueError( @@ -168,8 +174,8 @@ def strat_split( x_train, x_test, y_train, y_test = ( x.iloc[train_idx, :], x.iloc[test_idx, :], - y.iloc[train_idx, :], - y.iloc[test_idx, :], + y.iloc[train_idx], + y.iloc[test_idx], ) else: x_train, x_test, y_train, y_test = ( @@ -247,7 +253,7 @@ def std_split( f"x_full must be either a ndarray or a DataFrame. Recieved: {type(x_full)}" ) - if not isinstance(y_full, (ndarray, DataFrame)): + if not isinstance(y_full, (ndarray, DataFrame, pd.Series)): raise TypeError( f"y_full must be either a ndarray or a DataFrame. Recieved: {type(y_full)}" ) diff --git a/autoxai4omics/utils/ml/preprocessing.py b/autoxai4omics/utils/ml/preprocessing.py index 4a952db..55dd1a8 100644 --- a/autoxai4omics/utils/ml/preprocessing.py +++ b/autoxai4omics/utils/ml/preprocessing.py @@ -15,6 +15,7 @@ import joblib import logging import numpy as np +import pandas as pd omicLogger = logging.getLogger("OmicLogger") @@ -93,11 +94,24 @@ def learn_ml_preprocessing( else: omicLogger.info("Skipping class balancing...") + # Ensure x_train and x_test are DataFrames with indices preserved + if not isinstance(x_train, pd.DataFrame): + x_train = pd.DataFrame(x_train, index=x_ind_train, columns=features_names) + if not isinstance(x_test, pd.DataFrame): + x_test = pd.DataFrame(x_test, index=x_ind_test, columns=features_names) + + # also force y indices to match x indices (they should be same since in split_data they are split and should have same ids) + if isinstance(y_train, (pd.Series, pd.DataFrame)): + y_train.index = x_train.index + if isinstance(y_test, (pd.Series, pd.DataFrame)): + y_test.index = x_test.index + omicLogger.info("Re-combining data...") - # concatenate both test and train into test - x = np.concatenate((x_train, x_test)) - # y needs to be re-concatenated as the ordering of x may have been changed in splitting - y = np.concatenate((y_train, y_test)) + + x = pd.concat([x_train, x_test]) + y = pd.concat([ + pd.Series(y_train, index=x_ind_train, name="target"), + pd.Series(y_test, index=x_ind_test, name="target")]) # save the transformed input data omicLogger.info("Saving transformed data...") diff --git a/autoxai4omics/utils/save.py b/autoxai4omics/utils/save.py index 76ff5b1..c241942 100755 --- a/autoxai4omics/utils/save.py +++ b/autoxai4omics/utils/save.py @@ -164,10 +164,15 @@ def save_transformed_data( omicLogger.info(f"saving input data to: {save_path}") x_df.to_csv(save_path, index=True) - y_df = pd.DataFrame(y, columns=["target"]) + # If y is already a Series with proper index, will keep it + if isinstance(y, pd.Series): + y_df = y.to_frame(name="target") + else: + # if y is numpy + y_df = pd.DataFrame(y, columns=["target"], index=list(x_ind_train) + list(x_ind_test)) + y_df["set"] = "Train" - y_df["set"].iloc[-y_test.shape[0] :] = "Test" - y_df.index = list(x_ind_train) + list(x_ind_test) + y_df.loc[x_ind_test, "set"] = "Test" # mark test rows using their index y_df.index.name = "SampleID" save_path = experiment_folder / "transformed_model_target_data.csv" omicLogger.info(f"saving target data to: {save_path}")