Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions autoxai4omics/models/tabauto/keras_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Flatten, Conv1D
from tensorflow.keras import losses
import numpy as np
import pandas as pd

import tensorflow.keras.optimizers.legacy

Expand Down Expand Up @@ -374,13 +376,19 @@ def fit_data_fx(self, trainX, trainY, testX, testY, input_list=None):
self.model.feature_importances_ = feature_importances

def fit_data(self, trainX, trainY, testX=None, testY=None, input_list=None):
    """Convert the inputs to numpy arrays and hand them to the configured trainer.

    Any of ``trainX``, ``trainY``, ``testX`` and ``testY`` that exposes a
    ``to_numpy`` method is replaced by the result of calling it; every other
    value (including ``None``) is forwarded untouched.

    Returns the result of ``fit_data_fx`` when ``self.method`` is
    ``"train_dnn_keras"``, the result of ``fit_data_ak`` when it is
    ``"train_dnn_autokeras"``, and ``None`` for any other value.
    """

    def _as_array(data):
        # Duck-typed on purpose: anything offering ``to_numpy`` is converted.
        if hasattr(data, "to_numpy"):
            return data.to_numpy()
        return data

    # Same conversion order as before: X train, X test, y train, y test.
    trainX, testX, trainY, testY = [
        _as_array(data) for data in (trainX, testX, trainY, testY)
    ]

    if self.method == "train_dnn_keras":
        return self.fit_data_fx(trainX, trainY, testX, testY, input_list)
    if self.method == "train_dnn_autokeras":
        return self.fit_data_ak(trainX, trainY, testX, testY, input_list)
    return None

def predict(self, x):
print("predicting values ...")
x = x.to_numpy() if isinstance(x, pd.DataFrame) else x
if self.conv1d:
x = x.reshape((x.shape[0], x.shape[1], 1))

Expand All @@ -397,6 +405,7 @@ def predict(self, x):

def predict_proba(self, x):
print("predicting probs ...")
x = x.to_numpy() if isinstance(x, pd.DataFrame) else x
if self.conv1d:
x = x.reshape((x.shape[0], x.shape[1], 1))

Expand Down
29 changes: 22 additions & 7 deletions autoxai4omics/models/tabauto/lgbm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import lightgbm as lgb_core
import numpy as np
import optuna

import pandas as pd

def to_matrix(data, n):
    """Split ``data`` into consecutive slices of at most ``n`` items.

    The slices are returned in order as a list; the last one may be shorter
    than ``n`` when ``len(data)`` is not a multiple of ``n``.
    """
    rows = []
    for start in range(0, len(data), n):
        rows.append(data[start : start + n])
    return rows
Expand Down Expand Up @@ -74,11 +74,26 @@ def __call__(self, trial):
scores = []
for train_index, test_index in kf.split(train_x):
lgb_model = lgb_core.LGBMClassifier(**param)
lgb_model.fit(train_x[train_index], train_y[train_index])
predictions = lgb_model.predict(train_x[test_index])

# Handle DataFrame vs ndarray
if isinstance(train_x, pd.DataFrame):
X_train_fold = train_x.iloc[train_index]
X_test_fold = train_x.iloc[test_index]
else:
X_train_fold = train_x[train_index]
X_test_fold = train_x[test_index]

if isinstance(train_y, (pd.Series, pd.DataFrame)):
y_train_fold = train_y.iloc[train_index]
y_test_fold = train_y.iloc[test_index]
else:
y_train_fold = train_y[train_index]
y_test_fold = train_y[test_index]

lgb_model.fit(X_train_fold, y_train_fold)
predictions = lgb_model.predict(X_test_fold)
predictions = np.rint(predictions)
actuals = train_y[test_index]
s = accuracy_score(actuals, predictions)
s = accuracy_score(y_test_fold, predictions)
scores.append(s)

else:
Expand All @@ -101,8 +116,8 @@ def __call__(self, trial):
param["metric"] = "l1"

lgb_model = lgb_core.LGBMRegressor(**param)
lgb_model.fit(train_x[train_index], train_y[train_index])
predictions = lgb_model.predict(train_x[test_index])
lgb_model.fit(train_x.iloc[train_index], train_y[train_index])
predictions = lgb_model.predict(train_x.iloc[test_index])
actuals = train_y[test_index]
s = mean_absolute_error(actuals, predictions)
print(s)
Expand Down
8 changes: 4 additions & 4 deletions autoxai4omics/models/tabauto/xgboost_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ def __call__(self, trial):
random_state=self.random_state,
)
xgb_model.set_params(**param)
xgb_model.fit(train_x[train_index], train_y[train_index])
predictions = xgb_model.predict(train_x[test_index])
xgb_model.fit(train_x.iloc[train_index], train_y[train_index])
predictions = xgb_model.predict(train_x.iloc[test_index])
predictions = np.rint(predictions)
actuals = train_y[test_index]
s = accuracy_score(actuals, predictions)
Expand All @@ -112,8 +112,8 @@ def __call__(self, trial):
random_state=self.random_state,
)
xgb_model.set_params(**param)
xgb_model.fit(train_x[train_index], train_y[train_index])
predictions = xgb_model.predict(train_x[test_index])
xgb_model.fit(train_x.iloc[train_index], train_y[train_index])
predictions = xgb_model.predict(train_x.iloc[test_index])
actuals = train_y[test_index]
s = mean_absolute_error(actuals, predictions)
print(s)
Expand Down
8 changes: 4 additions & 4 deletions autoxai4omics/omics/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def get_data_tabular(
mask = metadata.index.isin(filtered_data.index)
filtered_metadata = metadata.loc[mask]
filtered_metadata.to_csv(metout_file)
y = filtered_metadata[config_dict["data"]["target"]].values
y = filtered_metadata[config_dict["data"]["target"]] # keep as Series with SampleID index

else:
file = "file_path" + ("_holdout_data" if holdout else "")
Expand All @@ -100,7 +100,7 @@ def get_data_tabular(
# Filter y
mask = target_y.index.isin(filtered_data.index)
filtered_target_y = target_y.loc[mask]
y = filtered_target_y.values
y = filtered_target_y # keep index

feature_names = filtered_data.columns.to_list()

Expand Down Expand Up @@ -153,7 +153,7 @@ def get_data_tabular_trained(
mask = metadata.index.isin(filtered_data.index)
filtered_metadata = metadata.loc[mask]
filtered_metadata.to_csv(metout_file)
y = filtered_metadata[config_dict["data"]["target"]].values
y = filtered_metadata[config_dict["data"]["target"]]

else:
file = "file_path" + ("_holdout_data" if holdout else "")
Expand All @@ -164,7 +164,7 @@ def get_data_tabular_trained(
# Filter y
mask = target_y.index.isin(filtered_data.index)
filtered_target_y = target_y.loc[mask]
y = filtered_target_y.values
y = filtered_target_y
else:
y = None

Expand Down
48 changes: 24 additions & 24 deletions autoxai4omics/utils/ml/class_balancing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from numpy import ndarray
from pandas.core.frame import DataFrame
import numpy as np
import pandas as pd
from typing import Union
import imblearn
import logging
Expand All @@ -22,25 +22,25 @@


def oversample_data(
x_train: Union[ndarray, DataFrame],
y_train: Union[ndarray, DataFrame],
x_train: Union[np.ndarray, pd.DataFrame],
y_train: Union[np.ndarray, pd.DataFrame],
seed: int = 29292,
) -> tuple[ndarray, ndarray, ndarray]:
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Given the training set it has a class imbalance problem, this will over sample the training data to balance out
the classes

Parameters
----------
x_train : Union[ndarray, DataFrame]
x_train : Union[np.ndarray, pd.DataFrame]
The training data that needs to be re-sampled
y_train : Union[ndarray, DataFrame]
y_train : Union[np.ndarray, pd.DataFrame, pd.Series]
The train labels to be re-sampled
seed : int, optional
The seed to control the random sampling, by default 29292

Returns
-------
tuple[ndarray,ndarray,ndarray]
tuple[np.ndarray,np.ndarray,np.ndarray]
A tuple containing the re-sampled training data, labels plus and the indicies of what original samples have been
used

Expand All @@ -49,21 +49,21 @@ def oversample_data(
TypeError
is raised if the seed is not an int
TypeError
is raised if x_train or y_train is not an ndarray or a pandas DataFrame
is raised if x_train is not a np.ndarray or pd.DataFrame, or if y_train is not a np.ndarray, pd.DataFrame or pd.Series
ValueError
is raised if x_train and y_train dont have the same number of rows
"""
if not isinstance(seed, int):
raise TypeError(f"seed must be an int, recieved {type(seed)}")

if not isinstance(x_train, (ndarray, DataFrame)):
if not isinstance(x_train, (np.ndarray, pd.DataFrame)):
raise TypeError(
f"x_train must be either a ndarray or a DataFrame. Recieved: {type(x_train)}"
f"x_train must be either a np.ndarray or a pd.DataFrame. Recieved: {type(x_train)}"
)

if not isinstance(y_train, (ndarray, DataFrame)):
if not isinstance(y_train, (np.ndarray, pd.DataFrame, pd.Series)):
raise TypeError(
f"y_train must be either a ndarray or a DataFrame. Recieved: {type(y_train)}"
f"y_train must be either a np.ndarray, a pd.DataFrame or pd.Series. Recieved: {type(y_train)}"
)

if x_train.shape[0] != y_train.shape[0]:
Expand All @@ -86,25 +86,25 @@ def oversample_data(


def undersample_data(
x_train: Union[ndarray, DataFrame],
y_train: Union[ndarray, DataFrame],
x_train: Union[np.ndarray, pd.DataFrame],
y_train: Union[np.ndarray, pd.DataFrame],
seed: int = 29292,
) -> tuple[ndarray, ndarray, ndarray]:
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Given the training set it has a class imbalance problem, this will under sample the training data to balance out
the classes

Parameters
----------
x_train : Union[ndarray, DataFrame]
x_train : Union[np.ndarray, pd.DataFrame]
The training data that needs to be re-sampled
y_train : Union[ndarray, DataFrame]
y_train : Union[np.ndarray, pd.DataFrame, pd.Series]
The train labels to be re-sampled
seed : int, optional
The seed to control the random sampling, by default 29292

Returns
-------
tuple[ndarray,ndarray,ndarray]
tuple[np.ndarray,np.ndarray,np.ndarray]
A tuple containing the re-sampled training data, labels plus and the indicies of what original samples have been
used

Expand All @@ -113,21 +113,21 @@ def undersample_data(
TypeError
is raised if the seed is not an int
TypeError
is raised if x_train or y_train is not an ndarray or a pandas DataFrame
is raised if x_train is not a np.ndarray or pd.DataFrame, or if y_train is not a np.ndarray, pd.DataFrame or pd.Series
ValueError
is raised if x_train and y_train dont have the same number of rows
"""
if not isinstance(seed, int):
raise TypeError(f"seed must be an int, recieved {type(seed)}")

if not isinstance(x_train, (ndarray, DataFrame)):
if not isinstance(x_train, (np.ndarray, pd.DataFrame)):
raise TypeError(
f"x_train must be either a ndarray or a DataFrame. Recieved: {type(x_train)}"
f"x_train must be either a np.ndarray or a pd.DataFrame.. Recieved: {type(x_train)}"
)

if not isinstance(y_train, (ndarray, DataFrame)):
if not isinstance(y_train, (np.ndarray, pd.DataFrame, pd.Series)):
raise TypeError(
f"x_train must be either a ndarray or a DataFrame. Recieved: {type(x_train)}"
f"y_train must be either a np.ndarray, a pd.DataFrame or pd.Series. Recieved: {type(y_train)}"
)

if x_train.shape[0] != y_train.shape[0]:
Expand Down
16 changes: 11 additions & 5 deletions autoxai4omics/utils/ml/data_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,9 @@ def strat_split(
f"x must be either a ndarray or a DataFrame. Recieved: {type(x)}"
)

if not isinstance(y, (ndarray, DataFrame)):
if not isinstance(y, (ndarray, DataFrame, pd.Series)):
raise TypeError(
f"x must be either a ndarray or a DataFrame. Recieved: {type(y)}"
f"y must be either a ndarray or a DataFrame. Recieved: {type(y)}"
)

if x.shape[0] != y.shape[0]:
Expand Down Expand Up @@ -146,6 +146,12 @@ def strat_split(
raise TypeError(f"group_name must be a str, provided: {type(group_name)}")

metadata = pd.read_csv(meta_file, index_col=0)

# Align metadata to X (same order, drop extras)
metadata = metadata.reindex(x.index)

if isinstance(y, (pd.Series, pd.DataFrame)):
y = y.reindex(x.index)

if group_name not in metadata.columns:
raise ValueError(
Expand All @@ -168,8 +174,8 @@ def strat_split(
x_train, x_test, y_train, y_test = (
x.iloc[train_idx, :],
x.iloc[test_idx, :],
y.iloc[train_idx, :],
y.iloc[test_idx, :],
y.iloc[train_idx],
y.iloc[test_idx],
)
else:
x_train, x_test, y_train, y_test = (
Expand Down Expand Up @@ -247,7 +253,7 @@ def std_split(
f"x_full must be either a ndarray or a DataFrame. Recieved: {type(x_full)}"
)

if not isinstance(y_full, (ndarray, DataFrame)):
if not isinstance(y_full, (ndarray, DataFrame, pd.Series)):
raise TypeError(
f"y_full must be either a ndarray or a DataFrame. Recieved: {type(y_full)}"
)
Expand Down
22 changes: 18 additions & 4 deletions autoxai4omics/utils/ml/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import joblib
import logging
import numpy as np
import pandas as pd

omicLogger = logging.getLogger("OmicLogger")

Expand Down Expand Up @@ -93,11 +94,24 @@ def learn_ml_preprocessing(
else:
omicLogger.info("Skipping class balancing...")

# Ensure x_train and x_test are DataFrames with indices preserved
if not isinstance(x_train, pd.DataFrame):
x_train = pd.DataFrame(x_train, index=x_ind_train, columns=features_names)
if not isinstance(x_test, pd.DataFrame):
x_test = pd.DataFrame(x_test, index=x_ind_test, columns=features_names)

# also force y indices to match x indices (they should be same since in split_data they are split and should have same ids)
if isinstance(y_train, (pd.Series, pd.DataFrame)):
y_train.index = x_train.index
if isinstance(y_test, (pd.Series, pd.DataFrame)):
y_test.index = x_test.index

omicLogger.info("Re-combining data...")
# concatenate train and test back into a single x (train rows first, then test rows)
x = np.concatenate((x_train, x_test))
# y needs to be re-concatenated as the ordering of x may have been changed in splitting
y = np.concatenate((y_train, y_test))

x = pd.concat([x_train, x_test])
y = pd.concat([
pd.Series(y_train, index=x_ind_train, name="target"),
pd.Series(y_test, index=x_ind_test, name="target")])

# save the transformed input data
omicLogger.info("Saving transformed data...")
Expand Down
11 changes: 8 additions & 3 deletions autoxai4omics/utils/save.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,15 @@ def save_transformed_data(
omicLogger.info(f"saving input data to: {save_path}")
x_df.to_csv(save_path, index=True)

y_df = pd.DataFrame(y, columns=["target"])
# If y is already a Series with proper index, will keep it
if isinstance(y, pd.Series):
y_df = y.to_frame(name="target")
else:
# if y is numpy
y_df = pd.DataFrame(y, columns=["target"], index=list(x_ind_train) + list(x_ind_test))

y_df["set"] = "Train"
y_df["set"].iloc[-y_test.shape[0] :] = "Test"
y_df.index = list(x_ind_train) + list(x_ind_test)
y_df.loc[x_ind_test, "set"] = "Test" # mark test rows using their index
y_df.index.name = "SampleID"
save_path = experiment_folder / "transformed_model_target_data.csv"
omicLogger.info(f"saving target data to: {save_path}")
Expand Down