Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions supervised/algorithms/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def catboost_eval_metric(ml_task, eval_metric):
"rmse": "RMSE",
"mse": "mse",
"mae": "MAE",
"mape": "MAPE",
"map": "MAP",
"r2": "R2",
"spearman": "spearman",
"pearson": "pearson",
Expand All @@ -73,7 +73,7 @@ def catboost_objective(ml_task, eval_metric):
"spearman",
"pearson",
"user_defined_metric",
]: # cant optimize them directly
]: # can't optimize them directly
objective = "RMSE"
return objective

Expand Down Expand Up @@ -333,8 +333,8 @@ def get_metric_name(self):
return "mse"
elif metric == "MAE":
return "mae"
elif metric == "MAPE":
return "mape"
elif metric == "MAP":
return "map"
elif metric in ["F1", "TotalF1:average=Micro"]:
return "f1"
elif metric == "Accuracy":
Expand Down Expand Up @@ -404,7 +404,7 @@ class CBClassifier(ClassifierMixin, CatBoostAlgorithm):
)

regression_params = copy.deepcopy(classification_params)
regression_params["loss_function"] = ["RMSE", "MAE", "MAPE"]
regression_params["loss_function"] = ["RMSE", "MAE", "MAP"]

regression_required_preprocessing = [
"missing_values_inputation",
Expand Down
2 changes: 1 addition & 1 deletion supervised/algorithms/lightgbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def lightgbm_eval_metric(ml_task, automl_eval_metric):
"rmse": "rmse",
"mse": "l2",
"mae": "l1",
"mape": "mape",
"map": "map",
"r2": "custom",
"spearman": "custom",
"pearson": "custom",
Expand Down
2 changes: 1 addition & 1 deletion supervised/algorithms/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def get_metric_name(self):

xgb_regression_params = dict(xgb_bin_class_params)
xgb_regression_params["objective"] = ["reg:squarederror"]
# xgb_regression_params["eval_metric"] = ["rmse", "mae", "mape"]
# xgb_regression_params["eval_metric"] = ["rmse", "mae", "map"]
xgb_regression_params["max_depth"] = [4, 5, 6, 7, 8, 9]


Expand Down
22 changes: 16 additions & 6 deletions supervised/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def __init__(

Arguments:
results_path (str): The path with results. If None, then the name of directory will be generated with the template: AutoML_{number},
where the number can be from 1 to 1,000 - depends which direcory name will be available.
where the number can be from 1 to 1,000 - depends which directory name will be available.
If the `results_path` will point to directory with AutoML results (`params.json` must be present),
then all models will be loaded.

Expand Down Expand Up @@ -154,13 +154,13 @@ def __init__(

- for binary classification: `logloss`, `auc`, `f1`, `average_precision`, `accuracy` - default is logloss (if left "auto")
- for multiclass classification: `logloss`, `f1`, `accuracy` - default is `logloss` (if left "auto")
- for regression: `rmse`, `mse`, `mae`, `r2`, `mape`, `spearman`, `pearson` - default is `rmse` (if left "auto")
- for regression: `rmse`, `mse`, `mae`, `r2`, `map`, `spearman`, `pearson` - default is `rmse` (if left "auto")

validation_strategy (dict): Dictionary with validation type. Right now train/test split and cross-validation are supported.

Example:

Cross-validation exmaple:
Cross-validation example:
{
"validation_type": "kfold",
"k_folds": 5,
Expand All @@ -169,6 +169,16 @@ def __init__(
"random_seed": 123
}

Repeated k-fold cross-validation example:
{
"validation_type": "kfold",
"k_folds": 5,
"n_repetitions": 3,
"shuffle": True,
"stratify": True,
"random_seed": 123
}

Train/test example:
{
"validation_type": "split",
Expand Down Expand Up @@ -261,16 +271,16 @@ def __init__(
- `group_loss_ratio` - default metric.


fairness_threshold (float): The treshold value for fairness metric.
fairness_threshold (float): The threshold value for fairness metric.
The direction optimization (below or above threshold) of fairness metric is determined automatically.

Default values:

- for `demographic_parity_difference` the metric value should be below 0.1,
- for `demographic_parity_ratio` the metric value should be above 0.8,
- for `equalized_odds_difference` the metric value should be below 0.1,
- for `equalized_odds_ratio` the metric value shoule be above 0.8.
- for `group_loss_ratio` the metric value shoule be above 0.8.
- for `equalized_odds_ratio` the metric value should be above 0.8.
- for `group_loss_ratio` the metric value should be above 0.8.

For `group_loss_difference` the default threshold value can't be set because it depends on the dataset.
If `group_loss_difference` metric is used and `fairness_threshold` is not specified manually, then an exception will be raised.
Expand Down
20 changes: 11 additions & 9 deletions supervised/base_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,9 @@ def create_dir(self, model_path):

def _expected_learners_cnt(self):
try:
repeats = self._validation_strategy.get("repeats", 1)
repeats = self._validation_strategy.get(
"repeats", self._validation_strategy.get("n_repetitions", 1)
)
folds = self._validation_strategy.get("k_folds", 1)
return repeats * folds
except Exception as e:
Expand Down Expand Up @@ -949,7 +951,7 @@ def _set_adjusted_validation(self):

self.verbose_print(f"Validation strategy: {k_folds}-fold CV {','.join(cv)}")
else:
# cant stack models for train/test split
# can't stack models for train/test split
self._stack_models = False
self.verbose_print("Disable stacking for split validation")

Expand All @@ -961,7 +963,7 @@ def _apply_constraints_stack_models(self):
self.verbose_print("Disable stacking for split validation")
self._stack_models = False
self._boost_on_errors = False
if "repeats" in self._validation_strategy:
if "repeats" in self._validation_strategy or "n_repetitions" in self._validation_strategy:
if self._stack_models:
self.verbose_print("Disable stacking for repeated validation")
self._stack_models = False
Expand All @@ -972,7 +974,7 @@ def _apply_constraints_stack_models(self):
self.tuner._stack_models = self._stack_models
self.tuner._boost_on_errors = self._boost_on_errors

# update Time Controler
# update Time Controller
if self._time_ctrl is not None:
self._time_ctrl._is_stacking = self._stack_models

Expand Down Expand Up @@ -1037,10 +1039,10 @@ def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None):
self._adjust_validation = False
self._apply_constraints()
if not self._adjust_validation:
# if there is no validation adjustement
# if there is no validation adjustment
# then we can apply stack_models constraints immediately
# if there is validation adjustement
# then we will apply contraints after the adjustement
# if there is validation adjustment
# then we will apply constraints after the adjustment
self._apply_constraints_stack_models()

try:
Expand Down Expand Up @@ -1078,7 +1080,7 @@ def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None):

# Automatic Exploratory Data Analysis
# I disabled EDA, because it won't be supported
# I recomend use pandas_profiling or Sweetviz
# I recommend use pandas_profiling or Sweetviz
# if self._explain_level == 2:
# EDA.compute(X, y, os.path.join(self._results_path, "EDA"))

Expand Down Expand Up @@ -2062,7 +2064,7 @@ def _validate_eval_metric(self):
"mse",
"mae",
"r2",
"mape",
"map",
"spearman",
"pearson",
]:
Expand Down
12 changes: 6 additions & 6 deletions supervised/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,13 @@ def get_oof_matrix(self, models):
m for m in models if m.is_fast_enough(self._max_single_prediction_time)
]:
raise NotTrainedException(
"Can't contruct ensemble with prediction time smaller than limit."
"Can't construct ensemble with prediction time smaller than limit."
)

# check if we can construct fair ensemble
if self._fairness_metric is not None:
if not [m for m in models if m.is_fair()]:
raise NotTrainedException("Can't contruct fair ensemble.")
raise NotTrainedException("Can't construct fair ensemble.")

oofs = {}
sensitive_features = None
Expand Down Expand Up @@ -336,10 +336,10 @@ def fit(self, oofs, y, sample_weight=None, sensitive_features=None):
self.sensitive_features = sensitive_features
start_time = time.time()
selected_algs_cnt = 0 # number of selected algorithms
self.best_algs = [] # selected algoritms indices from each loop
self.best_algs = [] # selected algorithms indices from each loop

total_prediction_time = 0
best_sum = None # sum of best algorihtms
best_sum = None # sum of best algorithms
for j in range(len(oofs)): # iterate over all solutions
min_score = self.metric.get_maximum()
best_model = None
Expand Down Expand Up @@ -379,7 +379,7 @@ def fit(self, oofs, y, sample_weight=None, sensitive_features=None):
self.best_loss = min_score
selected_algs_cnt = j

self.best_algs.append(best_model) # save the best algoritm
self.best_algs.append(best_model) # save the best algorithm
# update best_sum value
best_sum = (
oofs[best_model] if best_sum is None else best_sum + oofs[best_model]
Expand Down Expand Up @@ -531,7 +531,7 @@ def save(self, results_path, model_subpath):

LearningCurves.plot_for_ensemble(self._scores, self.metric.name, model_path)

# call additional metics just to be sure they are computed
# call additional metrics just to be sure they are computed
self._additional_metrics = self.get_additional_metrics()

AdditionalMetrics.save(
Expand Down
2 changes: 1 addition & 1 deletion supervised/fairness/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def regression(
mean_squared_error(t, p, sample_weight=sample_weight)
),
"R2": r2_score,
"MAPE": mean_absolute_percentage_error,
"MAP": mean_absolute_percentage_error,
"SPEARMAN": spearman,
"PEARSON": pearson,
}
Expand Down
2 changes: 1 addition & 1 deletion supervised/fairness/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def save_classification(
if v.get("privileged_value") is not None:
fout.write(f'Privileged value is {v["privileged_value"]}.\n')

# add fairness cerificate
# add fairness certificate
FairnessReport.write_certificate_section(
fout,
FairnessReport.certificate_info(
Expand Down
19 changes: 10 additions & 9 deletions supervised/model_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def get_train_time(self):
def predictions(
self,
learner,
preproces,
preprocess,
X_train,
y_train,
sample_weight,
Expand All @@ -106,17 +106,17 @@ def predictions(
y_validation_true = y_validation
y_validation_predicted = learner.predict(X_validation)

y_train_true = preproces.inverse_scale_target(y_train_true)
y_train_predicted = preproces.inverse_scale_target(y_train_predicted)
y_validation_true = preproces.inverse_scale_target(y_validation_true)
y_validation_predicted = preproces.inverse_scale_target(y_validation_predicted)
y_train_true = preprocess.inverse_scale_target(y_train_true)
y_train_predicted = preprocess.inverse_scale_target(y_train_predicted)
y_validation_true = preprocess.inverse_scale_target(y_validation_true)
y_validation_predicted = preprocess.inverse_scale_target(y_validation_predicted)

y_validation_columns = []
if self._ml_task == MULTICLASS_CLASSIFICATION:
# y_train_true = preproces.inverse_categorical_target(y_train_true)
# y_validation_true = preproces.inverse_categorical_target(y_validation_true)
# y_train_true = preprocess.inverse_categorical_target(y_train_true)
# y_validation_true = preprocess.inverse_categorical_target(y_validation_true)
# get columns, omit the last one (it is label)
y_validation_columns = preproces.prepare_target_labels(
y_validation_columns = preprocess.prepare_target_labels(
y_validation_predicted
).columns.tolist()[:-1]
elif self._ml_task == BINARY_CLASSIFICATION:
Expand Down Expand Up @@ -643,7 +643,7 @@ def save(self, results_path, model_subpath):
trees_in_iteration=self.additional_params.get("trees_in_step"),
)

# call additional metics just to be sure they are computed
# call additional metrics just to be sure they are computed
self._additional_metrics = self.get_additional_metrics()

AdditionalMetrics.save(
Expand Down Expand Up @@ -721,6 +721,7 @@ def load(results_path, model_subpath, lazy_load=True):
for learner_desc, learner_subpath in zip(
json_desc.get("learners"), json_desc.get("saved")
):
learner_subpath = os.path.normpath(learner_subpath)
learner_path = os.path.join(results_path, learner_subpath)
l = AlgorithmFactory.load(learner_desc, learner_path, lazy_load)
mf.learners += [l]
Expand Down
4 changes: 2 additions & 2 deletions supervised/preprocessing/goldenfeatures_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,9 @@ def fit(self, X, y):
)
return
if X.shape[1] == 0:
self._error = f"Golden Features not created. No continous features. Input data shape: {X.shape}, {y.shape}"
self._error = f"Golden Features not created. No continuous features. Input data shape: {X.shape}, {y.shape}"
self.save()
raise AutoMLException("Golden Features not created. No continous features.")
raise AutoMLException("Golden Features not created. No continuous features.")

start_time = time.time()
combinations = itertools.combinations(X.columns, r=2)
Expand Down
4 changes: 2 additions & 2 deletions supervised/preprocessing/kmeans_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def fit(self, X, y):
)
return
if X.shape[1] == 0:
self._error = f"KMeans not created. No continous features. Input data shape: {X.shape}, {y.shape}"
raise AutoMLException("KMeans Features not created. No continous features.")
self._error = f"KMeans not created. No continuous features. Input data shape: {X.shape}, {y.shape}"
raise AutoMLException("KMeans Features not created. No continuous features.")

start_time = time.time()

Expand Down
8 changes: 4 additions & 4 deletions supervised/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,14 +281,14 @@ def fit_and_transform(self, X_train, y_train, sample_weight=None):
drop_cols = [c for c in self._drop_features if c in available_cols]
if len(drop_cols) == X_train.shape[1]:
raise AutoMLException(
"All features are droppped! Your data looks like random data."
"All features are dropped! Your data looks like random data."
)
if drop_cols:
X_train.drop(drop_cols, axis=1, inplace=True)
self._drop_features = drop_cols

if X_train is not None:
# there can be catagorical columns (in CatBoost) which cant be clipped
# there can be categorical columns (in CatBoost) which can't be clipped
numeric_cols = X_train.select_dtypes(include="number").columns.tolist()
X_train[numeric_cols] = X_train[numeric_cols].clip(
lower=np.finfo(np.float32).min + 1000,
Expand Down Expand Up @@ -374,7 +374,7 @@ def transform(self, X_validation, y_validation, sample_weight_validation=None):
# we should notice user about it!
# warnings should go to the separate file ...
# warnings.warn(
# "There are columns {} with missing values which didnt have missing values in train dataset.".format(
# "There are columns {} with missing values which didn't have missing values in train dataset.".format(
# list(
# X_validation.columns[np.where(np.sum(pd.isnull(X_validation)))]
# )
Expand Down Expand Up @@ -415,7 +415,7 @@ def transform(self, X_validation, y_validation, sample_weight_validation=None):
X_validation.drop(self._drop_features, axis=1, inplace=True)

if X_validation is not None:
# there can be catagorical columns (in CatBoost) which cant be clipped
# there can be categorical columns (in CatBoost) which can't be clipped
numeric_cols = X_validation.select_dtypes(include="number").columns.tolist()
X_validation[numeric_cols] = X_validation[numeric_cols].clip(
lower=np.finfo(np.float32).min + 1000,
Expand Down
2 changes: 1 addition & 1 deletion supervised/preprocessing/preprocessing_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def transform(self, X):
X = self._transform_na_fill(X)
# this is additional run through columns,
# in case of transforming data with new columns with missing values
# X = self._make_sure_na_filled(X) # disbaled for now
# X = self._make_sure_na_filled(X) # disabled for now
return X

def _transform_na_fill(self, X):
Expand Down
4 changes: 2 additions & 2 deletions supervised/preprocessing/preprocessing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class PreprocessingUtilsException(Exception):

class PreprocessingUtils(object):
CATEGORICAL = "categorical"
CONTINOUS = "continous"
CONTINUOUS = "continuous"
DISCRETE = "discrete"
DATETIME = "datetime"
TEXT = "text"
Expand All @@ -26,7 +26,7 @@ def get_type(x):

data_type = PreprocessingUtils.CATEGORICAL
if col_type.startswith("float"):
data_type = PreprocessingUtils.CONTINOUS
data_type = PreprocessingUtils.CONTINUOUS
elif col_type.startswith("int") or col_type.startswith("uint"):
data_type = PreprocessingUtils.DISCRETE
elif col_type.startswith("datetime"):
Expand Down
Loading