Commit 3b8bbc7

added graceful OOM handling (#354)
* added graceful OOM handling
* fix for pytorch <1.13
1 parent 1b2abf5 commit 3b8bbc7

4 files changed: 148 additions & 55 deletions

src/pytorch_tabular/tabular_model.py

Lines changed: 47 additions & 9 deletions
@@ -50,7 +50,13 @@
     PreEncoded1dLayer,
 )
 from pytorch_tabular.tabular_datamodule import TabularDatamodule
-from pytorch_tabular.utils import get_logger, getattr_nested, pl_load
+from pytorch_tabular.utils import (
+    OOMException,
+    OutOfMemoryHandler,
+    get_logger,
+    getattr_nested,
+    pl_load,
+)

 try:
     import captum.attr
@@ -574,6 +580,7 @@ def train(
         callbacks: Optional[List[pl.Callback]] = None,
         max_epochs: int = None,
         min_epochs: int = None,
+        handle_oom: bool = True,
     ) -> pl.Trainer:
         """Trains the model.
@@ -589,6 +596,8 @@ def train(
             min_epochs (Optional[int]): Overwrite minimum number of epochs to be run. Defaults to None.

+            handle_oom (bool): If True, will try to handle OOM errors elegantly. Defaults to True.
+
         Returns:
             pl.Trainer: The PyTorch Lightning Trainer instance
         """
@@ -601,18 +610,36 @@ def train(
         if self.config.auto_lr_find and (not self.config.fast_dev_run):
             if self.verbose:
                 logger.info("Auto LR Find Started")
-            result = Tuner(self.trainer).lr_find(self.model, train_dataloaders=train_loader, val_dataloaders=val_loader)
+            with OutOfMemoryHandler(handle_oom=handle_oom) as oom_handler:
+                result = Tuner(self.trainer).lr_find(
+                    self.model,
+                    train_dataloaders=train_loader,
+                    val_dataloaders=val_loader,
+                )
+            if oom_handler.oom_triggered:
+                raise OOMException(
+                    "OOM detected during LR Find. Try reducing your batch_size or the"
+                    " model parameters." + "\n" + "Original Error: " + oom_handler.oom_msg
+                )
             if self.verbose:
                 logger.info(
                     f"Suggested LR: {result.suggestion()}. For plot and detailed"
                     " analysis, use `find_learning_rate` method."
                 )
+            self.model.reset_weights()
             # Parameters in models need to be initialized again after LR find
             self.model.data_aware_initialization(self.datamodule)
         self.model.train()
         if self.verbose:
             logger.info("Training Started")
-        self.trainer.fit(self.model, train_loader, val_loader)
+        with OutOfMemoryHandler(handle_oom=handle_oom) as oom_handler:
+            self.trainer.fit(self.model, train_loader, val_loader)
+        if oom_handler.oom_triggered:
+            raise OOMException(
+                "OOM detected during Training. Try reducing your batch_size or the"
+                " model parameters." + "\n" + "Original Error: " + oom_handler.oom_msg
+            )
         self._is_fitted = True
         if self.verbose:
             logger.info("Training the model completed")
@@ -637,6 +664,7 @@ def fit(
         callbacks: Optional[List[pl.Callback]] = None,
         datamodule: Optional[TabularDatamodule] = None,
         cache_data: str = "memory",
+        handle_oom: bool = True,
     ) -> pl.Trainer:
         """The fit method which takes in the data and triggers the training.
@@ -690,6 +718,8 @@ def fit(
             cache_data (str): Decides how to cache the data in the dataloader. If set to
                 "memory", will cache in memory. If set to a valid path, will cache in that path. Defaults to "memory".

+            handle_oom (bool): If True, will try to handle OOM errors elegantly. Defaults to True.
+
         Returns:
             pl.Trainer: The PyTorch Lightning Trainer instance
         """
@@ -728,7 +758,7 @@ def fit(
             optimizer_params or {},
         )

-        return self.train(model, datamodule, callbacks, max_epochs, min_epochs)
+        return self.train(model, datamodule, callbacks, max_epochs, min_epochs, handle_oom)

     def pretrain(
         self,
@@ -1229,7 +1259,7 @@ def predict(

             progress_bar = partial(tqdm, description="Generating Predictions...")
         else:
-            progress_bar = lambda it: it
+            progress_bar = lambda it: it  # noqa E731
         for batch in progress_bar(inference_dataloader):
             for k, v in batch.items():
                 if isinstance(v, list) and (len(v) == 0):
@@ -1293,8 +1323,9 @@ def predict(
                 np.argmax(point_predictions, axis=1)
             )
             warnings.warn(
-                "Classification prediction column will be renamed to `{target_col}_prediction` "
-                "in the next release to maintain consistency with regression.",
+                "Classification prediction column will be renamed to"
+                " `{target_col}_prediction` in the next release to maintain"
+                " consistency with regression.",
                 DeprecationWarning,
             )
         if ret_logits:
@@ -1710,6 +1741,7 @@ def cross_validate(
         groups: Optional[Union[str, np.ndarray]] = None,
         verbose: bool = True,
         reset_datamodule: bool = True,
+        handle_oom: bool = True,
         **kwargs,
     ):
         """Cross validate the model.
@@ -1753,6 +1785,7 @@ def cross_validate(
                 If False, we take an approximation that once the transformations are fit on the first
                 fold, they will be valid for all the other folds. Defaults to True.

+            handle_oom (bool, optional): If True, will handle out of memory errors elegantly
             **kwargs: Additional keyword arguments to be passed to the `fit` method of the model.

         Returns:
@@ -1789,7 +1822,8 @@ def cross_validate(
             datamodule.validation, _ = datamodule.preprocess_data(val_fold, stage="inference")

             # Train the model
-            self.train(model, datamodule, **train_kwargs)
+            handle_oom = train_kwargs.pop("handle_oom", handle_oom)
+            self.train(model, datamodule, handle_oom=handle_oom, **train_kwargs)
             if return_oof or is_callable_metric:
                 preds = self.predict(val_fold, include_input_features=False)
                 oof_preds.append(preds)
@@ -1864,6 +1898,7 @@ def bagging_predict(
         return_raw_predictions: bool = False,
         aggregate: Union[str, Callable] = "mean",
         weights: Optional[List[float]] = None,
+        handle_oom: bool = True,
         **kwargs,
     ):
         """Bagging predict on the test data.
@@ -1912,6 +1947,8 @@ def bagging_predict(
                 from each fold. If None, will use equal weights. This is only used when `aggregate` is "mean".
                 Defaults to None.

+            handle_oom (bool, optional): If True, will handle out of memory errors elegantly
+
             **kwargs: Additional keyword arguments to be passed to the `fit` method of the model.

         Returns:
@@ -1953,7 +1990,8 @@ def bagging_predict(
             datamodule.validation, _ = datamodule.preprocess_data(val_fold, stage="inference")

             # Train the model
-            self.train(model, datamodule, **train_kwargs)
+            handle_oom = train_kwargs.pop("handle_oom", handle_oom)
+            self.train(model, datamodule, handle_oom=handle_oom, **train_kwargs)
             fold_preds = self.predict(test, include_input_features=False)
             pred_idx = fold_preds.index
             if self.config.task == "classification":
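
With this change, both LR Find and the actual training run inside an OutOfMemoryHandler context, and a CUDA OOM surfaces as a single OOMException instead of a raw crash. A minimal usage sketch of the new `handle_oom` flag (the config objects and dataframes are placeholders assumed to be defined elsewhere, not part of this commit):

    from pytorch_tabular import TabularModel
    from pytorch_tabular.utils import OOMException

    tabular_model = TabularModel(
        data_config=data_config,        # placeholder configs, defined elsewhere
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    try:
        # handle_oom=True is the default; it converts CUDA OOM failures
        # raised inside fit/train into an OOMException
        tabular_model.fit(train=train_df, validation=val_df, handle_oom=True)
    except OOMException:
        # e.g. halve the batch size and retry instead of losing the process
        retry_with_smaller_batch()      # hypothetical recovery hook

The same flag is forwarded by `cross_validate` and `bagging_predict` via `train_kwargs.pop("handle_oom", handle_oom)`, so a per-call override passed through `**kwargs` wins over the method argument.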

src/pytorch_tabular/tabular_model_tuner.py

Lines changed: 58 additions & 42 deletions
@@ -4,7 +4,6 @@
 """Tabular Model."""
 import warnings
 from collections import namedtuple
-from contextlib import nullcontext
 from copy import deepcopy
 from pathlib import Path
 from typing import Callable, Dict, Iterable, Optional, Union
@@ -13,7 +12,7 @@
 import pandas as pd
 from omegaconf.dictconfig import DictConfig
 from pandas import DataFrame
-from rich.progress import Progress
+from rich.progress import track
 from sklearn.model_selection import BaseCrossValidator, ParameterGrid, ParameterSampler

 from pytorch_tabular.config import (
@@ -23,7 +22,7 @@
     TrainerConfig,
 )
 from pytorch_tabular.tabular_model import TabularModel
-from pytorch_tabular.utils import get_logger
+from pytorch_tabular.utils import OOMException, OutOfMemoryHandler, get_logger

 logger = get_logger(__name__)
@@ -146,6 +145,7 @@ def tune(
         verbose: bool = False,
         progress_bar: bool = True,
         random_state: Optional[int] = 42,
+        ignore_oom: bool = True,
         **kwargs,
     ):
         """Tune the hyperparameters of the TabularModel.
@@ -194,6 +194,8 @@ def tune(
             random_state (Optional[int], optional): Random state to be used for random search. Defaults to 42.

+            ignore_oom (bool, optional): Whether to ignore out of memory errors. Defaults to True.
+
             **kwargs: Additional keyword arguments to be passed to the TabularModel fit.

         Returns:
@@ -230,9 +232,7 @@ def tune(
         else:
             raise NotImplementedError(f"{strategy} is not implemented yet.")
         if progress_bar:
-            ctx_mgr = Progress()
-        else:
-            ctx_mgr = nullcontext()
+            iterator = track(iterator, description=f"[green]{strategy.replace('_',' ').title()}...", total=n_trials)
         verbose_tabular_model = self.tabular_model_init_kwargs.pop("verbose", False)
         temp_tabular_model = TabularModel(
             data_config=self.data_config,
@@ -253,58 +253,74 @@ def tune(
             is_callable_metric = True
         del temp_tabular_model
         trials = []
+        for i, params in enumerate(iterator):
+            # Copying the configs as a base
+            # Make sure all default parameters that you want to be set for all
+            # trials are in the original configs
+            trainer_config_t = deepcopy(self.trainer_config)
+            optimizer_config_t = deepcopy(self.optimizer_config)
+            model_config_t = deepcopy(self.model_config)

-        with ctx_mgr as progress:
-            if progress:
-                task = progress.add_task(f"[green]{strategy.replace('_',' ').title()}...", total=n_trials)
-            for i, params in enumerate(iterator):
-                # Copying the configs as a base
-                # Make sure all default parameters that you want to be set for all
-                # trials are in the original configs
-                trainer_config_t = deepcopy(self.trainer_config)
-                optimizer_config_t = deepcopy(self.optimizer_config)
-                model_config_t = deepcopy(self.model_config)
-
-                trainer_config_t, optimizer_config_t, model_config_t = self._update_configs(
-                    trainer_config_t, optimizer_config_t, model_config_t, params
-                )
-                # Initialize Tabular model using the new config
-                tabular_model_t = TabularModel(
-                    data_config=self.data_config,
-                    model_config=model_config_t,
-                    optimizer_config=optimizer_config_t,
-                    trainer_config=trainer_config_t,
-                    verbose=verbose_tabular_model,
-                    **self.tabular_model_init_kwargs,
-                )
-                if cv is not None:
-                    cv_verbose = cv_kwargs.pop("verbose", False)
+            trainer_config_t, optimizer_config_t, model_config_t = self._update_configs(
+                trainer_config_t, optimizer_config_t, model_config_t, params
+            )
+            # Initialize Tabular model using the new config
+            tabular_model_t = TabularModel(
+                data_config=self.data_config,
+                model_config=model_config_t,
+                optimizer_config=optimizer_config_t,
+                trainer_config=trainer_config_t,
+                verbose=verbose_tabular_model,
+                **self.tabular_model_init_kwargs,
+            )
+            if cv is not None:
+                cv_verbose = cv_kwargs.pop("verbose", False)
+                cv_kwargs.pop("handle_oom", None)
+                with OutOfMemoryHandler(handle_oom=True) as handler:
                     cv_scores, _ = tabular_model_t.cross_validate(
                         cv=cv,
                         train=train,
                         metric=metric,
                         verbose=cv_verbose,
+                        handle_oom=False,
                         **cv_kwargs,
                     )
+                if handler.oom_triggered:
+                    if not ignore_oom:
+                        raise OOMException(
+                            "Out of memory error occurred during cross validation. "
+                            "Set ignore_oom=True to ignore this error."
+                        )
+                    else:
+                        params.update({metric.__name__ if is_callable_metric else metric: "OOM"})
+                else:
                     params.update({metric.__name__ if is_callable_metric else metric: cv_agg_func(cv_scores)})
+            else:
+                model = tabular_model_t.prepare_model(
+                    datamodule=datamodule,
+                    **prep_model_kwargs,
+                )
+                train_kwargs.pop("handle_oom", None)
+                with OutOfMemoryHandler(handle_oom=True) as handler:
+                    tabular_model_t.train(model=model, datamodule=datamodule, handle_oom=False, **train_kwargs)
+                if handler.oom_triggered:
+                    if not ignore_oom:
+                        raise OOMException(
+                            "Out of memory error occurred during training. Set ignore_oom=True to ignore this error."
+                        )
+                    else:
+                        params.update({metric.__name__ if is_callable_metric else metric: "OOM"})
                 else:
-                    model = tabular_model_t.prepare_model(
-                        datamodule=datamodule,
-                        **prep_model_kwargs,
-                    )
-                    tabular_model_t.train(model=model, datamodule=datamodule, **train_kwargs)
                     if is_callable_metric:
                         preds = tabular_model_t.predict(validation, include_input_features=False)
                         params.update({metric.__name__: metric(validation[tabular_model_t.config.target], preds)})
                     else:
                         result = tabular_model_t.evaluate(validation, verbose=False)
                         params.update({k.replace("test_", ""): v for k, v in result[0].items()})
-                params.update({"trial_id": i})
-                trials.append(params)
-                if verbose:
-                    logger.info(f"Trial {i+1}/{n_trials}: {params} | Score: {params[metric]}")
-                if progress:
-                    progress.update(task, advance=1)
+            params.update({"trial_id": i})
+            trials.append(params)
+            if verbose:
+                logger.info(f"Trial {i+1}/{n_trials}: {params} | Score: {params[metric]}")
         trials_df = pd.DataFrame(trials)
         trials = trials_df.pop("trial_id")
         if mode == "max":
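
For the tuner, the point of `ignore_oom` is that one oversized configuration no longer kills an entire sweep: the failed trial's metric is recorded as the string "OOM" and the loop moves on, while `ignore_oom=False` re-raises as OOMException. A hedged sketch of a call exercising this (the search-space key and configs are illustrative placeholders, and the returned result is assumed to expose the trials dataframe):

    from pytorch_tabular.tabular_model_tuner import TabularModelTuner

    tuner = TabularModelTuner(
        data_config=data_config,        # placeholder configs, defined elsewhere
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
    )
    result = tuner.tune(
        train=train_df,
        validation=val_df,
        search_space={"trainer_config__batch_size": [256, 65536]},  # second value may OOM
        strategy="grid_search",
        metric="accuracy",
        mode="max",
        ignore_oom=True,                # OOM trials are scored as "OOM", not raised
    )
    # trials that ran out of memory show the literal string "OOM" in the metric column
    print(result.trials_df)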

src/pytorch_tabular/utils/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,8 @@
 from .data_utils import get_balanced_sampler, get_class_weighted_cross_entropy, get_gaussian_centers
 from .logger import get_logger
 from .nn_utils import (
+    OOMException,
+    OutOfMemoryHandler,
     _initialize_kaiming,
     _initialize_layers,
     _linear_dropout_bn,
@@ -26,4 +28,6 @@
     "to_one_hot",
     "_initialize_kaiming",
     "check_numpy",
+    "OutOfMemoryHandler",
+    "OOMException",
 ]
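
`OOMException` and `OutOfMemoryHandler` themselves are defined in `nn_utils` (the fourth changed file; its hunk is not shown in this excerpt). As a rough illustration of the pattern the callers above rely on, and of what the "fix for pytorch <1.13" bullet likely addresses (`torch.cuda.OutOfMemoryError` only exists from PyTorch 1.13, so older versions have to match the RuntimeError message), a sketch of such a context manager might look like this; it is an assumption, not the committed implementation:

    import torch

    class OOMException(Exception):
        """Raised by callers when a wrapped step ran out of memory."""

    class OutOfMemoryHandler:
        def __init__(self, handle_oom: bool = True):
            self.handle_oom = handle_oom
            self.oom_triggered = False
            self.oom_msg = None

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            if exc_type is None or not self.handle_oom:
                return False  # no error, or propagate it untouched
            # torch.cuda.OutOfMemoryError was added in PyTorch 1.13;
            # on older versions a CUDA OOM surfaces as a plain RuntimeError
            oom_error = getattr(torch.cuda, "OutOfMemoryError", None)
            is_oom = (oom_error is not None and issubclass(exc_type, oom_error)) or (
                issubclass(exc_type, RuntimeError) and "out of memory" in str(exc_value)
            )
            if not is_oom:
                return False
            self.oom_triggered = True
            self.oom_msg = str(exc_value)
            torch.cuda.empty_cache()  # release cached blocks before the caller retries
            return True  # swallow the error; callers check oom_triggered instead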
