Skip to content

Commit 0ea3490

Browse files
committed
save classification labels to checkpoints
1 parent 811fbf1 commit 0ea3490

File tree

4 files changed

+46
-25
lines changed

4 files changed

+46
-25
lines changed

chebai/cli.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ def call_data_methods(data: Type[XYBaseDataModule]):
5959
apply_on="instantiate",
6060
)
6161

62+
parser.link_arguments(
63+
"data.classes_txt_file_path",
64+
"model.init_args.classes_txt_file_path",
65+
apply_on="instantiate",
66+
)
67+
6268
for kind in ("train", "val", "test"):
6369
for average in (
6470
"micro-f1",

chebai/models/base.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def __init__(
4040
pass_loss_kwargs: bool = True,
4141
optimizer_kwargs: Optional[Dict[str, Any]] = None,
4242
exclude_hyperparameter_logging: Optional[Iterable[str]] = None,
43+
classes_txt_file_path: Optional[str] = None,
4344
**kwargs,
4445
):
4546
super().__init__(**kwargs)
@@ -77,6 +78,17 @@ def __init__(
7778
self.validation_metrics = val_metrics
7879
self.test_metrics = test_metrics
7980
self.pass_loss_kwargs = pass_loss_kwargs
81+
with open(classes_txt_file_path, "r") as f:
82+
self.labels_list = [cls.strip() for cls in f.readlines()]
83+
assert len(self.labels_list) > 0, "Class labels list is empty."
84+
assert len(self.labels_list) == out_dim, (
85+
f"Number of class labels ({len(self.labels_list)}) does not match "
86+
f"the model output dimension ({out_dim})."
87+
)
88+
89+
def on_save_checkpoint(self, checkpoint):
90+
# https://lightning.ai/docs/pytorch/stable/common/checkpointing_intermediate.html#modify-a-checkpoint-anywhere
91+
checkpoint["classification_labels"] = self.labels_list
8092

8193
def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
8294
# avoid errors due to unexpected keys (e.g., if loading checkpoint from a bce model and using it with a
@@ -100,7 +112,7 @@ def __init_subclass__(cls, **kwargs):
100112

101113
def _get_prediction_and_labels(
102114
self, data: Dict[str, Any], labels: torch.Tensor, output: torch.Tensor
103-
) -> (torch.Tensor, torch.Tensor):
115+
) -> tuple[torch.Tensor, torch.Tensor]:
104116
"""
105117
Gets the predictions and labels from the model output.
106118
@@ -151,7 +163,7 @@ def _process_for_loss(
151163
model_output: torch.Tensor,
152164
labels: torch.Tensor,
153165
loss_kwargs: Dict[str, Any],
154-
) -> (torch.Tensor, torch.Tensor, Dict[str, Any]):
166+
) -> tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
155167
"""
156168
Processes the data for loss computation.
157169

chebai/preprocessing/datasets/base.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ def __init__(
9696
self.prediction_kind = prediction_kind
9797
self.data_limit = data_limit
9898
self.label_filter = label_filter
99-
assert (balance_after_filter is not None) or (
100-
self.label_filter is None
101-
), "Filter balancing requires a filter"
99+
assert (balance_after_filter is not None) or (self.label_filter is None), (
100+
"Filter balancing requires a filter"
101+
)
102102
self.balance_after_filter = balance_after_filter
103103
self.num_workers = num_workers
104104
self.persistent_workers: bool = bool(persistent_workers)
@@ -108,13 +108,13 @@ def __init__(
108108
self.use_inner_cross_validation = (
109109
inner_k_folds > 1
110110
) # only use cv if there are at least 2 folds
111-
assert (
112-
fold_index is None or self.use_inner_cross_validation is not None
113-
), "fold_index can only be set if cross validation is used"
111+
assert fold_index is None or self.use_inner_cross_validation is not None, (
112+
"fold_index can only be set if cross validation is used"
113+
)
114114
if fold_index is not None and self.inner_k_folds is not None:
115-
assert (
116-
fold_index < self.inner_k_folds
117-
), "fold_index can't be larger than the total number of folds"
115+
assert fold_index < self.inner_k_folds, (
116+
"fold_index can't be larger than the total number of folds"
117+
)
118118
self.fold_index = fold_index
119119
self._base_dir = base_dir
120120
self.n_token_limit = n_token_limit
@@ -137,9 +137,9 @@ def num_of_labels(self):
137137

138138
@property
139139
def feature_vector_size(self):
140-
assert (
141-
self._feature_vector_size is not None
142-
), "size of feature vector must be set"
140+
assert self._feature_vector_size is not None, (
141+
"size of feature vector must be set"
142+
)
143143
return self._feature_vector_size
144144

145145
@property
@@ -619,6 +619,19 @@ def raw_file_names_dict(self) -> dict:
619619
"""
620620
raise NotImplementedError
621621

622+
@property
623+
def classes_txt_file_path(self) -> str:
624+
"""
625+
Returns the filename for the classes text file.
626+
627+
Returns:
628+
str: The filename for the classes text file.
629+
"""
630+
# This property is also used in the following places:
631+
# - results/prediction.py: to load class names for CSV column names
632+
# - chebai/cli.py: to link this property to `model.init_args.classes_txt_file_path`
633+
return os.path.join(self.processed_dir_main, "classes.txt")
634+
622635

623636
class MergedDataset(XYBaseDataModule):
624637
MERGED = []
@@ -1373,14 +1386,3 @@ def processed_file_names_dict(self) -> dict:
13731386
if self.n_token_limit is not None:
13741387
return {"data": f"data_maxlen{self.n_token_limit}.pt"}
13751388
return {"data": "data.pt"}
1376-
1377-
@property
1378-
def classes_txt_file_path(self) -> str:
1379-
"""
1380-
Returns the filename for the classes text file.
1381-
1382-
Returns:
1383-
str: The filename for the classes text file.
1384-
"""
1385-
# This property is also used in the custom trainer `chebai/trainer/CustomTrainer.py`
1386-
return os.path.join(self.processed_dir_main, "classes.txt")

chebai/result/prediction.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ def _add_class_columns(class_file_path: _PATH) -> list[str]:
126126
predictions_df = pd.DataFrame(rows, columns=CLASS_LABELS, index=smiles_strings)
127127

128128
predictions_df.to_csv(save_to)
129+
print(f"Predictions saved to: {save_to}")
129130

130131
@torch.inference_mode()
131132
def predict_smiles(

0 commit comments

Comments (0)