Commit 64e93ab

Merge branch 'dev' into feature/chebi2.0-adaption
2 parents: e713104 + 5243e02

File tree

19 files changed: +764 / -955 lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@ chebai.egg-info
 lightning_logs
 logs
 .isort.cfg
-/.vscode
+/.vscode/launch.json
 
 *.out
 *.err

.vscode/extensions.json

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+{
+  "recommendations": [
+    "ms-python.python",
+    "ms-python.vscode-pylance",
+    "charliermarsh.ruff",
+    "usernamehw.errorlens"
+  ],
+  "unwantedRecommendations": [
+    "ms-python.vscode-python2"
+  ]
+}

.vscode/settings.json

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+{
+  "python.testing.unittestArgs": [
+    "-v",
+    "-s",
+    "./tests",
+    "-p",
+    "test*.py"
+  ],
+  "python.testing.pytestEnabled": false,
+  "python.testing.unittestEnabled": true,
+  "python.analysis.typeCheckingMode": "basic",
+  "editor.formatOnSave": true,
+  "[python]": {
+    "editor.defaultFormatter": "charliermarsh.ruff"
+  }
+}

LICENSE

Lines changed: 21 additions & 661 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 11 additions & 6 deletions
@@ -63,7 +63,7 @@ python -m chebai fit --trainer=configs/training/default_trainer.yml --model=conf
 ```
 A command with additional options may look like this:
 ```
-python3 -m chebai fit --trainer=configs/training/default_trainer.yml --model=configs/model/electra.yml --model.train_metrics=configs/metrics/micro-macro-f1.yml --model.test_metrics=configs/metrics/micro-macro-f1.yml --model.val_metrics=configs/metrics/micro-macro-f1.yml --model.pretrained_checkpoint=electra_pretrained.ckpt --model.load_prefix=generator. --data=configs/data/chebi/chebi50.yml --model.criterion=configs/loss/bce.yml --data.init_args.batch_size=10 --trainer.logger.init_args.name=chebi50_bce_unweighted --data.init_args.num_workers=9 --model.pass_loss_kwargs=false --data.init_args.chebi_version=231 --data.init_args.data_limit=1000
+python3 -m chebai fit --trainer=configs/training/default_trainer.yml --model=configs/model/electra.yml --model.train_metrics=configs/metrics/micro-macro-f1.yml --model.test_metrics=configs/metrics/micro-macro-f1.yml --model.val_metrics=configs/metrics/micro-macro-f1.yml --model.pretrained_checkpoint=electra_pretrained.ckpt --model.load_prefix=generator. --data=configs/data/chebi/chebi50.yml --model.criterion=configs/loss/bce_weighted.yml --data.init_args.batch_size=10 --trainer.logger.init_args.name=chebi50_bce_weighted --data.init_args.num_workers=9 --model.pass_loss_kwargs=false --data.init_args.chebi_version=231 --data.init_args.data_limit=1000
 ```
 
 ### Fine-tuning for classification tasks, e.g. Toxicity prediction
@@ -78,11 +78,16 @@ python -m chebai fit --config=[path-to-your-esol-config] --trainer.callbacks=con
 
 ### Predicting classes given SMILES strings
 ```
-python3 -m chebai predict_from_file --model=[path-to-model-config] --checkpoint_path=[path-to-model] --input_path={path-to-file-containing-smiles] [--classes_path=[path-to-classes-file]] [--save_to=[path-to-output]]
+python3 chebai/result/prediction.py predict_from_file --checkpoint_path=[path-to-model] --smiles_file_path=[path-to-file-containing-smiles] [--save_to=[path-to-output]]
 ```
-The input files should contain a list of line-separated SMILES strings. This generates a CSV file that contains the
-one row for each SMILES string and one column for each class.
-The `classes_path` is the path to the dataset's `raw/classes.txt` file that contains the relationship between model output and ChEBI-IDs.
+
+* **`--checkpoint_path`**: Path to the Lightning checkpoint file (must end with `.ckpt`).
+
+* **`--smiles_file_path`**: Path to a text file containing one SMILES string per line.
+
+* **`--save_to`** *(optional)*: Path of the output CSV file, which contains one row per SMILES string and one column per predicted class. Defaults to `predictions.csv` in the current working directory.
+
+> **Note**: Only checkpoints created after PR #148 can be used with this prediction pipeline; these newer checkpoints store the list of ChEBI classes (classification labels) used during training, which the pipeline requires.
 
 ## Evaluation
 
@@ -96,7 +101,7 @@ An example notebook is provided at `tutorials/eval_model_basic.ipynb`.
 Alternatively, you can evaluate the model via the CLI:
 
 ```bash
-python -m chebai test --trainer=configs/training/default_trainer.yml --trainer.devices=1 --trainer.num_nodes=1 --ckpt_path=[path-to-finetuned-model] --model=configs/model/electra.yml --model.test_metrics=configs/metrics/micro-macro-f1.yml --data=configs/data/chebi/chebi50.yml --data.init_args.batch_size=32 --data.init_args.num_workers=10 --data.init_args.chebi_version=[chebi-version] --model.pass_loss_kwargs=false --model.criterion=configs/loss/bce.yml --model.criterion.init_args.beta=0.99 --data.init_args.splits_file_path=[path-to-splits-file]
+python -m chebai test --trainer=configs/training/default_trainer.yml --trainer.devices=1 --trainer.num_nodes=1 --ckpt_path=[path-to-finetuned-model] --model=configs/model/electra.yml --model.test_metrics=configs/metrics/micro-macro-f1.yml --data=configs/data/chebi/chebi50.yml --data.init_args.batch_size=32 --data.init_args.num_workers=10 --data.init_args.chebi_version=[chebi-version] --model.pass_loss_kwargs=false --model.criterion=configs/loss/bce_weighted.yml --model.criterion.init_args.beta=0.99 --data.init_args.splits_file_path=[path-to-splits-file]
 ```
 
 > **Note**: It is recommended to use `devices=1` and `num_nodes=1` during testing; multi-device settings use a `DistributedSampler`, which may replicate some samples to maintain equal batch sizes, so using a single device ensures that each sample or batch is evaluated exactly once.
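The updated README specifies a plain-text input with one SMILES string per line and a CSV output with one row per SMILES string and one column per predicted class. A minimal sketch of that input/output shape (the file names `smiles.txt` and `predictions.csv` and the placeholder column names are illustrative; the real class columns come from the labels stored in the checkpoint):

```python
import csv
from pathlib import Path

# Input format required by predict_from_file: one SMILES string per line.
smiles = ["CCO", "c1ccccc1", "CC(=O)O"]  # ethanol, benzene, acetic acid
Path("smiles.txt").write_text("\n".join(smiles) + "\n")

# The prediction script writes a CSV with one row per SMILES string and one
# column per predicted class; here we build a stand-in file with placeholder
# class columns just to illustrate that shape.
with open("predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["smiles", "class_0", "class_1"])  # hypothetical columns
    for s in smiles:
        writer.writerow([s, 0.0, 0.0])

with open("predictions.csv", newline="") as f:
    rows = list(csv.reader(f))
# rows now holds the header plus one row per input SMILES string.
```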

chebai/cli.py

Lines changed: 6 additions & 2 deletions
@@ -59,6 +59,12 @@ def call_data_methods(data: Type[XYBaseDataModule]):
         apply_on="instantiate",
     )
 
+    parser.link_arguments(
+        "data.classes_txt_file_path",
+        "model.init_args.classes_txt_file_path",
+        apply_on="instantiate",
+    )
+
     for kind in ("train", "val", "test"):
         for average in (
             "micro-f1",
@@ -111,8 +117,6 @@ def subcommands() -> Dict[str, Set[str]]:
         "fit": {"model", "train_dataloaders", "val_dataloaders", "datamodule"},
         "validate": {"model", "dataloaders", "datamodule"},
         "test": {"model", "dataloaders", "datamodule"},
-        "predict": {"model", "dataloaders", "datamodule"},
-        "predict_from_file": {"model"},
     }
 
 
chebai/models/base.py

Lines changed: 24 additions & 5 deletions
@@ -40,15 +40,16 @@ def __init__(
         pass_loss_kwargs: bool = True,
         optimizer_kwargs: Optional[Dict[str, Any]] = None,
         exclude_hyperparameter_logging: Optional[Iterable[str]] = None,
+        classes_txt_file_path: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
         # super().__init__()
         if exclude_hyperparameter_logging is None:
             exclude_hyperparameter_logging = tuple()
         self.criterion = criterion
-        assert out_dim is not None, "out_dim must be specified"
-        assert input_dim is not None, "input_dim must be specified"
+        assert out_dim is not None and out_dim > 0, "out_dim must be specified"
+        assert input_dim is not None and input_dim > 0, "input_dim must be specified"
         self.out_dim = out_dim
         self.input_dim = input_dim
         print(
@@ -62,6 +63,7 @@ def __init__(
                 "train_metrics",
                 "val_metrics",
                 "test_metrics",
+                "classes_txt_file_path",
                 *exclude_hyperparameter_logging,
             ]
         )
@@ -78,6 +80,23 @@ def __init__(
         self.test_metrics = test_metrics
         self.pass_loss_kwargs = pass_loss_kwargs
 
+        self.classes_txt_file_path = classes_txt_file_path
+
+        # During prediction `classes_txt_file_path` is set to None
+        if classes_txt_file_path is not None:
+            with open(classes_txt_file_path, "r") as f:
+                self.labels_list = [cls.strip() for cls in f.readlines()]
+            assert len(self.labels_list) > 0, "Class labels list is empty."
+            assert len(self.labels_list) == out_dim, (
+                f"Number of class labels ({len(self.labels_list)}) does not match "
+                f"the model output dimension ({out_dim})."
+            )
+
+    def on_save_checkpoint(self, checkpoint):
+        if self.classes_txt_file_path is not None:
+            # https://lightning.ai/docs/pytorch/stable/common/checkpointing_intermediate.html#modify-a-checkpoint-anywhere
+            checkpoint["classification_labels"] = self.labels_list
+
     def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
         # avoid errors due to unexpected keys (e.g., if loading checkpoint from a bce model and using it with a
         # different loss)
@@ -100,7 +119,7 @@ def __init_subclass__(cls, **kwargs):
 
     def _get_prediction_and_labels(
         self, data: Dict[str, Any], labels: torch.Tensor, output: torch.Tensor
-    ) -> (torch.Tensor, torch.Tensor):
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Gets the predictions and labels from the model output.
 
@@ -151,7 +170,7 @@ def _process_for_loss(
         model_output: torch.Tensor,
         labels: torch.Tensor,
         loss_kwargs: Dict[str, Any],
-    ) -> (torch.Tensor, torch.Tensor, Dict[str, Any]):
+    ) -> tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]:
         """
         Processes the data for loss computation.
 
@@ -237,7 +256,7 @@ def predict_step(
         Returns:
             Dict[str, Union[torch.Tensor, Any]]: The result of the prediction step.
         """
-        return self._execute(batch, batch_idx, self.test_metrics, prefix="", log=False)
+        return self._execute(batch, batch_idx, log=False)
 
     def _execute(
         self,
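The `on_save_checkpoint` change above follows Lightning's documented pattern of stashing extra objects in the checkpoint dictionary before it is written to disk. A dependency-free sketch of that pattern (a plain class stands in for `LightningModule`, and the class name `LabelCheckpointMixin` is hypothetical; the attribute and key names mirror the diff):

```python
import os
import tempfile
from typing import Any, Dict, List, Optional


class LabelCheckpointMixin:
    """Mimics the diff: read class labels at init, stash them in the checkpoint."""

    def __init__(self, classes_txt_file_path: Optional[str], out_dim: int):
        self.classes_txt_file_path = classes_txt_file_path
        self.labels_list: List[str] = []
        if classes_txt_file_path is not None:
            with open(classes_txt_file_path, "r") as f:
                self.labels_list = [line.strip() for line in f if line.strip()]
            assert len(self.labels_list) == out_dim, "labels must match out_dim"

    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        # Lightning calls this hook right before serializing the .ckpt file;
        # any extra keys added here are persisted alongside the state dict.
        if self.classes_txt_file_path is not None:
            checkpoint["classification_labels"] = self.labels_list


# Usage sketch: write a classes file, build the model, simulate a save.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("CHEBI:33304\nCHEBI:25367\n")
    path = tmp.name

model = LabelCheckpointMixin(path, out_dim=2)
checkpoint: Dict[str, Any] = {"state_dict": {}}
model.on_save_checkpoint(checkpoint)
os.unlink(path)
```

This is why the README note says only newer checkpoints work with the prediction pipeline: the labels are only present if they were injected at save time.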

chebai/models/electra.py

Lines changed: 1 addition & 0 deletions
@@ -203,6 +203,7 @@ def _process_batch(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any
             )
             * CLS_TOKEN
         )
+        model_kwargs["output_attentions"] = True
         return dict(
             features=torch.cat((cls_tokens, batch.x), dim=1),
             labels=batch.y,

chebai/preprocessing/bin/smiles_token/tokens.txt

Lines changed: 2 additions & 178 deletions
@@ -4373,181 +4373,5 @@ b
 [CaH2]
 [NH3]
 [OH2]
-[1*]
-[2*]
-[3*]
-[4*]
-[5*]
-[6*]
-[7*]
-[8*]
-[9*]
-[3He+]
-[12C+4]
-[16O+6]
-[11B-3]
-[11B+3]
-[31P+3]
-[31P+5]
-[34S+2]
-[34S+4]
-[34S+6]
-[55Mn+2]
-[55Mn+4]
-[55Mn+7]
-[57Fe+3]
-[59Co+2]
-[75As-3]
-[98Mo+3]
-[98Mo+6]
-[Cl:1]
-[c:2]
-[n:3]
-[c:4]
-[cH:5]
-[cH:6]
-[cH:7]
-[cH:8]
-[c:9]
-[cH:10]
-[c:11]
-[CH2:12]
-[O:13]
-[c:14]
-[cH:15]
-[cH:16]
-[cH:17]
-[c:18]
-[cH:19]
-[cH:20]
-[cH:21]
-[n:22]
-[c:23]
-[CH3:1]
-[C:2]
-[O:3]
-[O:4]
-[c:5]
-[cH:9]
-[c:10]
-[C:11]
-[O:12]
-[CH2:14]
-[C:15]
-[O:16]
-[NH:17]
-[CH2:18]
-[CH:19]
-[CH2:20]
-[N:21]
-[c:25]
-[cH:26]
-[c:27]
-[F:37]
-[c:28]
-[N:31]
-[CH2:32]
-[CH2:33]
-[O:34]
-[CH2:35]
-[CH2:36]
-[cH:29]
-[cH:30]
-[C:22]
-[O:23]
-[O:24]
-[NaH2-]
-[KH2-]
-[LiH2-]
-[BH2-3]
-[BeH3-]
-[RbH2-]
-[FrH2-]
-[AlH-2]
-[CsH2-]
-[P@TB5]
-[Ru@OH14]
-[Ru@OH15+2]
-[Ru@OH16]
-[Ru@OH23+2]
-[Ru@OH4]
-[*:0]
-[1*:0]
-[2*:0]
-[3*:0]
-[224RaH2]
-[226RaH2]
-[228RaH2]
-[H:24]
-[c:6]
-[H:25]
-[c:7]
-[H:26]
-[c:8]
-[H:27]
-[H:28]
-[C:12]
-[c:15]
-[H:31]
-[c:16]
-[H:32]
-[c:17]
-[H:33]
-[c:19]
-[H:34]
-[c:20]
-[H:35]
-[c:21]
-[H:36]
-[H:29]
-[H:30]
-[C:1]
-[H:41]
-[H:42]
-[H:43]
-[H:44]
-[C:14]
-[N:17]
-[C:18]
-[C:19]
-[H:50]
-[C:20]
-[H:51]
-[H:52]
-[c:26]
-[H:53]
-[C:32]
-[H:56]
-[H:57]
-[C:33]
-[H:58]
-[H:59]
-[C:35]
-[H:60]
-[H:61]
-[C:36]
-[H:62]
-[H:63]
-[c:29]
-[H:54]
-[c:30]
-[H:55]
-[H:48]
-[H:49]
-[H:47]
-[H:45]
-[H:46]
-[H:38]
-[H:39]
-[H:40]
-[C-2]
-[As+2]
-[P+2]
-[O+2]
-[BeH2-]
-[W@]
-[W@@]
-[B-2]
-[V@]
-[V@@]
-[V@OH]
+[TlH2+]
+[SbH6+3]
