Changes from all commits
50 commits
8c1e05a
update gitlab
soulios Sep 7, 2023
05f5055
added build
soulios Sep 7, 2023
79354fc
updated environment, adam from legacy
soulios Sep 8, 2023
b72ee69
updated readme with mamba a link to the published article at BiB
soulios Sep 8, 2023
fec4b59
removed gnn_type and edited readme
soulios Aug 31, 2023
226ee9d
edited readme
soulios Aug 31, 2023
59dce16
edited mypy errors and added tests for GNN
soulios Aug 31, 2023
bdbc354
adding umap as requirement
soulios Aug 31, 2023
0c715d9
trying to isort
soulios Aug 31, 2023
c6c9295
where is the path defined?
soulios Sep 1, 2023
7546f2d
where is the path defined?
soulios Sep 1, 2023
598c10e
test_fractional_sampling path
soulios Sep 1, 2023
d678ab7
clueless
soulios Sep 1, 2023
c6369de
test_fractional_sampling path
soulios Sep 1, 2023
b4cb81d
json path
soulios Sep 1, 2023
e420304
json path
soulios Sep 1, 2023
cdfff08
json path
soulios Sep 1, 2023
d780197
fixing paths?
soulios Sep 1, 2023
1aea913
tests locally run
soulios Sep 1, 2023
ad50a2a
changing output dir to match the pr.yml
soulios Sep 1, 2023
337ea17
changing output dir to match the pr.yml
soulios Sep 1, 2023
2b02476
finishing pr.yml
soulios Sep 4, 2023
48fed75
finishing pr.yml
soulios Sep 4, 2023
bacbae8
adding test for predictgnn.json
soulios Sep 4, 2023
76435df
changed nn_check period again
soulios Sep 4, 2023
9c0e1e6
update gitlab
soulios Sep 7, 2023
8770c6c
added build
soulios Sep 7, 2023
d2ab3bc
updated environment, adam from legacy
soulios Sep 8, 2023
5338ab9
updated readme with mamba a link to the published article at BiB
soulios Sep 8, 2023
d3792f2
Merge branch 'soulios-graphs' of github.com:soulios/deepFPlearn into …
soulios Sep 8, 2023
b390f81
Update environment.yml
soulios Sep 10, 2023
6a7d416
Update environment.yml
soulios Sep 10, 2023
16ea703
updated env
soulios Sep 12, 2023
170d9fa
Merge branch 'master' into soulios-graphs
soulios Sep 12, 2023
e35d150
umap as an argument
soulios Sep 19, 2023
3aed03a
paths
soulios Dec 6, 2023
e376b7e
update options and saving for encoders
soulios Jan 1, 2024
81fb86a
Merge branch 'master' into soulios-graphs
soulios Jan 1, 2024
b05d7d1
readme conflict
soulios Jan 2, 2024
998f312
updated env and added interpret functions
soulios Feb 5, 2024
b085c3d
updated env and added interpret functions
soulios Feb 5, 2024
824cc1d
add interpret json
soulios Feb 6, 2024
1c94c91
isorted blacked and flaked
soulios Feb 6, 2024
caf37eb
changejson to fix cli
soulios Feb 6, 2024
8ee4797
changejson to fix cli
soulios Feb 6, 2024
fd27032
changejson to fix cli
soulios Feb 6, 2024
539e5d1
changejson to fix cli
soulios Feb 6, 2024
941250d
change interpretjson to fix cli
soulios Feb 7, 2024
e9deafa
edit predict json to load the correct model
soulios Feb 7, 2024
c998d38
interpret, fixed parsing args,finetuneencoder
soulios Jul 11, 2024
Empty file modified .dockerignore
100644 → 100755
Empty file.
14 changes: 13 additions & 1 deletion .github/workflows/pr.yml
100644 → 100755
@@ -101,5 +101,17 @@ jobs:
exit 1
fi

dfpl interpretgnn -f example/interpret.json
if [ "$(cat interpretations.csv | wc -l)" -lt "6" ]; then
echo "interpret result should have at least 5 lines. But had only $(cat interpretations.csv | wc -l)" >&2
exit 1
fi

dfpl convert -f tests/data
if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
echo "not all csv files are converted to pickle ones" >&2
exit 1
fi


dfpl convert -f tests/data
echo "All tests passed"
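The `dfpl convert` check in the workflow above compares only the number of `.csv` and `.pkl` files, not their names. A minimal Python sketch of that invariant (the function name is illustrative, not part of dfpl):

```python
from pathlib import Path

def check_conversion(data_dir):
    """Return True when the number of .csv files under data_dir equals the
    number of .pkl files, mirroring the CI check above (counts only;
    matching file names are not verified)."""
    csv_count = sum(1 for _ in Path(data_dir).rglob("*.csv"))
    pkl_count = sum(1 for _ in Path(data_dir).rglob("*.pkl"))
    return csv_count == pkl_count
```

Note that a count-based check can pass even if one file failed to convert while an unrelated `.pkl` exists; comparing stems would be stricter.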
Empty file modified .github/workflows/push-to-gitlab.yml
100644 → 100755
Empty file.
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified .gitlab-ci.yml
100644 → 100755
Empty file.
Empty file modified LICENSE.pdf
100644 → 100755
Empty file.
12 changes: 6 additions & 6 deletions README.md
100644 → 100755
@@ -111,22 +111,23 @@ In order to use the environment it needs to be activated with `. ENV_PATH/bin/ac

To use this tool in a conda environment:

1. Create the conda env from scratch
1. Install mamba. For details, follow the installation guide at https://mamba.readthedocs.io/en/latest/mamba-installation.html#mamba-install
2. Create the mamba env from scratch

From within the `deepFPlearn` directory, you can create the conda environment with the provided yaml file that
contains all information and necessary packages

```shell
conda env create -f environment.yml
mamba env create -f environment.yml
```

2. Activate the `dfpl_env` environment with
3. Activate the `dfpl_env` environment with

```shell
conda activate dfpl_env
mamba activate dfpl_env
```

3. Install the local `dfpl` package by calling
4. Install the local `dfpl` package by calling

```shell
pip install --no-deps ./
@@ -327,7 +328,6 @@ Kyriakos Soulios, Patrick Scheibe, Matthias Bernt, Jörg Hackermüller, and Jana
deepFPlearn<sup>+</sup>: Enhancing Toxicity Prediction Across the Chemical Universe Using Graph Neural Networks.
Submitted to a scientific journal, currently under review.

<a id="2">[2]</a>
Jana Schor, Patrick Scheibe, Matthias Bernt, Wibke Busch, Chih Lai, and Jörg Hackermüller.
AI for predicting chemical-effect associations at the chemical universe level—deepFPlearn.
Briefings in Bioinformatics, Volume 23, Issue 5, September 2022, bbac257, https://doi.org/10.1093/bib/bbac257
3 changes: 2 additions & 1 deletion container/Dockerfile
100644 → 100755
@@ -50,6 +50,7 @@ RUN sh -c 'echo "APT { Get { AllowUnauthenticated \"1\"; }; };" > /etc/apt/apt.c

RUN apt -o Acquire::AllowInsecureRepositories=true -o Acquire::AllowDowngradeToInsecureRepositories=true update
RUN apt-get install -y curl wget
RUN apt-get install -y git

RUN apt-key del 7fa2af80
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
@@ -118,7 +119,7 @@ RUN ln -s $(which python3) /usr/local/bin/python
# does not work since it just copies the files in dfpl
COPY ./ /deepFPlearn/

# install dfpl
# install dfpl
RUN python -m pip install --no-cache-dir /deepFPlearn && pip install --no-cache-dir pytest

# The code to run when container is started.
Expand Down
Empty file modified container/README.md
100644 → 100755
Empty file.
Empty file modified dfpl/__init__.py
100644 → 100755
Empty file.
177 changes: 71 additions & 106 deletions dfpl/__main__.py
@@ -1,12 +1,10 @@
import dataclasses
import logging
import os.path
import pathlib
import os
from argparse import Namespace
from os import path

import chemprop as cp
import pandas as pd
import wandb
import chemprop
from keras.models import load_model

from dfpl import autoencoder as ac
@@ -17,108 +15,62 @@
from dfpl import vae as vae
from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute

project_directory = pathlib.Path(".").parent.parent.absolute()
test_train_opts = options.Options(
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
outputDir=f"{project_directory}/output_data/console_test",
ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
type="smiles",
fpType="topological",
epochs=100,
batchSize=1024,
fpSize=2048,
encFPSize=256,
enableMultiLabel=False,
testSize=0.2,
kFolds=2,
verbose=2,
trainAC=False,
trainFNN=True,
compressFeatures=True,
activationFunction="selu",
lossFunction="bce",
optimizer="Adam",
fnnType="FNN",
)

test_pred_opts = options.Options(
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
outputDir=f"{project_directory}/output_data/console_test",
outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
type="smiles",
fpType="topological",
)


def traindmpnn(opts: options.GnnOptions):

def traindmpnn(opts: options.GnnOptions) -> None:
"""
Train a D-MPNN model using the given options.
Args:
- opts: options.GnnOptions instance containing the details of the training
Returns:
- None
"""
os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
ignore_elements = ["py/object"]
# Load options from a JSON file and replace the relevant attributes in `opts`
arguments = createArgsFromJson(
opts.configFile, ignore_elements, return_json_object=False
)
opts = cp.args.TrainArgs().parse_args(arguments)
arguments = createArgsFromJson(jsonFile=opts.configFile)
opts = chemprop.args.TrainArgs().parse_args(arguments)
logging.info("Training DMPNN...")
# Train the model and get the mean and standard deviation of AUC score from cross-validation
mean_score, std_score = cp.train.cross_validate(
args=opts, train_func=cp.train.run_training
mean_score, std_score = chemprop.train.cross_validate(
args=opts, train_func=chemprop.train.run_training
)
logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}")
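The refactored `traindmpnn` above passes `createArgsFromJson(jsonFile=...)` output to `chemprop.args.TrainArgs().parse_args(...)`, i.e. it flattens a JSON config into an argv-style list. The real helper lives in `dfpl.utils` and its exact behavior may differ; this is a hedged sketch of the general pattern, with illustrative key names:

```python
import json

def args_from_json(json_path):
    """Flatten a JSON config into an argv-style list for an argparse-based
    CLI such as chemprop. Keys become --flags; booleans contribute only the
    flag when true; lists expand into repeated values. Serialization
    metadata like "py/object" is skipped. Key names are illustrative."""
    with open(json_path) as f:
        config = json.load(f)
    argv = []
    for key, value in config.items():
        if key == "py/object":  # jsonpickle metadata, not a CLI flag
            continue
        if isinstance(value, bool):
            if value:
                argv.append(f"--{key}")
        elif isinstance(value, list):
            argv.append(f"--{key}")
            argv.extend(str(v) for v in value)
        else:
            argv.extend([f"--{key}", str(value)])
    return argv
```

Handing the result to `TrainArgs().parse_args(argv)` then gives chemprop the same validation and defaults it applies to real command-line input.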


def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None:
def predictdmpnn(opts: options.GnnOptions) -> None:
"""
Predict the values using a trained D-MPNN model with the given options.
Args:
- opts: options.GnnOptions instance containing the details of the prediction
- JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction
Returns:
- None
"""
ignore_elements = [
"py/object",
"checkpoint_paths",
"save_dir",
"saving_name",
]
# Load options and additional arguments from a JSON file
arguments, data = createArgsFromJson(
json_arg_path, ignore_elements, return_json_object=True
arguments = createArgsFromJson(jsonFile=opts.configFile)
opts = chemprop.args.PredictArgs().parse_args(arguments)

chemprop.train.make_predictions(args=opts)


def interpretdmpnn(opts: options.GnnOptions) -> None:
"""
Interpret the predictions of a trained D-MPNN model with the given options.
Args:
- opts: options.GnnOptions instance containing the details of the prediction
Returns:
- None
"""
# Load options and additional arguments from a JSON file
arguments = createArgsFromJson(jsonFile=opts.configFile)
opts = chemprop.args.InterpretArgs().parse_args(arguments)

chemprop.interpret.interpret(
args=opts, save_to_csv=True
)
arguments.append("--preds_path")
arguments.append("")
save_dir = data.get("save_dir")
name = data.get("saving_name")
# Replace relevant attributes in `opts` with loaded options
opts = cp.args.PredictArgs().parse_args(arguments)
opts.preds_path = save_dir + "/" + name
df = pd.read_csv(opts.test_path)
smiles = []
for index, rows in df.iterrows():
my_list = [rows.smiles]
smiles.append(my_list)
# Make predictions and return the result
cp.train.make_predictions(args=opts, smiles=smiles)


def train(opts: options.Options):
"""
Run the main training procedure
:param opts: Options defining the details of the training
"""

os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"

# import data from file and create DataFrame
if "tsv" in opts.inputFile:
df = fp.importDataFile(
@@ -128,7 +80,7 @@ def train(opts: options.Options):
df = fp.importDataFile(
opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
)
# initialize encoders to None
# initialize (auto)encoders to None
encoder = None
autoencoder = None
if opts.trainAC:
@@ -142,26 +94,32 @@ def train(opts: options.Options):
# if feature compression is enabled
if opts.compressFeatures:
if not opts.trainAC:
if opts.aeType == "deterministic":
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
elif opts.aeType == "variational":
if opts.aeType == "variational":
(autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
elif opts.ecWeightsFile == "":
else:
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())

if opts.ecWeightsFile == "":
encoder = load_model(opts.ecModelDir)
else:
autoencoder.load_weights(
os.path.join(opts.ecModelDir, opts.ecWeightsFile)
)

# compress the fingerprints using the autoencoder
df = ac.compress_fingerprints(df, encoder)
# ac.visualize_fingerprints(
# df,
# before_col="fp",
# after_col="fpcompressed",
# train_indices=train_indices,
# test_indices=test_indices,
# save_as=f"UMAP_{opts.aeSplitType}.png",
# )
if opts.visualizeLatent and opts.trainAC:
ac.visualize_fingerprints(
df,
save_as=f"{opts.ecModelDir}/TSNE_{opts.aeType}_{opts.aeSplitType}.png",
)
elif opts.visualizeLatent:
logging.info(
"Visualizing latent space is only available if you train the autoencoder. Skipping visualization."
)
if opts.trainFNN and opts.finetuneEncoder:
sl.train_single_label_models(df=df, opts=opts)

# train single label models if requested
if opts.trainFNN and not opts.enableMultiLabel:
sl.train_single_label_models(df=df, opts=opts)
@@ -257,29 +215,36 @@ def main():
raise ValueError("Input directory is not a directory")
elif prog_args.method == "traingnn":
traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)

createLogger("traingnn.log")
traindmpnn(traingnn_opts)

elif prog_args.method == "predictgnn":
predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predictgnn_opts,
test_path=makePathAbsolute(predictgnn_opts.test_path),
preds_path=makePathAbsolute(predictgnn_opts.preds_path),
)

logging.info(
f"The following arguments are received or filled with default values:\n{prog_args}"
)

predictdmpnn(fixed_opts, prog_args.configFile)
predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args)
createLogger("predictgnn.log")
predictdmpnn(predictgnn_opts)
elif prog_args.method == "interpretgnn":
interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args)
createLogger("interpretgnn.log")
interpretdmpnn(interpretgnn_opts)

elif prog_args.method == "train":
if prog_args.configFile is None and prog_args.inputFile is None:
parser.error("Either --configFile or --inputFile must be provided.")

train_opts = options.Options.fromCmdArgs(prog_args)
# Access wandb configuration
# wandb.init(project="dfpl")
# config = wandb.config

fixed_opts = dataclasses.replace(
train_opts,
inputFile=makePathAbsolute(train_opts.inputFile),
outputDir=makePathAbsolute(train_opts.outputDir),
# learningRate=config.learningRate,
# learningRateDecay=config.learningRateDecay,
# dropout=config.dropout,
# batchSize=config.batchSize,
# l2reg=config.l2reg
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "train.log"))
@@ -288,6 +253,8 @@ def main():
)
train(fixed_opts)
elif prog_args.method == "predict":
if prog_args.configFile is None and prog_args.inputFile is None:
parser.error("Either --configFile or --inputFile must be provided.")
predict_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predict_opts,
@@ -298,8 +265,6 @@
),
ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
trainAC=False,
trainFNN=False,
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "predict.log"))