diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 56df68fc..d7d4ae76 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -95,9 +95,14 @@ jobs: exit 1 fi echo "result lines "$(wc -l preds_dmpnn/DMPNN_preds.csv) - if [ "$(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" -lt "6" ]; then - echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" >&2 + if [ "$(cat preds_dmpnn/preds.csv | wc -l)" -lt "6" ]; then + echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/preds.csv | wc -l)" >&2 exit 1 fi dfpl convert -f tests/data + if [ "$(find tests/data \( -name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + echo "not all csv files are converted to pickle ones" >&2 + exit 1 + fi + echo "All tests passed!" \ No newline at end of file diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..fe66eec8 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,12 +1,10 @@ import dataclasses import logging -import os.path -import pathlib +import os from argparse import Namespace from os import path -import chemprop as cp -import pandas as pd +import chemprop from keras.models import load_model from dfpl import autoencoder as ac @@ -17,43 +15,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - 
trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) - -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): + +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. Args: @@ -61,54 +24,44 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) - opts = cp.args.TrainArgs().parse_args(arguments) + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation - mean_score, std_score = cp.train.cross_validate( - args=opts, train_func=cp.train.run_training + mean_score, std_score = chemprop.train.cross_validate( + args=opts, train_func=chemprop.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. 
Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True - ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options - opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.PredictArgs().parse_args(arguments) + + chemprop.train.make_predictions(args=opts) + + +def interpretdmpnn(opts: options.GnnOptions) -> None: + """ + Interpret the predictions of a trained D-MPNN model with the given options. 
+ Args: + - opts: options.GnnOptions instance containing the details of the prediction + Returns: + - None + """ + # Load options and additional arguments from a JSON file + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.InterpretArgs().parse_args(arguments) + + chemprop.interpret.interpret(args=opts, save_to_csv=True) def train(opts: options.Options): @@ -116,9 +69,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,7 +78,7 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + # initialize (auto)encoders to None encoder = None autoencoder = None if opts.trainAC: @@ -142,11 +92,12 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": + else: + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) + + if opts.ecWeightsFile == "": encoder = load_model(opts.ecModelDir) else: autoencoder.load_weights( @@ -154,14 +105,18 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent and opts.trainAC: + ac.visualize_fingerprints( + df, + 
train_indices=train_indices, + test_indices=test_indices, + save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png", + ) + elif opts.visualizeLatent: + logging.info( + "Visualizing latent space is only available if you train the autoencoder. Skipping visualization." + ) + # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -257,24 +212,22 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - + createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": - predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - predictgnn_opts, - test_path=makePathAbsolute(predictgnn_opts.test_path), - preds_path=makePathAbsolute(predictgnn_opts.preds_path), - ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args) + createLogger("predictgnn.log") + predictdmpnn(predictgnn_opts) + elif prog_args.method == "interpretgnn": + interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args) + createLogger("interpretgnn.log") + interpretdmpnn(interpretgnn_opts) elif prog_args.method == "train": + if prog_args.configFile is None and prog_args.inputFile is None: + parser.error("Either --configFile or --inputFile must be provided.") + train_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( train_opts, @@ -288,6 +241,8 @@ def main(): ) train(fixed_opts) elif prog_args.method == "predict": + if prog_args.configFile is None and prog_args.inputFile is None: + parser.error("Either --configFile or --inputFile must be provided.") predict_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( 
predict_opts, @@ -298,8 +253,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 99bf4578..b2b13d76 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,19 +1,18 @@ import logging import math import os.path -from os.path import basename from typing import Tuple import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import umap +import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, losses, optimizers from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.models import Model +from tensorflow.keras.models import Model, load_model from dfpl import callbacks from dfpl import history as ht @@ -32,9 +31,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod """ input_size = opts.fpSize encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) @@ -104,7 +107,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod )(decoded) # output layer - # to either 0 or 1 and hence we use sigmoid activation function. 
decoded = Dense( units=input_size, activation="sigmoid", bias_initializer=output_bias )(decoded) @@ -145,37 +147,8 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"AE_{opts.aeSplitType}") - # Define output files for autoencoder and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No AE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType - ) - logging.info( - f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=ac_weights_file, opts=opts - ) # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( @@ -286,30 +259,35 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) 
(autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) - + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=callback_list, + callbacks=[callback_list], epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) - logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), + base_file_name=save_path, hist=auto_hist, ) # Save the autoencoder callback model to disk - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") if opts.testSize > 0.0: - (callback_autoencoder, callback_encoder) = define_ac_model(opts) - callback_encoder.save(filepath=save_path) + # Re-define autoencoder and encoder using your function + callback_autoencoder = load_model(filepath=save_path) + _, callback_encoder = define_ac_model(opts) + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = os.path.join(save_path, "encoder_model") + callback_encoder.save(filepath=encoder_save_path) else: encoder.save(filepath=save_path) # Return the encoder model of the trained autoencoder diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 6eae7965..8bf157fd 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -22,15 +22,25 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: else: target = "loss" # enable this checkpoint to restore the weights of the best performing model - checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - period=settings.ac_train_check_period, - save_best_only=True, - 
save_weights_only=True, - ) + if opts.aeType == "deterministic": + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + ) + else: + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + save_weights_only=True, + ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore @@ -43,7 +53,6 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: restore_best_weights=True, ) callbacks.append(early_stop) - if opts.aeWabTracking and not opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks @@ -65,7 +74,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - period=settings.nn_train_check_period, + save_freq="epoch", save_best_only=True, monitor="val_loss", mode="min", diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index e9c88776..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -69,10 +69,16 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( input_size: int, output_size: int, opts: options.Options ) -> Model: + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -132,9 +138,9 @@ def 
define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -294,6 +300,8 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, + model_hist_csv_path, + model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..c266e24b 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,13 @@ import argparse from dataclasses import dataclass from pathlib import Path +from typing import List, Literal, Optional import jsonpickle import torch -from chemprop.args import TrainArgs +from chemprop.args import InterpretArgs, PredictArgs, TrainArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils import parseCmdArgs @dataclass @@ -17,51 +18,51 @@ class Options: Dataclass for all options necessary for training the neural nets """ - configFile: str = "./example/train.json" - inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv" - outputDir: str = "." 
+ configFile: str = None + inputFile: str = "" + outputDir: str = "" # changes according to mode outputFile: str = "" - ecWeightsFile: str = "AE.encoder.weights.hdf5" - ecModelDir: str = "AE_encoder" - fnnModelDir: str = "modeltraining" + ecWeightsFile: str = "" + ecModelDir: str = "" + fnnModelDir: str = "" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 512 + epochs: int = 100 fpSize: int = 2048 encFPSize: int = 256 - kFolds: int = 0 + kFolds: int = 1 testSize: float = 0.2 enableMultiLabel: bool = False - verbose: int = 0 - trainAC: bool = True # if set to False, an AC weight file must be provided! + verbose: int = 2 + trainAC: bool = False trainFNN: bool = True - compressFeatures: bool = True - sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] + compressFeatures: bool = False + sampleFractionOnes: float = 0.5 sampleDown: bool = False split_type: str = "random" aeSplitType: str = "random" aeType: str = "deterministic" - aeEpochs: int = 3000 + aeEpochs: int = 100 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 - aeActivationFunction: str = "relu" + aeLearningRateDecay: float = 0.96 + aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - gpu: str = "" - snnDepth = 8 - snnWidth = 50 - aeWabTracking: str = "" # Wand & Biases autoencoder tracking - wabTracking: str = "" # Wand & Biases FNN tracking - wabTarget: str = "ER" # Wand & Biases target used for showing training progress + visualizeLatent: bool = False # only if autoencoder is trained or loaded + gpu: int = None + aeWabTracking: bool = False # Wand & Biases autoencoder tracking + wabTracking: bool = False # Wand & Biases FNN tracking + wabTarget: str = 
"AR" # Wand & Biases target used for showing training progress def saveToFile(self, file: str) -> None: """ @@ -72,42 +73,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. 
- if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -118,8 +85,8 @@ class GnnOptions(TrainArgs): total_epochs: int = 30 save: bool = True - configFile: str = "./example/traingnn.json" - data_path: str = "./example/data/tox21.csv" + configFile: str = "" + data_path: str = "" use_compound_names: bool = False save_dir: str = "" no_cache: bool = False @@ -129,42 +96,122 @@ class GnnOptions(TrainArgs): num_lrs: int = 2 minimize_score: bool = False num_tasks: int = 12 - preds_path: str = "./tox21dmpnn.csv" + preds_path: str = "" test_path: str = "" save_preds: bool = True + calibration_method: str = "" + uncertainty_method: str = "" + calibration_path: str = "" + evaluation_methods: str = "" + evaluation_scores_path: str = "" + wabTracking: bool = False + split_sizes: List[float] = None + # save_smiles_splits: bool = False @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. 
- """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts + + +class PredictGnnOptions(PredictArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + configFile: str = "" + calibration_atom_descriptors_path: str = None + calibration_features_path: str = None + calibration_interval_percentile: float = 95 + calibration_method: Optional[ + Literal[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ] + ] = None + calibration_path: str = None + calibration_phase_features_path: str = None + drop_extra_columns: bool = False + dropout_sampling_size: int = 10 + evaluation_methods: List[str] = None + evaluation_scores_path: str = None + # no_features_scaling: bool = True + individual_ensemble_predictions: bool = False + preds_path: str = None + regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None + test_path: str = None + uncertainty_dropout_p: float = 0.1 + uncertainty_method: Optional[ + Literal[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + ] + ] = None @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # 
Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() + + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts + + +class InterpretGNNoptions(InterpretArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + configFile: str = "./example/interpret.json" + data_path: str = "./example/data/smiles.csv" + batch_size: int = 500 + c_puct: float = 10.0 + max_atoms: int = 20 + min_atoms: int = 8 + prop_delta: float = 0.5 + property_id: List[int] = None + rollout: int = 20 + + @classmethod + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() + + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -186,6 +233,12 @@ def createCommandlineParser() -> argparse.ArgumentParser: parser_predict_gnn.set_defaults(method="predictgnn") parsePredictGnn(parser_predict_gnn) + parser_interpret_gnn = subparsers.add_parser( + "interpretgnn", help="Interpret your GNN models" + ) + parser_interpret_gnn.set_defaults(method="interpretgnn") + parseInterpretGnn(parser_interpret_gnn) + parser_train = subparsers.add_parser( "train", help="Train new models with your data" ) @@ -225,7 +278,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -234,7 +287,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first 
column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv", ) general_args.add_argument( "-o", @@ -243,16 +296,17 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/", ) + + # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", - metavar="STRING", type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp", ) general_args.add_argument( "-thr", @@ -260,34 +314,30 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5, ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. If not available, leave empty", + default=None, ) general_args.add_argument( - "-k", "--fpType", - metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. 
MACCS or topological are available.", + default="topological", ) general_args.add_argument( - "-s", "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048, ) general_args.add_argument( - "-c", "--compressFeatures", metavar="BOOL", type=bool, @@ -295,7 +345,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=argparse.SUPPRESS, ) general_args.add_argument( - "-m", "--enableMultiLabel", metavar="BOOL", type=bool, @@ -309,7 +358,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="", ) autoencoder_args.add_argument( "--ecModelDir", @@ -320,29 +369,27 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeType", - metavar="STRING", type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default=argparse.SUPPRESS, + default="deterministic", ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512, ) autoencoder_args.add_argument( "--aeActivationFunction", - metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", @@ -353,18 +400,17 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - 
default=argparse.SUPPRESS, + default=0.96, ) autoencoder_args.add_argument( "--aeSplitType", - metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", @@ -376,19 +422,23 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256, + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False, ) # Training Configuration training_args.add_argument( "--split_type", - metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", default=argparse.SUPPRESS, ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, @@ -406,25 +456,22 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "-v", "--verbose", - metavar="INT", type=int, choices=[0, 1, 2], help="Verbosity level. 
O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", default=argparse.SUPPRESS, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", + action="store_false", + help="If this flag is set, FNN training is deactivated.", default=argparse.SUPPRESS, ) training_args.add_argument( @@ -450,10 +497,9 @@ help="Number of epochs that should be used for the FNN training", default=argparse.SUPPRESS, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", - metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", @@ -461,7 +507,6 @@ ) training_args.add_argument( "--optimizer", - metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', @@ -472,32 +517,38 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", - metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", @@ -524,7 +575,109 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. 
+ """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting.", + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. 
Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", + ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. " + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + ( + general_args.add_argument( + "--aeType", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) ) @@ -534,21 +687,60 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: files_args = parser.add_argument_group("Files") model_args = parser.add_argument_group("Model arguments") training_args = parser.add_argument_group("Training Configuration") + uncertainty_args = parser.add_argument_group("Uncertainty Configuration") + uncertainty_args.add_argument( + "--uncertainty_method", + type=str, + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "dirichlet", + ], + help="Method to use for uncertainty estimation", + default="none", + ) + # Uncertainty arguments + uncertainty_args.add_argument( + "--calibration_method", + type=str, + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Method to use for calibration", + 
default="none", + ) + uncertainty_args.add_argument( + "--calibration_path", + type=str, + metavar="FILE", + help="Path to file with calibration data", + ) # General arguments general_args.add_argument("--split_key_molecule", type=int) general_args.add_argument("--pytorch_seed", type=int) general_args.add_argument("--cache_cutoff", type=float) general_args.add_argument("--save_preds", type=bool) + general_args.add_argument("--wabTracking", action="store_true", default=False) general_args.add_argument( "--cuda", action="store_true", default=False, help="Turn on cuda" ) - general_args.add_argument( - "--save_smiles_splits", - action="store_true", - default=False, - help="Save smiles for each train/val/test splits for prediction convenience later", - ) + # general_args.add_argument( + # "--save_smiles_splits", + # action="store_true", + # default=False, + # help="Save smiles for each train/val/test splits for prediction convenience later", + # ) general_args.add_argument( "--test", action="store_true", @@ -575,9 +767,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -593,13 +782,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: type=str, help="Input JSON file that contains all information for training/predicting.", ) - files_args.add_argument( - "--config_path", - type=str, - metavar="FILE", - help="Path to a .json file containing arguments. 
Any arguments present in the config" - "file will override arguments specified via the command line or by the defaults.", - ) files_args.add_argument( "--save_dir", type=str, @@ -747,7 +929,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--dataset_type", type=str, - metavar="STRING", choices=["classification", "regression", "multiclass"], help="Type of dataset, e.g. classification or regression." "This determines the loss function used during training.", @@ -763,7 +944,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--split_type", type=str, - metavar="STRING", default="random", choices=[ "random", @@ -871,7 +1051,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: model_args.add_argument( "--activation", type=str, - metavar="STRING", default="ReLU", choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"], help="Activation function", @@ -990,7 +1169,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--loss_function", type=str, - metavar="STRING", choices=[ "mse", "bounded_mse", @@ -1008,7 +1186,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--metric", type=str, - metavar="STRING", default=None, choices=[ "auc", @@ -1034,141 +1211,151 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. 
- """ - +def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") files_args = parser.add_argument_group("Files") + uncertainty_args = parser.add_argument_group("Uncertainty Configuration") + + general_args.add_argument( + "--checkpoint_path", + type=str, + metavar="FILE", + help="Path to model checkpoint (.pt file)", + ) + # general_args.add_argument( + # "--no_features_scaling", + # action="store_true", + # help="Turn on scaling of features", + # ) files_args.add_argument( "-f", "--configFile", - metavar="FILE", type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-i", - "--inputFile", metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default=argparse.SUPPRESS, + help="Path to a .json file containing arguments. Any arguments present in the config" + "file will override arguments specified via the command line or by the defaults.", ) files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", + "--test_path", type=str, - help="Prefix of output directory. 
It will contain a log file and the file specified" - "with --outputFile.", - default=argparse.SUPPRESS, + help="Path to CSV file containing testing data for which predictions will be made.", ) files_args.add_argument( - "--outputFile", - metavar="FILE", + "--preds_path", type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default=argparse.SUPPRESS, + help="Predictions output file. CSV or PICKLE file where predictions will be saved.", ) - general_args.add_argument( - "-t", - "--type", - metavar="STR", + files_args.add_argument( + "--calibration_path", type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + help="Data file to be used for uncertainty calibration.", ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", + files_args.add_argument( + "--calibration_features_path", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + nargs="+", + help="Feature data file to be used with the uncertainty calibration dataset.", ) + files_args.add_argument("--calibration_phase_features_path", type=str, help="") files_args.add_argument( - "--ecModelDir", + "--calibration_atom_descriptors_path", type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default=argparse.SUPPRESS, + help="Extra atom descriptors file.", ) files_args.add_argument( - "--fnnModelDir", + "--calibration_bond_descriptors_path", type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default=argparse.SUPPRESS, + help="Extra bond descriptors file. 
Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", ) + general_args.add_argument( + "--drop_extra_columns", + action="store_true", + help="Keep only SMILES and new prediction columns in the test data files.", + ) -def parsePredictGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - data_args = parser.add_argument_group("Data Configuration") - files_args = parser.add_argument_group("Files") - training_args = parser.add_argument_group("Training Configuration") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", + uncertainty_args.add_argument( + "--uncertainty_method", type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "spectra_roundrobin", + "dirichlet", + ], + help="The method of calculating uncertainty.", ) - general_args.add_argument( - "--gpu", - type=int, - metavar="INT", - choices=list(range(torch.cuda.device_count())), - help="Which GPU to use", + uncertainty_args.add_argument( + "--calibration_method", + type=str, + nargs="+", + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Methods used for calibrating the uncertainty calculated with uncertainty method.", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=False, help="Turn off cuda" + uncertainty_args.add_argument( + "--individual_ensemble_predictions", + action="store_true", + default=False, + help="Save individual ensemble predictions.", ) - general_args.add_argument( - "--num_workers", + uncertainty_args.add_argument( + "--evaluation_methods", + type=str, + nargs="+", + help="Methods used for evaluating the uncertainty performance. 
Only used if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", + ) + uncertainty_args.add_argument( + "--evaluation_scores_path", + type=str, + help="Location to save the results of uncertainty evaluations.", + ) + uncertainty_args.add_argument( + "--uncertainty_dropout_p", + type=float, + default=0.1, + help="The probability to use for Monte Carlo dropout uncertainty estimation.", + ) + uncertainty_args.add_argument( + "--dropout_sampling_size", type=int, - metavar="INT", - help="Number of workers for the parallel data loading 0 means sequential", + default=10, + help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.", ) - general_args.add_argument( - "--no_cache", - type=bool, - metavar="BOOL", - default=False, - help="Turn off caching mol2graph computation", + uncertainty_args.add_argument( + "--calibration_interval_percentile", + type=float, + default=95, + help="Percentile used in calibration methods. Must be in the range (1,100).", ) - general_args.add_argument( - "--no_cache_mol", - type=bool, - metavar="BOOL", - default=False, - help="Whether to not cache the RDKit molecule for each SMILES string to reduce memory\ - usage cached by default", + uncertainty_args.add_argument( + "--regression_calibrator_metric", + type=str, + choices=["stdev", "interval"], + help="Regression calibrator output metric. Regression calibrators can output either a stdev or an inverval.", ) - general_args.add_argument( - "--empty_cache", - type=bool, - metavar="BOOL", - help="Whether to empty all caches before training or predicting. 
This is necessary if\ - multiple jobs are run within a single script and the atom or bond features change", + + +def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: + files_args = parser.add_argument_group("Files") + interpret_args = parser.add_argument_group("Interpretation Configuration") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for interpretation.", ) files_args.add_argument( "--preds_path", @@ -1191,89 +1378,44 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: metavar="DIR", help="Path to model checkpoint (.pt file)", ) - files_args.add_argument( - "--checkpoint_paths", - type=str, - metavar="FILE", - nargs="*", - help="Path to model checkpoint (.pt file)", - ) files_args.add_argument( "--data_path", type=str, metavar="FILE", help="Path to CSV file containing testing data for which predictions will be made", - default="", - ) - files_args.add_argument( - "--test_path", - type=str, - metavar="FILE", - help="Path to CSV file containing testing data for which predictions will be made", - default="", - ) - files_args.add_argument( - "--features_path", - type=str, - metavar="FILE", - nargs="*", - help="Path to features to use in FNN (instead of features_generator)", ) - files_args.add_argument( - "--atom_descriptors_path", - type=str, - metavar="FILE", - help="Path to the extra atom descriptors.", - ) - data_args.add_argument( - "--use_compound_names", - action="store_true", - default=False, - help="Use when test data file contains compound names in addition to SMILES strings", - ) - data_args.add_argument( - "--no_features_scaling", - action="store_true", - default=False, - help="Turn off scaling of features", - ) - data_args.add_argument( - "--max_data_size", + interpret_args.add_argument( + "--max_atoms", type=int, metavar="INT", - help="Maximum number of data points to load", - ) - data_args.add_argument( - "--smiles_columns", - type=str, - 
metavar="STRING", - help="List of names of the columns containing SMILES strings.By default, uses the first\ - number_of_molecules columns.", + help="Maximum number of atoms to use for interpretation", ) - data_args.add_argument( - "--number_of_molecules", + + interpret_args.add_argument( + "--min_atoms", type=int, metavar="INT", - help="Number of molecules in each input to the model.This must equal the length of\ - smiles_columns if not None", + help="Minimum number of atoms to use for interpretation", ) - data_args.add_argument( - "--atom_descriptors", - type=bool, - metavar="Bool", - help="Use or not atom descriptors", + interpret_args.add_argument( + "--prop_delta", + type=float, + metavar="FLOAT", + help="The minimum change in the property of interest that is considered significant", ) - - data_args.add_argument( - "--bond_features_size", + interpret_args.add_argument( + "--property_id", type=int, metavar="INT", - help="Size of the extra bond descriptors that will be used as bond features to featurize a\ - given molecule", + help="The index of the property of interest", ) - training_args.add_argument( - "--batch_size", type=int, metavar="INT", default=50, help="Batch size" + # write the argument for rollouts + interpret_args.add_argument( + "--rollout", + type=int, + metavar="INT", + help="The number of rollouts to use for interpretation", ) diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 18402f09..191690ba 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -333,12 +333,17 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.learningRate, + decay_steps=1000, + decay_rate=opts.learningRateDecay, + staircase=True, + ) # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = 
optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") sys.exit("Unsupported optimizer") @@ -596,11 +601,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: """ # find target columns - targets = [ - c - for c in df.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] - ] + targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". # In case it's not there, we use the first one available diff --git a/dfpl/utils.py b/dfpl/utils.py index db3d6ec1..1b8f1a9a 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,12 +1,16 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict +from pathlib import Path from random import Random -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Type, TypeVar, Union +import jsonpickle import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -14,7 +18,48 @@ from rdkit.Chem.Scaffolds import MurckoScaffold from tqdm import tqdm +# Define a type variable + + RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. 
+ """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + + return result def makePathAbsolute(p: str) -> str: @@ -31,20 +76,74 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def parse_cli_list(value: str): + # Simple parser for lists passed as comma-separated values + return value.split(",") + + +def parse_cli_boolean(cli_args, cli_arg_key): + # Determines boolean value based on command line presence + if cli_arg_key in cli_args: + return True # Presence of flag implies True + return False + + +def createArgsFromJson(jsonFile: str) -> List[str]: arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + cli_args = sys.argv[1:] # Skipping the script name itself + + with open(jsonFile, "r") as f: data = json.load(f) + + processed_cli_keys = [] # To track which CLI keys have been processed + for key, value in data.items(): if key not in ignore_elements: - if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") - arguments.extend(value) + cli_arg_key = f"--{key}" + if cli_arg_key in cli_args: + processed_cli_keys.append(cli_arg_key) + arg_index = cli_args.index(cli_arg_key) + 1 + if isinstance(value, bool): + value = 
parse_cli_boolean(cli_args, cli_arg_key) + elif arg_index < len(cli_args) and not cli_args[arg_index].startswith( + "--" + ): + cli_value = cli_args[arg_index] + if isinstance(value, list): + value = parse_cli_list(cli_value) + else: + value = cli_value # Override JSON value with command-line value + if isinstance(value, bool): + if value: + arguments.append(cli_arg_key) + elif isinstance(value, list): + arguments.append(cli_arg_key) + arguments.extend(map(str, value)) # Ensure all elements are strings else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if return_json_object: - return arguments, data + arguments.extend([cli_arg_key, str(value)]) + i = 0 + while i < len(cli_args): + arg = cli_args[i] + if arg.startswith("--"): + key = arg.lstrip("--") + if key not in data: + value = ( + True + if i + 1 >= len(cli_args) or cli_args[i + 1].startswith("--") + else cli_args[i + 1] + ) + if isinstance(value, bool): + if value: + arguments.append(arg) + else: + arguments.extend([arg, str(value)]) + i += 1 if isinstance(value, bool) else 2 + else: + i += 1 + else: + i += 1 + return arguments @@ -76,53 +175,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): return mol -def generate_scaffold( - mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True -) -> str: - """ - Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. - - :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. - :param include_chirality: Whether to include chirality in the computed scaffold. - :return: The Bemis-Murcko scaffold for the molecule. 
- """ - if isinstance(mol, str): - if mol.startswith("InChI="): - mol = inchi_to_mol(mol) - else: - mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) - elif isinstance(mol, tuple): - mol = mol[0] - scaffold = MurckoScaffold.MurckoScaffoldSmiles( - mol=mol, includeChirality=include_chirality - ) - - return scaffold - - -def scaffold_to_smiles( - mols: List[str], use_indices: bool = False -) -> Dict[str, Union[Set[str], Set[int]]]: - """ - Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). - :param mols: A list of SMILES. - :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than - mapping to the smiles string itself. This is necessary if there are duplicate smiles. - :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. - """ - scaffolds = defaultdict(set) - for i, mol in tqdm(enumerate(mols), total=len(mols)): - scaffold = generate_scaffold(mol) - if use_indices: - scaffolds[scaffold].add(i) - else: - scaffolds[scaffold].add(mol) - - return scaffolds - - -# def inchi_to_mol(inchi: str) -> Chem.Mol: -# return Chem.inchi.MolFromInchi(inchi) def smiles_to_mol(smiles: str) -> Chem.Mol: mol = Chem.MolFromSmiles(smiles) if mol is None: @@ -186,6 +238,51 @@ def weight_split( return train_df, val_df, test_df +def generate_scaffold( + mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True +) -> str: + """ + Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. + + :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. + :param include_chirality: Whether to include chirality in the computed scaffold. + :return: The Bemis-Murcko scaffold for the molecule. 
+ """ + if isinstance(mol, str): + if mol.startswith("InChI="): + mol = inchi_to_mol(mol) + else: + mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) + elif isinstance(mol, tuple): + mol = mol[0] + scaffold = MurckoScaffold.MurckoScaffoldSmiles( + mol=mol, includeChirality=include_chirality + ) + + return scaffold + + +def scaffold_to_smiles( + mols: List[str], use_indices: bool = False +) -> Dict[str, Union[Set[str], Set[int]]]: + """ + Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). + :param mols: A list of SMILES. + :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than + mapping to the smiles string itself. This is necessary if there are duplicate smiles. + :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. + """ + scaffolds = defaultdict(set) + for i, mol in tqdm(enumerate(mols), total=len(mols)): + scaffold = generate_scaffold(mol) + if use_indices: + scaffolds[scaffold].add(i) + else: + scaffolds[scaffold].add(mol) + + return scaffolds + + def ae_scaffold_split( data: pd.DataFrame, sizes: Tuple[float, float, float] = (0.8, 0, 0.2), diff --git a/dfpl/vae.py b/dfpl/vae.py index d0a89dbe..45cfda7a 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -1,8 +1,6 @@ -import csv import logging import math import os.path -from os.path import basename from typing import Tuple import numpy as np @@ -26,22 +24,26 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize - encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + encoding_dim = ( + opts.encFPSize + ) # This should be the intended size of your latent space, e.g., 256 + + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + 
ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) - # get the number of meaningful hidden layers (latent space included) hidden_layer_count = round(math.log2(input_size / encoding_dim)) - # the input placeholder input_vec = Input(shape=(input_size,)) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! + # 1st hidden layer if opts.aeActivationFunction != "selu": encoded = Dense( units=int(input_size / 2), activation=opts.aeActivationFunction @@ -53,87 +55,81 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mo kernel_initializer="lecun_normal", )(input_vec) - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - # print(f'{factor_units}: {int(input_size / factor_units)}') - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # latent space layers - factor_units = 2 ** (hidden_layer_count - 1) + # encoding layers + for i in range( + 1, hidden_layer_count - 1 + ): # Adjust the range to stop before the latent space layers + factor_units = 2 ** (i + 1) if opts.aeActivationFunction != "selu": - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - z_log_var = Dense( + encoded = Dense( units=int(input_size / factor_units), activation=opts.aeActivationFunction, )(encoded) else: - z_mean = Dense( + encoded = Dense( units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal", )(encoded) - z_log_var = Dense( + + # latent space layers + if 
opts.aeActivationFunction != "selu": + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)( + encoded + ) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)( + encoded + ) # Adjusted size to encoding_dim + else: + z_mean = Dense( + units=encoding_dim, + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )( + encoded + ) # Adjusted size to encoding_dim + z_log_var = Dense( + units=encoding_dim, + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )( + encoded + ) # Adjusted size to encoding_dim + + # sampling layer + def sampling(args): + z_mean, z_log_var = args + batch = K.shape(z_mean)[0] + dim = K.int_shape(z_mean)[1] + epsilon = K.random_normal(shape=(batch, dim)) + return z_mean + K.exp(0.5 * z_log_var) * epsilon + + z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var]) + decoded = z + + # decoding layers + for i in range(hidden_layer_count - 2, 0, -1): + factor_units = 2**i + if opts.aeActivationFunction != "selu": + decoded = Dense( + units=int(input_size / factor_units), + activation=opts.aeActivationFunction, + )(decoded) + else: + decoded = Dense( units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal", - )(encoded) + )(decoded) - # sampling layer - def sampling(args): - z_mean, z_log_var = args - batch = K.shape(z_mean)[0] - dim = K.int_shape(z_mean)[1] - epsilon = K.random_normal(shape=(batch, dim)) - return z_mean + K.exp(0.5 * z_log_var) * epsilon - - # sample from latent space - z = Lambda(sampling, output_shape=(int(input_size / factor_units),))( - [z_mean, z_log_var] - ) - decoded = z - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - # print(f'{factor_units}: {int(input_size/factor_units)}') - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - 
activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) - - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) - - else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + # output layer + decoded = Dense( + units=input_size, activation="sigmoid", bias_initializer=output_bias + )(decoded) autoencoder = Model(input_vec, decoded) + encoder = Model(input_vec, z) + autoencoder.summary(print_fn=logging.info) # KL divergence loss def kl_loss(z_mean, z_log_var): @@ -155,9 +151,6 @@ def vae_loss(y_true, y_pred): optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss] ) - # build encoder model - encoder = Model(input_vec, z_mean) - return autoencoder, encoder @@ -175,39 +168,9 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"VAE_{opts.aeSplitType}") - # Define output files for VAE and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No VAE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] - + opts.aeType - + opts.aeSplitType - ) - logging.info( - f"(variational) encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"VAE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - 
os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=vae_weights_file, opts=opts - ) + # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( df[df["fp"].notnull()]["fp"].to_list(), @@ -219,17 +182,17 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: ) assert 0.0 <= opts.testSize <= 0.5 if opts.aeSplitType == "random": - logging.info("Training VAE using random split") - train_indices = np.arange(fp_matrix.shape[0]) + logging.info("Training autoencoder using random split") + initial_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: # Split data into test and training data if opts.aeWabTracking: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: x_train = fp_matrix @@ -255,6 +218,12 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + train_indices = df[ + df.index.isin(train_data[train_data["fp"].notnull()].index) + ].index.to_numpy() + test_indices = df[ + df.index.isin(test_data[test_data["fp"].notnull()].index) + ].index.to_numpy() else: x_train = fp_matrix x_test = None 
@@ -262,7 +231,6 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: logging.info("Training autoencoder using molecular weight split") train_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: - # if opts.aeWabTracking: train_data, val_data, test_data = weight_split( df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" ) @@ -276,16 +244,21 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + df_sorted = df.sort_values(by="mol_weight", ascending=True) + # Get the sorted indices from the sorted DataFrame + sorted_indices = df_sorted.index.to_numpy() + + # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame + train_indices = sorted_indices[df.index.isin(train_data.index)] + # val_indices = sorted_indices[df.index.isin(val_data.index)] + test_indices = sorted_indices[df.index.isin(test_data.index)] else: x_train = fp_matrix x_test = None else: raise ValueError(f"Invalid split type: {opts.split_type}") - if opts.testSize > 0.0: - train_indices = train_indices[train_indices < x_train.shape[0]] - test_indices = np.arange(x_train.shape[0], x_train.shape[0] + x_test.shape[0]) - else: - test_indices = None + + # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints ids, counts = np.unique(x_train.flatten(), return_counts=True) count_dict = dict(zip(ids, counts)) if count_dict[0] == 0: @@ -304,34 +277,34 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) # Train the VAE on the training data + callback_list = callbacks.autoencoder_callback( + checkpoint_path=f"{save_path}.h5", opts=opts + ) + vae_hist = vae.fit( x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, - callbacks=callback_list, + callbacks=[callback_list], validation_data=(x_test, 
x_test) if opts.testSize > 0.0 else None, ) # Save the VAE weights - logging.info(f"VAE weights stored in file: {vae_weights_file}") ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".VAE"), + base_file_name=save_path, hist=vae_hist, ) - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_VAE.h5") - if opts.testSize > 0.0: - (callback_vae, callback_encoder) = define_vae_model(opts) - callback_vae.load_weights(filepath=vae_weights_file) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) - latent_space = encoder.predict(fp_matrix) - latent_space_file = os.path.join( - opts.outputDir, base_file_name + ".latent_space.csv" - ) - with open(latent_space_file, "w", newline="") as file: - writer = csv.writer(file) - writer.writerows(latent_space) + # Re-define autoencoder and encoder using your function + callback_autoencoder, callback_encoder = define_vae_model(opts) + callback_autoencoder.load_weights(filepath=f"{save_path}.h5") + + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = f"{save_path}_encoder.h5" + callback_encoder.save_weights(filepath=encoder_save_path) + return encoder, train_indices, test_indices diff --git a/example/predict.json b/example/predict.json index 252965e3..d96ad803 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,12 +1,11 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/smiles.csv", + "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_autoencoder/", - "ecWeightsFile": "", - "fnnModelDir": "example/results_train/AR_saved_model", + "ecModelDir": "example/results_train/random_split_autoencoder/encoder_model", + "fnnModelDir": "example/results_train/AR-1_best_saved_model", + "aeType": "deterministic", 
"compressFeatures": true, - "trainAC": false, "trainFNN": false } diff --git a/example/predictgnn.json b/example/predictgnn.json index 157b5e05..dfdd6a8d 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,7 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": "DMPNN_preds.csv" + "preds_path": "preds_dmpnn/preds.csv", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file diff --git a/example/train.json b/example/train.json index 62f2abb4..bf57e7e2 100755 --- a/example/train.json +++ b/example/train.json @@ -3,11 +3,12 @@ "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/", - "ecWeightsFile": "random_autoencoder.hdf5", + "ecWeightsFile": "", "verbose": 2, "trainAC": true, "compressFeatures": true, + "visualizeLatent": false, "encFPSize": 256, "aeSplitType": "random", @@ -16,7 +17,7 @@ "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, - "aeLearningRateDecay": 0.0001, + "aeLearningRateDecay": 0.96, "aeType": "deterministic", "type": "smiles", @@ -29,7 +30,7 @@ "gpu": "", "trainFNN": true, - "kFolds": 1, + "kFolds": 2, "threshold": 0.5, "testSize": 0.2, "fnnType": "FNN", @@ -40,6 +41,7 @@ "activationFunction": "selu", "dropout": 0.0107, "learningRate": 0.0000022, + "learningRateDecay": 0.96, "l2reg": 0.001, "aeWabTracking": false, diff --git a/example/traingnn.json b/example/traingnn.json index 714fa80a..5536f700 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -2,13 +2,13 @@ "py/object": "dfpl.options.GnnOptions", "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", - "epochs": 2, - "num_folds": 2, + "epochs": 4, + "num_folds": 1, "metric": "accuracy", "loss_function": "binary_cross_entropy", "split_type": "random", "dataset_type": 
"classification", "smiles_columns": "smiles", "extra_metrics": ["balanced_accuracy","auc","f1","mcc","recall","precision"], - "hidden_size": 256 + "hidden_size": 300 }