Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ jobs:
echo "predict result directory missing" >&2
exit 1
fi

echo "result lines "$(wc -l preds_dmpnn/DMPNN_preds.csv)
if [ "$(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" -lt "6" ]; then
echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" >&2
Expand Down
165 changes: 59 additions & 106 deletions dfpl/__main__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import dataclasses
import logging
import os.path
import pathlib
import os
from argparse import Namespace
from os import path

import chemprop as cp
import pandas as pd
import chemprop
from keras.models import load_model

from dfpl import autoencoder as ac
Expand All @@ -17,108 +15,60 @@
from dfpl import vae as vae
from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute

# Base directory used to build the hard-coded default paths below.
# NOTE(review): `Path(".").parent` is still `Path(".")`, so the `.parent.parent`
# chain does not climb the directory tree; this evaluates to the absolute
# current working directory at import time. Confirm whether the repository
# root was intended instead.
project_directory = pathlib.Path(".").parent.parent.absolute()
# Options instance with hard-coded paths under `project_directory` describing a
# training run: FNN training on, autoencoder training off, feature compression
# on. It is constructed at import time; no use of it is visible in this
# excerpt -- presumably a console/debug default, verify before relying on it.
test_train_opts = options.Options(
    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
    outputDir=f"{project_directory}/output_data/console_test",
    ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
    type="smiles",
    fpType="topological",
    epochs=100,
    batchSize=1024,
    fpSize=2048,
    encFPSize=256,
    enableMultiLabel=False,
    testSize=0.2,
    kFolds=2,
    verbose=2,
    trainAC=False,
    trainFNN=True,
    compressFeatures=True,
    activationFunction="selu",
    lossFunction="bce",
    optimizer="Adam",
    fnnType="FNN",
)

# Options instance with hard-coded paths for a prediction run that reads the
# same input dataset and points at the encoder and FNN model directories
# under `project_directory`. Also constructed at import time.
test_pred_opts = options.Options(
    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
    outputDir=f"{project_directory}/output_data/console_test",
    outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
    fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
    type="smiles",
    fpType="topological",
)


def traindmpnn(opts: options.GnnOptions) -> None:
    """
    Train a D-MPNN model using the given options.
    Args:
    - opts: options.GnnOptions instance containing the details of the training;
      `opts.gpu` selects the visible CUDA device and `opts.configFile` points
      to the JSON file holding the chemprop training arguments
    Returns:
    - None
    """
    # Restrict CUDA to the requested device before chemprop starts any work.
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
    # Turn the JSON configuration into a chemprop command-line argument list
    # and let chemprop parse and validate it.
    arguments = createArgsFromJson(jsonFile=opts.configFile)
    train_args = chemprop.args.TrainArgs().parse_args(arguments)
    logging.info("Training DMPNN...")
    # Cross-validate and report the mean and standard deviation of the score.
    mean_score, std_score = chemprop.train.cross_validate(
        args=train_args, train_func=chemprop.train.run_training
    )
    logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}")


def predictdmpnn(
    opts: options.GnnOptions, json_arg_path: "str | None" = None
) -> None:
    """
    Predict the values using a trained D-MPNN model with the given options.
    Args:
    - opts: options.GnnOptions instance containing the details of the prediction
    - json_arg_path: optional path to the JSON file holding the chemprop
      prediction arguments; when omitted, `opts.configFile` is used
    Returns:
    - None
    """
    # Accept the explicit path for callers that still pass it; otherwise read
    # the configuration file named in the options.
    config_path = opts.configFile if json_arg_path is None else json_arg_path
    # Turn the JSON configuration into a chemprop command-line argument list
    # and let chemprop parse and validate it.
    arguments = createArgsFromJson(jsonFile=config_path)
    predict_args = chemprop.args.PredictArgs().parse_args(arguments)

    chemprop.train.make_predictions(args=predict_args)


def interpretdmpnn(opts: options.GnnOptions) -> None:
    """
    Run chemprop's interpretation on a trained D-MPNN model.
    Args:
    - opts: options.GnnOptions instance whose `configFile` names the JSON file
      with the chemprop interpretation arguments
    Returns:
    - None
    """
    # Build the chemprop argument list from the JSON configuration, then parse it.
    cli_arguments = createArgsFromJson(jsonFile=opts.configFile)
    interpret_args = chemprop.args.InterpretArgs().parse_args(cli_arguments)

    chemprop.interpret.interpret(args=interpret_args, save_to_csv=True)


def train(opts: options.Options):
"""
Run the main training procedure
:param opts: Options defining the details of the training
"""

os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"

# import data from file and create DataFrame
if "tsv" in opts.inputFile:
df = fp.importDataFile(
Expand All @@ -128,7 +78,7 @@ def train(opts: options.Options):
df = fp.importDataFile(
opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
)
# initialize encoders to None
# initialize (auto)encoders to None
encoder = None
autoencoder = None
if opts.trainAC:
Expand All @@ -142,26 +92,31 @@ def train(opts: options.Options):
# if feature compression is enabled
if opts.compressFeatures:
if not opts.trainAC:
if opts.aeType == "deterministic":
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
elif opts.aeType == "variational":
if opts.aeType == "variational":
(autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
elif opts.ecWeightsFile == "":
else:
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())

if opts.ecWeightsFile == "":
encoder = load_model(opts.ecModelDir)
else:
autoencoder.load_weights(
os.path.join(opts.ecModelDir, opts.ecWeightsFile)
)
# compress the fingerprints using the autoencoder
df = ac.compress_fingerprints(df, encoder)
# ac.visualize_fingerprints(
# df,
# before_col="fp",
# after_col="fpcompressed",
# train_indices=train_indices,
# test_indices=test_indices,
# save_as=f"UMAP_{opts.aeSplitType}.png",
# )
if opts.visualizeLatent and opts.trainAC:
ac.visualize_fingerprints(
df,
train_indices=train_indices,
test_indices=test_indices,
save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png",
)
elif opts.visualizeLatent:
logging.info(
"Visualizing latent space is only available if you train the autoencoder. Skipping visualization."
)

# train single label models if requested
if opts.trainFNN and not opts.enableMultiLabel:
sl.train_single_label_models(df=df, opts=opts)
Expand Down Expand Up @@ -257,24 +212,22 @@ def main():
raise ValueError("Input directory is not a directory")
elif prog_args.method == "traingnn":
traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)

createLogger("traingnn.log")
traindmpnn(traingnn_opts)

elif prog_args.method == "predictgnn":
predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predictgnn_opts,
test_path=makePathAbsolute(predictgnn_opts.test_path),
preds_path=makePathAbsolute(predictgnn_opts.preds_path),
)

logging.info(
f"The following arguments are received or filled with default values:\n{prog_args}"
)

predictdmpnn(fixed_opts, prog_args.configFile)
predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args)
createLogger("predictgnn.log")
predictdmpnn(predictgnn_opts)
elif prog_args.method == "interpretgnn":
interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args)
createLogger("interpretgnn.log")
interpretdmpnn(interpretgnn_opts)

elif prog_args.method == "train":
if prog_args.configFile is None and prog_args.inputFile is None:
parser.error("Either --configFile or --inputFile must be provided.")

train_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
train_opts,
Expand All @@ -288,6 +241,8 @@ def main():
)
train(fixed_opts)
elif prog_args.method == "predict":
if prog_args.configFile is None and prog_args.inputFile is None:
parser.error("Either --configFile or --inputFile must be provided.")
predict_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predict_opts,
Expand All @@ -298,8 +253,6 @@ def main():
),
ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
trainAC=False,
trainFNN=False,
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "predict.log"))
Expand Down
Loading