MOlecular Language for Chemical Assembly NOtation (MOLCANO) is a novel molecular line notation that represents molecules as an unordered sequence of fragment blocks to improve molecule design using generative models. (Accepted in *npj Computational Materials*.)
## Requirements

- Python 3.11.8 or later
- CUDA 12.8 and cuDNN 9.10.2 (if using a GPU)
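A quick sanity check of the environment (a sketch; assumes `python` and, for GPU setups, `nvcc` are on your PATH):

```bash
python --version   # expect Python 3.11.8 or later
nvcc --version     # expect CUDA 12.8 if training on GPU
```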
## Installation

```bash
virtualenv venv
source venv/bin/activate
# install all third-party packages and this project
pip install -r requirements.txt
```

## Third-party packages

This project includes the vendored third-party package t-SMILES, originally developed at: https://github.com/juanniwu/t-SMILES
The version included here is based on a public fork that contains additional bug fixes and modifications, developed at: https://github.com/wglee-incerebro/t-SMILES/
The original MIT license and attribution have been preserved in the vendored directory.
This project also depends on `chemicalgof`, available at: https://github.com/f48r1/chemicalgof

**Important:** The `chemicalgof` repository does not specify an open-source license. For this reason, this project does not redistribute or vendor `chemicalgof`; users must download and install it themselves. To install it:
- Download the `chemicalgof-master.zip` archive from the upstream repository.
- Place the downloaded zip file inside the `third_party/` directory of this project:

  ```
  third_party/
    chemicalgof-master.zip
  ```
- Relax the RDKit requirement. The upstream `chemicalgof` package depends on RDKit; to remain compatible with this project, the RDKit version must be relaxed to:

  ```
  rdkit>=2023.9.5,<2026
  ```

  Update the requirements.txt (or dependency metadata) inside the unzipped `chemicalgof` directory to enforce this relaxed RDKit constraint; one way to do this is shown below.
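  A hypothetical one-liner to apply this after unzipping in the next step (assumes GNU sed and that the archive unpacks to `third_party/chemicalgof-master/`):

  ```bash
  # rewrite the rdkit line of the unpacked requirements file in place
  sed -i 's/^rdkit.*/rdkit>=2023.9.5,<2026/' third_party/chemicalgof-master/requirements.txt
  ```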
- Unzip and install the package:

  ```bash
  unzip third_party/chemicalgof-master.zip -d third_party
  pip install third_party/chemicalgof-master
  ```

## Tests

Test all functionalities with unittest:
```bash
pytest test
```

## Usage

```python
from molcano.converter import decompose, decode, SmilesToCanonicalSmiles
from rdkit import Chem
vDABNA = "c1c2c(cc3c1Nc1cccc4c1B3c1ccccc1N4)B1c3ccccc3Nc3cccc(c31)N2"
DABNA1 = Chem.MolFromSmarts("N1c2ccccc2B2c3ccccc3Nc3cccc1c23")
# User-defined fragment library
PREDEFINED_SUBSTRUCTURES = [DABNA1,]
# SMILES --> MOLCANO
decomposed_fragments = decompose(vDABNA, PREDEFINED_SUBSTRUCTURES)
vDABNA_molcano = '.'.join(decomposed_fragments)
# MOLCANO --> SMILES
vDABNA_smiles = SmilesToCanonicalSmiles(decode(decomposed_fragments))
```
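Continuing the example: MOLCANO's fragment blocks are unordered, so a natural sanity check is that decoding recovers the original molecule. A minimal sketch, assuming (as the example implies) that `decompose`/`decode` round-trip losslessly:

```python
# round-trip check: the decoded SMILES should canonicalize to the input molecule
assert vDABNA_smiles == SmilesToCanonicalSmiles(vDABNA)
print(vDABNA_molcano)  # the MOLCANO string: fragment blocks joined by '.'
```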

## De novo pretraining

The following script pretrains a de novo generation model on the DRUG dataset, preparing the processed data first if it is missing:

```bash
# Data
export dataset=DRUG
export text_column=renumbered_minimal
export processed_data=expts/processed_data/${dataset}
export data_sources=expts/data/DRUG/pretrain_implicit.csv
export eval_dataset=expts/data/DRUG/train_w_sim_implicit.csv
export tokenizer_root=./expts/tokenizer/${dataset}_${text_column}_implicit_tokenizer_hf_proc
# hyper-parameters
export model=GPT_small
export OUTPUT_DIR=expts/run/${model}/${text_column}/de_novo
export lr_scheduler_type=cosine
export label_smoothing_factor=0.0
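# training-time augmentation: permute_fragment/permute_index randomize the
# (unordered) fragment sequence; traversal_method sets the fragment-graph
# traversal used during linearization (BFS here)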
export permute_fragment=True
export permute_index=True
export traversal_method=bfs
export learning_rate=0.0003
export model_dropout=0.3
export adam_beta1=0.9
export adam_beta2=0.98
export weight_decay=0.01
export max_grad_norm=5.0
export warmup_steps=3000
export eval_steps=5000
export save_steps=20000
export max_steps=100000
export logging_steps=1000
export max_eval_samples=1000
export per_device_train_batch_size=64
export gradient_accumulation_steps=1
export per_device_eval_batch_size=128
export decoding_batch_size=128
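# one-time preparation of the processed dataset, run on CPU
# (CUDA_VISIBLE_DEVICES emptied); skipped if ${processed_data} already exists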
if [[ ! -s ${processed_data} ]]; then
CUDA_VISIBLE_DEVICES= molcano-prepare \
--data_sources ${data_sources} \
--text_column ${text_column} \
--eval_dataset ${eval_dataset} \
--output_dir ${processed_data}
fi
molcano-train \
--do_train \
--config configs/${model}_config.json \
--output_dir ${OUTPUT_DIR} \
--num_of_class 1 \
--torch_compile True \
--optim adamw_torch \
--streaming True \
--permute_fragment ${permute_fragment} \
--permute_index ${permute_index} \
--traversal_method ${traversal_method} \
--dataloader_num_workers 8 \
--max_steps ${max_steps} \
--logging_steps ${logging_steps} \
--eval_strategy steps \
--eval_steps ${eval_steps} \
--save_steps ${save_steps} \
--max_eval_samples ${max_eval_samples} \
--save_only_model True \
--load_best_model_at_end True \
--metric_for_best_model eval_loss \
--dataset ${processed_data} \
--train_dataset ${data_sources} \
--eval_dataset ${eval_dataset} \
--tokenizer ${tokenizer_root} \
--text_column ${text_column} \
--learning_rate ${learning_rate} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--model_dropout ${model_dropout} \
--decoding_batch_size ${decoding_batch_size} \
--lr_scheduler_type ${lr_scheduler_type} \
--warmup_steps ${warmup_steps} \
--weight_decay ${weight_decay} \
--adam_beta1 ${adam_beta1} \
--adam_beta2 ${adam_beta2} \
--max_grad_norm ${max_grad_norm} \
--label_smoothing_factor ${label_smoothing_factor}
```

## Superstructure generation

This stage fine-tunes the pretrained de novo checkpoint for scaffold-conditioned superstructure generation (`task_type=superstructure`). Run it in the same shell as the de novo script: it reuses earlier exports such as `model`, `permute_fragment`, and `learning_rate`. Note that `remove_full_valency` and `random_numbering` are referenced below but never exported in these snippets, so set them first.

```bash
# Data
export dataset=DRUG
export data_sources=expts/data/DRUG/train_w_sim_implicit.csv
export eval_dataset=expts/data/DRUG/test_w_sim_implicit.csv
export train_dataset=expts/data/DRUG_all/pretrain_plus_train_implicit.csv
export text_column=renumbered_minimal
export processed_data=expts/processed_data/${dataset}-SSG
export tokenizer_root=./expts/tokenizer/${dataset}_${text_column}_implicit_tokenizer_hf_proc
# hyper-parameters
export task_type=superstructure
export eval_steps=1000
export save_steps=5000
export max_steps=101000
export logging_steps=100
export max_eval_samples=1000
export decoding_num_samples=200
export OUTPUT_DIR=expts/run/${model}/${text_column}/ssg
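# warm-start SSG training from the pretrained de novo checkpoint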
export resume_from_checkpoint=expts/run/${model}/${text_column}/de_novo
if [[ ! -s ${processed_data} ]]; then
CUDA_VISIBLE_DEVICES= molcano-prepare \
--data_sources ${data_sources} \
--text_column ${text_column} \
--eval_dataset ${eval_dataset} \
--output_dir ${processed_data}
fi
molcano-train \
--do_train \
--config configs/${model}_config.json \
--output_dir ${OUTPUT_DIR} \
--num_of_class 1 \
--torch_compile True \
--optim adamw_torch \
--streaming True \
--permute_fragment ${permute_fragment} \
--permute_index ${permute_index} \
--traversal_method ${traversal_method} \
--remove_full_valency ${remove_full_valency} \
--random_numbering ${random_numbering} \
--dataloader_num_workers 8 \
--max_steps ${max_steps} \
--logging_steps ${logging_steps} \
--eval_strategy steps \
--eval_steps ${eval_steps} \
--save_steps ${save_steps} \
--max_eval_samples ${max_eval_samples} \
--save_only_model True \
--metric_for_best_model eval_loss \
--decoding_num_samples ${decoding_num_samples} \
--decoding_bad_words A a \
--task_type ${task_type} \
--resume_from_checkpoint ${resume_from_checkpoint} \
--dataset ${processed_data} \
--train_dataset ${train_dataset} \
--eval_dataset ${eval_dataset} \
--tokenizer ${tokenizer_root} \
--text_column ${text_column} \
--scaffold_column ${text_column} \
--learning_rate ${learning_rate} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--model_dropout ${model_dropout} \
--decoding_batch_size ${decoding_batch_size} \
--lr_scheduler_type ${lr_scheduler_type} \
--warmup_steps ${warmup_steps} \
--weight_decay ${weight_decay} \
--adam_beta1 ${adam_beta1} \
--adam_beta2 ${adam_beta2} \
--max_grad_norm ${max_grad_norm} \
--label_smoothing_factor ${label_smoothing_factor}
```

## Reinforcement learning fine-tuning

`molcano-trl` further fine-tunes the de novo checkpoint with reinforcement learning, using a GRPO loss and a REINVENT-style reward over `qed`, `sas`, and `similarity` terms, with optional experience replay. It likewise reuses earlier exports (e.g. `model`, `processed_data`, and the batch-size settings):

```bash
export dataset=DRUG
export data_sources=expts/data/DRUG/pretrain_implicit.csv
export eval_dataset=expts/data/DRUG/train_w_sim_implicit.csv
export text_column=renumbered_minimal
export tokenizer_root=./expts/tokenizer/${dataset}_${text_column}_implicit_tokenizer_hf_proc
# hyper-parameters
export eval_steps=50
export max_steps=500
export logging_steps=10
export reinvent_batch_size=2048
export reinvent_mini_batch_size=512
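# reward given as name:weight terms: QED weighted 2.0, SAS 0.1,
# similarity 0.0 (i.e. effectively disabled)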
export reinvent_reward_fn="qed:2.0 sas:0.1 similarity:0.0"
export loss_type=grpo
export beta=0.1
export importance_sampling_level=sequence
export use_experience_replay=True
export prioritize_experience_replay=False
export learning_rate=5e-5
export num_inner_epoch=10
export OUTPUT_DIR=expts/run/${model}/${text_column}/trl
export resume_from_checkpoint=expts/run/${model}/${text_column}/de_novo
CUDA_VISIBLE_DEVICES=0 \
molcano-trl \
--config configs/${model}_config.json \
--output_dir ${OUTPUT_DIR} \
--num_of_class 1 \
--optim adamw_torch \
--streaming True \
--loss_type ${loss_type} \
--beta ${beta} \
--importance_sampling_level ${importance_sampling_level} \
--use_experience_replay ${use_experience_replay} \
--prioritize_experience_replay ${prioritize_experience_replay} \
--reinvent_reward_fn ${reinvent_reward_fn} \
--reinvent_batch_size ${reinvent_batch_size} \
--reinvent_mini_batch_size ${reinvent_mini_batch_size} \
--reinvent_similarity_target ${eval_dataset} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--dataloader_num_workers 8 \
--max_steps ${max_steps} \
--eval_steps ${eval_steps} \
--logging_steps ${logging_steps} \
--eval_on_start \
--eval_strategy steps \
--resume_from_checkpoint ${resume_from_checkpoint} \
--dataset ${processed_data} \
--train_dataset ${data_sources} \
--tokenizer ${tokenizer_root} \
--text_column ${text_column} \
--scaffold_column ${text_column} \
--model_dropout ${model_dropout} \
--permute_fragment ${permute_fragment} \
--permute_index ${permute_index} \
--traversal_method ${traversal_method} \
--lr_scheduler_type cosine \
--learning_rate ${learning_rate} \
--decoding_batch_size ${decoding_batch_size} \
--decoding_bad_words A a
```
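For intuition, here is a hypothetical sketch of how a `name:weight` reward spec like `reinvent_reward_fn` above might be parsed and applied. The actual `molcano-trl` implementation may differ; `parse_reward_fn` and `qed_reward` are illustrative names:

```python
from rdkit import Chem
from rdkit.Chem import QED

def parse_reward_fn(spec: str) -> dict[str, float]:
    """Parse 'name:weight' pairs, e.g. 'qed:2.0 sas:0.1' -> {'qed': 2.0, 'sas': 0.1}."""
    return {name: float(w) for name, w in (term.split(":") for term in spec.split())}

def qed_reward(smiles: str, weight: float) -> float:
    """Weighted QED term; invalid molecules score 0."""
    mol = Chem.MolFromSmiles(smiles)
    return weight * QED.qed(mol) if mol is not None else 0.0

weights = parse_reward_fn("qed:2.0 sas:0.1 similarity:0.0")
print(weights)                                   # {'qed': 2.0, 'sas': 0.1, 'similarity': 0.0}
print(qed_reward("c1ccccc1O", weights["qed"]))   # weighted QED of phenol
```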

## Evaluation

Finally, `molcano-eval` scores generated molecules; here it computes QED and SAS for the de novo samples in `${OUTPUT_DIR}/de_novo_10000.csv`:

```bash
molcano-eval \
--input-files ${OUTPUT_DIR}/de_novo_10000.csv \
--output-dir ${OUTPUT_DIR} \
--functions qed sas \
--columns target_keys
```