LeMaterial · jwtoney · May 25, 2026 · sid-betalol · May 26, 2026 · sid-betalol
diff --git a/.gitignore b/.gitignore
@@ -206,6 +206,10 @@ tempCodeRunnerFile.py
 # Ruff stuff:
 .ruff_cache/
 
+# step-up training artifacts and local data
+outputs/
+data/processed/
+
 # PyPI configuration file
 .pypirc
 

diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "external/ReBIND"]
+	path = external/ReBIND
+	url = https://github.com/holymollyhao/ReBIND.git
diff --git a/README.md b/README.md
@@ -4,16 +4,23 @@ A benchmark for 2D to 3D conformer generation models.
 
 ## Getting Started
 
-This project uses [`uv`](https://docs.astral.sh/uv/) for Python, dependency, and
-environment management.
+This project uses [`uv`](https://docs.astral.sh/uv/) for Python, dependency, and environment management.
+
+The first model (ReBind) is vendored as a git submodule under `external/ReBIND`, so clone with `--recursive`:
 
 ```bash
-git clone https://github.com/LeMaterial/step-up.git
+git clone --recursive https://github.com/LeMaterial/step-up.git
 cd step-up
 uv sync --dev
 ```
 
-Run the test suite:
+If you already cloned without `--recursive`:
+
+```bash
+git submodule update --init --recursive
+```
+
+Run the test suite (CPU-only, ~30s):
 
 ```bash
 uv run pytest
@@ -26,15 +33,100 @@ uv run ruff format --check
 uv run ruff check
 ```
 
-## Usage
+## CPU Smoke Runs
+
+Each config under `configs/*_smoke.yaml` trains a tiny model on a 100-molecule
+subset for 3 epochs on CPU. The goal is to verify the pipeline end-to-end (data
+loader --> model forward --> loss --> optimizer), not to produce meaningful
+metrics. Loss should decrease monotonically across epochs.
+
+```bash
+uv run step-up train -c configs/qm9_smoke.yaml      # ~25s
+uv run step-up train -c configs/tmqmg_smoke.yaml    # ~5 min (larger complexes)
+uv run step-up train -c configs/bostmc_smoke.yaml   # ~6 min (larger complexes)
+```
+
+## Production Training
+
+Three full-scale configs are staged. They target GPU (`device: cuda`) and use
+ReBind's published QM9 hyperparameters (8 encoder + 8 decoder layers,
+d_model=512, lr=9e-5, AdamW, batch size 100 for all three datasets,
+20 epochs by default).
+
+| Config | Dataset | Rows | Notes |
+|---|---|---|---|
+| `configs/qm9.yaml` | QM9-full.csv | ~134K | Sanity baseline on organic systems |
+| `configs/tmqmg.yaml` | tmQMg-full.csv | ~60K | Singlets, full d-block + La |
+| `configs/bostmc.yaml` | BOSTMC-low-spin.csv | ~140K | Singlets + doublets, full d-block |
+
+To dry-run a config (validates the YAML and dataset path without training):
+
+```bash
+uv run step-up train -c configs/qm9.yaml --dry-run
+```
+
+To launch the full training runs on a Slurm cluster:
+
+```bash
+sbatch scripts/train.sh configs/qm9.yaml
+sbatch scripts/train.sh configs/tmqmg.yaml
+sbatch scripts/train.sh configs/bostmc.yaml
+```
+
+The Slurm script (1) initializes the submodule, (2) runs `uv sync --dev`, and
+(3) launches `uv run step-up train -c <config>`. Each run writes its config,
+TensorBoard logs, and best checkpoint (by val D-MAE) to the `output_dir`
+specified in the config; the default is `outputs/<dataset>_full/`.
+
+### Adjusting training duration
 
-The benchmark is in early development. As functionality is added, reusable code
-will live under the `step_up` Python package and can be run through `uv`:
+To train longer, edit the config's `epochs` field. ReBind's paper used 20
+epochs; more epochs only help if val D-MAE is still trending down at the end.
+Inspect TensorBoard during the run:
 
 ```bash
-uv run python
+uv run tensorboard --logdir outputs/qm9_full/tb
 ```
 
+### Resuming after compute interruptions
+
+The current loop saves `best.pt` (model weights only) on the best val D-MAE
+epoch but doesn't snapshot the optimizer / scheduler state. Resumption is a
+TODO for the next iteration. For long runs, prefer over-provisioning the time
+budget in the Slurm header.
+
+## Repository Layout
+
+```
+src/step_up/
+|-- data/
+|   |-- csv_dataset.py    # streaming CSV --> graph-dict dataset
+|   |-- featurize.py      # XYZ path (RDKit DetermineBonds for QM9)
+|   |-- mol2.py           # direct MOL2 parser (no RDKit, used for organometallics)
+|   |-- splits.py
+|-- models/
+|   |-- rebind.py         # thin wrapper over external/ReBIND + 3 runtime patches
+|-- eval/
+|   |-- metrics.py        # D-MAE, D-RMSE, coord-RMSD, per-element D-MAE
+|-- train.py              # config-driven training loop
+|-- cli.py                # `uv run step-up train -c <yaml>`
+
+configs/           # per-dataset YAML configs (smoke + full)
+external/ReBIND/   # git submodule, vendored upstream ReBind
+scripts/train.sh
+tests/             # 9 tests covering imports, dataset loading, forward pass, metrics
+```
+
+## Data Path Notes
+
+- **QM9 (smiles + xyz)**: built from the XYZ block via
+  `Chem.MolFromXYZBlock` + `rdDetermineBonds.DetermineBonds`. The SMILES
+  column is intentionally unused because its atom order doesn't match the
+  XYZ block in QM9-full.csv.
+- **Organometallics (mol2 + xyz)**: built directly from the MOL2 block via
+  the in-house parser in `step_up/data/mol2.py`. No RDKit is involved;
+  connectivity and bond types come straight from the MOL2 file, SYBYL atom
+  types provide hybridization and aromaticity (e.g. `C.ar`, `N.am`), and rings
+  are computed from the bond graph via Tarjan bridge-finding.
+
 ## Contributing
 
 See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, pre-commit hooks,

diff --git a/configs/bostmc.yaml b/configs/bostmc.yaml
@@ -0,0 +1,28 @@
+# Full BOSTMC-low-spin training run (~140K complexes, singlets + doublets, full d-block). GPU-only; staged for Slurm.
+# DESIGN DECISION (flagged in plan file):
+#   - `charge` and `spinmult` columns are currently IGNORED.
+#   - Recommendation for first publishable run: pre-filter to singlets-only (spinmult == 1) for the cleanest comparison to tmQMg. The CSVMoleculeDataset does not yet expose a filter knob; add one when this config is first run.
+#   - Follow-up: condition the model on (charge, spinmult) as a global feature.
+dataset_path: /home/gridsan/jtoney/BOSTMC/datasets/BOSTMC-low-spin.csv
+dataset_source: mol2
+subset_size: null
+split_ratios: [0.9, 0.05, 0.05]
+split_seed: 0
+
+n_layers: 8
+d_model: 512
+d_ffn: 1024
+n_head: 8
+dropout: 0.0
+
+epochs: 20
+batch_size: 100
+eval_batch_size: 100
+lr: 9.0e-5
+weight_decay: 0.0
+warmup_ratio: 0.1
+num_workers: 4
+
+device: cuda
+output_dir: outputs/bostmc_full
+seed: 0
diff --git a/configs/bostmc_smoke.yaml b/configs/bostmc_smoke.yaml
@@ -0,0 +1,25 @@
+# BOSTMC smoke run on a 100-molecule subset. Verifies the direct MOL2 parser
+# end-to-end (no RDKit) plus the LJ patch on d-block elements.
+dataset_path: /home/gridsan/jtoney/BOSTMC/datasets/BOSTMC-low-spin.csv
+dataset_source: mol2
+subset_size: 100
+split_ratios: [0.8, 0.1, 0.1]
+split_seed: 0
+
+n_layers: 2
+d_model: 64
+d_ffn: 128
+n_head: 4
+dropout: 0.0
+
+epochs: 3
+batch_size: 4
+eval_batch_size: 4
+lr: 3.0e-4
+weight_decay: 0.0
+warmup_ratio: 0.1
+num_workers: 0
+
+device: cpu
+output_dir: outputs/bostmc_smoke
+seed: 0
diff --git a/configs/qm9.yaml b/configs/qm9.yaml
@@ -0,0 +1,26 @@
+# Full QM9-full.csv training run (~134K molecules).
+# GPU-only, staged for the Slurm job once compute is available.
+# Mirrors ReBind's QM9 hyperparams from external/ReBIND/experiments/conformer_prediction/rebind.sh.
+dataset_path: /home/gridsan/jtoney/ElemNet/benchmarking/datasets/QM9-full.csv
+dataset_source: smiles
+subset_size: null
+split_ratios: [0.9, 0.05, 0.05]
+split_seed: 0
+
+n_layers: 8
+d_model: 512
+d_ffn: 1024
+n_head: 8
+dropout: 0.0
+
+epochs: 20
+batch_size: 100
+eval_batch_size: 100
+lr: 9.0e-5
+weight_decay: 0.0
+warmup_ratio: 0.1
+num_workers: 4
+
+device: cuda
+output_dir: outputs/qm9_full
+seed: 0
diff --git a/configs/qm9_smoke.yaml b/configs/qm9_smoke.yaml
@@ -0,0 +1,26 @@
+# Smallest possible run that exercises the full pipeline on CPU.
+# Goal: prove the loss strictly decreases. Numbers are not meaningful.
+dataset_path: /home/gridsan/jtoney/ElemNet/benchmarking/datasets/QM9-full.csv
+dataset_source: smiles
+subset_size: 100
+split_ratios: [0.8, 0.1, 0.1]
+split_seed: 0
+
+# Tiny model — enough capacity to learn distance regression on 80 toy molecules.
+n_layers: 2
+d_model: 64
+d_ffn: 128
+n_head: 4
+dropout: 0.0
+
+epochs: 3
+batch_size: 8
+eval_batch_size: 8
+lr: 3.0e-4
+weight_decay: 0.0
+warmup_ratio: 0.1
+num_workers: 0
+
+device: cpu
+output_dir: outputs/qm9_smoke
+seed: 0
diff --git a/configs/tmqmg.yaml b/configs/tmqmg.yaml
@@ -0,0 +1,24 @@
+# Full tmQMg-full.csv training run (~60K organometallic complexes, closed-shell singlets, full d-block + La). GPU-only; staged for Slurm.
+dataset_path: /home/gridsan/jtoney/ElemNet/benchmarking/datasets/tmQMg-full.csv
+dataset_source: mol2
+subset_size: null
+split_ratios: [0.9, 0.05, 0.05]
+split_seed: 0
+
+n_layers: 8
+d_model: 512
+d_ffn: 1024
+n_head: 8
+dropout: 0.0
+
+epochs: 20
+batch_size: 100
+eval_batch_size: 100
+lr: 9.0e-5
+weight_decay: 0.0
+warmup_ratio: 0.1
+num_workers: 4
+
+device: cuda
+output_dir: outputs/tmqmg_full
+seed: 0
diff --git a/configs/tmqmg_smoke.yaml b/configs/tmqmg_smoke.yaml
@@ -0,0 +1,25 @@
+# Same shape as qm9_smoke, but reading the MOL2 path on organometallic data.
+# Verifies the LJ patch and the MOL2 → graph pipeline end-to-end on CPU.
+dataset_path: /home/gridsan/jtoney/ElemNet/benchmarking/datasets/tmQMg-full.csv
+dataset_source: mol2
+subset_size: 100
+split_ratios: [0.8, 0.1, 0.1]
+split_seed: 0
+
+n_layers: 2
+d_model: 64
+d_ffn: 128
+n_head: 4
+dropout: 0.0
+
+epochs: 3
+batch_size: 4   # tmQMg complexes are larger (~50 atoms vs ~15 for QM9)
+eval_batch_size: 4
+lr: 3.0e-4
+weight_decay: 0.0
+warmup_ratio: 0.1
+num_workers: 0
+
+device: cpu
+output_dir: outputs/tmqmg_smoke
+seed: 0
diff --git a/external/ReBIND b/external/ReBIND
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,13 +1,28 @@
 [project]
 name = "step-up"
 version = "0.1.0"
-description = "Add your description here"
+description = "Benchmark for 2D to 3D molecular conformer generation models"
 readme = "README.md"
 authors = [
-    { name = "Siddharth Betala", email = "betalas5@gmail.com" }
+    { name = "Siddharth Betala", email = "betalas5@gmail.com" },
+    { name = "Jacob Toney", email = "jwt@mit.edu" }
 ]
 requires-python = ">=3.12"
-dependencies = []
+dependencies = [
+    "numpy>=1.26",
+    "pandas>=2.2",
+    "pyyaml>=6.0",
+    "rdkit>=2024.3.3",
+    "tensorboard>=2.16",
+    "torch>=2.1",
+    "torch-geometric>=2.3",
+    "torchmetrics>=1.0",
+    "tqdm>=4.66",
+    "transformers>=4.43",
+]
+
+[project.scripts]
+step-up = "step_up.cli:main"
 
 [build-system]
 requires = ["uv_build>=0.10.8,<0.11.0"]
@@ -38,3 +53,19 @@ max-complexity = 12
 [tool.ruff.format]
 quote-style = "double"
 indent-style = "space"
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+filterwarnings = [
+    "ignore::DeprecationWarning",
+    "ignore::UserWarning",
+]
+
+# The CUDA 12.4 index only serves Linux/Windows wheels; other platforms (e.g. macOS) resolve torch from PyPI.
+[tool.uv.sources]
+torch = [{ index = "pytorch-cu124", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }]
+
+[[tool.uv.index]]
+name = "pytorch-cu124"
+url = "https://download.pytorch.org/whl/cu124"
+explicit = true
diff --git a/scripts/train.sh b/scripts/train.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --job-name=step-up
+#SBATCH --gres=gpu:volta:1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --output=train.out
+
+# Usage: sbatch scripts/train.slurm configs/qm9.yaml
+#   - Edit partition / gres / time to match your cluster.
+#   - The job runs from the repo root and uses uv for everything.
+
+set -euo pipefail
+
+if [[ $# -lt 1 ]]; then
+  echo "Usage: $0 <config.yaml>"
+  exit 2
+fi
+CONFIG="$1"
+
+cd "$(dirname "$0")/.."
+
+# Make sure the submodule is initialized in case the job runs on a fresh checkout.
+git submodule update --init --recursive
+
+uv sync --dev
+
+uv run step-up train -c "$CONFIG"
diff --git a/src/step_up/__init__.py b/src/step_up/__init__.py
@@ -1,2 +1,3 @@
-def hello() -> str:
-    return "Hello from step-up!"
+"""step-up: a benchmark for 2D to 3D molecular conformer generation models."""
+
+__version__ = "0.1.0"  # keep in sync with `version` in pyproject.toml