diff --git a/.gitignore b/.gitignore
index 461c5c9732..be0cbb22bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,7 +81,7 @@ instance/
 # Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
+# Sphinx documentation (if sphinx is later added; docs/*.md is tracked)
 docs/_build/
 
 # PyBuilder
diff --git a/README.md b/README.md
index 99f1cbbed5..a50723c7f2 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,8 @@ python scripts/submit_cluster.py \
 
 `scripts/cluster_configs/nyu_greene.yaml` defines `account`, `gpus`, `cpus`, `mem`, `time` — edit `account` to your allocation before first submit. `--container` makes `submit_cluster.py` wrap the job command in `singularity exec --nv --overlay $OVERLAY_PATH:ro $IMAGE_PATH ...`.
 
+**For the full guide to running training on a SLURM cluster, see [`docs/cluster_training.md`](docs/cluster_training.md).**
+
 ## Data
 
 Place binaries under `pufferlib/resources/drive/binaries/`.
diff --git a/docs/cluster_training.md b/docs/cluster_training.md
new file mode 100644
index 0000000000..e4963ea2b6
--- /dev/null
+++ b/docs/cluster_training.md
@@ -0,0 +1,180 @@
+# Cluster training — operational guide
+
+How to run PufferDrive training on a SLURM cluster. This is written with the NYU cluster in mind but it should mostly hold for any SLURM cluster.
+
+## A quick overview of the setup and launch process
+
+```bash
+# One-time per cluster: create the singularity overlay and install deps
+# into the venv (this also installs submitit and the other submission
+# deps as part of the project's pyproject.toml).
+./scripts/setup_container.sh create-overlay
+sbatch --account=<acct> --gres=gpu:1 --cpus-per-task=8 --mem=32gb --time=60 \
+    --wrap "./scripts/setup_container.sh install"
+
+# If code changes, or we haven't built before, rebuild the C code in the container ($LOGDIR must point at an existing directory)
+sbatch --account=<acct> --partition=cpu_short --cpus-per-task=8 --mem=16gb --time=20 \
+    --chdir=$PWD -o $LOGDIR/rebuild_%j.log \
+    --wrap "./scripts/setup_container.sh rebuild"
+
+# Training: source the venv on the login node, then submit_cluster.py
+# with --container --heartbeat. --main defaults to RL training; override
+# it to launch other modes (e.g. mining, eval).
+source /scratch/$USER/venvs/pufferdrive/bin/activate
+python scripts/submit_cluster.py \
+    --save_dir /scratch/$USER/runs \
+    --compute_config scripts/cluster_configs/nyu_greene.yaml \
+    --program_config scripts/cluster_configs/train_base.yaml \
+    --container --heartbeat \
+    --account <acct> --partition <gpu-partition> --time 2880 \
+    --args train.checkpoint_interval=250 env.simulation_mode=gigaflow # use this to override config args
+```
+
+## Container model
+
+PufferDrive on Greene runs inside a singularity container. The container provides
+a modern glibc + CUDA toolkit; the project's Python environment lives in a venv
+on `/scratch` so installs aren't bottlenecked by the slow process of building a venv inside a container.
+
+The container is invoked with a **read-only** overlay mount; the miniforge3 base interpreter and the
+project venv both live on `/scratch`, outside the overlay. As an example of running such a command:
+```bash
+singularity exec --nv \
+    --overlay /scratch/$USER/images/PufferDrive/overlay-15GB-500K.ext3:ro \
+    /share/apps/images/cuda12.8.1-cudnn9.8.0-ubuntu24.04.2.sif \
+    bash -c '
+        source /scratch/$USER/venvs/pufferdrive/bin/activate
+        export PYTHONNOUSERSITE=1
+        cd /scratch/$USER/code/PufferDrive
+        <your command>
+    '
+```
+
+## Submitting training — `submit_cluster.py`
+
+`scripts/submit_cluster.py` is the canonical submission path. It composes:
+- a `compute_config` YAML (SLURM settings)
+- a `program_config` YAML (pufferl training args)
+- `--args` CLI overrides
+- a `singularity exec` wrapper around the inner train command, when `--container` is set
+- an optional GPU heartbeat, injected when `--heartbeat` is set. WARNING: this exists specifically for the torch cluster, to prevent our jobs from being killed. No one else should use it.
+
+It performs code isolation (symlinks the
+top-level entries + hard-copies `pufferlib/` into a per-run sandbox), and
+hands the package to `submitit` for `sbatch`-submission.
+
+### Source the venv before invoking `submit_cluster.py`
+
+`setup_container.sh install` puts submitit + its deps into the project
+venv at `/scratch/$USER/venvs/pufferdrive/`. Sourcing the venv on the
+login node makes that submitit importable and lines up `sys.executable`
+with the same venv python that the compute node will run, so submitit's
+serialization round-trips cleanly.
+
+```bash
+source /scratch/$USER/venvs/pufferdrive/bin/activate
+python scripts/submit_cluster.py \
+    --save_dir /scratch/$USER/runs \
+    --prefix mytrain \
+    --compute_config scripts/cluster_configs/nyu_greene.yaml \
+    --program_config scripts/cluster_configs/train_base.yaml \
+    --account <acct> --partition <gpu-partition> --time 2880 \
+    --container \
+    --heartbeat \
+    --args \
+        train.total_timesteps=10000000000 \
+        train.checkpoint_interval=250
+```
+
+Key flags:
+
+| Flag | Effect |
+|---|---|
+| `--container` | wraps both submitit's outer launcher and the inner train command in `singularity exec --nv --overlay $OVERLAY:ro $IMAGE` |
+| `--heartbeat` | wraps the train command in a brace group that backgrounds `python scripts/gpu_heartbeat.py` preventing the cluster from killing your job due to low GPU usage |
+| `--args key=value ...` | passes nested config keys (underscores converted to dashes) as `--key value` on the torchrun line; e.g. `env.simulation_mode=replay` becomes `--env.simulation-mode replay` |
+| `--account` / `--partition` / `--time` | override `compute_config` SLURM settings |
+
+### GPU heartbeat — required for long runs
+
+On the cluster it was written for, `--heartbeat` is not optional for jobs over ~2 hours. Without it, the
+cluster's idle-GPU reclaimer issues a `scancel` from `uid 0` (root) during
+the first eval / checkpoint dip in GPU utilization.
+
+`scripts/gpu_heartbeat.py` monitors `nvidia-smi` and runs short matmul bursts
+when utilization drops below 65%, so the cluster always sees the GPU as
+active. It cooperates with training and steps aside when training is busy.
+
+### Environment knobs the container path sets
+
+When `--container` is on, the inner bash command runs this preamble (venv activation
+plus env vars) before `cd $PROJECT_ROOT && <train>`:
+
+```bash
+source /scratch/$USER/venvs/pufferdrive/bin/activate
+export PYTHONNOUSERSITE=1
+export XDG_CACHE_HOME=/scratch/$USER/cache
+export WANDB_CACHE_DIR=/scratch/$USER/wandb_cache
+export WANDB_CONFIG_DIR=/scratch/$USER/wandb_config
+export WANDB_DATA_DIR=/scratch/$USER/wandb_data
+export WANDB_DIR=/scratch/$USER/wandb_data
+```
+
+## CPU rebuild path
+
+GPU partitions are routinely saturated by training jobs. `setup_container.sh
+rebuild` doesn't need a GPU — submit to a CPU partition for fast turnaround:
+
+```bash
+sbatch --account=<general-account> --partition=cpu_short \
+    --cpus-per-task=8 --mem=16gb --time=20 \
+    --chdir=$PWD \
+    -o /scratch/$USER/rebuild_logs/rebuild_%j.log \
+    --wrap "./scripts/setup_container.sh rebuild"
+```
+
+`--chdir=$PWD` is required because the wrapped command uses the relative path `./scripts/setup_container.sh`. The rebuild takes ~40s.
+
+### Common pitfalls
+
+- **`ncclCommShrink` undefined symbol** at `from torch._C import *`. Greene's
+  cuda12.8.1 sif ships `libnccl 2.25.1` in `/usr/lib`, but torch ≥ 2.10 calls
+  `ncclCommShrink` from NCCL ≥ 2.27.5. torch's own NCCL 2.27.5 sits in
+  `site-packages/nvidia/nccl/lib/` and needs to win the loader search.
+  `setup_container.sh install`/`rebuild` patches `/ext3/env.sh` to prepend that
+  dir to `LD_LIBRARY_PATH`; existing overlays from before that patch need the
+  same line appended to `/ext3/env.sh`.
+- **`-lomp5` link errors on Linux** with conda-forge openmp. The default is for
+  older Intel OpenMP packaging. `setup.py` honors `OMP_LIB="-L$prefix/lib -lomp"`.
+- **`du /ext3` undercounts** when the overlay has cruft outside `upper/ext3/`
+  (e.g. failed pip installs that wrote to `/usr/local/lib/...` end up in
+  `upper/usr/local/` and aren't visible to apptainer's view). Use
+  `debugfs -R "ls /upper" overlay.ext3` from a login node to inspect.
+
+### `TORCH_CUDA_ARCH_LIST`: a quick warning that won't generally be an issue
+
+PufferDrive's C extension contains CUDA kernels. When `setup.py build_ext`
+compiles them, `nvcc` emits machine code for each architecture listed in
+the `TORCH_CUDA_ARCH_LIST` env var (and only those); the result is a large binary containing one variant per arch. If the env var is unset, the build
+defaults to whatever GPU was visible to the compiler at build time, which is often
+just one architecture.
+
+On Greene, you frequently don't get to
+choose which GPU you land on. `_general` accounts queue across L40S
+(sm_89), H100 (sm_90), and H200 (sm_90); `_tandon_*` partitions add A100
+(sm_80). If the `_C.so` was built against only sm_80 and your job lands on
+an H100, every CUDA call into the extension dies with
+`no kernel image is available for execution on the device`.
+
+Setting `TORCH_CUDA_ARCH_LIST="8.0;8.9;9.0"` covers A100 / L40S / H100+H200
+in one fat binary — the build is a bit slower (three variants instead of
+one) and the `.so` is a bit larger, but the resulting binary runs on every
+GPU Greene routes you to.
+
+`setup_container.sh rebuild` exports this automatically for the build step,
+so a fresh rebuild on the cluster is already multi-arch. The env var only
+matters when you build the C extension **outside** the rebuild wrapper —
+e.g. an interactive `python setup.py build_ext --inplace --force` inside a
+hand-launched singularity exec. Adding the export to your shell profile
+(or sourcing it before any manual build) saves you from hitting the "no
+kernel image" error after a quick fix-and-rebuild loop.
diff --git a/scripts/setup_container.sh b/scripts/setup_container.sh
index bf1b8f2c33..7623039f2d 100755
--- a/scripts/setup_container.sh
+++ b/scripts/setup_container.sh
@@ -4,12 +4,16 @@
 # with older glibc versions.
 #
 # Architecture:
-#   - The overlay is used ONLY for the miniforge3 base Python interpreter.
-#   - All Python packages (torch, pufferlib, etc.) live in a venv on /scratch
-#     (regular ext4) instead of the overlay (fuse2fs single-threaded ~10 MB/s).
-#     This makes installs/rebuilds ~50x faster than the all-in-overlay approach.
-#   - At runtime the venv's bin/python symlinks back to /ext3/miniforge3, which
-#     is why we still mount the overlay (read-only) when activating the venv.
+#   - miniforge3 lives on /scratch (NOT in the overlay) so its python is a
+#     real file accessible from any node, in or out of singularity. The venv
+#     symlinks `bin/python` into the /scratch miniforge3, which makes
+#     `source venv/activate` work on the login node directly without
+#     needing to enter the container.
+#   - All Python packages (torch, pufferlib, etc.) live in the venv on /scratch
+#     too — fuse2fs is not on the write path for any install step.
+#   - The singularity image still supplies CUDA + cuDNN at job runtime. The
+#     overlay is preserved for the rare case where you need to install
+#     system-level tools, but it's not used for the standard python flow.
 #
 # Usage:
 #   1. Create an overlay (one time): ./setup_container.sh create-overlay
@@ -28,8 +32,15 @@ CONTAINER_DIR="${CONTAINER_DIR:-$(dirname "$OVERLAY_PATH")}"
 PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 # Venv lives on /scratch (regular ext4) — bypasses fuse2fs entirely for installs.
 VENV_PATH="${VENV_PATH:-/scratch/$USER/venvs/pufferdrive}"
-# Python from the overlay's miniforge3 (mounted read-only at runtime).
-CONTAINER_PYTHON="${CONTAINER_PYTHON:-/ext3/miniforge3/bin/python3}"
+# miniforge3 lives on /scratch too so the venv's python symlink resolves
+# from any node without needing the singularity overlay to be mounted.
+MINIFORGE3_DIR="${MINIFORGE3_DIR:-/scratch/$USER/miniforge3}"
+# Pin to a miniforge3 release that ships Python 3.12. 25.x switched to 3.13,
+# but torch's cu121 wheels are cp39..cp312 only (no cp313), so 3.13 breaks
+# the install. Bump this once torch publishes cp313 wheels for our index.
+MINIFORGE3_INSTALLER_URL="${MINIFORGE3_INSTALLER_URL:-https://github.com/conda-forge/miniforge/releases/download/24.11.3-2/Miniforge3-24.11.3-2-Linux-x86_64.sh}"
+MINIFORGE3_PYTHON_VERSION="${MINIFORGE3_PYTHON_VERSION:-3.12}"
+CONTAINER_PYTHON="${CONTAINER_PYTHON:-$MINIFORGE3_DIR/bin/python3}"
 
 create_overlay() {
     echo "=== Creating overlay filesystem ==="
@@ -46,7 +57,6 @@ create_overlay() {
     TEMPLATE_NAME=$(basename "$OVERLAY_TEMPLATE")
     cd "$CONTAINER_DIR"
     gunzip "$TEMPLATE_NAME"
-    mv "${TEMPLATE_NAME%.gz}" overlay.ext3
 
     echo "Overlay created at $OVERLAY_PATH"
     echo ""
@@ -76,6 +86,34 @@ fi
 EOF
 }
 
+# Make sure $MINIFORGE3_DIR holds a miniforge3 whose python3 matches
+# $MINIFORGE3_PYTHON_VERSION: no-op when it does, wipe + reinstall on a
+# mismatch, fresh install when absent. The conda-forge installer is a plain
+# shell script (no root, no singularity), and installing outside the overlay
+# keeps bin/python3 a real file that the venv symlink can resolve on any node.
+ensure_miniforge3() {
+    if [ -x "$MINIFORGE3_DIR/bin/python3" ]; then
+        # Reuse the install only if its python reports the pinned major.minor —
+        # otherwise an earlier "latest" (Python 3.13) install would linger and uv
+        # venv would reuse it. A failed probe leaves $existing empty => reinstall.
+        local existing
+        existing="$("$MINIFORGE3_DIR/bin/python3" -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || true)"
+        if [ "$existing" = "$MINIFORGE3_PYTHON_VERSION" ]; then
+            return 0  # already on the pinned version; nothing to do
+        fi
+        echo "=== miniforge3 at $MINIFORGE3_DIR has python $existing (want $MINIFORGE3_PYTHON_VERSION); reinstalling ==="
+        rm -rf "$MINIFORGE3_DIR"  # clear the mismatched install before reinstalling
+    fi
+    echo "=== Installing miniforge3 to $MINIFORGE3_DIR ==="
+    mkdir -p "$(dirname "$MINIFORGE3_DIR")"  # creates the parent only; the installer creates the prefix
+    local installer
+    installer="$(mktemp -t miniforge3-installer.XXXXXX.sh)"
+    curl -fsSL "$MINIFORGE3_INSTALLER_URL" -o "$installer"  # -f: HTTP errors give a non-zero exit
+    bash "$installer" -b -p "$MINIFORGE3_DIR"  # -b: batch mode (no prompts), -p: install prefix
+    rm -f "$installer"  # NOTE(review): not reached if a step above aborts the script — confirm whether set -e is on
+    echo "miniforge3 installed at $MINIFORGE3_DIR"
+}
+
 # Find or bootstrap a uv binary. Prefer one already on PATH or in
 # $HOME/.local/bin (auto-bound by apptainer). Fall back to the official
 # installer, which drops a static binary into ~/.local/bin in seconds.
@@ -106,6 +144,23 @@ ensure_uv() {
 # of the box and works against any cpython.
 ensure_venv() {
     ensure_uv
+    # If the venv exists but its python doesn't resolve into the current
+    # $MINIFORGE3_DIR (e.g. it points at /ext3/miniforge3 from before we
+    # moved miniforge3 onto /scratch), rebuild. readlink -f resolves the
+    # whole symlink chain, so this catches the case where the link is
+    # valid inside the container (overlay mounted) but stale relative to
+    # where the new venv should point.
+    if [ -f "$VENV_PATH/bin/activate" ]; then
+        local resolved
+        resolved="$(readlink -f "$VENV_PATH/bin/python" 2>/dev/null || true)"
+        case "$resolved" in
+            "$MINIFORGE3_DIR"/*) ;;
+            *)
+                echo "=== Rebuilding stale venv at $VENV_PATH (python points to '$resolved', not under $MINIFORGE3_DIR) ==="
+                rm -rf "$VENV_PATH"
+                ;;
+        esac
+    fi
     if [ ! -f "$VENV_PATH/bin/activate" ]; then
         echo "=== Creating venv at $VENV_PATH ==="
         mkdir -p "$(dirname "$VENV_PATH")"
@@ -168,27 +223,16 @@ rebuild_extension() {
 
 run_in_container() {
     local cmd="$1"
-    # Overlay mounted read-only — venv's bin/python symlinks back into
-    # /ext3/miniforge3 for the interpreter, but every package read/write
-    # happens on /scratch ext4 (the venv on $VENV_PATH).
+    # Overlay mounted read-only — every read/write the install or rebuild
+    # cares about happens on /scratch ext4 (miniforge3 + venv). The overlay
+    # is kept on the mount line for backward compatibility, but nothing
+    # in the python flow writes to it.
     singularity exec --nv \
         --overlay "$OVERLAY_PATH:ro" \
         "$IMAGE_PATH" \
         bash -c "cd $PROJECT_ROOT && $cmd"
 }
 
-run_in_container_writable() {
-    local cmd="$1"
-    # --fakeroot still required because uv bootstrap writes to /ext3/miniforge3
-    # (the system pip puts uv there before we activate the venv). Once uv
-    # is bootstrapped, all subsequent installs go to the venv on /scratch
-    # (regular ext4, no fuse2fs in the write path).
-    singularity exec --nv --fakeroot \
-        --overlay "$OVERLAY_PATH" \
-        "$IMAGE_PATH" \
-        bash -c "cd $PROJECT_ROOT && $cmd"
-}
-
 case "${1:-}" in
     create-overlay)
         create_overlay
@@ -197,7 +241,11 @@ case "${1:-}" in
         if [ -f /.singularity.d/Singularity ]; then
             install_deps
         else
-            run_in_container_writable "$0 install"
+            # miniforge3 installs on /scratch via plain shell — no singularity
+            # needed for that step. The rest (uv + pip + build_ext) runs in
+            # the container so nvcc and the right glibc are on PATH.
+            ensure_miniforge3
+            run_in_container "$0 install"
         fi
         ;;
     rebuild)
@@ -218,12 +266,13 @@ case "${1:-}" in
         echo "  rebuild         Rebuild C extension only (submit as GPU job)"
         echo ""
         echo "Environment variables:"
+        echo "  MINIFORGE3_DIR  Where the base python lives (default: /scratch/\$USER/miniforge3)"
         echo "  VENV_PATH       Where the venv lives (default: /scratch/\$USER/venvs/pufferdrive)"
-        echo "  OVERLAY_PATH    Singularity overlay (only needs miniforge3 base python)"
+        echo "  OVERLAY_PATH    Singularity overlay (kept for system-tool installs; not used by the python flow)"
         echo ""
         echo "Example workflow:"
         echo "  1. $0 create-overlay"
         echo "  2. sbatch --gres=gpu:1 --time=60 --wrap \"$0 install\""
-        echo "  3. python scripts/submit_cluster.py --container ..."
+        echo "  3. source \$VENV_PATH/bin/activate && python scripts/submit_cluster.py --container ..."
         ;;
 esac
diff --git a/scripts/submit_cluster.py b/scripts/submit_cluster.py
index 59fe9ad3a5..9a8182bf8c 100644
--- a/scripts/submit_cluster.py
+++ b/scripts/submit_cluster.py
@@ -289,7 +289,6 @@ def launch_training(args, from_config, cmd, save_dir, project_root, container_co
         import submitit
 
         # Code isolation: symlink top-level entries, hard copy pufferlib/ source
-        # (symlink resources/ to avoid copying 3.7GB of maps/models).
         isolated_root = os.path.join(save_dir, "code")
         if os.path.exists(isolated_root):
             version = 1
@@ -308,8 +307,6 @@ def launch_training(args, from_config, cmd, save_dir, project_root, container_co
                     os.remove(dst)
             os.symlink(src, dst)
         # Hard copy pufferlib/ so branch switches don't break running jobs.
-        # Previously used `cp -rs` (symlinks) which meant switching branches
-        # after submission would silently change the code running jobs use.
         # We symlink resources/ (3.7GB of maps/models) to avoid slow copies,
         # but hard copy everything else (source code, .so files).
         pufferlib_dst = os.path.join(isolated_root, "pufferlib")