Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions scripts/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ def main() -> None:
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["WANDB_MODE"] = "offline"
else:
# Explicitly set the offline flags to "0" (rather than leaving them untouched)
# so that any value inherited from the container environment (e.g. a stale
# host env var) doesn't take effect.
os.environ["HF_HUB_OFFLINE"] = "0"
os.environ["HF_DATASETS_OFFLINE"] = "0"
os.environ["TRANSFORMERS_OFFLINE"] = "0"

# Lazy import: must come after the offline env vars are set above, because
# huggingface_hub caches the value of HF_HUB_OFFLINE ("1" or "0") on first import.
Expand Down
5 changes: 1 addition & 4 deletions src/post_training/slurm/job_trl_container.sh.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,11 @@ srun --export=ALL --wait=60 --kill-on-bad-exit=1 \
export NNODES=\"\$SLURM_NNODES\"
export NPROC_PER_NODE=${GPUS_PER_NODE}

# Forward HF cache/offline settings from host env
# Forward HF cache paths from host env (offline mode is controlled by train.py via config)
export HF_HOME=${HF_HOME}
export HF_HUB_CACHE=${HF_HUB_CACHE}
export HUGGINGFACE_HUB_CACHE=${HUGGINGFACE_HUB_CACHE}
export HF_DATASETS_CACHE=${HF_HOME}/datasets
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export HF_HUB_OFFLINE=1
export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

cd {{ repo_dir }}
Expand Down
Loading