From c409831d709ce874ca625c2460c1053dc7096b8c Mon Sep 17 00:00:00 2001 From: Arjun Krishnakumar Date: Wed, 29 Apr 2026 12:04:54 +0200 Subject: [PATCH] fix: delegate offline mode control from container template to train.py The container SLURM template was hardcoding HF_HUB_OFFLINE=1 etc. regardless of config.offline, so offline: false had no effect inside the container and jobs would stall trying to reach the Hub at runtime. Remove the three hardcoded offline flags from job_trl_container.sh.jinja and let train.py own this: the existing if config.offline block sets them to 1, and a new else block explicitly sets them to 0 to clear any value inherited from the container environment. --- scripts/train.py | 6 ++++++ src/post_training/slurm/job_trl_container.sh.jinja | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 8f3fd7f..f0ca2e1 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -92,6 +92,12 @@ def main() -> None: os.environ["HF_DATASETS_OFFLINE"] = "1" os.environ["TRANSFORMERS_OFFLINE"] = "1" os.environ["WANDB_MODE"] = "offline" + else: + # Explicitly clear offline flags so any value inherited from the + # container environment (e.g. a stale host env var) doesn't take effect. + os.environ["HF_HUB_OFFLINE"] = "0" + os.environ["HF_DATASETS_OFFLINE"] = "0" + os.environ["TRANSFORMERS_OFFLINE"] = "0" # Lazy import: must come after offline env vars are set so that # huggingface_hub caches HF_HUB_OFFLINE=1 on first import. diff --git a/src/post_training/slurm/job_trl_container.sh.jinja b/src/post_training/slurm/job_trl_container.sh.jinja index d9cbb89..0dda5ef 100644 --- a/src/post_training/slurm/job_trl_container.sh.jinja +++ b/src/post_training/slurm/job_trl_container.sh.jinja @@ -78,14 +78,11 @@ srun --export=ALL --wait=60 --kill-on-bad-exit=1 \ export NNODES=\"\$SLURM_NNODES\" export NPROC_PER_NODE=${GPUS_PER_NODE} - # Forward HF cache/offline settings from host env + # Forward HF cache paths from host env (offline mode is controlled by train.py via config) export HF_HOME=${HF_HOME} export HF_HUB_CACHE=${HF_HUB_CACHE} export HUGGINGFACE_HUB_CACHE=${HUGGINGFACE_HUB_CACHE} export HF_DATASETS_CACHE=${HF_HOME}/datasets - export HF_DATASETS_OFFLINE=1 - export TRANSFORMERS_OFFLINE=1 - export HF_HUB_OFFLINE=1 export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt cd {{ repo_dir }}