From a768dbadcd0429cb031322a695ae587d064c0712 Mon Sep 17 00:00:00 2001
From: Arjun Krishnakumar
Date: Wed, 29 Apr 2026 11:52:36 +0200
Subject: [PATCH 1/3] fix: apply container env_file HF cache vars before offline prefetch

When offline=True, prefetch_assets() was downloading models/datasets to
the login shell's default HF_HOME rather than the cluster-specific path
set by container.env_file. The container sources this file at job
runtime, so the two locations disagreed and the container couldn't find
the cached assets.

Now submit.py parses the env_file for HF cache vars (HF_HOME,
HF_HUB_CACHE, etc.) and applies them to os.environ before calling
prefetch_assets(), ensuring both the prefetch and the container use the
same cache root.
---
 scripts/submit.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/scripts/submit.py b/scripts/submit.py
index b86cbc2..0eb496a 100644
--- a/scripts/submit.py
+++ b/scripts/submit.py
@@ -13,6 +13,8 @@
 
 import argparse
 import logging
+import os
+import re
 import sys
 from pathlib import Path
 
@@ -29,6 +31,51 @@
 
 logger = logging.getLogger(__name__)
 
+_HF_CACHE_VARS = frozenset(
+    {
+        "HF_HOME",
+        "HF_HUB_CACHE",
+        "HUGGINGFACE_HUB_CACHE",
+        "HF_DATASETS_CACHE",
+        "TRANSFORMERS_CACHE",
+    }
+)
+
+
+def _apply_hf_env_from_file(env_file: str) -> None:
+    """Parse HF cache vars from a shell env file and apply them to os.environ.
+
+    Ensures prefetch_assets() downloads to the same cache root that the
+    container will use (set via container.env_file sourced in the SLURM script).
+    """
+    path = Path(env_file)
+    if not path.exists():
+        logger.warning("container.env_file '%s' not found, skipping.", env_file)
+        return
+
+    parsed: dict[str, str] = {}
+    export_re = re.compile(r"^export\s+([A-Za-z_][A-Za-z0-9_]*)=(.*)$")
+    with path.open() as f:
+        for line in f:
+            m = export_re.match(line.strip())
+            if not m:
+                continue
+            key, value = m.group(1), m.group(2).strip("\"'")
+            value = re.sub(
+                r"\$\{?([A-Za-z_][A-Za-z0-9_]*)\}?",
+                lambda mv: parsed.get(mv.group(1), os.environ.get(mv.group(1), "")),
+                value,
+            )
+            parsed[key] = value
+
+    applied = []
+    for key in _HF_CACHE_VARS:
+        if key in parsed:
+            os.environ[key] = parsed[key]
+            applied.append(f"{key}={parsed[key]}")
+    if applied:
+        logger.info("Applied HF cache vars from %s: %s", env_file, ", ".join(applied))
+
 
 def _parse_args() -> tuple[str, list[str], bool]:
     parser = argparse.ArgumentParser(description="Submit a SLURM training job.")
@@ -64,6 +111,8 @@ def main() -> None:
         config.slurm.gpus_per_node = 1
 
     if config.offline:
+        if config.container.env_file:
+            _apply_hf_env_from_file(config.container.env_file)
         logger.info(
             "offline=True: pre-fetching models and datasets on the login node "
             "before submitting the job."

From edeaeb170b5f7eaf2656a0f184ed4f400b57e231 Mon Sep 17 00:00:00 2001
From: Arjun Krishnakumar
Date: Wed, 29 Apr 2026 12:23:17 +0200
Subject: [PATCH 2/3] fix: raise FileNotFoundError when container.env_file is missing

Silently skipping a missing env file would let prefetch_assets() run
against the wrong HF cache without any indication of the
misconfiguration. Raising FileNotFoundError makes the error explicit and
fails fast.
---
 scripts/submit.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/submit.py b/scripts/submit.py
index 0eb496a..5160f11 100644
--- a/scripts/submit.py
+++ b/scripts/submit.py
@@ -50,8 +50,7 @@ def _apply_hf_env_from_file(env_file: str) -> None:
     """
     path = Path(env_file)
     if not path.exists():
-        logger.warning("container.env_file '%s' not found, skipping.", env_file)
-        return
+        raise FileNotFoundError(f"container.env_file '{env_file}' not found.")
 
     parsed: dict[str, str] = {}
     export_re = re.compile(r"^export\s+([A-Za-z_][A-Za-z0-9_]*)=(.*)$")

From 3bc91109a86a427ba8da217728f5e27382798811 Mon Sep 17 00:00:00 2001
From: Arjun Krishnakumar
Date: Wed, 29 Apr 2026 13:11:51 +0200
Subject: [PATCH 3/3] fix: lazy-import prefetch_assets so HF cache vars take effect

huggingface_hub and datasets cache HF_HOME/HF_HUB_CACHE at import time.
Importing prefetch_assets at module level meant those libraries were
initialised before _apply_hf_env_from_file ran, so the env vars we set
were ignored and downloads went to the default cache location.

Move the import to just before the call, after the env vars are applied.
---
 scripts/submit.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/submit.py b/scripts/submit.py
index 5160f11..5087eac 100644
--- a/scripts/submit.py
+++ b/scripts/submit.py
@@ -27,7 +27,6 @@
 from post_training.utils.guardrails import run_guardrails
 from post_training.utils.logging import setup_logging
 from post_training.utils.paths import setup_run_directory
-from post_training.utils.prefetch import prefetch_assets
 
 logger = logging.getLogger(__name__)
 
@@ -116,6 +115,10 @@ def main() -> None:
             "offline=True: pre-fetching models and datasets on the login node "
             "before submitting the job."
         )
+        # Lazy import: must come after _apply_hf_env_from_file so that
+        # huggingface_hub and datasets read the correct HF_HOME on first import.
+        from post_training.utils.prefetch import prefetch_assets
+
         prefetch_assets(config)
 
     # Set up the run directory (so the SLURM script can reference it).