From a768dbadcd0429cb031322a695ae587d064c0712 Mon Sep 17 00:00:00 2001
From: Arjun Krishnakumar <arjun.krishnakumar.mec@gmail.com>
Date: Wed, 29 Apr 2026 11:52:36 +0200
Subject: [PATCH 1/3] fix: apply container env_file HF cache vars before
 offline prefetch

When offline=True, prefetch_assets() was downloading models/datasets to
the login shell's default HF_HOME rather than the cluster-specific path
set by container.env_file. The container sources this file at job runtime,
so the two locations disagreed and the container couldn't find the cached
assets.

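Now submit.py parses the env_file for HF cache vars (HF_HOME,
HF_HUB_CACHE, HUGGINGFACE_HUB_CACHE, HF_DATASETS_CACHE,
TRANSFORMERS_CACHE) and applies them to os.environ before calling
prefetch_assets(), ensuring both the prefetch and the container use the
same cache root. Only `export NAME=value` lines are recognised; $VAR and
${VAR} references in a value are expanded from earlier exports in the
file, falling back to the current environment (unset names expand to an
empty string).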
---
 scripts/submit.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/scripts/submit.py b/scripts/submit.py
index b86cbc2..0eb496a 100644
--- a/scripts/submit.py
+++ b/scripts/submit.py
@@ -13,6 +13,8 @@
 
 import argparse
 import logging
+import os
+import re
 import sys
 from pathlib import Path
 
@@ -29,6 +31,51 @@
 
 logger = logging.getLogger(__name__)
 
+_HF_CACHE_VARS = frozenset(
+    {
+        "HF_HOME",
+        "HF_HUB_CACHE",
+        "HUGGINGFACE_HUB_CACHE",
+        "HF_DATASETS_CACHE",
+        "TRANSFORMERS_CACHE",
+    }
+)
+
+
+def _apply_hf_env_from_file(env_file: str) -> None:
+    """Parse HF cache vars from a shell env file and apply them to os.environ.
+
+    Ensures prefetch_assets() downloads to the same cache root that the
+    container will use (set via container.env_file sourced in the SLURM script).
+    """
+    path = Path(env_file)
+    if not path.exists():
+        logger.warning("container.env_file '%s' not found, skipping.", env_file)
+        return
+
+    parsed: dict[str, str] = {}
+    export_re = re.compile(r"^export\s+([A-Za-z_][A-Za-z0-9_]*)=(.*)$")
+    with path.open() as f:
+        for line in f:
+            m = export_re.match(line.strip())
+            if not m:
+                continue
+            key, value = m.group(1), m.group(2).strip("\"'")
+            value = re.sub(
+                r"\$\{?([A-Za-z_][A-Za-z0-9_]*)\}?",
+                lambda mv: parsed.get(mv.group(1), os.environ.get(mv.group(1), "")),
+                value,
+            )
+            parsed[key] = value
+
+    applied = []
+    for key in _HF_CACHE_VARS:
+        if key in parsed:
+            os.environ[key] = parsed[key]
+            applied.append(f"{key}={parsed[key]}")
+    if applied:
+        logger.info("Applied HF cache vars from %s: %s", env_file, ", ".join(applied))
+
 
 def _parse_args() -> tuple[str, list[str], bool]:
     parser = argparse.ArgumentParser(description="Submit a SLURM training job.")
@@ -64,6 +111,8 @@ def main() -> None:
         config.slurm.gpus_per_node = 1
 
     if config.offline:
+        if config.container.env_file:
+            _apply_hf_env_from_file(config.container.env_file)
         logger.info(
             "offline=True: pre-fetching models and datasets on the login node "
             "before submitting the job."

From edeaeb170b5f7eaf2656a0f184ed4f400b57e231 Mon Sep 17 00:00:00 2001
From: Arjun Krishnakumar <arjun.krishnakumar.mec@gmail.com>
Date: Wed, 29 Apr 2026 12:23:17 +0200
Subject: [PATCH 2/3] fix: raise FileNotFoundError when container.env_file is
 missing

Skipping a missing env file with only a logged warning would let
prefetch_assets() run against the wrong HF cache, and the misconfiguration
would be easy to overlook in the log output. Raising FileNotFoundError
makes the error explicit and fails fast.
---
 scripts/submit.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/submit.py b/scripts/submit.py
index 0eb496a..5160f11 100644
--- a/scripts/submit.py
+++ b/scripts/submit.py
@@ -50,8 +50,7 @@ def _apply_hf_env_from_file(env_file: str) -> None:
     """
     path = Path(env_file)
     if not path.exists():
-        logger.warning("container.env_file '%s' not found, skipping.", env_file)
-        return
+        raise FileNotFoundError(f"container.env_file '{env_file}' not found.")
 
     parsed: dict[str, str] = {}
     export_re = re.compile(r"^export\s+([A-Za-z_][A-Za-z0-9_]*)=(.*)$")

From 3bc91109a86a427ba8da217728f5e27382798811 Mon Sep 17 00:00:00 2001
From: Arjun Krishnakumar <arjun.krishnakumar.mec@gmail.com>
Date: Wed, 29 Apr 2026 13:11:51 +0200
Subject: [PATCH 3/3] fix: lazy-import prefetch_assets so HF cache vars take
 effect

huggingface_hub and datasets cache HF_HOME/HF_HUB_CACHE at import time.
Importing prefetch_assets at module level meant those libraries were
initialised before _apply_hf_env_from_file ran, so the env vars we set
were ignored and downloads went to the default cache location.

Move the import to just before the call, after the env vars are applied.
---
 scripts/submit.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/submit.py b/scripts/submit.py
index 5160f11..5087eac 100644
--- a/scripts/submit.py
+++ b/scripts/submit.py
@@ -27,7 +27,6 @@
 from post_training.utils.guardrails import run_guardrails
 from post_training.utils.logging import setup_logging
 from post_training.utils.paths import setup_run_directory
-from post_training.utils.prefetch import prefetch_assets
 
 logger = logging.getLogger(__name__)
 
@@ -116,6 +115,10 @@ def main() -> None:
             "offline=True: pre-fetching models and datasets on the login node "
             "before submitting the job."
         )
+        # Lazy import: must come after _apply_hf_env_from_file so that
+        # huggingface_hub and datasets read the correct HF_HOME on first import.
+        from post_training.utils.prefetch import prefetch_assets
+
         prefetch_assets(config)
 
     # Set up the run directory (so the SLURM script can reference it).