From f65cdf6a4ca2bc4eff371f7d0e6bdb19604ff4f0 Mon Sep 17 00:00:00 2001 From: Arjun Krishnakumar Date: Tue, 28 Apr 2026 22:05:00 +0200 Subject: [PATCH 1/3] fix: remove CUDA_DEVICE_MAX_CONNECTIONS, fix PYTHONPATH, add WANDB_DIR in container template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDA_DEVICE_MAX_CONNECTIONS=1 is a Megatron-LM tensor-parallel flag that serializes CUDA streams and hurts ZeRO-2 overlap_comm — removed. PYTHONPATH was cleared to an empty string, causing the container to use its baked-in post_training package instead of the local src/ checkout. Now set to repo_dir/src so local changes take effect. WANDB_DIR is set to the run directory so that offline WandB runs persist to Lustre-backed shared storage rather than being lost on node teardown. --- src/post_training/slurm/job_trl_container.sh.jinja | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/post_training/slurm/job_trl_container.sh.jinja b/src/post_training/slurm/job_trl_container.sh.jinja index d9cbb89..21a88a6 100644 --- a/src/post_training/slurm/job_trl_container.sh.jinja +++ b/src/post_training/slurm/job_trl_container.sh.jinja @@ -35,7 +35,8 @@ export WORLD_SIZE=$(( SLURM_NNODES * GPUS_PER_NODE )) # NCCL tuning for multi-node stability export NCCL_IB_TIMEOUT=120 export NCCL_DEBUG=INFO -export CUDA_DEVICE_MAX_CONNECTIONS=1 +# CUDA_DEVICE_MAX_CONNECTIONS=1 is a Megatron-LM tensor-parallel flag that +# serializes CUDA streams and hurts ZeRO-2 overlap_comm — do not set it. export OMP_NUM_THREADS=1 echo "==========================================" @@ -71,7 +72,7 @@ srun --export=ALL --wait=60 --kill-on-bad-exit=1 \ # Prevent host Python/PATH from interfering with container export PATH=\"/usr/local/bin:/usr/bin:/bin\" - export PYTHONPATH=\"\" + export PYTHONPATH=\"{{ repo_dir }}/src\" export PYTHONNOUSERSITE=1 export NODE_RANK=\"\$SLURM_NODEID\" @@ -90,6 +91,8 @@ srun --export=ALL --wait=60 --kill-on-bad-exit=1 \ cd {{ repo_dir }} + export WANDB_DIR={{ repo_dir }}/{{ run_dir }} + accelerate launch \ --num_machines \$NNODES \ --num_processes $WORLD_SIZE \ From 7bd7efabc791d06cd24582b9be8c4a0a0eb2041d Mon Sep 17 00:00:00 2001 From: Arjun Krishnakumar Date: Thu, 30 Apr 2026 14:30:47 +0200 Subject: [PATCH 2/3] fix: set PYTHONPATH to repo src/ in LlamaFactory container template --- src/post_training/slurm/job_llamafactory.sh.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/post_training/slurm/job_llamafactory.sh.jinja b/src/post_training/slurm/job_llamafactory.sh.jinja index a532b6e..e214165 100644 --- a/src/post_training/slurm/job_llamafactory.sh.jinja +++ b/src/post_training/slurm/job_llamafactory.sh.jinja @@ -70,7 +70,7 @@ srun --export=ALL --wait=60 --kill-on-bad-exit=1 \ # Prevent host Python/PATH from interfering with container export PATH=\"/usr/local/bin:/usr/bin:/bin\" - export PYTHONPATH=\"\" + export PYTHONPATH=\"{{ repo_dir }}/src\" export PYTHONNOUSERSITE=1 export NODE_RANK=\"\$SLURM_NODEID\" From f077f78fa4bb495e19ccdd13c2db0ca7cf180c0a Mon Sep 17 00:00:00 2001 From: Arjun Krishnakumar Date: Thu, 30 Apr 2026 15:41:32 +0200 Subject: [PATCH 3/3] fix: resolve run_dir to absolute path in container SLURM templates WANDB_DIR was constructed as repo_dir/run_dir in job_trl_container.sh.jinja, which breaks when output_base is an absolute path. Pass run_dir.resolve() from the launcher so templates always receive an absolute path, then simplify WANDB_DIR to use run_dir directly. Same resolve() applied to the LlamaFactory renderer for consistency. --- src/post_training/slurm/job_trl_container.sh.jinja | 2 +- src/post_training/slurm/launcher.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/post_training/slurm/job_trl_container.sh.jinja b/src/post_training/slurm/job_trl_container.sh.jinja index 1343b2b..b085291 100644 --- a/src/post_training/slurm/job_trl_container.sh.jinja +++ b/src/post_training/slurm/job_trl_container.sh.jinja @@ -88,7 +88,7 @@ srun --export=ALL --wait=60 --kill-on-bad-exit=1 \ cd {{ repo_dir }} - export WANDB_DIR={{ repo_dir }}/{{ run_dir }} + export WANDB_DIR={{ run_dir }} accelerate launch \ --num_machines \$NNODES \ diff --git a/src/post_training/slurm/launcher.py b/src/post_training/slurm/launcher.py index d16b4a2..17e989a 100644 --- a/src/post_training/slurm/launcher.py +++ b/src/post_training/slurm/launcher.py @@ -114,7 +114,7 @@ def render_trl_container_slurm_script( wall_time=config.slurm.wall_time, signal_time_seconds=config.slurm.signal_time_seconds, max_failures=config.slurm.max_failures, - run_dir=str(run_dir), + run_dir=str(run_dir.resolve()), config_path=config_path, # Accelerate flags mixed_precision=config.accelerate.mixed_precision, @@ -162,7 +162,7 @@ def render_llamafactory_slurm_script( wall_time=config.slurm.wall_time, signal_time_seconds=config.slurm.signal_time_seconds, max_failures=config.slurm.max_failures, - run_dir=str(run_dir), + run_dir=str(run_dir.resolve()), # Container container_image=config.container.image, bind_mounts=config.container.bind_mounts,