Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions nemo_run/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,13 +468,33 @@ def get_name(self):
return os.path.basename(self.path)

def to_command(
self, with_entrypoint: bool = False, filename: Optional[str] = None, is_local: bool = False
self,
with_entrypoint: bool = False,
filename: Optional[str] = None,
is_local: bool = False,
substitute_rundir_path: Optional[str] = None,
) -> list[str]:
"""Convert the script to a command.

Args:
with_entrypoint: If True, prepend the entrypoint to the command.
filename: If provided, write the inline script to this file.
is_local: If True, use the local filename in the command.
substitute_rundir_path: If provided, substitute /{RUNDIR_NAME} paths
with this path in the inline script content. Used for non-container
mode where container paths need to be replaced with actual cluster paths.
"""
if self.inline:
if filename:
os.makedirs(os.path.dirname(filename), exist_ok=True)
inline_content = self.inline
# Substitute /{RUNDIR_NAME} paths if specified (non-container mode)
if substitute_rundir_path is not None:
inline_content = inline_content.replace(
f"/{RUNDIR_NAME}", substitute_rundir_path
)
with open(filename, "w") as f:
f.write("#!/usr/bin/bash\n" + self.inline)
f.write("#!/usr/bin/bash\n" + inline_content)

if is_local:
cmd = [filename]
Expand Down
25 changes: 24 additions & 1 deletion nemo_run/core/execution/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,18 @@ def get_container_flags(
container_image: Optional[str],
container_env: Optional[list[str]] = None,
) -> list[str]:
_container_flags = ["--container-image", container_image] if container_image else []
"""Get srun flags for container or non-container mode.

For non-container mode, returns --chdir flag to set working directory.
For container mode, returns container-related flags (image, mounts, workdir, env).
"""
if container_image is None:
# Non-container mode: use --chdir to set working directory
workdir = os.path.join(src_job_dir, "code")
return ["--chdir", workdir]

# Container mode: set up container mounts and workdir
_container_flags = ["--container-image", container_image]

new_mounts = copy.deepcopy(base_mounts)
for i, mount in enumerate(new_mounts):
Expand Down Expand Up @@ -1077,6 +1088,18 @@ def get_container_flags(
vars_to_fill["fault_tol_job_results_file"] = self.launcher.job_results_file

sbatch_script = fill_template("slurm.sh.j2", vars_to_fill)

# For non-container mode, substitute /{RUNDIR_NAME} paths with actual job directory
# Check both top-level container_image and resource_group container images
has_container = self.executor.container_image is not None
if self.executor.run_as_group and self.executor.resource_group:
has_container = has_container or any(
rg.container_image is not None for rg in self.executor.resource_group
)
if not has_container:
actual_job_dir = os.path.join(slurm_job_dir, job_directory_name)
sbatch_script = sbatch_script.replace(f"/{RUNDIR_NAME}", actual_job_dir)

return sbatch_script

def __repr__(self) -> str:
Expand Down
15 changes: 15 additions & 0 deletions nemo_run/run/torchx_backend/packaging.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import logging
import os
from pathlib import Path
from typing import Iterator, Optional, Type, Union

import fiddle as fdl
Expand Down Expand Up @@ -120,9 +121,23 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool):
log.warning(f"Failed saving yaml configs due to: {e}")

args = fn_or_script.args

# For SlurmExecutor without container, substitute /{RUNDIR_NAME} paths
# with actual cluster paths in inline scripts
substitute_rundir_path = None
if (
isinstance(executor, SlurmExecutor)
and executor.container_image is None
and executor.tunnel is not None
):
substitute_rundir_path = os.path.join(
executor.tunnel.job_dir, Path(executor.job_dir).name
)

role_args = fn_or_script.to_command(
filename=os.path.join(executor.job_dir, SCRIPTS_DIR, f"{name}.sh"),
is_local=True if isinstance(executor, LocalExecutor) else False,
substitute_rundir_path=substitute_rundir_path,
)
m = fn_or_script.path if fn_or_script.m else None
no_python = fn_or_script.entrypoint != "python"
Expand Down
2 changes: 1 addition & 1 deletion test/core/execution/artifacts/dummy_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export ENV_VAR=value

# Command 1

srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4
srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4

exitcode=$?

Expand Down
2 changes: 1 addition & 1 deletion test/core/execution/artifacts/ft_slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ echo "$SLURM_JOB_ID ${SLURM_RESTART_COUNT:-0} X" >> "$JOB_RESULTS_FILE"

# Command 1

srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh
srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh

exitcode=$?

Expand Down
147 changes: 147 additions & 0 deletions test/core/execution/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

import pytest

from nemo_run.config import RUNDIR_NAME
from nemo_run.core.execution.launcher import SlurmTemplate, Torchrun
from nemo_run.core.execution.slurm import (
SlurmBatchRequest,
SlurmExecutor,
SlurmJobDetails,
SlurmTunnelCallback,
Expand Down Expand Up @@ -403,3 +405,148 @@ def test_merge_mismatch(self):
[SlurmExecutor(account="account1"), SlurmExecutor(account="account2")],
num_tasks=3,
)


class TestSlurmBatchRequestNonContainerMode:
    """Check ``SlurmBatchRequest.materialize`` with and without a container image.

    Both fixtures share the same job metadata and a mocked ``LocalTunnel``;
    they differ only in the arguments passed to ``SlurmExecutor``.
    """

    @staticmethod
    def _attach_job_context(slurm_executor):
        """Set the job metadata and mocked tunnel on ``slurm_executor`` and return it."""
        slurm_executor.job_name = "test-job"
        slurm_executor.experiment_dir = "/local/experiments"
        slurm_executor.job_dir = "/local/experiments/test-job"
        slurm_executor.experiment_id = "exp-123"

        # The tunnel is mocked; only its ``job_dir`` attribute is configured here.
        mock_tunnel = MagicMock(spec=LocalTunnel)
        mock_tunnel.job_dir = "/remote/experiments/exp-123"
        slurm_executor.tunnel = mock_tunnel
        return slurm_executor

    @staticmethod
    def _render(slurm_executor):
        """Build a single-command batch request for the executor and materialize it."""
        batch_request = SlurmBatchRequest(
            launch_cmd=["sbatch", "--parsable"],
            jobs=["test-job"],
            command_groups=[["python train.py"]],
            executor=slurm_executor,
            max_retries=0,
            extra_env={},
        )
        return batch_request.materialize()

    @pytest.fixture
    def executor_with_container(self):
        """Executor configured with a container image and one container mount."""
        return self._attach_job_context(
            SlurmExecutor(
                account="test_account",
                partition="gpu",
                nodes=2,
                ntasks_per_node=8,
                container_image="nvcr.io/nvidia/pytorch:24.01-py3",
                container_mounts=["/data:/data"],
            )
        )

    @pytest.fixture
    def executor_without_container(self):
        """Executor with ``container_image=None`` (non-container mode)."""
        return self._attach_job_context(
            SlurmExecutor(
                account="test_account",
                partition="gpu",
                nodes=2,
                ntasks_per_node=8,
                container_image=None,  # Non-container mode
            )
        )

    def test_materialize_with_container_uses_container_flags(self, executor_with_container):
        """Container mode emits the container flags and never ``--chdir``."""
        rendered = self._render(executor_with_container)

        # Every container-related srun flag must be present.
        for flag in ("--container-image", "--container-mounts", "--container-workdir"):
            assert flag in rendered
        # --chdir is reserved for non-container mode.
        assert "--chdir" not in rendered
        # The run-directory prefix is left unsubstituted.
        assert f"/{RUNDIR_NAME}" in rendered

    def test_materialize_without_container_uses_chdir(self, executor_without_container):
        """Non-container mode emits ``--chdir`` and none of the container flags."""
        rendered = self._render(executor_without_container)

        # The working directory is passed through --chdir.
        assert "--chdir" in rendered
        # No container-related srun flag may appear.
        for flag in ("--container-image", "--container-mounts", "--container-workdir"):
            assert flag not in rendered

    def test_materialize_without_container_substitutes_rundir_paths(
        self, executor_without_container
    ):
        """Non-container mode replaces /{RUNDIR_NAME} paths with the remote job directory."""
        rendered = self._render(executor_without_container)
        remote_job_dir = "/remote/experiments/exp-123/test-job"

        # The run-directory code path must be gone...
        assert f"/{RUNDIR_NAME}/code" not in rendered
        # ...and the path under the remote job directory must be present.
        assert f"{remote_job_dir}/code" in rendered

    def test_materialize_with_container_preserves_rundir_paths(self, executor_with_container):
        """Container mode keeps /{RUNDIR_NAME} paths in the rendered script."""
        rendered = self._render(executor_with_container)

        # No substitution happens when a container image is set.
        assert f"/{RUNDIR_NAME}" in rendered

    def test_non_container_mode_chdir_points_to_code_directory(self, executor_without_container):
        """Non-container ``--chdir`` targets the ``code`` directory under the remote job dir."""
        rendered = self._render(executor_without_container)

        # Expected form: --chdir {job_dir}/code
        chdir_fragment = "--chdir /remote/experiments/exp-123/test-job/code"
        assert chdir_fragment in rendered
2 changes: 2 additions & 0 deletions test/core/execution/test_slurm_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def dummy_slurm_request_with_artifact(
account="account",
job_dir="/root/sample_job",
tunnel=LocalTunnel(job_dir="/root"),
container_image="test_image",
)
slurm_config.job_name = "sample_job"
max_retries = 3
Expand All @@ -79,6 +80,7 @@ def ft_slurm_request_with_artifact(
account="account",
job_dir="/root/sample_job",
tunnel=LocalTunnel(job_dir="/root/"),
container_image="test_image",
)
slurm_config.job_name = "sample_job"
slurm_config.launcher = FaultTolerance(
Expand Down
Loading
Loading