diff --git a/nemo_run/config.py b/nemo_run/config.py index 8bccfc36..d45e536d 100644 --- a/nemo_run/config.py +++ b/nemo_run/config.py @@ -468,13 +468,33 @@ def get_name(self): return os.path.basename(self.path) def to_command( - self, with_entrypoint: bool = False, filename: Optional[str] = None, is_local: bool = False + self, + with_entrypoint: bool = False, + filename: Optional[str] = None, + is_local: bool = False, + substitute_rundir_path: Optional[str] = None, ) -> list[str]: + """Convert the script to a command. + + Args: + with_entrypoint: If True, prepend the entrypoint to the command. + filename: If provided, write the inline script to this file. + is_local: If True, use the local filename in the command. + substitute_rundir_path: If provided, substitute /{RUNDIR_NAME} paths + with this path in the inline script content. Used for non-container + mode where container paths need to be replaced with actual cluster paths. + """ if self.inline: if filename: os.makedirs(os.path.dirname(filename), exist_ok=True) + inline_content = self.inline + # Substitute /{RUNDIR_NAME} paths if specified (non-container mode) + if substitute_rundir_path is not None: + inline_content = inline_content.replace( + f"/{RUNDIR_NAME}", substitute_rundir_path + ) with open(filename, "w") as f: - f.write("#!/usr/bin/bash\n" + self.inline) + f.write("#!/usr/bin/bash\n" + inline_content) if is_local: cmd = [filename] diff --git a/nemo_run/core/execution/slurm.py b/nemo_run/core/execution/slurm.py index f483a61e..c83a9528 100644 --- a/nemo_run/core/execution/slurm.py +++ b/nemo_run/core/execution/slurm.py @@ -936,7 +936,18 @@ def get_container_flags( container_image: Optional[str], container_env: Optional[list[str]] = None, ) -> list[str]: - _container_flags = ["--container-image", container_image] if container_image else [] + """Get srun flags for container or non-container mode. + + For non-container mode, returns --chdir flag to set working directory. + For container mode, returns container-related flags (image, mounts, workdir, env). + """ + if container_image is None: + # Non-container mode: use --chdir to set working directory + workdir = os.path.join(src_job_dir, "code") + return ["--chdir", workdir] + + # Container mode: set up container mounts and workdir + _container_flags = ["--container-image", container_image] new_mounts = copy.deepcopy(base_mounts) for i, mount in enumerate(new_mounts): @@ -1077,6 +1088,18 @@ def get_container_flags( vars_to_fill["fault_tol_job_results_file"] = self.launcher.job_results_file sbatch_script = fill_template("slurm.sh.j2", vars_to_fill) + + # For non-container mode, substitute /{RUNDIR_NAME} paths with actual job directory + # Check both top-level container_image and resource_group container images + has_container = self.executor.container_image is not None + if self.executor.run_as_group and self.executor.resource_group: + has_container = has_container or any( + rg.container_image is not None for rg in self.executor.resource_group + ) + if not has_container: + actual_job_dir = os.path.join(slurm_job_dir, job_directory_name) + sbatch_script = sbatch_script.replace(f"/{RUNDIR_NAME}", actual_job_dir) + return sbatch_script def __repr__(self) -> str: diff --git a/nemo_run/run/torchx_backend/packaging.py b/nemo_run/run/torchx_backend/packaging.py index 84b9dc4c..55c11f64 100644 --- a/nemo_run/run/torchx_backend/packaging.py +++ b/nemo_run/run/torchx_backend/packaging.py @@ -15,6 +15,7 @@ import logging import os +from pathlib import Path from typing import Iterator, Optional, Type, Union import fiddle as fdl @@ -120,9 +121,23 @@ def _get_details_from_script(fn_or_script: Script, serialize_configs: bool): log.warning(f"Failed saving yaml configs due to: {e}") args = fn_or_script.args + + # For SlurmExecutor without container, substitute /{RUNDIR_NAME} paths + # with actual cluster paths in inline scripts + substitute_rundir_path = None + if ( + isinstance(executor, SlurmExecutor) + and executor.container_image is None + and executor.tunnel is not None + ): + substitute_rundir_path = os.path.join( + executor.tunnel.job_dir, Path(executor.job_dir).name + ) + role_args = fn_or_script.to_command( filename=os.path.join(executor.job_dir, SCRIPTS_DIR, f"{name}.sh"), is_local=True if isinstance(executor, LocalExecutor) else False, + substitute_rundir_path=substitute_rundir_path, ) m = fn_or_script.path if fn_or_script.m else None no_python = fn_or_script.entrypoint != "python" diff --git a/test/core/execution/artifacts/dummy_slurm.sh b/test/core/execution/artifacts/dummy_slurm.sh index 1673f3f0..94cb9b1a 100644 --- a/test/core/execution/artifacts/dummy_slurm.sh +++ b/test/core/execution/artifacts/dummy_slurm.sh @@ -33,7 +33,7 @@ export ENV_VAR=value # Command 1 -srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4 +srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 cmd3 cmd4 exitcode=$? diff --git a/test/core/execution/artifacts/ft_slurm.sh b/test/core/execution/artifacts/ft_slurm.sh index 59b15123..173e97d1 100644 --- a/test/core/execution/artifacts/ft_slurm.sh +++ b/test/core/execution/artifacts/ft_slurm.sh @@ -62,7 +62,7 @@ echo "$SLURM_JOB_ID ${SLURM_RESTART_COUNT:-0} X" >> "$JOB_RESULTS_FILE" # Command 1 -srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh +srun --output /root/sample_job/log-account-account.sample_job_%j_${SLURM_RESTART_COUNT:-0}.out --container-image test_image --container-mounts /root/sample_job:/nemo_run --container-workdir /nemo_run/code --wait=60 --kill-on-bad-exit=1 ft_launcher --ft-param-workload_check_interval 10 --ft-param-rank_heartbeat_timeout 10 --rdzv-backend c10d --rdzv-endpoint localhost:0 --rdzv-id 7680 --nnodes 1 --nproc-per-node 1 --node-rank 0 --tee 3 --no-python test_ft.sh exitcode=$? diff --git a/test/core/execution/test_slurm.py b/test/core/execution/test_slurm.py index 5f194df0..0e79eec6 100644 --- a/test/core/execution/test_slurm.py +++ b/test/core/execution/test_slurm.py @@ -18,8 +18,10 @@ import pytest +from nemo_run.config import RUNDIR_NAME from nemo_run.core.execution.launcher import SlurmTemplate, Torchrun from nemo_run.core.execution.slurm import ( + SlurmBatchRequest, SlurmExecutor, SlurmJobDetails, SlurmTunnelCallback, @@ -403,3 +405,148 @@ def test_merge_mismatch(self): [SlurmExecutor(account="account1"), SlurmExecutor(account="account2")], num_tasks=3, ) + + +class TestSlurmBatchRequestNonContainerMode: + """Tests for non-container mode support (container_image=None).""" + + @pytest.fixture + def executor_with_container(self): + """Create an executor with container image.""" + executor = SlurmExecutor( + account="test_account", + partition="gpu", + nodes=2, + ntasks_per_node=8, + container_image="nvcr.io/nvidia/pytorch:24.01-py3", + container_mounts=["/data:/data"], + ) + executor.job_name = "test-job" + executor.experiment_dir = "/local/experiments" + executor.job_dir = "/local/experiments/test-job" + executor.experiment_id = "exp-123" + + # Mock tunnel + tunnel = MagicMock(spec=LocalTunnel) + tunnel.job_dir = "/remote/experiments/exp-123" + executor.tunnel = tunnel + + return executor + + @pytest.fixture + def executor_without_container(self): + """Create an executor without container image (non-container mode).""" + executor = SlurmExecutor( + account="test_account", + partition="gpu", + nodes=2, + ntasks_per_node=8, + container_image=None, # Non-container mode + ) + executor.job_name = "test-job" + executor.experiment_dir = "/local/experiments" + executor.job_dir = "/local/experiments/test-job" + executor.experiment_id = "exp-123" + + # Mock tunnel + tunnel = MagicMock(spec=LocalTunnel) + tunnel.job_dir = "/remote/experiments/exp-123" + executor.tunnel = tunnel + + return executor + + def test_materialize_with_container_uses_container_flags(self, executor_with_container): + """Test that materialize uses container flags when container_image is set.""" + request = SlurmBatchRequest( + launch_cmd=["sbatch", "--parsable"], + jobs=["test-job"], + command_groups=[["python train.py"]], + executor=executor_with_container, + max_retries=0, + extra_env={}, + ) + + script = request.materialize() + + # Should contain container flags + assert "--container-image" in script + assert "--container-mounts" in script + assert "--container-workdir" in script + # Should NOT contain --chdir (used for non-container mode) + assert "--chdir" not in script + # Should contain /nemo_run paths (not substituted) + assert f"/{RUNDIR_NAME}" in script + + def test_materialize_without_container_uses_chdir(self, executor_without_container): + """Test that materialize uses --chdir when container_image is None.""" + request = SlurmBatchRequest( + launch_cmd=["sbatch", "--parsable"], + jobs=["test-job"], + command_groups=[["python train.py"]], + executor=executor_without_container, + max_retries=0, + extra_env={}, + ) + + script = request.materialize() + + # Should contain --chdir flag for working directory + assert "--chdir" in script + # Should NOT contain container flags + assert "--container-image" not in script + assert "--container-mounts" not in script + assert "--container-workdir" not in script + + def test_materialize_without_container_substitutes_rundir_paths( + self, executor_without_container + ): + """Test that /{RUNDIR_NAME} paths are substituted with actual paths in non-container mode.""" + request = SlurmBatchRequest( + launch_cmd=["sbatch", "--parsable"], + jobs=["test-job"], + command_groups=[["python train.py"]], + executor=executor_without_container, + max_retries=0, + extra_env={}, + ) + + script = request.materialize() + + # Should NOT contain /nemo_run paths (should be substituted) + assert f"/{RUNDIR_NAME}/code" not in script + # Should contain the actual job directory path + actual_job_dir = "/remote/experiments/exp-123/test-job" + assert f"{actual_job_dir}/code" in script + + def test_materialize_with_container_preserves_rundir_paths(self, executor_with_container): + """Test that /{RUNDIR_NAME} paths are NOT substituted when using container.""" + request = SlurmBatchRequest( + launch_cmd=["sbatch", "--parsable"], + jobs=["test-job"], + command_groups=[["python train.py"]], + executor=executor_with_container, + max_retries=0, + extra_env={}, + ) + + script = request.materialize() + + # Should contain /nemo_run paths (not substituted for container mode) + assert f"/{RUNDIR_NAME}" in script + + def test_non_container_mode_chdir_points_to_code_directory(self, executor_without_container): + """Test that --chdir in non-container mode points to the code directory.""" + request = SlurmBatchRequest( + launch_cmd=["sbatch", "--parsable"], + jobs=["test-job"], + command_groups=[["python train.py"]], + executor=executor_without_container, + max_retries=0, + extra_env={}, + ) + + script = request.materialize() + + # The --chdir should point to {job_dir}/code + expected_chdir = "--chdir /remote/experiments/exp-123/test-job/code" + assert expected_chdir in script diff --git a/test/core/execution/test_slurm_templates.py b/test/core/execution/test_slurm_templates.py index a0269543..5457563d 100644 --- a/test/core/execution/test_slurm_templates.py +++ b/test/core/execution/test_slurm_templates.py @@ -54,6 +54,7 @@ def dummy_slurm_request_with_artifact( account="account", job_dir="/root/sample_job", tunnel=LocalTunnel(job_dir="/root"), + container_image="test_image", ) slurm_config.job_name = "sample_job" max_retries = 3 @@ -79,6 +80,7 @@ def ft_slurm_request_with_artifact( account="account", job_dir="/root/sample_job", tunnel=LocalTunnel(job_dir="/root/"), + container_image="test_image", ) slurm_config.job_name = "sample_job" slurm_config.launcher = FaultTolerance( diff --git a/test/run/torchx_backend/test_packaging.py b/test/run/torchx_backend/test_packaging.py index 9343c637..e0bff87b 100644 --- a/test/run/torchx_backend/test_packaging.py +++ b/test/run/torchx_backend/test_packaging.py @@ -14,15 +14,18 @@ # limitations under the License. from dataclasses import dataclass +from unittest.mock import MagicMock import pytest from torchx import specs -from nemo_run.config import Partial, Script +from nemo_run.config import RUNDIR_NAME, Partial, Script from nemo_run.core.execution.base import Executor from nemo_run.core.execution.launcher import FaultTolerance, Torchrun from nemo_run.core.execution.local import LocalExecutor +from nemo_run.core.execution.slurm import SlurmExecutor from nemo_run.core.packaging.base import Packager +from nemo_run.core.tunnel.client import LocalTunnel from nemo_run.run.torchx_backend.packaging import ( merge_executables, package, @@ -304,3 +307,132 @@ def test_merge_executables(): assert len(merged_app_def.roles) == 2 assert merged_app_def.roles[0].name == "role1" assert merged_app_def.roles[1].name == "role2" + + +class TestPackagingNonContainerMode: + """Tests for non-container mode path substitution in packaging.""" + + def test_package_script_inline_with_slurm_non_container_mode(self, tmp_path): + """Test that inline scripts have /nemo_run paths substituted in non-container mode.""" + # Create a SlurmExecutor without container + executor = SlurmExecutor( + account="test_account", + partition="gpu", + nodes=1, + ntasks_per_node=8, + container_image=None, # Non-container mode + ) + executor.job_dir = str(tmp_path / "test-job") + executor.experiment_id = "exp-123" + + # Mock tunnel + tunnel = MagicMock(spec=LocalTunnel) + tunnel.job_dir = "/remote/experiments/exp-123" + executor.tunnel = tunnel + + # Create an inline script with /nemo_run paths + fn_or_script = Script( + inline=f"cd /{RUNDIR_NAME}/code && python /{RUNDIR_NAME}/scripts/run.py" + ) + + app_def = package( + name="test", + fn_or_script=fn_or_script, + executor=executor, + ) + + assert app_def.name == "test" + assert len(app_def.roles) == 1 + + # Read the generated script file and verify paths were substituted + script_file = tmp_path / "test-job" / "scripts" / "test.sh" + assert script_file.exists() + + content = script_file.read_text() + actual_job_dir = "/remote/experiments/exp-123/test-job" + + # Should NOT contain /nemo_run paths + assert f"/{RUNDIR_NAME}" not in content + # Should contain the actual job directory path + assert f"{actual_job_dir}/code" in content + assert f"{actual_job_dir}/scripts/run.py" in content + + def test_package_script_inline_with_slurm_container_mode(self, tmp_path): + """Test that inline scripts preserve /nemo_run paths in container mode.""" + # Create a SlurmExecutor with container + executor = SlurmExecutor( + account="test_account", + partition="gpu", + nodes=1, + ntasks_per_node=8, + container_image="nvcr.io/nvidia/pytorch:24.01-py3", + ) + executor.job_dir = str(tmp_path / "test-job") + executor.experiment_id = "exp-123" + + # Mock tunnel + tunnel = MagicMock(spec=LocalTunnel) + tunnel.job_dir = "/remote/experiments/exp-123" + executor.tunnel = tunnel + + # Create an inline script with /nemo_run paths + fn_or_script = Script( + inline=f"cd /{RUNDIR_NAME}/code && python /{RUNDIR_NAME}/scripts/run.py" + ) + + app_def = package( + name="test", + fn_or_script=fn_or_script, + executor=executor, + ) + + assert app_def.name == "test" + assert len(app_def.roles) == 1 + + # Read the generated script file and verify paths were NOT substituted + script_file = tmp_path / "test-job" / "scripts" / "test.sh" + assert script_file.exists() + + content = script_file.read_text() + + # Should contain /nemo_run paths (not substituted) + assert f"/{RUNDIR_NAME}/code" in content + assert f"/{RUNDIR_NAME}/scripts/run.py" in content + + def test_package_script_path_not_affected_by_non_container_mode(self, tmp_path): + """Test that path-based scripts are not affected by non-container mode substitution.""" + # Create a SlurmExecutor without container + executor = SlurmExecutor( + account="test_account", + partition="gpu", + nodes=1, + ntasks_per_node=8, + container_image=None, # Non-container mode + ) + executor.job_dir = str(tmp_path / "test-job") + executor.experiment_id = "exp-123" + + # Mock tunnel + tunnel = MagicMock(spec=LocalTunnel) + tunnel.job_dir = "/remote/experiments/exp-123" + executor.tunnel = tunnel + + # Create a path-based script (not inline) + fn_or_script = Script( + path="test.py", + args=["--config", f"/{RUNDIR_NAME}/configs/config.yaml"], + ) + + app_def = package( + name="test", + fn_or_script=fn_or_script, + executor=executor, + ) + + assert app_def.name == "test" + assert len(app_def.roles) == 1 + role = app_def.roles[0] + + # Path-based scripts don't write files, so args should remain unchanged + # (the substitution only affects inline script file content) + assert role.args == ["test.py", "--config", f"/{RUNDIR_NAME}/configs/config.yaml"] diff --git a/test/test_config.py b/test/test_config.py index 8ad473ef..cbf3054c 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -391,6 +391,45 @@ def test_inline_script(self): "\"echo 'test'\"", ] + def test_to_command_substitute_rundir_path(self, tmp_path): + """Test that substitute_rundir_path substitutes /nemo_run paths in inline scripts.""" + from nemo_run.config import RUNDIR_NAME + + script = Script(inline=f"cd /{RUNDIR_NAME}/code && python /{RUNDIR_NAME}/scripts/run.py") + filename = str(tmp_path / "test_script.sh") + + script.to_command( + filename=filename, + substitute_rundir_path="/remote/experiments/exp-123/test-job", + ) + + # Read the file and verify paths were substituted + with open(filename) as f: + content = f.read() + + assert f"/{RUNDIR_NAME}" not in content + assert "/remote/experiments/exp-123/test-job/code" in content + assert "/remote/experiments/exp-123/test-job/scripts/run.py" in content + + def test_to_command_without_substitute_rundir_path(self, tmp_path): + """Test that paths are NOT substituted when substitute_rundir_path is None.""" + from nemo_run.config import RUNDIR_NAME + + script = Script(inline=f"cd /{RUNDIR_NAME}/code && python /{RUNDIR_NAME}/scripts/run.py") + filename = str(tmp_path / "test_script.sh") + + script.to_command( + filename=filename, + substitute_rundir_path=None, # Default, no substitution + ) + + # Read the file and verify paths were NOT substituted + with open(filename) as f: + content = f.read() + + assert f"/{RUNDIR_NAME}/code" in content + assert f"/{RUNDIR_NAME}/scripts/run.py" in content + class TestGetUnderlyingTypes: def test_simple_type(self):