diff --git a/nemo_run/run/ray/templates/ray_enroot.sub.j2 b/nemo_run/run/ray/templates/ray_enroot.sub.j2
index b1d9ed22..80f62057 100644
--- a/nemo_run/run/ray/templates/ray_enroot.sub.j2
+++ b/nemo_run/run/ray/templates/ray_enroot.sub.j2
@@ -99,6 +99,9 @@ rm -f $LOG_DIR/ENDED
 # Number of GPUs per node
 gpus_per_node=8
 
+# Number of CPUs per worker node
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-112}
+
 num_retries={{ num_retries }}
 
 # Track backgrounded srun client PIDs for head and workers
@@ -279,7 +282,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
+srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!
 
 # Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -380,7 +383,7 @@ EOF
     if [[ $i -eq 0 ]]; then
         OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
     fi
-    srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
+    srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
     SRUN_PIDS["ray-worker-$i"]=$!
     sleep 3
 done
diff --git a/test/core/execution/artifacts/expected_ray_cluster_enroot.sub b/test/core/execution/artifacts/expected_ray_cluster_enroot.sub
index 89f43bd3..227afe8e 100644
--- a/test/core/execution/artifacts/expected_ray_cluster_enroot.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster_enroot.sub
@@ -101,6 +101,9 @@ rm -f $LOG_DIR/ENDED
 # Number of GPUs per node
 gpus_per_node=8
 
+# Number of CPUs per worker node
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-112}
+
 num_retries=1
 
 # Track backgrounded srun client PIDs for head and workers
@@ -273,7 +276,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!
 
 # Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
@@ -370,7 +373,7 @@ EOF
     if [[ $i -eq 0 ]]; then
         OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
     fi
-    srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+    srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
     SRUN_PIDS["ray-worker-$i"]=$!
     sleep 3
 done
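Reviewer note: this change replaces the hardcoded worker CPU count ($((16 * gpus_per_node)) = 128) with a CPUS_PER_WORKER environment variable defaulting to 112, and applies it to the head node's srun as well (which previously had no --cpus-per-task at all). Because of the ${CPUS_PER_WORKER:-112} default expansion, the count can be overridden at submission time without re-rendering the template. A minimal sketch, assuming the rendered script is saved as ray_cluster.sub (that filename is illustrative, not part of this change):

    # Default: head and each worker launch with --cpus-per-task=112
    sbatch ray_cluster.sub

    # Override for nodes with fewer cores; sbatch --export forwards the
    # variable into the batch script's environment (standard Slurm behavior),
    # where ${CPUS_PER_WORKER:-112} picks it up instead of the default.
    sbatch --export=ALL,CPUS_PER_WORKER=64 ray_cluster.sub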