Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions nemo_run/run/ray/templates/ray_enroot.sub.j2
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ rm -f $LOG_DIR/ENDED
# Number of GPUs per node
gpus_per_node=8

# Number of CPUs allocated per Ray node (applied to the head node's srun as well as each worker's)
CPUS_PER_WORKER=${CPUS_PER_WORKER:-112}

num_retries={{ num_retries }}

# Track backgrounded srun client PIDs for head and workers
Expand Down Expand Up @@ -279,7 +282,7 @@ touch $LOG_DIR/ENDED
exit 1
EOF
)
srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
SRUN_PIDS["ray-head"]=$!

# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
Expand Down Expand Up @@ -380,7 +383,7 @@ EOF
if [[ $i -eq 0 ]]; then
OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
fi
srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
SRUN_PIDS["ray-worker-$i"]=$!
sleep 3
done
Expand Down
7 changes: 5 additions & 2 deletions test/core/execution/artifacts/expected_ray_cluster_enroot.sub
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ rm -f $LOG_DIR/ENDED
# Number of GPUs per node
gpus_per_node=8

# Number of CPUs allocated per Ray node (applied to the head node's srun as well as each worker's)
CPUS_PER_WORKER=${CPUS_PER_WORKER:-112}

num_retries=1

# Track backgrounded srun client PIDs for head and workers
Expand Down Expand Up @@ -273,7 +276,7 @@ touch $LOG_DIR/ENDED
exit 1
EOF
)
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
SRUN_PIDS["ray-head"]=$!

# Helper function to get container PID using enroot (workaround for --overlap --container-name bug)
Expand Down Expand Up @@ -370,7 +373,7 @@ EOF
if [[ $i -eq 0 ]]; then
OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
fi
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
SRUN_PIDS["ray-worker-$i"]=$!
sleep 3
done
Expand Down
Loading