@@ -536,7 +536,7 @@ jobs:
536536
537537 L2-dynamo-distributed-tests :
538538 name : L2 dynamo distributed tests
539- needs : [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests ]
539+ needs : [filter-matrix, build]
540540 if : ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
541541 strategy :
542542 fail-fast : false
@@ -560,6 +560,8 @@ jobs:
560560 "include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
561561 }
562562 pre-script : ${{ matrix.pre-script }}
563+ post-script : ${{ matrix.post-script }}
564+ smoke-test-script : ${{ matrix.smoke-test-script }}
563565 script : |
564566 set -euo pipefail
565567
@@ -569,9 +571,7 @@ jobs:
569571 echo "=========================================="
570572 echo "Python version: ${PYTHON_VERSION}"
571573 echo "CUDA version: ${CU_VERSION}"
572- echo "Runner: ${{ matrix.validation_runner }}"
573- echo "Num GPUs: ${{ matrix.num_gpus }}"
574- echo "Config: ${{ matrix.config }}"
574+ echo "Num GPUs: ${NUM_GPUS}"
575575 echo "=========================================="
576576
577577 # Verify GPUs are available
@@ -586,17 +586,36 @@ jobs:
586586
587587 # Install MPI (required for TensorRT-LLM plugins)
588588 echo "Installing MPI..."
589- dnf install -y mpich mpich-devel openmpi openmpi-devel
589+ dnf install -y openmpi openmpi-devel
590+
591+ # Add OpenMPI to PATH and the loader path (RHEL/AlmaLinux specific location); the :+ form tolerates an unset/empty LD_LIBRARY_PATH under `set -u`
592+ export PATH="/usr/lib64/openmpi/bin:$PATH"
593+ export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
594+
595+ # Verify mpirun is accessible
596+ which mpirun
597+ mpirun --version
590598
591599 # Run distributed tests
592600 pushd .
593601 cd tests/py/dynamo
594602
595603 echo "Running distributed tests with mpirun..."
596- mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
597- python -m pytest -ra \
598- --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
599- distributed/test_nccl_ops.py
604+ echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
605+ echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
606+
607+ # Every rank runs pytest, but only rank 0 (per OMPI_COMM_WORLD_RANK) writes the JUnit XML so ranks do not clobber one file
608+ # The report path is handed to the inner shell as $1 instead of being spliced into the script text, so it always stays a single quoted word
609+ RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
610+ mpirun --allow-run-as-root -n ${NUM_GPUS} \
611+ bash -c '
612+ echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
613+ if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
614+ python -m pytest -ra --junitxml="$1" distributed/test_nccl_ops.py
615+ else
616+ python -m pytest -ra distributed/test_nccl_ops.py
617+ fi
618+ ' bash "${RANK_0_XML}"
600619
601620 popd
602621
0 commit comments