Skip to content

Commit 93e9198

Browse files
committed
debug prints
1 parent fd8175b commit 93e9198

File tree

3 files changed

+31
-9
lines changed

3 files changed

+31
-9
lines changed

.github/scripts/filter-matrix.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ def main(args: list[str]) -> None:
150150
options.limit_pr_builds == "true",
151151
):
152152
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
153+
# Add tensorrt version to all items (required by linux-test.yml)
154+
item["tensorrt"] = {"version": "10.13.3"}
153155
filtered_includes.append(item)
154156

155157
# NEW: Create distributed variant for specific configs

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ jobs:
536536
537537
L2-dynamo-distributed-tests:
538538
name: L2 dynamo distributed tests
539-
needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests]
539+
needs: [filter-matrix, build]
540540
if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
541541
strategy:
542542
fail-fast: false
@@ -560,6 +560,8 @@ jobs:
560560
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
561561
}
562562
pre-script: ${{ matrix.pre-script }}
563+
post-script: ${{ matrix.post-script }}
564+
smoke-test-script: ${{ matrix.smoke-test-script }}
563565
script: |
564566
set -euo pipefail
565567
@@ -569,9 +571,7 @@ jobs:
569571
echo "=========================================="
570572
echo "Python version: ${PYTHON_VERSION}"
571573
echo "CUDA version: ${CU_VERSION}"
572-
echo "Runner: ${{ matrix.validation_runner }}"
573-
echo "Num GPUs: ${{ matrix.num_gpus }}"
574-
echo "Config: ${{ matrix.config }}"
574+
echo "Num GPUs: ${NUM_GPUS}"
575575
echo "=========================================="
576576
577577
# Verify GPUs are available
@@ -586,17 +586,36 @@ jobs:
586586
587587
# Install MPI (required for TensorRT-LLM plugins)
588588
echo "Installing MPI..."
589-
dnf install -y mpich mpich-devel openmpi openmpi-devel
589+
dnf install -y openmpi openmpi-devel
590+
591+
# Add OpenMPI to PATH and LD_LIBRARY_PATH (RHEL/AlmaLinux-specific location)
592+
export PATH="/usr/lib64/openmpi/bin:$PATH"
593+
export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH"
594+
595+
# Verify mpirun is accessible
596+
which mpirun
597+
mpirun --version
590598
591599
# Run distributed tests
592600
pushd .
593601
cd tests/py/dynamo
594602
595603
echo "Running distributed tests with mpirun..."
596-
mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
597-
python -m pytest -ra \
598-
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
599-
distributed/test_nccl_ops.py
604+
echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
605+
echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
606+
607+
# Use an inline bash wrapper so that only rank 0 writes the JUnit XML
608+
# Each rank runs pytest, but only rank 0 saves results to avoid file conflicts
609+
RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
610+
mpirun --allow-run-as-root -n ${NUM_GPUS} \
611+
bash -c '
612+
echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
613+
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
614+
python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
615+
else
616+
python -m pytest -ra distributed/test_nccl_ops.py
617+
fi
618+
'
600619
601620
popd
602621

.github/workflows/linux-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ jobs:
7474
RUNNER_TEST_RESULTS_DIR: /tmp/test_results
7575
ARCH: ${{ inputs.architecture }}
7676
USE_TRT_RTX: ${{ inputs.use-rtx }}
77+
NUM_GPUS: ${{ matrix.num_gpus || '' }}
7778
DOWNLOAD_ARTIFACT_NAME: pytorch_tensorrt_${{ matrix.tensorrt.version }}_${{ matrix.python_version }}_${{ matrix.desired_cuda }}_${{ inputs.architecture }}
7879
name: ${{ inputs.job-name }}-${{ matrix.tensorrt.version }}-${{ matrix.python_version }}-${{ matrix.desired_cuda }}
7980
runs-on: ${{ matrix.validation_runner }}

0 commit comments

Comments
 (0)