Skip to content

Commit b378bcc

Browse files
committed
multigpu ci test
1 parent 26875a5 commit b378bcc

File tree

3 files changed

+141
-6
lines changed

3 files changed

+141
-6
lines changed

.github/scripts/filter-matrix.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,41 @@ def filter_matrix_item(
6060
return True
6161

6262

63+
def create_distributed_config(
    item: Dict[str, Any],
    *,
    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
    num_gpus: int = 2,
) -> Dict[str, Any]:
    """Create a distributed test configuration from a regular config.

    The input config is deep-copied and the copy is adjusted for
    distributed testing:

    - ``validation_runner`` is overridden with ``runner``
    - ``num_gpus`` is added
    - ``config`` is set to ``"distributed"``

    Debug information about the original and the modified config is written
    to stderr, so stdout is left untouched by this function.

    Args:
        item: A build-matrix entry. Only ``python_version``, ``desired_cuda``
            and ``validation_runner`` are read (for debug output); missing
            keys are reported as ``None``.
        runner: Runner label to use for the distributed job.
        num_gpus: Number of GPUs recorded in the distributed config.

    Returns:
        A new dict; ``item`` itself, including any nested values, is left
        unchanged.
    """
    import copy
    import sys

    # Deep copy so nested lists/dicts are not shared with the original entry,
    # which is still emitted in the regular build matrix.
    dist_item = copy.deepcopy(item)

    # Debug: show the original config.
    print("[DEBUG] Creating distributed config from:", file=sys.stderr)
    print(f"[DEBUG] Python: {item.get('python_version')}", file=sys.stderr)
    print(f"[DEBUG] CUDA: {item.get('desired_cuda')}", file=sys.stderr)
    print(
        f"[DEBUG] Original runner: {item.get('validation_runner')}", file=sys.stderr
    )

    # Override the runner to use a multi-GPU instance.
    dist_item["validation_runner"] = runner

    # Add distributed-specific fields.
    dist_item["num_gpus"] = num_gpus
    dist_item["config"] = "distributed"

    # Debug: show the modified config.
    print(f"[DEBUG] New runner: {dist_item['validation_runner']}", file=sys.stderr)
    print(f"[DEBUG] GPUs: {dist_item['num_gpus']}", file=sys.stderr)

    return dist_item
96+
97+
6398
def main(args: list[str]) -> None:
6499
parser = argparse.ArgumentParser()
65100
parser.add_argument(
@@ -99,16 +134,69 @@ def main(args: list[str]) -> None:
99134

100135
includes = matrix_dict["include"]
101136
filtered_includes = []
137+
distributed_includes = [] # NEW: separate list for distributed configs
138+
139+
print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
102140

103141
for item in includes:
142+
py_ver = item.get("python_version", "unknown")
143+
cuda_ver = item.get("desired_cuda", "unknown")
144+
145+
print(f"[DEBUG] Checking config: py={py_ver}, cuda={cuda_ver}", file=sys.stderr)
146+
104147
if filter_matrix_item(
105148
item,
106149
options.jetpack == "true",
107150
options.limit_pr_builds == "true",
108151
):
152+
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
109153
filtered_includes.append(item)
110154

111-
filtered_matrix_dict = {"include": filtered_includes}
155+
# NEW: Create distributed variant for specific configs
156+
# Only Python 3.10 + CUDA 13.0 for now
157+
if item["python_version"] == "3.10" and item["desired_cuda"] == "cu130":
158+
print(
159+
f"[DEBUG] Creating distributed config for py3.10+cu130",
160+
file=sys.stderr,
161+
)
162+
distributed_includes.append(create_distributed_config(item))
163+
else:
164+
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
165+
166+
# Debug: Show summary
167+
print(f"[DEBUG] Final counts:", file=sys.stderr)
168+
print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
169+
print(
170+
f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr
171+
)
172+
173+
# Debug: Show which configs will be built
174+
print(
175+
f"[DEBUG] Configs that will be BUILT (in filtered_includes):", file=sys.stderr
176+
)
177+
for item in filtered_includes:
178+
print(
179+
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}",
180+
file=sys.stderr,
181+
)
182+
183+
print(
184+
f"[DEBUG] Configs for DISTRIBUTED TESTS (in distributed_includes):",
185+
file=sys.stderr,
186+
)
187+
for item in distributed_includes:
188+
print(
189+
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}, gpus={item.get('num_gpus')}",
190+
file=sys.stderr,
191+
)
192+
193+
# NEW: Output both regular and distributed configs
194+
filtered_matrix_dict = {
195+
"include": filtered_includes,
196+
"distributed_include": distributed_includes, # NEW field
197+
}
198+
199+
# Output to stdout (consumed by GitHub Actions)
112200
print(json.dumps(filtered_matrix_dict))
113201

114202

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,11 @@ jobs:
6868
ref: ""
6969
test-infra-repository: pytorch/test-infra
7070
test-infra-ref: main
71-
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
71+
# Extract the include array from filter-matrix output
72+
build-matrix: |
73+
{
74+
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
75+
}
7276
pre-script: ${{ matrix.pre-script }}
7377
env-var-script: ${{ matrix.env-var-script }}
7478
post-script: ${{ matrix.post-script }}
@@ -498,18 +502,50 @@ jobs:
498502
ref: ""
499503
test-infra-repository: pytorch/test-infra
500504
test-infra-ref: main
501-
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
505+
# Extract the distributed_include array from filter-matrix output
506+
build-matrix: |
507+
{
508+
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
509+
}
502510
pre-script: ${{ matrix.pre-script }}
503511
script: |
504512
set -euo pipefail
513+
514+
# Debug: Show what config we're using
515+
echo "=========================================="
516+
echo "DISTRIBUTED TEST CONFIGURATION"
517+
echo "=========================================="
518+
echo "Python version: ${PYTHON_VERSION}"
519+
echo "CUDA version: ${CU_VERSION}"
520+
echo "Runner: ${{ matrix.validation_runner }}"
521+
echo "Num GPUs: ${{ matrix.num_gpus }}"
522+
echo "Config: ${{ matrix.config }}"
523+
echo "=========================================="
524+
525+
# Verify GPUs are available
526+
echo "Checking GPU availability:"
527+
nvidia-smi
528+
echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
529+
echo "=========================================="
530+
505531
export USE_HOST_DEPS=1
506532
export CI_BUILD=1
507533
export USE_TRTLLM_PLUGINS=1
534+
535+
# Install MPI (required for TensorRT-LLM plugins)
536+
echo "Installing MPI..."
508537
dnf install -y mpich mpich-devel openmpi openmpi-devel
538+
539+
# Run distributed tests
509540
pushd .
510-
cd tests/py
511-
cd dynamo
512-
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
541+
cd tests/py/dynamo
542+
543+
echo "Running distributed tests with mpirun..."
544+
mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
545+
python -m pytest -ra \
546+
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
547+
distributed/test_nccl_ops.py
548+
513549
popd
514550
515551
concurrency:

.github/workflows/build_linux.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,17 @@ jobs:
160160
options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
161161
timeout-minutes: ${{ inputs.timeout }}
162162
steps:
163+
- name: Debug matrix configuration
164+
shell: bash
165+
run: |
166+
echo "=========================================="
167+
echo "BUILD MATRIX DEBUG"
168+
echo "=========================================="
169+
echo "Python version: ${{ matrix.python_version }}"
170+
echo "CUDA version: ${{ matrix.desired_cuda }}"
171+
echo "GPU arch type: ${{ matrix.gpu_arch_type }}"
172+
echo "Runner: ${{ matrix.validation_runner }}"
173+
echo "=========================================="
163174
- name: Clean workspace
164175
shell: bash -l {0}
165176
run: |

0 commit comments

Comments
 (0)