
Commit 10e90cb

allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support (#2029)
Summary: As title. Currently, pip install from a GB200 machine won't work because we don't publish an ARM build to PyPI.

Differential Revision: D88202637
1 parent 2c0ad12 commit 10e90cb

File tree

8 files changed: +401 −60 lines

.github/workflows/build-cpu.yml

Lines changed: 2 additions & 1 deletion

@@ -20,13 +20,14 @@ jobs:
     with:
       timeout: 60
       runner: ${{ matrix.runs-on }}
+      docker-image: "pytorch/manylinux2_28-builder:cpu"
       submodules: recursive
       upload-artifact: monarch-cpu-${{ github.sha }}
       script: |
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup build environment (conda + system deps + rust + build deps)
+        # Setup build environment (manylinux Python + system deps + rust)
         setup_build_environment

         # Install torch nightly (CPU version)
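
Note: the setup_build_environment helper lives in scripts/common-setup.sh, which this commit also touches but which is not shown in this view. Its body isn't visible here, so the following is only a minimal sketch of how picking a CPython from a manylinux builder image typically works; the function name comes from the diff comments, everything else is illustrative:

    # Illustrative sketch only -- the real setup_build_environment in
    # scripts/common-setup.sh is not shown in this diff view.
    setup_build_environment() {
        local py_version="${1:-3.12}"
        local cp_tag="cp${py_version/./}"   # e.g. "3.12" -> "cp312"
        # manylinux images ship interpreters under /opt/python/<tag>-<tag>/bin
        export PATH="/opt/python/${cp_tag}-${cp_tag}/bin:${PATH}"
        python --version
    }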

.github/workflows/build-cuda.yml

Lines changed: 3 additions & 2 deletions

@@ -31,7 +31,7 @@ jobs:
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup build environment (conda + system deps + rust + build deps)
+        # Setup build environment (manylinux Python + system deps + rust)
         setup_build_environment

         # Install torch nightly
@@ -41,7 +41,8 @@ jobs:
         # Setup Tensor Engine
         setup_tensor_engine

-        export CUDA_LIB_DIR=/usr/lib64
+        # Setup CUDA environment (detects CUDA paths automatically)
+        setup_cuda_environment

         # Build monarch (CUDA version)
         python setup.py bdist_wheel
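
setup_cuda_environment replaces the hard-coded export CUDA_LIB_DIR=/usr/lib64, which only holds on x86_64 hosts. Its definition is not part of this view; a plausible sketch, mirroring the candidate-path list this commit adds to build_utils/src/lib.rs further down, would be:

    # Hypothetical sketch of setup_cuda_environment: probe the same candidate
    # directories that build_utils/src/lib.rs checks and export the first hit.
    setup_cuda_environment() {
        local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
        local subdir
        for subdir in lib64 lib targets/x86_64-linux/lib targets/aarch64-linux/lib \
                      targets/sbsa-linux/lib lib/aarch64-linux-gnu lib/x86_64-linux-gnu; do
            if [ -d "${cuda_home}/${subdir}" ]; then
                export CUDA_LIB_DIR="${cuda_home}/${subdir}"
                break
            fi
        done
        echo "CUDA_LIB_DIR=${CUDA_LIB_DIR:-not found}"
    }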

.github/workflows/doc_build.yml

Lines changed: 4 additions & 2 deletions

@@ -26,7 +26,7 @@ jobs:
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup build environment (conda + system deps + rust + build deps)
+        # Setup build environment (manylinux Python + system deps + rust)
         # docs build will use 3.13
         setup_build_environment 3.13

@@ -46,7 +46,9 @@ jobs:
         export USE_TENSOR_ENGINE=1
         export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}"
         export _GLIBCXX_USE_CXX11_ABI=1
-        export CUDA_LIB_DIR=/usr/lib64
+
+        # Setup CUDA environment (detects CUDA paths automatically)
+        setup_cuda_environment

         # Build Monarch completely for documentation - use dedicated script
         ./scripts/build_monarch_for_docs.sh

.github/workflows/publish_release.yml

Lines changed: 23 additions & 10 deletions

@@ -12,26 +12,38 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         # TODO add 3.14 once we figure out py03 issue
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.8-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.8"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.8
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.8-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.8" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -44,17 +56,18 @@ jobs:
         # Setup Tensor Engine dependencies
         setup_tensor_engine

+        # Setup CUDA environment (detects CUDA paths automatically for both x86_64 and aarch64)
+        setup_cuda_environment
+
         cargo install --path monarch_hyperactor

         # Build wheel
         export MONARCH_PACKAGE_NAME="torchmonarch"
-        export CUDA_LIB_DIR=/usr/lib64
         export MONARCH_VERSION="${{ github.event.inputs.version }}"
         python setup.py bdist_wheel

-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"

         # Run tests
         install_python_test_dependencies
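
retag_wheel_platform replaces the old find/mv rename, which changed only the filename and left the WHEEL metadata inside the archive still claiming a plain linux platform. The helper's definition is in scripts/common-setup.sh and is not shown in this view; a minimal sketch using the wheel package's tags subcommand (available in wheel >= 0.40) could look like:

    # Illustrative sketch -- the real retag_wheel_platform lives in
    # scripts/common-setup.sh and is not shown in this diff view.
    retag_wheel_platform() {
        local platform_tag="$1"   # e.g. manylinux2014_aarch64
        pip install -q 'wheel>=0.40'
        # 'wheel tags' rewrites both the filename and the WHEEL metadata;
        # --remove deletes the original linux_* wheel after retagging.
        wheel tags --platform-tag "${platform_tag}" --remove dist/*.whl
        ls -la dist/
    }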

.github/workflows/test-cpu-python.yml

Lines changed: 3 additions & 2 deletions

@@ -19,14 +19,15 @@ jobs:
     with:
       timeout: 60
       runner: linux.4xlarge
+      docker-image: "pytorch/manylinux2_28-builder:cpu"
       submodules: recursive
       download-artifact: ${{ inputs.artifact-name }}
       script: |
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup test environment
-        setup_conda_environment
+        # Setup test environment (uses manylinux Python)
+        setup_test_environment

         # Disable tensor engine
         export USE_TENSOR_ENGINE=0

.github/workflows/wheels.yml

Lines changed: 23 additions & 10 deletions

@@ -13,25 +13,37 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.6-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.6
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.6-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.6" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -44,18 +56,19 @@ jobs:
         # Setup Tensor Engine dependencies
         setup_tensor_engine

+        # Setup CUDA environment (detects CUDA paths automatically for both x86_64 and aarch64)
+        setup_cuda_environment
+
         cargo install --path monarch_hyperactor

         # Build wheel
         export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
         export MONARCH_VERSION=$(date +'%Y.%m.%d')
-        export CUDA_LIB_DIR=/usr/lib64

         python setup.py bdist_wheel

-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"

         # Run tests
         install_python_test_dependencies
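
Together with the publish step, this is what makes the scenario in the commit summary work: once an aarch64 wheel tagged manylinux2014_aarch64 is on PyPI, pip on a GB200 host can select it. An illustrative check (the package name torchmonarch-nightly is taken from the workflow above):

    # On an aarch64 (GB200) host, pip only accepts wheels whose platform tag
    # matches the interpreter platform -- previously only x86_64 was published.
    python -c "import sysconfig; print(sysconfig.get_platform())"  # linux-aarch64
    pip install torchmonarch-nightly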

build_utils/src/lib.rs

Lines changed: 30 additions & 6 deletions

@@ -152,17 +152,33 @@ pub fn discover_cuda_config() -> Result<CudaConfig, BuildError> {
     };

     // Add standard include directories
-    // Check both old-style (include) and new-style (targets/x86_64-linux/include) CUDA installations
-    for include_subdir in &["include", "targets/x86_64-linux/include"] {
+    // Check both old-style (include) and new-style target-specific paths
+    // Support both x86_64 and aarch64/ARM architectures
+    for include_subdir in &[
+        "include",
+        "targets/x86_64-linux/include",
+        "targets/aarch64-linux/include",
+        "targets/sbsa-linux/include",
+    ] {
         let include_dir = cuda_home_path.join(include_subdir);
         if include_dir.exists() {
             config.include_dirs.push(include_dir);
         }
     }

     // Add standard library directories
-    // Check both old-style (lib64, lib) and new-style (targets/x86_64-linux/lib) CUDA installations
-    for lib_subdir in &["lib64", "lib", "lib/x64", "targets/x86_64-linux/lib"] {
+    // Check both old-style and new-style CUDA installations for both x86_64 and aarch64
+    // Try architecture-specific paths first, then generic paths
+    for lib_subdir in &[
+        "lib64",                     // Common x86_64 location
+        "lib",                       // Common aarch64 location
+        "lib/x64",                   // Windows x64
+        "targets/x86_64-linux/lib",  // CUDA toolkit x86_64
+        "targets/aarch64-linux/lib", // CUDA toolkit aarch64
+        "targets/sbsa-linux/lib",    // CUDA toolkit ARM server
+        "lib/aarch64-linux-gnu",     // Debian/Ubuntu aarch64
+        "lib/x86_64-linux-gnu",      // Debian/Ubuntu x86_64
+    ] {
         let lib_dir = cuda_home_path.join(lib_subdir);
         if lib_dir.exists() {
             config.lib_dirs.push(lib_dir);
@@ -201,8 +217,16 @@ pub fn get_cuda_lib_dir() -> Result<String, BuildError> {
     // Try to deduce from CUDA configuration
     let cuda_config = discover_cuda_config()?;
     if let Some(cuda_home) = cuda_config.cuda_home {
-        // Check both old-style and new-style CUDA library paths
-        for lib_subdir in &["lib64", "lib", "targets/x86_64-linux/lib"] {
+        // Check both x86_64 and aarch64 CUDA library paths
+        for lib_subdir in &[
+            "lib64",                     // Common x86_64 location
+            "lib",                       // Common aarch64 location
+            "targets/x86_64-linux/lib",  // CUDA toolkit x86_64
+            "targets/aarch64-linux/lib", // CUDA toolkit aarch64
+            "targets/sbsa-linux/lib",    // CUDA toolkit ARM server
+            "lib/aarch64-linux-gnu",     // Debian/Ubuntu aarch64
+            "lib/x86_64-linux-gnu",      // Debian/Ubuntu x86_64
+        ] {
             let lib_path = cuda_home.join(lib_subdir);
             if lib_path.exists() {
                 return Ok(lib_path.to_string_lossy().to_string());
