
Commit 10e90cb

allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support (#2029)
Summary: As title. Currently, pip install from a GB200 machine won't work because we don't publish an ARM build to PyPI.

Differential Revision: D88202637
1 parent 2c0ad12 commit 10e90cb

File tree

8 files changed: +401 −60 lines

.github/workflows/build-cpu.yml

Lines changed: 2 additions & 1 deletion

@@ -20,13 +20,14 @@ jobs:
     with:
       timeout: 60
       runner: ${{ matrix.runs-on }}
+      docker-image: "pytorch/manylinux2_28-builder:cpu"
       submodules: recursive
       upload-artifact: monarch-cpu-${{ github.sha }}
       script: |
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup build environment (conda + system deps + rust + build deps)
+        # Setup build environment (manylinux Python + system deps + rust)
         setup_build_environment

         # Install torch nightly (CPU version)
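
Note: the setup_build_environment helper lives in scripts/common-setup.sh, which this commit also touches but which is not shown in this view. Its body isn't visible here, so the following is only a minimal sketch of how picking a CPython from a manylinux builder image typically works; the function name comes from the diff comments, everything else is illustrative:

    # Illustrative sketch only -- the real setup_build_environment in
    # scripts/common-setup.sh is not shown in this diff view.
    setup_build_environment() {
        local py_version="${1:-3.12}"
        local cp_tag="cp${py_version/./}"   # e.g. "3.12" -> "cp312"
        # manylinux images ship interpreters under /opt/python/<tag>-<tag>/bin
        export PATH="/opt/python/${cp_tag}-${cp_tag}/bin:${PATH}"
        python --version
    }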

.github/workflows/build-cuda.yml

Lines changed: 3 additions & 2 deletions

@@ -31,7 +31,7 @@ jobs:
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup build environment (conda + system deps + rust + build deps)
+        # Setup build environment (manylinux Python + system deps + rust)
         setup_build_environment

         # Install torch nightly
@@ -41,7 +41,8 @@ jobs:
         # Setup Tensor Engine
         setup_tensor_engine

-        export CUDA_LIB_DIR=/usr/lib64
+        # Setup CUDA environment (detects CUDA paths automatically)
+        setup_cuda_environment

         # Build monarch (CUDA version)
         python setup.py bdist_wheel
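
setup_cuda_environment replaces the hard-coded export CUDA_LIB_DIR=/usr/lib64, which only holds on x86_64 hosts. Its definition is not part of this view; a plausible sketch, mirroring the candidate-path list this commit adds to build_utils/src/lib.rs further down, would be:

    # Hypothetical sketch of setup_cuda_environment: probe the same candidate
    # directories that build_utils/src/lib.rs checks and export the first hit.
    setup_cuda_environment() {
        local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
        local subdir
        for subdir in lib64 lib targets/x86_64-linux/lib targets/aarch64-linux/lib \
                      targets/sbsa-linux/lib lib/aarch64-linux-gnu lib/x86_64-linux-gnu; do
            if [ -d "${cuda_home}/${subdir}" ]; then
                export CUDA_LIB_DIR="${cuda_home}/${subdir}"
                break
            fi
        done
        echo "CUDA_LIB_DIR=${CUDA_LIB_DIR:-not found}"
    }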

.github/workflows/doc_build.yml

Lines changed: 4 additions & 2 deletions

@@ -26,7 +26,7 @@ jobs:
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup build environment (conda + system deps + rust + build deps)
+        # Setup build environment (manylinux Python + system deps + rust)
         # docs build will use 3.13
         setup_build_environment 3.13

@@ -46,7 +46,9 @@ jobs:
         export USE_TENSOR_ENGINE=1
         export RUSTFLAGS="-Zthreads=16 ${RUSTFLAGS:-}"
         export _GLIBCXX_USE_CXX11_ABI=1
-        export CUDA_LIB_DIR=/usr/lib64
+
+        # Setup CUDA environment (detects CUDA paths automatically)
+        setup_cuda_environment

         # Build Monarch completely for documentation - use dedicated script
         ./scripts/build_monarch_for_docs.sh

.github/workflows/publish_release.yml

Lines changed: 23 additions & 10 deletions

@@ -12,26 +12,38 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         # TODO add 3.14 once we figure out py03 issue
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.8-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.8"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.8
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.8-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.8" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -44,17 +56,18 @@ jobs:
         # Setup Tensor Engine dependencies
         setup_tensor_engine

+        # Setup CUDA environment (detects CUDA paths automatically for both x86_64 and aarch64)
+        setup_cuda_environment
+
         cargo install --path monarch_hyperactor

         # Build wheel
         export MONARCH_PACKAGE_NAME="torchmonarch"
-        export CUDA_LIB_DIR=/usr/lib64
         export MONARCH_VERSION="${{ github.event.inputs.version }}"
         python setup.py bdist_wheel

-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"

         # Run tests
         install_python_test_dependencies
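
retag_wheel_platform replaces the old find/mv rename, which changed only the filename and left the WHEEL metadata inside the archive still claiming a plain linux platform. The helper's definition is in scripts/common-setup.sh and is not shown in this view; a minimal sketch using the wheel package's tags subcommand (available in wheel >= 0.40) could look like:

    # Illustrative sketch -- the real retag_wheel_platform lives in
    # scripts/common-setup.sh and is not shown in this diff view.
    retag_wheel_platform() {
        local platform_tag="$1"   # e.g. manylinux2014_aarch64
        pip install -q 'wheel>=0.40'
        # 'wheel tags' rewrites both the filename and the WHEEL metadata;
        # --remove deletes the original linux_* wheel after retagging.
        wheel tags --platform-tag "${platform_tag}" --remove dist/*.whl
        ls -la dist/
    }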

.github/workflows/test-cpu-python.yml

Lines changed: 3 additions & 2 deletions

@@ -19,14 +19,15 @@ jobs:
     with:
       timeout: 60
       runner: linux.4xlarge
+      docker-image: "pytorch/manylinux2_28-builder:cpu"
       submodules: recursive
       download-artifact: ${{ inputs.artifact-name }}
       script: |
         # Source common setup functions
         source scripts/common-setup.sh

-        # Setup test environment
-        setup_conda_environment
+        # Setup test environment (uses manylinux Python)
+        setup_test_environment

         # Disable tensor engine
         export USE_TENSOR_ENGINE=0

.github/workflows/wheels.yml

Lines changed: 23 additions & 10 deletions

@@ -13,25 +13,37 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.6-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.6
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.6-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.6" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -44,18 +56,19 @@ jobs:
         # Setup Tensor Engine dependencies
         setup_tensor_engine

+        # Setup CUDA environment (detects CUDA paths automatically for both x86_64 and aarch64)
+        setup_cuda_environment
+
         cargo install --path monarch_hyperactor

         # Build wheel
         export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
         export MONARCH_VERSION=$(date +'%Y.%m.%d')
-        export CUDA_LIB_DIR=/usr/lib64

         python setup.py bdist_wheel

-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"

         # Run tests
         install_python_test_dependencies
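
Together with the publish step, this is what makes the scenario in the commit summary work: once an aarch64 wheel tagged manylinux2014_aarch64 is on PyPI, pip on a GB200 host can select it. An illustrative check (the package name torchmonarch-nightly is taken from the workflow above):

    # On an aarch64 (GB200) host, pip only accepts wheels whose platform tag
    # matches the interpreter platform -- previously only x86_64 was published.
    python -c "import sysconfig; print(sysconfig.get_platform())"  # linux-aarch64
    pip install torchmonarch-nightly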

build_utils/src/lib.rs

Lines changed: 30 additions & 6 deletions

@@ -152,17 +152,33 @@ pub fn discover_cuda_config() -> Result<CudaConfig, BuildError> {
     };

     // Add standard include directories
-    // Check both old-style (include) and new-style (targets/x86_64-linux/include) CUDA installations
-    for include_subdir in &["include", "targets/x86_64-linux/include"] {
+    // Check both old-style (include) and new-style target-specific paths
+    // Support both x86_64 and aarch64/ARM architectures
+    for include_subdir in &[
+        "include",
+        "targets/x86_64-linux/include",
+        "targets/aarch64-linux/include",
+        "targets/sbsa-linux/include",
+    ] {
         let include_dir = cuda_home_path.join(include_subdir);
         if include_dir.exists() {
             config.include_dirs.push(include_dir);
         }
     }

     // Add standard library directories
-    // Check both old-style (lib64, lib) and new-style (targets/x86_64-linux/lib) CUDA installations
-    for lib_subdir in &["lib64", "lib", "lib/x64", "targets/x86_64-linux/lib"] {
+    // Check both old-style and new-style CUDA installations for both x86_64 and aarch64
+    // Try architecture-specific paths first, then generic paths
+    for lib_subdir in &[
+        "lib64",                     // Common x86_64 location
+        "lib",                       // Common aarch64 location
+        "lib/x64",                   // Windows x64
+        "targets/x86_64-linux/lib",  // CUDA toolkit x86_64
+        "targets/aarch64-linux/lib", // CUDA toolkit aarch64
+        "targets/sbsa-linux/lib",    // CUDA toolkit ARM server
+        "lib/aarch64-linux-gnu",     // Debian/Ubuntu aarch64
+        "lib/x86_64-linux-gnu",      // Debian/Ubuntu x86_64
+    ] {
         let lib_dir = cuda_home_path.join(lib_subdir);
         if lib_dir.exists() {
             config.lib_dirs.push(lib_dir);
@@ -201,8 +217,16 @@ pub fn get_cuda_lib_dir() -> Result<String, BuildError> {
     // Try to deduce from CUDA configuration
     let cuda_config = discover_cuda_config()?;
     if let Some(cuda_home) = cuda_config.cuda_home {
-        // Check both old-style and new-style CUDA library paths
-        for lib_subdir in &["lib64", "lib", "targets/x86_64-linux/lib"] {
+        // Check both x86_64 and aarch64 CUDA library paths
+        for lib_subdir in &[
+            "lib64",                     // Common x86_64 location
+            "lib",                       // Common aarch64 location
+            "targets/x86_64-linux/lib",  // CUDA toolkit x86_64
+            "targets/aarch64-linux/lib", // CUDA toolkit aarch64
+            "targets/sbsa-linux/lib",    // CUDA toolkit ARM server
+            "lib/aarch64-linux-gnu",     // Debian/Ubuntu aarch64
+            "lib/x86_64-linux-gnu",      // Debian/Ubuntu x86_64
+        ] {
             let lib_path = cuda_home.join(lib_subdir);
             if lib_path.exists() {
                 return Ok(lib_path.to_string_lossy().to_string());
