Skip to content

Commit 3dad632

Browse files
allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support (#2029)
Summary: As title - currently pip install from a GB200 machine won't work as we don't publish an ARM build to PyPI. Differential Revision: D88202637
1 parent d274c2a commit 3dad632

File tree

3 files changed

+146
-18
lines changed

3 files changed

+146
-18
lines changed

.github/workflows/publish_release.yml

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,38 @@ concurrency:
1212
cancel-in-progress: true
1313
jobs:
1414
build:
15-
name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
15+
name: ${{ matrix.name }}-py${{ matrix.python-version }}
1616
strategy:
17-
fail-fast: false # Changed to false to see results from all Python versions
17+
fail-fast: false
1818
matrix:
1919
# TODO add 3.14 once we figure out py03 issue
2020
python-version: ["3.10", "3.11", "3.12", "3.13"]
2121
include:
22-
- name: 4xlarge
23-
runs-on: linux.g5.4xlarge.nvidia.gpu
22+
# x86_64 CUDA builds
23+
- name: cuda12.8-x86_64
24+
runner: linux.g5.4xlarge.nvidia.gpu
2425
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
2526
gpu-arch-type: "cuda"
2627
gpu-arch-version: "12.8"
28+
docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.8
29+
platform-tag: "manylinux2014_x86_64"
30+
# aarch64 CUDA builds
31+
- name: cuda12.8-aarch64
32+
runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
33+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
34+
gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
35+
gpu-arch-version: ""
36+
docker-image: "pytorch/manylinuxaarch64-builder:cuda12.8" # ARM-specific image with CUDA
37+
platform-tag: "manylinux2014_aarch64"
2738
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
2839
with:
2940
timeout: 60
30-
runner: ${{ matrix.runs-on }}
41+
runner: ${{ matrix.runner }}
3142
gpu-arch-type: ${{ matrix.gpu-arch-type }}
3243
gpu-arch-version: ${{ matrix.gpu-arch-version }}
44+
docker-image: ${{ matrix.docker-image }}
3345
submodules: recursive
34-
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
46+
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
3547
script: |
3648
source scripts/common-setup.sh
3749
setup_build_environment ${{ matrix.python-version }}
@@ -52,9 +64,8 @@ jobs:
5264
export MONARCH_VERSION="${{ github.event.inputs.version }}"
5365
python setup.py bdist_wheel
5466
55-
# hacky until the right distribution wheel can be made...
56-
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
57-
ls -la dist/
67+
# Properly retag wheel with manylinux platform tag
68+
retag_wheel_platform "${{ matrix.platform-tag }}"
5869
5970
# Run tests
6071
install_python_test_dependencies

.github/workflows/wheels.yml

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,37 @@ concurrency:
1313
cancel-in-progress: true
1414
jobs:
1515
build:
16-
name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
16+
name: ${{ matrix.name }}-py${{ matrix.python-version }}
1717
strategy:
18-
fail-fast: false # Changed to false to see results from all Python versions
18+
fail-fast: false
1919
matrix:
2020
python-version: ["3.10", "3.11", "3.12", "3.13"]
2121
include:
22-
- name: 4xlarge
23-
runs-on: linux.g5.4xlarge.nvidia.gpu
22+
# x86_64 CUDA builds
23+
- name: cuda12.6-x86_64
24+
runner: linux.g5.4xlarge.nvidia.gpu
2425
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
2526
gpu-arch-type: "cuda"
2627
gpu-arch-version: "12.6"
28+
docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.6
29+
platform-tag: "manylinux2014_x86_64"
30+
# aarch64 CUDA builds
31+
- name: cuda12.6-aarch64
32+
runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
33+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
34+
gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
35+
gpu-arch-version: ""
36+
docker-image: "pytorch/manylinuxaarch64-builder:cuda12.6" # ARM-specific image with CUDA
37+
platform-tag: "manylinux2014_aarch64"
2738
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
2839
with:
2940
timeout: 60
30-
runner: ${{ matrix.runs-on }}
41+
runner: ${{ matrix.runner }}
3142
gpu-arch-type: ${{ matrix.gpu-arch-type }}
3243
gpu-arch-version: ${{ matrix.gpu-arch-version }}
44+
docker-image: ${{ matrix.docker-image }}
3345
submodules: recursive
34-
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
46+
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
3547
script: |
3648
source scripts/common-setup.sh
3749
setup_build_environment ${{ matrix.python-version }}
@@ -53,9 +65,8 @@ jobs:
5365
5466
python setup.py bdist_wheel
5567
56-
# hacky until the right distribution wheel can be made...
57-
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
58-
ls -la dist/
68+
# Properly retag wheel with manylinux platform tag
69+
retag_wheel_platform "${{ matrix.platform-tag }}"
5970
6071
# Run tests
6172
install_python_test_dependencies

scripts/common-setup.sh

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,109 @@ run_test_groups() {
227227
fi
228228
set -e
229229
}
230+
231+
# Retag wheels with manylinux platform tags
#
# When building wheels with `setup.py bdist_wheel`, the wheel is tagged with the
# generic platform tag (e.g., "linux_x86_64", "linux_aarch64"). However, PyPI
# requires proper manylinux tags (e.g., "manylinux2014_x86_64") to indicate
# compatibility with the manylinux standard. Simply renaming the .whl file is
# insufficient because the platform tag is also stored in the WHEEL metadata file
# inside the wheel archive.
#
# This function properly retags wheels by:
# 1. Unpacking the wheel archive
# 2. Rewriting only the platform portion of each "Tag:" field in the
#    .dist-info/WHEEL metadata file, preserving the interpreter/ABI tags
#    (e.g. "cp310-cp310") so a compiled wheel is not mis-advertised as
#    pure-Python
# 3. Repacking the wheel with the updated metadata and correct filename
#
# The `wheel pack` command automatically regenerates the RECORD file with updated
# hashes, ensuring wheel integrity. This is similar to how PyTorch does it in their
# manywheel build scripts (see pytorch/.ci/manywheel/build_common.sh).
#
# Usage: retag_wheel_platform <platform_tag> [wheel_dir]
#   platform_tag: Target platform (e.g., "manylinux2014_x86_64", "manylinux2014_aarch64")
#   wheel_dir: Directory containing wheels (defaults to "dist")
#
# Example:
#   retag_wheel_platform "manylinux2014_x86_64"
#   retag_wheel_platform "manylinux2014_aarch64" "build/wheels"
retag_wheel_platform() {
  local platform_tag="${1}"
  local wheel_dir="${2:-dist}"

  if [[ -z "$platform_tag" ]]; then
    echo "Error: platform_tag is required" >&2
    echo "Usage: retag_wheel_platform <platform_tag> [wheel_dir]" >&2
    return 1
  fi

  if [[ ! -d "$wheel_dir" ]]; then
    echo "Error: wheel directory '$wheel_dir' does not exist" >&2
    return 1
  fi

  echo "Retagging wheels in '$wheel_dir' with platform tag: $platform_tag"

  # Install the wheel CLI (provides `wheel unpack` / `wheel pack`) if not present
  pip install -q wheel

  local wheel_count=0
  local whl
  for whl in "$wheel_dir"/*.whl; do
    # Guard against the glob matching nothing (nullglob is not assumed)
    if [[ ! -f "$whl" ]]; then
      continue
    fi

    wheel_count=$((wheel_count + 1))
    echo "  Processing: $(basename "$whl")"

    # `wheel unpack` extracts into "<name>-<version>", NOT the full
    # tag-bearing wheel basename. PEP 427 filenames escape dashes inside
    # the distribution name and version, so the first two dash-separated
    # fields of the basename are exactly "<name>-<version>".
    local base name_ver whl_dir
    base=$(basename "$whl" .whl)
    name_ver=$(printf '%s' "$base" | cut -d- -f1,2)
    whl_dir="$wheel_dir/$name_ver"

    # Unpack the wheel
    wheel unpack "$whl" -d "$wheel_dir"

    if [[ ! -d "$whl_dir" ]]; then
      # Fail loudly instead of silently deleting the original wheel
      echo "Error: expected unpack directory '$whl_dir' not found" >&2
      return 1
    fi

    # Find the WHEEL metadata file inside the unpacked tree
    local wheel_file
    wheel_file=$(find "$whl_dir" -name "WHEEL" -type f -print -quit)

    if [[ ! -f "$wheel_file" ]]; then
      echo "Error: WHEEL metadata file not found in unpacked wheel" >&2
      return 1
    fi

    echo "  Updating WHEEL metadata: $wheel_file"

    # Replace only the trailing platform segment of each Tag line
    # (e.g. "Tag: cp310-cp310-linux_x86_64" -> "Tag: cp310-cp310-manylinux2014_x86_64").
    # The interpreter/ABI prefix is captured and preserved; this handles
    # linux_x86_64, linux_aarch64, and any other linux_* tag uniformly.
    sed -i -E "s/^(Tag: .+-)linux_[A-Za-z0-9_]+$/\1${platform_tag}/" "$wheel_file"

    # Repack the wheel; `wheel pack` regenerates RECORD and derives the
    # new filename from the updated tags.
    echo "  Repacking wheel..."
    wheel pack "$whl_dir" -d "$wheel_dir" >/dev/null

    # Clean up the unpacked tree and the original (mistagged) wheel
    rm -rf -- "$whl_dir"
    rm -f -- "$whl"
    echo "  ✓ Retagged: $(basename "$whl")"
  done

  if [[ $wheel_count -eq 0 ]]; then
    echo "Warning: No wheels found in '$wheel_dir'" >&2
    return 1
  fi

  echo "✓ Successfully retagged $wheel_count wheel(s)"
  echo "Final wheels:"
  ls -lh "$wheel_dir"/*.whl
}
335+

0 commit comments

Comments (0)