
Commit abd085d

allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support (#2029)
Summary: As the title says: `pip install` from a GB200 machine currently fails because we don't publish an ARM (aarch64) build to PyPI. Differential Revision: D88202637
1 parent fb0cf11 commit abd085d
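For context (an editor's sketch, not part of the commit): on an aarch64 host such as a GB200, pip only accepts wheels whose platform tag matches the machine, so an x86_64-only release is simply invisible to the resolver. A quick way to see which tags pip will accept on such a host:

# Sketch: confirm the architecture and the platform tags pip considers
# compatible. If PyPI hosts only *_x86_64 wheels, none of these tags
# match and the install fails with "no matching distribution".
uname -m                                          # expected: aarch64
python -m pip debug --verbose | grep -m5 manylinux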

File tree: 3 files changed (+228, −27 lines)

.github/workflows/publish_release.yml

Lines changed: 20 additions & 9 deletions
@@ -12,26 +12,38 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         # TODO add 3.14 once we figure out py03 issue
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.8-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.8"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.8
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.8-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.8" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -52,9 +64,8 @@ jobs:
         export MONARCH_VERSION="${{ github.event.inputs.version }}"
         python setup.py bdist_wheel
 
-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"
 
         # Run tests
         install_python_test_dependencies
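A note on the aarch64 matrix entry above: `gpu-arch-type: "cpu"` makes the reusable workflow skip its NVIDIA driver install, while the CUDA toolkit comes baked into the manylinuxaarch64-builder image. A hedged sanity check one could run locally (the `/usr/local/cuda` path is the conventional location in pytorch/manylinux* images, an assumption rather than something stated in the diff):

# Sketch: the builder image should carry the CUDA toolkit even though
# no driver is present on a CPU-tagged runner.
docker run --rm pytorch/manylinuxaarch64-builder:cuda12.8 bash -c '
    ls /usr/local/cuda/bin/nvcc && nvcc --version      # toolkit present
    nvidia-smi || echo "no driver, as expected without gpu-arch-type=cuda"
'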

.github/workflows/wheels.yml

Lines changed: 20 additions & 9 deletions
@@ -13,25 +13,37 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.6-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.6
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.6-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.6" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -53,9 +65,8 @@ jobs:
 
         python setup.py bdist_wheel
 
-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"
 
         # Run tests
         install_python_test_dependencies
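The two workflows are otherwise identical; only the CUDA version differs (12.6 here, 12.8 in publish_release.yml). Expanded by hand, the torch-spec each matrix leg feeds to pip is the same command on both architectures (illustrative expansion, not a line from the commit):

# Both the cuda12.6-x86_64 and cuda12.6-aarch64 legs effectively run:
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
# pip then selects the x86_64 or aarch64 nightly wheel from that index
# based on the host's own platform tag.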

scripts/common-setup.sh

Lines changed: 188 additions & 9 deletions
@@ -12,6 +12,14 @@ set -ex
 # Setup conda environment. Defaults to Python 3.10.
 setup_conda_environment() {
     local python_version=${1:-3.10}
+
+    # Check if we're in a manylinux container (no conda)
+    if [[ -d /opt/python ]] && [[ ! -x "$(command -v conda)" ]]; then
+        echo "Detected manylinux environment, using system Python instead of conda..."
+        setup_manylinux_python "${python_version}"
+        return
+    fi
+
     echo "Setting up conda environment with Python ${python_version}..."
     conda create -n venv python="${python_version}" -y
     conda activate venv
@@ -20,12 +28,48 @@ setup_conda_environment() {
     python -m pip install --upgrade pip
 }
 
+# Setup Python in manylinux container (no conda available)
+setup_manylinux_python() {
+    local python_version=${1:-3.10}
+    echo "Setting up manylinux Python ${python_version}..."
+
+    # Map Python version to manylinux cp version
+    case "${python_version}" in
+        3.10) PYTHON_DIR="/opt/python/cp310-cp310" ;;
+        3.11) PYTHON_DIR="/opt/python/cp311-cp311" ;;
+        3.12) PYTHON_DIR="/opt/python/cp312-cp312" ;;
+        3.13) PYTHON_DIR="/opt/python/cp313-cp313" ;;
+        *) echo "Unsupported Python version: ${python_version}"; return 1 ;;
+    esac
+
+    if [[ ! -d "${PYTHON_DIR}" ]]; then
+        echo "ERROR: Python directory ${PYTHON_DIR} not found"
+        return 1
+    fi
+
+    export PATH="${PYTHON_DIR}/bin:${PATH}"
+    export PYTHON_BIN="${PYTHON_DIR}/bin/python"
+
+    echo "Using Python from: ${PYTHON_DIR}"
+    python --version
+    python -m pip install --upgrade pip
+}
+
 # Install system-level dependencies
 install_system_dependencies() {
     echo "Installing system dependencies..."
-    dnf update -y
-    # Protobuf compiler is required for the tracing-perfetto-sdk-schema crate.
-    dnf install clang-devel libunwind libunwind-devel protobuf-compiler -y
+
+    # Check if package manager is available (not in all manylinux containers)
+    if command -v dnf >/dev/null 2>&1; then
+        dnf update -y
+        # Protobuf compiler is required for the tracing-perfetto-sdk-schema crate.
+        dnf install clang-devel libunwind libunwind-devel protobuf-compiler -y
+    elif command -v yum >/dev/null 2>&1; then
+        yum update -y
+        yum install clang-devel libunwind libunwind-devel protobuf-compiler -y
+    else
+        echo "Warning: No package manager (dnf/yum) available, skipping system dependencies"
+    fi
 }
 
 # Install and configure Rust nightly toolchain
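The version-to-directory mapping in setup_manylinux_python relies on the manylinux convention of one /opt/python/cpXY-cpXY directory per CPython build. The explicit case statement is clear, and for reference the same name can be derived from the version string, as in this hedged one-liner (not part of the commit):

# Sketch: derive the manylinux interpreter directory from a version
# string; equivalent to the case statement above for 3.10-3.13.
python_version=3.12
cp_tag="cp${python_version//./}"             # "3.12" -> "cp312"
PYTHON_DIR="/opt/python/${cp_tag}-${cp_tag}"
ls -d /opt/python/cp3*                       # one dir per shipped CPython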
@@ -50,7 +94,15 @@ setup_rust_toolchain() {
 install_python_test_dependencies() {
     echo "Installing test dependencies..."
     pip install -r python/tests/requirements.txt
-    dnf install -y rsync # required for code sync tests
+
+    # Install rsync if package manager is available
+    if command -v dnf >/dev/null 2>&1; then
+        dnf install -y rsync # required for code sync tests
+    elif command -v yum >/dev/null 2>&1; then
+        yum install -y rsync
+    else
+        echo "Warning: No package manager available, skipping rsync install"
+    fi
 }
 
 # Install wheel from artifact directory
@@ -62,9 +114,25 @@ install_wheel_from_artifact() {
 # Setup and install dependencies for Tensor Engine
 setup_tensor_engine() {
     echo "Installing Tensor Engine dependencies..."
-    # Install the fmt library for C++ headers in pytorch.
-    conda install -y -c conda-forge fmt
-    dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+
+    # Install fmt library
+    if command -v conda >/dev/null 2>&1; then
+        # Use conda if available (x86_64 with conda)
+        conda install -y -c conda-forge fmt
+    else
+        # Use pip if conda not available (manylinux/ARM)
+        echo "Conda not available, installing fmt via pip..."
+        pip install libfmt || echo "Warning: libfmt not available via pip, build may need system fmt"
+    fi
+
+    # Install RDMA libraries if package manager is available
+    if command -v dnf >/dev/null 2>&1; then
+        dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+    elif command -v yum >/dev/null 2>&1; then
+        yum install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+    else
+        echo "Warning: No package manager available, skipping RDMA dependencies"
+    fi
 }
 
 # Install PyTorch with C++ development headers (libtorch) for Rust compilation
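Since setup_tensor_engine may silently fall through to warnings, a hedged after-the-fact check like the following (standard tools, not part of the commit; header path is the usual rdma-core layout) can confirm which dependencies actually landed:

# Sketch: verify the ibverbs loader and headers the Rust build expects.
ldconfig -p | grep -E 'libibverbs|libmlx5' || echo "RDMA libs missing"
ls /usr/include/infiniband/verbs.h 2>/dev/null \
    || echo "verbs.h missing (devel packages not installed)"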
@@ -177,8 +245,13 @@ run_test_groups() {
     # (which was used to compile monarch) instead of the system's.
     # TODO: Revisit this to determine if this is the proper/most
     # sustainable/most robust solution.
-    export CONDA_LIBSTDCPP="${CONDA_PREFIX}/lib/libstdc++.so.6"
-    export LD_PRELOAD="${CONDA_LIBSTDCPP}${LD_PRELOAD:+:$LD_PRELOAD}"
+    if [[ -n "${CONDA_PREFIX:-}" ]] && [[ -f "${CONDA_PREFIX}/lib/libstdc++.so.6" ]]; then
+        export CONDA_LIBSTDCPP="${CONDA_PREFIX}/lib/libstdc++.so.6"
+        export LD_PRELOAD="${CONDA_LIBSTDCPP}${LD_PRELOAD:+:$LD_PRELOAD}"
+        echo "Using conda libstdc++ from: ${CONDA_LIBSTDCPP}"
+    else
+        echo "Conda not in use or libstdc++ not found, using system libstdc++"
+    fi
     # Backtraces help with debugging remotely.
     export RUST_BACKTRACE=1
     local FAILED_GROUPS=()
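Why the preload matters: native code compiled against conda's newer libstdc++ fails to import under an older system copy with "version GLIBCXX_x.y.z not found" errors. A quick hedged comparison of the two candidates (the /usr/lib64 path assumes the AlmaLinux layout used by these builders):

# Sketch: list the newest GLIBCXX symbol versions each libstdc++ offers;
# the preloaded copy must provide every version the extension was built
# against.
strings "${CONDA_PREFIX}/lib/libstdc++.so.6" | grep GLIBCXX | tail -n3
strings /usr/lib64/libstdc++.so.6 | grep GLIBCXX | tail -n3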
@@ -227,3 +300,109 @@ run_test_groups() {
     fi
     set -e
 }
+
+# Retag wheels with manylinux platform tags
+#
+# When building wheels with `setup.py bdist_wheel`, the wheel is tagged with the
+# generic platform tag (e.g., "linux_x86_64", "linux_aarch64"). However, PyPI
+# requires proper manylinux tags (e.g., "manylinux2014_x86_64") to indicate
+# compatibility with the manylinux standard. Simply renaming the .whl file is
+# insufficient because the platform tag is also stored in the WHEEL metadata file
+# inside the wheel archive.
+#
+# This function properly retags wheels by:
+# 1. Unpacking the wheel archive
+# 2. Modifying the "Tag:" field in the .dist-info/WHEEL metadata file
+# 3. Repacking the wheel with the updated metadata and correct filename
+#
+# The `wheel pack` command automatically regenerates the RECORD file with updated
+# hashes, ensuring wheel integrity. This is similar to how PyTorch does it in their
+# manywheel build scripts (see pytorch/.ci/manywheel/build_common.sh).
+#
+# Usage: retag_wheel_platform <platform_tag> [wheel_dir]
+#   platform_tag: Target platform (e.g., "manylinux2014_x86_64", "manylinux2014_aarch64")
+#   wheel_dir: Directory containing wheels (defaults to "dist")
+#
+# Example:
+#   retag_wheel_platform "manylinux2014_x86_64"
+#   retag_wheel_platform "manylinux2014_aarch64" "build/wheels"
+retag_wheel_platform() {
+    local platform_tag="${1}"
+    local wheel_dir="${2:-dist}"
+
+    if [[ -z "$platform_tag" ]]; then
+        echo "Error: platform_tag is required"
+        echo "Usage: retag_wheel_platform <platform_tag> [wheel_dir]"
+        return 1
+    fi
+
+    if [[ ! -d "$wheel_dir" ]]; then
+        echo "Error: wheel directory '$wheel_dir' does not exist"
+        return 1
+    fi
+
+    echo "Retagging wheels in '$wheel_dir' with platform tag: $platform_tag"
+
+    # Install wheel tool if not present
+    pip install -q wheel
+
+    local wheel_count=0
+    for whl in "$wheel_dir"/*.whl; do
+        if [[ ! -f "$whl" ]]; then
+            continue
+        fi
+
+        wheel_count=$((wheel_count + 1))
+        echo "  Processing: $(basename "$whl")"
+
+        # Unpack the wheel
+        wheel unpack "$whl" -d "$wheel_dir"
+        local whl_dir=$(find "$wheel_dir" -maxdepth 1 -type d -name "$(basename "$whl" .whl)" -print -quit)
+
+        if [[ -n "$whl_dir" && -d "$whl_dir" ]]; then
+            # Find and modify the WHEEL metadata file
+            local wheel_file=$(find "$whl_dir" -name "WHEEL" -type f)
+
+            if [[ -f "$wheel_file" ]]; then
+                echo "  Updating WHEEL metadata: $wheel_file"
+
+                # Replace platform tag based on target
+                case "$platform_tag" in
+                    manylinux*_x86_64)
+                        sed -i 's/Tag:.*linux_x86_64/Tag: py3-none-'"$platform_tag"'/g' "$wheel_file"
+                        ;;
+                    manylinux*_aarch64)
+                        sed -i 's/Tag:.*linux_aarch64/Tag: py3-none-'"$platform_tag"'/g' "$wheel_file"
+                        ;;
+                    *)
+                        echo "  Warning: Unknown platform tag pattern '$platform_tag', attempting generic replacement"
+                        sed -i 's/Tag: \(.*\)-linux_[^-]*/Tag: \1-'"$platform_tag"'/g' "$wheel_file"
+                        ;;
+                esac
+            else
+                echo "  Warning: WHEEL file not found in unpacked wheel"
+            fi
+
+            # Repack the wheel with new platform tag
+            echo "  Repacking wheel..."
+            wheel pack "$whl_dir" -d "$wheel_dir" >/dev/null
+
+            # Clean up unpacked directory
+            rm -rf "$whl_dir"
+        fi
+
+        # Remove original wheel
+        rm "$whl"
+        echo "  ✓ Retagged: $(basename "$whl")"
+    done
+
+    if [[ $wheel_count -eq 0 ]]; then
+        echo "Warning: No wheels found in '$wheel_dir'"
+        return 1
+    fi
+
+    echo "✓ Successfully retagged $wheel_count wheel(s)"
+    echo "Final wheels:"
+    ls -lh "$wheel_dir"/*.whl
+}
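To see the retag end to end, one can inspect the Tag: field inside the wheel before and after the call (wheel file names and the "before" tag below are illustrative, not taken from a real build):

# Sketch: the metadata retag_wheel_platform rewrites, before and after.
unzip -p dist/monarch-*linux_aarch64.whl '*.dist-info/WHEEL'
#   Tag: cp310-cp310-linux_aarch64         <- generic tag, rejected by PyPI

retag_wheel_platform "manylinux2014_aarch64"

unzip -p dist/monarch-*manylinux2014_aarch64.whl '*.dist-info/WHEEL'
#   Tag: py3-none-manylinux2014_aarch64    <- what the sed above writes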
