
Commit abd085d

allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support (#2029)
Summary: As the title says: `pip install` from a GB200 machine currently fails because we don't publish an ARM (aarch64) build to PyPI. Differential Revision: D88202637
1 parent fb0cf11 commit abd085d
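For context (an editor's sketch, not part of the commit): on an aarch64 host such as a GB200, pip only accepts wheels whose platform tag matches the machine, so an x86_64-only release is simply invisible to the resolver. A quick way to see which tags pip will accept on such a host:

# Sketch: confirm the architecture and the platform tags pip considers
# compatible. If PyPI hosts only *_x86_64 wheels, none of these tags
# match and the install fails with "no matching distribution".
uname -m                                          # expected: aarch64
python -m pip debug --verbose | grep -m5 manylinux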

File tree: 3 files changed (+228, −27 lines)

.github/workflows/publish_release.yml

Lines changed: 20 additions & 9 deletions
@@ -12,26 +12,38 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         # TODO add 3.14 once we figure out py03 issue
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.8-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.8"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.8
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.8-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.8" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -52,9 +64,8 @@ jobs:
         export MONARCH_VERSION="${{ github.event.inputs.version }}"
         python setup.py bdist_wheel
 
-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"
 
         # Run tests
         install_python_test_dependencies
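A note on the aarch64 matrix entry above: `gpu-arch-type: "cpu"` makes the reusable workflow skip its NVIDIA driver install, while the CUDA toolkit comes baked into the manylinuxaarch64-builder image. A hedged sanity check one could run locally (the `/usr/local/cuda` path is the conventional location in pytorch/manylinux* images, an assumption rather than something stated in the diff):

# Sketch: the builder image should carry the CUDA toolkit even though
# no driver is present on a CPU-tagged runner.
docker run --rm pytorch/manylinuxaarch64-builder:cuda12.8 bash -c '
    ls /usr/local/cuda/bin/nvcc && nvcc --version      # toolkit present
    nvidia-smi || echo "no driver, as expected without gpu-arch-type=cuda"
'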

.github/workflows/wheels.yml

Lines changed: 20 additions & 9 deletions
@@ -13,25 +13,37 @@ concurrency:
   cancel-in-progress: true
 jobs:
   build:
-    name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
+    name: ${{ matrix.name }}-py${{ matrix.python-version }}
     strategy:
-      fail-fast: false # Changed to false to see results from all Python versions
+      fail-fast: false
       matrix:
         python-version: ["3.10", "3.11", "3.12", "3.13"]
         include:
-          - name: 4xlarge
-            runs-on: linux.g5.4xlarge.nvidia.gpu
+          # x86_64 CUDA builds
+          - name: cuda12.6-x86_64
+            runner: linux.g5.4xlarge.nvidia.gpu
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
+            docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.6
+            platform-tag: "manylinux2014_x86_64"
+          # aarch64 CUDA builds
+          - name: cuda12.6-aarch64
+            runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
+            gpu-arch-version: ""
+            docker-image: "pytorch/manylinuxaarch64-builder:cuda12.6" # ARM-specific image with CUDA
+            platform-tag: "manylinux2014_aarch64"
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
+      runner: ${{ matrix.runner }}
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
+      upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
@@ -53,9 +65,8 @@ jobs:
 
         python setup.py bdist_wheel
 
-        # hacky until the right distribution wheel can be made...
-        find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
-        ls -la dist/
+        # Properly retag wheel with manylinux platform tag
+        retag_wheel_platform "${{ matrix.platform-tag }}"
 
         # Run tests
         install_python_test_dependencies
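The two workflows are otherwise identical; only the CUDA version differs (12.6 here, 12.8 in publish_release.yml). Expanded by hand, the torch-spec each matrix leg feeds to pip is the same command on both architectures (illustrative expansion, not a line from the commit):

# Both the cuda12.6-x86_64 and cuda12.6-aarch64 legs effectively run:
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
# pip then selects the x86_64 or aarch64 nightly wheel from that index
# based on the host's own platform tag.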

scripts/common-setup.sh

Lines changed: 188 additions & 9 deletions
@@ -12,6 +12,14 @@ set -ex
 # Setup conda environment. Defaults to Python 3.10.
 setup_conda_environment() {
     local python_version=${1:-3.10}
+
+    # Check if we're in a manylinux container (no conda)
+    if [[ -d /opt/python ]] && [[ ! -x "$(command -v conda)" ]]; then
+        echo "Detected manylinux environment, using system Python instead of conda..."
+        setup_manylinux_python "${python_version}"
+        return
+    fi
+
     echo "Setting up conda environment with Python ${python_version}..."
     conda create -n venv python="${python_version}" -y
     conda activate venv
@@ -20,12 +28,48 @@ setup_conda_environment() {
     python -m pip install --upgrade pip
 }
 
+# Setup Python in manylinux container (no conda available)
+setup_manylinux_python() {
+    local python_version=${1:-3.10}
+    echo "Setting up manylinux Python ${python_version}..."
+
+    # Map Python version to manylinux cp version
+    case "${python_version}" in
+        3.10) PYTHON_DIR="/opt/python/cp310-cp310" ;;
+        3.11) PYTHON_DIR="/opt/python/cp311-cp311" ;;
+        3.12) PYTHON_DIR="/opt/python/cp312-cp312" ;;
+        3.13) PYTHON_DIR="/opt/python/cp313-cp313" ;;
+        *) echo "Unsupported Python version: ${python_version}"; return 1 ;;
+    esac
+
+    if [[ ! -d "${PYTHON_DIR}" ]]; then
+        echo "ERROR: Python directory ${PYTHON_DIR} not found"
+        return 1
+    fi
+
+    export PATH="${PYTHON_DIR}/bin:${PATH}"
+    export PYTHON_BIN="${PYTHON_DIR}/bin/python"
+
+    echo "Using Python from: ${PYTHON_DIR}"
+    python --version
+    python -m pip install --upgrade pip
+}
+
 # Install system-level dependencies
 install_system_dependencies() {
     echo "Installing system dependencies..."
-    dnf update -y
-    # Protobuf compiler is required for the tracing-perfetto-sdk-schema crate.
-    dnf install clang-devel libunwind libunwind-devel protobuf-compiler -y
+
+    # Check if package manager is available (not in all manylinux containers)
+    if command -v dnf >/dev/null 2>&1; then
+        dnf update -y
+        # Protobuf compiler is required for the tracing-perfetto-sdk-schema crate.
+        dnf install clang-devel libunwind libunwind-devel protobuf-compiler -y
+    elif command -v yum >/dev/null 2>&1; then
+        yum update -y
+        yum install clang-devel libunwind libunwind-devel protobuf-compiler -y
+    else
+        echo "Warning: No package manager (dnf/yum) available, skipping system dependencies"
+    fi
 }
 
 # Install and configure Rust nightly toolchain
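The version-to-directory mapping in setup_manylinux_python relies on the manylinux convention of one /opt/python/cpXY-cpXY directory per CPython build. The explicit case statement is clear, and for reference the same name can be derived from the version string, as in this hedged one-liner (not part of the commit):

# Sketch: derive the manylinux interpreter directory from a version
# string; equivalent to the case statement above for 3.10-3.13.
python_version=3.12
cp_tag="cp${python_version//./}"             # "3.12" -> "cp312"
PYTHON_DIR="/opt/python/${cp_tag}-${cp_tag}"
ls -d /opt/python/cp3*                       # one dir per shipped CPython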
@@ -50,7 +94,15 @@ setup_rust_toolchain() {
 install_python_test_dependencies() {
     echo "Installing test dependencies..."
     pip install -r python/tests/requirements.txt
-    dnf install -y rsync # required for code sync tests
+
+    # Install rsync if package manager is available
+    if command -v dnf >/dev/null 2>&1; then
+        dnf install -y rsync # required for code sync tests
+    elif command -v yum >/dev/null 2>&1; then
+        yum install -y rsync
+    else
+        echo "Warning: No package manager available, skipping rsync install"
+    fi
 }
 
 # Install wheel from artifact directory
@@ -62,9 +114,25 @@ install_wheel_from_artifact() {
 # Setup and install dependencies for Tensor Engine
 setup_tensor_engine() {
     echo "Installing Tensor Engine dependencies..."
-    # Install the fmt library for C++ headers in pytorch.
-    conda install -y -c conda-forge fmt
-    dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+
+    # Install fmt library
+    if command -v conda >/dev/null 2>&1; then
+        # Use conda if available (x86_64 with conda)
+        conda install -y -c conda-forge fmt
+    else
+        # Use pip if conda not available (manylinux/ARM)
+        echo "Conda not available, installing fmt via pip..."
+        pip install libfmt || echo "Warning: libfmt not available via pip, build may need system fmt"
+    fi
+
+    # Install RDMA libraries if package manager is available
+    if command -v dnf >/dev/null 2>&1; then
+        dnf install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+    elif command -v yum >/dev/null 2>&1; then
+        yum install -y libibverbs rdma-core libmlx5 libibverbs-devel rdma-core-devel
+    else
+        echo "Warning: No package manager available, skipping RDMA dependencies"
+    fi
 }
 
 # Install PyTorch with C++ development headers (libtorch) for Rust compilation
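Since setup_tensor_engine may silently fall through to warnings, a hedged after-the-fact check like the following (standard tools, not part of the commit; header path is the usual rdma-core layout) can confirm which dependencies actually landed:

# Sketch: verify the ibverbs loader and headers the Rust build expects.
ldconfig -p | grep -E 'libibverbs|libmlx5' || echo "RDMA libs missing"
ls /usr/include/infiniband/verbs.h 2>/dev/null \
    || echo "verbs.h missing (devel packages not installed)"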
@@ -177,8 +245,13 @@ run_test_groups() {
     # (which was used to compile monarch) instead of the system's.
     # TODO: Revisit this to determine if this is the proper/most
     # sustainable/most robust solution.
-    export CONDA_LIBSTDCPP="${CONDA_PREFIX}/lib/libstdc++.so.6"
-    export LD_PRELOAD="${CONDA_LIBSTDCPP}${LD_PRELOAD:+:$LD_PRELOAD}"
+    if [[ -n "${CONDA_PREFIX:-}" ]] && [[ -f "${CONDA_PREFIX}/lib/libstdc++.so.6" ]]; then
+        export CONDA_LIBSTDCPP="${CONDA_PREFIX}/lib/libstdc++.so.6"
+        export LD_PRELOAD="${CONDA_LIBSTDCPP}${LD_PRELOAD:+:$LD_PRELOAD}"
+        echo "Using conda libstdc++ from: ${CONDA_LIBSTDCPP}"
+    else
+        echo "Conda not in use or libstdc++ not found, using system libstdc++"
+    fi
     # Backtraces help with debugging remotely.
     export RUST_BACKTRACE=1
     local FAILED_GROUPS=()
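Why the preload matters: native code compiled against conda's newer libstdc++ fails to import under an older system copy with "version GLIBCXX_x.y.z not found" errors. A quick hedged comparison of the two candidates (the /usr/lib64 path assumes the AlmaLinux layout used by these builders):

# Sketch: list the newest GLIBCXX symbol versions each libstdc++ offers;
# the preloaded copy must provide every version the extension was built
# against.
strings "${CONDA_PREFIX}/lib/libstdc++.so.6" | grep GLIBCXX | tail -n3
strings /usr/lib64/libstdc++.so.6 | grep GLIBCXX | tail -n3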
@@ -227,3 +300,109 @@ run_test_groups() {
     fi
     set -e
 }
+
+# Retag wheels with manylinux platform tags
+#
+# When building wheels with `setup.py bdist_wheel`, the wheel is tagged with the
+# generic platform tag (e.g., "linux_x86_64", "linux_aarch64"). However, PyPI
+# requires proper manylinux tags (e.g., "manylinux2014_x86_64") to indicate
+# compatibility with the manylinux standard. Simply renaming the .whl file is
+# insufficient because the platform tag is also stored in the WHEEL metadata file
+# inside the wheel archive.
+#
+# This function properly retags wheels by:
+# 1. Unpacking the wheel archive
+# 2. Modifying the "Tag:" field in the .dist-info/WHEEL metadata file
+# 3. Repacking the wheel with the updated metadata and correct filename
+#
+# The `wheel pack` command automatically regenerates the RECORD file with updated
+# hashes, ensuring wheel integrity. This is similar to how PyTorch does it in their
+# manywheel build scripts (see pytorch/.ci/manywheel/build_common.sh).
+#
+# Usage: retag_wheel_platform <platform_tag> [wheel_dir]
+#   platform_tag: Target platform (e.g., "manylinux2014_x86_64", "manylinux2014_aarch64")
+#   wheel_dir: Directory containing wheels (defaults to "dist")
+#
+# Example:
+#   retag_wheel_platform "manylinux2014_x86_64"
+#   retag_wheel_platform "manylinux2014_aarch64" "build/wheels"
+retag_wheel_platform() {
+    local platform_tag="${1}"
+    local wheel_dir="${2:-dist}"
+
+    if [[ -z "$platform_tag" ]]; then
+        echo "Error: platform_tag is required"
+        echo "Usage: retag_wheel_platform <platform_tag> [wheel_dir]"
+        return 1
+    fi
+
+    if [[ ! -d "$wheel_dir" ]]; then
+        echo "Error: wheel directory '$wheel_dir' does not exist"
+        return 1
+    fi
+
+    echo "Retagging wheels in '$wheel_dir' with platform tag: $platform_tag"
+
+    # Install wheel tool if not present
+    pip install -q wheel
+
+    local wheel_count=0
+    for whl in "$wheel_dir"/*.whl; do
+        if [[ ! -f "$whl" ]]; then
+            continue
+        fi
+
+        wheel_count=$((wheel_count + 1))
+        echo "  Processing: $(basename "$whl")"
+
+        # Unpack the wheel
+        wheel unpack "$whl" -d "$wheel_dir"
+        local whl_dir=$(find "$wheel_dir" -maxdepth 1 -type d -name "$(basename "$whl" .whl)" -print -quit)
+
+        if [[ -n "$whl_dir" && -d "$whl_dir" ]]; then
+            # Find and modify the WHEEL metadata file
+            local wheel_file=$(find "$whl_dir" -name "WHEEL" -type f)
+
+            if [[ -f "$wheel_file" ]]; then
+                echo "  Updating WHEEL metadata: $wheel_file"
+
+                # Replace platform tag based on target
+                case "$platform_tag" in
+                    manylinux*_x86_64)
+                        sed -i 's/Tag:.*linux_x86_64/Tag: py3-none-'"$platform_tag"'/g' "$wheel_file"
+                        ;;
+                    manylinux*_aarch64)
+                        sed -i 's/Tag:.*linux_aarch64/Tag: py3-none-'"$platform_tag"'/g' "$wheel_file"
+                        ;;
+                    *)
+                        echo "  Warning: Unknown platform tag pattern '$platform_tag', attempting generic replacement"
+                        sed -i 's/Tag: \(.*\)-linux_[^-]*/Tag: \1-'"$platform_tag"'/g' "$wheel_file"
+                        ;;
+                esac
+            else
+                echo "  Warning: WHEEL file not found in unpacked wheel"
+            fi
+
+            # Repack the wheel with new platform tag
+            echo "  Repacking wheel..."
+            wheel pack "$whl_dir" -d "$wheel_dir" >/dev/null
+
+            # Clean up unpacked directory
+            rm -rf "$whl_dir"
+        fi
+
+        # Remove original wheel
+        rm "$whl"
+        echo "  ✓ Retagged: $(basename "$whl")"
+    done
+
+    if [[ $wheel_count -eq 0 ]]; then
+        echo "Warning: No wheels found in '$wheel_dir'"
+        return 1
+    fi
+
+    echo "✓ Successfully retagged $wheel_count wheel(s)"
+    echo "Final wheels:"
+    ls -lh "$wheel_dir"/*.whl
+}
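To see the retag end to end, one can inspect the Tag: field inside the wheel before and after the call (wheel file names and the "before" tag below are illustrative, not taken from a real build):

# Sketch: the metadata retag_wheel_platform rewrites, before and after.
unzip -p dist/monarch-*linux_aarch64.whl '*.dist-info/WHEEL'
#   Tag: cp310-cp310-linux_aarch64         <- generic tag, rejected by PyPI

retag_wheel_platform "manylinux2014_aarch64"

unzip -p dist/monarch-*manylinux2014_aarch64.whl '*.dist-info/WHEEL'
#   Tag: py3-none-manylinux2014_aarch64    <- what the sed above writes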
