Skip to content

Commit 3dad632

Browse files
allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support (#2029)
Summary: As title - currently pip install from a GB200 machine won't work as we don't publish an ARM build to PyPI. Differential Revision: D88202637
1 parent d274c2a commit 3dad632

File tree

3 files changed

+146
-18
lines changed

3 files changed

+146
-18
lines changed

.github/workflows/publish_release.yml

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,38 @@ concurrency:
1212
cancel-in-progress: true
1313
jobs:
1414
build:
15-
name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
15+
name: ${{ matrix.name }}-py${{ matrix.python-version }}
1616
strategy:
17-
fail-fast: false # Changed to false to see results from all Python versions
17+
fail-fast: false
1818
matrix:
1919
# TODO add 3.14 once we figure out py03 issue
2020
python-version: ["3.10", "3.11", "3.12", "3.13"]
2121
include:
22-
- name: 4xlarge
23-
runs-on: linux.g5.4xlarge.nvidia.gpu
22+
# x86_64 CUDA builds
23+
- name: cuda12.8-x86_64
24+
runner: linux.g5.4xlarge.nvidia.gpu
2425
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
2526
gpu-arch-type: "cuda"
2627
gpu-arch-version: "12.8"
28+
docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.8
29+
platform-tag: "manylinux2014_x86_64"
30+
# aarch64 CUDA builds
31+
- name: cuda12.8-aarch64
32+
runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
33+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
34+
gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
35+
gpu-arch-version: ""
36+
docker-image: "pytorch/manylinuxaarch64-builder:cuda12.8" # ARM-specific image with CUDA
37+
platform-tag: "manylinux2014_aarch64"
2738
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
2839
with:
2940
timeout: 60
30-
runner: ${{ matrix.runs-on }}
41+
runner: ${{ matrix.runner }}
3142
gpu-arch-type: ${{ matrix.gpu-arch-type }}
3243
gpu-arch-version: ${{ matrix.gpu-arch-version }}
44+
docker-image: ${{ matrix.docker-image }}
3345
submodules: recursive
34-
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
46+
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
3547
script: |
3648
source scripts/common-setup.sh
3749
setup_build_environment ${{ matrix.python-version }}
@@ -52,9 +64,8 @@ jobs:
5264
export MONARCH_VERSION="${{ github.event.inputs.version }}"
5365
python setup.py bdist_wheel
5466
55-
# hacky until the right distribution wheel can be made...
56-
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
57-
ls -la dist/
67+
# Properly retag wheel with manylinux platform tag
68+
retag_wheel_platform "${{ matrix.platform-tag }}"
5869
5970
# Run tests
6071
install_python_test_dependencies

.github/workflows/wheels.yml

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,37 @@ concurrency:
1313
cancel-in-progress: true
1414
jobs:
1515
build:
16-
name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
16+
name: ${{ matrix.name }}-py${{ matrix.python-version }}
1717
strategy:
18-
fail-fast: false # Changed to false to see results from all Python versions
18+
fail-fast: false
1919
matrix:
2020
python-version: ["3.10", "3.11", "3.12", "3.13"]
2121
include:
22-
- name: 4xlarge
23-
runs-on: linux.g5.4xlarge.nvidia.gpu
22+
# x86_64 CUDA builds
23+
- name: cuda12.6-x86_64
24+
runner: linux.g5.4xlarge.nvidia.gpu
2425
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
2526
gpu-arch-type: "cuda"
2627
gpu-arch-version: "12.6"
28+
docker-image: "pytorch/almalinux-builder" # Uses default, becomes pytorch/almalinux-builder:cuda12.6
29+
platform-tag: "manylinux2014_x86_64"
30+
# aarch64 CUDA builds
31+
- name: cuda12.6-aarch64
32+
runner: linux.arm64.r7g.12xlarge.memory # GPU-enabled ARM runner like PyTorch uses
33+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
34+
gpu-arch-type: "cpu" # Use "cpu" to skip nvidia driver install, CUDA libs are in Docker image
35+
gpu-arch-version: ""
36+
docker-image: "pytorch/manylinuxaarch64-builder:cuda12.6" # ARM-specific image with CUDA
37+
platform-tag: "manylinux2014_aarch64"
2738
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
2839
with:
2940
timeout: 60
30-
runner: ${{ matrix.runs-on }}
41+
runner: ${{ matrix.runner }}
3142
gpu-arch-type: ${{ matrix.gpu-arch-type }}
3243
gpu-arch-version: ${{ matrix.gpu-arch-version }}
44+
docker-image: ${{ matrix.docker-image }}
3345
submodules: recursive
34-
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
46+
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.name }}
3547
script: |
3648
source scripts/common-setup.sh
3749
setup_build_environment ${{ matrix.python-version }}
@@ -53,9 +65,8 @@ jobs:
5365
5466
python setup.py bdist_wheel
5567
56-
# hacky until the right distribution wheel can be made...
57-
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
58-
ls -la dist/
68+
# Properly retag wheel with manylinux platform tag
69+
retag_wheel_platform "${{ matrix.platform-tag }}"
5970
6071
# Run tests
6172
install_python_test_dependencies

scripts/common-setup.sh

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,109 @@ run_test_groups() {
227227
fi
228228
set -e
229229
}
230+
231+
# Retag wheels with manylinux platform tags
#
# When building wheels with `setup.py bdist_wheel`, the wheel is tagged with the
# generic platform tag (e.g., "linux_x86_64", "linux_aarch64"). However, PyPI
# requires proper manylinux tags (e.g., "manylinux2014_x86_64") to indicate
# compatibility with the manylinux standard. Simply renaming the .whl file is
# insufficient because the platform tag is also stored in the WHEEL metadata file
# inside the wheel archive.
#
# This function properly retags wheels by:
# 1. Unpacking the wheel archive
# 2. Rewriting only the platform portion of each "Tag:" field in the
#    .dist-info/WHEEL metadata file, preserving the interpreter/ABI tags
#    (e.g. "cp310-cp310") so a compiled wheel is not mis-advertised as
#    pure-Python
# 3. Repacking the wheel with the updated metadata and correct filename
#
# The `wheel pack` command automatically regenerates the RECORD file with updated
# hashes, ensuring wheel integrity. This is similar to how PyTorch does it in their
# manywheel build scripts (see pytorch/.ci/manywheel/build_common.sh).
#
# Usage: retag_wheel_platform <platform_tag> [wheel_dir]
#   platform_tag: Target platform (e.g., "manylinux2014_x86_64", "manylinux2014_aarch64")
#   wheel_dir: Directory containing wheels (defaults to "dist")
#
# Example:
#   retag_wheel_platform "manylinux2014_x86_64"
#   retag_wheel_platform "manylinux2014_aarch64" "build/wheels"
retag_wheel_platform() {
  local platform_tag="${1}"
  local wheel_dir="${2:-dist}"

  if [[ -z "$platform_tag" ]]; then
    echo "Error: platform_tag is required" >&2
    echo "Usage: retag_wheel_platform <platform_tag> [wheel_dir]" >&2
    return 1
  fi

  if [[ ! -d "$wheel_dir" ]]; then
    echo "Error: wheel directory '$wheel_dir' does not exist" >&2
    return 1
  fi

  echo "Retagging wheels in '$wheel_dir' with platform tag: $platform_tag"

  # Install the wheel CLI (provides `wheel unpack` / `wheel pack`) if not present
  pip install -q wheel

  local wheel_count=0
  local whl
  for whl in "$wheel_dir"/*.whl; do
    # Guard against the glob matching nothing (nullglob is not assumed)
    if [[ ! -f "$whl" ]]; then
      continue
    fi

    wheel_count=$((wheel_count + 1))
    echo "  Processing: $(basename "$whl")"

    # `wheel unpack` extracts into "<name>-<version>", NOT the full
    # tag-bearing wheel basename. PEP 427 filenames escape dashes inside
    # the distribution name and version, so the first two dash-separated
    # fields of the basename are exactly "<name>-<version>".
    local base name_ver whl_dir
    base=$(basename "$whl" .whl)
    name_ver=$(printf '%s' "$base" | cut -d- -f1,2)
    whl_dir="$wheel_dir/$name_ver"

    # Unpack the wheel
    wheel unpack "$whl" -d "$wheel_dir"

    if [[ ! -d "$whl_dir" ]]; then
      # Fail loudly instead of silently deleting the original wheel
      echo "Error: expected unpack directory '$whl_dir' not found" >&2
      return 1
    fi

    # Find the WHEEL metadata file inside the unpacked tree
    local wheel_file
    wheel_file=$(find "$whl_dir" -name "WHEEL" -type f -print -quit)

    if [[ ! -f "$wheel_file" ]]; then
      echo "Error: WHEEL metadata file not found in unpacked wheel" >&2
      return 1
    fi

    echo "  Updating WHEEL metadata: $wheel_file"

    # Replace only the trailing platform segment of each Tag line
    # (e.g. "Tag: cp310-cp310-linux_x86_64" -> "Tag: cp310-cp310-manylinux2014_x86_64").
    # The interpreter/ABI prefix is captured and preserved; this handles
    # linux_x86_64, linux_aarch64, and any other linux_* tag uniformly.
    sed -i -E "s/^(Tag: .+-)linux_[A-Za-z0-9_]+$/\1${platform_tag}/" "$wheel_file"

    # Repack the wheel; `wheel pack` regenerates RECORD and derives the
    # new filename from the updated tags.
    echo "  Repacking wheel..."
    wheel pack "$whl_dir" -d "$wheel_dir" >/dev/null

    # Clean up the unpacked tree and the original (mistagged) wheel
    rm -rf -- "$whl_dir"
    rm -f -- "$whl"
    echo "  ✓ Retagged: $(basename "$whl")"
  done

  if [[ $wheel_count -eq 0 ]]; then
    echo "Warning: No wheels found in '$wheel_dir'" >&2
    return 1
  fi

  echo "✓ Successfully retagged $wheel_count wheel(s)"
  echo "Final wheels:"
  ls -lh "$wheel_dir"/*.whl
}
335+

0 commit comments

Comments (0)