Skip to content

Commit 7b647e5

Browse files
allenwang28 authored and facebook-github-bot committed
Add ARM workflow for GB200 support
Summary: As title - currently pip install from a GB200 machine won't work as we don't publish an ARM build to PyPI. Differential Revision: D88202637
1 parent 31a24ad commit 7b647e5

File tree

3 files changed

+144
-16
lines changed

3 files changed

+144
-16
lines changed

.github/workflows/publish_release.yml

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,38 @@ concurrency:
1212
cancel-in-progress: true
1313
jobs:
1414
build:
15-
name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
15+
name: ${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}-py${{ matrix.python-version }}-${{ matrix.arch }}
1616
strategy:
17-
fail-fast: false # Changed to false to see results from all Python versions
17+
fail-fast: false
1818
matrix:
1919
# TODO add 3.14 once we figure out py03 issue
2020
python-version: ["3.10", "3.11", "3.12", "3.13"]
21+
arch: ["x86_64", "aarch64"]
2122
include:
22-
- name: 4xlarge
23+
# x86_64 CUDA builds
24+
- arch: x86_64
2325
runs-on: linux.g5.4xlarge.nvidia.gpu
2426
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
2527
gpu-arch-type: "cuda"
2628
gpu-arch-version: "12.8"
29+
platform-tag: "manylinux2014_x86_64"
30+
cuda-lib-dir: "/usr/lib64"
31+
# aarch64 CUDA builds
32+
- arch: aarch64
33+
runs-on: linux.arm64.2xlarge
34+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu128'
35+
gpu-arch-type: "cuda-aarch64"
36+
gpu-arch-version: "12.8"
37+
platform-tag: "manylinux2014_aarch64"
38+
cuda-lib-dir: "/usr/lib64"
2739
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
2840
with:
2941
timeout: 60
3042
runner: ${{ matrix.runs-on }}
3143
gpu-arch-type: ${{ matrix.gpu-arch-type }}
3244
gpu-arch-version: ${{ matrix.gpu-arch-version }}
3345
submodules: recursive
34-
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
46+
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}-${{ matrix.arch }}
3547
script: |
3648
source scripts/common-setup.sh
3749
setup_build_environment ${{ matrix.python-version }}
@@ -48,13 +60,12 @@ jobs:
4860
4961
# Build wheel
5062
export MONARCH_PACKAGE_NAME="torchmonarch"
51-
export CUDA_LIB_DIR=/usr/lib64
63+
export CUDA_LIB_DIR=${{ matrix.cuda-lib-dir }}
5264
export MONARCH_VERSION="${{ github.event.inputs.version }}"
5365
python setup.py bdist_wheel
5466
55-
# hacky until the right distribution wheel can be made...
56-
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
57-
ls -la dist/
67+
# Properly retag wheel with manylinux platform tag
68+
retag_wheel_platform "${{ matrix.platform-tag }}"
5869
5970
# Run tests
6071
install_python_test_dependencies

.github/workflows/wheels.yml

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,37 @@ concurrency:
1313
cancel-in-progress: true
1414
jobs:
1515
build:
16-
name: cuda12.6-py${{ matrix.python-version }}-${{ matrix.name }}
16+
name: ${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}-py${{ matrix.python-version }}-${{ matrix.arch }}
1717
strategy:
18-
fail-fast: false # Changed to false to see results from all Python versions
18+
fail-fast: false
1919
matrix:
2020
python-version: ["3.10", "3.11", "3.12", "3.13"]
21+
arch: ["x86_64", "aarch64"]
2122
include:
22-
- name: 4xlarge
23+
# x86_64 CUDA builds
24+
- arch: x86_64
2325
runs-on: linux.g5.4xlarge.nvidia.gpu
2426
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
2527
gpu-arch-type: "cuda"
2628
gpu-arch-version: "12.6"
29+
platform-tag: "manylinux2014_x86_64"
30+
cuda-lib-dir: "/usr/lib64"
31+
# aarch64 CUDA builds
32+
- arch: aarch64
33+
runs-on: linux.arm64.2xlarge
34+
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
35+
gpu-arch-type: "cuda-aarch64"
36+
gpu-arch-version: "12.6"
37+
platform-tag: "manylinux2014_aarch64"
38+
cuda-lib-dir: "/usr/lib64"
2739
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
2840
with:
2941
timeout: 60
3042
runner: ${{ matrix.runs-on }}
3143
gpu-arch-type: ${{ matrix.gpu-arch-type }}
3244
gpu-arch-version: ${{ matrix.gpu-arch-version }}
3345
submodules: recursive
34-
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}
46+
upload-artifact: monarch-${{ matrix.python-version }}-${{ matrix.gpu-arch-type }}${{ matrix.gpu-arch-version }}-${{ matrix.arch }}
3547
script: |
3648
source scripts/common-setup.sh
3749
setup_build_environment ${{ matrix.python-version }}
@@ -49,13 +61,12 @@ jobs:
4961
# Build wheel
5062
export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
5163
export MONARCH_VERSION=$(date +'%Y.%m.%d')
52-
export CUDA_LIB_DIR=/usr/lib64
64+
export CUDA_LIB_DIR=${{ matrix.cuda-lib-dir }}
5365
5466
python setup.py bdist_wheel
5567
56-
# hacky until the right distribution wheel can be made...
57-
find dist -name "*linux_x86_64.whl" -type f -exec bash -c 'mv "$1" "${1/linux_x86_64.whl/manylinux2014_x86_64.whl}"' _ {} \;
58-
ls -la dist/
68+
# Properly retag wheel with manylinux platform tag
69+
retag_wheel_platform "${{ matrix.platform-tag }}"
5970
6071
# Run tests
6172
install_python_test_dependencies

scripts/common-setup.sh

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,109 @@ run_test_groups() {
227227
fi
228228
set -e
229229
}
230+
231+
# Retag wheels with manylinux platform tags
#
# When building wheels with `setup.py bdist_wheel`, the wheel is tagged with the
# generic platform tag (e.g., "linux_x86_64", "linux_aarch64"). However, PyPI
# requires proper manylinux tags (e.g., "manylinux2014_x86_64") to indicate
# compatibility with the manylinux standard. Simply renaming the .whl file is
# insufficient because the platform tag is also stored in the WHEEL metadata file
# inside the wheel archive.
#
# This function retags every wheel in a directory by:
#   1. Unpacking the wheel archive into a scratch directory
#   2. Rewriting only the platform part of each "Tag:" line in the
#      .dist-info/WHEEL metadata file (the interpreter and ABI tags, e.g.
#      "cp310-cp310", are preserved)
#   3. Repacking the wheel, which yields the updated metadata and filename
#   4. Replacing the original wheel only after the repack succeeded
#
# The `wheel pack` command regenerates the RECORD file with updated hashes,
# ensuring wheel integrity. This is similar to how PyTorch does it in their
# manywheel build scripts (see pytorch/.ci/manywheel/build_common.sh).
#
# Usage: retag_wheel_platform <platform_tag> [wheel_dir]
#   platform_tag: Target platform (e.g., "manylinux2014_x86_64", "manylinux2014_aarch64")
#   wheel_dir:    Directory containing wheels (defaults to "dist")
#
# Outputs: progress on stdout, errors and warnings on stderr.
# Returns: 0 on success; 1 on a missing argument, a missing directory, no
#          wheels found, or any failure while retagging (the original wheel is
#          left in place when its retag fails).
#
# Example:
#   retag_wheel_platform "manylinux2014_x86_64"
#   retag_wheel_platform "manylinux2014_aarch64" "build/wheels"
retag_wheel_platform() {
    local platform_tag="${1:-}"
    local wheel_dir="${2:-dist}"

    if [[ -z "$platform_tag" ]]; then
        echo "Error: platform_tag is required" >&2
        echo "Usage: retag_wheel_platform <platform_tag> [wheel_dir]" >&2
        return 1
    fi

    if [[ ! -d "$wheel_dir" ]]; then
        echo "Error: wheel directory '$wheel_dir' does not exist" >&2
        return 1
    fi

    echo "Retagging wheels in '$wheel_dir' with platform tag: $platform_tag"

    # Regex (ERE) for the generic platform tag that bdist_wheel wrote and that
    # must be replaced by the requested manylinux tag.
    local source_platform
    case "$platform_tag" in
        manylinux*_x86_64)
            source_platform='linux_x86_64'
            ;;
        manylinux*_aarch64)
            source_platform='linux_aarch64'
            ;;
        *)
            echo "    Warning: Unknown platform tag pattern '$platform_tag', attempting generic replacement" >&2
            source_platform='linux_[^-]*'
            ;;
    esac

    # Snapshot the wheels up front so files created while retagging are not
    # picked up by the loop below.
    local -a wheels=()
    local candidate
    for candidate in "$wheel_dir"/*.whl; do
        if [[ -f "$candidate" ]]; then
            wheels+=("$candidate")
        fi
    done

    if (( ${#wheels[@]} == 0 )); then
        echo "Warning: No wheels found in '$wheel_dir'" >&2
        return 1
    fi

    # Install wheel tool if not present
    if ! command -v wheel >/dev/null 2>&1; then
        pip install -q wheel || return 1
    fi

    local whl work_dir status
    for whl in "${wheels[@]}"; do
        echo "  Processing: ${whl##*/}"

        work_dir=$(mktemp -d) || return 1
        status=0
        _retag_single_wheel "$whl" "$wheel_dir" "$source_platform" \
            "$platform_tag" "$work_dir" || status=$?
        # Always drop the scratch directory, whether or not the retag worked.
        rm -rf -- "${work_dir:?}"

        if (( status != 0 )); then
            echo "Error: failed to retag '${whl##*/}'" >&2
            return 1
        fi
        echo "  ✓ Retagged: ${whl##*/}"
    done

    echo "✓ Successfully retagged ${#wheels[@]} wheel(s)"
    echo "Final wheels:"
    ls -lh "$wheel_dir"/*.whl
}

# Helper for retag_wheel_platform: retag a single wheel.
#
# Arguments:
#   $1 - path to the wheel to retag
#   $2 - directory that receives the retagged wheel
#   $3 - ERE matching the platform tag to replace (e.g. "linux_x86_64")
#   $4 - replacement platform tag
#   $5 - existing scratch directory (the caller creates and removes it)
# Returns: 0 on success, 1 on failure. The original wheel is removed only after
#          the retagged wheel has been moved into place.
_retag_single_wheel() {
    local whl="$1"
    local wheel_dir="$2"
    local source_platform="$3"
    local platform_tag="$4"
    local work_dir="$5"
    local -a unpacked_dirs=() metadata_files=() packed=()
    local entry

    mkdir -p "$work_dir/unpacked" "$work_dir/packed" || return 1

    # `wheel unpack` names its output directory after the distribution name and
    # version rather than the wheel filename, so unpack into an empty scratch
    # directory and use the single directory that appears there.
    wheel unpack "$whl" -d "$work_dir/unpacked" || return 1
    for entry in "$work_dir/unpacked"/*/; do
        if [[ -d "$entry" ]]; then
            unpacked_dirs+=("${entry%/}")
        fi
    done
    if (( ${#unpacked_dirs[@]} != 1 )); then
        echo "    Error: could not locate unpacked contents of '${whl##*/}'" >&2
        return 1
    fi

    # Only the top-level .dist-info/WHEEL is the wheel's own metadata.
    for entry in "${unpacked_dirs[0]}"/*.dist-info/WHEEL; do
        if [[ -f "$entry" ]]; then
            metadata_files+=("$entry")
        fi
    done
    if (( ${#metadata_files[@]} != 1 )); then
        echo "    Warning: WHEEL file not found in unpacked wheel" >&2
        return 1
    fi

    echo "    Updating WHEEL metadata: ${metadata_files[0]}"
    # Swap only the trailing platform component; the captured prefix keeps the
    # interpreter and ABI tags intact.
    sed -i -E 's/^(Tag: .*-)'"$source_platform"'/\1'"$platform_tag"'/' \
        "${metadata_files[0]}" || return 1

    # Repack the wheel with new platform tag
    echo "    Repacking wheel..."
    wheel pack "${unpacked_dirs[0]}" -d "$work_dir/packed" >/dev/null || return 1
    for entry in "$work_dir/packed"/*.whl; do
        if [[ -f "$entry" ]]; then
            packed+=("$entry")
        fi
    done
    if (( ${#packed[@]} != 1 )); then
        echo "    Error: repacking '${whl##*/}' did not produce a wheel" >&2
        return 1
    fi

    # Move the new wheel into place first, then drop the original. If nothing
    # was retagged the names are equal and the move already replaced the file.
    mv -f -- "${packed[0]}" "$wheel_dir/${packed[0]##*/}" || return 1
    if [[ "${whl##*/}" != "${packed[0]##*/}" ]]; then
        rm -f -- "$whl" || return 1
    fi
}
335+

0 commit comments

Comments
 (0)