Skip to content

Commit fc29a30

Browse files
authored
fix(ci,publish): harden publish flow and cache nemoclaw wheel builds (NVIDIA#55)
1 parent 2f8645d commit fc29a30

File tree

14 files changed

+465
-190
lines changed

14 files changed

+465
-190
lines changed

.github/workflows/publish.yml

Lines changed: 135 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ permissions:
99
contents: read
1010
packages: write
1111

12+
defaults:
13+
run:
14+
shell: bash
15+
1216
jobs:
1317
# ---------------------------------------------------------------------------
1418
# Build container images to GHCR (same as E2E pipeline)
@@ -28,21 +32,12 @@ jobs:
2832
with:
2933
component: cluster
3034

31-
# ---------------------------------------------------------------------------
32-
# Run E2E tests against the built images
33-
# ---------------------------------------------------------------------------
34-
e2e:
35-
needs: [build-server, build-sandbox, build-cluster]
36-
uses: ./.github/workflows/e2e-test.yml
37-
with:
38-
image-tag: ${{ github.sha }}
39-
4035
# ---------------------------------------------------------------------------
4136
# Publish multi-arch container images to ECR
4237
# ---------------------------------------------------------------------------
4338
publish-containers:
4439
name: Publish Containers
45-
needs: [e2e]
40+
needs: [build-server, build-sandbox, build-cluster]
4641
runs-on: build-amd64
4742
timeout-minutes: 120
4843
container:
@@ -60,21 +55,6 @@ jobs:
6055
AWS_DEFAULT_REGION: us-west-2
6156
steps:
6257
- uses: actions/checkout@v4
63-
with:
64-
fetch-depth: 0 # Full history needed for setuptools_scm version
65-
66-
- name: Install Python dependencies
67-
run: uv sync --frozen
68-
69-
- name: Compute version
70-
id: version
71-
run: |
72-
VERSION_DOCKER=$(uv run python build/scripts/release.py get-version --docker)
73-
echo "docker=$VERSION_DOCKER" >> "$GITHUB_OUTPUT"
74-
echo "Docker image version: $VERSION_DOCKER"
75-
76-
- name: Set version in source files
77-
run: mise run --no-prepare version:set
7858

7959
- name: Log in to GHCR
8060
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
@@ -90,26 +70,148 @@ jobs:
9070
DOCKER_BUILDER: navigator
9171
IMAGE_TAG: dev
9272
TAG_LATEST: "true"
93-
EXTRA_DOCKER_TAGS: ${{ steps.version.outputs.docker }}
9473
run: mise run --no-prepare docker:publish:cluster:multiarch
9574

9675
# ---------------------------------------------------------------------------
97-
# Publish Python packages (placeholder)
76+
# Build Python wheels and stage them in S3
9877
# ---------------------------------------------------------------------------
99-
publish-python:
100-
name: Publish Python
101-
needs: [e2e]
78+
build-python-wheels:
79+
name: Stage Python Wheels
80+
needs: [build-server, build-sandbox, build-cluster]
10281
runs-on: build-amd64
103-
if: false # TODO: Enable when Python packaging is ready
82+
timeout-minutes: 120
83+
outputs:
84+
wheel_version: ${{ steps.version.outputs.wheel_version }}
85+
s3_prefix: ${{ steps.upload.outputs.s3_prefix }}
10486
container:
10587
image: ghcr.io/nvidia/nv-agent-env/ci:latest
10688
credentials:
10789
username: ${{ github.actor }}
10890
password: ${{ secrets.GITHUB_TOKEN }}
91+
options: --privileged
92+
volumes:
93+
- /var/run/docker.sock:/var/run/docker.sock
94+
env:
95+
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
96+
NAV_PYPI_S3_BUCKET: navigator-pypi-artifacts
97+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
98+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
99+
AWS_DEFAULT_REGION: us-west-2
109100
steps:
110101
- uses: actions/checkout@v4
111102
with:
112103
fetch-depth: 0
113104

114-
- name: Placeholder
115-
run: echo "Python packaging not yet implemented"
105+
- name: Log in to GHCR
106+
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
107+
108+
- name: Set up Docker Buildx
109+
uses: ./.github/actions/setup-buildx
110+
111+
- name: Mark workspace safe for git
112+
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
113+
114+
- name: Fetch tags
115+
run: git fetch --tags --force
116+
117+
- name: Compute Python version
118+
id: version
119+
run: |
120+
set -euo pipefail
121+
WHEEL_VERSION=$(uv run python build/scripts/release.py get-version --python)
122+
echo "wheel_version=${WHEEL_VERSION}" >> "$GITHUB_OUTPUT"
123+
124+
- name: Build Python wheels
125+
run: |
126+
set -euo pipefail
127+
WHEEL_VERSION="${{ steps.version.outputs.wheel_version }}"
128+
CARGO_VERSION=$(uv run python build/scripts/release.py get-version --cargo)
129+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:multiarch
130+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
131+
ls -la target/wheels/*.whl
132+
133+
- name: Upload wheels to S3
134+
id: upload
135+
run: |
136+
set -euo pipefail
137+
WHEEL_VERSION="${{ steps.version.outputs.wheel_version }}"
138+
S3_PREFIX="nemoclaw/${WHEEL_VERSION}"
139+
aws s3 cp target/wheels/ "s3://${NAV_PYPI_S3_BUCKET}/${S3_PREFIX}/" --recursive --exclude "*" --include "*.whl"
140+
aws s3 ls "s3://${NAV_PYPI_S3_BUCKET}/${S3_PREFIX}/"
141+
echo "s3_prefix=${S3_PREFIX}" >> "$GITHUB_OUTPUT"
142+
143+
# ---------------------------------------------------------------------------
144+
# Publish Python wheels to Artifactory from S3 staging
145+
# ---------------------------------------------------------------------------
146+
publish-python:
147+
name: Publish Python
148+
needs: [build-python-wheels]
149+
runs-on: [self-hosted, nv]
150+
timeout-minutes: 10
151+
env:
152+
MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
153+
NAV_PYPI_S3_BUCKET: navigator-pypi-artifacts
154+
NAV_PYPI_REPOSITORY_URL: https://urm.nvidia.com/artifactory/api/pypi/nv-shared-pypi-local
155+
NAV_PYPI_USERNAME: ${{ secrets.NAV_PYPI_USERNAME }}
156+
NAV_PYPI_PASSWORD: ${{ secrets.NAV_PYPI_PASSWORD }}
157+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
158+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
159+
AWS_DEFAULT_REGION: us-west-2
160+
steps:
161+
- uses: actions/checkout@v4
162+
with:
163+
fetch-depth: 0
164+
165+
- uses: actions/setup-python@v5
166+
with:
167+
python-version: "3.12"
168+
169+
- name: Install publish dependencies
170+
run: |
171+
set -euo pipefail
172+
python -m pip install --upgrade pip uv
173+
174+
if ! command -v aws >/dev/null 2>&1; then
175+
ARCH="$(uname -m)"
176+
case "$ARCH" in
177+
x86_64|amd64) AWSCLI_ARCH="x86_64" ;;
178+
aarch64|arm64) AWSCLI_ARCH="aarch64" ;;
179+
*)
180+
echo "Unsupported architecture for AWS CLI installer: $ARCH" >&2
181+
exit 1
182+
;;
183+
esac
184+
185+
rm -rf aws awscliv2.zip
186+
curl --fail --silent --show-error --location \
187+
"https://awscli.amazonaws.com/awscli-exe-linux-${AWSCLI_ARCH}.zip" \
188+
--output awscliv2.zip
189+
unzip -q awscliv2.zip
190+
./aws/install --install-dir "$HOME/.local/aws-cli" --bin-dir "$HOME/.local/bin" --update
191+
echo "$HOME/.local/bin" >> "$GITHUB_PATH"
192+
export PATH="$HOME/.local/bin:$PATH"
193+
fi
194+
195+
aws --version
196+
uv --version
197+
198+
- name: List and download versioned wheels from S3
199+
run: |
200+
set -euo pipefail
201+
WHEEL_VERSION="${{ needs.build-python-wheels.outputs.wheel_version }}"
202+
S3_PREFIX="${{ needs.build-python-wheels.outputs.s3_prefix }}"
203+
OBJECT_COUNT=$(aws s3api list-objects-v2 --bucket "$NAV_PYPI_S3_BUCKET" --prefix "${S3_PREFIX}/" --query "length(Contents)" --output text)
204+
if [ "$OBJECT_COUNT" = "None" ] || [ "$OBJECT_COUNT" = "0" ]; then
205+
echo "No wheel artifacts found for ${WHEEL_VERSION} at s3://${NAV_PYPI_S3_BUCKET}/${S3_PREFIX}/" >&2
206+
exit 1
207+
fi
208+
aws s3api list-objects-v2 --bucket "$NAV_PYPI_S3_BUCKET" --prefix "${S3_PREFIX}/" --query "Contents[].Key" --output text
209+
mkdir -p target/wheels
210+
aws s3 cp "s3://${NAV_PYPI_S3_BUCKET}/${S3_PREFIX}/" target/wheels/ --recursive --exclude "*" --include "*.whl"
211+
ls -la target/wheels/*.whl
212+
213+
- name: Publish wheels to Artifactory
214+
run: |
215+
set -euo pipefail
216+
WHEEL_VERSION="${{ needs.build-python-wheels.outputs.wheel_version }}"
217+
uv run python build/scripts/release.py python-publish --version "$WHEEL_VERSION"

CONTRIBUTING.md

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -327,14 +327,16 @@ automatically, so you generally do not need to generate stubs manually.
327327
### Publishing
328328

329329
Versions are derived from git tags using `setuptools_scm`. No version bumps need to be committed.
330+
Python wheel builds inject the version at build time via
331+
`NAVIGATOR_CARGO_VERSION` (Cargo/SemVer), applied inside wheel-builder Docker
332+
layers, so publish flows do not edit `Cargo.toml`/`Cargo.lock` in the working
333+
tree.
330334

331335
**Version commands:**
332336

333337
```bash
334338
mise run version:print # Show computed versions (python, cargo, docker)
335339
mise run version:print -- --cargo # Show cargo version only
336-
mise run version:set # Update Cargo.toml with git-derived version (or specified with --version)
337-
mise run version:reset # Restore Cargo.toml to git state
338340
```
339341

340342
**Publishing credentials (one-time setup):**
@@ -345,18 +347,24 @@ NAV_PYPI_USERNAME=$USER
345347
NAV_PYPI_PASSWORD=$ARTIFACTORY_PASSWORD" >> .env
346348
```
347349

348-
Docker publishing in CI uses AWS credentials for ECR. Python publishing uses
349-
`NAV_PYPI_*` credentials for Artifactory.
350+
Docker publishing in CI uses AWS credentials for ECR. Python publishing uses a
351+
two-stage flow: wheels are uploaded to S3, then an internal-network runner
352+
publishes them to Artifactory with `NAV_PYPI_*` credentials.
350353

351354
**Main branch publish (CI):**
352355

353356
- Publishes Docker multiarch images to ECR as `:dev`, `:latest`, and a versioned dev tag.
357+
- Builds Linux + macOS (arm64) Python wheels and uploads them to
358+
`s3://navigator-pypi-artifacts/nemoclaw/<wheel-version>/`.
359+
- Runs a publish job on the `nv` runner to list that version prefix, download
360+
the wheels, and publish them to Artifactory.
354361

355362
**Tag release publish (CI):**
356363

357364
- Push a semver tag (`vX.Y.Z`) to trigger release jobs.
358365
- CI publishes Docker multiarch images to ECR as `:X.Y.Z` (no `:latest`).
359-
- CI publishes Linux + macOS (arm64) Python wheels to Artifactory and creates GitHub release notes.
366+
- CI stages Linux + macOS (arm64) Python wheels in S3 and publishes them to
367+
Artifactory from the `nv` runner.
360368

361369
**Tagging a release:**
362370

architecture/build-containers.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,8 @@ A k3s image with bundled Helm charts and Kubernetes manifests for single-contain
121121

122122
Two Dockerfiles produce Python wheels for the CLI package distribution. These are not deployed as running containers.
123123

124-
- **`Dockerfile.python-wheels`** -- Builds Linux amd64/arm64 wheels using Maturin. Installs Rust toolchain and cross-compilation targets. Output stage is `scratch` with only the `.whl` files.
125-
- **`Dockerfile.python-wheels-macos`** -- Builds macOS arm64 wheels using osxcross (cross-compiling from Linux). Uses `crazymax/osxcross:latest` as the cross-toolchain source. The `OSXCROSS_IMAGE` build arg allows using a mirrored registry image instead of Docker Hub.
124+
- **`Dockerfile.python-wheels`** -- Builds Linux amd64/arm64 wheels using Maturin with a two-pass Rust build (dependency prebuild + final wheel build), BuildKit cache mounts for cargo registry/git/target and sccache, and `cross-build.sh` for conditional cross-toolchain installation. The final build step patches the workspace version inside the container layer from `NAVIGATOR_CARGO_VERSION` (computed before the Docker build), preserving cacheable dependency layers and avoiding dirty working-tree edits. Output stage is `scratch` with only the `.whl` files.
125+
- **`Dockerfile.python-wheels-macos`** -- Builds macOS arm64 wheels using osxcross (cross-compiling from Linux) with the same two-pass dependency caching pattern and cargo cache mounts. Version injection uses the same in-container workspace-version patch from `NAVIGATOR_CARGO_VERSION`, avoiding host-side file edits that break Docker layer caching. Uses `crazymax/osxcross:latest` as the cross-toolchain source. The `OSXCROSS_IMAGE` build arg allows using a mirrored registry image instead of Docker Hub.
126126

127127
### CI Runner Image (`navigator-ci`)
128128

@@ -135,7 +135,7 @@ A pre-built Ubuntu 24.04 image for CI pipeline jobs, defined in `deploy/docker/D
135135
| Docker CLI + buildx plugin | DinD-based image build/publish jobs |
136136
| AWS CLI v2 | ECR authentication and image publishing |
137137
| kubectl, helm, protoc | Kubernetes operations, chart packaging, proto compilation |
138-
| mise | Task runner with Rust, Python, and cargo-edit toolchains |
138+
| mise | Task runner with Rust and Python toolchains |
139139
| uv | Python package management (installed from Astral's installer to avoid GitHub API rate limits) |
140140
| sccache | Rust compilation cache (amd64 only; skipped on arm64) |
141141
| socat | Docker socket forwarding in sandbox e2e tests |
@@ -385,6 +385,7 @@ Container builds use Docker BuildKit with local cache directories:
385385

386386
- `build/scripts/docker-build-component.sh` stores per-component caches in `.cache/buildkit/<component>`.
387387
- `build/scripts/docker-build-cluster.sh` stores the cluster image cache in `.cache/buildkit/cluster`.
388+
- `mise run python:build:multiarch` stores per-platform wheel caches in `.cache/buildkit/python-wheels/<platform>` for local builds when using a `docker-container` buildx driver.
388389
- Rust-heavy Dockerfiles use BuildKit cache mounts for cargo registry and target directories, keyed by image name and `TARGETARCH`, with `sharing=locked` to prevent concurrent cache corruption in parallel CI builds.
389390
- When the active buildx driver is `docker` (not `docker-container`), local cache import/export flags are skipped automatically because the docker driver cannot export local caches. In CI, cache export is also skipped.
390391
- For local single-arch builds, the scripts auto-select a builder with the native `docker` driver (matching the active Docker context) so images land directly in the Docker image store without slow tarball export.
@@ -450,6 +451,13 @@ mise run docker:publish:cluster:multiarch
450451
mise run publish:main
451452
```
452453

454+
GitHub Actions stages Python wheels in S3 before final publication to
455+
Artifactory:
456+
457+
- Wheels are uploaded to `s3://navigator-pypi-artifacts/nemoclaw/<wheel-version>/`.
458+
- A follow-up job on the `nv` runner lists that version prefix, downloads the
459+
wheels, and publishes them to Artifactory.
460+
453461
### Auto-Deployed Components in Cluster
454462

455463
When the cluster container starts, k3s automatically deploys these HelmChart CRs from `/var/lib/rancher/k3s/server/manifests/`:

build/publish.toml

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@ run = """
66
#!/usr/bin/env bash
77
set -euo pipefail
88
VERSION=$(uv run python build/scripts/release.py get-version --python)
9-
mise run version:set
10-
trap "mise run version:reset" EXIT
11-
NAVIGATOR_VERSION="$VERSION" mise run python:build:all
9+
CARGO_VERSION=$(uv run python build/scripts/release.py get-version --cargo)
10+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:all
1211
uv run python build/scripts/release.py python-publish --version "$VERSION"
1312
"""
1413

@@ -18,9 +17,8 @@ run = """
1817
#!/usr/bin/env bash
1918
set -euo pipefail
2019
VERSION=$(uv run python build/scripts/release.py get-version --python)
21-
mise run version:set
22-
trap "mise run version:reset" EXIT
23-
NAVIGATOR_VERSION="$VERSION" mise run python:build:macos
20+
CARGO_VERSION=$(uv run python build/scripts/release.py get-version --cargo)
21+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
2422
uv run python build/scripts/release.py python-publish --version "$VERSION" --wheel-glob "*macosx*arm64.whl"
2523
"""
2624

@@ -31,25 +29,23 @@ run = """
3129
set -euo pipefail
3230
VERSION_DOCKER=$(uv run python build/scripts/release.py get-version --docker)
3331
VERSION_PYTHON=$(uv run python build/scripts/release.py get-version --python)
34-
mise run version:set
35-
trap "mise run version:reset" EXIT
32+
CARGO_VERSION=$(uv run python build/scripts/release.py get-version --cargo)
3633
IMAGE_TAG=dev TAG_LATEST=true EXTRA_DOCKER_TAGS="$VERSION_DOCKER" mise run docker:publish:cluster:multiarch
37-
NAVIGATOR_VERSION="$VERSION_PYTHON" mise run python:build:multiarch
38-
NAVIGATOR_VERSION="$VERSION_PYTHON" mise run python:build:macos
34+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:multiarch
35+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
3936
uv run python build/scripts/release.py python-publish --version "$VERSION_PYTHON"
4037
"""
4138

4239
["publish:tag"]
43-
description = "Tag release publish: versioned Docker to ECR and Python to Artifactory"
40+
description = "Tag release publish: versioned Docker to ECR and Python to GitHub Packages"
4441
run = """
4542
#!/usr/bin/env bash
4643
set -euo pipefail
4744
VERSION_DOCKER=$(uv run python build/scripts/release.py get-version --docker)
4845
VERSION_PYTHON=$(uv run python build/scripts/release.py get-version --python)
49-
mise run version:set
50-
trap "mise run version:reset" EXIT
46+
CARGO_VERSION=$(uv run python build/scripts/release.py get-version --cargo)
5147
IMAGE_TAG="$VERSION_DOCKER" TAG_LATEST=false mise run docker:publish:cluster:multiarch
52-
NAVIGATOR_VERSION="$VERSION_PYTHON" mise run python:build:multiarch
53-
NAVIGATOR_VERSION="$VERSION_PYTHON" mise run python:build:macos
48+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:multiarch
49+
NAVIGATOR_CARGO_VERSION="$CARGO_VERSION" mise run python:build:macos
5450
uv run python build/scripts/release.py python-publish --version "$VERSION_PYTHON"
5551
"""

0 commit comments

Comments
 (0)