From 34e3c81bb870165ee30a52e642f50bc9692bc1a4 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Wed, 4 Mar 2026 13:33:51 +0800 Subject: [PATCH 01/25] feat(installer): add offline pack command and CI workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 'pack' command to create self-contained offline deployment bundles. Support all 4 image source × deploy target combinations: install build locally + deploy (existing) install --pull pull from GHCR + deploy (new) pack pull from GHCR + save to bundle (new) pack --local build locally + save to bundle (new) Offline bundles include K3s binary/images, Helm, K9s, ROCm device plugin manifest, all container images, and Helm chart+values. Auto-detected via manifest.json when running from bundle directory. Add pack-bundle.yml CI workflow for manual bundle creation. --- .github/workflows/pack-bundle.yml | 131 ++++++ .gitignore | 4 + auplc-installer | 746 +++++++++++++++++++++++++----- 3 files changed, 766 insertions(+), 115 deletions(-) create mode 100644 .github/workflows/pack-bundle.yml diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml new file mode 100644 index 0000000..b93a32b --- /dev/null +++ b/.github/workflows/pack-bundle.yml @@ -0,0 +1,131 @@ +# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +name: Pack Offline Bundle + +on: + workflow_dispatch: + inputs: + gpu_target: + description: 'GPU target for the bundle' + required: true + default: 'gfx1151' + type: choice + options: + - gfx110x + - gfx1151 + gpu_type: + description: 'GPU type (accelerator name)' + required: true + default: 'strix-halo' + type: choice + options: + - strix-halo + - strix + - phx + create_release: + description: 'Create a GitHub Release with the bundle' + required: false + default: false + type: boolean + +permissions: + contents: write + packages: read + +jobs: + pack: + name: "Pack Bundle (${{ inputs.gpu_target }})" + runs-on: ubuntu-latest + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: false + + - name: Check available disk space + run: df -h / + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} + + - name: Run pack command + run: | + GPU_TYPE="${{ inputs.gpu_type }}" ./auplc-installer pack + + - name: Verify bundle + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + echo "Bundle: ${BUNDLE}" + echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" + + # Extract and verify structure + tar tzf "${BUNDLE}" | head -30 + echo "---" + echo "Total files: 
$(tar tzf "${BUNDLE}" | wc -l)" + + - name: Upload bundle as artifact + uses: actions/upload-artifact@v4 + with: + name: auplc-bundle-${{ inputs.gpu_target }} + path: auplc-bundle-*.tar.gz + retention-days: 7 + compression-level: 0 # already compressed + + - name: Create GitHub Release + if: inputs.create_release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + BUNDLE_NAME=$(basename "${BUNDLE}" .tar.gz) + TAG="bundle-${{ inputs.gpu_target }}-$(date +%Y%m%d)" + + gh release create "${TAG}" "${BUNDLE}" \ + --title "Offline Bundle: ${{ inputs.gpu_target }} ($(date +%Y-%m-%d))" \ + --notes "$(cat < m.daocloud.io/quay.io/jupyterhub/k8s-hub:4.1.0 +# Registry/package mirror configuration MIRROR_PREFIX="${MIRROR_PREFIX:-}" - -# Package manager mirrors (set via environment variables) MIRROR_PIP="${MIRROR_PIP:-}" MIRROR_NPM="${MIRROR_NPM:-}" -# Custom images (built locally) +# Custom images (built locally or pulled from GHCR) CUSTOM_IMAGES=( "ghcr.io/amdresearch/auplc-hub:latest" "ghcr.io/amdresearch/auplc-default:latest" @@ -48,29 +51,35 @@ CUSTOM_IMAGES=( "ghcr.io/amdresearch/auplc-llm:latest" ) -# External images required by JupyterHub (for offline deployment) +# GPU-specific custom images (have :latest- tags) +GPU_CUSTOM_NAMES=("auplc-base" "auplc-cv" "auplc-dl" "auplc-llm" "auplc-physim") + +# Non-GPU custom images (only :latest tag) +PLAIN_CUSTOM_NAMES=("auplc-hub" "auplc-default") + +# External images required by JupyterHub at runtime EXTERNAL_IMAGES=( - # JupyterHub core components "quay.io/jupyterhub/k8s-hub:4.1.0" "quay.io/jupyterhub/configurable-http-proxy:4.6.3" "quay.io/jupyterhub/k8s-secret-sync:4.1.0" "quay.io/jupyterhub/k8s-network-tools:4.1.0" "quay.io/jupyterhub/k8s-image-awaiter:4.1.0" "quay.io/jupyterhub/k8s-singleuser-sample:4.1.0" - # Kubernetes components "registry.k8s.io/kube-scheduler:v1.30.8" "registry.k8s.io/pause:3.10" - # Traefik proxy "traefik:v3.3.1" - # Utility images "curlimages/curl:8.5.0" - # 
Base images for Docker build + "alpine/git:2.47.2" +) + +# Base images only needed for local Docker build, not for runtime or bundle +BUILD_ONLY_IMAGES=( "node:20-alpine" "ubuntu:24.04" "quay.io/jupyter/base-notebook" ) -# Combined list for backward compatibility +# Combined list for backward compatibility (img pull still pulls everything) IMAGES=("${CUSTOM_IMAGES[@]}") # GPU configuration globals (set by detect_and_configure_gpu) @@ -78,8 +87,43 @@ ACCEL_KEY="" GPU_TARGET="" ACCEL_ENV="" +# ============================================================ +# Offline Bundle Detection +# ============================================================ + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OFFLINE_MODE=0 +BUNDLE_DIR="" + +function detect_offline_bundle() { + if [[ ! -f "${SCRIPT_DIR}/manifest.json" ]]; then + return + fi + + BUNDLE_DIR="${SCRIPT_DIR}" + OFFLINE_MODE=1 + K3S_USE_DOCKER=0 + echo "Offline bundle detected at: ${BUNDLE_DIR}" + + # Parse GPU config from manifest without python + local gpu_target accel_key accel_env + gpu_target=$(sed -n 's/.*"gpu_target"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + accel_key=$(sed -n 's/.*"accel_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + accel_env=$(sed -n 's/.*"accel_env"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + + if [[ -n "${gpu_target}" ]]; then + GPU_TARGET="${gpu_target}" + ACCEL_KEY="${accel_key}" + ACCEL_ENV="${accel_env}" + echo " GPU config: accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" + fi +} + +# ============================================================ +# GPU Detection & Configuration +# ============================================================ + function detect_gpu() { - # Try rocminfo first (most readable output) if command -v rocminfo &>/dev/null; then local gfx gfx=$(rocminfo 2>/dev/null | grep -o 'gfx[0-9]*' | head -1) @@ -171,8 
+215,15 @@ function detect_and_configure_gpu() { echo " accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" } +# ============================================================ +# Values Overlay +# ============================================================ + function generate_values_overlay() { local overlay_path="runtime/values.local.yaml" + if [[ "${OFFLINE_MODE}" == "1" ]]; then + overlay_path="${BUNDLE_DIR}/config/values.local.yaml" + fi echo "Generating values overlay: ${overlay_path}" local tag="latest-${GPU_TARGET}" @@ -205,6 +256,10 @@ function generate_values_overlay() { } > "${overlay_path}" } +# ============================================================ +# Tool Installation (Helm, K9s) +# ============================================================ + function check_root() { if [[ $EUID -ne 0 ]]; then echo "Error: This script must be run as root." >&2 @@ -215,9 +270,22 @@ function check_root() { function install_tools() { echo "Checking/Installing tools (may require sudo)..." + if [[ "${OFFLINE_MODE}" == "1" ]]; then + if ! command -v helm &> /dev/null; then + echo "Installing Helm from bundle..." + sudo cp "${BUNDLE_DIR}/bin/helm" /usr/local/bin/helm + sudo chmod +x /usr/local/bin/helm + fi + if ! command -v k9s &> /dev/null; then + echo "Installing K9s from bundle..." + sudo dpkg -i "${BUNDLE_DIR}/bin/k9s_linux_amd64.deb" + fi + return + fi + if ! command -v helm &> /dev/null; then echo "Installing Helm..." - wget https://get.helm.sh/helm-v3.17.2-linux-amd64.tar.gz -O /tmp/helm-linux-amd64.tar.gz + wget https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz -O /tmp/helm-linux-amd64.tar.gz tar -zxvf /tmp/helm-linux-amd64.tar.gz -C /tmp sudo mv /tmp/linux-amd64/helm /usr/local/bin/helm rm /tmp/helm-linux-amd64.tar.gz @@ -226,16 +294,17 @@ function install_tools() { if ! command -v k9s &> /dev/null; then echo "Installing K9s..." 
- wget https://github.com/derailed/k9s/releases/latest/download/k9s_linux_amd64.deb -O /tmp/k9s_linux_amd64.deb + wget "https://github.com/derailed/k9s/releases/download/${K9S_VERSION}/k9s_linux_amd64.deb" -O /tmp/k9s_linux_amd64.deb sudo apt install /tmp/k9s_linux_amd64.deb -y rm /tmp/k9s_linux_amd64.deb fi } -function configure_registry_mirrors() { - # Configure K3s registry mirrors using MIRROR_PREFIX - # This must be done BEFORE k3s starts +# ============================================================ +# K3s Management +# ============================================================ +function configure_registry_mirrors() { if [[ -z "${MIRROR_PREFIX}" ]]; then echo "No registry mirror configured. Using default registries." return 0 @@ -244,7 +313,6 @@ function configure_registry_mirrors() { echo "Configuring registry mirrors with prefix: ${MIRROR_PREFIX}" sudo mkdir -p "$(dirname "${K3S_REGISTRIES_FILE}")" - # Configure mirrors for all registries using the prefix pattern local config="mirrors: docker.io: endpoint: @@ -263,15 +331,9 @@ function configure_registry_mirrors() { echo "Registry mirrors configured at ${K3S_REGISTRIES_FILE}" } -# Dummy interface IP for K3s node binding -# Using a private IP range that won't conflict with typical networks K3S_NODE_IP="10.255.255.1" function setup_dummy_interface() { - # Create a dummy network interface for offline/portable operation - # This provides a stable node IP that doesn't change when WiFi/network changes - # Reference: https://docs.k3s.io/installation/airgap - if ip link show dummy0 &>/dev/null; then echo "Dummy interface already exists, skipping setup" return 0 @@ -281,10 +343,8 @@ function setup_dummy_interface() { sudo ip link add dummy0 type dummy sudo ip link set dummy0 up sudo ip addr add "${K3S_NODE_IP}/32" dev dummy0 - # Add a low-priority default route so K3s can detect a valid route sudo ip route add default via "${K3S_NODE_IP}" dev dummy0 metric 1000 2>/dev/null || true - # Make persistent across 
reboots cat << EOF | sudo tee /etc/systemd/system/dummy-interface.service > /dev/null [Unit] Description=Setup dummy network interface for K3s portable operation @@ -308,30 +368,41 @@ EOF function install_k3s_single_node() { echo "Starting K3s installation..." - if [[ "${K3S_USE_DOCKER}" == "1" ]]; then - echo "Using Docker as container runtime (K3S_USE_DOCKER=1). Images stay in Docker; no export to agent/images." - if ! command -v docker &> /dev/null; then - echo "Error: K3S_USE_DOCKER is set but Docker is not installed. Install Docker first." >&2 - exit 1 - fi - fi - - # Setup dummy interface for offline operation setup_dummy_interface + local k3s_exec="--node-ip=${K3S_NODE_IP} --flannel-iface=dummy0" - # Configure registry mirrors before starting k3s - configure_registry_mirrors + if [[ "${OFFLINE_MODE}" == "1" ]]; then + echo "Offline mode: installing K3s from bundle (containerd)..." - # Build K3s server exec flags (--docker = use host Docker so image updates are visible in dev) - local k3s_exec="--node-ip=${K3S_NODE_IP} --flannel-iface=dummy0" - if [[ "${K3S_USE_DOCKER}" == "1" ]]; then - k3s_exec="${k3s_exec} --docker" - fi + sudo cp "${BUNDLE_DIR}/bin/k3s" /usr/local/bin/k3s + sudo chmod +x /usr/local/bin/k3s - # Bind K3s to dummy interface IP for portable operation - # With --docker, K3s uses host Docker; image updates (e.g. make hub) are visible without re-export. - curl -sfL https://get.k3s.io | sudo K3S_KUBECONFIG_MODE="644" \ - INSTALL_K3S_EXEC="${k3s_exec}" sh - + sudo mkdir -p "${K3S_IMAGES_DIR}" + for img_file in "${BUNDLE_DIR}"/k3s-images/*; do + [[ -f "${img_file}" ]] || continue + echo " Copying: $(basename "${img_file}")" + sudo cp "${img_file}" "${K3S_IMAGES_DIR}/" + done + + sudo INSTALL_K3S_SKIP_DOWNLOAD=true \ + K3S_KUBECONFIG_MODE="644" \ + INSTALL_K3S_EXEC="${k3s_exec}" \ + bash "${BUNDLE_DIR}/bin/k3s-install.sh" + else + if [[ "${K3S_USE_DOCKER}" == "1" ]]; then + echo "Using Docker as container runtime (K3S_USE_DOCKER=1)." + if ! 
command -v docker &> /dev/null; then + echo "Error: K3S_USE_DOCKER is set but Docker is not installed." >&2 + exit 1 + fi + k3s_exec="${k3s_exec} --docker" + fi + + configure_registry_mirrors + + curl -sfL https://get.k3s.io | sudo K3S_KUBECONFIG_MODE="644" \ + INSTALL_K3S_EXEC="${k3s_exec}" sh - + fi echo "Configuring kubeconfig for user: $(whoami)" mkdir -p "$HOME/.kube" @@ -367,7 +438,6 @@ function remove_k3s() { echo "Removing K3S local data" sudo rm -rf /var/lib/rancher/k3s - # Remove dummy interface service if [[ -f /etc/systemd/system/dummy-interface.service ]]; then echo "Removing dummy interface service..." sudo systemctl disable dummy-interface.service 2>/dev/null || true @@ -375,13 +445,16 @@ function remove_k3s() { sudo systemctl daemon-reload fi - # Remove dummy interface if ip link show dummy0 &>/dev/null; then echo "Removing dummy interface..." sudo ip link del dummy0 fi } +# ============================================================ +# GPU Device Plugin +# ============================================================ + function deploy_rocm_gpu_device_plugin() { echo "Deploying ROCm GPU device plugin..." @@ -390,7 +463,11 @@ function deploy_rocm_gpu_device_plugin() { return 0 fi - kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml + if [[ "${OFFLINE_MODE}" == "1" ]]; then + kubectl create -f "${BUNDLE_DIR}/manifests/k8s-ds-amdgpu-dp.yaml" + else + kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml + fi if ! kubectl wait --for=jsonpath='{.status.numberReady}'=1 --namespace=kube-system ds/amdgpu-device-plugin-daemonset --timeout=300s | grep "condition met"; then exit 1 @@ -399,34 +476,51 @@ function deploy_rocm_gpu_device_plugin() { fi } -function deply_aup_learning_cloud_runtime() { - detect_and_configure_gpu - generate_values_overlay - - echo "Deploying AUP Learning Cloud Runtime..." 
+# ============================================================ +# Image Helpers +# ============================================================ - helm install jupyterhub runtime/chart --namespace jupyterhub \ - --create-namespace -f runtime/values.yaml -f runtime/values.local.yaml +# Apply MIRROR_PREFIX to an image reference for pulling +function resolve_pull_ref() { + local image="$1" + local full_image="${image}" + local first_segment="${image%%/*}" - echo "Waiting for JupyterHub deployments to be ready..." - kubectl wait --namespace jupyterhub \ - --for=condition=available --timeout=600s \ - deployment/hub deployment/proxy deployment/user-scheduler + if [[ "${image}" == *"/"* ]]; then + [[ "${first_segment}" != *"."* ]] && full_image="docker.io/${image}" + else + full_image="docker.io/library/${image}" + fi - kubectl label "$(kubectl get nodes -o name)" node-type="${ACCEL_KEY}" --overwrite + if [[ -n "${MIRROR_PREFIX}" ]]; then + echo "${MIRROR_PREFIX}/${full_image}" + else + echo "${full_image}" + fi } -function upgrade_aup_learning_cloud_runtime() { - detect_and_configure_gpu - generate_values_overlay +# Pull a single image, apply mirror prefix, tag back to original name. +# Returns 0 on success, 1 on failure. +function pull_and_tag() { + local image="$1" + local pull_ref + pull_ref=$(resolve_pull_ref "${image}") + + echo " Pulling: ${pull_ref}" + if ! docker pull "${pull_ref}"; then + echo " FAILED: ${image}" + return 1 + fi - helm upgrade jupyterhub runtime/chart --namespace jupyterhub \ - --create-namespace -f runtime/values.yaml -f runtime/values.local.yaml + if [[ "${pull_ref}" != "${image}" ]]; then + docker tag "${pull_ref}" "${image}" + fi + return 0 } -function remove_aup_learning_cloud_runtime() { - helm uninstall jupyterhub --namespace jupyterhub -} +# ============================================================ +# Image: Local Build +# ============================================================ # Build local images. 
Optional: list of Makefile targets (e.g. hub, cv, base-cpu). Default: all. function local_image_build() { @@ -438,7 +532,6 @@ function local_image_build() { local targets=("${@:-all}") echo "Building local images: ${targets[*]}" - # When using Docker runtime, images stay in Docker; no need to export to K3S_IMAGES_DIR if [[ "${K3S_USE_DOCKER}" != "1" ]]; then if [ ! -d "${K3S_IMAGES_DIR}" ]; then sudo mkdir -p "${K3S_IMAGES_DIR}" @@ -448,7 +541,6 @@ function local_image_build() { echo "Build images in Docker (K3S_USE_DOCKER=1; K3s will use them directly)" fi - # Makefile: SAVE_IMAGES=1 and K3S_IMAGES_DIR only when not using Docker backend (containerd + export) local save_images_for_make="" local images_dir_for_make="" if [[ "${K3S_USE_DOCKER}" != "1" ]]; then @@ -471,21 +563,85 @@ function local_image_build() { echo "-------------------------------------------" } -function pull_external_images() { - # Pull external images. When K3S_USE_DOCKER=1, keep in Docker only; else also save to K3S_IMAGES_DIR for offline. +# ============================================================ +# Image: Pull from GHCR (custom images) +# ============================================================ +function pull_custom_images() { if ! command -v docker &> /dev/null; then echo "Please install docker" exit 1 fi + detect_and_configure_gpu + local tag="latest-${GPU_TARGET}" + echo "===========================================" - echo "Pulling external images..." - if [[ "${K3S_USE_DOCKER}" == "1" ]]; then - echo "K3S_USE_DOCKER=1: images stay in Docker (no export to K3s image dir)" + echo "Pulling pre-built custom images from GHCR..." + echo " GPU_TARGET=${GPU_TARGET}, tag=${tag}" + echo "===========================================" + + if [[ "${K3S_USE_DOCKER}" != "1" && ! 
-d "${K3S_IMAGES_DIR}" ]]; then + sudo mkdir -p "${K3S_IMAGES_DIR}" + fi + + local failed_images=() + + # GPU-specific images: pull :latest-, also tag as :latest + for name in "${GPU_CUSTOM_NAMES[@]}"; do + local image="ghcr.io/amdresearch/${name}:${tag}" + if pull_and_tag "${image}"; then + docker tag "${image}" "ghcr.io/amdresearch/${name}:latest" + + if [[ "${K3S_USE_DOCKER}" != "1" ]]; then + sudo docker save \ + "ghcr.io/amdresearch/${name}:latest" \ + "ghcr.io/amdresearch/${name}:${tag}" \ + -o "${K3S_IMAGES_DIR}/${name}.tar" + fi + else + failed_images+=("${image}") + fi + done + + # Non-GPU images: pull :latest + for name in "${PLAIN_CUSTOM_NAMES[@]}"; do + local image="ghcr.io/amdresearch/${name}:latest" + if pull_and_tag "${image}"; then + if [[ "${K3S_USE_DOCKER}" != "1" ]]; then + sudo docker save "${image}" -o "${K3S_IMAGES_DIR}/${name}.tar" + fi + else + failed_images+=("${image}") + fi + done + + echo "===========================================" + if [[ ${#failed_images[@]} -eq 0 ]]; then + echo "All custom images pulled successfully!" else - echo "Saving to K3s image pool for offline deployment" + echo "Failed images:" + for img in "${failed_images[@]}"; do echo " - ${img}"; done + echo "Warning: Some custom images failed." fi + echo "===========================================" +} + +# ============================================================ +# Image: Pull External Images +# ============================================================ + +function pull_external_images() { + if ! command -v docker &> /dev/null; then + echo "Please install docker" + exit 1 + fi + + # When called during 'install --pull', skip build-only images + local skip_build_only="${1:-0}" + + echo "===========================================" + echo "Pulling external images..." 
if [[ -n "${MIRROR_PREFIX}" ]]; then echo "Using mirror prefix: ${MIRROR_PREFIX}" fi @@ -495,46 +651,37 @@ function pull_external_images() { sudo mkdir -p "${K3S_IMAGES_DIR}" fi + # Build image list, combining EXTERNAL_IMAGES + optionally BUILD_ONLY_IMAGES + local images_to_pull=("${EXTERNAL_IMAGES[@]}") + if [[ "${skip_build_only}" != "1" ]]; then + images_to_pull+=("${BUILD_ONLY_IMAGES[@]}") + fi + local failed_images=() - for image in "${EXTERNAL_IMAGES[@]}"; do - # Determine the full image path for pulling with mirror - # Images without registry prefix are from docker.io + for image in "${images_to_pull[@]}"; do local full_image="${image}" local first_segment="${image%%/*}" if [[ "${image}" == *"/"* ]]; then - # Has slash - check if first segment looks like a registry (contains a dot) - if [[ "${first_segment}" != *"."* ]]; then - # No dot in first segment, it's docker.io (e.g., curlimages/curl) - full_image="docker.io/${image}" - fi + [[ "${first_segment}" != *"."* ]] && full_image="docker.io/${image}" else - # No slash - it's an official docker image (e.g., traefik:v3.3.1) full_image="docker.io/library/${image}" fi - # Apply mirror prefix if set local pull_image="${full_image}" - if [[ -n "${MIRROR_PREFIX}" ]]; then - pull_image="${MIRROR_PREFIX}/${full_image}" - fi + [[ -n "${MIRROR_PREFIX}" ]] && pull_image="${MIRROR_PREFIX}/${full_image}" echo "-------------------------------------------" echo "Pulling: ${pull_image}" if docker pull "${pull_image}"; then - # Tag to original name so K3s can use it - if [[ "${pull_image}" != "${image}" ]]; then - docker tag "${pull_image}" "${image}" - fi + [[ "${pull_image}" != "${image}" ]] && docker tag "${pull_image}" "${image}" - # Also tag to mirror-prefixed name so Docker build with MIRROR_PREFIX can use local cache if [[ -n "${MIRROR_PREFIX}" && "${pull_image}" != "${MIRROR_PREFIX}/${full_image}" ]]; then docker tag "${pull_image}" "${MIRROR_PREFIX}/${full_image}" fi - # Save to K3S_IMAGES_DIR only when not using 
Docker backend (so K3s can load at boot) if [[ "${K3S_USE_DOCKER}" != "1" && -n "${K3S_IMAGES_DIR}" ]]; then local filename filename=$(echo "${image}" | sed 's/[\/:]/-/g').tar @@ -560,22 +707,112 @@ function pull_external_images() { echo "All external images pulled and saved successfully!" else echo "Failed images:" - for img in "${failed_images[@]}"; do - echo " - ${img}" - done + for img in "${failed_images[@]}"; do echo " - ${img}"; done echo "Warning: Some images failed. Deployment may require internet access." fi echo "===========================================" } +# ============================================================ +# Image: Load from Offline Bundle +# ============================================================ + +function load_offline_images() { + echo "===========================================" + echo "Loading images from offline bundle..." + echo "===========================================" + + local loaded=0 failed=0 + + for tar_file in "${BUNDLE_DIR}/images/"custom/*.tar "${BUNDLE_DIR}/images/"external/*.tar; do + [[ -f "${tar_file}" ]] || continue + echo " Importing: $(basename "${tar_file}")" + if sudo k3s ctr images import "${tar_file}" 2>/dev/null; then + loaded=$((loaded + 1)) + else + echo " Failed!" + failed=$((failed + 1)) + fi + done + + echo "===========================================" + echo "Loaded ${loaded} images, ${failed} failed" + [[ "${failed}" -gt 0 ]] && echo "Warning: Some images failed to load." 
+ echo "===========================================" +} + +# ============================================================ +# Runtime Management +# ============================================================ + +# Resolve chart/values paths (bundle or local repo) +function get_runtime_paths() { + if [[ "${OFFLINE_MODE}" == "1" ]]; then + CHART_PATH="${BUNDLE_DIR}/chart" + VALUES_PATH="${BUNDLE_DIR}/config/values.yaml" + OVERLAY_PATH="${BUNDLE_DIR}/config/values.local.yaml" + else + CHART_PATH="runtime/chart" + VALUES_PATH="runtime/values.yaml" + OVERLAY_PATH="runtime/values.local.yaml" + fi +} + +function deply_aup_learning_cloud_runtime() { + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + + echo "Deploying AUP Learning Cloud Runtime..." + + helm install jupyterhub "${CHART_PATH}" --namespace jupyterhub \ + --create-namespace -f "${VALUES_PATH}" -f "${OVERLAY_PATH}" + + echo "Waiting for JupyterHub deployments to be ready..." + kubectl wait --namespace jupyterhub \ + --for=condition=available --timeout=600s \ + deployment/hub deployment/proxy deployment/user-scheduler + + kubectl label "$(kubectl get nodes -o name)" node-type="${ACCEL_KEY}" --overwrite +} + +function upgrade_aup_learning_cloud_runtime() { + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + + helm upgrade jupyterhub "${CHART_PATH}" --namespace jupyterhub \ + --create-namespace -f "${VALUES_PATH}" -f "${OVERLAY_PATH}" +} + +function remove_aup_learning_cloud_runtime() { + helm uninstall jupyterhub --namespace jupyterhub +} + +# ============================================================ +# Deployment Orchestration +# ============================================================ + function deploy_all_components() { + local flag="${1:-}" + detect_and_configure_gpu + get_runtime_paths generate_values_overlay install_tools install_k3s_single_node deploy_rocm_gpu_device_plugin - pull_external_images - local_image_build + + if [[ "${OFFLINE_MODE}" == "1" ]]; 
then + load_offline_images + elif [[ "${flag}" == "--pull" ]]; then + pull_custom_images + pull_external_images 1 # skip build-only images + else + pull_external_images + local_image_build + fi + deply_aup_learning_cloud_runtime } @@ -584,22 +821,280 @@ function remove_all_components() { remove_k3s } +# ============================================================ +# Pack: Create Offline Bundle +# ============================================================ + +function pack_download_binaries() { + local staging="$1" + local k3s_url_ver + k3s_url_ver=$(echo "${K3S_VERSION}" | sed 's/+/%2B/g') + + echo "--- Downloading binaries ---" + mkdir -p "${staging}/bin" + + echo " K3s ${K3S_VERSION}..." + wget -q "https://github.com/k3s-io/k3s/releases/download/${k3s_url_ver}/k3s" \ + -O "${staging}/bin/k3s" + chmod +x "${staging}/bin/k3s" + + echo " K3s install script..." + wget -q "https://get.k3s.io" -O "${staging}/bin/k3s-install.sh" + chmod +x "${staging}/bin/k3s-install.sh" + + echo " Helm ${HELM_VERSION}..." + wget -q "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" -O /tmp/helm-pack.tar.gz + tar -zxf /tmp/helm-pack.tar.gz -C /tmp linux-amd64/helm + mv /tmp/linux-amd64/helm "${staging}/bin/helm" + chmod +x "${staging}/bin/helm" + rm -rf /tmp/helm-pack.tar.gz /tmp/linux-amd64 + + echo " K9s ${K9S_VERSION}..." 
+ wget -q "https://github.com/derailed/k9s/releases/download/${K9S_VERSION}/k9s_linux_amd64.deb" \ + -O "${staging}/bin/k9s_linux_amd64.deb" +} + +function pack_download_k3s_images() { + local staging="$1" + local k3s_url_ver + k3s_url_ver=$(echo "${K3S_VERSION}" | sed 's/+/%2B/g') + + echo "--- Downloading K3s airgap images ---" + mkdir -p "${staging}/k3s-images" + + wget -q "https://github.com/k3s-io/k3s/releases/download/${k3s_url_ver}/k3s-airgap-images-amd64.tar.zst" \ + -O "${staging}/k3s-images/k3s-airgap-images-amd64.tar.zst" +} + +function pack_save_manifests() { + local staging="$1" + echo "--- Saving manifests ---" + mkdir -p "${staging}/manifests" + + wget -q "https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml" \ + -O "${staging}/manifests/k8s-ds-amdgpu-dp.yaml" + echo " Saved ROCm device plugin DaemonSet." +} + +function pack_copy_chart() { + local staging="$1" + echo "--- Copying chart and config ---" + + cp -r runtime/chart "${staging}/chart" + mkdir -p "${staging}/config" + cp runtime/values.yaml "${staging}/config/values.yaml" +} + +# Save custom images: pull from GHCR, then docker save +function pack_save_custom_images_pull() { + local staging="$1" + local tag="latest-${GPU_TARGET}" + + echo "--- Pulling and saving custom images from GHCR ---" + mkdir -p "${staging}/images/custom" + + for name in "${GPU_CUSTOM_NAMES[@]}"; do + local image="ghcr.io/amdresearch/${name}:${tag}" + if pull_and_tag "${image}"; then + docker tag "${image}" "ghcr.io/amdresearch/${name}:latest" + docker save \ + "ghcr.io/amdresearch/${name}:latest" \ + "ghcr.io/amdresearch/${name}:${tag}" \ + -o "${staging}/images/custom/${name}.tar" + echo " Saved: ${name} (:latest + :${tag})" + else + echo " ERROR: Failed to pull ${image}" >&2 + fi + done + + for name in "${PLAIN_CUSTOM_NAMES[@]}"; do + local image="ghcr.io/amdresearch/${name}:latest" + if pull_and_tag "${image}"; then + docker save "${image}" -o "${staging}/images/custom/${name}.tar" + 
+        echo "   Saved: ${name}" +    else +        echo "  ERROR: Failed to pull ${image}" >&2 +    fi + done +} + +# Save custom images: build locally via Makefile, then docker save +function pack_save_custom_images_local() { + local staging="$1" + local tag="latest-${GPU_TARGET}" + + echo "--- Building and saving custom images locally ---" + mkdir -p "${staging}/images/custom" + + # Build all images to Docker daemon (no K3s export) + (cd dockerfiles/ && make \ + GPU_TARGET="${GPU_TARGET}" \ + MIRROR_PREFIX="${MIRROR_PREFIX}" \ + MIRROR_PIP="${MIRROR_PIP}" \ + MIRROR_NPM="${MIRROR_NPM}" \ + all) + + echo "--- Saving built images to bundle ---" + + for name in "${GPU_CUSTOM_NAMES[@]}"; do + docker save \ + "ghcr.io/amdresearch/${name}:latest" \ + "ghcr.io/amdresearch/${name}:${tag}" \ + -o "${staging}/images/custom/${name}.tar" + echo " Saved: ${name} (:latest + :${tag})" + done + + for name in "${PLAIN_CUSTOM_NAMES[@]}"; do + docker save "ghcr.io/amdresearch/${name}:latest" \ + -o "${staging}/images/custom/${name}.tar" + echo " Saved: ${name}" + done +} + +# Save external images (always pulled from registries) +function pack_save_external_images() { + local staging="$1" + echo "--- Pulling and saving external images ---" + mkdir -p "${staging}/images/external" + + # Build list: runtime external images (skip build-only) + local pack_images=("${EXTERNAL_IMAGES[@]}") + + # Extract ROCm device plugin image from saved manifest + if [[ -f "${staging}/manifests/k8s-ds-amdgpu-dp.yaml" ]]; then + local dp_image + dp_image=$(sed -n 's/.*image:[[:space:]]*\([^ ]*\).*/\1/p' "${staging}/manifests/k8s-ds-amdgpu-dp.yaml" | head -1) + if [[ -n "${dp_image}" ]]; then + echo " Found device plugin image: ${dp_image}" + pack_images+=("${dp_image}") + fi + fi + + local failed_images=() + for image in "${pack_images[@]}"; do + if pull_and_tag "${image}"; then + local filename + filename=$(echo "${image}" | sed 's/[\/:]/-/g').tar + docker save "${image}" -o "${staging}/images/external/${filename}" + echo " 
Saved: ${image}" + else + failed_images+=("${image}") + fi + done + + if [[ ${#failed_images[@]} -gt 0 ]]; then + echo " WARNING: Failed to pull ${#failed_images[@]} images:" + for img in "${failed_images[@]}"; do echo " - ${img}"; done + fi +} + +function pack_write_manifest() { + local staging="$1" + cat > "${staging}/manifest.json" << EOF +{ + "format_version": "1", + "build_date": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", + "gpu_target": "${GPU_TARGET}", + "accel_key": "${ACCEL_KEY}", + "accel_env": "${ACCEL_ENV}", + "k3s_version": "${K3S_VERSION}", + "helm_version": "${HELM_VERSION}", + "k9s_version": "${K9S_VERSION}" +} +EOF +} + +function pack_bundle() { + local flag="${1:-}" + + echo "===========================================" + echo "AUP Learning Cloud - Pack Offline Bundle" + if [[ "${flag}" == "--local" ]]; then + echo " Image source: local build" + else + echo " Image source: pull from GHCR" + fi + echo "===========================================" + + if ! command -v docker &> /dev/null; then + echo "Error: Docker is required." >&2 + exit 1 + fi + + detect_and_configure_gpu + + local date_stamp + date_stamp=$(date +%Y%m%d) + local bundle_name="auplc-bundle-${GPU_TARGET}-${date_stamp}" + + [[ -d "${bundle_name}" ]] && rm -rf "${bundle_name}" + mkdir -p "${bundle_name}" + + # Copy installer itself + cp "${BASH_SOURCE[0]}" "${bundle_name}/auplc-installer" + chmod +x "${bundle_name}/auplc-installer" + + pack_download_binaries "${bundle_name}" + pack_download_k3s_images "${bundle_name}" + pack_save_manifests "${bundle_name}" + + if [[ "${flag}" == "--local" ]]; then + pack_save_custom_images_local "${bundle_name}" + else + pack_save_custom_images_pull "${bundle_name}" + fi + + pack_save_external_images "${bundle_name}" + pack_copy_chart "${bundle_name}" + pack_write_manifest "${bundle_name}" + + echo "===========================================" + echo "Creating archive: ${bundle_name}.tar.gz ..." 
+ echo "===========================================" + + tar czf "${bundle_name}.tar.gz" "${bundle_name}/" + rm -rf "${bundle_name}" + + local size + size=$(du -sh "${bundle_name}.tar.gz" | cut -f1) + + echo "===========================================" + echo "Bundle created: ${bundle_name}.tar.gz (${size})" + echo "" + echo "Deploy on air-gapped machine:" + echo " tar xzf ${bundle_name}.tar.gz" + echo " cd ${bundle_name}" + echo " sudo ./auplc-installer install" + echo "===========================================" +} + +# ============================================================ +# Help +# ============================================================ + function show_help() { cat << 'EOF' -Usage: ./auplc-installer [subcommand] +Usage: ./auplc-installer [options] Commands: - install Full installation (k3s + images + runtime) - uninstall Remove everything + install [--pull] Full installation (k3s + images + runtime) + Default: build images locally via Makefile + --pull: use pre-built images from GHCR (no local build needed) + + pack [--local] Create offline deployment bundle (requires Docker + internet) + Default: pull pre-built images from GHCR + --local: build images locally then pack (needs build deps) + + uninstall Remove everything (K3s + runtime) install-tools Install helm and k9s rt install Deploy JupyterHub runtime only - rt reinstall Reinstall JupyterHub runtime (For container images changes) - rt upgrade Upgrade JupyterHub runtime (For vaules.yaml changes) + rt reinstall Reinstall JupyterHub runtime (for container image changes) + rt upgrade Upgrade JupyterHub runtime (for values.yaml changes) rt remove Remove JupyterHub runtime - img build Build all custom images - img build [target...] Build custom images (default: all). e.g. img build hub, img build hub cv + img build [target...] 
Build custom images (default: all) + Targets: all, hub, base-cpu, base-rocm, cv, dl, llm, physim img pull Pull external images for offline use detect-gpu Show detected GPU configuration @@ -625,9 +1120,23 @@ Options (can also be set via environment variables): ./auplc-installer img build base-rocm --gpu=strix ./auplc-installer install --mirror=mirror.example.com +Offline Deployment: + 1. On a machine with internet access, create bundle: + ./auplc-installer pack --gpu=strix-halo # pull from GHCR + ./auplc-installer pack --gpu=strix-halo --local # or build locally + + 2. Transfer bundle to air-gapped machine, then: + tar xzf auplc-bundle-gfx1151-*.tar.gz + cd auplc-bundle-gfx1151-* + sudo ./auplc-installer install + EOF } +# ============================================================ +# Main +# ============================================================ + # Parse global options (--key=value flags override environment variables) args=() for arg in "$@"; do @@ -642,17 +1151,24 @@ for arg in "$@"; do done set -- "${args[@]}" +# Detect offline bundle at startup +detect_offline_bundle + if [[ $# -eq 0 ]]; then show_help exit 1 fi case "$1" in - install) deploy_all_components ;; + install) + deploy_all_components "${2:-}" + ;; + pack) + pack_bundle "${2:-}" + ;; uninstall) remove_all_components ;; install-tools) install_tools ;; detect-gpu) detect_and_configure_gpu ;; - # New short form: rt / img rt) case "${2:-}" in install) deply_aup_learning_cloud_runtime ;; From c31a2f131664a5e4cc32e62897593d33ec01cf55 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Wed, 4 Mar 2026 13:51:17 +0800 Subject: [PATCH 02/25] fix(pack): add IMAGE_REGISTRY support and fail on pull errors - Add IMAGE_REGISTRY env var (default: ghcr.io/amdresearch) for configurable image source in pack and install --pull - Pack now exits with error if any custom or external image fails to pull, preventing incomplete bundles - Add image_registry input to pack-bundle 
CI workflow - Read IMAGE_REGISTRY from bundle manifest for offline installs --- .github/workflows/pack-bundle.yml | 9 +++- auplc-installer | 81 +++++++++++++++++++------------ 2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index b93a32b..2cbd2e6 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -39,6 +39,11 @@ on: - strix-halo - strix - phx + image_registry: + description: 'Registry prefix for custom images' + required: false + default: 'ghcr.io/amdresearch' + type: string create_release: description: 'Create a GitHub Release with the bundle' required: false @@ -80,7 +85,9 @@ jobs: - name: Run pack command run: | - GPU_TYPE="${{ inputs.gpu_type }}" ./auplc-installer pack + GPU_TYPE="${{ inputs.gpu_type }}" \ + IMAGE_REGISTRY="${{ inputs.image_registry }}" \ + ./auplc-installer pack - name: Verify bundle run: | diff --git a/auplc-installer b/auplc-installer index 9deabdb..a955e5f 100755 --- a/auplc-installer +++ b/auplc-installer @@ -42,13 +42,16 @@ MIRROR_PREFIX="${MIRROR_PREFIX:-}" MIRROR_PIP="${MIRROR_PIP:-}" MIRROR_NPM="${MIRROR_NPM:-}" +# Registry prefix for custom images (override for forks or private registries) +IMAGE_REGISTRY="${IMAGE_REGISTRY:-ghcr.io/amdresearch}" + # Custom images (built locally or pulled from GHCR) CUSTOM_IMAGES=( - "ghcr.io/amdresearch/auplc-hub:latest" - "ghcr.io/amdresearch/auplc-default:latest" - "ghcr.io/amdresearch/auplc-cv:latest" - "ghcr.io/amdresearch/auplc-dl:latest" - "ghcr.io/amdresearch/auplc-llm:latest" + "${IMAGE_REGISTRY}/auplc-hub:latest" + "${IMAGE_REGISTRY}/auplc-default:latest" + "${IMAGE_REGISTRY}/auplc-cv:latest" + "${IMAGE_REGISTRY}/auplc-dl:latest" + "${IMAGE_REGISTRY}/auplc-llm:latest" ) # GPU-specific custom images (have :latest- tags) @@ -105,16 +108,18 @@ function detect_offline_bundle() { K3S_USE_DOCKER=0 echo "Offline bundle detected at: ${BUNDLE_DIR}" - # Parse GPU config from 
manifest without python - local gpu_target accel_key accel_env + # Parse config from manifest without python + local gpu_target accel_key accel_env image_registry gpu_target=$(sed -n 's/.*"gpu_target"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") accel_key=$(sed -n 's/.*"accel_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") accel_env=$(sed -n 's/.*"accel_env"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + image_registry=$(sed -n 's/.*"image_registry"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") if [[ -n "${gpu_target}" ]]; then GPU_TARGET="${gpu_target}" ACCEL_KEY="${accel_key}" ACCEL_ENV="${accel_env}" + [[ -n "${image_registry}" ]] && IMAGE_REGISTRY="${image_registry}" echo " GPU config: accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" fi } @@ -242,11 +247,11 @@ function generate_values_overlay() { echo " resources:" echo " images:" - echo " gpu: \"ghcr.io/amdresearch/auplc-base:${tag}\"" - echo " Course-CV: \"ghcr.io/amdresearch/auplc-cv:${tag}\"" - echo " Course-DL: \"ghcr.io/amdresearch/auplc-dl:${tag}\"" - echo " Course-LLM: \"ghcr.io/amdresearch/auplc-llm:${tag}\"" - echo " Course-PhySim: \"ghcr.io/amdresearch/auplc-physim:${tag}\"" + echo " gpu: \"${IMAGE_REGISTRY}/auplc-base:${tag}\"" + echo " Course-CV: \"${IMAGE_REGISTRY}/auplc-cv:${tag}\"" + echo " Course-DL: \"${IMAGE_REGISTRY}/auplc-dl:${tag}\"" + echo " Course-LLM: \"${IMAGE_REGISTRY}/auplc-llm:${tag}\"" + echo " Course-PhySim: \"${IMAGE_REGISTRY}/auplc-physim:${tag}\"" echo " metadata:" for resource in gpu Course-CV Course-DL Course-LLM Course-PhySim; do echo " ${resource}:" @@ -589,14 +594,14 @@ function pull_custom_images() { # GPU-specific images: pull :latest-, also tag as :latest for name in "${GPU_CUSTOM_NAMES[@]}"; do - local image="ghcr.io/amdresearch/${name}:${tag}" + local image="${IMAGE_REGISTRY}/${name}:${tag}" if 
pull_and_tag "${image}"; then - docker tag "${image}" "ghcr.io/amdresearch/${name}:latest" + docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" if [[ "${K3S_USE_DOCKER}" != "1" ]]; then sudo docker save \ - "ghcr.io/amdresearch/${name}:latest" \ - "ghcr.io/amdresearch/${name}:${tag}" \ + "${IMAGE_REGISTRY}/${name}:latest" \ + "${IMAGE_REGISTRY}/${name}:${tag}" \ -o "${K3S_IMAGES_DIR}/${name}.tar" fi else @@ -606,7 +611,7 @@ function pull_custom_images() { # Non-GPU images: pull :latest for name in "${PLAIN_CUSTOM_NAMES[@]}"; do - local image="ghcr.io/amdresearch/${name}:latest" + local image="${IMAGE_REGISTRY}/${name}:latest" if pull_and_tag "${image}"; then if [[ "${K3S_USE_DOCKER}" != "1" ]]; then sudo docker save "${image}" -o "${K3S_IMAGES_DIR}/${name}.tar" @@ -890,32 +895,41 @@ function pack_save_custom_images_pull() { local staging="$1" local tag="latest-${GPU_TARGET}" - echo "--- Pulling and saving custom images from GHCR ---" + echo "--- Pulling and saving custom images (${IMAGE_REGISTRY}) ---" mkdir -p "${staging}/images/custom" + local failed=0 + for name in "${GPU_CUSTOM_NAMES[@]}"; do - local image="ghcr.io/amdresearch/${name}:${tag}" + local image="${IMAGE_REGISTRY}/${name}:${tag}" if pull_and_tag "${image}"; then - docker tag "${image}" "ghcr.io/amdresearch/${name}:latest" + docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" docker save \ - "ghcr.io/amdresearch/${name}:latest" \ - "ghcr.io/amdresearch/${name}:${tag}" \ + "${IMAGE_REGISTRY}/${name}:latest" \ + "${IMAGE_REGISTRY}/${name}:${tag}" \ -o "${staging}/images/custom/${name}.tar" echo " Saved: ${name} (:latest + :${tag})" else - echo " ERROR: Failed to pull ${image}" >&2 + failed=$((failed + 1)) fi done for name in "${PLAIN_CUSTOM_NAMES[@]}"; do - local image="ghcr.io/amdresearch/${name}:latest" + local image="${IMAGE_REGISTRY}/${name}:latest" if pull_and_tag "${image}"; then docker save "${image}" -o "${staging}/images/custom/${name}.tar" echo " Saved: ${name}" else - echo " ERROR: 
Failed to pull ${image}" >&2 + failed=$((failed + 1)) fi done + + if [[ "${failed}" -gt 0 ]]; then + echo "Error: ${failed} custom image(s) failed to pull. Bundle would be incomplete." >&2 + echo " Check that IMAGE_REGISTRY (${IMAGE_REGISTRY}) is correct and you have pull access." >&2 + rm -rf "${staging}" + exit 1 + fi } # Save custom images: build locally via Makefile, then docker save @@ -938,14 +952,14 @@ function pack_save_custom_images_local() { for name in "${GPU_CUSTOM_NAMES[@]}"; do docker save \ - "ghcr.io/amdresearch/${name}:latest" \ - "ghcr.io/amdresearch/${name}:${tag}" \ + "${IMAGE_REGISTRY}/${name}:latest" \ + "${IMAGE_REGISTRY}/${name}:${tag}" \ -o "${staging}/images/custom/${name}.tar" echo " Saved: ${name} (:latest + :${tag})" done for name in "${PLAIN_CUSTOM_NAMES[@]}"; do - docker save "ghcr.io/amdresearch/${name}:latest" \ + docker save "${IMAGE_REGISTRY}/${name}:latest" \ -o "${staging}/images/custom/${name}.tar" echo " Saved: ${name}" done @@ -983,8 +997,10 @@ function pack_save_external_images() { done if [[ ${#failed_images[@]} -gt 0 ]]; then - echo " WARNING: Failed to pull ${#failed_images[@]} images:" - for img in "${failed_images[@]}"; do echo " - ${img}"; done + echo "Error: ${#failed_images[@]} external image(s) failed to pull:" >&2 + for img in "${failed_images[@]}"; do echo " - ${img}" >&2; done + rm -rf "${staging}" + exit 1 fi } @@ -997,6 +1013,7 @@ function pack_write_manifest() { "gpu_target": "${GPU_TARGET}", "accel_key": "${ACCEL_KEY}", "accel_env": "${ACCEL_ENV}", + "image_registry": "${IMAGE_REGISTRY}", "k3s_version": "${K3S_VERSION}", "helm_version": "${HELM_VERSION}", "k9s_version": "${K9S_VERSION}" @@ -1120,6 +1137,10 @@ Options (can also be set via environment variables): ./auplc-installer img build base-rocm --gpu=strix ./auplc-installer install --mirror=mirror.example.com +Image Registry: + IMAGE_REGISTRY Registry prefix for custom images (default: ghcr.io/amdresearch) + Override when pulling from a fork or private 
registry. + Offline Deployment: 1. On a machine with internet access, create bundle: ./auplc-installer pack --gpu=strix-halo # pull from GHCR From 5dd55b951725e3bf9328832ca13ce5f1610c7976 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:12:16 +0800 Subject: [PATCH 03/25] feat(installer): add IMAGE_TAG env var for configurable image tag prefix Support pulling images with non-default tag prefixes (e.g. develop-gfx1151 instead of latest-gfx1151). The IMAGE_TAG is stored in the bundle manifest and restored on offline install. Default remains "latest". --- .github/workflows/pack-bundle.yml | 6 ++++++ auplc-installer | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 2cbd2e6..f4d2c2b 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -44,6 +44,11 @@ on: required: false default: 'ghcr.io/amdresearch' type: string + image_tag: + description: 'Image tag prefix (e.g. latest, develop, v1.0)' + required: false + default: 'latest' + type: string create_release: description: 'Create a GitHub Release with the bundle' required: false @@ -87,6 +92,7 @@ jobs: run: | GPU_TYPE="${{ inputs.gpu_type }}" \ IMAGE_REGISTRY="${{ inputs.image_registry }}" \ + IMAGE_TAG="${{ inputs.image_tag }}" \ ./auplc-installer pack - name: Verify bundle diff --git a/auplc-installer b/auplc-installer index a955e5f..cf9dc0b 100755 --- a/auplc-installer +++ b/auplc-installer @@ -45,6 +45,9 @@ MIRROR_NPM="${MIRROR_NPM:-}" # Registry prefix for custom images (override for forks or private registries) IMAGE_REGISTRY="${IMAGE_REGISTRY:-ghcr.io/amdresearch}" +# Image tag prefix (e.g. latest, develop, v1.0). GPU suffix is appended automatically. 
+IMAGE_TAG="${IMAGE_TAG:-latest}" + # Custom images (built locally or pulled from GHCR) CUSTOM_IMAGES=( "${IMAGE_REGISTRY}/auplc-hub:latest" @@ -109,17 +112,19 @@ function detect_offline_bundle() { echo "Offline bundle detected at: ${BUNDLE_DIR}" # Parse config from manifest without python - local gpu_target accel_key accel_env image_registry + local gpu_target accel_key accel_env image_registry image_tag gpu_target=$(sed -n 's/.*"gpu_target"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") accel_key=$(sed -n 's/.*"accel_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") accel_env=$(sed -n 's/.*"accel_env"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") image_registry=$(sed -n 's/.*"image_registry"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + image_tag=$(sed -n 's/.*"image_tag"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") if [[ -n "${gpu_target}" ]]; then GPU_TARGET="${gpu_target}" ACCEL_KEY="${accel_key}" ACCEL_ENV="${accel_env}" [[ -n "${image_registry}" ]] && IMAGE_REGISTRY="${image_registry}" + [[ -n "${image_tag}" ]] && IMAGE_TAG="${image_tag}" echo " GPU config: accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" fi } @@ -231,7 +236,7 @@ function generate_values_overlay() { fi echo "Generating values overlay: ${overlay_path}" - local tag="latest-${GPU_TARGET}" + local tag="${IMAGE_TAG}-${GPU_TARGET}" { echo "# Auto-generated by auplc-installer (GPU: ${ACCEL_KEY}, target: ${GPU_TARGET})" @@ -579,7 +584,7 @@ function pull_custom_images() { fi detect_and_configure_gpu - local tag="latest-${GPU_TARGET}" + local tag="${IMAGE_TAG}-${GPU_TARGET}" echo "===========================================" echo "Pulling pre-built custom images from GHCR..." 
@@ -609,9 +614,9 @@ function pull_custom_images() { fi done - # Non-GPU images: pull :latest + # Non-GPU images: pull : for name in "${PLAIN_CUSTOM_NAMES[@]}"; do - local image="${IMAGE_REGISTRY}/${name}:latest" + local image="${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" if pull_and_tag "${image}"; then if [[ "${K3S_USE_DOCKER}" != "1" ]]; then sudo docker save "${image}" -o "${K3S_IMAGES_DIR}/${name}.tar" @@ -893,7 +898,7 @@ function pack_copy_chart() { # Save custom images: pull from GHCR, then docker save function pack_save_custom_images_pull() { local staging="$1" - local tag="latest-${GPU_TARGET}" + local tag="${IMAGE_TAG}-${GPU_TARGET}" echo "--- Pulling and saving custom images (${IMAGE_REGISTRY}) ---" mkdir -p "${staging}/images/custom" @@ -915,7 +920,7 @@ function pack_save_custom_images_pull() { done for name in "${PLAIN_CUSTOM_NAMES[@]}"; do - local image="${IMAGE_REGISTRY}/${name}:latest" + local image="${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" if pull_and_tag "${image}"; then docker save "${image}" -o "${staging}/images/custom/${name}.tar" echo " Saved: ${name}" @@ -1014,6 +1019,7 @@ function pack_write_manifest() { "accel_key": "${ACCEL_KEY}", "accel_env": "${ACCEL_ENV}", "image_registry": "${IMAGE_REGISTRY}", + "image_tag": "${IMAGE_TAG}", "k3s_version": "${K3S_VERSION}", "helm_version": "${HELM_VERSION}", "k9s_version": "${K9S_VERSION}" @@ -1140,6 +1146,8 @@ Options (can also be set via environment variables): Image Registry: IMAGE_REGISTRY Registry prefix for custom images (default: ghcr.io/amdresearch) Override when pulling from a fork or private registry. + IMAGE_TAG Image tag prefix (default: latest). GPU suffix appended automatically. + Use "develop" for images built from the develop branch. Offline Deployment: 1. 
On a machine with internet access, create bundle: From f3dc90426c42b19d2504ef60ea44e30b6cac9a6f Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:04:10 +0800 Subject: [PATCH 04/25] fix(installer): fix glob quoting in load_offline_images and move registry/tag restore out of gpu_target guard --- auplc-installer | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/auplc-installer b/auplc-installer index cf9dc0b..8cc6d41 100755 --- a/auplc-installer +++ b/auplc-installer @@ -119,12 +119,13 @@ function detect_offline_bundle() { image_registry=$(sed -n 's/.*"image_registry"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") image_tag=$(sed -n 's/.*"image_tag"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + [[ -n "${image_registry}" ]] && IMAGE_REGISTRY="${image_registry}" + [[ -n "${image_tag}" ]] && IMAGE_TAG="${image_tag}" + if [[ -n "${gpu_target}" ]]; then GPU_TARGET="${gpu_target}" ACCEL_KEY="${accel_key}" ACCEL_ENV="${accel_env}" - [[ -n "${image_registry}" ]] && IMAGE_REGISTRY="${image_registry}" - [[ -n "${image_tag}" ]] && IMAGE_TAG="${image_tag}" echo " GPU config: accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" fi } @@ -734,7 +735,7 @@ function load_offline_images() { local loaded=0 failed=0 - for tar_file in "${BUNDLE_DIR}/images/"custom/*.tar "${BUNDLE_DIR}/images/"external/*.tar; do + for tar_file in "${BUNDLE_DIR}/images/custom"/*.tar "${BUNDLE_DIR}/images/external"/*.tar; do [[ -f "${tar_file}" ]] || continue echo " Importing: $(basename "${tar_file}")" if sudo k3s ctr images import "${tar_file}" 2>/dev/null; then From a2c1c0287a1ecfc7942e7cdd7ae187daadc348b6 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:09:22 +0800 Subject: [PATCH 05/25] fix(installer): remove traefik from EXTERNAL_IMAGES, already included in K3s 
airgap bundle --- auplc-installer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auplc-installer b/auplc-installer index 8cc6d41..bde545f 100755 --- a/auplc-installer +++ b/auplc-installer @@ -73,7 +73,7 @@ EXTERNAL_IMAGES=( "quay.io/jupyterhub/k8s-singleuser-sample:4.1.0" "registry.k8s.io/kube-scheduler:v1.30.8" "registry.k8s.io/pause:3.10" - "traefik:v3.3.1" + # traefik is already included in the K3s airgap images bundle "curlimages/curl:8.5.0" "alpine/git:2.47.2" ) From 3e11295bba70e6193833fa47fca0ec5715763189 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:18:51 +0800 Subject: [PATCH 06/25] fix(installer): load offline images before deploying GPU device plugin --- auplc-installer | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auplc-installer b/auplc-installer index bde545f..2918722 100755 --- a/auplc-installer +++ b/auplc-installer @@ -812,7 +812,6 @@ function deploy_all_components() { generate_values_overlay install_tools install_k3s_single_node - deploy_rocm_gpu_device_plugin if [[ "${OFFLINE_MODE}" == "1" ]]; then load_offline_images @@ -824,6 +823,7 @@ function deploy_all_components() { local_image_build fi + deploy_rocm_gpu_device_plugin deply_aup_learning_cloud_runtime } From 7895da60de1326b80febf40f0fcebaf5283e8d88 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:37:53 +0800 Subject: [PATCH 07/25] fix(offline): move hub image override after custom block in values overlay hub.image was incorrectly nested inside custom.resources.images block, causing metadata to be misinterpreted as hub.image property and triggering Helm schema validation failure. 
--- auplc-installer | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/auplc-installer b/auplc-installer index 2918722..b99f0b9 100755 --- a/auplc-installer +++ b/auplc-installer @@ -264,6 +264,13 @@ function generate_values_overlay() { echo " acceleratorKeys:" echo " - ${ACCEL_KEY}" done + if [[ "${OFFLINE_MODE}" == "1" ]]; then + echo "hub:" + echo " image:" + echo " name: \"${IMAGE_REGISTRY}/auplc-hub\"" + echo " tag: \"${IMAGE_TAG}\"" + echo " pullPolicy: IfNotPresent" + fi } > "${overlay_path}" } @@ -476,6 +483,9 @@ function deploy_rocm_gpu_device_plugin() { if [[ "${OFFLINE_MODE}" == "1" ]]; then kubectl create -f "${BUNDLE_DIR}/manifests/k8s-ds-amdgpu-dp.yaml" + # Patch imagePullPolicy to avoid pulling from registry in air-gapped environments + kubectl patch ds amdgpu-device-plugin-daemonset -n kube-system --type=json \ + -p '[{"op":"replace","path":"/spec/template/spec/containers/0/imagePullPolicy","value":"IfNotPresent"}]' else kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml fi From 96e08e61f723047cd6c9be1581febb20a97b8ab1 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:53:06 +0800 Subject: [PATCH 08/25] fix(pack): save plain images with both :latest and :${IMAGE_TAG} tags for consistency Both pull and local-build modes now save hub/default images with :latest and :${IMAGE_TAG} tags, matching GPU image behavior. This ensures values.local.yaml references always resolve regardless of which IMAGE_TAG was used during pack. 
--- auplc-installer | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/auplc-installer b/auplc-installer index b99f0b9..8e6ff33 100755 --- a/auplc-installer +++ b/auplc-installer @@ -933,8 +933,12 @@ function pack_save_custom_images_pull() { for name in "${PLAIN_CUSTOM_NAMES[@]}"; do local image="${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" if pull_and_tag "${image}"; then - docker save "${image}" -o "${staging}/images/custom/${name}.tar" - echo " Saved: ${name}" + docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" + docker save \ + "${IMAGE_REGISTRY}/${name}:latest" \ + "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" \ + -o "${staging}/images/custom/${name}.tar" + echo " Saved: ${name} (:latest + :${IMAGE_TAG})" else failed=$((failed + 1)) fi @@ -975,9 +979,12 @@ function pack_save_custom_images_local() { done for name in "${PLAIN_CUSTOM_NAMES[@]}"; do - docker save "${IMAGE_REGISTRY}/${name}:latest" \ + docker tag "${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" + docker save \ + "${IMAGE_REGISTRY}/${name}:latest" \ + "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" \ -o "${staging}/images/custom/${name}.tar" - echo " Saved: ${name}" + echo " Saved: ${name} (:latest + :${IMAGE_TAG})" done } From 0e40738431f3ae047102cf8d0fdda74846cb8016 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:55:40 +0800 Subject: [PATCH 09/25] fix(offline): fail fast when image import fails in load_offline_images Silent warning on import failure could leave the cluster with missing images that cause pod failures at runtime. Now exits immediately so the user sees a clear error instead of a mysteriously broken install. 
--- auplc-installer | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/auplc-installer b/auplc-installer index 8e6ff33..402e51e 100755 --- a/auplc-installer +++ b/auplc-installer @@ -758,7 +758,10 @@ function load_offline_images() { echo "===========================================" echo "Loaded ${loaded} images, ${failed} failed" - [[ "${failed}" -gt 0 ]] && echo "Warning: Some images failed to load." + if [[ "${failed}" -gt 0 ]]; then + echo "Error: ${failed} image(s) failed to import. Bundle may be corrupted." >&2 + exit 1 + fi echo "===========================================" } From 34584d619b4f4829797be76b88243720e5a2f282 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:00:32 +0800 Subject: [PATCH 10/25] refactor(installer): improve code clarity and consistency - Remove redundant CUSTOM_IMAGES/IMAGES arrays; GPU_CUSTOM_NAMES and PLAIN_CUSTOM_NAMES are the single source of truth for image lists - Fix typo: deply_aup_learning_cloud_runtime -> deploy_aup_learning_cloud_runtime - Remove duplicate generate_values_overlay call in deploy function (orchestration now handled exclusively by callers) - Remove unused check_root function; inline root check at entry points of deploy_all_components and pack_bundle - Add missing section headers for Runtime Management group - rt install/reinstall and legacy install-runtime now correctly call detect/get_paths/generate_overlay before deploy --- auplc-installer | 65 ++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/auplc-installer b/auplc-installer index 402e51e..6a96111 100755 --- a/auplc-installer +++ b/auplc-installer @@ -48,19 +48,10 @@ IMAGE_REGISTRY="${IMAGE_REGISTRY:-ghcr.io/amdresearch}" # Image tag prefix (e.g. latest, develop, v1.0). GPU suffix is appended automatically. 
IMAGE_TAG="${IMAGE_TAG:-latest}" -# Custom images (built locally or pulled from GHCR) -CUSTOM_IMAGES=( - "${IMAGE_REGISTRY}/auplc-hub:latest" - "${IMAGE_REGISTRY}/auplc-default:latest" - "${IMAGE_REGISTRY}/auplc-cv:latest" - "${IMAGE_REGISTRY}/auplc-dl:latest" - "${IMAGE_REGISTRY}/auplc-llm:latest" -) - -# GPU-specific custom images (have :latest- tags) +# GPU-specific custom images (tagged as :-) GPU_CUSTOM_NAMES=("auplc-base" "auplc-cv" "auplc-dl" "auplc-llm" "auplc-physim") -# Non-GPU custom images (only :latest tag) +# Non-GPU custom images (tagged as :) PLAIN_CUSTOM_NAMES=("auplc-hub" "auplc-default") # External images required by JupyterHub at runtime @@ -85,9 +76,6 @@ BUILD_ONLY_IMAGES=( "quay.io/jupyter/base-notebook" ) -# Combined list for backward compatibility (img pull still pulls everything) -IMAGES=("${CUSTOM_IMAGES[@]}") - # GPU configuration globals (set by detect_and_configure_gpu) ACCEL_KEY="" GPU_TARGET="" @@ -278,13 +266,6 @@ function generate_values_overlay() { # Tool Installation (Helm, K9s) # ============================================================ -function check_root() { - if [[ $EUID -ne 0 ]]; then - echo "Error: This script must be run as root." >&2 - exit 1 - fi -} - function install_tools() { echo "Checking/Installing tools (may require sudo)..." 
@@ -575,7 +556,6 @@ function local_image_build() { GPU_TARGET="${GPU_TARGET}" \ SAVE_IMAGES="${save_images_for_make}" \ K3S_IMAGES_DIR="${images_dir_for_make}" \ - IMAGES="${IMAGES[*]}" \ MIRROR_PREFIX="${MIRROR_PREFIX}" \ MIRROR_PIP="${MIRROR_PIP}" \ MIRROR_NPM="${MIRROR_NPM}" \ @@ -769,6 +749,10 @@ function load_offline_images() { # Runtime Management # ============================================================ +# ============================================================ +# Runtime Management +# ============================================================ + # Resolve chart/values paths (bundle or local repo) function get_runtime_paths() { if [[ "${OFFLINE_MODE}" == "1" ]]; then @@ -782,11 +766,7 @@ function get_runtime_paths() { fi } -function deply_aup_learning_cloud_runtime() { - detect_and_configure_gpu - get_runtime_paths - generate_values_overlay - +function deploy_aup_learning_cloud_runtime() { echo "Deploying AUP Learning Cloud Runtime..." helm install jupyterhub "${CHART_PATH}" --namespace jupyterhub \ @@ -818,6 +798,11 @@ function remove_aup_learning_cloud_runtime() { # ============================================================ function deploy_all_components() { + if [[ $EUID -ne 0 ]]; then + echo "Error: This script must be run as root." >&2 + exit 1 + fi + local flag="${1:-}" detect_and_configure_gpu @@ -837,7 +822,7 @@ function deploy_all_components() { fi deploy_rocm_gpu_device_plugin - deply_aup_learning_cloud_runtime + deploy_aup_learning_cloud_runtime } function remove_all_components() { @@ -1049,6 +1034,11 @@ EOF } function pack_bundle() { + if [[ $EUID -ne 0 ]]; then + echo "Error: This script must be run as root." 
>&2 + exit 1 + fi + local flag="${1:-}" echo "===========================================" @@ -1221,13 +1211,21 @@ case "$1" in detect-gpu) detect_and_configure_gpu ;; rt) case "${2:-}" in - install) deply_aup_learning_cloud_runtime ;; + install) + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + deploy_aup_learning_cloud_runtime + ;; upgrade) upgrade_aup_learning_cloud_runtime ;; remove) remove_aup_learning_cloud_runtime ;; reinstall) remove_aup_learning_cloud_runtime || true sleep 0.5 - deply_aup_learning_cloud_runtime + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + deploy_aup_learning_cloud_runtime ;; *) echo "Usage: $0 rt {install|upgrade|remove|reinstall}"; exit 1 ;; esac @@ -1243,7 +1241,12 @@ case "$1" in esac ;; # Legacy long form (still supported) - install-runtime) deply_aup_learning_cloud_runtime ;; + install-runtime) + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + deploy_aup_learning_cloud_runtime + ;; remove-runtime) remove_aup_learning_cloud_runtime ;; upgrade-runtime) upgrade_aup_learning_cloud_runtime ;; build-images) local_image_build ;; From 18ca4ff4fa4fbffd7d5158334cf0ade0953bb221 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:06:23 +0800 Subject: [PATCH 11/25] fix(ci): simplify pack-bundle workflow inputs and fix pack root check - Merge gpu_target + gpu_type into single gpu_type choice; installer derives GPU_TARGET internally via resolve_gpu_config - Add rdna4 option (gfx120x) to match upstream installer support - image_tag now defaults to current branch name (github.ref_name) so develop branch packs use 'develop' tag automatically - Use env: block instead of inline var prefix for cleaner CI syntax - Remove root check from pack_bundle; pack only needs docker/wget, not root access (install still requires root) --- .github/workflows/pack-bundle.yml | 51 +++++++++++++------------------ auplc-installer | 5 --- 
2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index f4d2c2b..d1afad3 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -22,32 +22,25 @@ name: Pack Offline Bundle on: workflow_dispatch: inputs: - gpu_target: - description: 'GPU target for the bundle' - required: true - default: 'gfx1151' - type: choice - options: - - gfx110x - - gfx1151 gpu_type: - description: 'GPU type (accelerator name)' + description: 'GPU type (determines target architecture and HSA config)' required: true default: 'strix-halo' type: choice options: - - strix-halo - - strix - - phx - image_registry: - description: 'Registry prefix for custom images' + - strix-halo # gfx1151 — Ryzen AI Max+ 395 / Max 390 + - phx # gfx110x — Ryzen AI 300 (Phoenix) + - strix # gfx110x + HSA override — Ryzen AI 300 (Strix Point) + - rdna4 # gfx120x — Radeon RX 9000 series + image_tag: + description: 'Image tag prefix (default: current branch name, e.g. develop, latest, v1.0)' required: false - default: 'ghcr.io/amdresearch' + default: '' type: string - image_tag: - description: 'Image tag prefix (e.g. 
latest, develop, v1.0)' + image_registry: + description: 'Registry prefix for custom images (override for forks or private registries)' required: false - default: 'latest' + default: 'ghcr.io/amdresearch' type: string create_release: description: 'Create a GitHub Release with the bundle' @@ -61,7 +54,7 @@ permissions: jobs: pack: - name: "Pack Bundle (${{ inputs.gpu_target }})" + name: "Pack Bundle (${{ inputs.gpu_type }})" runs-on: ubuntu-latest steps: - name: Free disk space @@ -89,11 +82,11 @@ jobs: password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} - name: Run pack command - run: | - GPU_TYPE="${{ inputs.gpu_type }}" \ - IMAGE_REGISTRY="${{ inputs.image_registry }}" \ - IMAGE_TAG="${{ inputs.image_tag }}" \ - ./auplc-installer pack + env: + GPU_TYPE: ${{ inputs.gpu_type }} + IMAGE_REGISTRY: ${{ inputs.image_registry }} + IMAGE_TAG: ${{ inputs.image_tag || github.ref_name }} + run: ./auplc-installer pack - name: Verify bundle run: | @@ -109,7 +102,7 @@ jobs: - name: Upload bundle as artifact uses: actions/upload-artifact@v4 with: - name: auplc-bundle-${{ inputs.gpu_target }} + name: auplc-bundle-${{ inputs.gpu_type }} path: auplc-bundle-*.tar.gz retention-days: 7 compression-level: 0 # already compressed @@ -121,17 +114,17 @@ jobs: run: | BUNDLE=$(ls auplc-bundle-*.tar.gz) BUNDLE_NAME=$(basename "${BUNDLE}" .tar.gz) - TAG="bundle-${{ inputs.gpu_target }}-$(date +%Y%m%d)" + TAG="bundle-${{ inputs.gpu_type }}-$(date +%Y%m%d)" gh release create "${TAG}" "${BUNDLE}" \ - --title "Offline Bundle: ${{ inputs.gpu_target }} ($(date +%Y-%m-%d))" \ + --title "Offline Bundle: ${{ inputs.gpu_type }} ($(date +%Y-%m-%d))" \ --notes "$(cat <&2 - exit 1 - fi - local flag="${1:-}" echo "===========================================" From 993c88dc132793865cde45165a96bbdac4ad7772 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:11:56 +0800 Subject: [PATCH 12/25] fix(ci): sanitize branch name for Docker 
image tag github.ref_name for feature branches contains '/' (e.g. feature/offline-pack) which is invalid in Docker tags. Replace '/' with '-' when using branch name as default IMAGE_TAG. --- .github/workflows/pack-bundle.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index d1afad3..2a64de4 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -81,11 +81,22 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} + - name: Resolve image tag + id: tag + run: | + # Use explicit input if provided; otherwise sanitize branch name + # (Docker tags cannot contain '/', replace with '-') + if [[ -n "${{ inputs.image_tag }}" ]]; then + echo "value=${{ inputs.image_tag }}" >> "$GITHUB_OUTPUT" + else + echo "value=$(echo '${{ github.ref_name }}' | tr '/' '-')" >> "$GITHUB_OUTPUT" + fi + - name: Run pack command env: GPU_TYPE: ${{ inputs.gpu_type }} IMAGE_REGISTRY: ${{ inputs.image_registry }} - IMAGE_TAG: ${{ inputs.image_tag || github.ref_name }} + IMAGE_TAG: ${{ steps.tag.outputs.value }} run: ./auplc-installer pack - name: Verify bundle @@ -122,7 +133,7 @@ jobs: ## Offline Deployment Bundle - **GPU Type**: ${{ inputs.gpu_type }} - - **Image Tag**: ${{ inputs.image_tag || github.ref_name }} + - **Image Tag**: ${{ steps.tag.outputs.value }} - **Bundle**: ${BUNDLE_NAME} - **Built from**: ${{ github.sha }} (${{ github.ref_name }}) From ad57e6bbcf65d4733b31238bf437c4a254cd8eb3 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:33:32 +0800 Subject: [PATCH 13/25] fix(ci/pack): silently sanitize IMAGE_TAG by replacing '/' with '-' Branch names like 'feature/offline-pack' are invalid Docker tags. Both the workflow and pack_bundle now auto-replace '/' with '-' so no manual sanitization is needed by the caller. 
--- .github/workflows/pack-bundle.yml | 12 +++++------- auplc-installer | 3 +++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 2a64de4..b63da29 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -84,13 +84,11 @@ jobs: - name: Resolve image tag id: tag run: | - # Use explicit input if provided; otherwise sanitize branch name - # (Docker tags cannot contain '/', replace with '-') - if [[ -n "${{ inputs.image_tag }}" ]]; then - echo "value=${{ inputs.image_tag }}" >> "$GITHUB_OUTPUT" - else - echo "value=$(echo '${{ github.ref_name }}' | tr '/' '-')" >> "$GITHUB_OUTPUT" - fi + # Use explicit input if provided; otherwise derive from branch name. + # Sanitize: Docker tags cannot contain '/' — replace with '-'. + RAW="${{ inputs.image_tag || github.ref_name }}" + echo "value=${RAW//\//-}" >> "$GITHUB_OUTPUT" + echo "Resolved IMAGE_TAG: ${RAW//\//-}" - name: Run pack command env: diff --git a/auplc-installer b/auplc-installer index 30f37f5..f8ff7cd 100755 --- a/auplc-installer +++ b/auplc-installer @@ -1036,6 +1036,9 @@ EOF function pack_bundle() { local flag="${1:-}" + # Sanitize IMAGE_TAG: Docker tags cannot contain '/' (e.g. 
branch names) + IMAGE_TAG="${IMAGE_TAG//\//-}" + echo "===========================================" echo "AUP Learning Cloud - Pack Offline Bundle" if [[ "${flag}" == "--local" ]]; then From 86d3f862ec8680edac6476c57bc2bb732fedbfd3 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:05:40 +0800 Subject: [PATCH 14/25] ci(pack): auto-pack on release tags, matrix all GPU types - Add workflow_run trigger: fires after 'Build Docker Images' completes, ensuring all images (hub, base, courses) are built before packing starts - pack-release job: matrix over all 4 GPU types, only runs on v* tags pushed to AMDResearch/aup-learning-cloud (main repo guard) - pack-release attaches bundles to the existing GitHub Release - pack-manual job: unchanged workflow_dispatch flow for manual testing - Fix tar SIGPIPE false error in verify step (2>/dev/null) --- .github/workflows/pack-bundle.yml | 112 ++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index b63da29..bb0829e 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -20,6 +20,13 @@ name: Pack Offline Bundle on: + # Automatic: fires after all images are built on a release tag push. + # The job condition below filters to v* tags on the main repo only. + workflow_run: + workflows: ["Build Docker Images"] + types: [completed] + + # Manual: for testing or on-demand bundle creation. workflow_dispatch: inputs: gpu_type: @@ -33,7 +40,7 @@ on: - strix # gfx110x + HSA override — Ryzen AI 300 (Strix Point) - rdna4 # gfx120x — Radeon RX 9000 series image_tag: - description: 'Image tag prefix (default: current branch name, e.g. 
develop, latest, v1.0)' + description: 'Image tag prefix (default: current branch/tag name)' required: false default: '' type: string @@ -53,9 +60,104 @@ permissions: packages: read jobs: - pack: + # ── Automatic release: one job per GPU target, triggered by workflow_run ── + pack-release: + name: "Pack Bundle (${{ matrix.gpu_type }}) — Release" + if: | + github.event_name == 'workflow_run' && + github.event.workflow_run.conclusion == 'success' && + github.repository == 'AMDResearch/aup-learning-cloud' && + startsWith(github.event.workflow_run.head_branch, 'v') + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + gpu_type: [strix-halo, phx, strix, rdna4] + + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: false + + - name: Check available disk space + run: df -h / + + - name: Checkout code at the release tag + uses: actions/checkout@v4 + with: + ref: ${{ github.event.workflow_run.head_sha }} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} + + - name: Resolve image tag + id: tag + run: | + # For a release, use the tag name (e.g. v1.0); Docker tags cannot contain '/'. 
+ RAW="${{ github.event.workflow_run.head_branch }}" + SANITIZED="${RAW//\//-}" + echo "value=${SANITIZED}" >> "$GITHUB_OUTPUT" + echo "Resolved IMAGE_TAG: ${SANITIZED}" + + - name: Run pack command + env: + GPU_TYPE: ${{ matrix.gpu_type }} + IMAGE_REGISTRY: ghcr.io/amdresearch + IMAGE_TAG: ${{ steps.tag.outputs.value }} + run: ./auplc-installer pack + + - name: Verify bundle + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + echo "Bundle: ${BUNDLE}" + echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" + tar tzf "${BUNDLE}" 2>/dev/null | head -30 + echo "---" + echo "Total files: $(tar tzf "${BUNDLE}" | wc -l)" + + - name: Upload bundle as artifact + uses: actions/upload-artifact@v4 + with: + name: auplc-bundle-${{ matrix.gpu_type }} + path: auplc-bundle-*.tar.gz + retention-days: 30 + compression-level: 0 # already compressed + + - name: Attach bundle to GitHub Release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + TAG="${{ github.event.workflow_run.head_branch }}" + + # Upload the bundle to the existing release created by the tag push. + # If the release doesn't exist yet, create it. + if gh release view "${TAG}" &>/dev/null; then + gh release upload "${TAG}" "${BUNDLE}" --clobber + else + gh release create "${TAG}" "${BUNDLE}" \ + --title "Release ${TAG}" \ + --notes "Offline deployment bundles for ${TAG}." + fi + echo "Bundle uploaded to release ${TAG}" + + # ── Manual: single GPU target via workflow_dispatch ── + pack-manual: name: "Pack Bundle (${{ inputs.gpu_type }})" + if: github.event_name == 'workflow_dispatch' runs-on: ubuntu-latest + steps: - name: Free disk space uses: jlumbroso/free-disk-space@main @@ -84,7 +186,7 @@ jobs: - name: Resolve image tag id: tag run: | - # Use explicit input if provided; otherwise derive from branch name. + # Use explicit input if provided; otherwise derive from branch/tag name. # Sanitize: Docker tags cannot contain '/' — replace with '-'. 
RAW="${{ inputs.image_tag || github.ref_name }}" echo "value=${RAW//\//-}" >> "$GITHUB_OUTPUT" @@ -102,9 +204,7 @@ jobs: BUNDLE=$(ls auplc-bundle-*.tar.gz) echo "Bundle: ${BUNDLE}" echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" - - # Extract and verify structure - tar tzf "${BUNDLE}" | head -30 + tar tzf "${BUNDLE}" 2>/dev/null | head -30 echo "---" echo "Total files: $(tar tzf "${BUNDLE}" | wc -l)" From fda559b1f9bf40e909397243dc0310c4da98136e Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:07:15 +0800 Subject: [PATCH 15/25] ci(pack): remove slow file count in verify step --- .github/workflows/pack-bundle.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index bb0829e..f249b35 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -123,8 +123,6 @@ jobs: echo "Bundle: ${BUNDLE}" echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" tar tzf "${BUNDLE}" 2>/dev/null | head -30 - echo "---" - echo "Total files: $(tar tzf "${BUNDLE}" | wc -l)" - name: Upload bundle as artifact uses: actions/upload-artifact@v4 @@ -205,8 +203,6 @@ jobs: echo "Bundle: ${BUNDLE}" echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" tar tzf "${BUNDLE}" 2>/dev/null | head -30 - echo "---" - echo "Total files: $(tar tzf "${BUNDLE}" | wc -l)" - name: Upload bundle as artifact uses: actions/upload-artifact@v4 From 7ef70b8c5f6df1e8f8b88b6fd97534d7a2b1203a Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:08:41 +0800 Subject: [PATCH 16/25] ci(pack): simplify verify step to filename and size only --- .github/workflows/pack-bundle.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index f249b35..2072c4c 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -122,8 +122,6 @@ 
jobs: BUNDLE=$(ls auplc-bundle-*.tar.gz) echo "Bundle: ${BUNDLE}" echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" - tar tzf "${BUNDLE}" 2>/dev/null | head -30 - - name: Upload bundle as artifact uses: actions/upload-artifact@v4 with: @@ -202,8 +200,6 @@ jobs: BUNDLE=$(ls auplc-bundle-*.tar.gz) echo "Bundle: ${BUNDLE}" echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" - tar tzf "${BUNDLE}" 2>/dev/null | head -30 - - name: Upload bundle as artifact uses: actions/upload-artifact@v4 with: From 7ec7d40ba1680f22df07bae91e9dc91dd884a8cc Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Fri, 6 Mar 2026 17:15:24 +0800 Subject: [PATCH 17/25] perf(pack): save all custom images into one tar to deduplicate shared layers Course images (cv/dl/llm/physim) all share auplc-base layers. Saving them separately caused those layers to be written N times. A single docker save call with all image refs deduplicates shared layers automatically, reducing bundle size significantly. --- auplc-installer | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/auplc-installer b/auplc-installer index f8ff7cd..03f69cb 100755 --- a/auplc-installer +++ b/auplc-installer @@ -895,6 +895,7 @@ function pack_copy_chart() { } # Save custom images: pull from GHCR, then docker save +# All images are saved into a single tar to deduplicate shared layers. 
function pack_save_custom_images_pull() { local staging="$1" local tag="${IMAGE_TAG}-${GPU_TARGET}" @@ -903,16 +904,14 @@ function pack_save_custom_images_pull() { mkdir -p "${staging}/images/custom" local failed=0 + local all_refs=() for name in "${GPU_CUSTOM_NAMES[@]}"; do local image="${IMAGE_REGISTRY}/${name}:${tag}" if pull_and_tag "${image}"; then docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" - docker save \ - "${IMAGE_REGISTRY}/${name}:latest" \ - "${IMAGE_REGISTRY}/${name}:${tag}" \ - -o "${staging}/images/custom/${name}.tar" - echo " Saved: ${name} (:latest + :${tag})" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${tag}") + echo " Pulled: ${name} (:latest + :${tag})" else failed=$((failed + 1)) fi @@ -922,11 +921,8 @@ function pack_save_custom_images_pull() { local image="${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" if pull_and_tag "${image}"; then docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" - docker save \ - "${IMAGE_REGISTRY}/${name}:latest" \ - "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" \ - -o "${staging}/images/custom/${name}.tar" - echo " Saved: ${name} (:latest + :${IMAGE_TAG})" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}") + echo " Pulled: ${name} (:latest + :${IMAGE_TAG})" else failed=$((failed + 1)) fi @@ -938,6 +934,10 @@ function pack_save_custom_images_pull() { rm -rf "${staging}" exit 1 fi + + echo " Saving all custom images (shared layers deduplicated)..." 
+ docker save "${all_refs[@]}" -o "${staging}/images/custom/auplc-custom.tar" + echo " Saved: ${staging}/images/custom/auplc-custom.tar" } # Save custom images: build locally via Makefile, then docker save @@ -956,24 +956,23 @@ function pack_save_custom_images_local() { MIRROR_NPM="${MIRROR_NPM}" \ all) - echo "--- Saving built images to bundle ---" + echo "--- Saving built images to bundle (shared layers deduplicated) ---" + + local all_refs=() for name in "${GPU_CUSTOM_NAMES[@]}"; do - docker save \ - "${IMAGE_REGISTRY}/${name}:latest" \ - "${IMAGE_REGISTRY}/${name}:${tag}" \ - -o "${staging}/images/custom/${name}.tar" - echo " Saved: ${name} (:latest + :${tag})" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${tag}") + echo " Queued: ${name} (:latest + :${tag})" done for name in "${PLAIN_CUSTOM_NAMES[@]}"; do docker tag "${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" - docker save \ - "${IMAGE_REGISTRY}/${name}:latest" \ - "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" \ - -o "${staging}/images/custom/${name}.tar" - echo " Saved: ${name} (:latest + :${IMAGE_TAG})" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}") + echo " Queued: ${name} (:latest + :${IMAGE_TAG})" done + + docker save "${all_refs[@]}" -o "${staging}/images/custom/auplc-custom.tar" + echo " Saved: ${staging}/images/custom/auplc-custom.tar" } # Save external images (always pulled from registries) From 832e749b587ce1170527c4b93550bb0a39370110 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 09:39:15 +0800 Subject: [PATCH 18/25] ci(pack): allow fork repo in release trigger condition --- .github/workflows/pack-bundle.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 2072c4c..4bb205e 100644 --- a/.github/workflows/pack-bundle.yml +++ 
b/.github/workflows/pack-bundle.yml @@ -66,8 +66,8 @@ jobs: if: | github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && - github.repository == 'AMDResearch/aup-learning-cloud' && - startsWith(github.event.workflow_run.head_branch, 'v') + startsWith(github.event.workflow_run.head_branch, 'v') && + (github.repository == 'AMDResearch/aup-learning-cloud' || github.repository == 'MioYuuIH/aup-learning-cloud') runs-on: ubuntu-latest strategy: fail-fast: false From 48d83c3313079eb3fb3a228375cc5efaeeb45f77 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 09:57:06 +0800 Subject: [PATCH 19/25] ci: add type=ref,event=tag to all metadata tag lists Ensures any v* tag (semver or not) gets pushed with the exact tag name. Previously non-semver tags (e.g. v0.1-test) would only get sha-based tags, causing course image builds to fail when looking for the base image by tag. Also removes main repo restriction from pack-release trigger condition. 
--- .github/workflows/docker-build.yml | 4 ++++ .github/workflows/pack-bundle.yml | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 62cdb76..e873c80 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -319,6 +319,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Docker metadata (unsuffixed tags — default target only) @@ -335,6 +336,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Merge tags @@ -455,6 +457,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Docker metadata (unsuffixed tags — default target only) @@ -471,6 +474,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Merge tags diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 4bb205e..e5c0411 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -66,8 +66,7 @@ jobs: if: | github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && - startsWith(github.event.workflow_run.head_branch, 'v') && - (github.repository == 'AMDResearch/aup-learning-cloud' || github.repository == 'MioYuuIH/aup-learning-cloud') + startsWith(github.event.workflow_run.head_branch, 'v') runs-on: ubuntu-latest strategy: fail-fast: false From 005985655c575caff049ec9066dcd7464123e6fa Mon Sep 17 00:00:00 2001 From: 
ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:50:22 +0800 Subject: [PATCH 20/25] ci(pack): derive IMAGE_REGISTRY from repository owner --- .github/workflows/pack-bundle.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index e5c0411..67b88f7 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -112,7 +112,7 @@ jobs: - name: Run pack command env: GPU_TYPE: ${{ matrix.gpu_type }} - IMAGE_REGISTRY: ghcr.io/amdresearch + IMAGE_REGISTRY: ghcr.io/${{ github.repository_owner }} IMAGE_TAG: ${{ steps.tag.outputs.value }} run: ./auplc-installer pack From 4afa48f2a58da07182e98038feaeb66d7a1fd296 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:15:57 +0800 Subject: [PATCH 21/25] ci(pack): lowercase repository owner for image registry --- .github/workflows/pack-bundle.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 67b88f7..fb7667b 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -100,19 +100,20 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} - - name: Resolve image tag + - name: Resolve image tag and registry id: tag run: | - # For a release, use the tag name (e.g. v1.0); Docker tags cannot contain '/'. 
RAW="${{ github.event.workflow_run.head_branch }}" SANITIZED="${RAW//\//-}" echo "value=${SANITIZED}" >> "$GITHUB_OUTPUT" echo "Resolved IMAGE_TAG: ${SANITIZED}" + OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + echo "registry=ghcr.io/${OWNER}" >> "$GITHUB_OUTPUT" - name: Run pack command env: GPU_TYPE: ${{ matrix.gpu_type }} - IMAGE_REGISTRY: ghcr.io/${{ github.repository_owner }} + IMAGE_REGISTRY: ${{ steps.tag.outputs.registry }} IMAGE_TAG: ${{ steps.tag.outputs.value }} run: ./auplc-installer pack From 9603fac0ca81c1cffa8af563c5fdee966ae9715e Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:06:32 +0800 Subject: [PATCH 22/25] ci(pack): continue-on-error for release asset upload --- .github/workflows/pack-bundle.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index fb7667b..2d5de39 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -131,6 +131,7 @@ jobs: compression-level: 0 # already compressed - name: Attach bundle to GitHub Release + continue-on-error: true env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | From d8fcd10e8c9fb9afc9420b584317ddfe42e5937d Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:08:50 +0800 Subject: [PATCH 23/25] ci(pack): skip release upload if no release exists, don't auto-create --- .github/workflows/pack-bundle.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 2d5de39..4d52192 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -138,16 +138,14 @@ jobs: BUNDLE=$(ls auplc-bundle-*.tar.gz) TAG="${{ github.event.workflow_run.head_branch }}" - # Upload the bundle to the existing release created by the tag push. 
- # If the release doesn't exist yet, create it. + # Upload to the existing release. Releases are created manually with + # proper release notes before tagging; CI only attaches the bundle. if gh release view "${TAG}" &>/dev/null; then gh release upload "${TAG}" "${BUNDLE}" --clobber + echo "Bundle uploaded to release ${TAG}" else - gh release create "${TAG}" "${BUNDLE}" \ - --title "Release ${TAG}" \ - --notes "Offline deployment bundles for ${TAG}." + echo "No release found for ${TAG}, skipping upload." fi - echo "Bundle uploaded to release ${TAG}" # ── Manual: single GPU target via workflow_dispatch ── pack-manual: From 8b4540fd1332f2039a32da917a0cecdd3f57d082 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:10:12 +0800 Subject: [PATCH 24/25] ci(pack): skip GPU if bundle already exists in release --- .github/workflows/pack-bundle.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index 4d52192..c30a626 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -110,7 +110,28 @@ jobs: OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') echo "registry=ghcr.io/${OWNER}" >> "$GITHUB_OUTPUT" + - name: Check if bundle already exists in release + id: check + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + TAG="${{ steps.tag.outputs.value }}" + GPU="${{ matrix.gpu_type }}" + if gh release view "${TAG}" &>/dev/null; then + # Check if a bundle for this GPU type is already attached + if gh release view "${TAG}" --json assets --jq '.assets[].name' 2>/dev/null \ + | grep -q "auplc-bundle.*${GPU}"; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "Bundle for ${GPU} already exists in release ${TAG}, skipping." 
+ else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + - name: Run pack command + if: steps.check.outputs.skip != 'true' env: GPU_TYPE: ${{ matrix.gpu_type }} IMAGE_REGISTRY: ${{ steps.tag.outputs.registry }} @@ -118,11 +139,14 @@ jobs: run: ./auplc-installer pack - name: Verify bundle + if: steps.check.outputs.skip != 'true' run: | BUNDLE=$(ls auplc-bundle-*.tar.gz) echo "Bundle: ${BUNDLE}" echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" + - name: Upload bundle as artifact + if: steps.check.outputs.skip != 'true' uses: actions/upload-artifact@v4 with: name: auplc-bundle-${{ matrix.gpu_type }} @@ -131,6 +155,7 @@ jobs: compression-level: 0 # already compressed - name: Attach bundle to GitHub Release + if: steps.check.outputs.skip != 'true' continue-on-error: true env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 2a0f2ba57b4c12de5aafa51a126b806e7232b8a9 Mon Sep 17 00:00:00 2001 From: ShifZhan <252984256+MioYuuIH@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:05:44 +0800 Subject: [PATCH 25/25] ci(pack): remove auto-create release option from manual job --- .github/workflows/pack-bundle.yml | 34 ------------------------------- 1 file changed, 34 deletions(-) diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml index c30a626..86a57f5 100644 --- a/.github/workflows/pack-bundle.yml +++ b/.github/workflows/pack-bundle.yml @@ -49,12 +49,6 @@ on: required: false default: 'ghcr.io/amdresearch' type: string - create_release: - description: 'Create a GitHub Release with the bundle' - required: false - default: false - type: boolean - permissions: contents: write packages: read @@ -232,31 +226,3 @@ jobs: retention-days: 7 compression-level: 0 # already compressed - - name: Create GitHub Release - if: inputs.create_release - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - BUNDLE=$(ls auplc-bundle-*.tar.gz) - BUNDLE_NAME=$(basename "${BUNDLE}" .tar.gz) - TAG="bundle-${{ inputs.gpu_type 
}}-$(date +%Y%m%d)" - - gh release create "${TAG}" "${BUNDLE}" \ - --title "Offline Bundle: ${{ inputs.gpu_type }} ($(date +%Y-%m-%d))" \ - --notes "$(cat <