diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 62cdb76..e873c80 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -319,6 +319,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Docker metadata (unsuffixed tags — default target only) @@ -335,6 +336,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Merge tags @@ -455,6 +457,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Docker metadata (unsuffixed tags — default target only) @@ -471,6 +474,7 @@ jobs: type=raw,value=${{ github.event.inputs.version }},enable=${{ github.event.inputs.version != '' }} type=sha,prefix=sha- type=ref,event=branch + type=ref,event=tag type=ref,event=pr - name: Merge tags diff --git a/.github/workflows/pack-bundle.yml b/.github/workflows/pack-bundle.yml new file mode 100644 index 0000000..86a57f5 --- /dev/null +++ b/.github/workflows/pack-bundle.yml @@ -0,0 +1,228 @@ +# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +name: Pack Offline Bundle + +on: + # Automatic: fires after all images are built on a release tag push. + # The job condition below filters to v* tags on the main repo only. + workflow_run: + workflows: ["Build Docker Images"] + types: [completed] + + # Manual: for testing or on-demand bundle creation. 
+ workflow_dispatch: + inputs: + gpu_type: + description: 'GPU type (determines target architecture and HSA config)' + required: true + default: 'strix-halo' + type: choice + options: + - strix-halo # gfx1151 — Ryzen AI Max+ 395 / Max 390 + - phx # gfx110x — Ryzen AI 300 (Phoenix) + - strix # gfx110x + HSA override — Ryzen AI 300 (Strix Point) + - rdna4 # gfx120x — Radeon RX 9000 series + image_tag: + description: 'Image tag prefix (default: current branch/tag name)' + required: false + default: '' + type: string + image_registry: + description: 'Registry prefix for custom images (override for forks or private registries)' + required: false + default: 'ghcr.io/amdresearch' + type: string +permissions: + contents: write + packages: read + +jobs: + # ── Automatic release: one job per GPU target, triggered by workflow_run ── + pack-release: + name: "Pack Bundle (${{ matrix.gpu_type }}) — Release" + if: | + github.event_name == 'workflow_run' && + github.event.workflow_run.conclusion == 'success' && + startsWith(github.event.workflow_run.head_branch, 'v') + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + gpu_type: [strix-halo, phx, strix, rdna4] + + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: false + + - name: Check available disk space + run: df -h / + + - name: Checkout code at the release tag + uses: actions/checkout@v4 + with: + ref: ${{ github.event.workflow_run.head_sha }} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} + + - name: Resolve image tag and registry + id: tag + run: | + RAW="${{ github.event.workflow_run.head_branch }}" + SANITIZED="${RAW//\//-}" + echo "value=${SANITIZED}" >> "$GITHUB_OUTPUT" + echo "Resolved 
IMAGE_TAG: ${SANITIZED}" + OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + echo "registry=ghcr.io/${OWNER}" >> "$GITHUB_OUTPUT" + + - name: Check if bundle already exists in release + id: check + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + TAG="${{ steps.tag.outputs.value }}" + GPU="${{ matrix.gpu_type }}" + if gh release view "${TAG}" &>/dev/null; then + # Check if a bundle for this GPU type is already attached + if gh release view "${TAG}" --json assets --jq '.assets[].name' 2>/dev/null \ + | grep -q "auplc-bundle.*${GPU}"; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "Bundle for ${GPU} already exists in release ${TAG}, skipping." + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + + - name: Run pack command + if: steps.check.outputs.skip != 'true' + env: + GPU_TYPE: ${{ matrix.gpu_type }} + IMAGE_REGISTRY: ${{ steps.tag.outputs.registry }} + IMAGE_TAG: ${{ steps.tag.outputs.value }} + run: ./auplc-installer pack + + - name: Verify bundle + if: steps.check.outputs.skip != 'true' + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + echo "Bundle: ${BUNDLE}" + echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" + + - name: Upload bundle as artifact + if: steps.check.outputs.skip != 'true' + uses: actions/upload-artifact@v4 + with: + name: auplc-bundle-${{ matrix.gpu_type }} + path: auplc-bundle-*.tar.gz + retention-days: 30 + compression-level: 0 # already compressed + + - name: Attach bundle to GitHub Release + if: steps.check.outputs.skip != 'true' + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + TAG="${{ github.event.workflow_run.head_branch }}" + + # Upload to the existing release. Releases are created manually with + # proper release notes before tagging; CI only attaches the bundle. 
+ if gh release view "${TAG}" &>/dev/null; then + gh release upload "${TAG}" "${BUNDLE}" --clobber + echo "Bundle uploaded to release ${TAG}" + else + echo "No release found for ${TAG}, skipping upload." + fi + + # ── Manual: single GPU target via workflow_dispatch ── + pack-manual: + name: "Pack Bundle (${{ inputs.gpu_type }})" + if: github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: false + + - name: Check available disk space + run: df -h / + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GH_PACKAGES_TOKEN || secrets.GITHUB_TOKEN }} + + - name: Resolve image tag + id: tag + run: | + # Use explicit input if provided; otherwise derive from branch/tag name. + # Sanitize: Docker tags cannot contain '/' — replace with '-'. 
+ RAW="${{ inputs.image_tag || github.ref_name }}" + echo "value=${RAW//\//-}" >> "$GITHUB_OUTPUT" + echo "Resolved IMAGE_TAG: ${RAW//\//-}" + + - name: Run pack command + env: + GPU_TYPE: ${{ inputs.gpu_type }} + IMAGE_REGISTRY: ${{ inputs.image_registry }} + IMAGE_TAG: ${{ steps.tag.outputs.value }} + run: ./auplc-installer pack + + - name: Verify bundle + run: | + BUNDLE=$(ls auplc-bundle-*.tar.gz) + echo "Bundle: ${BUNDLE}" + echo "Size: $(du -sh "${BUNDLE}" | cut -f1)" + - name: Upload bundle as artifact + uses: actions/upload-artifact@v4 + with: + name: auplc-bundle-${{ inputs.gpu_type }} + path: auplc-bundle-*.tar.gz + retention-days: 7 + compression-level: 0 # already compressed + diff --git a/.gitignore b/.gitignore index 477e4d8..121d4ff 100644 --- a/.gitignore +++ b/.gitignore @@ -372,3 +372,7 @@ dockerfiles/Courses/DL/data/FashionMNIST/raw/ # Local config overrides (any file containing 'local') *local* *.local.* + +# Offline bundle artifacts +auplc-bundle-*/ +auplc-bundle-*.tar.gz diff --git a/auplc-installer b/auplc-installer index 8a6caf7..03f69cb 100755 --- a/auplc-installer +++ b/auplc-installer @@ -22,64 +22,107 @@ set -euo pipefail -# k3s image dir (used only when not using Docker runtime) +# ============================================================ +# Constants & Configuration +# ============================================================ + +# Pinned tool versions (used by pack and offline install) +K3S_VERSION="v1.32.3+k3s1" +HELM_VERSION="v3.17.2" +K9S_VERSION="v0.32.7" + K3S_IMAGES_DIR="/var/lib/rancher/k3s/agent/images" K3S_REGISTRIES_FILE="/etc/rancher/k3s/registries.yaml" -# Default: use host Docker as K3s runtime so "docker build" updates are visible without -# exporting to agent/images. Set K3S_USE_DOCKER=0 for containerd + export (offline/portable). 
+# K3s container runtime: 1=Docker (dev), 0=containerd (offline/portable) K3S_USE_DOCKER="${K3S_USE_DOCKER:-1}" -# Registry mirror prefix (set via environment variable) -# Example: MIRROR_PREFIX="m.daocloud.io" will transform: -# quay.io/jupyterhub/k8s-hub:4.1.0 -> m.daocloud.io/quay.io/jupyterhub/k8s-hub:4.1.0 +# Registry/package mirror configuration MIRROR_PREFIX="${MIRROR_PREFIX:-}" - -# Package manager mirrors (set via environment variables) MIRROR_PIP="${MIRROR_PIP:-}" MIRROR_NPM="${MIRROR_NPM:-}" -# Custom images (built locally) -CUSTOM_IMAGES=( - "ghcr.io/amdresearch/auplc-hub:latest" - "ghcr.io/amdresearch/auplc-default:latest" - "ghcr.io/amdresearch/auplc-cv:latest" - "ghcr.io/amdresearch/auplc-dl:latest" - "ghcr.io/amdresearch/auplc-llm:latest" -) +# Registry prefix for custom images (override for forks or private registries) +IMAGE_REGISTRY="${IMAGE_REGISTRY:-ghcr.io/amdresearch}" + +# Image tag prefix (e.g. latest, develop, v1.0). GPU suffix is appended automatically. +IMAGE_TAG="${IMAGE_TAG:-latest}" -# External images required by JupyterHub (for offline deployment) +# GPU-specific custom images (tagged as :-) +GPU_CUSTOM_NAMES=("auplc-base" "auplc-cv" "auplc-dl" "auplc-llm" "auplc-physim") + +# Non-GPU custom images (tagged as :) +PLAIN_CUSTOM_NAMES=("auplc-hub" "auplc-default") + +# External images required by JupyterHub at runtime EXTERNAL_IMAGES=( - # JupyterHub core components "quay.io/jupyterhub/k8s-hub:4.1.0" "quay.io/jupyterhub/configurable-http-proxy:4.6.3" "quay.io/jupyterhub/k8s-secret-sync:4.1.0" "quay.io/jupyterhub/k8s-network-tools:4.1.0" "quay.io/jupyterhub/k8s-image-awaiter:4.1.0" "quay.io/jupyterhub/k8s-singleuser-sample:4.1.0" - # Kubernetes components "registry.k8s.io/kube-scheduler:v1.30.8" "registry.k8s.io/pause:3.10" - # Traefik proxy - "traefik:v3.3.1" - # Utility images + # traefik is already included in the K3s airgap images bundle "curlimages/curl:8.5.0" - # Base images for Docker build + "alpine/git:2.47.2" +) + +# Base 
images only needed for local Docker build, not for runtime or bundle +BUILD_ONLY_IMAGES=( "node:20-alpine" "ubuntu:24.04" "quay.io/jupyter/base-notebook" ) -# Combined list for backward compatibility -IMAGES=("${CUSTOM_IMAGES[@]}") - # GPU configuration globals (set by detect_and_configure_gpu) ACCEL_KEY="" GPU_TARGET="" ACCEL_ENV="" +# ============================================================ +# Offline Bundle Detection +# ============================================================ + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OFFLINE_MODE=0 +BUNDLE_DIR="" + +function detect_offline_bundle() { + if [[ ! -f "${SCRIPT_DIR}/manifest.json" ]]; then + return + fi + + BUNDLE_DIR="${SCRIPT_DIR}" + OFFLINE_MODE=1 + K3S_USE_DOCKER=0 + echo "Offline bundle detected at: ${BUNDLE_DIR}" + + # Parse config from manifest without python + local gpu_target accel_key accel_env image_registry image_tag + gpu_target=$(sed -n 's/.*"gpu_target"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + accel_key=$(sed -n 's/.*"accel_key"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + accel_env=$(sed -n 's/.*"accel_env"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + image_registry=$(sed -n 's/.*"image_registry"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + image_tag=$(sed -n 's/.*"image_tag"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "${BUNDLE_DIR}/manifest.json") + + [[ -n "${image_registry}" ]] && IMAGE_REGISTRY="${image_registry}" + [[ -n "${image_tag}" ]] && IMAGE_TAG="${image_tag}" + + if [[ -n "${gpu_target}" ]]; then + GPU_TARGET="${gpu_target}" + ACCEL_KEY="${accel_key}" + ACCEL_ENV="${accel_env}" + echo " GPU config: accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" + fi +} + +# ============================================================ +# GPU Detection & Configuration +# 
============================================================ + function detect_gpu() { - # Try rocminfo first (most readable output) if command -v rocminfo &>/dev/null; then local gfx gfx=$(rocminfo 2>/dev/null | grep -o 'gfx[0-9]*' | head -1) @@ -171,11 +214,18 @@ function detect_and_configure_gpu() { echo " accelerator=${ACCEL_KEY}, GPU_TARGET=${GPU_TARGET}${ACCEL_ENV:+, HSA_OVERRIDE=${ACCEL_ENV}}" } +# ============================================================ +# Values Overlay +# ============================================================ + function generate_values_overlay() { local overlay_path="runtime/values.local.yaml" + if [[ "${OFFLINE_MODE}" == "1" ]]; then + overlay_path="${BUNDLE_DIR}/config/values.local.yaml" + fi echo "Generating values overlay: ${overlay_path}" - local tag="latest-${GPU_TARGET}" + local tag="${IMAGE_TAG}-${GPU_TARGET}" { echo "# Auto-generated by auplc-installer (GPU: ${ACCEL_KEY}, target: ${GPU_TARGET})" @@ -191,33 +241,50 @@ function generate_values_overlay() { echo " resources:" echo " images:" - echo " gpu: \"ghcr.io/amdresearch/auplc-base:${tag}\"" - echo " Course-CV: \"ghcr.io/amdresearch/auplc-cv:${tag}\"" - echo " Course-DL: \"ghcr.io/amdresearch/auplc-dl:${tag}\"" - echo " Course-LLM: \"ghcr.io/amdresearch/auplc-llm:${tag}\"" - echo " Course-PhySim: \"ghcr.io/amdresearch/auplc-physim:${tag}\"" + echo " gpu: \"${IMAGE_REGISTRY}/auplc-base:${tag}\"" + echo " Course-CV: \"${IMAGE_REGISTRY}/auplc-cv:${tag}\"" + echo " Course-DL: \"${IMAGE_REGISTRY}/auplc-dl:${tag}\"" + echo " Course-LLM: \"${IMAGE_REGISTRY}/auplc-llm:${tag}\"" + echo " Course-PhySim: \"${IMAGE_REGISTRY}/auplc-physim:${tag}\"" echo " metadata:" for resource in gpu Course-CV Course-DL Course-LLM Course-PhySim; do echo " ${resource}:" echo " acceleratorKeys:" echo " - ${ACCEL_KEY}" done + if [[ "${OFFLINE_MODE}" == "1" ]]; then + echo "hub:" + echo " image:" + echo " name: \"${IMAGE_REGISTRY}/auplc-hub\"" + echo " tag: \"${IMAGE_TAG}\"" + echo " pullPolicy: 
IfNotPresent" + fi } > "${overlay_path}" } -function check_root() { - if [[ $EUID -ne 0 ]]; then - echo "Error: This script must be run as root." >&2 - exit 1 - fi -} +# ============================================================ +# Tool Installation (Helm, K9s) +# ============================================================ function install_tools() { echo "Checking/Installing tools (may require sudo)..." + if [[ "${OFFLINE_MODE}" == "1" ]]; then + if ! command -v helm &> /dev/null; then + echo "Installing Helm from bundle..." + sudo cp "${BUNDLE_DIR}/bin/helm" /usr/local/bin/helm + sudo chmod +x /usr/local/bin/helm + fi + if ! command -v k9s &> /dev/null; then + echo "Installing K9s from bundle..." + sudo dpkg -i "${BUNDLE_DIR}/bin/k9s_linux_amd64.deb" + fi + return + fi + if ! command -v helm &> /dev/null; then echo "Installing Helm..." - wget https://get.helm.sh/helm-v3.17.2-linux-amd64.tar.gz -O /tmp/helm-linux-amd64.tar.gz + wget https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz -O /tmp/helm-linux-amd64.tar.gz tar -zxvf /tmp/helm-linux-amd64.tar.gz -C /tmp sudo mv /tmp/linux-amd64/helm /usr/local/bin/helm rm /tmp/helm-linux-amd64.tar.gz @@ -226,16 +293,17 @@ function install_tools() { if ! command -v k9s &> /dev/null; then echo "Installing K9s..." - wget https://github.com/derailed/k9s/releases/latest/download/k9s_linux_amd64.deb -O /tmp/k9s_linux_amd64.deb + wget "https://github.com/derailed/k9s/releases/download/${K9S_VERSION}/k9s_linux_amd64.deb" -O /tmp/k9s_linux_amd64.deb sudo apt install /tmp/k9s_linux_amd64.deb -y rm /tmp/k9s_linux_amd64.deb fi } -function configure_registry_mirrors() { - # Configure K3s registry mirrors using MIRROR_PREFIX - # This must be done BEFORE k3s starts +# ============================================================ +# K3s Management +# ============================================================ +function configure_registry_mirrors() { if [[ -z "${MIRROR_PREFIX}" ]]; then echo "No registry mirror configured. 
Using default registries." return 0 @@ -244,7 +312,6 @@ function configure_registry_mirrors() { echo "Configuring registry mirrors with prefix: ${MIRROR_PREFIX}" sudo mkdir -p "$(dirname "${K3S_REGISTRIES_FILE}")" - # Configure mirrors for all registries using the prefix pattern local config="mirrors: docker.io: endpoint: @@ -263,15 +330,9 @@ function configure_registry_mirrors() { echo "Registry mirrors configured at ${K3S_REGISTRIES_FILE}" } -# Dummy interface IP for K3s node binding -# Using a private IP range that won't conflict with typical networks K3S_NODE_IP="10.255.255.1" function setup_dummy_interface() { - # Create a dummy network interface for offline/portable operation - # This provides a stable node IP that doesn't change when WiFi/network changes - # Reference: https://docs.k3s.io/installation/airgap - if ip link show dummy0 &>/dev/null; then echo "Dummy interface already exists, skipping setup" return 0 @@ -281,10 +342,8 @@ function setup_dummy_interface() { sudo ip link add dummy0 type dummy sudo ip link set dummy0 up sudo ip addr add "${K3S_NODE_IP}/32" dev dummy0 - # Add a low-priority default route so K3s can detect a valid route sudo ip route add default via "${K3S_NODE_IP}" dev dummy0 metric 1000 2>/dev/null || true - # Make persistent across reboots cat << EOF | sudo tee /etc/systemd/system/dummy-interface.service > /dev/null [Unit] Description=Setup dummy network interface for K3s portable operation @@ -308,30 +367,41 @@ EOF function install_k3s_single_node() { echo "Starting K3s installation..." - if [[ "${K3S_USE_DOCKER}" == "1" ]]; then - echo "Using Docker as container runtime (K3S_USE_DOCKER=1). Images stay in Docker; no export to agent/images." - if ! command -v docker &> /dev/null; then - echo "Error: K3S_USE_DOCKER is set but Docker is not installed. Install Docker first." 
>&2 - exit 1 - fi - fi - - # Setup dummy interface for offline operation setup_dummy_interface + local k3s_exec="--node-ip=${K3S_NODE_IP} --flannel-iface=dummy0" - # Configure registry mirrors before starting k3s - configure_registry_mirrors + if [[ "${OFFLINE_MODE}" == "1" ]]; then + echo "Offline mode: installing K3s from bundle (containerd)..." - # Build K3s server exec flags (--docker = use host Docker so image updates are visible in dev) - local k3s_exec="--node-ip=${K3S_NODE_IP} --flannel-iface=dummy0" - if [[ "${K3S_USE_DOCKER}" == "1" ]]; then - k3s_exec="${k3s_exec} --docker" - fi + sudo cp "${BUNDLE_DIR}/bin/k3s" /usr/local/bin/k3s + sudo chmod +x /usr/local/bin/k3s + + sudo mkdir -p "${K3S_IMAGES_DIR}" + for img_file in "${BUNDLE_DIR}"/k3s-images/*; do + [[ -f "${img_file}" ]] || continue + echo " Copying: $(basename "${img_file}")" + sudo cp "${img_file}" "${K3S_IMAGES_DIR}/" + done - # Bind K3s to dummy interface IP for portable operation - # With --docker, K3s uses host Docker; image updates (e.g. make hub) are visible without re-export. - curl -sfL https://get.k3s.io | sudo K3S_KUBECONFIG_MODE="644" \ - INSTALL_K3S_EXEC="${k3s_exec}" sh - + sudo INSTALL_K3S_SKIP_DOWNLOAD=true \ + K3S_KUBECONFIG_MODE="644" \ + INSTALL_K3S_EXEC="${k3s_exec}" \ + bash "${BUNDLE_DIR}/bin/k3s-install.sh" + else + if [[ "${K3S_USE_DOCKER}" == "1" ]]; then + echo "Using Docker as container runtime (K3S_USE_DOCKER=1)." + if ! command -v docker &> /dev/null; then + echo "Error: K3S_USE_DOCKER is set but Docker is not installed." 
>&2 + exit 1 + fi + k3s_exec="${k3s_exec} --docker" + fi + + configure_registry_mirrors + + curl -sfL https://get.k3s.io | sudo K3S_KUBECONFIG_MODE="644" \ + INSTALL_K3S_EXEC="${k3s_exec}" sh - + fi echo "Configuring kubeconfig for user: $(whoami)" mkdir -p "$HOME/.kube" @@ -367,7 +437,6 @@ function remove_k3s() { echo "Removing K3S local data" sudo rm -rf /var/lib/rancher/k3s - # Remove dummy interface service if [[ -f /etc/systemd/system/dummy-interface.service ]]; then echo "Removing dummy interface service..." sudo systemctl disable dummy-interface.service 2>/dev/null || true @@ -375,13 +444,16 @@ function remove_k3s() { sudo systemctl daemon-reload fi - # Remove dummy interface if ip link show dummy0 &>/dev/null; then echo "Removing dummy interface..." sudo ip link del dummy0 fi } +# ============================================================ +# GPU Device Plugin +# ============================================================ + function deploy_rocm_gpu_device_plugin() { echo "Deploying ROCm GPU device plugin..." @@ -390,7 +462,14 @@ function deploy_rocm_gpu_device_plugin() { return 0 fi - kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml + if [[ "${OFFLINE_MODE}" == "1" ]]; then + kubectl create -f "${BUNDLE_DIR}/manifests/k8s-ds-amdgpu-dp.yaml" + # Patch imagePullPolicy to avoid pulling from registry in air-gapped environments + kubectl patch ds amdgpu-device-plugin-daemonset -n kube-system --type=json \ + -p '[{"op":"replace","path":"/spec/template/spec/containers/0/imagePullPolicy","value":"IfNotPresent"}]' + else + kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml + fi if ! 
kubectl wait --for=jsonpath='{.status.numberReady}'=1 --namespace=kube-system ds/amdgpu-device-plugin-daemonset --timeout=300s | grep "condition met"; then exit 1 @@ -399,34 +478,51 @@ function deploy_rocm_gpu_device_plugin() { fi } -function deply_aup_learning_cloud_runtime() { - detect_and_configure_gpu - generate_values_overlay +# ============================================================ +# Image Helpers +# ============================================================ - echo "Deploying AUP Learning Cloud Runtime..." +# Apply MIRROR_PREFIX to an image reference for pulling +function resolve_pull_ref() { + local image="$1" + local full_image="${image}" + local first_segment="${image%%/*}" - helm install jupyterhub runtime/chart --namespace jupyterhub \ - --create-namespace -f runtime/values.yaml -f runtime/values.local.yaml - - echo "Waiting for JupyterHub deployments to be ready..." - kubectl wait --namespace jupyterhub \ - --for=condition=available --timeout=600s \ - deployment/hub deployment/proxy deployment/user-scheduler + if [[ "${image}" == *"/"* ]]; then + [[ "${first_segment}" != *"."* ]] && full_image="docker.io/${image}" + else + full_image="docker.io/library/${image}" + fi - kubectl label "$(kubectl get nodes -o name)" node-type="${ACCEL_KEY}" --overwrite + if [[ -n "${MIRROR_PREFIX}" ]]; then + echo "${MIRROR_PREFIX}/${full_image}" + else + echo "${full_image}" + fi } -function upgrade_aup_learning_cloud_runtime() { - detect_and_configure_gpu - generate_values_overlay +# Pull a single image, apply mirror prefix, tag back to original name. +# Returns 0 on success, 1 on failure. +function pull_and_tag() { + local image="$1" + local pull_ref + pull_ref=$(resolve_pull_ref "${image}") + + echo " Pulling: ${pull_ref}" + if ! 
docker pull "${pull_ref}"; then + echo " FAILED: ${image}" + return 1 + fi - helm upgrade jupyterhub runtime/chart --namespace jupyterhub \ - --create-namespace -f runtime/values.yaml -f runtime/values.local.yaml + if [[ "${pull_ref}" != "${image}" ]]; then + docker tag "${pull_ref}" "${image}" + fi + return 0 } -function remove_aup_learning_cloud_runtime() { - helm uninstall jupyterhub --namespace jupyterhub -} +# ============================================================ +# Image: Local Build +# ============================================================ # Build local images. Optional: list of Makefile targets (e.g. hub, cv, base-cpu). Default: all. function local_image_build() { @@ -438,7 +534,6 @@ function local_image_build() { local targets=("${@:-all}") echo "Building local images: ${targets[*]}" - # When using Docker runtime, images stay in Docker; no need to export to K3S_IMAGES_DIR if [[ "${K3S_USE_DOCKER}" != "1" ]]; then if [ ! -d "${K3S_IMAGES_DIR}" ]; then sudo mkdir -p "${K3S_IMAGES_DIR}" @@ -448,7 +543,6 @@ function local_image_build() { echo "Build images in Docker (K3S_USE_DOCKER=1; K3s will use them directly)" fi - # Makefile: SAVE_IMAGES=1 and K3S_IMAGES_DIR only when not using Docker backend (containerd + export) local save_images_for_make="" local images_dir_for_make="" if [[ "${K3S_USE_DOCKER}" != "1" ]]; then @@ -462,7 +556,6 @@ function local_image_build() { GPU_TARGET="${GPU_TARGET}" \ SAVE_IMAGES="${save_images_for_make}" \ K3S_IMAGES_DIR="${images_dir_for_make}" \ - IMAGES="${IMAGES[*]}" \ MIRROR_PREFIX="${MIRROR_PREFIX}" \ MIRROR_PIP="${MIRROR_PIP}" \ MIRROR_NPM="${MIRROR_NPM}" \ @@ -471,21 +564,85 @@ function local_image_build() { echo "-------------------------------------------" } -function pull_external_images() { - # Pull external images. When K3S_USE_DOCKER=1, keep in Docker only; else also save to K3S_IMAGES_DIR for offline. 
+# ============================================================ +# Image: Pull from GHCR (custom images) +# ============================================================ +function pull_custom_images() { if ! command -v docker &> /dev/null; then echo "Please install docker" exit 1 fi + detect_and_configure_gpu + local tag="${IMAGE_TAG}-${GPU_TARGET}" + echo "===========================================" - echo "Pulling external images..." - if [[ "${K3S_USE_DOCKER}" == "1" ]]; then - echo "K3S_USE_DOCKER=1: images stay in Docker (no export to K3s image dir)" + echo "Pulling pre-built custom images from GHCR..." + echo " GPU_TARGET=${GPU_TARGET}, tag=${tag}" + echo "===========================================" + + if [[ "${K3S_USE_DOCKER}" != "1" && ! -d "${K3S_IMAGES_DIR}" ]]; then + sudo mkdir -p "${K3S_IMAGES_DIR}" + fi + + local failed_images=() + + # GPU-specific images: pull :latest-, also tag as :latest + for name in "${GPU_CUSTOM_NAMES[@]}"; do + local image="${IMAGE_REGISTRY}/${name}:${tag}" + if pull_and_tag "${image}"; then + docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" + + if [[ "${K3S_USE_DOCKER}" != "1" ]]; then + sudo docker save \ + "${IMAGE_REGISTRY}/${name}:latest" \ + "${IMAGE_REGISTRY}/${name}:${tag}" \ + -o "${K3S_IMAGES_DIR}/${name}.tar" + fi + else + failed_images+=("${image}") + fi + done + + # Non-GPU images: pull : + for name in "${PLAIN_CUSTOM_NAMES[@]}"; do + local image="${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" + if pull_and_tag "${image}"; then + if [[ "${K3S_USE_DOCKER}" != "1" ]]; then + sudo docker save "${image}" -o "${K3S_IMAGES_DIR}/${name}.tar" + fi + else + failed_images+=("${image}") + fi + done + + echo "===========================================" + if [[ ${#failed_images[@]} -eq 0 ]]; then + echo "All custom images pulled successfully!" 
else - echo "Saving to K3s image pool for offline deployment" + echo "Failed images:" + for img in "${failed_images[@]}"; do echo " - ${img}"; done + echo "Warning: Some custom images failed." fi + echo "===========================================" +} + +# ============================================================ +# Image: Pull External Images +# ============================================================ + +function pull_external_images() { + if ! command -v docker &> /dev/null; then + echo "Please install docker" + exit 1 + fi + + # When called during 'install --pull', skip build-only images + local skip_build_only="${1:-0}" + + echo "===========================================" + echo "Pulling external images..." if [[ -n "${MIRROR_PREFIX}" ]]; then echo "Using mirror prefix: ${MIRROR_PREFIX}" fi @@ -495,46 +652,37 @@ function pull_external_images() { sudo mkdir -p "${K3S_IMAGES_DIR}" fi + # Build image list, combining EXTERNAL_IMAGES + optionally BUILD_ONLY_IMAGES + local images_to_pull=("${EXTERNAL_IMAGES[@]}") + if [[ "${skip_build_only}" != "1" ]]; then + images_to_pull+=("${BUILD_ONLY_IMAGES[@]}") + fi + local failed_images=() - for image in "${EXTERNAL_IMAGES[@]}"; do - # Determine the full image path for pulling with mirror - # Images without registry prefix are from docker.io + for image in "${images_to_pull[@]}"; do local full_image="${image}" local first_segment="${image%%/*}" if [[ "${image}" == *"/"* ]]; then - # Has slash - check if first segment looks like a registry (contains a dot) - if [[ "${first_segment}" != *"."* ]]; then - # No dot in first segment, it's docker.io (e.g., curlimages/curl) - full_image="docker.io/${image}" - fi + [[ "${first_segment}" != *"."* ]] && full_image="docker.io/${image}" else - # No slash - it's an official docker image (e.g., traefik:v3.3.1) full_image="docker.io/library/${image}" fi - # Apply mirror prefix if set local pull_image="${full_image}" - if [[ -n "${MIRROR_PREFIX}" ]]; then - 
pull_image="${MIRROR_PREFIX}/${full_image}" - fi + [[ -n "${MIRROR_PREFIX}" ]] && pull_image="${MIRROR_PREFIX}/${full_image}" echo "-------------------------------------------" echo "Pulling: ${pull_image}" if docker pull "${pull_image}"; then - # Tag to original name so K3s can use it - if [[ "${pull_image}" != "${image}" ]]; then - docker tag "${pull_image}" "${image}" - fi + [[ "${pull_image}" != "${image}" ]] && docker tag "${pull_image}" "${image}" - # Also tag to mirror-prefixed name so Docker build with MIRROR_PREFIX can use local cache if [[ -n "${MIRROR_PREFIX}" && "${pull_image}" != "${MIRROR_PREFIX}/${full_image}" ]]; then docker tag "${pull_image}" "${MIRROR_PREFIX}/${full_image}" fi - # Save to K3S_IMAGES_DIR only when not using Docker backend (so K3s can load at boot) if [[ "${K3S_USE_DOCKER}" != "1" && -n "${K3S_IMAGES_DIR}" ]]; then local filename filename=$(echo "${image}" | sed 's/[\/:]/-/g').tar @@ -560,23 +708,121 @@ function pull_external_images() { echo "All external images pulled and saved successfully!" else echo "Failed images:" - for img in "${failed_images[@]}"; do - echo " - ${img}" - done + for img in "${failed_images[@]}"; do echo " - ${img}"; done echo "Warning: Some images failed. Deployment may require internet access." fi echo "===========================================" } +# ============================================================ +# Image: Load from Offline Bundle +# ============================================================ + +function load_offline_images() { + echo "===========================================" + echo "Loading images from offline bundle..." 
+ echo "===========================================" + + local loaded=0 failed=0 + + for tar_file in "${BUNDLE_DIR}/images/custom"/*.tar "${BUNDLE_DIR}/images/external"/*.tar; do + [[ -f "${tar_file}" ]] || continue + echo " Importing: $(basename "${tar_file}")" + if sudo k3s ctr images import "${tar_file}" 2>/dev/null; then + loaded=$((loaded + 1)) + else + echo " Failed!" + failed=$((failed + 1)) + fi + done + + echo "===========================================" + echo "Loaded ${loaded} images, ${failed} failed" + if [[ "${failed}" -gt 0 ]]; then + echo "Error: ${failed} image(s) failed to import. Bundle may be corrupted." >&2 + exit 1 + fi + echo "===========================================" +} + +# ============================================================ +# Runtime Management +# ============================================================ + +# ============================================================ +# Runtime Management +# ============================================================ + +# Resolve chart/values paths (bundle or local repo) +function get_runtime_paths() { + if [[ "${OFFLINE_MODE}" == "1" ]]; then + CHART_PATH="${BUNDLE_DIR}/chart" + VALUES_PATH="${BUNDLE_DIR}/config/values.yaml" + OVERLAY_PATH="${BUNDLE_DIR}/config/values.local.yaml" + else + CHART_PATH="runtime/chart" + VALUES_PATH="runtime/values.yaml" + OVERLAY_PATH="runtime/values.local.yaml" + fi +} + +function deploy_aup_learning_cloud_runtime() { + echo "Deploying AUP Learning Cloud Runtime..." + + helm install jupyterhub "${CHART_PATH}" --namespace jupyterhub \ + --create-namespace -f "${VALUES_PATH}" -f "${OVERLAY_PATH}" + + echo "Waiting for JupyterHub deployments to be ready..." 
+ kubectl wait --namespace jupyterhub \ + --for=condition=available --timeout=600s \ + deployment/hub deployment/proxy deployment/user-scheduler + + kubectl label "$(kubectl get nodes -o name)" node-type="${ACCEL_KEY}" --overwrite +} + +function upgrade_aup_learning_cloud_runtime() { + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + + helm upgrade jupyterhub "${CHART_PATH}" --namespace jupyterhub \ + --create-namespace -f "${VALUES_PATH}" -f "${OVERLAY_PATH}" +} + +function remove_aup_learning_cloud_runtime() { + helm uninstall jupyterhub --namespace jupyterhub +} + +# ============================================================ +# Deployment Orchestration +# ============================================================ + function deploy_all_components() { + if [[ $EUID -ne 0 ]]; then + echo "Error: This script must be run as root." >&2 + exit 1 + fi + + local flag="${1:-}" + detect_and_configure_gpu + get_runtime_paths generate_values_overlay install_tools install_k3s_single_node + + if [[ "${OFFLINE_MODE}" == "1" ]]; then + load_offline_images + elif [[ "${flag}" == "--pull" ]]; then + pull_custom_images + pull_external_images 1 # skip build-only images + else + pull_external_images + local_image_build + fi + deploy_rocm_gpu_device_plugin - pull_external_images - local_image_build - deply_aup_learning_cloud_runtime + deploy_aup_learning_cloud_runtime } function remove_all_components() { @@ -584,22 +830,302 @@ function remove_all_components() { remove_k3s } +# ============================================================ +# Pack: Create Offline Bundle +# ============================================================ + +function pack_download_binaries() { + local staging="$1" + local k3s_url_ver + k3s_url_ver=$(echo "${K3S_VERSION}" | sed 's/+/%2B/g') + + echo "--- Downloading binaries ---" + mkdir -p "${staging}/bin" + + echo " K3s ${K3S_VERSION}..." 
+ wget -q "https://github.com/k3s-io/k3s/releases/download/${k3s_url_ver}/k3s" \ + -O "${staging}/bin/k3s" + chmod +x "${staging}/bin/k3s" + + echo " K3s install script..." + wget -q "https://get.k3s.io" -O "${staging}/bin/k3s-install.sh" + chmod +x "${staging}/bin/k3s-install.sh" + + echo " Helm ${HELM_VERSION}..." + wget -q "https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz" -O /tmp/helm-pack.tar.gz + tar -zxf /tmp/helm-pack.tar.gz -C /tmp linux-amd64/helm + mv /tmp/linux-amd64/helm "${staging}/bin/helm" + chmod +x "${staging}/bin/helm" + rm -rf /tmp/helm-pack.tar.gz /tmp/linux-amd64 + + echo " K9s ${K9S_VERSION}..." + wget -q "https://github.com/derailed/k9s/releases/download/${K9S_VERSION}/k9s_linux_amd64.deb" \ + -O "${staging}/bin/k9s_linux_amd64.deb" +} + +function pack_download_k3s_images() { + local staging="$1" + local k3s_url_ver + k3s_url_ver=$(echo "${K3S_VERSION}" | sed 's/+/%2B/g') + + echo "--- Downloading K3s airgap images ---" + mkdir -p "${staging}/k3s-images" + + wget -q "https://github.com/k3s-io/k3s/releases/download/${k3s_url_ver}/k3s-airgap-images-amd64.tar.zst" \ + -O "${staging}/k3s-images/k3s-airgap-images-amd64.tar.zst" +} + +function pack_save_manifests() { + local staging="$1" + echo "--- Saving manifests ---" + mkdir -p "${staging}/manifests" + + wget -q "https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml" \ + -O "${staging}/manifests/k8s-ds-amdgpu-dp.yaml" + echo " Saved ROCm device plugin DaemonSet." +} + +function pack_copy_chart() { + local staging="$1" + echo "--- Copying chart and config ---" + + cp -r runtime/chart "${staging}/chart" + mkdir -p "${staging}/config" + cp runtime/values.yaml "${staging}/config/values.yaml" +} + +# Save custom images: pull from GHCR, then docker save +# All images are saved into a single tar to deduplicate shared layers. 
+function pack_save_custom_images_pull() { + local staging="$1" + local tag="${IMAGE_TAG}-${GPU_TARGET}" + + echo "--- Pulling and saving custom images (${IMAGE_REGISTRY}) ---" + mkdir -p "${staging}/images/custom" + + local failed=0 + local all_refs=() + + for name in "${GPU_CUSTOM_NAMES[@]}"; do + local image="${IMAGE_REGISTRY}/${name}:${tag}" + if pull_and_tag "${image}"; then + docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${tag}") + echo " Pulled: ${name} (:latest + :${tag})" + else + failed=$((failed + 1)) + fi + done + + for name in "${PLAIN_CUSTOM_NAMES[@]}"; do + local image="${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" + if pull_and_tag "${image}"; then + docker tag "${image}" "${IMAGE_REGISTRY}/${name}:latest" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}") + echo " Pulled: ${name} (:latest + :${IMAGE_TAG})" + else + failed=$((failed + 1)) + fi + done + + if [[ "${failed}" -gt 0 ]]; then + echo "Error: ${failed} custom image(s) failed to pull. Bundle would be incomplete." >&2 + echo " Check that IMAGE_REGISTRY (${IMAGE_REGISTRY}) is correct and you have pull access." >&2 + rm -rf "${staging}" + exit 1 + fi + + echo " Saving all custom images (shared layers deduplicated)..." 
+ docker save "${all_refs[@]}" -o "${staging}/images/custom/auplc-custom.tar" + echo " Saved: ${staging}/images/custom/auplc-custom.tar" +} + +# Save custom images: build locally via Makefile, then docker save +function pack_save_custom_images_local() { + local staging="$1" + local tag="latest-${GPU_TARGET}" + + echo "--- Building and saving custom images locally ---" + mkdir -p "${staging}/images/custom" + + # Build all images to Docker daemon (no K3s export) + (cd dockerfiles/ && make \ + GPU_TARGET="${GPU_TARGET}" \ + MIRROR_PREFIX="${MIRROR_PREFIX}" \ + MIRROR_PIP="${MIRROR_PIP}" \ + MIRROR_NPM="${MIRROR_NPM}" \ + all) + + echo "--- Saving built images to bundle (shared layers deduplicated) ---" + + local all_refs=() + + for name in "${GPU_CUSTOM_NAMES[@]}"; do + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${tag}") + echo " Queued: ${name} (:latest + :${tag})" + done + + for name in "${PLAIN_CUSTOM_NAMES[@]}"; do + docker tag "${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}" + all_refs+=("${IMAGE_REGISTRY}/${name}:latest" "${IMAGE_REGISTRY}/${name}:${IMAGE_TAG}") + echo " Queued: ${name} (:latest + :${IMAGE_TAG})" + done + + docker save "${all_refs[@]}" -o "${staging}/images/custom/auplc-custom.tar" + echo " Saved: ${staging}/images/custom/auplc-custom.tar" +} + +# Save external images (always pulled from registries) +function pack_save_external_images() { + local staging="$1" + echo "--- Pulling and saving external images ---" + mkdir -p "${staging}/images/external" + + # Build list: runtime external images (skip build-only) + local pack_images=("${EXTERNAL_IMAGES[@]}") + + # Extract ROCm device plugin image from saved manifest + if [[ -f "${staging}/manifests/k8s-ds-amdgpu-dp.yaml" ]]; then + local dp_image + dp_image=$(sed -n 's/.*image:[[:space:]]*\([^ ]*\).*/\1/p' "${staging}/manifests/k8s-ds-amdgpu-dp.yaml" | head -1) + if [[ -n "${dp_image}" ]]; then + echo " Found device plugin image: ${dp_image}" + 
pack_images+=("${dp_image}")
+ fi
+ fi
+
+ local failed_images=()
+ for image in "${pack_images[@]}"; do
+ if pull_and_tag "${image}"; then
+ local filename
+ filename=$(echo "${image}" | sed 's/[\/:]/-/g').tar
+ docker save "${image}" -o "${staging}/images/external/${filename}"
+ echo " Saved: ${image}"
+ else
+ failed_images+=("${image}")
+ fi
+ done
+
+ if [[ ${#failed_images[@]} -gt 0 ]]; then
+ echo "Error: ${#failed_images[@]} external image(s) failed to pull:" >&2
+ for img in "${failed_images[@]}"; do echo " - ${img}" >&2; done
+ rm -rf "${staging}"
+ exit 1
+ fi
+}
+
+function pack_write_manifest() {
+ local staging="$1"
+ cat > "${staging}/manifest.json" << EOF
+{
+ "format_version": "1",
+ "build_date": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
+ "gpu_target": "${GPU_TARGET}",
+ "accel_key": "${ACCEL_KEY}",
+ "accel_env": "${ACCEL_ENV}",
+ "image_registry": "${IMAGE_REGISTRY}",
+ "image_tag": "${IMAGE_TAG}",
+ "k3s_version": "${K3S_VERSION}",
+ "helm_version": "${HELM_VERSION}",
+ "k9s_version": "${K9S_VERSION}"
+}
+EOF
+}
+
+function pack_bundle() {
+ local flag="${1:-}"
+
+ # Sanitize IMAGE_TAG: Docker tags cannot contain '/' (e.g. branch names)
+ IMAGE_TAG="${IMAGE_TAG//\//-}"
+
+ echo "==========================================="
+ echo "AUP Learning Cloud - Pack Offline Bundle"
+ if [[ "${flag}" == "--local" ]]; then
+ echo " Image source: local build"
+ else
+ echo " Image source: pull from GHCR"
+ fi
+ echo "==========================================="
+
+ if ! command -v docker &> /dev/null; then
+ echo "Error: Docker is required." 
>&2 + exit 1 + fi + + detect_and_configure_gpu + + local date_stamp + date_stamp=$(date +%Y%m%d) + local bundle_name="auplc-bundle-${GPU_TARGET}-${date_stamp}" + + [[ -d "${bundle_name}" ]] && rm -rf "${bundle_name}" + mkdir -p "${bundle_name}" + + # Copy installer itself + cp "${BASH_SOURCE[0]}" "${bundle_name}/auplc-installer" + chmod +x "${bundle_name}/auplc-installer" + + pack_download_binaries "${bundle_name}" + pack_download_k3s_images "${bundle_name}" + pack_save_manifests "${bundle_name}" + + if [[ "${flag}" == "--local" ]]; then + pack_save_custom_images_local "${bundle_name}" + else + pack_save_custom_images_pull "${bundle_name}" + fi + + pack_save_external_images "${bundle_name}" + pack_copy_chart "${bundle_name}" + pack_write_manifest "${bundle_name}" + + echo "===========================================" + echo "Creating archive: ${bundle_name}.tar.gz ..." + echo "===========================================" + + tar czf "${bundle_name}.tar.gz" "${bundle_name}/" + rm -rf "${bundle_name}" + + local size + size=$(du -sh "${bundle_name}.tar.gz" | cut -f1) + + echo "===========================================" + echo "Bundle created: ${bundle_name}.tar.gz (${size})" + echo "" + echo "Deploy on air-gapped machine:" + echo " tar xzf ${bundle_name}.tar.gz" + echo " cd ${bundle_name}" + echo " sudo ./auplc-installer install" + echo "===========================================" +} + +# ============================================================ +# Help +# ============================================================ + function show_help() { cat << 'EOF' -Usage: ./auplc-installer [subcommand] +Usage: ./auplc-installer [options] Commands: - install Full installation (k3s + images + runtime) - uninstall Remove everything + install [--pull] Full installation (k3s + images + runtime) + Default: build images locally via Makefile + --pull: use pre-built images from GHCR (no local build needed) + + pack [--local] Create offline deployment bundle (requires Docker + 
internet) + Default: pull pre-built images from GHCR + --local: build images locally then pack (needs build deps) + + uninstall Remove everything (K3s + runtime) install-tools Install helm and k9s rt install Deploy JupyterHub runtime only - rt reinstall Reinstall JupyterHub runtime (For container images changes) - rt upgrade Upgrade JupyterHub runtime (For vaules.yaml changes) + rt reinstall Reinstall JupyterHub runtime (for container image changes) + rt upgrade Upgrade JupyterHub runtime (for values.yaml changes) rt remove Remove JupyterHub runtime - img build Build all custom images - img build [target...] Build custom images (default: all). e.g. img build hub, img build hub cv + img build [target...] Build custom images (default: all) + Targets: all, hub, base-cpu, base-rocm, cv, dl, llm, physim img pull Pull external images for offline use detect-gpu Show detected GPU configuration @@ -625,9 +1151,29 @@ Options (can also be set via environment variables): ./auplc-installer img build base-rocm --gpu=strix ./auplc-installer install --mirror=mirror.example.com +Image Registry: + IMAGE_REGISTRY Registry prefix for custom images (default: ghcr.io/amdresearch) + Override when pulling from a fork or private registry. + IMAGE_TAG Image tag prefix (default: latest). GPU suffix appended automatically. + Use "develop" for images built from the develop branch. + +Offline Deployment: + 1. On a machine with internet access, create bundle: + ./auplc-installer pack --gpu=strix-halo # pull from GHCR + ./auplc-installer pack --gpu=strix-halo --local # or build locally + + 2. 
Transfer bundle to air-gapped machine, then: + tar xzf auplc-bundle-gfx1151-*.tar.gz + cd auplc-bundle-gfx1151-* + sudo ./auplc-installer install + EOF } +# ============================================================ +# Main +# ============================================================ + # Parse global options (--key=value flags override environment variables) args=() for arg in "$@"; do @@ -642,26 +1188,41 @@ for arg in "$@"; do done set -- "${args[@]}" +# Detect offline bundle at startup +detect_offline_bundle + if [[ $# -eq 0 ]]; then show_help exit 1 fi case "$1" in - install) deploy_all_components ;; + install) + deploy_all_components "${2:-}" + ;; + pack) + pack_bundle "${2:-}" + ;; uninstall) remove_all_components ;; install-tools) install_tools ;; detect-gpu) detect_and_configure_gpu ;; - # New short form: rt / img rt) case "${2:-}" in - install) deply_aup_learning_cloud_runtime ;; + install) + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + deploy_aup_learning_cloud_runtime + ;; upgrade) upgrade_aup_learning_cloud_runtime ;; remove) remove_aup_learning_cloud_runtime ;; reinstall) remove_aup_learning_cloud_runtime || true sleep 0.5 - deply_aup_learning_cloud_runtime + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + deploy_aup_learning_cloud_runtime ;; *) echo "Usage: $0 rt {install|upgrade|remove|reinstall}"; exit 1 ;; esac @@ -677,7 +1238,12 @@ case "$1" in esac ;; # Legacy long form (still supported) - install-runtime) deply_aup_learning_cloud_runtime ;; + install-runtime) + detect_and_configure_gpu + get_runtime_paths + generate_values_overlay + deploy_aup_learning_cloud_runtime + ;; remove-runtime) remove_aup_learning_cloud_runtime ;; upgrade-runtime) upgrade_aup_learning_cloud_runtime ;; build-images) local_image_build ;;