From 899908bfb2cc7e4efe1c23b0553b61a3f8ee1cc8 Mon Sep 17 00:00:00 2001 From: alyssavu Date: Wed, 13 May 2026 00:03:42 +0000 Subject: [PATCH 1/4] feat(examples): add Ray on AKS example Add example for running Ray applications on AKS, including: - Setup script with commands for infra, Kueue, and KubeRay operator - RayCluster and RayJob manifests - Terraform-based AKS deployment (aks-classic) - CPU inference example with Kueue integration Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/ray/README.md | 75 ++++++++++ examples/ray/aks-classic/deploy.sh | 128 ++++++++++++++++ examples/ray/aks-classic/main.tf | 97 ++++++++++++ examples/ray/aks-classic/outputs.tf | 46 ++++++ examples/ray/aks-classic/providers.tf | 28 ++++ examples/ray/aks-classic/ssh.tf | 25 ++++ examples/ray/aks-classic/variables.tf | 51 +++++++ .../ray/inference-cpu/inference-rayjob.yaml | 91 ++++++++++++ examples/ray/inference-cpu/inference_job.py | 85 +++++++++++ .../inference-cpu/stack-kueue-resources.yaml | 45 ++++++ examples/ray/setup.sh | 140 ++++++++++++++++++ 11 files changed, 811 insertions(+) create mode 100644 examples/ray/README.md create mode 100755 examples/ray/aks-classic/deploy.sh create mode 100644 examples/ray/aks-classic/main.tf create mode 100644 examples/ray/aks-classic/outputs.tf create mode 100644 examples/ray/aks-classic/providers.tf create mode 100644 examples/ray/aks-classic/ssh.tf create mode 100644 examples/ray/aks-classic/variables.tf create mode 100644 examples/ray/inference-cpu/inference-rayjob.yaml create mode 100644 examples/ray/inference-cpu/inference_job.py create mode 100644 examples/ray/inference-cpu/stack-kueue-resources.yaml create mode 100755 examples/ray/setup.sh diff --git a/examples/ray/README.md b/examples/ray/README.md new file mode 100644 index 000000000..98c326303 --- /dev/null +++ b/examples/ray/README.md @@ -0,0 +1,75 @@ +# Running Ray on Azure Kubernetes Service (AKS) + +This example demonstrates how to deploy and run a [Ray](https://www.ray.io/) application on AKS using the KubeRay operator. + +## Prerequisites + +- An AKS cluster (Kubernetes 1.26+) +- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured to access your cluster +- [Helm](https://helm.sh/docs/intro/install/) 3.x installed + +## Overview + +Ray is an open-source framework for scaling AI and Python workloads. This example deploys: + +1. The **KubeRay operator** to manage Ray clusters on Kubernetes +2. A **RayCluster** custom resource with a head node and worker nodes +3. A sample **Ray job** to verify the deployment + +## Deploy the KubeRay operator + +```bash +helm repo add kuberay https://ray-project.github.io/kuberay-helm/ +helm repo update + +helm install kuberay-operator kuberay/kuberay-operator \ + --namespace kuberay-system \ + --create-namespace +``` + +Verify the operator is running: + +```bash +kubectl get pods -n kuberay-system +``` + +## Deploy the RayCluster + +```bash +kubectl apply -f ray-cluster.yaml +``` + +Wait for the cluster to be ready: + +```bash +kubectl get rayclusters +kubectl get pods -l ray.io/cluster=ray-cluster +``` + +## Submit a sample job + +```bash +kubectl apply -f ray-job.yaml +``` + +Check job status: + +```bash +kubectl get rayjobs +kubectl logs -l job-name=ray-sample-job +``` + +## Clean up + +```bash +kubectl delete -f ray-job.yaml +kubectl delete -f ray-cluster.yaml +helm uninstall kuberay-operator -n kuberay-system +kubectl delete namespace kuberay-system +``` + +## Resources + +- [Ray documentation](https://docs.ray.io/) +- [KubeRay documentation](https://ray-project.github.io/kuberay/) +- [AKS documentation](https://learn.microsoft.com/azure/aks) diff --git a/examples/ray/aks-classic/deploy.sh b/examples/ray/aks-classic/deploy.sh new file mode 100755 index 000000000..01113bd1c --- /dev/null +++ b/examples/ray/aks-classic/deploy.sh @@ -0,0 +1,128 @@ +#!/bin/bash + +# Check if the user is logged into Azure CLI +if ! az account show > /dev/null 2>&1; then + echo "Please login to Azure CLI using 'az login' before running this script." + exit 1 +fi + +# Initialize Terraform +terraform init + +# Create a Terraform plan +terraform plan -out main.tfplan + +# Apply the Terraform plan +terraform apply main.tfplan + +# Retrieve the Terraform outputs and store in variables +resource_group_name=$(terraform output -raw resource_group_name) +system_node_pool_name=$(terraform output -raw system_node_pool_name) +aks_cluster_name=$(terraform output -raw kubernetes_cluster_name) + +# Get AKS credentials for the cluster +az aks get-credentials \ + --resource-group $resource_group_name \ + --name $aks_cluster_name + +# Create the kuberay namespace +kuberay_namespace="kuberay" +kubectl create namespace $kuberay_namespace + +# Output the current Kubernetes context +current_context=$(kubectl config current-context) +echo "Current Kubernetes Context: $current_context" + +# Output the nodes in the cluster +kubectl get nodes + +# Check Helm version +helm version + +# Add the KubeRay Helm repository +helm repo add kuberay https://ray-project.github.io/kuberay-helm/ + +# Update the Helm repository +helm repo update + +# Install or upgrade the KubeRay operator using Helm +helm upgrade \ +--install \ +--cleanup-on-fail \ +--wait \ +--timeout 10m0s \ +--namespace "$kuberay_namespace" \ +--create-namespace kuberay-operator kuberay/kuberay-operator \ +--version 1.1.1 + +# Output the pods in the kuberay namespace +kubectl get pods -n $kuberay_namespace + +# Download the PyTorch MNIST job YAML file +curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml + +# Train a PyTorch Model on Fashion MNIST +kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml + +# Output the pods in the kuberay namespace +kubectl get pods -n $kuberay_namespace + +# Get the status of the Ray job +job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}') + +# Wait for the Ray job to complete +while [ "$job_status" != "Complete" ]; do + echo -ne "Job Status: $job_status\\r" + sleep 30 + job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}') +done +echo "Job Status: $job_status" + +# Check if the job succeeded +job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobStatus}') + +if [ "$job_status" != "SUCCEEDED" ]; then + echo "Job Failed!" + exit 1 +fi + +# If the job succeeded, get the Ray cluster head service +rayclusterhead=$(kubectl get service -n $kuberay_namespace | grep 'rayjob-pytorch-mnist-raycluster' | grep 'ClusterIP' | awk '{print $1}') + +# Now create a service of type NodePort for the Ray cluster head +kubectl expose service $rayclusterhead \ +-n $kuberay_namespace \ +--port=80 \ +--target-port=8265 \ +--type=NodePort \ +--name=ray-dash + +# Create an ingress for the KubeRay dashboard +cat <) and admits it +# once head 2 CPU + 2 workers × 1 CPU = 4 CPU quota is available. +# 3. KubeRay spawns head + 2 workers; Ray Data runs image classification on CPU. +# +# Uses the custom Ray image built from images/ray/. {{RAY_IMAGE}} is resolved at +# test time from the RAY_E2E_IMAGE env var, which CI sets to the image pushed to +# the PR-scoped ACR. {{RAY_VERSION}} is resolved from images/ray/Makefile. +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: e2e-inference + namespace: e2e-stack + labels: + kueue.x-k8s.io/queue-name: e2e-stack-queue +spec: + entrypoint: python /home/ray/scripts/inference_job.py + shutdownAfterJobFinishes: true + ttlSecondsAfterFinished: 600 + runtimeEnvYAML: | + pip: + packages: + - torch + - torchvision + pip_install_options: + - --index-url + - https://download.pytorch.org/whl/cpu + - --extra-index-url + - https://pypi.org/simple + pip_check: false + rayClusterSpec: + rayVersion: "{{RAY_VERSION}}" + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + num-cpus: "0" # Don't schedule Ray tasks on head; Kueue still counts its K8s CPU requests. + dashboard-host: "0.0.0.0" + template: + spec: + containers: + - name: ray-head + image: {{RAY_IMAGE}} + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "2" + memory: "4Gi" + startupProbe: + httpGet: + path: /api/version + port: 8265 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 60 + volumeMounts: + - name: inference-script + mountPath: /home/ray/scripts + volumes: + - name: inference-script + configMap: + name: e2e-inference-script + workerGroupSpecs: + - groupName: workers + replicas: 2 + minReplicas: 2 + maxReplicas: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: {{RAY_IMAGE}} + resources: + requests: + cpu: "1" + memory: "4Gi" + limits: + cpu: "1" + memory: "4Gi" + startupProbe: + httpGet: + path: /api/healthz + port: 52365 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 60 diff --git a/examples/ray/inference-cpu/inference_job.py b/examples/ray/inference-cpu/inference_job.py new file mode 100644 index 000000000..2aaaa1082 --- /dev/null +++ b/examples/ray/inference-cpu/inference_job.py @@ -0,0 +1,85 @@ +""" +Image classification test for e2e validation. +Adapted from aks-unbounded/ray/inference_job.py. + +Uses a locally-constructed model with random weights instead of downloading from +HuggingFace. This avoids network dependencies (SSL proxy issues, rate limits) while +still exercising the full Ray Data distributed inference pipeline: + - ActorPoolStrategy distributes work across workers + - Each actor loads a model, processes batches, returns predictions + - Ray Data orchestrates the dataflow end-to-end + +Auto-detects GPU: if torch.cuda.is_available(), actors declare num_gpus=1 and the +model runs on CUDA; otherwise falls back to CPU. Ray Data infers actor count from +the ActorPoolStrategy size; the cluster must have enough GPU/CPU quota to schedule. +""" +import ray +import numpy as np + +ray.init() +cluster_resources = ray.cluster_resources() +print(f"Cluster resources: {cluster_resources}") +GPU_AVAILABLE = cluster_resources.get("GPU", 0) > 0 +print(f"GPU_AVAILABLE = {GPU_AVAILABLE}") + +# Generate synthetic images in-memory (no S3 dependency) +NUM_IMAGES = 10 +synthetic_images = [ + {"image": np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)} + for _ in range(NUM_IMAGES) +] + +ds = ray.data.from_items(synthetic_images) + +LABELS = ["cat", "dog", "bird", "fish", "car"] + +class ImageClassifier: + def __init__(self): + import torch + import torch.nn as nn + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"Device: {self.device}") + # Simple conv net with random weights — no download needed. + self.model = nn.Sequential( + nn.Conv2d(3, 16, 3, stride=2), + nn.ReLU(), + nn.AdaptiveAvgPool2d(1), + nn.Flatten(), + nn.Linear(16, len(LABELS)), + ).to(self.device) + self.model.eval() + print("Model loaded successfully") + + def __call__(self, batch): + import torch + + images = torch.from_numpy( + np.stack(batch["image"]) + ).permute(0, 3, 1, 2).float().to(self.device) / 255.0 + + with torch.no_grad(): + logits = self.model(images) + indices = logits.argmax(dim=1) + + batch["label"] = [LABELS[i] for i in indices.tolist()] + batch["score"] = [logits[j, indices[j]].item() for j in range(len(indices))] + return batch + +# Use fewer actors on GPU (1 worker × 1 GPU) than CPU (2 workers × 2 CPU). +POOL_SIZE = 1 if GPU_AVAILABLE else 2 + +predictions = ds.map_batches( + ImageClassifier, + compute=ray.data.ActorPoolStrategy(size=POOL_SIZE), + batch_size=4, + **({"num_gpus": 1} if GPU_AVAILABLE else {}), +) + +results = predictions.take_all() +print(f"\nInference complete: {len(results)} images classified") +for i, r in enumerate(results): + print(f" Image {i}: Label: {r['label']} (score: {r['score']:.4f})") + +print("\nSUCCESS: All images classified") + diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml new file mode 100644 index 000000000..11fcddd63 --- /dev/null +++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml @@ -0,0 +1,45 @@ +--- +# Namespace for the stack integration test. +apiVersion: v1 +kind: Namespace +metadata: + name: e2e-stack +--- +# ResourceFlavor — represents CPU hardware for the stack test. +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ResourceFlavor +metadata: + name: e2e-stack-flavor +spec: {} +--- +# ClusterQueue — sized to admit both the CPU-only inference job (head 2 + 2 workers × 1 CPU + submitter) +# and the GPU variant (head 1 + 1 worker × 2 CPU + 1 GPU + submitter). GPU quota is only exercised +# on GPU-capable clusters; on CPU clusters workloads that don't request GPU are unaffected. +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ClusterQueue +metadata: + name: e2e-stack-cluster-queue +spec: + namespaceSelector: {} + queueingStrategy: BestEffortFIFO + resourceGroups: + - coveredResources: ["cpu", "memory", "nvidia.com/gpu"] + flavors: + - name: e2e-stack-flavor + resources: + - name: cpu + nominalQuota: 6 + - name: memory + nominalQuota: 20Gi + - name: nvidia.com/gpu + nominalQuota: 2 +--- +# LocalQueue — RayJobs must carry label kueue.x-k8s.io/queue-name: e2e-stack-queue +# to be managed by Kueue. +apiVersion: kueue.x-k8s.io/v1beta2 +kind: LocalQueue +metadata: + name: e2e-stack-queue + namespace: e2e-stack +spec: + clusterQueue: e2e-stack-cluster-queue diff --git a/examples/ray/setup.sh b/examples/ray/setup.sh new file mode 100755 index 000000000..5f1596d7e --- /dev/null +++ b/examples/ray/setup.sh @@ -0,0 +1,140 @@ +#!/bin/bash +set -euo pipefail + +# --- Infrastructure Configuration --- +RESOURCE_GROUP="${RESOURCE_GROUP:-ray-example-rg}" +LOCATION="${LOCATION:-eastus}" +CLUSTER_NAME="${CLUSTER_NAME:-demo}" +NODE_COUNT="${NODE_COUNT:-3}" +NODE_VM_SIZE="${NODE_VM_SIZE:-Standard_D4ds_v4}" +KUBERNETES_VERSION="${KUBERNETES_VERSION:-1.35}" +NODEPOOL_NAME="${NODEPOOL_NAME:-cpu-pool}" +NODEPOOL_NODE_COUNT="${NODEPOOL_NODE_COUNT:-2}" +NODEPOOL_VM_SIZE="${NODEPOOL_VM_SIZE:-Standard_D16ds_v7}" + +# --- Helm Charts Configuration --- +HELM_REGISTRY="${HELM_REGISTRY:-oci://mcr.microsoft.com/aks/ai-runtime/helm}" +KUEUE_VERSION="${KUEUE_VERSION:-0.17.1}" +KUBERAY_OPERATOR_VERSION="${KUBERAY_OPERATOR_VERSION:-1.6.1}" + +create_infra() { + echo "=== Creating resource group ===" + az group create \ + --name "$RESOURCE_GROUP" \ + --location "$LOCATION" + + echo "=== Creating AKS cluster ===" + az aks create \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --location "$LOCATION" \ + --node-count "$NODE_COUNT" \ + --node-vm-size "$NODE_VM_SIZE" \ + --kubernetes-version "$KUBERNETES_VERSION" \ + --generate-ssh-keys \ + --enable-managed-identity + + echo "=== Adding cpu worker node pool ===" + az aks nodepool add \ + --resource-group "$RESOURCE_GROUP" \ + --cluster-name "$CLUSTER_NAME" \ + --name "$NODEPOOL_NAME" \ + --node-count "$NODEPOOL_NODE_COUNT" \ + --node-vm-size "$NODEPOOL_VM_SIZE" + + echo "=== Getting cluster credentials ===" + az aks get-credentials \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --overwrite-existing +} + +install_kueue() { + echo "=== Installing Kueue (v${KUEUE_VERSION}) ===" + helm upgrade --install kueue "$HELM_REGISTRY/kueue" \ + --version "$KUEUE_VERSION" \ + --namespace kueue-system \ + --create-namespace \ + --wait +} + +install_kuberay() { + echo "=== Installing KubeRay Operator (v${KUBERAY_OPERATOR_VERSION}) ===" + helm upgrade --install kuberay-operator "$HELM_REGISTRY/kuberay-operator" \ + --version "$KUBERAY_OPERATOR_VERSION" \ + --namespace kuberay-system \ + --create-namespace \ + --wait +} + +install_operators() { + install_kueue + install_kuberay +} + +status() { + echo "=== Cluster status ===" + kubectl get nodes + echo "" + kubectl get pods -n kueue-system + echo "" + kubectl get pods -n kuberay-system +} + +all() { + check_prerequisites + create_infra + install_operators + status +} + +usage() { + echo "Usage: $0 " + echo "" + echo "Commands:" + echo " check_prerequisites Check and install required tools (az, helm)" + echo " create_infra Create resource group, AKS cluster, node pool, and fetch credentials" + echo " install_operators Install both Kueue and KubeRay" + echo " status Show cluster and operator pod status" + echo " all Run all steps end-to-end" + echo "" + echo "Examples:" + echo " $0 install_operators # Just install Kueue + KubeRay" + echo " $0 all # Full setup from scratch" +} + +# --- Prerequisites --- +install_az_cli() { + echo "Installing Azure CLI..." + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +} + +install_helm() { + echo "Installing Helm..." + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +} + +check_prerequisites() { + if ! command -v az &> /dev/null; then + echo "Azure CLI (az) not found. Installing..." + install_az_cli + fi + + if ! command -v helm &> /dev/null; then + echo "Helm not found. Installing..." + install_helm + fi + + echo "Prerequisites satisfied: az $(az version --query '"azure-cli"' -o tsv), helm $(helm version --short)" +} + +COMMAND="${1:-}" +case "$COMMAND" in + check_prerequisites|create_infra|install_operators|status|all) + "$COMMAND" + ;; + *) + usage + exit 1 + ;; +esac \ No newline at end of file From 1c8e2af12e150e7432a99fbc4b0f142ff387d2af Mon Sep 17 00:00:00 2001 From: alyssavu Date: Wed, 13 May 2026 00:40:05 +0000 Subject: [PATCH 2/4] feat(examples): add inference-cpu command and update Ray config - Add run_inference_cpu command to setup.sh - Use MCR Ray image (ray:py3.12-ray2.54.0) - Rename namespace from e2e-stack to ray - Rename job from e2e-inference to cpu-inference - Consolidate install_kueue/install_kuberay into install_operators Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ray/inference-cpu/inference-rayjob.yaml | 6 +-- .../inference-cpu/stack-kueue-resources.yaml | 6 +-- examples/ray/setup.sh | 47 ++++++++++++++----- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/examples/ray/inference-cpu/inference-rayjob.yaml b/examples/ray/inference-cpu/inference-rayjob.yaml index c953293c6..9134ef610 100644 --- a/examples/ray/inference-cpu/inference-rayjob.yaml +++ b/examples/ray/inference-cpu/inference-rayjob.yaml @@ -12,8 +12,8 @@ apiVersion: ray.io/v1 kind: RayJob metadata: - name: e2e-inference - namespace: e2e-stack + name: cpu-inference + namespace: ray labels: kueue.x-k8s.io/queue-name: e2e-stack-queue spec: @@ -63,7 +63,7 @@ spec: volumes: - name: inference-script configMap: - name: e2e-inference-script + name: cpu-inference-script workerGroupSpecs: - groupName: workers replicas: 2 diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml index 11fcddd63..7a4329421 100644 --- a/examples/ray/inference-cpu/stack-kueue-resources.yaml +++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml @@ -1,9 +1,9 @@ --- -# Namespace for the stack integration test. +# Namespace for the Ray example. apiVersion: v1 kind: Namespace metadata: - name: e2e-stack + name: ray --- # ResourceFlavor — represents CPU hardware for the stack test. apiVersion: kueue.x-k8s.io/v1beta2 @@ -40,6 +40,6 @@ apiVersion: kueue.x-k8s.io/v1beta2 kind: LocalQueue metadata: name: e2e-stack-queue - namespace: e2e-stack + namespace: ray spec: clusterQueue: e2e-stack-cluster-queue diff --git a/examples/ray/setup.sh b/examples/ray/setup.sh index 5f1596d7e..745e6cade 100755 --- a/examples/ray/setup.sh +++ b/examples/ray/setup.sh @@ -17,6 +17,12 @@ HELM_REGISTRY="${HELM_REGISTRY:-oci://mcr.microsoft.com/aks/ai-runtime/helm}" KUEUE_VERSION="${KUEUE_VERSION:-0.17.1}" KUBERAY_OPERATOR_VERSION="${KUBERAY_OPERATOR_VERSION:-1.6.1}" +# --- Inference CPU Configuration --- +RAY_IMAGE="${RAY_IMAGE:-mcr.microsoft.com/aks/ai-runtime/ray:py3.12-ray2.54.0}" +RAY_VERSION="${RAY_VERSION:-2.54.0}" +INFERENCE_NAMESPACE="ray" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + create_infra() { echo "=== Creating resource group ===" az group create \ @@ -49,16 +55,14 @@ create_infra() { --overwrite-existing } -install_kueue() { +install_operators() { echo "=== Installing Kueue (v${KUEUE_VERSION}) ===" helm upgrade --install kueue "$HELM_REGISTRY/kueue" \ --version "$KUEUE_VERSION" \ --namespace kueue-system \ --create-namespace \ --wait -} -install_kuberay() { echo "=== Installing KubeRay Operator (v${KUBERAY_OPERATOR_VERSION}) ===" helm upgrade --install kuberay-operator "$HELM_REGISTRY/kuberay-operator" \ --version "$KUBERAY_OPERATOR_VERSION" \ @@ -67,11 +71,6 @@ install_kuberay() { --wait } -install_operators() { - install_kueue - install_kuberay -} - status() { echo "=== Cluster status ===" kubectl get nodes @@ -81,6 +80,30 @@ status() { kubectl get pods -n kuberay-system } +run_inference_cpu() { + echo "=== Deploying Kueue resources ===" + kubectl apply -f "$SCRIPT_DIR/inference-cpu/stack-kueue-resources.yaml" + + echo "=== Creating inference script ConfigMap ===" + kubectl create configmap cpu-inference-script \ + --from-file=inference_job.py="$SCRIPT_DIR/inference-cpu/inference_job.py" \ + --namespace "$INFERENCE_NAMESPACE" \ + --dry-run=client -o yaml | kubectl apply -f - + + echo "=== Submitting RayJob ===" + sed "s|{{RAY_IMAGE}}|${RAY_IMAGE}|g; s|{{RAY_VERSION}}|${RAY_VERSION}|g" \ + "$SCRIPT_DIR/inference-cpu/inference-rayjob.yaml" | kubectl apply -f - + + echo "=== Waiting for RayJob to complete ===" + kubectl wait --for=jsonpath='{.status.jobStatus}'=SUCCEEDED \ + rayjob/cpu-inference \ + --namespace "$INFERENCE_NAMESPACE" \ + --timeout=600s + + echo "=== Job logs ===" + kubectl logs -l job-name=cpu-inference --namespace "$INFERENCE_NAMESPACE" --tail=50 +} + all() { check_prerequisites create_infra @@ -94,12 +117,14 @@ usage() { echo "Commands:" echo " check_prerequisites Check and install required tools (az, helm)" echo " create_infra Create resource group, AKS cluster, node pool, and fetch credentials" - echo " install_operators Install both Kueue and KubeRay" - echo " status Show cluster and operator pod status" + echo " install_operators Install Kueue and KubeRay from MCR" + echo " run_inference_cpu Deploy and run the CPU inference example" + echo " status Show cluster and operator pod status" echo " all Run all steps end-to-end" echo "" echo "Examples:" echo " $0 install_operators # Just install Kueue + KubeRay" + echo " $0 run_inference_cpu # Run CPU inference example" echo " $0 all # Full setup from scratch" } @@ -130,7 +155,7 @@ check_prerequisites() { COMMAND="${1:-}" case "$COMMAND" in - check_prerequisites|create_infra|install_operators|status|all) + check_prerequisites|create_infra|install_operators|run_inference_cpu|status|all) "$COMMAND" ;; *) From e3afd047e9fe07adc948b862c3d27c3cd86dde9f Mon Sep 17 00:00:00 2001 From: alyssavu Date: Wed, 13 May 2026 00:46:51 +0000 Subject: [PATCH 3/4] fix(examples): increase Ray head memory to prevent OOM Head node at 4Gi was hitting 96% memory usage from Ray system processes alone. Increased to 8Gi and updated Kueue memory quota from 20Gi to 24Gi to accommodate. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/ray/inference-cpu/inference-rayjob.yaml | 4 ++-- examples/ray/inference-cpu/stack-kueue-resources.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ray/inference-cpu/inference-rayjob.yaml b/examples/ray/inference-cpu/inference-rayjob.yaml index 9134ef610..667c00af7 100644 --- a/examples/ray/inference-cpu/inference-rayjob.yaml +++ b/examples/ray/inference-cpu/inference-rayjob.yaml @@ -46,10 +46,10 @@ spec: resources: requests: cpu: "2" - memory: "4Gi" + memory: "8Gi" limits: cpu: "2" - memory: "4Gi" + memory: "8Gi" startupProbe: httpGet: path: /api/version diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml index 7a4329421..0bb26c836 100644 --- a/examples/ray/inference-cpu/stack-kueue-resources.yaml +++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml @@ -30,7 +30,7 @@ spec: - name: cpu nominalQuota: 6 - name: memory - nominalQuota: 20Gi + nominalQuota: 24Gi - name: nvidia.com/gpu nominalQuota: 2 --- From 8d32013a193555bcd9c51667340b54ee01e14d3b Mon Sep 17 00:00:00 2001 From: alyssavu Date: Wed, 13 May 2026 00:57:02 +0000 Subject: [PATCH 4/4] feat(examples): add Helm provider to Terraform and update region - Add Helm provider to deploy Kueue and KubeRay via Terraform - Align Terraform defaults with setup.sh (VM sizes, node counts) - Update region to centralus in both setup.sh and Terraform - Simplify deploy.sh now that Terraform handles Helm releases Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- examples/ray/aks-classic/deploy.sh | 114 +++----------------------- examples/ray/aks-classic/main.tf | 29 ++++++- examples/ray/aks-classic/providers.tf | 13 +++ examples/ray/aks-classic/variables.tf | 42 ++++++++-- examples/ray/setup.sh | 2 +- 5 files changed, 88 insertions(+), 112 deletions(-) diff --git a/examples/ray/aks-classic/deploy.sh b/examples/ray/aks-classic/deploy.sh index 01113bd1c..2427f68a1 100755 --- a/examples/ray/aks-classic/deploy.sh +++ b/examples/ray/aks-classic/deploy.sh @@ -1,8 +1,9 @@ #!/bin/bash +set -euo pipefail # Check if the user is logged into Azure CLI if ! az account show > /dev/null 2>&1; then - echo "Please login to Azure CLI using 'az login' before running this script." + echo "Please sign in to Azure CLI using 'az login' before running this script." exit 1 fi @@ -15,114 +16,21 @@ terraform plan -out main.tfplan # Apply the Terraform plan terraform apply main.tfplan -# Retrieve the Terraform outputs and store in variables +# Retrieve the Terraform outputs resource_group_name=$(terraform output -raw resource_group_name) -system_node_pool_name=$(terraform output -raw system_node_pool_name) aks_cluster_name=$(terraform output -raw kubernetes_cluster_name) # Get AKS credentials for the cluster az aks get-credentials \ - --resource-group $resource_group_name \ - --name $aks_cluster_name + --resource-group "$resource_group_name" \ + --name "$aks_cluster_name" \ + --overwrite-existing -# Create the kuberay namespace -kuberay_namespace="kuberay" -kubectl create namespace $kuberay_namespace - -# Output the current Kubernetes context -current_context=$(kubectl config current-context) -echo "Current Kubernetes Context: $current_context" - -# Output the nodes in the cluster +echo "=== Cluster nodes ===" kubectl get nodes -# Check Helm version -helm version - -# Add the KubeRay Helm repository -helm repo add kuberay https://ray-project.github.io/kuberay-helm/ - -# Update the Helm repository -helm repo update - -# Install or upgrade the KubeRay operator using Helm -helm upgrade \ ---install \ ---cleanup-on-fail \ ---wait \ ---timeout 10m0s \ ---namespace "$kuberay_namespace" \ ---create-namespace kuberay-operator kuberay/kuberay-operator \ ---version 1.1.1 - -# Output the pods in the kuberay namespace -kubectl get pods -n $kuberay_namespace - -# Download the PyTorch MNIST job YAML file -curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml - -# Train a PyTorch Model on Fashion MNIST -kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml - -# Output the pods in the kuberay namespace -kubectl get pods -n $kuberay_namespace - -# Get the status of the Ray job -job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}') - -# Wait for the Ray job to complete -while [ "$job_status" != "Complete" ]; do - echo -ne "Job Status: $job_status\\r" - sleep 30 - job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}') -done -echo "Job Status: $job_status" - -# Check if the job succeeded -job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobStatus}') - -if [ "$job_status" != "SUCCEEDED" ]; then - echo "Job Failed!" - exit 1 -fi - -# If the job succeeded, get the Ray cluster head service -rayclusterhead=$(kubectl get service -n $kuberay_namespace | grep 'rayjob-pytorch-mnist-raycluster' | grep 'ClusterIP' | awk '{print $1}') - -# Now create a service of type NodePort for the Ray cluster head -kubectl expose service $rayclusterhead \ --n $kuberay_namespace \ ---port=80 \ ---target-port=8265 \ ---type=NodePort \ ---name=ray-dash - -# Create an ingress for the KubeRay dashboard -cat <