diff --git a/examples/ray/README.md b/examples/ray/README.md new file mode 100644 index 000000000..98c326303 --- /dev/null +++ b/examples/ray/README.md @@ -0,0 +1,75 @@ +# Running Ray on Azure Kubernetes Service (AKS) + +This example demonstrates how to deploy and run a [Ray](https://www.ray.io/) application on AKS using the KubeRay operator. + +## Prerequisites + +- An AKS cluster (Kubernetes 1.26+) +- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured to access your cluster +- [Helm](https://helm.sh/docs/intro/install/) 3.x installed + +## Overview + +Ray is an open-source framework for scaling AI and Python workloads. This example deploys: + +1. The **KubeRay operator** to manage Ray clusters on Kubernetes +2. A **RayCluster** custom resource with a head node and worker nodes +3. A sample **Ray job** to verify the deployment + +## Deploy the KubeRay operator + +```bash +helm repo add kuberay https://ray-project.github.io/kuberay-helm/ +helm repo update + +helm install kuberay-operator kuberay/kuberay-operator \ + --namespace kuberay-system \ + --create-namespace +``` + +Verify the operator is running: + +```bash +kubectl get pods -n kuberay-system +``` + +## Deploy the RayCluster + +```bash +kubectl apply -f ray-cluster.yaml +``` + +Wait for the cluster to be ready: + +```bash +kubectl get rayclusters +kubectl get pods -l ray.io/cluster=ray-cluster +``` + +## Submit a sample job + +```bash +kubectl apply -f ray-job.yaml +``` + +Check job status: + +```bash +kubectl get rayjobs +kubectl logs -l job-name=ray-sample-job +``` + +## Clean up + +```bash +kubectl delete -f ray-job.yaml +kubectl delete -f ray-cluster.yaml +helm uninstall kuberay-operator -n kuberay-system +kubectl delete namespace kuberay-system +``` + +## Resources + +- [Ray documentation](https://docs.ray.io/) +- [KubeRay documentation](https://ray-project.github.io/kuberay/) +- [AKS documentation](https://learn.microsoft.com/azure/aks) diff --git a/examples/ray/aks-classic/deploy.sh b/examples/ray/aks-classic/deploy.sh new file mode 100755 index 000000000..2427f68a1 --- /dev/null +++ b/examples/ray/aks-classic/deploy.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -euo pipefail + +# Check if the user is logged into Azure CLI +if ! az account show > /dev/null 2>&1; then + echo "Please sign in to Azure CLI using 'az login' before running this script." + exit 1 +fi + +# Initialize Terraform +terraform init + +# Create a Terraform plan +terraform plan -out main.tfplan + +# Apply the Terraform plan +terraform apply main.tfplan + +# Retrieve the Terraform outputs +resource_group_name=$(terraform output -raw resource_group_name) +aks_cluster_name=$(terraform output -raw kubernetes_cluster_name) + +# Get AKS credentials for the cluster +az aks get-credentials \ + --resource-group "$resource_group_name" \ + --name "$aks_cluster_name" \ + --overwrite-existing + +echo "=== Cluster nodes ===" +kubectl get nodes + +echo "=== Verifying installations ===" +kubectl get pods -n kueue-system +kubectl get pods -n kuberay-system + +echo "=== Setup complete ===" \ No newline at end of file diff --git a/examples/ray/aks-classic/main.tf b/examples/ray/aks-classic/main.tf new file mode 100644 index 000000000..4576026a6 --- /dev/null +++ b/examples/ray/aks-classic/main.tf @@ -0,0 +1,122 @@ +# Generate random resource group name +resource "random_pet" "rg_name" { + prefix = var.resource_group_name_prefix +} + +resource "azurerm_resource_group" "rg" { + location = var.resource_group_location + name = random_pet.rg_name.id +} + +resource "random_pet" "azurerm_kubernetes_cluster_name" { + prefix = "cluster" +} + +resource "random_pet" "azurerm_kubernetes_cluster_dns_prefix" { + prefix = "dns" +} + +resource "azurerm_kubernetes_cluster" "k8s" { + location = azurerm_resource_group.rg.location + name = random_pet.azurerm_kubernetes_cluster_name.id + resource_group_name = azurerm_resource_group.rg.name + dns_prefix = random_pet.azurerm_kubernetes_cluster_dns_prefix.id + kubernetes_version = var.kubernetes_version + + identity { + type = "SystemAssigned" + } + + default_node_pool { + name = "systempool" + vm_size = var.system_node_pool_vm_size + node_count = var.system_node_pool_node_count + tags = { owner = var.resource_group_owner } + } + + linux_profile { + admin_username = var.username + + ssh_key { + key_data = azapi_resource_action.ssh_public_key_gen.output.publicKey + } + } + + network_profile { + network_plugin = "azure" + } + + web_app_routing { + dns_zone_ids = [] + } +} + +resource "null_resource" "wait_for_aks" { + depends_on = [azurerm_kubernetes_cluster.k8s] + + provisioner "local-exec" { + command = <) and admits it +# once head 2 CPU + 2 workers × 1 CPU = 4 CPU quota is available. +# 3. KubeRay spawns head + 2 workers; Ray Data runs image classification on CPU. +# +# Uses the custom Ray image built from images/ray/. {{RAY_IMAGE}} is resolved at +# test time from the RAY_E2E_IMAGE env var, which CI sets to the image pushed to +# the PR-scoped ACR. {{RAY_VERSION}} is resolved from images/ray/Makefile. +apiVersion: ray.io/v1 +kind: RayJob +metadata: + name: cpu-inference + namespace: ray + labels: + kueue.x-k8s.io/queue-name: e2e-stack-queue +spec: + entrypoint: python /home/ray/scripts/inference_job.py + shutdownAfterJobFinishes: true + ttlSecondsAfterFinished: 600 + runtimeEnvYAML: | + pip: + packages: + - torch + - torchvision + pip_install_options: + - --index-url + - https://download.pytorch.org/whl/cpu + - --extra-index-url + - https://pypi.org/simple + pip_check: false + rayClusterSpec: + rayVersion: "{{RAY_VERSION}}" + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + num-cpus: "0" # Don't schedule Ray tasks on head; Kueue still counts its K8s CPU requests. + dashboard-host: "0.0.0.0" + template: + spec: + containers: + - name: ray-head + image: {{RAY_IMAGE}} + resources: + requests: + cpu: "2" + memory: "8Gi" + limits: + cpu: "2" + memory: "8Gi" + startupProbe: + httpGet: + path: /api/version + port: 8265 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 60 + volumeMounts: + - name: inference-script + mountPath: /home/ray/scripts + volumes: + - name: inference-script + configMap: + name: cpu-inference-script + workerGroupSpecs: + - groupName: workers + replicas: 2 + minReplicas: 2 + maxReplicas: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: {{RAY_IMAGE}} + resources: + requests: + cpu: "1" + memory: "4Gi" + limits: + cpu: "1" + memory: "4Gi" + startupProbe: + httpGet: + path: /api/healthz + port: 52365 + periodSeconds: 5 + timeoutSeconds: 2 + failureThreshold: 60 diff --git a/examples/ray/inference-cpu/inference_job.py b/examples/ray/inference-cpu/inference_job.py new file mode 100644 index 000000000..2aaaa1082 --- /dev/null +++ b/examples/ray/inference-cpu/inference_job.py @@ -0,0 +1,85 @@ +""" +Image classification test for e2e validation. +Adapted from aks-unbounded/ray/inference_job.py. + +Uses a locally-constructed model with random weights instead of downloading from +HuggingFace. This avoids network dependencies (SSL proxy issues, rate limits) while +still exercising the full Ray Data distributed inference pipeline: + - ActorPoolStrategy distributes work across workers + - Each actor loads a model, processes batches, returns predictions + - Ray Data orchestrates the dataflow end-to-end + +Auto-detects GPU: if torch.cuda.is_available(), actors declare num_gpus=1 and the +model runs on CUDA; otherwise falls back to CPU. Ray Data infers actor count from +the ActorPoolStrategy size; the cluster must have enough GPU/CPU quota to schedule. +""" +import ray +import numpy as np + +ray.init() +cluster_resources = ray.cluster_resources() +print(f"Cluster resources: {cluster_resources}") +GPU_AVAILABLE = cluster_resources.get("GPU", 0) > 0 +print(f"GPU_AVAILABLE = {GPU_AVAILABLE}") + +# Generate synthetic images in-memory (no S3 dependency) +NUM_IMAGES = 10 +synthetic_images = [ + {"image": np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)} + for _ in range(NUM_IMAGES) +] + +ds = ray.data.from_items(synthetic_images) + +LABELS = ["cat", "dog", "bird", "fish", "car"] + +class ImageClassifier: + def __init__(self): + import torch + import torch.nn as nn + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"Device: {self.device}") + # Simple conv net with random weights — no download needed. + self.model = nn.Sequential( + nn.Conv2d(3, 16, 3, stride=2), + nn.ReLU(), + nn.AdaptiveAvgPool2d(1), + nn.Flatten(), + nn.Linear(16, len(LABELS)), + ).to(self.device) + self.model.eval() + print("Model loaded successfully") + + def __call__(self, batch): + import torch + + images = torch.from_numpy( + np.stack(batch["image"]) + ).permute(0, 3, 1, 2).float().to(self.device) / 255.0 + + with torch.no_grad(): + logits = self.model(images) + indices = logits.argmax(dim=1) + + batch["label"] = [LABELS[i] for i in indices.tolist()] + batch["score"] = [logits[j, indices[j]].item() for j in range(len(indices))] + return batch + +# Use fewer actors on GPU (1 worker × 1 GPU) than CPU (2 workers × 2 CPU). +POOL_SIZE = 1 if GPU_AVAILABLE else 2 + +predictions = ds.map_batches( + ImageClassifier, + compute=ray.data.ActorPoolStrategy(size=POOL_SIZE), + batch_size=4, + **({"num_gpus": 1} if GPU_AVAILABLE else {}), +) + +results = predictions.take_all() +print(f"\nInference complete: {len(results)} images classified") +for i, r in enumerate(results): + print(f" Image {i}: Label: {r['label']} (score: {r['score']:.4f})") + +print("\nSUCCESS: All images classified") + diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml new file mode 100644 index 000000000..0bb26c836 --- /dev/null +++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml @@ -0,0 +1,45 @@ +--- +# Namespace for the Ray example. +apiVersion: v1 +kind: Namespace +metadata: + name: ray +--- +# ResourceFlavor — represents CPU hardware for the stack test. +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ResourceFlavor +metadata: + name: e2e-stack-flavor +spec: {} +--- +# ClusterQueue — sized to admit both the CPU-only inference job (head 2 + 2 workers × 1 CPU + submitter) +# and the GPU variant (head 1 + 1 worker × 2 CPU + 1 GPU + submitter). GPU quota is only exercised +# on GPU-capable clusters; on CPU clusters workloads that don't request GPU are unaffected. +apiVersion: kueue.x-k8s.io/v1beta2 +kind: ClusterQueue +metadata: + name: e2e-stack-cluster-queue +spec: + namespaceSelector: {} + queueingStrategy: BestEffortFIFO + resourceGroups: + - coveredResources: ["cpu", "memory", "nvidia.com/gpu"] + flavors: + - name: e2e-stack-flavor + resources: + - name: cpu + nominalQuota: 6 + - name: memory + nominalQuota: 24Gi + - name: nvidia.com/gpu + nominalQuota: 2 +--- +# LocalQueue — RayJobs must carry label kueue.x-k8s.io/queue-name: e2e-stack-queue +# to be managed by Kueue. +apiVersion: kueue.x-k8s.io/v1beta2 +kind: LocalQueue +metadata: + name: e2e-stack-queue + namespace: ray +spec: + clusterQueue: e2e-stack-cluster-queue diff --git a/examples/ray/setup.sh b/examples/ray/setup.sh new file mode 100755 index 000000000..56e17ecdd --- /dev/null +++ b/examples/ray/setup.sh @@ -0,0 +1,165 @@ +#!/bin/bash +set -euo pipefail + +# --- Infrastructure Configuration --- +RESOURCE_GROUP="${RESOURCE_GROUP:-ray-example-rg}" +LOCATION="${LOCATION:-centralus}" +CLUSTER_NAME="${CLUSTER_NAME:-demo}" +NODE_COUNT="${NODE_COUNT:-3}" +NODE_VM_SIZE="${NODE_VM_SIZE:-Standard_D4ds_v4}" +KUBERNETES_VERSION="${KUBERNETES_VERSION:-1.35}" +NODEPOOL_NAME="${NODEPOOL_NAME:-cpu-pool}" +NODEPOOL_NODE_COUNT="${NODEPOOL_NODE_COUNT:-2}" +NODEPOOL_VM_SIZE="${NODEPOOL_VM_SIZE:-Standard_D16ds_v7}" + +# --- Helm Charts Configuration --- +HELM_REGISTRY="${HELM_REGISTRY:-oci://mcr.microsoft.com/aks/ai-runtime/helm}" +KUEUE_VERSION="${KUEUE_VERSION:-0.17.1}" +KUBERAY_OPERATOR_VERSION="${KUBERAY_OPERATOR_VERSION:-1.6.1}" + +# --- Inference CPU Configuration --- +RAY_IMAGE="${RAY_IMAGE:-mcr.microsoft.com/aks/ai-runtime/ray:py3.12-ray2.54.0}" +RAY_VERSION="${RAY_VERSION:-2.54.0}" +INFERENCE_NAMESPACE="ray" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +create_infra() { + echo "=== Creating resource group ===" + az group create \ + --name "$RESOURCE_GROUP" \ + --location "$LOCATION" + + echo "=== Creating AKS cluster ===" + az aks create \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --location "$LOCATION" \ + --node-count "$NODE_COUNT" \ + --node-vm-size "$NODE_VM_SIZE" \ + --kubernetes-version "$KUBERNETES_VERSION" \ + --generate-ssh-keys \ + --enable-managed-identity + + echo "=== Adding cpu worker node pool ===" + az aks nodepool add \ + --resource-group "$RESOURCE_GROUP" \ + --cluster-name "$CLUSTER_NAME" \ + --name "$NODEPOOL_NAME" \ + --node-count "$NODEPOOL_NODE_COUNT" \ + --node-vm-size "$NODEPOOL_VM_SIZE" + + echo "=== Getting cluster credentials ===" + az aks get-credentials \ + --resource-group "$RESOURCE_GROUP" \ + --name "$CLUSTER_NAME" \ + --overwrite-existing +} + +install_operators() { + echo "=== Installing Kueue (v${KUEUE_VERSION}) ===" + helm upgrade --install kueue "$HELM_REGISTRY/kueue" \ + --version "$KUEUE_VERSION" \ + --namespace kueue-system \ + --create-namespace \ + --wait + + echo "=== Installing KubeRay Operator (v${KUBERAY_OPERATOR_VERSION}) ===" + helm upgrade --install kuberay-operator "$HELM_REGISTRY/kuberay-operator" \ + --version "$KUBERAY_OPERATOR_VERSION" \ + --namespace kuberay-system \ + --create-namespace \ + --wait +} + +status() { + echo "=== Cluster status ===" + kubectl get nodes + echo "" + kubectl get pods -n kueue-system + echo "" + kubectl get pods -n kuberay-system +} + +run_inference_cpu() { + echo "=== Deploying Kueue resources ===" + kubectl apply -f "$SCRIPT_DIR/inference-cpu/stack-kueue-resources.yaml" + + echo "=== Creating inference script ConfigMap ===" + kubectl create configmap cpu-inference-script \ + --from-file=inference_job.py="$SCRIPT_DIR/inference-cpu/inference_job.py" \ + --namespace "$INFERENCE_NAMESPACE" \ + --dry-run=client -o yaml | kubectl apply -f - + + echo "=== Submitting RayJob ===" + sed "s|{{RAY_IMAGE}}|${RAY_IMAGE}|g; s|{{RAY_VERSION}}|${RAY_VERSION}|g" \ + "$SCRIPT_DIR/inference-cpu/inference-rayjob.yaml" | kubectl apply -f - + + echo "=== Waiting for RayJob to complete ===" + kubectl wait --for=jsonpath='{.status.jobStatus}'=SUCCEEDED \ + rayjob/cpu-inference \ + --namespace "$INFERENCE_NAMESPACE" \ + --timeout=600s + + echo "=== Job logs ===" + kubectl logs -l job-name=cpu-inference --namespace "$INFERENCE_NAMESPACE" --tail=50 +} + +all() { + check_prerequisites + create_infra + install_operators + status +} + +usage() { + echo "Usage: $0 " + echo "" + echo "Commands:" + echo " check_prerequisites Check and install required tools (az, helm)" + echo " create_infra Create resource group, AKS cluster, node pool, and fetch credentials" + echo " install_operators Install Kueue and KubeRay from MCR" + echo " run_inference_cpu Deploy and run the CPU inference example" + echo " status Show cluster and operator pod status" + echo " all Run all steps end-to-end" + echo "" + echo "Examples:" + echo " $0 install_operators # Just install Kueue + KubeRay" + echo " $0 run_inference_cpu # Run CPU inference example" + echo " $0 all # Full setup from scratch" +} + +# --- Prerequisites --- +install_az_cli() { + echo "Installing Azure CLI..." + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash +} + +install_helm() { + echo "Installing Helm..." + curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash +} + +check_prerequisites() { + if ! command -v az &> /dev/null; then + echo "Azure CLI (az) not found. Installing..." + install_az_cli + fi + + if ! command -v helm &> /dev/null; then + echo "Helm not found. Installing..." + install_helm + fi + + echo "Prerequisites satisfied: az $(az version --query '"azure-cli"' -o tsv), helm $(helm version --short)" +} + +COMMAND="${1:-}" +case "$COMMAND" in + check_prerequisites|create_infra|install_operators|run_inference_cpu|status|all) + "$COMMAND" + ;; + *) + usage + exit 1 + ;; +esac \ No newline at end of file