From 899908bfb2cc7e4efe1c23b0553b61a3f8ee1cc8 Mon Sep 17 00:00:00 2001
From: alyssavu <alyssavu@microsoft.com>
Date: Wed, 13 May 2026 00:03:42 +0000
Subject: [PATCH 1/4] feat(examples): add Ray on AKS example

Add example for running Ray applications on AKS, including:
- Setup script with commands for infra, Kueue, and KubeRay operator
- RayCluster and RayJob manifests
- Terraform-based AKS deployment (aks-classic)
- CPU inference example with Kueue integration

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 examples/ray/README.md                        |  75 ++++++++++
 examples/ray/aks-classic/deploy.sh            | 128 ++++++++++++++++
 examples/ray/aks-classic/main.tf              |  97 ++++++++++++
 examples/ray/aks-classic/outputs.tf           |  46 ++++++
 examples/ray/aks-classic/providers.tf         |  28 ++++
 examples/ray/aks-classic/ssh.tf               |  25 ++++
 examples/ray/aks-classic/variables.tf         |  51 +++++++
 .../ray/inference-cpu/inference-rayjob.yaml   |  91 ++++++++++++
 examples/ray/inference-cpu/inference_job.py   |  85 +++++++++++
 .../inference-cpu/stack-kueue-resources.yaml  |  45 ++++++
 examples/ray/setup.sh                         | 140 ++++++++++++++++++
 11 files changed, 811 insertions(+)
 create mode 100644 examples/ray/README.md
 create mode 100755 examples/ray/aks-classic/deploy.sh
 create mode 100644 examples/ray/aks-classic/main.tf
 create mode 100644 examples/ray/aks-classic/outputs.tf
 create mode 100644 examples/ray/aks-classic/providers.tf
 create mode 100644 examples/ray/aks-classic/ssh.tf
 create mode 100644 examples/ray/aks-classic/variables.tf
 create mode 100644 examples/ray/inference-cpu/inference-rayjob.yaml
 create mode 100644 examples/ray/inference-cpu/inference_job.py
 create mode 100644 examples/ray/inference-cpu/stack-kueue-resources.yaml
 create mode 100755 examples/ray/setup.sh

diff --git a/examples/ray/README.md b/examples/ray/README.md
new file mode 100644
index 000000000..98c326303
--- /dev/null
+++ b/examples/ray/README.md
@@ -0,0 +1,75 @@
+# Running Ray on Azure Kubernetes Service (AKS)
+
+This example demonstrates how to deploy and run a [Ray](https://www.ray.io/) application on AKS using the KubeRay operator.
+
+## Prerequisites
+
+- An AKS cluster (Kubernetes 1.26+)
+- [kubectl](https://kubernetes.io/docs/tasks/tools/) configured to access your cluster
+- [Helm](https://helm.sh/docs/intro/install/) 3.x installed
+
+## Overview
+
+Ray is an open-source framework for scaling AI and Python workloads. This example deploys:
+
+1. The **KubeRay operator** to manage Ray clusters on Kubernetes
+2. A **RayCluster** custom resource with a head node and worker nodes
+3. A sample **Ray job** to verify the deployment
+
+## Deploy the KubeRay operator
+
+```bash
+helm repo add kuberay https://ray-project.github.io/kuberay-helm/
+helm repo update
+
+helm install kuberay-operator kuberay/kuberay-operator \
+  --namespace kuberay-system \
+  --create-namespace
+```
+
+Verify the operator is running:
+
+```bash
+kubectl get pods -n kuberay-system
+```
+
+## Deploy the RayCluster
+
+```bash
+kubectl apply -f ray-cluster.yaml
+```
+
+Wait for the cluster to be ready:
+
+```bash
+kubectl get rayclusters
+kubectl get pods -l ray.io/cluster=ray-cluster
+```
+
+## Submit a sample job
+
+```bash
+kubectl apply -f ray-job.yaml
+```
+
+Check job status:
+
+```bash
+kubectl get rayjobs
+kubectl logs -l job-name=ray-sample-job
+```
+
+## Clean up
+
+```bash
+kubectl delete -f ray-job.yaml
+kubectl delete -f ray-cluster.yaml
+helm uninstall kuberay-operator -n kuberay-system
+kubectl delete namespace kuberay-system
+```
+
+## Resources
+
+- [Ray documentation](https://docs.ray.io/)
+- [KubeRay documentation](https://ray-project.github.io/kuberay/)
+- [AKS documentation](https://learn.microsoft.com/azure/aks)
diff --git a/examples/ray/aks-classic/deploy.sh b/examples/ray/aks-classic/deploy.sh
new file mode 100755
index 000000000..01113bd1c
--- /dev/null
+++ b/examples/ray/aks-classic/deploy.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+# Check if the user is logged into Azure CLI
+if ! az account show > /dev/null 2>&1; then
+    echo "Please login to Azure CLI using 'az login' before running this script."
+    exit 1
+fi
+
+# Initialize Terraform
+terraform init
+
+# Create a Terraform plan
+terraform plan -out main.tfplan
+
+# Apply the Terraform plan
+terraform apply main.tfplan
+
+# Retrieve the Terraform outputs and store in variables
+resource_group_name=$(terraform output -raw resource_group_name)
+system_node_pool_name=$(terraform output -raw system_node_pool_name)
+aks_cluster_name=$(terraform output -raw kubernetes_cluster_name)
+
+# Get AKS credentials for the cluster
+az aks get-credentials \
+    --resource-group $resource_group_name \
+    --name $aks_cluster_name
+
+# Create the kuberay namespace
+kuberay_namespace="kuberay"
+kubectl create namespace $kuberay_namespace
+
+# Output the current Kubernetes context
+current_context=$(kubectl config current-context)
+echo "Current Kubernetes Context: $current_context"
+
+# Output the nodes in the cluster
+kubectl get nodes
+
+# Check Helm version
+helm version
+
+# Add the KubeRay Helm repository
+helm repo add kuberay https://ray-project.github.io/kuberay-helm/
+
+# Update the Helm repository
+helm repo update
+
+# Install or upgrade the KubeRay operator using Helm
+helm upgrade \
+--install \
+--cleanup-on-fail \
+--wait \
+--timeout 10m0s \
+--namespace "$kuberay_namespace" \
+--create-namespace kuberay-operator kuberay/kuberay-operator \
+--version 1.1.1
+
+# Output the pods in the kuberay namespace
+kubectl get pods -n $kuberay_namespace
+
+# Download the PyTorch MNIST job YAML file
+curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml
+
+# Train a PyTorch Model on Fashion MNIST
+kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml
+
+# Output the pods in the kuberay namespace
+kubectl get pods -n $kuberay_namespace
+
+# Get the status of the Ray job
+job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}')
+
+# Wait for the Ray job to reach a terminal deployment state (Complete or Failed)
+while [ "$job_status" != "Complete" ] && [ "$job_status" != "Failed" ]; do
+    echo -ne "Job Status: $job_status\\r"
+    sleep 30
+    job_status=$(kubectl get rayjobs -n "$kuberay_namespace" -o jsonpath='{.items[0].status.jobDeploymentStatus}')
+done
+echo "Job Status: $job_status"
+
+# Check if the job succeeded
+job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobStatus}')
+
+if [ "$job_status" != "SUCCEEDED" ]; then
+    echo "Job Failed!"
+    exit 1
+fi
+
+# If the job succeeded, get the Ray cluster head service
+rayclusterhead=$(kubectl get service -n $kuberay_namespace | grep 'rayjob-pytorch-mnist-raycluster' | grep 'ClusterIP' | awk '{print $1}')
+
+# Now create a service of type NodePort for the Ray cluster head
+kubectl expose service $rayclusterhead \
+-n $kuberay_namespace \
+--port=80 \
+--target-port=8265 \
+--type=NodePort \
+--name=ray-dash
+
+# Create an ingress for the KubeRay dashboard in the same namespace as the ray-dash service
+cat <<EOF | kubectl apply -f -
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: ray-dash
+  namespace: $kuberay_namespace
+  annotations:
+    nginx.ingress.kubernetes.io/rewrite-target: /
+spec:
+  ingressClassName: webapprouting.kubernetes.azure.com
+  rules:
+  - http:
+      paths:
+      - backend:
+          service:
+            name: ray-dash
+            port:
+              number: 80
+        path: /
+        pathType: Prefix
+EOF
+
+# Now find the public IP address of the ingress controller
+lb_public_ip=$(kubectl get svc -n app-routing-system -o jsonpath='{.items[?(@.metadata.name == "nginx")].status.loadBalancer.ingress[0].ip}')
+
+echo "KubeRay Dashboard URL: http://$lb_public_ip/"
+
+exit 0
\ No newline at end of file
diff --git a/examples/ray/aks-classic/main.tf b/examples/ray/aks-classic/main.tf
new file mode 100644
index 000000000..e7e605236
--- /dev/null
+++ b/examples/ray/aks-classic/main.tf
@@ -0,0 +1,97 @@
+# Generate random resource group name
+resource "random_pet" "rg_name" {
+  prefix = var.resource_group_name_prefix
+}
+
+resource "azurerm_resource_group" "rg" {
+  location = var.resource_group_location
+  name     = random_pet.rg_name.id
+}
+
+resource "random_pet" "azurerm_kubernetes_cluster_name" {
+  prefix = "cluster"
+}
+
+resource "random_pet" "azurerm_kubernetes_cluster_dns_prefix" {
+  prefix = "dns"
+}
+
+resource "azurerm_kubernetes_cluster" "k8s" {
+  location                         = azurerm_resource_group.rg.location
+  name                             = random_pet.azurerm_kubernetes_cluster_name.id
+  resource_group_name              = azurerm_resource_group.rg.name
+  dns_prefix                       = random_pet.azurerm_kubernetes_cluster_dns_prefix.id
+    
+  identity {
+    type = "SystemAssigned"
+  }
+
+  default_node_pool {
+    name       = "systempool"
+    vm_size    = var.system_node_pool_vm_size
+    node_count = var.system_node_pool_node_count
+    tags = { owner = var.resource_group_owner }
+  }
+
+  linux_profile {
+    admin_username = var.username
+
+    ssh_key {
+      key_data = azapi_resource_action.ssh_public_key_gen.output.publicKey
+    }
+  }
+
+  network_profile {
+    network_plugin    = "azure"
+  }
+
+  web_app_routing {
+    dns_zone_ids = []
+  }
+}
+
+resource "null_resource" "wait_for_aks" {
+  depends_on = [azurerm_kubernetes_cluster.k8s]
+
+  provisioner "local-exec" {
+    command = <<EOT
+      max_retries=10
+      retries=0
+      while [ "$(az aks show --resource-group ${azurerm_resource_group.rg.name} --name ${azurerm_kubernetes_cluster.k8s.name} --query "provisioningState" -o tsv)" != "Succeeded" ]; do
+        if [ $retries -ge $max_retries ]; then
+          echo "Max retries exceeded. Exiting..."
+          exit 1
+        fi
+        echo "Waiting for AKS cluster to be fully provisioned... (Attempt: $((retries+1)))"
+        retries=$((retries+1))
+        sleep 30
+      done
+    EOT
+  }
+}
+
+resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" {
+  type        = "Microsoft.ContainerService/managedClusters@2024-09-02-preview"
+  resource_id = azurerm_kubernetes_cluster.k8s.id
+  body = jsonencode({
+    properties = {
+      agentPoolProfiles = [
+        {
+          name = "systempool"
+          nodeTaints = ["CriticalAddonsOnly=true:NoSchedule"]
+        }
+      ]
+    }
+  })
+
+  depends_on = [null_resource.wait_for_aks]
+}
+
+resource "azurerm_kubernetes_cluster_node_pool" "workload" {
+  name                  = "ray"
+  kubernetes_cluster_id = azurerm_kubernetes_cluster.k8s.id
+  vm_size               = var.ray_node_pool_vm_size
+  node_count            = 4
+
+  depends_on = [azapi_update_resource.k8s-default-node-pool-systempool-taint]
+}
\ No newline at end of file
diff --git a/examples/ray/aks-classic/outputs.tf b/examples/ray/aks-classic/outputs.tf
new file mode 100644
index 000000000..fe85c3fcd
--- /dev/null
+++ b/examples/ray/aks-classic/outputs.tf
@@ -0,0 +1,46 @@
+output "resource_group_name" {
+  value = azurerm_resource_group.rg.name
+}
+
+output "kubernetes_cluster_name" {
+  value = azurerm_kubernetes_cluster.k8s.name
+}
+
+output "client_certificate" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].client_certificate
+  sensitive = true
+}
+
+output "client_key" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].client_key
+  sensitive = true
+}
+
+output "cluster_ca_certificate" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].cluster_ca_certificate
+  sensitive = true
+}
+
+output "cluster_password" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].password
+  sensitive = true
+}
+
+output "cluster_username" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].username
+  sensitive = true
+}
+
+output "host" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config[0].host
+  sensitive = true
+}
+
+output "kube_config" {
+  value     = azurerm_kubernetes_cluster.k8s.kube_config_raw
+  sensitive = true
+}
+
+output "system_node_pool_name" {
+  value = azurerm_kubernetes_cluster.k8s.default_node_pool[0].name
+}
diff --git a/examples/ray/aks-classic/providers.tf b/examples/ray/aks-classic/providers.tf
new file mode 100644
index 000000000..703b01589
--- /dev/null
+++ b/examples/ray/aks-classic/providers.tf
@@ -0,0 +1,28 @@
+terraform {
+  required_version = ">=1.0"
+
+  required_providers {
+    azapi = {
+      source  = "azure/azapi"
+      version = "~>1.5"
+    }
+    azurerm = {
+      source  = "hashicorp/azurerm"
+      version = "~>4.13"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = "~>3.0"
+    }
+    time = {
+      source  = "hashicorp/time"
+      version = "0.9.1"
+    }
+  }
+}
+
+provider "azurerm" {
+  features {}
+
+  subscription_id = var.subscription_id
+}
\ No newline at end of file
diff --git a/examples/ray/aks-classic/ssh.tf b/examples/ray/aks-classic/ssh.tf
new file mode 100644
index 000000000..a65439495
--- /dev/null
+++ b/examples/ray/aks-classic/ssh.tf
@@ -0,0 +1,25 @@
+
+resource "random_pet" "ssh_key_name" {
+  prefix    = "ssh"
+  separator = ""
+}
+
+resource "azapi_resource_action" "ssh_public_key_gen" {
+  type        = "Microsoft.Compute/sshPublicKeys@2022-11-01"
+  resource_id = azapi_resource.ssh_public_key.id
+  action      = "generateKeyPair"
+  method      = "POST"
+
+  response_export_values = ["publicKey", "privateKey"]
+}
+
+resource "azapi_resource" "ssh_public_key" {
+  type      = "Microsoft.Compute/sshPublicKeys@2022-11-01"
+  name      = random_pet.ssh_key_name.id
+  location  = azurerm_resource_group.rg.location
+  parent_id = azurerm_resource_group.rg.id
+}
+
+output "key_data" {
+  value = azapi_resource_action.ssh_public_key_gen.output.publicKey
+}
diff --git a/examples/ray/aks-classic/variables.tf b/examples/ray/aks-classic/variables.tf
new file mode 100644
index 000000000..2b519f7c3
--- /dev/null
+++ b/examples/ray/aks-classic/variables.tf
@@ -0,0 +1,51 @@
+variable "subscription_id" {
+  description = "The Azure subscription ID."
+  type        = string
+}
+
+variable "resource_group_owner" {
+  description = "The owner of the resource group."
+  type        = string
+}
+
+variable "resource_group_location" {
+  type        = string
+  default     = "westus3"
+  description = "Location of the resource group."
+}
+
+variable "resource_group_name_prefix" {
+  type        = string
+  default     = "rg"
+  description = "Prefix of the resource group name that's combined with a random ID so name is unique in your Azure subscription."
+}
+
+variable "system_node_pool_vm_size" {
+  type        = string
+  description = "The size of the Virtual Machine."
+  default     = "Standard_D2_v2"
+}
+
+variable "system_node_pool_node_count" {
+  type        = number
+  description = "The initial quantity of nodes for the system node pool."
+  default     = 1
+}
+
+variable "ray_node_pool_vm_size" {
+  type        = string
+  description = "The size of the Virtual Machine."
+  default     = "Standard_D4s_v4"
+}
+
+variable "msi_id" {
+  type        = string
+  description = "The Managed Service Identity ID. Set this value if you're running this example using Managed Identity as the authentication method."
+  default     = null
+}
+
+variable "username" {
+  type        = string
+  description = "The admin username for the new cluster."
+  default     = "azureadmin"
+}
\ No newline at end of file
diff --git a/examples/ray/inference-cpu/inference-rayjob.yaml b/examples/ray/inference-cpu/inference-rayjob.yaml
new file mode 100644
index 000000000..c953293c6
--- /dev/null
+++ b/examples/ray/inference-cpu/inference-rayjob.yaml
@@ -0,0 +1,91 @@
+# RayJob for the stack integration test.
+#
+# Flow:
+#   1. Labelled with kueue.x-k8s.io/queue-name so Kueue gates admission.
+#   2. Kueue creates a Workload (name: rayjob-e2e-inference-<hash>) and admits it
+#      once head 2 CPU + 2 workers × 1 CPU = 4 CPU quota is available.
+#   3. KubeRay spawns head + 2 workers; Ray Data runs image classification on CPU.
+#
+# Uses the custom Ray image built from images/ray/. {{RAY_IMAGE}} is resolved at
+# test time from the RAY_E2E_IMAGE env var, which CI sets to the image pushed to
+# the PR-scoped ACR. {{RAY_VERSION}} is resolved from images/ray/Makefile.
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+  name: e2e-inference
+  namespace: e2e-stack
+  labels:
+    kueue.x-k8s.io/queue-name: e2e-stack-queue
+spec:
+  entrypoint: python /home/ray/scripts/inference_job.py
+  shutdownAfterJobFinishes: true
+  ttlSecondsAfterFinished: 600
+  runtimeEnvYAML: |
+    pip:
+      packages:
+        - torch
+        - torchvision
+      pip_install_options:
+        - --index-url
+        - https://download.pytorch.org/whl/cpu
+        - --extra-index-url
+        - https://pypi.org/simple
+      pip_check: false
+  rayClusterSpec:
+    rayVersion: "{{RAY_VERSION}}"
+    enableInTreeAutoscaling: false
+    headGroupSpec:
+      rayStartParams:
+        num-cpus: "0"            # Don't schedule Ray tasks on head; Kueue still counts its K8s CPU requests.
+        dashboard-host: "0.0.0.0"
+      template:
+        spec:
+          containers:
+          - name: ray-head
+            image: {{RAY_IMAGE}}
+            resources:
+              requests:
+                cpu: "2"
+                memory: "4Gi"
+              limits:
+                cpu: "2"
+                memory: "4Gi"
+            startupProbe:
+              httpGet:
+                path: /api/version
+                port: 8265
+              periodSeconds: 5
+              timeoutSeconds: 2
+              failureThreshold: 60
+            volumeMounts:
+            - name: inference-script
+              mountPath: /home/ray/scripts
+          volumes:
+          - name: inference-script
+            configMap:
+              name: e2e-inference-script
+    workerGroupSpecs:
+    - groupName: workers
+      replicas: 2
+      minReplicas: 2
+      maxReplicas: 2
+      rayStartParams: {}
+      template:
+        spec:
+          containers:
+          - name: ray-worker
+            image: {{RAY_IMAGE}}
+            resources:
+              requests:
+                cpu: "1"
+                memory: "4Gi"
+              limits:
+                cpu: "1"
+                memory: "4Gi"
+            startupProbe:
+              httpGet:
+                path: /api/healthz
+                port: 52365
+              periodSeconds: 5
+              timeoutSeconds: 2
+              failureThreshold: 60
diff --git a/examples/ray/inference-cpu/inference_job.py b/examples/ray/inference-cpu/inference_job.py
new file mode 100644
index 000000000..2aaaa1082
--- /dev/null
+++ b/examples/ray/inference-cpu/inference_job.py
@@ -0,0 +1,85 @@
+"""
+Image classification test for e2e validation.
+Adapted from aks-unbounded/ray/inference_job.py.
+
+Uses a locally-constructed model with random weights instead of downloading from
+HuggingFace. This avoids network dependencies (SSL proxy issues, rate limits) while
+still exercising the full Ray Data distributed inference pipeline:
+  - ActorPoolStrategy distributes work across workers
+  - Each actor loads a model, processes batches, returns predictions
+  - Ray Data orchestrates the dataflow end-to-end
+
+Auto-detects GPU: if the Ray cluster reports GPU resources, actors declare num_gpus=1;
+each actor runs the model on CUDA when torch.cuda.is_available(), otherwise on CPU. The
+actor count is the ActorPoolStrategy size; the cluster must have enough GPU/CPU quota to schedule.
+"""
+import ray
+import numpy as np
+
+ray.init()
+cluster_resources = ray.cluster_resources()
+print(f"Cluster resources: {cluster_resources}")
+GPU_AVAILABLE = cluster_resources.get("GPU", 0) > 0
+print(f"GPU_AVAILABLE = {GPU_AVAILABLE}")
+
+# Generate synthetic images in-memory (no S3 dependency)
+NUM_IMAGES = 10
+synthetic_images = [
+    {"image": np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)}
+    for _ in range(NUM_IMAGES)
+]
+
+ds = ray.data.from_items(synthetic_images)
+
+LABELS = ["cat", "dog", "bird", "fish", "car"]
+
+class ImageClassifier:
+    def __init__(self):
+        import torch
+        import torch.nn as nn
+
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Device: {self.device}")
+        # Simple conv net with random weights — no download needed.
+        self.model = nn.Sequential(
+            nn.Conv2d(3, 16, 3, stride=2),
+            nn.ReLU(),
+            nn.AdaptiveAvgPool2d(1),
+            nn.Flatten(),
+            nn.Linear(16, len(LABELS)),
+        ).to(self.device)
+        self.model.eval()
+        print("Model loaded successfully")
+
+    def __call__(self, batch):
+        import torch
+
+        images = torch.from_numpy(
+            np.stack(batch["image"])
+        ).permute(0, 3, 1, 2).float().to(self.device) / 255.0
+
+        with torch.no_grad():
+            logits = self.model(images)
+            indices = logits.argmax(dim=1)
+
+        batch["label"] = [LABELS[i] for i in indices.tolist()]
+        batch["score"] = [logits[j, indices[j]].item() for j in range(len(indices))]
+        return batch
+
+# Use fewer actors on GPU (1 worker × 1 GPU) than CPU (2 workers × 1 CPU).
+POOL_SIZE = 1 if GPU_AVAILABLE else 2
+
+predictions = ds.map_batches(
+    ImageClassifier,
+    compute=ray.data.ActorPoolStrategy(size=POOL_SIZE),
+    batch_size=4,
+    **({"num_gpus": 1} if GPU_AVAILABLE else {}),
+)
+
+results = predictions.take_all()
+print(f"\nInference complete: {len(results)} images classified")
+for i, r in enumerate(results):
+    print(f"  Image {i}: Label: {r['label']} (score: {r['score']:.4f})")
+
+print("\nSUCCESS: All images classified")
+
diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml
new file mode 100644
index 000000000..11fcddd63
--- /dev/null
+++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml
@@ -0,0 +1,45 @@
+---
+# Namespace for the stack integration test.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: e2e-stack
+---
+# ResourceFlavor — represents CPU hardware for the stack test.
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: ResourceFlavor
+metadata:
+  name: e2e-stack-flavor
+spec: {}
+---
+# ClusterQueue — sized to admit both the CPU-only inference job (head 2 + 2 workers × 1 CPU + submitter)
+# and the GPU variant (head 1 + 1 worker × 2 CPU + 1 GPU + submitter). GPU quota is only exercised
+# on GPU-capable clusters; on CPU clusters workloads that don't request GPU are unaffected.
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: ClusterQueue
+metadata:
+  name: e2e-stack-cluster-queue
+spec:
+  namespaceSelector: {}
+  queueingStrategy: BestEffortFIFO
+  resourceGroups:
+  - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
+    flavors:
+    - name: e2e-stack-flavor
+      resources:
+      - name: cpu
+        nominalQuota: 6
+      - name: memory
+        nominalQuota: 20Gi
+      - name: nvidia.com/gpu
+        nominalQuota: 2
+---
+# LocalQueue — RayJobs must carry label kueue.x-k8s.io/queue-name: e2e-stack-queue
+# to be managed by Kueue.
+apiVersion: kueue.x-k8s.io/v1beta2
+kind: LocalQueue
+metadata:
+  name: e2e-stack-queue
+  namespace: e2e-stack
+spec:
+  clusterQueue: e2e-stack-cluster-queue
diff --git a/examples/ray/setup.sh b/examples/ray/setup.sh
new file mode 100755
index 000000000..5f1596d7e
--- /dev/null
+++ b/examples/ray/setup.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+set -euo pipefail
+
+# --- Infrastructure Configuration ---
+RESOURCE_GROUP="${RESOURCE_GROUP:-ray-example-rg}"
+LOCATION="${LOCATION:-eastus}"
+CLUSTER_NAME="${CLUSTER_NAME:-demo}"
+NODE_COUNT="${NODE_COUNT:-3}"
+NODE_VM_SIZE="${NODE_VM_SIZE:-Standard_D4ds_v4}"
+KUBERNETES_VERSION="${KUBERNETES_VERSION:-1.35}"
+NODEPOOL_NAME="${NODEPOOL_NAME:-cpu-pool}"
+NODEPOOL_NODE_COUNT="${NODEPOOL_NODE_COUNT:-2}"
+NODEPOOL_VM_SIZE="${NODEPOOL_VM_SIZE:-Standard_D16ds_v7}"
+
+# --- Helm Charts Configuration ---
+HELM_REGISTRY="${HELM_REGISTRY:-oci://mcr.microsoft.com/aks/ai-runtime/helm}"
+KUEUE_VERSION="${KUEUE_VERSION:-0.17.1}"
+KUBERAY_OPERATOR_VERSION="${KUBERAY_OPERATOR_VERSION:-1.6.1}"
+
+create_infra() {
+    echo "=== Creating resource group ==="
+    az group create \
+      --name "$RESOURCE_GROUP" \
+      --location "$LOCATION"
+
+    echo "=== Creating AKS cluster ==="
+    az aks create \
+      --resource-group "$RESOURCE_GROUP" \
+      --name "$CLUSTER_NAME" \
+      --location "$LOCATION" \
+      --node-count "$NODE_COUNT" \
+      --node-vm-size "$NODE_VM_SIZE" \
+      --kubernetes-version "$KUBERNETES_VERSION" \
+      --generate-ssh-keys \
+      --enable-managed-identity
+
+    echo "=== Adding cpu worker node pool ==="
+    az aks nodepool add \
+      --resource-group "$RESOURCE_GROUP" \
+      --cluster-name "$CLUSTER_NAME" \
+      --name "$NODEPOOL_NAME" \
+      --node-count "$NODEPOOL_NODE_COUNT" \
+      --node-vm-size "$NODEPOOL_VM_SIZE"
+
+    echo "=== Getting cluster credentials ==="
+    az aks get-credentials \
+      --resource-group "$RESOURCE_GROUP" \
+      --name "$CLUSTER_NAME" \
+      --overwrite-existing
+}
+
+install_kueue() {
+    echo "=== Installing Kueue (v${KUEUE_VERSION}) ==="
+    helm upgrade --install kueue "$HELM_REGISTRY/kueue" \
+      --version "$KUEUE_VERSION" \
+      --namespace kueue-system \
+      --create-namespace \
+      --wait
+}
+
+install_kuberay() {
+    echo "=== Installing KubeRay Operator (v${KUBERAY_OPERATOR_VERSION}) ==="
+    helm upgrade --install kuberay-operator "$HELM_REGISTRY/kuberay-operator" \
+      --version "$KUBERAY_OPERATOR_VERSION" \
+      --namespace kuberay-system \
+      --create-namespace \
+      --wait
+}
+
+install_operators() {
+    install_kueue
+    install_kuberay
+}
+
+status() {
+    echo "=== Cluster status ==="
+    kubectl get nodes
+    echo ""
+    kubectl get pods -n kueue-system
+    echo ""
+    kubectl get pods -n kuberay-system
+}
+
+all() {
+    check_prerequisites
+    create_infra
+    install_operators
+    status
+}
+
+usage() {
+    echo "Usage: $0 <command>"
+    echo ""
+    echo "Commands:"
+    echo "  check_prerequisites  Check and install required tools (az, helm)"
+    echo "  create_infra       Create resource group, AKS cluster, node pool, and fetch credentials"
+    echo "  install_operators  Install both Kueue and KubeRay"
+    echo "  status             Show cluster and operator pod status"
+    echo "  all                Run all steps end-to-end"
+    echo ""
+    echo "Examples:"
+    echo "  $0 install_operators   # Just install Kueue + KubeRay"
+    echo "  $0 all                 # Full setup from scratch"
+}
+
+# --- Prerequisites ---
+install_az_cli() {
+    echo "Installing Azure CLI..."
+    curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+}
+
+install_helm() {
+    echo "Installing Helm..."
+    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+}
+
+check_prerequisites() {
+    if ! command -v az &> /dev/null; then
+        echo "Azure CLI (az) not found. Installing..."
+        install_az_cli
+    fi
+
+    if ! command -v helm &> /dev/null; then
+        echo "Helm not found. Installing..."
+        install_helm
+    fi
+
+    echo "Prerequisites satisfied: az $(az version --query '"azure-cli"' -o tsv), helm $(helm version --short)"
+}
+
+COMMAND="${1:-}"
+case "$COMMAND" in
+    check_prerequisites|create_infra|install_operators|status|all)
+        "$COMMAND"
+        ;;
+    *)
+        usage
+        exit 1
+        ;;
+esac
\ No newline at end of file

From 1c8e2af12e150e7432a99fbc4b0f142ff387d2af Mon Sep 17 00:00:00 2001
From: alyssavu <alyssavu@microsoft.com>
Date: Wed, 13 May 2026 00:40:05 +0000
Subject: [PATCH 2/4] feat(examples): add inference-cpu command and update Ray
 config

- Add run_inference_cpu command to setup.sh
- Use MCR Ray image (ray:py3.12-ray2.54.0)
- Rename namespace from e2e-stack to ray
- Rename job from e2e-inference to cpu-inference
- Consolidate install_kueue/install_kuberay into install_operators

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../ray/inference-cpu/inference-rayjob.yaml   |  6 +--
 .../inference-cpu/stack-kueue-resources.yaml  |  6 +--
 examples/ray/setup.sh                         | 47 ++++++++++++++-----
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/examples/ray/inference-cpu/inference-rayjob.yaml b/examples/ray/inference-cpu/inference-rayjob.yaml
index c953293c6..9134ef610 100644
--- a/examples/ray/inference-cpu/inference-rayjob.yaml
+++ b/examples/ray/inference-cpu/inference-rayjob.yaml
@@ -12,8 +12,8 @@
 apiVersion: ray.io/v1
 kind: RayJob
 metadata:
-  name: e2e-inference
-  namespace: e2e-stack
+  name: cpu-inference
+  namespace: ray
   labels:
     kueue.x-k8s.io/queue-name: e2e-stack-queue
 spec:
@@ -63,7 +63,7 @@ spec:
           volumes:
           - name: inference-script
             configMap:
-              name: e2e-inference-script
+              name: cpu-inference-script
     workerGroupSpecs:
     - groupName: workers
       replicas: 2
diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml
index 11fcddd63..7a4329421 100644
--- a/examples/ray/inference-cpu/stack-kueue-resources.yaml
+++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml
@@ -1,9 +1,9 @@
 ---
-# Namespace for the stack integration test.
+# Namespace for the Ray example.
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: e2e-stack
+  name: ray
 ---
 # ResourceFlavor — represents CPU hardware for the stack test.
 apiVersion: kueue.x-k8s.io/v1beta2
@@ -40,6 +40,6 @@ apiVersion: kueue.x-k8s.io/v1beta2
 kind: LocalQueue
 metadata:
   name: e2e-stack-queue
-  namespace: e2e-stack
+  namespace: ray
 spec:
   clusterQueue: e2e-stack-cluster-queue
diff --git a/examples/ray/setup.sh b/examples/ray/setup.sh
index 5f1596d7e..745e6cade 100755
--- a/examples/ray/setup.sh
+++ b/examples/ray/setup.sh
@@ -17,6 +17,12 @@ HELM_REGISTRY="${HELM_REGISTRY:-oci://mcr.microsoft.com/aks/ai-runtime/helm}"
 KUEUE_VERSION="${KUEUE_VERSION:-0.17.1}"
 KUBERAY_OPERATOR_VERSION="${KUBERAY_OPERATOR_VERSION:-1.6.1}"
 
+# --- Inference CPU Configuration ---
+RAY_IMAGE="${RAY_IMAGE:-mcr.microsoft.com/aks/ai-runtime/ray:py3.12-ray2.54.0}"
+RAY_VERSION="${RAY_VERSION:-2.54.0}"
+INFERENCE_NAMESPACE="ray"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
 create_infra() {
     echo "=== Creating resource group ==="
     az group create \
@@ -49,16 +55,14 @@ create_infra() {
       --overwrite-existing
 }
 
-install_kueue() {
+install_operators() {
     echo "=== Installing Kueue (v${KUEUE_VERSION}) ==="
     helm upgrade --install kueue "$HELM_REGISTRY/kueue" \
       --version "$KUEUE_VERSION" \
       --namespace kueue-system \
       --create-namespace \
       --wait
-}
 
-install_kuberay() {
     echo "=== Installing KubeRay Operator (v${KUBERAY_OPERATOR_VERSION}) ==="
     helm upgrade --install kuberay-operator "$HELM_REGISTRY/kuberay-operator" \
       --version "$KUBERAY_OPERATOR_VERSION" \
@@ -67,11 +71,6 @@ install_kuberay() {
       --wait
 }
 
-install_operators() {
-    install_kueue
-    install_kuberay
-}
-
 status() {
     echo "=== Cluster status ==="
     kubectl get nodes
@@ -81,6 +80,30 @@ status() {
     kubectl get pods -n kuberay-system
 }
 
+# Run the CPU inference example: apply the Kueue resources, publish the script
+# as a ConfigMap, submit the RayJob, wait up to 600s for SUCCEEDED, print logs.
+# Reads SCRIPT_DIR, INFERENCE_NAMESPACE, RAY_IMAGE, RAY_VERSION; returns the wait status.
+run_inference_cpu() {
+    echo "=== Deploying Kueue resources ==="
+    kubectl apply -f "$SCRIPT_DIR/inference-cpu/stack-kueue-resources.yaml"
+    echo "=== Creating inference script ConfigMap ==="
+    kubectl create configmap cpu-inference-script \
+      --from-file=inference_job.py="$SCRIPT_DIR/inference-cpu/inference_job.py" \
+      --namespace "$INFERENCE_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+    echo "=== Submitting RayJob ==="
+    sed "s|{{RAY_IMAGE}}|${RAY_IMAGE}|g; s|{{RAY_VERSION}}|${RAY_VERSION}|g" \
+      "$SCRIPT_DIR/inference-cpu/inference-rayjob.yaml" | kubectl apply -f -
+
+    echo "=== Waiting for RayJob to complete ==="
+    # Defer a failed/timed-out wait so `set -e` does not skip the logs below.
+    local wait_status=0
+    kubectl wait --for=jsonpath='{.status.jobStatus}'=SUCCEEDED rayjob/cpu-inference \
+      --namespace "$INFERENCE_NAMESPACE" --timeout=600s || wait_status=$?
+    echo "=== Job logs ==="
+    kubectl logs -l job-name=cpu-inference --namespace "$INFERENCE_NAMESPACE" --tail=50
+    return "$wait_status"
+}
+
 all() {
     check_prerequisites
     create_infra
@@ -94,12 +117,14 @@ usage() {
     echo "Commands:"
     echo "  check_prerequisites  Check and install required tools (az, helm)"
     echo "  create_infra       Create resource group, AKS cluster, node pool, and fetch credentials"
-    echo "  install_operators  Install both Kueue and KubeRay"
-    echo "  status             Show cluster and operator pod status"
+    echo "  install_operators    Install Kueue and KubeRay from MCR"
+    echo "  run_inference_cpu    Deploy and run the CPU inference example"
+    echo "  status               Show cluster and operator pod status"
     echo "  all                Run all steps end-to-end"
     echo ""
     echo "Examples:"
     echo "  $0 install_operators   # Just install Kueue + KubeRay"
+    echo "  $0 run_inference_cpu   # Run CPU inference example"
     echo "  $0 all                 # Full setup from scratch"
 }
 
@@ -130,7 +155,7 @@ check_prerequisites() {
 
 COMMAND="${1:-}"
 case "$COMMAND" in
-    check_prerequisites|create_infra|install_operators|status|all)
+    check_prerequisites|create_infra|install_operators|run_inference_cpu|status|all)
         "$COMMAND"
         ;;
     *)

From e3afd047e9fe07adc948b862c3d27c3cd86dde9f Mon Sep 17 00:00:00 2001
From: alyssavu <alyssavu@microsoft.com>
Date: Wed, 13 May 2026 00:46:51 +0000
Subject: [PATCH 3/4] fix(examples): increase Ray head memory to prevent OOM

The Ray head node, at 4Gi, was reaching 96% memory usage from Ray system
processes alone. Increase its memory request and limit to 8Gi, and raise the
Kueue memory quota from 20Gi to 24Gi to accommodate the larger head pod.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 examples/ray/inference-cpu/inference-rayjob.yaml      | 4 ++--
 examples/ray/inference-cpu/stack-kueue-resources.yaml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/ray/inference-cpu/inference-rayjob.yaml b/examples/ray/inference-cpu/inference-rayjob.yaml
index 9134ef610..667c00af7 100644
--- a/examples/ray/inference-cpu/inference-rayjob.yaml
+++ b/examples/ray/inference-cpu/inference-rayjob.yaml
@@ -46,10 +46,10 @@ spec:
             resources:
               requests:
                 cpu: "2"
-                memory: "4Gi"
+                memory: "8Gi"
               limits:
                 cpu: "2"
-                memory: "4Gi"
+                memory: "8Gi"
             startupProbe:
               httpGet:
                 path: /api/version
diff --git a/examples/ray/inference-cpu/stack-kueue-resources.yaml b/examples/ray/inference-cpu/stack-kueue-resources.yaml
index 7a4329421..0bb26c836 100644
--- a/examples/ray/inference-cpu/stack-kueue-resources.yaml
+++ b/examples/ray/inference-cpu/stack-kueue-resources.yaml
@@ -30,7 +30,7 @@ spec:
       - name: cpu
         nominalQuota: 6
       - name: memory
-        nominalQuota: 20Gi
+        nominalQuota: 24Gi
       - name: nvidia.com/gpu
         nominalQuota: 2
 ---

From 8d32013a193555bcd9c51667340b54ee01e14d3b Mon Sep 17 00:00:00 2001
From: alyssavu <alyssavu@microsoft.com>
Date: Wed, 13 May 2026 00:57:02 +0000
Subject: [PATCH 4/4] feat(examples): add Helm provider to Terraform and update
 region

- Add Helm provider to deploy Kueue and KubeRay via Terraform
- Align Terraform defaults with setup.sh (VM sizes, node counts)
- Update region to centralus in both setup.sh and Terraform
- Simplify deploy.sh now that Terraform handles Helm releases

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 examples/ray/aks-classic/deploy.sh    | 114 +++-----------------------
 examples/ray/aks-classic/main.tf      |  29 ++++++-
 examples/ray/aks-classic/providers.tf |  13 +++
 examples/ray/aks-classic/variables.tf |  42 ++++++++--
 examples/ray/setup.sh                 |   2 +-
 5 files changed, 88 insertions(+), 112 deletions(-)

diff --git a/examples/ray/aks-classic/deploy.sh b/examples/ray/aks-classic/deploy.sh
index 01113bd1c..2427f68a1 100755
--- a/examples/ray/aks-classic/deploy.sh
+++ b/examples/ray/aks-classic/deploy.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
+set -euo pipefail
 
 # Check if the user is logged into Azure CLI
 if ! az account show > /dev/null 2>&1; then
-    echo "Please login to Azure CLI using 'az login' before running this script."
+    echo "Please sign in to Azure CLI using 'az login' before running this script."
     exit 1
 fi
 
@@ -15,114 +16,21 @@ terraform plan -out main.tfplan
 # Apply the Terraform plan
 terraform apply main.tfplan
 
-# Retrieve the Terraform outputs and store in variables
+# Retrieve the Terraform outputs
 resource_group_name=$(terraform output -raw resource_group_name)
-system_node_pool_name=$(terraform output -raw system_node_pool_name)
 aks_cluster_name=$(terraform output -raw kubernetes_cluster_name)
 
 # Get AKS credentials for the cluster
 az aks get-credentials \
-    --resource-group $resource_group_name \
-    --name $aks_cluster_name
+    --resource-group "$resource_group_name" \
+    --name "$aks_cluster_name" \
+    --overwrite-existing
 
-# Create the kuberay namespace
-kuberay_namespace="kuberay"
-kubectl create namespace $kuberay_namespace
-
-# Output the current Kubernetes context
-current_context=$(kubectl config current-context)
-echo "Current Kubernetes Context: $current_context"
-
-# Output the nodes in the cluster
+echo "=== Cluster nodes ==="
 kubectl get nodes
 
-# Check Helm version
-helm version
-
-# Add the KubeRay Helm repository
-helm repo add kuberay https://ray-project.github.io/kuberay-helm/
-
-# Update the Helm repository
-helm repo update
-
-# Install or upgrade the KubeRay operator using Helm
-helm upgrade \
---install \
---cleanup-on-fail \
---wait \
---timeout 10m0s \
---namespace "$kuberay_namespace" \
---create-namespace kuberay-operator kuberay/kuberay-operator \
---version 1.1.1
-
-# Output the pods in the kuberay namespace
-kubectl get pods -n $kuberay_namespace
-
-# Download the PyTorch MNIST job YAML file
-curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml
-
-# Train a PyTorch Model on Fashion MNIST
-kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml
-
-# Output the pods in the kuberay namespace
-kubectl get pods -n $kuberay_namespace
-
-# Get the status of the Ray job
-job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}')
-
-# Wait for the Ray job to complete
-while [ "$job_status" != "Complete" ]; do
-    echo -ne "Job Status: $job_status\\r"
-    sleep 30
-    job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobDeploymentStatus}')
-done
-echo "Job Status: $job_status"
-
-# Check if the job succeeded
-job_status=$(kubectl get rayjobs -n $kuberay_namespace -o jsonpath='{.items[0].status.jobStatus}')
-
-if [ "$job_status" != "SUCCEEDED" ]; then
-    echo "Job Failed!"
-    exit 1
-fi
-
-# If the job succeeded, get the Ray cluster head service
-rayclusterhead=$(kubectl get service -n $kuberay_namespace | grep 'rayjob-pytorch-mnist-raycluster' | grep 'ClusterIP' | awk '{print $1}')
-
-# Now create a service of type NodePort for the Ray cluster head
-kubectl expose service $rayclusterhead \
--n $kuberay_namespace \
---port=80 \
---target-port=8265 \
---type=NodePort \
---name=ray-dash
-
-# Create an ingress for the KubeRay dashboard
-cat <<EOF | kubectl apply -f -
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: ray-dash
-  namespace: kuberay
-  annotations:
-    nginx.ingress.kubernetes.io/rewrite-target: /
-spec:
-  ingressClassName: webapprouting.kubernetes.azure.com
-  rules:
-  - http:
-      paths:
-      - backend:
-          service:
-            name: ray-dash
-            port:
-              number: 80
-        path: /
-        pathType: Prefix
-EOF
-
-# Now find the public IP address of the ingress controller
-lb_public_ip=$(kubectl get svc -n app-routing-system -o jsonpath='{.items[?(@.metadata.name == "nginx")].status.loadBalancer.ingress[0].ip}')
-
-echo "KubeRay Dashboard URL: http://$lb_public_ip/"
+echo "=== Verifying installations ==="
+kubectl get pods -n kueue-system
+kubectl get pods -n kuberay-system
 
-exit 0
\ No newline at end of file
+echo "=== Setup complete ==="
\ No newline at end of file
diff --git a/examples/ray/aks-classic/main.tf b/examples/ray/aks-classic/main.tf
index e7e605236..4576026a6 100644
--- a/examples/ray/aks-classic/main.tf
+++ b/examples/ray/aks-classic/main.tf
@@ -21,6 +21,7 @@ resource "azurerm_kubernetes_cluster" "k8s" {
   name                             = random_pet.azurerm_kubernetes_cluster_name.id
   resource_group_name              = azurerm_resource_group.rg.name
   dns_prefix                       = random_pet.azurerm_kubernetes_cluster_dns_prefix.id
+  kubernetes_version               = var.kubernetes_version
     
   identity {
     type = "SystemAssigned"
@@ -88,10 +89,34 @@ resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" {
 }
 
 resource "azurerm_kubernetes_cluster_node_pool" "workload" {
-  name                  = "ray"
+  name                  = "cpupool"
   kubernetes_cluster_id = azurerm_kubernetes_cluster.k8s.id
   vm_size               = var.ray_node_pool_vm_size
-  node_count            = 4
+  node_count            = var.ray_node_pool_node_count
 
   depends_on = [azapi_update_resource.k8s-default-node-pool-systempool-taint]
+}
+
+resource "helm_release" "kueue" { # Kueue chart; Terraform counterpart of the Kueue install in examples/ray/setup.sh
+  name             = "kueue"
+  namespace        = "kueue-system"
+  create_namespace = true
+  repository       = var.helm_registry # registry URL comes from the helm_registry variable
+  chart            = "kueue"
+  version          = var.kueue_version
+  wait             = true
+
+  depends_on = [azurerm_kubernetes_cluster_node_pool.workload] # create the release only after the workload node pool
+}
+
+resource "helm_release" "kuberay_operator" { # KubeRay operator chart; Terraform counterpart of the KubeRay install in examples/ray/setup.sh
+  name             = "kuberay-operator"
+  namespace        = "kuberay-system"
+  create_namespace = true
+  repository       = var.helm_registry # registry URL comes from the helm_registry variable
+  chart            = "kuberay-operator"
+  version          = var.kuberay_operator_version
+  wait             = true
+
+  depends_on = [azurerm_kubernetes_cluster_node_pool.workload] # create the release only after the workload node pool
+}
\ No newline at end of file
diff --git a/examples/ray/aks-classic/providers.tf b/examples/ray/aks-classic/providers.tf
index 703b01589..dec6184d2 100644
--- a/examples/ray/aks-classic/providers.tf
+++ b/examples/ray/aks-classic/providers.tf
@@ -10,6 +10,10 @@ terraform {
       source  = "hashicorp/azurerm"
       version = "~>4.13"
     }
+    helm = {
+      source  = "hashicorp/helm"
+      version = "~>2.12"
+    }
     random = {
       source  = "hashicorp/random"
       version = "~>3.0"
@@ -25,4 +29,13 @@ provider "azurerm" {
   features {}
 
   subscription_id = var.subscription_id
+}
+
+provider "helm" { # Helm provider pointed at the AKS cluster defined in this configuration
+  kubernetes {
+    host                   = azurerm_kubernetes_cluster.k8s.kube_config[0].host # credentials are read from the cluster's first kube_config entry
+    client_certificate     = base64decode(azurerm_kubernetes_cluster.k8s.kube_config[0].client_certificate) # decoded from base64 before use
+    client_key             = base64decode(azurerm_kubernetes_cluster.k8s.kube_config[0].client_key) # decoded from base64 before use
+    cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.k8s.kube_config[0].cluster_ca_certificate) # decoded from base64 before use
+  }
 }
\ No newline at end of file
diff --git a/examples/ray/aks-classic/variables.tf b/examples/ray/aks-classic/variables.tf
index 2b519f7c3..0e717a0da 100644
--- a/examples/ray/aks-classic/variables.tf
+++ b/examples/ray/aks-classic/variables.tf
@@ -10,7 +10,7 @@ variable "resource_group_owner" {
 
 variable "resource_group_location" {
   type        = string
-  default     = "westus3"
+  default     = "centralus"
   description = "Location of the resource group."
 }
 
@@ -22,20 +22,50 @@ variable "resource_group_name_prefix" {
 
 variable "system_node_pool_vm_size" {
   type        = string
-  description = "The size of the Virtual Machine."
-  default     = "Standard_D2_v2"
+  description = "The size of the Virtual Machine for the system node pool."
+  default     = "Standard_D4ds_v4"
 }
 
 variable "system_node_pool_node_count" {
   type        = number
   description = "The initial quantity of nodes for the system node pool."
-  default     = 1
+  default     = 3
 }
 
 variable "ray_node_pool_vm_size" {
   type        = string
-  description = "The size of the Virtual Machine."
-  default     = "Standard_D4s_v4"
+  description = "The size of the Virtual Machine for the Ray worker node pool."
+  default     = "Standard_D16ds_v7"
+}
+
+variable "ray_node_pool_node_count" {
+  type        = number
+  description = "The initial quantity of nodes for the Ray worker node pool."
+  default     = 2
+}
+
+variable "kubernetes_version" {
+  type        = string
+  description = "The Kubernetes version for the AKS cluster."
+  default     = "1.35"
+}
+
+variable "helm_registry" {
+  type        = string
+  description = "OCI registry for AKS AI Runtime Helm charts."
+  default     = "oci://mcr.microsoft.com/aks/ai-runtime/helm" # same value as the HELM_REGISTRY default in examples/ray/setup.sh; keep in sync
+}
+
+variable "kueue_version" {
+  type        = string
+  description = "Version of the Kueue Helm chart."
+  default     = "0.17.1" # same value as the KUEUE_VERSION default in examples/ray/setup.sh; keep in sync
+}
+
+variable "kuberay_operator_version" {
+  type        = string
+  description = "Version of the KubeRay Operator Helm chart."
+  default     = "1.6.1" # same value as the KUBERAY_OPERATOR_VERSION default in examples/ray/setup.sh; keep in sync
 }
 
 variable "msi_id" {
diff --git a/examples/ray/setup.sh b/examples/ray/setup.sh
index 745e6cade..56e17ecdd 100755
--- a/examples/ray/setup.sh
+++ b/examples/ray/setup.sh
@@ -3,7 +3,7 @@ set -euo pipefail
 
 # --- Infrastructure Configuration ---
 RESOURCE_GROUP="${RESOURCE_GROUP:-ray-example-rg}"
-LOCATION="${LOCATION:-eastus}"
+LOCATION="${LOCATION:-centralus}"
 CLUSTER_NAME="${CLUSTER_NAME:-demo}"
 NODE_COUNT="${NODE_COUNT:-3}"
 NODE_VM_SIZE="${NODE_VM_SIZE:-Standard_D4ds_v4}"