From 3e1a28fc88e3dea6d2834b41e28d015fc490a236 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Fri, 6 Mar 2026 14:48:09 -0500 Subject: [PATCH] ipi-conf-gcp: filter zones by machine type availability The zone selection for GCP control plane and compute nodes in us-central1 and us-south1 regions only filtered out AI zones but did not check whether the requested machine type was actually available in each zone. This caused ARM64 jobs using t2a-standard-4 to fail when us-central1-c was selected, since that zone does not offer t2a instances. Replace get_zones_from_region with get_zones_for_machine_type which queries gcloud compute machine-types list to find zones where the specific machine type is available, with a fallback to the previous behavior if the query returns no results. Also select zones independently for control plane and compute nodes, since heterogeneous clusters may use different machine types that are available in different zones. Co-Authored-By: Claude Opus 4.6 --- .../ipi/conf/gcp/ipi-conf-gcp-commands.sh | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh index 4fe9d7d608c08..a03977a48891f 100755 --- a/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh +++ b/ci-operator/step-registry/ipi/conf/gcp/ipi-conf-gcp-commands.sh @@ -61,13 +61,25 @@ if [[ -z "${COMPUTE_NODE_TYPE}" ]]; then fi fi -# Get standard zones from the region (excluding AI zones) and randomize selection -# This prevents control plane nodes from being placed in AI zones when zones aren't explicitly set -function get_zones_from_region() { - local zone_count=${1:-3} - # Get all zones from the region, filtering out AI zones and randomizing - mapfile -t AVAILABILITY_ZONES < <(gcloud compute zones list --filter="region:${GCP_REGION} AND status:UP" --format='value(name)' 2>/dev/null | grep -v '\-ai[0-9]' | shuf) - +# Get zones from the region that support a given machine type (excluding AI zones) +# and randomize selection. This prevents control plane nodes from being placed in +# zones where their machine type is unavailable. +function get_zones_for_machine_type() { + local machine_type=$1 + local zone_count=${2:-3} + + # Get zones where this machine type is available, filtering out AI zones + mapfile -t AVAILABILITY_ZONES < <(gcloud compute machine-types list \ + --filter="zone~${GCP_REGION} AND name=${machine_type}" \ + --format='value(zone)' 2>/dev/null | grep -v '\-ai[0-9]' | shuf) + + if [[ ${#AVAILABILITY_ZONES[@]} -eq 0 ]]; then + # Fallback: get all non-AI zones if machine type query fails + mapfile -t AVAILABILITY_ZONES < <(gcloud compute zones list \ + --filter="region:${GCP_REGION} AND status:UP" \ + --format='value(name)' 2>/dev/null | grep -v '\-ai[0-9]' | shuf) + fi + # Take the first zone_count zones local zones=("${AVAILABILITY_ZONES[@]:0:${zone_count}}") # Format as YAML array: [zone1, zone2, zone3] @@ -120,10 +132,10 @@ if [[ "${GCP_REGION}" == "us-central1" ]] || [[ "${GCP_REGION}" == "us-south1" ] gcloud config set project "${GOOGLE_PROJECT_ID}" 2>/dev/null || true fi - # Get zones for control plane (3 zones for HA) - CONTROL_PLANE_ZONES_STR=$(get_zones_from_region 3) - # Get zones for compute (same zones for consistency) - COMPUTE_ZONES_STR="${CONTROL_PLANE_ZONES_STR}" + # Get zones for control plane filtered by master machine type availability + CONTROL_PLANE_ZONES_STR=$(get_zones_for_machine_type "${master_type}" 3) + # Get zones for compute filtered by compute machine type availability + COMPUTE_ZONES_STR=$(get_zones_for_machine_type "${COMPUTE_NODE_TYPE}" 3) # Apply zones via patch if we got valid zones if [[ -n "${CONTROL_PLANE_ZONES_STR}" ]] && [[ "${CONTROL_PLANE_ZONES_STR}" != "[]" ]]; then