From dc43cd715ad57191bbd012fdb6040fd0fb8d7372 Mon Sep 17 00:00:00 2001 From: Michael Pruitt Date: Thu, 2 Jul 2026 13:06:29 -0500 Subject: [PATCH 1/5] INTEROP-9230,INTEROP-9231: Add OPP GA-to-nightly upgrade step New step registry ref (interop-opp-upgrade) and ci-operator config variant for OPP upgrade testing. Provisions at GA, installs OPP operators, upgrades to nightly, validates platform and operator health. Cron disabled; to be enabled after manual validation. --- ...n-policy-collection-main__ocp-upgrade.yaml | 86 ++++ .../step-registry/interop/opp/upgrade/OWNERS | 3 + .../upgrade/interop-opp-upgrade-commands.sh | 388 ++++++++++++++++++ .../opp/upgrade/interop-opp-upgrade-ref.yaml | 31 ++ 4 files changed, 508 insertions(+) create mode 100644 ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml create mode 100644 ci-operator/step-registry/interop/opp/upgrade/OWNERS create mode 100755 ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh create mode 100644 ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml diff --git a/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml b/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml new file mode 100644 index 0000000000000..78c1253575040 --- /dev/null +++ b/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml @@ -0,0 +1,86 @@ +base_images: + cli: + name: "4.22" + namespace: ocp + tag: cli +build_root: + image_stream_tag: + name: release + namespace: openshift + tag: rhel-9-release-golang-1.24-openshift-4.22 +releases: + initial: + release: + channel: stable + version: "4.21" + latest: + candidate: + product: ocp + stream: nightly + version: "4.22" +resources: + '*': + requests: + cpu: 100m + memory: 200Mi +tests: +- as: interop-opp-upgrade-aws + capabilities: + - intranet + cron: 0 23 31 2 * + reporter_config: + channel: '#opp-discussion' + job_states_to_report: + - success + - failure + - error + report_template: '{{if eq .Status.State "success"}} :slack-green: Job *{{.Spec.Job}}* + ended with *{{.Status.State}}*. <{{.Status.URL}}|View logs> {{else}} :failed: + Job *{{.Spec.Job}}* ended with *{{.Status.State}}*. <{{.Status.URL}}|View logs> + {{end}}' + steps: + allow_best_effort_post_steps: true + cluster_profile: aws-cspi-qe + dependencies: + OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:initial + OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE: release:latest + env: + BASE_DOMAIN: cspilp.interop.ccitredhat.com + COMPUTE_NODE_REPLICAS: "6" + COMPUTE_NODE_TYPE: m6a.2xlarge + CONTROL_PLANE_INSTANCE_TYPE: m6a.2xlarge + FIREWATCH_CONFIG_FILE_PATH: https://raw.githubusercontent.com/CSPI-QE/cspi-utils/refs/heads/main/firewatch-base-configs/opp/lp-interop-aws.json + FIREWATCH_DEFAULT_JIRA_ADDITIONAL_LABELS: '["4.22-lp","opp-aws-lp","opp-lp","opp-upgrade"]' + FIREWATCH_DEFAULT_JIRA_ASSIGNEE: mpruitt@redhat.com + FIREWATCH_DEFAULT_JIRA_EPIC: INTEROP-8942 + FIREWATCH_DEFAULT_JIRA_PROJECT: LPINTEROP + FIREWATCH_FAIL_WITH_TEST_FAILURES: "true" + ODF_VERSION_MAJOR_MINOR: "4.21" + OPERATORS: | + [ + {"name": "advanced-cluster-management", "source": "redhat-operators", "channel": "release-2.16", "install_namespace": "ocm", "target_namespaces": "ocm", "operator_group": "acm-operator-group"}, + {"name": "rhacs-operator", "source": "redhat-operators", "channel": "stable", "install_namespace": "rhacs-operator", "target_namespaces": "rhacs-operator"}, + {"name": "odf-operator", "source": "redhat-operators", "channel": "stable-4.21", "install_namespace": "openshift-storage", "target_namespaces": "openshift-storage"}, + {"name": "quay-operator", "source": "redhat-operators", "channel": "stable-3.14", "install_namespace": "openshift-operators"} + ] + ZONES_COUNT: "3" + post: + - ref: gather-aws-console + - chain: ipi-deprovision + - ref: firewatch-report-issues + pre: + - ref: ipi-conf + - ref: ipi-conf-telemetry + - ref: ipi-conf-aws-custom-az + - ref: ipi-conf-aws + - ref: ipi-install-monitoringpvc + - chain: ipi-install + test: + - ref: install-operators + - ref: interop-opp-upgrade + - ref: cucushift-upgrade-healthcheck +zz_generated_metadata: + branch: main + org: stolostron + repo: policy-collection + variant: ocp-upgrade diff --git a/ci-operator/step-registry/interop/opp/upgrade/OWNERS b/ci-operator/step-registry/interop/opp/upgrade/OWNERS new file mode 100644 index 0000000000000..41d144d3728a2 --- /dev/null +++ b/ci-operator/step-registry/interop/opp/upgrade/OWNERS @@ -0,0 +1,3 @@ +approvers: &owners +- cspi-qe-ocp-lp +reviewers: *owners diff --git a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh new file mode 100755 index 0000000000000..a634bdf5fde14 --- /dev/null +++ b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh @@ -0,0 +1,388 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -o pipefail + +UPGRADE_TIMEOUT="${UPGRADE_TIMEOUT:-130}" +POLL_INTERVAL="${POLL_INTERVAL:-60}" +STALL_WINDOW="${STALL_WINDOW:-10}" +OPP_OPERATORS="${OPP_OPERATORS:-advanced-cluster-management,rhacs-operator,odf-operator,quay-operator}" + +trap 'EXIT_CODE=$?; debug_on_exit' EXIT TERM + +export HOME="${HOME:-/tmp/home}" +export XDG_RUNTIME_DIR="${HOME}/run" +export REGISTRY_AUTH_PREFERENCE=podman +mkdir -p "${XDG_RUNTIME_DIR}" + +KUBECONFIG="" oc --loglevel=8 registry login + +debug_on_exit() { + if (( EXIT_CODE != 0 )); then + echo -e "\n### DEBUG: Upgrade failure diagnostics ###\n" + if [[ -n "${TARGET_MINOR_VERSION:-}" ]] && (( TARGET_MINOR_VERSION >= 16 )); then + echo -e "\n# oc adm upgrade status\n" + env OC_ENABLE_CMD_UPGRADE_STATUS='true' oc adm upgrade status --details=all || true + fi + echo -e "\n# ClusterVersion YAML\n$(oc get clusterversion/version -oyaml 2>/dev/null || echo 'unavailable')" + echo -e "\n# MachineConfigs\n$(oc get machineconfig 2>/dev/null || echo 'unavailable')" + + echo -e "\n# Abnormal nodes\n" + oc get node --no-headers 2>/dev/null | awk '$2 != "Ready" {print $1}' | while read -r node; do + echo -e "\n### oc describe node ${node} ###\n$(oc describe node "${node}" 2>/dev/null)" + done + + echo -e "\n# Abnormal ClusterOperators\n" + oc get co --no-headers 2>/dev/null | awk '$3 != "True" || $4 != "False" || $5 != "False" {print $1}' | while read -r co; do + echo -e "\n### oc describe co ${co} ###\n$(oc describe co "${co}" 2>/dev/null)" + done + + echo -e "\n# Abnormal MachineConfigPools\n" + oc get machineconfigpools --no-headers 2>/dev/null | awk '$3 != "True" || $4 != "False" || $5 != "False" {print $1}' | while read -r mcp; do + echo -e "\n### oc describe mcp ${mcp} ###\n$(oc describe mcp "${mcp}" 2>/dev/null)" + done + + echo -e "\n# OPP Operator CSVs\n$(oc get csv -A 2>/dev/null || echo 'unavailable')" + fi +} + +resolve_target_image() { + local target="${OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE:-}" + if [[ -z "${target}" ]]; then + echo >&2 "OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE is not set; cannot resolve upgrade target" + exit 3 + fi + echo "Target image: ${target}" + TARGET="${target}" +} + +check_signed() { + local digest algorithm hash_value response try max_retries=3 payload="${1}" + if [[ "${payload}" =~ "@sha256:" ]]; then + digest="$(echo "${payload}" | cut -f2 -d@)" + else + digest="$(oc image info "${payload}" -o json | jq -r '.digest')" + fi + echo "Image digest: ${digest}" + algorithm="$(echo "${digest}" | cut -f1 -d:)" + hash_value="$(echo "${digest}" | cut -f2 -d:)" + try=0 + response=0 + while (( try < max_retries && response != 200 )); do + echo "Signature check attempt #${try}" + response=$(https_proxy="" HTTPS_PROXY="" curl -L --silent --output /dev/null \ + --write-out "%{http_code}" \ + "https://openshift-mirror-list.ci-systems.workers.dev/pub/openshift-v4/signatures/openshift/release/${algorithm}=${hash_value}/signature-1") + (( try += 1 )) + if (( response != 200 && try < max_retries )); then + sleep 60 + fi + done + if (( response == 200 )); then + echo "Image is signed" && return 0 + else + echo "Image is not signed" && return 1 + fi +} + +admin_ack() { + local source_minor="${1}" target_minor="${2}" + if (( source_minor == target_minor )) || (( source_minor < 8 )); then + echo "Admin ack not required (z-stream or pre-4.8)" && return 0 + fi + + local gates + gates="$(oc -n openshift-config-managed get configmap admin-gates -o json 2>/dev/null | jq -r '.data' 2>/dev/null)" || true + if [[ -z "${gates}" || "${gates}" == "null" ]]; then + echo "No admin gates found" && return 0 + fi + echo -e "Admin gates:\n${gates}" + + if [[ ${gates} != *"ack-4.${source_minor}"* ]]; then + echo "No acks required for source minor version ${source_minor}" && return 0 + fi + + echo "Patching admin acks for 4.${source_minor} -> 4.${target_minor}" + local ack_keys + ack_keys="$(echo "${gates}" | jq -r 'keys[]')" + for ack in ${ack_keys}; do + if [[ "${ack}" == *"ack-4.${source_minor}"* ]]; then + echo "Applying ack: ${ack}" + oc -n openshift-config patch configmap admin-acks \ + --patch '{"data":{"'"${ack}"'": "true"}}' --type=merge + fi + done + + echo "Waiting for admin acks to take effect (up to 5 minutes)" + local elapsed=0 + while (( elapsed < 5 )); do + sleep 1m + (( elapsed += 1 )) + if ! oc adm upgrade 2>&1 | grep -q "AdminAckRequired"; then + echo "Admin acks applied successfully" + return 0 + fi + echo "Still waiting... (${elapsed}/5 min)" + done + echo >&2 "Timed out waiting for admin acks" + return 1 +} + +update_cco_annotation() { + local source_version="${1}" target_version="${2}" + local source_minor target_minor + source_minor="$(echo "${source_version}" | cut -f2 -d.)" + target_minor="$(echo "${target_version}" | cut -f2 -d.)" + + if (( source_minor == target_minor )) || (( source_minor < 8 )); then + echo "CCO annotation not required (z-stream or pre-4.8)" && return 0 + fi + + local cco_mode + cco_mode="$(oc get cloudcredential cluster -o jsonpath='{.spec.credentialsMode}' 2>/dev/null)" || true + if [[ "${cco_mode}" != "Manual" ]]; then + echo "CCO annotation not required (mode: ${cco_mode:-default})" && return 0 + fi + + local to_version + to_version="$(echo "${target_version}" | cut -f1 -d-)" + echo "Patching CCO upgradeable-to annotation: ${to_version}" + oc patch cloudcredential.operator.openshift.io/cluster \ + --patch '{"metadata":{"annotations": {"cloudcredential.openshift.io/upgradeable-to": "'"${to_version}"'"}}}' \ + --type=merge + + echo "Waiting for CCO annotation to take effect (up to 5 minutes)" + local elapsed=0 + while (( elapsed < 5 )); do + sleep 1m + (( elapsed += 1 )) + if ! oc adm upgrade 2>&1 | grep -q "MissingUpgradeableAnnotation"; then + echo "CCO annotation applied successfully" + return 0 + fi + echo "Still waiting... (${elapsed}/5 min)" + done + echo >&2 "Timed out waiting for CCO annotation" + return 1 +} + +initiate_upgrade() { + local force_flag="${1}" + echo "Initiating upgrade to ${TARGET}" + echo "Force flag: ${force_flag}" + oc adm upgrade --to-image="${TARGET}" --allow-explicit-upgrade --force="${force_flag}" + echo "Upgrade command accepted at $(date '+%F %T')" + + sleep 10 + local progressing + progressing="$(oc get clusterversion version -o jsonpath='{.status.conditions[?(@.type=="Progressing")].status}' 2>/dev/null)" || true + if [[ "${progressing}" != "True" ]]; then + echo >&2 "WARNING: CVO Progressing is not True after upgrade initiation (status: ${progressing})" + else + echo "CVO confirmed Progressing=True" + fi +} + +monitor_upgrade() { + local remaining="${UPGRADE_TIMEOUT}" + local poll_count=0 + local last_progress_change + last_progress_change=$(date +%s) + + local stat_cmd="oc adm upgrade 2>&1 | grep -vE 'Upstream is unset|Upstream: https|available channels|No updates available|^$'" + if (( TARGET_MINOR_VERSION >= 16 )); then + stat_cmd="env OC_ENABLE_CMD_UPGRADE_STATUS=true oc adm upgrade status 2>&1 | grep -vE 'no token is currently in use|for additional description and links'" + fi + + local prev_status="" + local snapshot_dir="${ARTIFACT_DIR:-/tmp}/upgrade-progress" + mkdir -p "${snapshot_dir}" + + echo "Monitoring upgrade (timeout: ${UPGRADE_TIMEOUT}m, poll: ${POLL_INTERVAL}s)" + echo "Upgrade monitoring start: $(date '+%F %T')" + local start_time + start_time=$(date +%s) + + while (( remaining > 0 )); do + sleep "${POLL_INTERVAL}" + remaining=$(( remaining - 1 )) + (( poll_count += 1 )) + + local current_status + current_status="$(eval "${stat_cmd}" 2>/dev/null)" || true + if [[ -n "${current_status}" && "${current_status}" != "${prev_status}" ]]; then + echo -e "=== Upgrade Status $(date '+%T') ===\n${current_status}\n" + prev_status="${current_status}" + last_progress_change=$(date +%s) + fi + + if (( poll_count % 5 == 0 )); then + oc get clusterversion version -o json > "${snapshot_dir}/cv-$(date +%s).json" 2>/dev/null || true + fi + + local cv_out avail progressing + cv_out="$(oc get clusterversion --no-headers 2>/dev/null)" || continue + avail="$(echo "${cv_out}" | awk '{print $3}')" + progressing="$(echo "${cv_out}" | awk '{print $4}')" + + if [[ "${avail}" == "True" && "${progressing}" == "False" && "${cv_out}" == *"${TARGET_VERSION}"* ]]; then + local end_time + end_time=$(date +%s) + echo "Upgrade completed successfully at $(date '+%F %T')" + echo "Elapsed: $(( (end_time - start_time) / 60 ))m" + return 0 + fi + + local now stall_seconds + now=$(date +%s) + stall_seconds=$(( STALL_WINDOW * 60 )) + if (( now - last_progress_change > stall_seconds )); then + echo "WARNING: No upgrade progress change in ${STALL_WINDOW} minutes (possible stall)" + oc get clusterversion version -o json > "${snapshot_dir}/cv-stall-$(date +%s).json" 2>/dev/null || true + fi + done + + local end_time + end_time=$(date +%s) + echo >&2 "Upgrade timed out after ${UPGRADE_TIMEOUT} minutes at $(date '+%F %T')" + echo >&2 "Elapsed: $(( (end_time - start_time) / 60 ))m" + exit 2 +} + +stabilize_cluster() { + echo "Waiting for cluster stability (minimum-stable-period=5m, timeout=30m)" + oc adm wait-for-stable-cluster --minimum-stable-period=5m --timeout=30m + echo "Cluster is stable" +} + +validate_platform_health() { + echo "Validating platform health" + + local avail progressing degraded + avail="$(oc get clusterversion version -o jsonpath='{.status.conditions[?(@.type=="Available")].status}')" + progressing="$(oc get clusterversion version -o jsonpath='{.status.conditions[?(@.type=="Progressing")].status}')" + degraded="$(oc get clusterversion version -o jsonpath='{.status.conditions[?(@.type=="Degraded")].status}')" + if [[ "${avail}" != "True" || "${progressing}" != "False" || "${degraded}" != "False" ]]; then + echo >&2 "CVO health check failed: Available=${avail} Progressing=${progressing} Degraded=${degraded}" + return 1 + fi + echo "CVO: Available=True, Progressing=False, Degraded=False" + + local unhealthy_co + unhealthy_co="$(oc get co --no-headers | awk '$3 != "True" || $4 != "False" || $5 != "False" {print $1}')" + if [[ -n "${unhealthy_co}" ]]; then + echo >&2 "Unhealthy ClusterOperators: ${unhealthy_co}" + return 1 + fi + echo "All ClusterOperators healthy" + + local unready_nodes + unready_nodes="$(oc get node --no-headers | awk '$2 != "Ready" {print $1}')" + if [[ -n "${unready_nodes}" ]]; then + echo >&2 "Not-Ready nodes: ${unready_nodes}" + return 1 + fi + echo "All nodes Ready" + + local mcp_issues + mcp_issues="$(oc get machineconfigpools --no-headers | awk '$3 != "True" || $4 != "False" || $5 != "False" {print $1}')" + if [[ -n "${mcp_issues}" ]]; then + echo >&2 "Unhealthy MachineConfigPools: ${mcp_issues}" + return 1 + fi + echo "All MachineConfigPools updated" +} + +validate_opp_operators() { + echo "Validating OPP operator health" + local IFS=',' + local operators + read -ra operators <<< "${OPP_OPERATORS}" + + echo "Waiting 5 minutes for operator settling" + sleep 300 + + local all_csvs failed=0 + all_csvs="$(oc get csv -A --no-headers 2>/dev/null)" || { + echo >&2 "Failed to retrieve CSVs" + return 1 + } + + for op in "${operators[@]}"; do + local csv_line phase + csv_line="$(echo "${all_csvs}" | grep "${op}" | head -1)" || true + if [[ -z "${csv_line}" ]]; then + echo >&2 "CSV not found for operator: ${op}" + (( failed += 1 )) + continue + fi + phase="$(echo "${csv_line}" | awk '{print $NF}')" + if [[ "${phase}" != "Succeeded" ]]; then + echo >&2 "Operator ${op} CSV phase: ${phase} (expected: Succeeded)" + (( failed += 1 )) + else + echo "Operator ${op}: CSV phase Succeeded" + fi + done + + if (( failed > 0 )); then + echo >&2 "${failed} OPP operator(s) not healthy after upgrade" + echo -e "\nFull CSV listing:\n${all_csvs}" + return 1 + fi + + echo "Checking pod readiness for OPP operator namespaces" + local opp_namespaces + opp_namespaces="$(echo "${all_csvs}" | grep -E "$(echo "${OPP_OPERATORS}" | tr ',' '|')" | awk '{print $1}' | sort -u)" + for ns in ${opp_namespaces}; do + local not_ready + not_ready="$(oc get pods -n "${ns}" --no-headers 2>/dev/null | grep -v 'Completed' | grep -v 'Running' | grep -v 'Succeeded')" || true + if [[ -n "${not_ready}" ]]; then + echo "WARNING: Non-running pods in ${ns}:" + echo "${not_ready}" + else + echo "All pods healthy in ${ns}" + fi + done + + echo "All OPP operators validated successfully" +} + +main() { + if [[ -f "${SHARED_DIR}/kubeconfig" ]]; then + export KUBECONFIG="${SHARED_DIR}/kubeconfig" + fi + + resolve_target_image + + TARGET_VERSION="$(oc adm release info "${TARGET}" --output=json | jq -r '.metadata.version')" + TARGET_MINOR_VERSION="$(echo "${TARGET_VERSION}" | cut -f2 -d.)" + export TARGET_VERSION TARGET_MINOR_VERSION + echo "Target release: ${TARGET_VERSION} (minor: ${TARGET_MINOR_VERSION})" + + SOURCE_VERSION="$(oc get clusterversion --no-headers | awk '{print $2}')" + SOURCE_MINOR_VERSION="$(echo "${SOURCE_VERSION}" | cut -f2 -d.)" + export SOURCE_VERSION SOURCE_MINOR_VERSION + echo "Source release: ${SOURCE_VERSION} (minor: ${SOURCE_MINOR_VERSION})" + + FORCE_UPDATE="false" + if ! check_signed "${TARGET}"; then + echo "Target is unsigned; will use --force" + FORCE_UPDATE="true" + fi + + if [[ "${FORCE_UPDATE}" == "false" ]]; then + admin_ack "${SOURCE_MINOR_VERSION}" "${TARGET_MINOR_VERSION}" + update_cco_annotation "${SOURCE_VERSION}" "${TARGET_VERSION}" + fi + + initiate_upgrade "${FORCE_UPDATE}" + monitor_upgrade + stabilize_cluster + validate_platform_health + validate_opp_operators + echo "OCP upgrade and OPP validation completed successfully" +} + +main "$@" diff --git a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml new file mode 100644 index 0000000000000..a177a29d827ee --- /dev/null +++ b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml @@ -0,0 +1,31 @@ +ref: + as: interop-opp-upgrade + from: cli + commands: interop-opp-upgrade-commands.sh + timeout: 4h + resources: + requests: + cpu: 100m + memory: 100Mi + dependencies: + - name: OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE + env: release:latest + env: + - name: UPGRADE_TIMEOUT + default: "130" + documentation: Maximum minutes to wait for CVO upgrade completion + - name: POLL_INTERVAL + default: "60" + documentation: Seconds between CVO status polls during upgrade monitoring + - name: STALL_WINDOW + default: "10" + documentation: Minutes of no CVO progress change before logging a stall warning + - name: OPP_OPERATORS + default: "advanced-cluster-management,rhacs-operator,odf-operator,quay-operator" + documentation: Comma-separated CSV name prefixes for OPP operators to validate post-upgrade + documentation: |- + Upgrades an OCP cluster from GA to a nightly release and validates + that OPP operators (ACM, ACS, ODF, Quay) survive the upgrade. + Expects a cluster already provisioned at the GA version with OPP + operators installed. The upgrade target image is resolved from + the ci-operator releases block via OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE. From 4173f42f30a02e88edbdbd8b3c5cd1f57027c205 Mon Sep 17 00:00:00 2001 From: Michael Pruitt Date: Thu, 2 Jul 2026 13:14:29 -0500 Subject: [PATCH 2/5] Fix CI: add grace_period, OWNERS at parent dirs, metadata JSON - Add grace_period: 10m to ref.yaml (required when script uses trap) - Add OWNERS files at interop/ and interop/opp/ parent directories - Add generated metadata JSON for step registry --- ci-operator/step-registry/interop/OWNERS | 3 +++ ci-operator/step-registry/interop/opp/OWNERS | 3 +++ .../opp/upgrade/interop-opp-upgrade-ref.metadata.json | 11 +++++++++++ .../interop/opp/upgrade/interop-opp-upgrade-ref.yaml | 1 + 4 files changed, 18 insertions(+) create mode 100644 ci-operator/step-registry/interop/OWNERS create mode 100644 ci-operator/step-registry/interop/opp/OWNERS create mode 100644 ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.metadata.json diff --git a/ci-operator/step-registry/interop/OWNERS b/ci-operator/step-registry/interop/OWNERS new file mode 100644 index 0000000000000..41d144d3728a2 --- /dev/null +++ b/ci-operator/step-registry/interop/OWNERS @@ -0,0 +1,3 @@ +approvers: &owners +- cspi-qe-ocp-lp +reviewers: *owners diff --git a/ci-operator/step-registry/interop/opp/OWNERS b/ci-operator/step-registry/interop/opp/OWNERS new file mode 100644 index 0000000000000..41d144d3728a2 --- /dev/null +++ b/ci-operator/step-registry/interop/opp/OWNERS @@ -0,0 +1,3 @@ +approvers: &owners +- cspi-qe-ocp-lp +reviewers: *owners diff --git a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.metadata.json b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.metadata.json new file mode 100644 index 0000000000000..b354d27ed2ee8 --- /dev/null +++ b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.metadata.json @@ -0,0 +1,11 @@ +{ + "path": "interop/opp/upgrade/interop-opp-upgrade-ref.yaml", + "owners": { + "approvers": [ + "cspi-qe-ocp-lp" + ], + "reviewers": [ + "cspi-qe-ocp-lp" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml index a177a29d827ee..4ac0c3d95dac1 100644 --- a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml +++ b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml @@ -1,6 +1,7 @@ ref: as: interop-opp-upgrade from: cli + grace_period: 10m commands: interop-opp-upgrade-commands.sh timeout: 4h resources: From 9dbd4eb3b5be2af0f98a1cef686b17230d36d167 Mon Sep 17 00:00:00 2001 From: Michael Pruitt Date: Thu, 2 Jul 2026 13:23:17 -0500 Subject: [PATCH 3/5] Fix config validation: correct dependency format, remove undeclared env - Fix ref.yaml dependencies to use correct name/env mapping (name=image stream tag, env=variable name) - Remove ODF_VERSION_MAJOR_MINOR from config (not declared in any step) - Remove OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE from config deps (ref.yaml already declares it correctly) --- .../stolostron-policy-collection-main__ocp-upgrade.yaml | 2 -- .../interop/opp/upgrade/interop-opp-upgrade-ref.yaml | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml b/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml index 78c1253575040..dda25c0bf25cb 100644 --- a/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml +++ b/ci-operator/config/stolostron/policy-collection/stolostron-policy-collection-main__ocp-upgrade.yaml @@ -43,7 +43,6 @@ tests: cluster_profile: aws-cspi-qe dependencies: OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:initial - OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE: release:latest env: BASE_DOMAIN: cspilp.interop.ccitredhat.com COMPUTE_NODE_REPLICAS: "6" @@ -55,7 +54,6 @@ tests: FIREWATCH_DEFAULT_JIRA_EPIC: INTEROP-8942 FIREWATCH_DEFAULT_JIRA_PROJECT: LPINTEROP FIREWATCH_FAIL_WITH_TEST_FAILURES: "true" - ODF_VERSION_MAJOR_MINOR: "4.21" OPERATORS: | [ {"name": "advanced-cluster-management", "source": "redhat-operators", "channel": "release-2.16", "install_namespace": "ocm", "target_namespaces": "ocm", "operator_group": "acm-operator-group"}, diff --git a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml index 4ac0c3d95dac1..504c4b63a378f 100644 --- a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml +++ b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-ref.yaml @@ -9,8 +9,8 @@ ref: cpu: 100m memory: 100Mi dependencies: - - name: OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE - env: release:latest + - name: "release:latest" + env: "OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE" env: - name: UPGRADE_TIMEOUT default: "130" From 2d2b8a14e70cb35e560f23802ecbe3b3d4081fc4 Mon Sep 17 00:00:00 2001 From: Michael Pruitt Date: Thu, 2 Jul 2026 13:32:08 -0500 Subject: [PATCH 4/5] Add generated Prow job YAML for ocp-upgrade variant Generated periodic job definition for the new ocp-upgrade config variant, matching the format of existing periodics. --- ...tron-policy-collection-main-periodics.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/ci-operator/jobs/stolostron/policy-collection/stolostron-policy-collection-main-periodics.yaml b/ci-operator/jobs/stolostron/policy-collection/stolostron-policy-collection-main-periodics.yaml index 97e9798f3a389..bde7d11a84773 100644 --- a/ci-operator/jobs/stolostron/policy-collection/stolostron-policy-collection-main-periodics.yaml +++ b/ci-operator/jobs/stolostron/policy-collection/stolostron-policy-collection-main-periodics.yaml @@ -1,4 +1,99 @@ periodics: +- agent: kubernetes + cluster: build03 + cron: 0 23 31 2 * + decorate: true + decoration_config: + skip_cloning: true + extra_refs: + - base_ref: main + org: stolostron + repo: policy-collection + labels: + capability/intranet: intranet + ci-operator.openshift.io/cloud: aws + ci-operator.openshift.io/cloud-cluster-profile: aws-cspi-qe + ci-operator.openshift.io/variant: ocp-upgrade + ci.openshift.io/generator: prowgen + job-release: "4.22" + pj-rehearse.openshift.io/can-be-rehearsed: "true" + name: periodic-ci-stolostron-policy-collection-main-ocp-upgrade-interop-opp-upgrade-aws + reporter_config: + slack: + channel: '#opp-discussion' + job_states_to_report: + - success + - failure + - error + report_template: '{{if eq .Status.State "success"}} :slack-green: Job *{{.Spec.Job}}* + ended with *{{.Status.State}}*. <{{.Status.URL}}|View logs> {{else}} :failed: + Job *{{.Spec.Job}}* ended with *{{.Status.State}}*. <{{.Status.URL}}|View + logs> {{end}}' + spec: + containers: + - args: + - --gcs-upload-secret=/secrets/gcs/service-account.json + - --image-import-pull-secret=/etc/pull-secret/.dockerconfigjson + - --lease-server-credentials-file=/etc/boskos/credentials + - --report-credentials-file=/etc/report/credentials + - --secret-dir=/secrets/ci-pull-credentials + - --target=interop-opp-upgrade-aws + - --variant=ocp-upgrade + command: + - ci-operator + env: + - name: HTTP_SERVER_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay-proxy.ci.openshift.org/openshift/ci:ci_ci-operator_latest + imagePullPolicy: Always + name: "" + ports: + - containerPort: 8080 + name: http + resources: + requests: + cpu: 10m + volumeMounts: + - mountPath: /etc/boskos + name: boskos + readOnly: true + - mountPath: /secrets/ci-pull-credentials + name: ci-pull-credentials + readOnly: true + - mountPath: /secrets/gcs + name: gcs-credentials + readOnly: true + - mountPath: /secrets/manifest-tool + name: manifest-tool-local-pusher + readOnly: true + - mountPath: /etc/pull-secret + name: pull-secret + readOnly: true + - mountPath: /etc/report + name: result-aggregator + readOnly: true + serviceAccountName: ci-operator + volumes: + - name: boskos + secret: + items: + - key: credentials + path: credentials + secretName: boskos-credentials + - name: ci-pull-credentials + secret: + secretName: ci-pull-credentials + - name: manifest-tool-local-pusher + secret: + secretName: manifest-tool-local-pusher + - name: pull-secret + secret: + secretName: registry-pull-credentials + - name: result-aggregator + secret: + secretName: result-aggregator - agent: kubernetes cluster: build03 cron: 0 23 31 2 * From 334cc5a4c2f331704514f884d9e16de3d044b004 Mon Sep 17 00:00:00 2001 From: Michael Pruitt Date: Thu, 2 Jul 2026 14:27:36 -0500 Subject: [PATCH 5/5] Fix review issues: trap ordering, wall-clock timeout, IFS leak - Move trap installation after debug_on_exit definition so early failures in setup commands invoke a defined function - Use wall-clock deadline instead of iteration counter for upgrade timeout so behavior is correct regardless of POLL_INTERVAL value - Scope IFS=',' to the read call only so the namespace loop in validate_opp_operators splits correctly on whitespace --- .../upgrade/interop-opp-upgrade-commands.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh index a634bdf5fde14..4386429488f94 100755 --- a/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh +++ b/ci-operator/step-registry/interop/opp/upgrade/interop-opp-upgrade-commands.sh @@ -9,15 +9,11 @@ POLL_INTERVAL="${POLL_INTERVAL:-60}" STALL_WINDOW="${STALL_WINDOW:-10}" OPP_OPERATORS="${OPP_OPERATORS:-advanced-cluster-management,rhacs-operator,odf-operator,quay-operator}" -trap 'EXIT_CODE=$?; debug_on_exit' EXIT TERM - export HOME="${HOME:-/tmp/home}" export XDG_RUNTIME_DIR="${HOME}/run" export REGISTRY_AUTH_PREFERENCE=podman mkdir -p "${XDG_RUNTIME_DIR}" -KUBECONFIG="" oc --loglevel=8 registry login - debug_on_exit() { if (( EXIT_CODE != 0 )); then echo -e "\n### DEBUG: Upgrade failure diagnostics ###\n" @@ -47,6 +43,10 @@ debug_on_exit() { fi } +trap 'EXIT_CODE=$?; debug_on_exit' EXIT TERM + +KUBECONFIG="" oc --loglevel=8 registry login + resolve_target_image() { local target="${OPENSHIFT_UPGRADE_RELEASE_IMAGE_OVERRIDE:-}" if [[ -z "${target}" ]]; then @@ -185,7 +185,6 @@ initiate_upgrade() { } monitor_upgrade() { - local remaining="${UPGRADE_TIMEOUT}" local poll_count=0 local last_progress_change last_progress_change=$(date +%s) @@ -201,12 +200,12 @@ monitor_upgrade() { echo "Monitoring upgrade (timeout: ${UPGRADE_TIMEOUT}m, poll: ${POLL_INTERVAL}s)" echo "Upgrade monitoring start: $(date '+%F %T')" - local start_time + local start_time deadline start_time=$(date +%s) + deadline=$(( start_time + UPGRADE_TIMEOUT * 60 )) - while (( remaining > 0 )); do + while (( $(date +%s) < deadline )); do sleep "${POLL_INTERVAL}" - remaining=$(( remaining - 1 )) (( poll_count += 1 )) local current_status @@ -296,9 +295,8 @@ validate_platform_health() { validate_opp_operators() { echo "Validating OPP operator health" - local IFS=',' local operators - read -ra operators <<< "${OPP_OPERATORS}" + IFS=',' read -ra operators <<< "${OPP_OPERATORS}" echo "Waiting 5 minutes for operator settling" sleep 300