|
| 1 | +# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +groups: |
| 5 | +- name: hypervisorLifecycle |
| 6 | + rules: |
# HypervisorOnboardingStuck: matches hypervisor condition series with
# condition="Onboarding" whose value is 1, and fires once that has held
# continuously for 1 hour (severity: warning).
# NOTE(review): value 1 presumably means the condition status is True; confirm
# against the exporter configuration that produces this metric.
# The annotations are templated from the `name` and `zone` labels of the series.
| 7 | + - alert: HypervisorOnboardingStuck |
| 8 | + expr: | |
| 9 | + kube_customresource_hypervisor_condition{condition="Onboarding"} == 1 |
| 10 | + for: 1h |
| 11 | + labels: |
| 12 | + severity: warning |
| 13 | + type: hypervisor_operator |
| 14 | + annotations: |
| 15 | + summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour" |
| 16 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync." |
| 17 | + |
# HypervisorEvictionStuck: matches hypervisor condition series with
# condition="Evicting" whose value is 1, and fires once that has held
# continuously for 4 hours (severity: warning).
# NOTE(review): value 1 presumably means the condition status is True; confirm
# against the exporter configuration that produces this metric.
| 18 | + - alert: HypervisorEvictionStuck |
| 19 | + expr: | |
| 20 | + kube_customresource_hypervisor_condition{condition="Evicting"} == 1 |
| 21 | + for: 4h |
| 22 | + labels: |
| 23 | + severity: warning |
| 24 | + type: hypervisor_operator |
| 25 | + annotations: |
| 26 | + summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours" |
| 27 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs." |
| 28 | + |
# HypervisorEvictedTooLong: starts from series where the evicted metric is 1,
# then `unless on (name)` removes every one whose `name` label also has an
# "Offboarded" condition series equal to 1. What remains must hold for 7 days
# before the alert fires (severity: info).
# The output series come from the left-hand side of `unless`, so the labels
# used in the annotations are those of the evicted metric.
| 29 | + - alert: HypervisorEvictedTooLong |
| 30 | + expr: | |
| 31 | + kube_customresource_hypervisor_evicted == 1 |
| 32 | + unless on (name) |
| 33 | + kube_customresource_hypervisor_condition{condition="Offboarded"} == 1 |
| 34 | + for: 7d |
| 35 | + labels: |
| 36 | + severity: info |
| 37 | + type: hypervisor_operator |
| 38 | + annotations: |
| 39 | + summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days" |
| 40 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning." |
| 41 | + |
| 42 | +- name: hypervisorSync |
| 43 | + rules: |
# HypervisorTraitSyncFailed: matches series where the "TraitsUpdated" condition
# is 0, restricted by `and on (name)` to hypervisors whose "Onboarding"
# condition series is also 0. Fires after 30 minutes (severity: warning).
# NOTE(review): the `and` requires an Onboarding series with value 0 to exist;
# a hypervisor that exports no Onboarding series at all can never match.
# Confirm that the condition is always exported.
# NOTE(review): unlike HypervisorAggregateSyncFailed, this rule does not
# exclude hypervisors with Evicting == 1; confirm that is intended.
| 44 | + - alert: HypervisorTraitSyncFailed |
| 45 | + expr: | |
| 46 | + kube_customresource_hypervisor_condition{condition="TraitsUpdated"} == 0 |
| 47 | + and on (name) |
| 48 | + kube_customresource_hypervisor_condition{condition="Onboarding"} == 0 |
| 49 | + for: 30m |
| 50 | + labels: |
| 51 | + severity: warning |
| 52 | + type: hypervisor_operator |
| 53 | + annotations: |
| 54 | + summary: "Hypervisor {{ $labels.name }} trait sync has been failing" |
| 55 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity." |
| 56 | + |
# HypervisorAggregateSyncFailed: matches series where the "AggregatesUpdated"
# condition is 0, keeps only hypervisors whose "Onboarding" condition series is
# 0 (`and on (name)`), then drops those whose "Evicting" condition series is 1
# (`unless on (name)`). Fires after 30 minutes (severity: warning).
# NOTE(review): the `and` requires an Onboarding series with value 0 to exist;
# a hypervisor that exports no Onboarding series at all can never match.
# Confirm that the condition is always exported.
| 57 | + - alert: HypervisorAggregateSyncFailed |
| 58 | + expr: | |
| 59 | + kube_customresource_hypervisor_condition{condition="AggregatesUpdated"} == 0 |
| 60 | + and on (name) |
| 61 | + kube_customresource_hypervisor_condition{condition="Onboarding"} == 0 |
| 62 | + unless on (name) |
| 63 | + kube_customresource_hypervisor_condition{condition="Evicting"} == 1 |
| 64 | + for: 30m |
| 65 | + labels: |
| 66 | + severity: warning |
| 67 | + type: hypervisor_operator |
| 68 | + annotations: |
| 69 | + summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing" |
| 70 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity." |
| 71 | + |
| 72 | +- name: hypervisorOperatorHealth |
| 73 | + rules: |
# HypervisorOperatorReconcileErrors: fires when a controller's reconcile error
# rate, averaged over 5 minutes, stays above 0.01/s (about 3 errors per
# 5-minute window) for 15 minutes (severity: warning).
# The selector is limited to the operator's own scrape job, using the same job
# matcher as HypervisorOperatorDown. Without it the rule would fire for every
# controller-runtime based operator scraped by this Prometheus, while the
# summary attributes the errors to the hypervisor operator.
# NOTE(review): confirm the job label of the operator's metrics target matches
# this regex in every deployment.
| 74 | + - alert: HypervisorOperatorReconcileErrors |
| 75 | + expr: | |
| 76 | + rate(controller_runtime_reconcile_errors_total{job=~".*hypervisor-operator.*"}[5m]) > 0.01 |
| 77 | + for: 15m |
| 78 | + labels: |
| 79 | + severity: warning |
| 80 | + type: hypervisor_operator |
| 81 | + annotations: |
| 82 | + summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors" |
| 83 | + description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes." |
| 84 | + |
# HypervisorOperatorDown: fires when the operator's metrics target has been
# failing scrapes (up == 0) or has had no `up` series at all for 5 minutes
# (severity: critical).
# The absent() branch covers the case where the target disappears from service
# discovery (for example the workload was deleted or never scheduled); then
# there is no `up` series and `up == 0` alone would return nothing.
# absent() with a regex matcher yields a series without labels, so the
# annotations deliberately use no label templating.
| 85 | + - alert: HypervisorOperatorDown |
| 86 | + expr: | |
| 87 | + up{job=~".*hypervisor-operator.*"} == 0 or absent(up{job=~".*hypervisor-operator.*"}) |
| 88 | + for: 5m |
| 89 | + labels: |
| 90 | + severity: critical |
| 91 | + type: hypervisor_operator |
| 92 | + annotations: |
| 93 | + summary: "Hypervisor operator is down" |
| 94 | + description: "The hypervisor operator metrics endpoint has been unreachable, or no scrape target for it has existed, for more than 5 minutes." |
0 commit comments