Skip to content

Commit 41d6642

Browse files
Merge pull request #269 from cobaltcore-dev/kvm-monitoring
alerting/dashboarding: added minimal monitoring setup for the plugin
2 parents 9813dcb + 4c5e987 commit 41d6642

File tree

10 files changed

+1113
-0
lines changed

10 files changed

+1113
-0
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Lints the Prometheus alert rule files shipped with the
# openstack-hypervisor-operator chart.
name: "Validate Prometheus Alerts"

# Runs on demand, and on pull requests that touch an alert rule file.
on:
  workflow_dispatch:
  pull_request:
    paths:
      - "charts/openstack-hypervisor-operator/alerts/*.yaml"
      - "charts/openstack-hypervisor-operator/alerts/*.yml"

# A newer run for the same ref cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: "validate-prometheus-alerts-${{ github.ref }}"

# The workflow token is limited to reading repository contents.
permissions:
  contents: read

defaults:
  run:
    shell: bash

env:
  # Prometheus release whose tarball provides the promtool binary.
  PROMTOOL_VERSION: "3.8.0"
26+
27+
jobs:
  validate-alerts:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v6

      # Extract only the promtool binary from the upstream release tarball
      # and install it on the PATH.
      - name: Install promtool
        run: |
          set -euo pipefail
          release="prometheus-${PROMTOOL_VERSION}.linux-amd64"
          curl -sSfL "https://github.com/prometheus/prometheus/releases/download/v${PROMTOOL_VERSION}/${release}.tar.gz" \
            | tar xz --strip-components=1 "${release}/promtool"
          sudo install -m 0755 promtool /usr/local/bin/promtool
          promtool --version

      # With nullglob an unmatched pattern expands to nothing, so an empty
      # array means there is no rule file at all; that is treated as a failure.
      - name: Validate Prometheus alert rules
        run: |
          set -euo pipefail
          shopt -s nullglob
          alerts_dir=charts/openstack-hypervisor-operator/alerts
          rule_files=("${alerts_dir}"/*.yaml "${alerts_dir}"/*.yml)
          if (( ${#rule_files[@]} == 0 )); then
            echo "No Prometheus rule files found."
            exit 1
          fi
          promtool check rules "${rule_files[@]}"
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
groups:
5+
# Alerts on Eviction resources, built from the
# kube_customresource_eviction_* series.
- name: evictionLifecycle
  rules:
  # Condition "Evicting" has carried reason "Failed" for 5 minutes.
  - alert: EvictionFailed
    expr: |
      kube_customresource_eviction_condition{condition="Evicting", reason="Failed"} == 1
    for: 5m
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Eviction {{ $labels.name }} has failed'
      description: >-
        The eviction {{ $labels.name }} for hypervisor {{ $labels.hypervisor }}
        has reached a terminal failure state. Manual intervention is required —
        check if the hypervisor exists in OpenStack.

  # "MigratingInstance" reports "Failed" while the same eviction (matched on
  # the name label) still reports "Evicting" with reason "Running".
  - alert: EvictionMigrationFailing
    expr: |
      kube_customresource_eviction_condition{condition="MigratingInstance", reason="Failed"} == 1
      and on (name)
      kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
    for: 1h
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Eviction {{ $labels.name }} has failing instance migrations for over 1 hour'
      description: >-
        The eviction {{ $labels.name }} has had MigratingInstance=Failed for more
        than 1 hour while still running. Instances may be in ERROR state, blocking
        eviction progress.

  # Outstanding RAM stays above zero while the eviction is still running;
  # $value is the outstanding-RAM sample from the left-hand side.
  - alert: EvictionOutstandingRamHigh
    expr: |
      kube_customresource_eviction_outstanding_ram_mb > 0
      and on (name)
      kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
    for: 6h
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Eviction {{ $labels.name }} has outstanding RAM for over 6 hours'
      description: >-
        The eviction {{ $labels.name }} has had {{ $value }}MB of outstanding RAM
        for more than 6 hours. Check for stuck live-migrations or instances that
        cannot be moved.
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
groups:
5+
# Alerts on Hypervisor resources that stay in one lifecycle phase for too long.
- name: hypervisorLifecycle
  rules:
  # Condition "Onboarding" has been 1 for a full hour.
  - alert: HypervisorOnboardingStuck
    expr: |
      kube_customresource_hypervisor_condition{condition="Onboarding"} == 1
    for: 1h
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour'
      description: >-
        The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been
        onboarding for more than 1 hour. Check nova registration, test VM status,
        or trait/aggregate sync.

  # Condition "Evicting" has been 1 for four hours.
  - alert: HypervisorEvictionStuck
    expr: |
      kube_customresource_hypervisor_condition{condition="Evicting"} == 1
    for: 4h
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Hypervisor {{ $labels.name }} eviction running for over 4 hours'
      description: >-
        The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an
        active eviction for more than 4 hours. Check for stuck live-migrations or
        failed VMs.

  # Evicted hypervisors are reported unless a series with the same name label
  # shows condition "Offboarded" == 1.
  - alert: HypervisorEvictedTooLong
    expr: |
      kube_customresource_hypervisor_evicted == 1
      unless on (name)
      kube_customresource_hypervisor_condition{condition="Offboarded"} == 1
    for: 7d
    labels:
      type: hypervisor_operator
      severity: info
    annotations:
      summary: 'Hypervisor {{ $labels.name }} has been evicted for over 7 days'
      description: >-
        The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been
        evicted for more than 7 days without being offboarded. Consider
        re-enabling or decommissioning.
41+
42+
# Alerts on the TraitsUpdated / AggregatesUpdated conditions of a hypervisor.
- name: hypervisorSync
  rules:
  # "TraitsUpdated" is 0 while "Onboarding" is also 0 for the same name label.
  - alert: HypervisorTraitSyncFailed
    expr: |
      kube_customresource_hypervisor_condition{condition="TraitsUpdated"} == 0
      and on (name)
      kube_customresource_hypervisor_condition{condition="Onboarding"} == 0
    for: 30m
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Hypervisor {{ $labels.name }} trait sync has been failing'
      description: >-
        The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had
        TraitsUpdated=False for more than 30 minutes outside of onboarding. Check
        OpenStack Placement API connectivity.

  # Same shape as the trait rule, with hypervisors whose "Evicting" condition
  # is 1 additionally excluded.
  - alert: HypervisorAggregateSyncFailed
    expr: |
      kube_customresource_hypervisor_condition{condition="AggregatesUpdated"} == 0
      and on (name)
      kube_customresource_hypervisor_condition{condition="Onboarding"} == 0
      unless on (name)
      kube_customresource_hypervisor_condition{condition="Evicting"} == 1
    for: 30m
    labels:
      type: hypervisor_operator
      severity: warning
    annotations:
      summary: 'Hypervisor {{ $labels.name }} aggregate sync has been failing'
      description: >-
        The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had
        AggregatesUpdated=False for more than 30 minutes outside of onboarding
        and eviction. Check OpenStack Nova API connectivity.
71+
72+
# Health of the hypervisor operator process itself.
- name: hypervisorOperatorHealth
  rules:
  # controller_runtime_reconcile_errors_total is exported by every
  # controller-runtime based operator. The selector is therefore limited to
  # the hypervisor operator's scrape job (same matcher as
  # HypervisorOperatorDown), so errors of unrelated operators are not
  # reported under this alert's name.
  - alert: HypervisorOperatorReconcileErrors
    expr: |
      rate(controller_runtime_reconcile_errors_total{job=~".*hypervisor-operator.*"}[5m]) > 0.01
    for: 15m
    labels:
      severity: warning
      type: hypervisor_operator
    annotations:
      summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors"
      description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes."

  # `up == 0` only matches targets that are still being scraped and failing.
  # When the target disappears from service discovery there is no `up`
  # series at all, so the absent() branch covers that case as well.
  - alert: HypervisorOperatorDown
    expr: |
      up{job=~".*hypervisor-operator.*"} == 0
      or
      absent(up{job=~".*hypervisor-operator.*"})
    for: 5m
    labels:
      severity: critical
      type: hypervisor_operator
    annotations:
      summary: "Hypervisor operator is down"
      description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes."

0 commit comments

Comments
 (0)