From 9644acae046646a97e5135f4c4e14cd5a2d208bd Mon Sep 17 00:00:00 2001
From: christophrichtersap
Date: Wed, 25 Mar 2026 21:29:41 +0100
Subject: [PATCH] added service monitor for metrics

---
Review notes (this section is ignored by `git am`):

* Purpose: adds a `playbook` label to every alert rule, binds the
  metrics-reader ClusterRole to the chart's service account, and adds a
  ServiceMonitor template plus its `serviceMonitor` values block.
* servicemonitor.yaml: `interval`, `scrapeTimeout` and the namespace under
  `namespaceSelector.matchNames` are rendered through `quote`, so values
  that look like numbers or booleans (for example an all-digit namespace)
  stay strings. Line counts are unchanged, so every hunk header still
  matches; the blob id on that file's `index` line is the one from the
  original submission.
* Layout: the patch was restored to one line per patch line. Indentation
  of the unchanged context lines in alerts/*.yaml and values.yaml was
  reconstructed by hand -- if a hunk does not apply, regenerate the patch
  with `git format-patch` from the branch.
* NOTE(review): the ClusterRoleBinding subject is the chart's own service
  account, while the ServiceMonitor endpoint authenticates with
  `bearerTokenFile` -- confirm the identity that actually scrapes
  /metrics is the one being granted the metrics-reader role.
* NOTE(review): `insecureSkipVerify: true` is hard-coded for the scrape
  endpoint -- confirm this is intended rather than a CA-based tlsConfig.

 .../alerts/eviction.yaml                      |  3 ++
 .../alerts/operator.yaml                      |  7 +++
 .../templates/metrics-reader-rbac.yaml        | 15 ++++++
 .../templates/servicemonitor.yaml             | 48 +++++++++++++++++++
 .../openstack-hypervisor-operator/values.yaml |  8 ++++
 5 files changed, 81 insertions(+)
 create mode 100644 charts/openstack-hypervisor-operator/templates/servicemonitor.yaml

diff --git a/charts/openstack-hypervisor-operator/alerts/eviction.yaml b/charts/openstack-hypervisor-operator/alerts/eviction.yaml
index 127bdf1e..c8f086ba 100644
--- a/charts/openstack-hypervisor-operator/alerts/eviction.yaml
+++ b/charts/openstack-hypervisor-operator/alerts/eviction.yaml
@@ -11,6 +11,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/evictionfailed
         annotations:
           summary: "Eviction {{ $labels.name }} has failed"
           description: "The eviction {{ $labels.name }} for hypervisor {{ $labels.hypervisor }} has reached a terminal failure state. Manual intervention is required — check if the hypervisor exists in OpenStack."
@@ -24,6 +25,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/evictionmigrationfailing
         annotations:
           summary: "Eviction {{ $labels.name }} has failing instance migrations for over 1 hour"
           description: "The eviction {{ $labels.name }} has had MigratingInstance=Failed for more than 1 hour while still running. Instances may be in ERROR state, blocking eviction progress."
@@ -37,6 +39,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/evictionoutstandingram
         annotations:
           summary: "Eviction {{ $labels.name }} has outstanding RAM for over 6 hours"
           description: "The eviction {{ $labels.name }} has had {{ $value }}MB of outstanding RAM for more than 6 hours. Check for stuck live-migrations or instances that cannot be moved."
diff --git a/charts/openstack-hypervisor-operator/alerts/operator.yaml b/charts/openstack-hypervisor-operator/alerts/operator.yaml
index c9322326..edf2478e 100644
--- a/charts/openstack-hypervisor-operator/alerts/operator.yaml
+++ b/charts/openstack-hypervisor-operator/alerts/operator.yaml
@@ -11,6 +11,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisoronboardingstuck
         annotations:
           summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour"
           description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync."
@@ -22,6 +23,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisorevictionstuck
         annotations:
           summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours"
           description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs."
@@ -35,6 +37,7 @@ groups:
         labels:
           severity: info
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisorevictedtoolong
         annotations:
           summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days"
           description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning."
@@ -50,6 +53,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisortraitsyncfailed
         annotations:
           summary: "Hypervisor {{ $labels.name }} trait sync has been failing"
           description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity."
@@ -65,6 +69,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisoraggregatesyncfailed
         annotations:
           summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing"
           description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity."
@@ -78,6 +83,7 @@ groups:
         labels:
           severity: warning
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisorreconcileerrors
         annotations:
           summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors"
           description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes."
@@ -89,6 +95,7 @@ groups:
         labels:
           severity: critical
           type: hypervisor_operator
+          playbook: docs/compute/kvm/playbooks/hypervisoroperatordown
         annotations:
           summary: "Hypervisor operator is down"
           description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes."
diff --git a/charts/openstack-hypervisor-operator/templates/metrics-reader-rbac.yaml b/charts/openstack-hypervisor-operator/templates/metrics-reader-rbac.yaml
index ce27901f..1e05d5fc 100644
--- a/charts/openstack-hypervisor-operator/templates/metrics-reader-rbac.yaml
+++ b/charts/openstack-hypervisor-operator/templates/metrics-reader-rbac.yaml
@@ -9,3 +9,18 @@ rules:
   - /metrics
   verbs:
   - get
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "openstack-hypervisor-operator.fullname" . }}-metrics-reader-binding
+  labels:
+  {{- include "openstack-hypervisor-operator.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "openstack-hypervisor-operator.fullname" . }}-metrics-reader'
+subjects:
+- kind: ServiceAccount
+  name: '{{ include "openstack-hypervisor-operator.serviceAccountName" . }}'
+  namespace: '{{ .Release.Namespace }}'
diff --git a/charts/openstack-hypervisor-operator/templates/servicemonitor.yaml b/charts/openstack-hypervisor-operator/templates/servicemonitor.yaml
new file mode 100644
index 00000000..1690e33d
--- /dev/null
+++ b/charts/openstack-hypervisor-operator/templates/servicemonitor.yaml
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.serviceMonitor.create }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "openstack-hypervisor-operator.fullname" . }}-metrics
+  labels:
+    control-plane: controller-manager
+    {{- include "openstack-hypervisor-operator.labels" . | nindent 4 }}
+    {{- with .Values.serviceMonitor.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.serviceMonitor.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  jobLabel: control-plane
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+      {{- include "openstack-hypervisor-operator.selectorLabels" . | nindent 6 }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace | quote }}
+  endpoints:
+    - port: https
+      scheme: https
+      tlsConfig:
+        insecureSkipVerify: true
+      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+      {{- with .Values.serviceMonitor.interval }}
+      interval: {{ . | quote }}
+      {{- end }}
+      {{- with .Values.serviceMonitor.scrapeTimeout }}
+      scrapeTimeout: {{ . | quote }}
+      {{- end }}
+      {{- with .Values.serviceMonitor.metricRelabelings }}
+      metricRelabelings:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.serviceMonitor.relabelings }}
+      relabelings:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+{{- end }}
diff --git a/charts/openstack-hypervisor-operator/values.yaml b/charts/openstack-hypervisor-operator/values.yaml
index d77520e2..6c1fc4a8 100644
--- a/charts/openstack-hypervisor-operator/values.yaml
+++ b/charts/openstack-hypervisor-operator/values.yaml
@@ -45,6 +45,14 @@ metricsService:
     protocol: TCP
     targetPort: 8443
   type: ClusterIP
+serviceMonitor:
+  create: true
+  labels: {}
+  annotations: {}
+  interval: ""
+  scrapeTimeout: ""
+  metricRelabelings: []
+  relabelings: []
 secret:
   servicePassword: ""
 serviceAccount: