From ce95d31a6e7c6a56711ba6977ace585147c65a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Reme=C5=A1?= Date: Mon, 24 Nov 2025 14:27:29 +0100 Subject: [PATCH] feat: cluster/component health backend & resources update --- ...bility-operator.clusterserviceversion.yaml | 21 ++++++ .../observability-operator-cluster-role.yaml | 21 ++++++ pkg/controllers/uiplugin/components.go | 68 ++++++++++++++++--- .../uiplugin/config/health-analyzer.yaml | 47 +++++++++++++ pkg/controllers/uiplugin/controller.go | 3 + pkg/controllers/uiplugin/health_analyzer.go | 55 +++++++++++++-- pkg/controllers/uiplugin/monitoring.go | 20 +++++- 7 files changed, 222 insertions(+), 13 deletions(-) create mode 100644 pkg/controllers/uiplugin/config/health-analyzer.yaml diff --git a/bundle/manifests/observability-operator.clusterserviceversion.yaml b/bundle/manifests/observability-operator.clusterserviceversion.yaml index 83fefe6bd..9b7f8792c 100644 --- a/bundle/manifests/observability-operator.clusterserviceversion.yaml +++ b/bundle/manifests/observability-operator.clusterserviceversion.yaml @@ -409,6 +409,13 @@ spec: - get - list - watch + - apiGroups: + - config.openshift.io + resources: + - clusteroperators + verbs: + - get + - list - apiGroups: - config.openshift.io resources: @@ -438,6 +445,13 @@ spec: - get - list - watch + - apiGroups: + - kubevirt.io + resources: + - kubevirts + verbs: + - get + - list - apiGroups: - loki.grafana.com resources: @@ -454,6 +468,13 @@ spec: verbs: - get - list + - apiGroups: + - machineconfiguration.openshift.io + resources: + - machineconfigpools + verbs: + - get + - list - apiGroups: - monitoring.coreos.com resourceNames: diff --git a/deploy/operator/observability-operator-cluster-role.yaml b/deploy/operator/observability-operator-cluster-role.yaml index d4eb5ffcc..085e3c6c3 100644 --- a/deploy/operator/observability-operator-cluster-role.yaml +++ b/deploy/operator/observability-operator-cluster-role.yaml @@ -92,6 +92,13 @@ rules: - get - list - 
watch +- apiGroups: + - config.openshift.io + resources: + - clusteroperators + verbs: + - get + - list - apiGroups: - config.openshift.io resources: @@ -121,6 +128,13 @@ rules: - get - list - watch +- apiGroups: + - kubevirt.io + resources: + - kubevirts + verbs: + - get + - list - apiGroups: - loki.grafana.com resources: @@ -137,6 +151,13 @@ rules: verbs: - get - list +- apiGroups: + - machineconfiguration.openshift.io + resources: + - machineconfigpools + verbs: + - get + - list - apiGroups: - monitoring.coreos.com resourceNames: diff --git a/pkg/controllers/uiplugin/components.go b/pkg/controllers/uiplugin/components.go index e46aac4df..0bbd5f98f 100644 --- a/pkg/controllers/uiplugin/components.go +++ b/pkg/controllers/uiplugin/components.go @@ -121,15 +121,30 @@ func pluginComponentReconcilers(plugin *uiv1alpha1.UIPlugin, pluginInfo UIPlugin monitoringConfig.Incidents != nil && monitoringConfig.Incidents.Enabled && pluginInfo.HealthAnalyzerImage != "" + + healthAnalyzerEnabled := monitoringConfig != nil && + monitoringConfig.ClusterHealthAnalyzer != nil && + monitoringConfig.ClusterHealthAnalyzer.Enabled && + pluginInfo.HealthAnalyzerImage != "" + + deployHealthAnalyzer := incidentsEnabled || healthAnalyzerEnabled + + components = append(components, + reconciler.NewOptionalUpdater(componentsHealthClusterRole("components-health-view"), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "components-health-view", plugin.Name+"-"+"components-health-view"), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newComponentHealthConfig(namespace), plugin, deployHealthAnalyzer), + ) + components = append(components, - reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, monitorClusterroleName, plugin.Name+"-"+monitorClusterroleName), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, 
"system:auth-delegator", serviceAccountName+"-system-auth-delegator"), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newAlertManagerViewRoleBinding(serviceAccountName, namespace), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRole(namespace), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRoleBinding(namespace), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newHealthAnalyzerService(namespace), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newHealthAnalyzerDeployment(namespace, serviceAccountName, pluginInfo), plugin, incidentsEnabled), - reconciler.NewOptionalUpdater(newHealthAnalyzerServiceMonitor(namespace), plugin, incidentsEnabled), + reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "cluster-monitoring-view", plugin.Name+"cluster-monitoring-view"), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newClusterRoleBinding(namespace, serviceAccountName, "system:auth-delegator", serviceAccountName+"-system-auth-delegator"), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newAlertManagerViewRoleBinding(serviceAccountName, namespace), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRole(namespace), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newHealthAnalyzerPrometheusRoleBinding(namespace), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newHealthAnalyzerService(namespace), plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newHealthAnalyzerDeployment(namespace, serviceAccountName, pluginInfo.HealthAnalyzerImage), + plugin, deployHealthAnalyzer), + reconciler.NewOptionalUpdater(newHealthAnalyzerServiceMonitor(namespace), plugin, deployHealthAnalyzer), ) persesServiceAccountName := "perses" + serviceAccountSuffix @@ -436,6 +451,43 @@ func newService(info UIPluginInfo, namespace string) *corev1.Service { } 
} +// componentsHealthClusterRole creates a new clusterrole with the provided name. +// The clusterrole has read permissions to the cluster resources and it is required +// for the component health evaluation. +func componentsHealthClusterRole(name string) *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "ClusterRole", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"nodes"}, + Verbs: []string{"get", "list"}, + }, + { + APIGroups: []string{"config.openshift.io"}, + Resources: []string{"clusteroperators"}, + Verbs: []string{"get", "list"}, + }, + { + APIGroups: []string{"machineconfiguration.openshift.io"}, + Resources: []string{"machineconfigpools"}, + Verbs: []string{"get", "list"}, + }, + { + APIGroups: []string{"kubevirt.io"}, + Resources: []string{"kubevirts"}, + Verbs: []string{"get", "list"}, + }, + }, + } +} + func newKorrel8rDeployment(name string, namespace string, info UIPluginInfo) *appsv1.Deployment { volumes := []corev1.Volume{ { diff --git a/pkg/controllers/uiplugin/config/health-analyzer.yaml b/pkg/controllers/uiplugin/config/health-analyzer.yaml new file mode 100644 index 000000000..246c10d21 --- /dev/null +++ b/pkg/controllers/uiplugin/config/health-analyzer.yaml @@ -0,0 +1,47 @@ +# Default definition of the component tree used to evaluate component health +# by the cluster-health-analyzer. 
+components: + - name: control-plane + children: + - name: nodes + objects: + - resource: nodes + selectors: + - matchLabels: + node-role.kubernetes.io/control-plane: [] + - resource: machineconfigpools + group: machineconfiguration.openshift.io + selectors: + - matchLabels: + pools.operator.machineconfiguration.openshift.io/master: [] + - name: capacity + children: + - name: cpu + alerts: + selectors: + - matchLabels: + alertname: ["KubeCPUOvercommit","HighOverallControlPlaneCPU", "ExtremelyHighIndividualControlPlaneCPU"] + - name: memory + alerts: + selectors: + - matchLabels: + alertname: ["HighOverallControlPlaneMemory", "ExtremelyHighIndividualControlPlaneMemory", "SystemMemoryExceedsReservation"] + - name: operators + children: + - name: etcd + alerts: + selectors: + - matchLabels: + namespace: ["openshift-etcd","openshift-etcd-operator"] + - name: addons + children: + - name: kubevirt + alerts: + selectors: + - matchLabels: + kubernetes_operator_part_of: ["kubevirt"] + - matchLabels: + namespace: ["openshift-cnv"] + objects: + - group: kubevirt.io + resource: kubevirts \ No newline at end of file diff --git a/pkg/controllers/uiplugin/controller.go b/pkg/controllers/uiplugin/controller.go index 3911318ee..b9f443cc2 100644 --- a/pkg/controllers/uiplugin/controller.go +++ b/pkg/controllers/uiplugin/controller.go @@ -106,6 +106,9 @@ const ( //+kubebuilder:rbac:groups=authentication.k8s.io,resources=tokenreviews,verbs=create //+kubebuilder:rbac:groups=authorization.k8s.io,resources=subjectaccessreviews,verbs=create //+kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;create;update;patch;delete +//+kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list +//+kubebuilder:rbac:groups=machineconfiguration.openshift.io,resources=machineconfigpools,verbs=get;list +//+kubebuilder:rbac:groups=kubevirt.io,resources=kubevirts,verbs=get;list const finalizerName = "uiplugin.observability.openshift.io/finalizer" 
diff --git a/pkg/controllers/uiplugin/health_analyzer.go b/pkg/controllers/uiplugin/health_analyzer.go index be7b818a2..a1e6ea747 100644 --- a/pkg/controllers/uiplugin/health_analyzer.go +++ b/pkg/controllers/uiplugin/health_analyzer.go @@ -1,9 +1,12 @@ package uiplugin import ( + _ "embed" + monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" @@ -11,10 +14,15 @@ import ( ) const ( - name = "health-analyzer" - volumeMountName = name + "-tls" + name = "health-analyzer" + volumeMountName = name + "-tls" + componentConfigVolumeName = "components-health-config" + componentConfigMapName = "components-config" ) +//go:embed config/health-analyzer.yaml +var componentHealthConfig string + func newHealthAnalyzerPrometheusRole(namespace string) *rbacv1.Role { role := &rbacv1.Role{ TypeMeta: metav1.TypeMeta{ @@ -94,7 +102,10 @@ func newHealthAnalyzerService(namespace string) *corev1.Service { return service } -func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pluginInfo UIPluginInfo) *appsv1.Deployment { +func newHealthAnalyzerDeployment(namespace string, + serviceAccountName string, + image string) *appsv1.Deployment { + deploy := &appsv1.Deployment{ TypeMeta: metav1.TypeMeta{ APIVersion: appsv1.SchemeGroupVersion.String(), @@ -122,7 +133,7 @@ func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pl Containers: []corev1.Container{ { Name: name, - Image: pluginInfo.HealthAnalyzerImage, + Image: image, ImagePullPolicy: corev1.PullAlways, Args: []string{ "serve", @@ -162,6 +173,11 @@ func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pl Name: volumeMountName, ReadOnly: true, }, + { + Name: componentConfigVolumeName, + MountPath: "/etc/config", + ReadOnly: true, + }, }, }, }, @@ 
-174,6 +190,16 @@ func newHealthAnalyzerDeployment(namespace string, serviceAccountName string, pl }, }, }, + { + Name: componentConfigVolumeName, + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: componentConfigMapName, + }, + }, + }, + }, }, }, }, @@ -218,3 +244,24 @@ func newHealthAnalyzerServiceMonitor(namespace string) *monv1.ServiceMonitor { return serviceMonitor } + +// newComponentHealthConfig creates a new ConfigMap +// that defines the components whose health is evaluated. +func newComponentHealthConfig(namespace string) *v1.ConfigMap { + cm := v1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: componentConfigMapName, + Labels: componentLabels("monitoring"), + }, + Data: map[string]string{ + "components.yaml": componentHealthConfig, + }, + } + + return &cm +} diff --git a/pkg/controllers/uiplugin/monitoring.go b/pkg/controllers/uiplugin/monitoring.go index 29d8e32ba..99459b2db 100644 --- a/pkg/controllers/uiplugin/monitoring.go +++ b/pkg/controllers/uiplugin/monitoring.go @@ -44,6 +44,19 @@ func validatePersesConfig(config *uiv1alpha1.MonitoringConfig) bool { return config.Perses != nil && config.Perses.Enabled } +func validateHealthanalyzerConfig(config *uiv1alpha1.MonitoringConfig, clusterVersion string) bool { + enabled := config.ClusterHealthAnalyzer != nil && + config.ClusterHealthAnalyzer.Enabled + + if !strings.HasPrefix(clusterVersion, "v") { + clusterVersion = "v" + clusterVersion + } + canonicalClusterVersion := fmt.Sprintf("%s-0", semver.Canonical(clusterVersion)) + minClusterVersionMet := semver.Compare(canonicalClusterVersion, "v4.19.0-0") >= 0 + + return enabled && minClusterVersionMet +} + func validateIncidentsConfig(config *uiv1alpha1.MonitoringConfig, clusterVersion string) bool { enabled := config.Incidents != nil && 
config.Incidents.Enabled @@ -191,8 +204,9 @@ func createMonitoringPluginInfo(plugin *uiv1alpha1.UIPlugin, namespace, name, im isValidAcmConfig := validateACMConfig(config) isValidPersesConfig := validatePersesConfig(config) isValidIncidentsConfig := validateIncidentsConfig(config, clusterVersion) + isValidHealthAnalyzerConfig := validateHealthanalyzerConfig(config, clusterVersion) - atLeastOneValidConfig := isValidAcmConfig || isValidPersesConfig || isValidIncidentsConfig + atLeastOneValidConfig := isValidAcmConfig || isValidPersesConfig || isValidIncidentsConfig || isValidHealthAnalyzerConfig pluginInfo := getBasePluginInfo(namespace, name, image) if !atLeastOneValidConfig { @@ -215,6 +229,10 @@ func createMonitoringPluginInfo(plugin *uiv1alpha1.UIPlugin, namespace, name, im pluginInfo.HealthAnalyzerImage = healthAnalyzerImage features = append(features, "incidents") } + if isValidHealthAnalyzerConfig { + pluginInfo.HealthAnalyzerImage = healthAnalyzerImage + features = append(features, "cluster-health-analyzer") + } addFeatureFlags(pluginInfo, features) return pluginInfo, nil