-
Notifications
You must be signed in to change notification settings - Fork 84
OBSINTA-1219: add e2e tests for UIPlugin incident detection #1038
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ import ( | |
|
|
||
| configv1 "github.com/openshift/api/config/v1" | ||
| "github.com/pkg/errors" | ||
| "golang.org/x/mod/semver" | ||
| appsv1 "k8s.io/api/apps/v1" | ||
| corev1 "k8s.io/api/core/v1" | ||
| v1 "k8s.io/api/core/v1" | ||
|
|
@@ -25,6 +26,7 @@ import ( | |
| "k8s.io/client-go/rest" | ||
| "k8s.io/client-go/tools/portforward" | ||
| "k8s.io/client-go/transport/spdy" | ||
| "k8s.io/utils/ptr" | ||
| "sigs.k8s.io/controller-runtime/pkg/client" | ||
| ) | ||
|
|
||
|
|
@@ -257,3 +259,101 @@ func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) { | |
| } | ||
| }) | ||
| } | ||
|
|
||
| // SkipIfClusterVersionBelow skips the test if the cluster version is below | ||
| // minVersion. The minVersion string should be a semver-compatible version | ||
| // (e.g. "4.19" or "v4.19"). | ||
| func (f *Framework) SkipIfClusterVersionBelow(t *testing.T, minVersion string) { | ||
|
DavidRajnoha marked this conversation as resolved.
|
||
| t.Helper() | ||
| cv := &configv1.ClusterVersion{} | ||
| err := f.K8sClient.Get(t.Context(), client.ObjectKey{Name: "version"}, cv) | ||
| if err != nil { | ||
| t.Fatalf("failed to determine cluster version: %v", err) | ||
| return | ||
| } | ||
|
|
||
| actual := cv.Status.Desired.Version | ||
| if actual == "" { | ||
| t.Fatal("cluster version is empty") | ||
| return | ||
| } | ||
| t.Logf("Detected cluster version: %s", actual) | ||
|
|
||
| if !strings.HasPrefix(actual, "v") { | ||
| actual = "v" + actual | ||
| } | ||
| if !strings.HasPrefix(minVersion, "v") { | ||
| minVersion = "v" + minVersion | ||
| } | ||
|
|
||
| canonicalActual := fmt.Sprintf("%s-0", semver.Canonical(actual)) | ||
| canonicalMin := fmt.Sprintf("%s-0", semver.Canonical(minVersion)) | ||
|
|
||
| if semver.Canonical(actual) == "" || semver.Canonical(minVersion) == "" { | ||
| t.Fatalf("Unable to parse version (actual=%q, min=%q)", actual, minVersion) | ||
| return | ||
| } | ||
|
|
||
| if semver.Compare(canonicalActual, canonicalMin) < 0 { | ||
| t.Skipf("Skipping: cluster version %s is below minimum required %s", cv.Status.Desired.Version, minVersion) | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
|
|
||
| // DumpNamespaceDebug logs deployments (with conditions), pods (with container | ||
| // statuses), and events for the given namespace. Useful as a t.Cleanup or | ||
| // on-failure diagnostic helper. | ||
| func (f *Framework) DumpNamespaceDebug(t *testing.T, namespace string) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd like if we have a follow-up Jira ticket or PR to generalize the gathering of post-test data to help with troubleshooting. |
||
| t.Helper() | ||
| ctx := context.WithoutCancel(t.Context()) | ||
|
|
||
| t.Log("=== BEGIN DEBUG DUMP ===") | ||
| defer t.Log("=== END DEBUG DUMP ===") | ||
|
|
||
| var deployments appsv1.DeploymentList | ||
| if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(namespace)); err != nil { | ||
| t.Logf("Failed to list deployments in %s: %v", namespace, err) | ||
| } else { | ||
| t.Logf("Deployments in namespace %s: %d", namespace, len(deployments.Items)) | ||
| for _, d := range deployments.Items { | ||
| t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d", | ||
| d.Name, ptr.Deref(d.Spec.Replicas, 0), d.Status.ReadyReplicas, d.Status.AvailableReplicas) | ||
| for _, c := range d.Status.Conditions { | ||
| t.Logf(" condition: type=%s status=%s reason=%s message=%s", | ||
| c.Type, c.Status, c.Reason, c.Message) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var pods corev1.PodList | ||
| if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace)); err != nil { | ||
| t.Logf("Failed to list pods in %s: %v", namespace, err) | ||
| } else { | ||
| t.Logf("Pods in namespace %s: %d", namespace, len(pods.Items)) | ||
| for _, p := range pods.Items { | ||
| t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase) | ||
| for _, cs := range p.Status.ContainerStatuses { | ||
| switch { | ||
| case cs.State.Running != nil: | ||
| t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount) | ||
| case cs.State.Waiting != nil: | ||
| t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s", | ||
| cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message) | ||
| case cs.State.Terminated != nil: | ||
| t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d", | ||
| cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| var events corev1.EventList | ||
| if err := f.K8sClient.List(ctx, &events, client.InNamespace(namespace)); err != nil { | ||
| t.Logf("Failed to list events in %s: %v", namespace, err) | ||
| } else { | ||
| t.Logf("Events in namespace %s: %d", namespace, len(events.Items)) | ||
| for _, e := range events.Items { | ||
| t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d", | ||
| e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count) | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,221 @@ | ||
| package e2e | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "strconv" | ||
| "testing" | ||
| "time" | ||
|
|
||
| monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" | ||
| "github.com/prometheus/common/model" | ||
| "gotest.tools/v3/assert" | ||
| appsv1 "k8s.io/api/apps/v1" | ||
| corev1 "k8s.io/api/core/v1" | ||
| "k8s.io/apimachinery/pkg/api/errors" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/util/intstr" | ||
| "sigs.k8s.io/controller-runtime/pkg/client" | ||
|
|
||
| uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1" | ||
| "github.com/rhobs/observability-operator/test/e2e/framework" | ||
| ) | ||
|
|
||
| const ( | ||
| healthAnalyzerDeploymentName = "health-analyzer" | ||
| prometheusRuleNamespace = "e2e-health-analyzer-rules" | ||
| ) | ||
|
DavidRajnoha marked this conversation as resolved.
|
||
|
|
||
| func clusterHealthAnalyzer(t *testing.T) { | ||
| f.SkipIfClusterVersionBelow(t, "4.19") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (nit) a follow-up change could collect the cluster version before in the setup phase so that we don't have to query ClusterVersion multiple times (and avoid failures if the API is flaky). |
||
|
|
||
| err := monv1.AddToScheme(f.K8sClient.Scheme()) | ||
| assert.NilError(t, err, "failed to add monv1 to scheme") | ||
|
|
||
| plugin := resetMonitoringUIPlugin(t) | ||
| err = f.K8sClient.Create(t.Context(), plugin) | ||
| assert.NilError(t, err, "failed to create monitoring UIPlugin") | ||
|
|
||
| t.Cleanup(func() { | ||
| if t.Failed() { | ||
| dumpClusterHealthAnalyzerDebug(t, plugin.Name) | ||
| } | ||
| }) | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| t.Log("Waiting for health-analyzer deployment to become ready...") | ||
| haDeployment := appsv1.Deployment{} | ||
| f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment) | ||
|
DavidRajnoha marked this conversation as resolved.
|
||
| f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t) | ||
|
|
||
| suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this to ensure that we can run the test multiple times locally and not hit conflicts?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, to ensure that potential leftovers from prior tests would not cause any potential conflicts.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a comment about this? |
||
| ruleName := "e2e-health-analyzer-" + suffix | ||
| alertName := "E2EHealthAnalyzer" + suffix | ||
|
DavidRajnoha marked this conversation as resolved.
|
||
|
|
||
| createRuleNamespace(t, prometheusRuleNamespace) | ||
|
|
||
| rule := newAlwaysFiringRule(t, ruleName, alertName) | ||
| err = f.K8sClient.Create(t.Context(), rule) | ||
| assert.NilError(t, err, "failed to create PrometheusRule") | ||
|
|
||
| t.Log("Waiting for alert to fire in Prometheus...") | ||
| alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName) | ||
| err = f.AssertPromQLResultWithOptions(t, alertQuery, | ||
| func(v model.Value) error { | ||
| vec, ok := v.(model.Vector) | ||
| if !ok || len(vec) == 0 { | ||
| return fmt.Errorf("expected firing alert, got: %v", v) | ||
| } | ||
| return nil | ||
| }, | ||
| framework.WithPollInterval(30*time.Second), | ||
| framework.WithTimeout(10*time.Minute), | ||
| ) | ||
| assert.NilError(t, err, "alert %s never fired", alertName) | ||
|
|
||
| t.Log("Waiting for cluster-health-analyzer to expose incident metric...") | ||
| incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s",src_severity="warning"}`, alertName) | ||
| err = f.AssertPromQLResultWithOptions(t, incidentQuery, | ||
| func(v model.Value) error { | ||
| vec, ok := v.(model.Vector) | ||
| if !ok || len(vec) == 0 { | ||
| return fmt.Errorf("expected incident metric, got: %v", v) | ||
| } | ||
| return nil | ||
| }, | ||
| framework.WithPollInterval(30*time.Second), | ||
| framework.WithTimeout(15*time.Minute), | ||
| ) | ||
| assert.NilError(t, err, "incident metric for %s never appeared", alertName) | ||
| } | ||
|
|
||
| func resetMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin { | ||
| plugin := &uiv1.UIPlugin{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: "monitoring", | ||
| }, | ||
| Spec: uiv1.UIPluginSpec{ | ||
| Type: uiv1.TypeMonitoring, | ||
| Monitoring: &uiv1.MonitoringConfig{ | ||
| ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{ | ||
| Enabled: true, | ||
| }, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| deleteUIPluginIfExists(t, plugin.Name) | ||
|
|
||
| f.CleanUp(t, func() { | ||
| ctx := context.WithoutCancel(t.Context()) | ||
| if err := f.K8sClient.Delete(ctx, plugin); err != nil && !errors.IsNotFound(err) { | ||
| t.Logf("warning: failed to delete UIPlugin during cleanup: %v", err) | ||
| } | ||
| waitForUIPluginDeletion(plugin) | ||
| }) | ||
| return plugin | ||
| } | ||
|
|
||
| func deleteUIPluginIfExists(t *testing.T, name string) { | ||
| t.Helper() | ||
| plugin := &uiv1.UIPlugin{ | ||
| ObjectMeta: metav1.ObjectMeta{Name: name}, | ||
| } | ||
| err := f.K8sClient.Delete(t.Context(), plugin) | ||
| if err != nil { | ||
| if errors.IsNotFound(err) { | ||
| return | ||
| } | ||
| t.Fatalf("failed to delete existing UIPlugin: %v", err) | ||
| } | ||
| t.Log("UIPlugin already existed, waiting for deletion...") | ||
| waitForUIPluginDeletion(plugin) | ||
| } | ||
|
|
||
| func createRuleNamespace(t *testing.T, name string) { | ||
| t.Helper() | ||
| ns := &corev1.Namespace{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: name, | ||
| Labels: map[string]string{ | ||
| "openshift.io/cluster-monitoring": "true", | ||
| }, | ||
| }, | ||
| } | ||
| if err := f.K8sClient.Create(t.Context(), ns); err != nil && !errors.IsAlreadyExists(err) { | ||
| t.Fatalf("failed to create rule namespace %s: %v", name, err) | ||
| } | ||
| f.CleanUp(t, func() { | ||
| ctx := context.WithoutCancel(t.Context()) | ||
| f.K8sClient.Delete(ctx, ns) | ||
| }) | ||
| } | ||
|
|
||
| func newAlwaysFiringRule(t *testing.T, ruleName, alertName string) *monv1.PrometheusRule { | ||
| rule := &monv1.PrometheusRule{ | ||
| ObjectMeta: metav1.ObjectMeta{ | ||
| Name: ruleName, | ||
| Namespace: prometheusRuleNamespace, | ||
| }, | ||
| Spec: monv1.PrometheusRuleSpec{ | ||
| Groups: []monv1.RuleGroup{{ | ||
| Name: "health-analyzer-test-" + ruleName, | ||
| Rules: []monv1.Rule{{ | ||
| Alert: alertName, | ||
| Expr: intstr.FromString(`vector(1)`), | ||
| Labels: map[string]string{"severity": "warning"}, | ||
| Annotations: map[string]string{ | ||
| "summary": "E2E static test alert for cluster health analyzer.", | ||
| }, | ||
| }}, | ||
| }}, | ||
| }, | ||
| } | ||
| f.CleanUp(t, func() { | ||
| ctx := context.WithoutCancel(t.Context()) | ||
| if err := f.K8sClient.Delete(ctx, rule); err != nil && !errors.IsNotFound(err) { | ||
| t.Logf("warning: failed to delete PrometheusRule during cleanup: %v", err) | ||
| } | ||
| }) | ||
| return rule | ||
| } | ||
|
|
||
| func dumpClusterHealthAnalyzerDebug(t *testing.T, pluginName string) { | ||
|
DavidRajnoha marked this conversation as resolved.
|
||
| t.Helper() | ||
| ctx := context.WithoutCancel(t.Context()) | ||
|
|
||
| // UIPlugin-specific diagnostics | ||
| var plugin uiv1.UIPlugin | ||
| if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil { | ||
| t.Logf("Failed to get UIPlugin %q: %v", pluginName, err) | ||
| } else { | ||
| t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion) | ||
| t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type) | ||
| if plugin.Spec.Monitoring != nil { | ||
| if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil { | ||
| t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled) | ||
| } | ||
| if plugin.Spec.Monitoring.Incidents != nil { | ||
| t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled) | ||
| } | ||
| } | ||
| if len(plugin.Status.Conditions) == 0 { | ||
| t.Log("UIPlugin has no status conditions") | ||
| } | ||
| for _, c := range plugin.Status.Conditions { | ||
| t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message) | ||
| } | ||
| } | ||
|
|
||
| var plugins uiv1.UIPluginList | ||
| if err := f.K8sClient.List(ctx, &plugins); err != nil { | ||
| t.Logf("Failed to list UIPlugins: %v", err) | ||
| } else { | ||
| t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items)) | ||
| for _, p := range plugins.Items { | ||
| t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions)) | ||
| } | ||
| } | ||
|
|
||
| // Generic namespace diagnostics (deployments, pods, events) | ||
| f.DumpNamespaceDebug(t, uiPluginInstallNS) | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.