diff --git a/test/e2e/framework/assertions.go b/test/e2e/framework/assertions.go index da2e8f3fe..953d1e355 100644 --- a/test/e2e/framework/assertions.go +++ b/test/e2e/framework/assertions.go @@ -423,11 +423,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by // the callback function for additional checks. func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error { t.Helper() + return f.AssertPromQLResultWithOptions(t, expr, callback) +} + +// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts +// WithTimeout and WithPollInterval options to override the default polling +// parameters. +func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error { + t.Helper() + option := AssertOption{ + PollInterval: 20 * time.Second, + WaitTimeout: 3 * DefaultTestTimeout, + } + for _, fn := range fns { + fn(&option) + } var ( pollErr error v model.Value ) - if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) { + if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) { v, pollErr = f.getPromQLResult(context.Background(), expr) if pollErr != nil { t.Logf("error from getPromQLResult(): %s", pollErr) diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go index 83c378c08..89911e6fd 100644 --- a/test/e2e/framework/framework.go +++ b/test/e2e/framework/framework.go @@ -14,6 +14,7 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/pkg/errors" + "golang.org/x/mod/semver" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" @@ -25,6 +26,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/portforward" "k8s.io/client-go/transport/spdy" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -37,12 +39,14 @@ type Framework struct { RootCA *x509.CertPool MetricsClientCert *tls.Certificate OperatorNamespace string + ClusterVersion *configv1.ClusterVersion } // Setup finalizes the initilization of the Framework object by setting // parameters which are specific to OpenShift. func (f *Framework) Setup() error { clusterVersion := &configv1.ClusterVersion{} + if err := f.K8sClient.Get(context.Background(), client.ObjectKey{Name: "version"}, clusterVersion); err != nil { if meta.IsNoMatchError(err) { return nil @@ -51,6 +55,7 @@ func (f *Framework) Setup() error { return fmt.Errorf("failed to get clusterversion %w", err) } + f.ClusterVersion = clusterVersion f.IsOpenshiftCluster = true // Load the service CA operator's certificate authority. @@ -257,3 +262,140 @@ func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) { } }) } + +// TODO: remove ForceFailure — temporary helper to exercise DumpOnFailure logging. +func (f *Framework) ForceFailure(t *testing.T) { + t.Helper() + t.Error("forced failure to test debug dump output") +} + +// DebugFunc is a diagnostic function invoked when a test fails. +// Implementations should use t.Logf to emit relevant state. +type DebugFunc func(t *testing.T) + +// DumpOnFailure registers a t.Cleanup that runs the given debug functions +// when the test fails. It can be called multiple times to add more debug +// functions as resources become available during the test. +// +// Cleanups run in LIFO order, so call DumpOnFailure before registering +// resource deletions via CleanUp to ensure debug info is captured while +// the resources still exist. +func (f *Framework) DumpOnFailure(t *testing.T, fns ...DebugFunc) { + t.Helper() + t.Cleanup(func() { + if !t.Failed() { + return + } + for _, fn := range fns { + fn(t) + } + }) +} + +// DebugNamespace returns a DebugFunc that dumps deployments, pods, and events +// for the given namespaces. +func (f *Framework) DebugNamespace(namespaces ...string) DebugFunc { + return func(t *testing.T) { + t.Helper() + for _, ns := range namespaces { + t.Logf("--- Dumping debug info for namespace %s ---", ns) + f.DumpNamespaceDebug(t, ns) + } + } +} + +// SkipIfClusterVersionBelow skips the test if the cluster version is below +// minVersion. The minVersion string should be a semver-compatible version +// (e.g. "4.19" or "v4.19"). +func (f *Framework) SkipIfClusterVersionBelow(t *testing.T, minVersion string) { + t.Helper() + if f.ClusterVersion == nil { + t.Fatal("cluster version not available (non-OpenShift cluster?)") + return + } + + actual := f.ClusterVersion.Status.Desired.Version + if actual == "" { + t.Fatal("cluster version is empty") + return + } + t.Logf("Detected cluster version: %s", actual) + + if !strings.HasPrefix(actual, "v") { + actual = "v" + actual + } + if !strings.HasPrefix(minVersion, "v") { + minVersion = "v" + minVersion + } + + canonicalActual := fmt.Sprintf("%s-0", semver.Canonical(actual)) + canonicalMin := fmt.Sprintf("%s-0", semver.Canonical(minVersion)) + + if semver.Canonical(actual) == "" || semver.Canonical(minVersion) == "" { + t.Fatalf("Unable to parse version (actual=%q, min=%q)", actual, minVersion) + return + } + + if semver.Compare(canonicalActual, canonicalMin) < 0 { + t.Skipf("Skipping: cluster version %s is below minimum required %s", f.ClusterVersion.Status.Desired.Version, minVersion) + } +} + +// DumpNamespaceDebug logs deployments (with conditions), pods (with container +// statuses), and events for the given namespace. Useful as a t.Cleanup or +// on-failure diagnostic helper. +func (f *Framework) DumpNamespaceDebug(t *testing.T, namespace string) { + t.Helper() + ctx := context.WithoutCancel(t.Context()) + + t.Log("=== BEGIN DEBUG DUMP ===") + defer t.Log("=== END DEBUG DUMP ===") + + var deployments appsv1.DeploymentList + if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(namespace)); err != nil { + t.Logf("Failed to list deployments in %s: %v", namespace, err) + } else { + t.Logf("Deployments in namespace %s: %d", namespace, len(deployments.Items)) + for _, d := range deployments.Items { + t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d", + d.Name, ptr.Deref(d.Spec.Replicas, 0), d.Status.ReadyReplicas, d.Status.AvailableReplicas) + for _, c := range d.Status.Conditions { + t.Logf(" condition: type=%s status=%s reason=%s message=%s", + c.Type, c.Status, c.Reason, c.Message) + } + } + } + + var pods corev1.PodList + if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace)); err != nil { + t.Logf("Failed to list pods in %s: %v", namespace, err) + } else { + t.Logf("Pods in namespace %s: %d", namespace, len(pods.Items)) + for _, p := range pods.Items { + t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase) + for _, cs := range p.Status.ContainerStatuses { + switch { + case cs.State.Running != nil: + t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount) + case cs.State.Waiting != nil: + t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s", + cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message) + case cs.State.Terminated != nil: + t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d", + cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode) + } + } + } + } + + var events corev1.EventList + if err := f.K8sClient.List(ctx, &events, client.InNamespace(namespace)); err != nil { + t.Logf("Failed to list events in %s: %v", namespace, err) + } else { + t.Logf("Events in namespace %s: %d", namespace, len(events.Items)) + for _, e := range events.Items { + t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d", + e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count) + } + } +} diff --git a/test/e2e/monitoring_stack_controller_test.go b/test/e2e/monitoring_stack_controller_test.go index e1fe0cfb9..57f625e7c 100644 --- a/test/e2e/monitoring_stack_controller_test.go +++ b/test/e2e/monitoring_stack_controller_test.go @@ -47,6 +47,8 @@ func assertCRDExists(t *testing.T, crds ...string) { } func TestMonitoringStackController(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(e2eTestNamespace)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump err := stack.AddToScheme(scheme.Scheme) assert.NilError(t, err, "adding stack to scheme failed") assertCRDExists(t, diff --git a/test/e2e/observability_installer_test.go b/test/e2e/observability_installer_test.go index 66c21b5bb..491f7c036 100644 --- a/test/e2e/observability_installer_test.go +++ b/test/e2e/observability_installer_test.go @@ -45,6 +45,8 @@ func TestObservabilityInstallerController(t *testing.T) { } func testObservabilityInstallerTracing(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(f.OperatorNamespace, "tracing-observability")) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump ctx := context.Background() // The ObservabilityInstaller installs operators via subscriptions, diff --git a/test/e2e/operator_metrics_test.go b/test/e2e/operator_metrics_test.go index ff7f0e31c..711d20d05 100644 --- a/test/e2e/operator_metrics_test.go +++ b/test/e2e/operator_metrics_test.go @@ -11,6 +11,8 @@ import ( ) func TestOperatorMetrics(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(f.OperatorNamespace)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump t.Run("operator exposes metrics", func(t *testing.T) { pod := f.GetOperatorPod(t) diff --git a/test/e2e/po_admission_webhook_test.go b/test/e2e/po_admission_webhook_test.go index d10d32b23..c089fb2a7 100644 --- a/test/e2e/po_admission_webhook_test.go +++ b/test/e2e/po_admission_webhook_test.go @@ -12,6 +12,8 @@ import ( ) func TestPrometheusRuleWebhook(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(e2eTestNamespace)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump assertCRDExists(t, "prometheusrules.monitoring.rhobs", ) diff --git a/test/e2e/prometheus_operator_test.go b/test/e2e/prometheus_operator_test.go index 4ecda84ec..ca9a90c66 100644 --- a/test/e2e/prometheus_operator_test.go +++ b/test/e2e/prometheus_operator_test.go @@ -29,6 +29,8 @@ type testCase struct { } func TestPrometheusOperatorForNonOwnedResources(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(e2eTestNamespace)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump resources := []client.Object{ newPrometheus(nil), newAlertmanager(nil), @@ -74,6 +76,8 @@ func TestPrometheusOperatorForNonOwnedResources(t *testing.T) { } func TestPrometheusOperatorForOwnedResources(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(e2eTestNamespace)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump resources := []client.Object{ newPrometheus(ownedResourceLabels), newAlertmanager(ownedResourceLabels), diff --git a/test/e2e/thanos_querier_controller_test.go b/test/e2e/thanos_querier_controller_test.go index f69c7981a..b5d223307 100644 --- a/test/e2e/thanos_querier_controller_test.go +++ b/test/e2e/thanos_querier_controller_test.go @@ -23,6 +23,8 @@ import ( ) func TestThanosQuerierController(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(e2eTestNamespace)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump assertCRDExists(t, "thanosqueriers.monitoring.rhobs") ts := []testCase{ diff --git a/test/e2e/uiplugin_cluster_health_analyzer_test.go b/test/e2e/uiplugin_cluster_health_analyzer_test.go new file mode 100644 index 000000000..bb06e48d3 --- /dev/null +++ b/test/e2e/uiplugin_cluster_health_analyzer_test.go @@ -0,0 +1,219 @@ +package e2e + +import ( + "context" + "fmt" + "strconv" + "testing" + "time" + + monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/common/model" + "gotest.tools/v3/assert" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" + + uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1" + "github.com/rhobs/observability-operator/test/e2e/framework" +) + +const ( + healthAnalyzerDeploymentName = "health-analyzer" + prometheusRuleNamespace = "e2e-health-analyzer-rules" +) + +func clusterHealthAnalyzer(t *testing.T) { + f.SkipIfClusterVersionBelow(t, "4.19") + + err := monv1.AddToScheme(f.K8sClient.Scheme()) + assert.NilError(t, err, "failed to add monv1 to scheme") + + f.DumpOnFailure(t, f.DebugNamespace(uiPluginInstallNS)) + + plugin := resetMonitoringUIPlugin(t) + err = f.K8sClient.Create(t.Context(), plugin) + assert.NilError(t, err, "failed to create monitoring UIPlugin") + + f.DumpOnFailure(t, dumpUIPluginDebug(plugin.Name)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump + + t.Log("Waiting for health-analyzer deployment to become ready...") + haDeployment := appsv1.Deployment{} + f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment) + f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t) + + // Use a unique suffix so re-runs don't conflict with leftover rules from prior executions. + suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10) + ruleName := "e2e-health-analyzer-" + suffix + alertName := "E2EHealthAnalyzer" + suffix + + createRuleNamespace(t, prometheusRuleNamespace) + + rule := newAlwaysFiringRule(t, ruleName, alertName) + err = f.K8sClient.Create(t.Context(), rule) + assert.NilError(t, err, "failed to create PrometheusRule") + + t.Log("Waiting for alert to fire in Prometheus...") + alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName) + err = f.AssertPromQLResultWithOptions(t, alertQuery, + func(v model.Value) error { + vec, ok := v.(model.Vector) + if !ok || len(vec) == 0 { + return fmt.Errorf("expected firing alert, got: %v", v) + } + return nil + }, + framework.WithPollInterval(30*time.Second), + framework.WithTimeout(10*time.Minute), + ) + assert.NilError(t, err, "alert %s never fired", alertName) + + t.Log("Waiting for cluster-health-analyzer to expose incident metric...") + incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s",src_severity="warning"}`, alertName) + err = f.AssertPromQLResultWithOptions(t, incidentQuery, + func(v model.Value) error { + vec, ok := v.(model.Vector) + if !ok || len(vec) == 0 { + return fmt.Errorf("expected incident metric, got: %v", v) + } + return nil + }, + framework.WithPollInterval(30*time.Second), + framework.WithTimeout(15*time.Minute), + ) + assert.NilError(t, err, "incident metric for %s never appeared", alertName) +} + +func resetMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin { + plugin := &uiv1.UIPlugin{ + ObjectMeta: metav1.ObjectMeta{ + Name: "monitoring", + }, + Spec: uiv1.UIPluginSpec{ + Type: uiv1.TypeMonitoring, + Monitoring: &uiv1.MonitoringConfig{ + ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{ + Enabled: true, + }, + }, + }, + } + + deleteUIPluginIfExists(t, plugin.Name) + + f.CleanUp(t, func() { + ctx := context.WithoutCancel(t.Context()) + if err := f.K8sClient.Delete(ctx, plugin); err != nil && !errors.IsNotFound(err) { + t.Logf("warning: failed to delete UIPlugin during cleanup: %v", err) + } + waitForUIPluginDeletion(plugin) + }) + return plugin +} + +func deleteUIPluginIfExists(t *testing.T, name string) { + t.Helper() + plugin := &uiv1.UIPlugin{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + } + err := f.K8sClient.Delete(t.Context(), plugin) + if err != nil { + if errors.IsNotFound(err) { + return + } + t.Fatalf("failed to delete existing UIPlugin: %v", err) + } + t.Log("UIPlugin already existed, waiting for deletion...") + waitForUIPluginDeletion(plugin) +} + +func createRuleNamespace(t *testing.T, name string) { + t.Helper() + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + "openshift.io/cluster-monitoring": "true", + }, + }, + } + if err := f.K8sClient.Create(t.Context(), ns); err != nil && !errors.IsAlreadyExists(err) { + t.Fatalf("failed to create rule namespace %s: %v", name, err) + } + f.CleanUp(t, func() { + ctx := context.WithoutCancel(t.Context()) + f.K8sClient.Delete(ctx, ns) + }) +} + +func newAlwaysFiringRule(t *testing.T, ruleName, alertName string) *monv1.PrometheusRule { + rule := &monv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: ruleName, + Namespace: prometheusRuleNamespace, + }, + Spec: monv1.PrometheusRuleSpec{ + Groups: []monv1.RuleGroup{{ + Name: "health-analyzer-test-" + ruleName, + Rules: []monv1.Rule{{ + Alert: alertName, + Expr: intstr.FromString(`vector(1)`), + Labels: map[string]string{"severity": "warning"}, + Annotations: map[string]string{ + "summary": "E2E static test alert for cluster health analyzer.", + }, + }}, + }}, + }, + } + f.CleanUp(t, func() { + ctx := context.WithoutCancel(t.Context()) + if err := f.K8sClient.Delete(ctx, rule); err != nil && !errors.IsNotFound(err) { + t.Logf("warning: failed to delete PrometheusRule during cleanup: %v", err) + } + }) + return rule +} + +func dumpUIPluginDebug(pluginName string) framework.DebugFunc { + return func(t *testing.T) { + t.Helper() + ctx := context.WithoutCancel(t.Context()) + + var plugin uiv1.UIPlugin + if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil { + t.Logf("Failed to get UIPlugin %q: %v", pluginName, err) + return + } + t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion) + t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type) + if plugin.Spec.Monitoring != nil { + if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil { + t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled) + } + if plugin.Spec.Monitoring.Incidents != nil { + t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled) + } + } + if len(plugin.Status.Conditions) == 0 { + t.Log("UIPlugin has no status conditions") + } + for _, c := range plugin.Status.Conditions { + t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message) + } + + var plugins uiv1.UIPluginList + if err := f.K8sClient.List(ctx, &plugins); err != nil { + t.Logf("Failed to list UIPlugins: %v", err) + } else { + t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items)) + for _, p := range plugins.Items { + t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions)) + } + } + } +} diff --git a/test/e2e/uiplugin_test.go b/test/e2e/uiplugin_test.go index b6c05094b..d4ef2069c 100644 --- a/test/e2e/uiplugin_test.go +++ b/test/e2e/uiplugin_test.go @@ -34,6 +34,10 @@ func TestUIPlugin(t *testing.T) { name: "Create dashboards UIPlugin", scenario: dashboardsUIPlugin, }, + { + name: "Cluster health analyzer", + scenario: clusterHealthAnalyzer, + }, } for _, tc := range ts { @@ -42,6 +46,8 @@ func TestUIPlugin(t *testing.T) { } func dashboardsUIPlugin(t *testing.T) { + f.DumpOnFailure(t, f.DebugNamespace(uiPluginInstallNS)) + f.ForceFailure(t) // TODO: remove — temporary, exercises debug dump db := newDashboardsUIPlugin(t) err := f.K8sClient.Create(context.Background(), db) assert.NilError(t, err, "failed to create a dashboards UIPlugin") @@ -63,13 +69,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin { } f.CleanUp(t, func() { f.K8sClient.Delete(context.Background(), db) - waitForDBUIPluginDeletion(db) + waitForUIPluginDeletion(db) }) return db } -func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error { +func waitForUIPluginDeletion(db *uiv1.UIPlugin) error { return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) { err = f.K8sClient.Get(context.Background(), client.ObjectKey{Name: db.Name},