Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion test/e2e/framework/assertions.go
Original file line number Diff line number Diff line change
Expand Up @@ -423,11 +423,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
// the callback function for additional checks.
func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
t.Helper()
return f.AssertPromQLResultWithOptions(t, expr, callback)
}

// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
// WithTimeout and WithPollInterval options to override the default polling
// parameters.
func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
t.Helper()
option := AssertOption{
PollInterval: 20 * time.Second,
WaitTimeout: 3 * DefaultTestTimeout,
}
for _, fn := range fns {
fn(&option)
}
var (
pollErr error
v model.Value
)
if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
v, pollErr = f.getPromQLResult(context.Background(), expr)
if pollErr != nil {
t.Logf("error from getPromQLResult(): %s", pollErr)
Expand Down
101 changes: 101 additions & 0 deletions test/e2e/framework/framework.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
"github.com/pkg/errors"
"golang.org/x/mod/semver"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
Expand All @@ -25,6 +26,7 @@ import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/portforward"
"k8s.io/client-go/transport/spdy"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand All @@ -37,12 +39,14 @@ type Framework struct {
RootCA *x509.CertPool
MetricsClientCert *tls.Certificate
OperatorNamespace string
ClusterVersion *configv1.ClusterVersion
}

// Setup finalizes the initilization of the Framework object by setting
// parameters which are specific to OpenShift.
func (f *Framework) Setup() error {
clusterVersion := &configv1.ClusterVersion{}

if err := f.K8sClient.Get(context.Background(), client.ObjectKey{Name: "version"}, clusterVersion); err != nil {
if meta.IsNoMatchError(err) {
return nil
Expand All @@ -51,6 +55,7 @@ func (f *Framework) Setup() error {
return fmt.Errorf("failed to get clusterversion %w", err)
}

f.ClusterVersion = clusterVersion
f.IsOpenshiftCluster = true

// Load the service CA operator's certificate authority.
Expand Down Expand Up @@ -257,3 +262,99 @@ func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) {
}
})
}

// SkipIfClusterVersionBelow skips the test if the cluster version is below
// minVersion. The minVersion string should be a semver-compatible version
// (e.g. "4.19" or "v4.19").
func (f *Framework) SkipIfClusterVersionBelow(t *testing.T, minVersion string) {
t.Helper()
if f.ClusterVersion == nil {
t.Fatal("cluster version not available (non-OpenShift cluster?)")
return
}

actual := f.ClusterVersion.Status.Desired.Version
if actual == "" {
t.Fatal("cluster version is empty")
return
}
t.Logf("Detected cluster version: %s", actual)

if !strings.HasPrefix(actual, "v") {
actual = "v" + actual
}
if !strings.HasPrefix(minVersion, "v") {
minVersion = "v" + minVersion
}

canonicalActual := fmt.Sprintf("%s-0", semver.Canonical(actual))
canonicalMin := fmt.Sprintf("%s-0", semver.Canonical(minVersion))

if semver.Canonical(actual) == "" || semver.Canonical(minVersion) == "" {
t.Fatalf("Unable to parse version (actual=%q, min=%q)", actual, minVersion)
return
}

if semver.Compare(canonicalActual, canonicalMin) < 0 {
t.Skipf("Skipping: cluster version %s is below minimum required %s", f.ClusterVersion.Status.Desired.Version, minVersion)
}
}

// DumpNamespaceDebug logs deployments (with conditions), pods (with container
// statuses), and events for the given namespace. Useful as a t.Cleanup or
// on-failure diagnostic helper.
func (f *Framework) DumpNamespaceDebug(t *testing.T, namespace string) {
t.Helper()
ctx := context.WithoutCancel(t.Context())

t.Log("=== BEGIN DEBUG DUMP ===")
defer t.Log("=== END DEBUG DUMP ===")

var deployments appsv1.DeploymentList
if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(namespace)); err != nil {
t.Logf("Failed to list deployments in %s: %v", namespace, err)
} else {
t.Logf("Deployments in namespace %s: %d", namespace, len(deployments.Items))
for _, d := range deployments.Items {
t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d",
d.Name, ptr.Deref(d.Spec.Replicas, 0), d.Status.ReadyReplicas, d.Status.AvailableReplicas)
for _, c := range d.Status.Conditions {
t.Logf(" condition: type=%s status=%s reason=%s message=%s",
c.Type, c.Status, c.Reason, c.Message)
}
}
}

var pods corev1.PodList
if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace)); err != nil {
t.Logf("Failed to list pods in %s: %v", namespace, err)
} else {
t.Logf("Pods in namespace %s: %d", namespace, len(pods.Items))
for _, p := range pods.Items {
t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase)
for _, cs := range p.Status.ContainerStatuses {
switch {
case cs.State.Running != nil:
t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount)
case cs.State.Waiting != nil:
t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s",
cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message)
case cs.State.Terminated != nil:
t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d",
cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
}
}
}
}

var events corev1.EventList
if err := f.K8sClient.List(ctx, &events, client.InNamespace(namespace)); err != nil {
t.Logf("Failed to list events in %s: %v", namespace, err)
} else {
t.Logf("Events in namespace %s: %d", namespace, len(events.Items))
for _, e := range events.Items {
t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d",
e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count)
}
}
}
222 changes: 222 additions & 0 deletions test/e2e/uiplugin_cluster_health_analyzer_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
package e2e

import (
"context"
"fmt"
"strconv"
"testing"
"time"

monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/prometheus/common/model"
"gotest.tools/v3/assert"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/controller-runtime/pkg/client"

uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
"github.com/rhobs/observability-operator/test/e2e/framework"
)

const (
healthAnalyzerDeploymentName = "health-analyzer"
prometheusRuleNamespace = "e2e-health-analyzer-rules"
)

func clusterHealthAnalyzer(t *testing.T) {
f.SkipIfClusterVersionBelow(t, "4.19")

err := monv1.AddToScheme(f.K8sClient.Scheme())
assert.NilError(t, err, "failed to add monv1 to scheme")

plugin := resetMonitoringUIPlugin(t)
err = f.K8sClient.Create(t.Context(), plugin)
assert.NilError(t, err, "failed to create monitoring UIPlugin")

t.Cleanup(func() {
if t.Failed() {
dumpClusterHealthAnalyzerDebug(t, plugin.Name)
}
})

t.Log("Waiting for health-analyzer deployment to become ready...")
haDeployment := appsv1.Deployment{}
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)

// Use a unique suffix so re-runs don't conflict with leftover rules from prior executions.
suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
ruleName := "e2e-health-analyzer-" + suffix
alertName := "E2EHealthAnalyzer" + suffix

createRuleNamespace(t, prometheusRuleNamespace)

rule := newAlwaysFiringRule(t, ruleName, alertName)
err = f.K8sClient.Create(t.Context(), rule)
assert.NilError(t, err, "failed to create PrometheusRule")

t.Log("Waiting for alert to fire in Prometheus...")
alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
err = f.AssertPromQLResultWithOptions(t, alertQuery,
func(v model.Value) error {
vec, ok := v.(model.Vector)
if !ok || len(vec) == 0 {
return fmt.Errorf("expected firing alert, got: %v", v)
}
return nil
},
framework.WithPollInterval(30*time.Second),
framework.WithTimeout(10*time.Minute),
)
assert.NilError(t, err, "alert %s never fired", alertName)

t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s",src_severity="warning"}`, alertName)
err = f.AssertPromQLResultWithOptions(t, incidentQuery,
func(v model.Value) error {
vec, ok := v.(model.Vector)
if !ok || len(vec) == 0 {
return fmt.Errorf("expected incident metric, got: %v", v)
}
return nil
},
framework.WithPollInterval(30*time.Second),
framework.WithTimeout(15*time.Minute),
)
assert.NilError(t, err, "incident metric for %s never appeared", alertName)
}

func resetMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
plugin := &uiv1.UIPlugin{
ObjectMeta: metav1.ObjectMeta{
Name: "monitoring",
},
Spec: uiv1.UIPluginSpec{
Type: uiv1.TypeMonitoring,
Monitoring: &uiv1.MonitoringConfig{
ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
Enabled: true,
},
},
},
}

deleteUIPluginIfExists(t, plugin.Name)

f.CleanUp(t, func() {
ctx := context.WithoutCancel(t.Context())
if err := f.K8sClient.Delete(ctx, plugin); err != nil && !errors.IsNotFound(err) {
t.Logf("warning: failed to delete UIPlugin during cleanup: %v", err)
}
waitForUIPluginDeletion(plugin)
})
return plugin
}

func deleteUIPluginIfExists(t *testing.T, name string) {
t.Helper()
plugin := &uiv1.UIPlugin{
ObjectMeta: metav1.ObjectMeta{Name: name},
}
err := f.K8sClient.Delete(t.Context(), plugin)
if err != nil {
if errors.IsNotFound(err) {
return
}
t.Fatalf("failed to delete existing UIPlugin: %v", err)
}
t.Log("UIPlugin already existed, waiting for deletion...")
waitForUIPluginDeletion(plugin)
}

func createRuleNamespace(t *testing.T, name string) {
t.Helper()
ns := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
"openshift.io/cluster-monitoring": "true",
},
},
}
if err := f.K8sClient.Create(t.Context(), ns); err != nil && !errors.IsAlreadyExists(err) {
t.Fatalf("failed to create rule namespace %s: %v", name, err)
}
f.CleanUp(t, func() {
ctx := context.WithoutCancel(t.Context())
f.K8sClient.Delete(ctx, ns)
})
}

func newAlwaysFiringRule(t *testing.T, ruleName, alertName string) *monv1.PrometheusRule {
rule := &monv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Name: ruleName,
Namespace: prometheusRuleNamespace,
},
Spec: monv1.PrometheusRuleSpec{
Groups: []monv1.RuleGroup{{
Name: "health-analyzer-test-" + ruleName,
Rules: []monv1.Rule{{
Alert: alertName,
Expr: intstr.FromString(`vector(1)`),
Labels: map[string]string{"severity": "warning"},
Annotations: map[string]string{
"summary": "E2E static test alert for cluster health analyzer.",
},
}},
}},
},
}
f.CleanUp(t, func() {
ctx := context.WithoutCancel(t.Context())
if err := f.K8sClient.Delete(ctx, rule); err != nil && !errors.IsNotFound(err) {
t.Logf("warning: failed to delete PrometheusRule during cleanup: %v", err)
}
})
return rule
}

func dumpClusterHealthAnalyzerDebug(t *testing.T, pluginName string) {
t.Helper()
ctx := context.WithoutCancel(t.Context())

// UIPlugin-specific diagnostics
var plugin uiv1.UIPlugin
if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil {
t.Logf("Failed to get UIPlugin %q: %v", pluginName, err)
} else {
t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion)
t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type)
if plugin.Spec.Monitoring != nil {
if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil {
t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled)
}
if plugin.Spec.Monitoring.Incidents != nil {
t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled)
}
}
if len(plugin.Status.Conditions) == 0 {
t.Log("UIPlugin has no status conditions")
}
for _, c := range plugin.Status.Conditions {
t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message)
}
}

var plugins uiv1.UIPluginList
if err := f.K8sClient.List(ctx, &plugins); err != nil {
t.Logf("Failed to list UIPlugins: %v", err)
} else {
t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items))
for _, p := range plugins.Items {
t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions))
}
}

// Generic namespace diagnostics (deployments, pods, events)
f.DumpNamespaceDebug(t, uiPluginInstallNS)
}
Loading
Loading