Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion test/e2e/framework/assertions.go
Original file line number Diff line number Diff line change
Expand Up @@ -423,11 +423,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
// the callback function for additional checks.
func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
t.Helper()
return f.AssertPromQLResultWithOptions(t, expr, callback)
Comment thread
DavidRajnoha marked this conversation as resolved.
}

// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
// WithTimeout and WithPollInterval options to override the default polling
// parameters.
func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
t.Helper()
option := AssertOption{
PollInterval: 20 * time.Second,
WaitTimeout: 3 * DefaultTestTimeout,
}
for _, fn := range fns {
fn(&option)
}
var (
pollErr error
v model.Value
)
if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
v, pollErr = f.getPromQLResult(context.Background(), expr)
if pollErr != nil {
t.Logf("error from getPromQLResult(): %s", pollErr)
Expand Down
100 changes: 100 additions & 0 deletions test/e2e/framework/framework.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
"github.com/pkg/errors"
"golang.org/x/mod/semver"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
Expand All @@ -25,6 +26,7 @@ import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/portforward"
"k8s.io/client-go/transport/spdy"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand Down Expand Up @@ -257,3 +259,101 @@ func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) {
}
})
}

// SkipIfClusterVersionBelow skips the test if the cluster version is below
// minVersion. The minVersion string should be a semver-compatible version
// (e.g. "4.19" or "v4.19").
func (f *Framework) SkipIfClusterVersionBelow(t *testing.T, minVersion string) {
Comment thread
DavidRajnoha marked this conversation as resolved.
t.Helper()
cv := &configv1.ClusterVersion{}
err := f.K8sClient.Get(t.Context(), client.ObjectKey{Name: "version"}, cv)
if err != nil {
t.Fatalf("failed to determine cluster version: %v", err)
return
}

actual := cv.Status.Desired.Version
if actual == "" {
t.Fatal("cluster version is empty")
return
}
t.Logf("Detected cluster version: %s", actual)

if !strings.HasPrefix(actual, "v") {
actual = "v" + actual
}
if !strings.HasPrefix(minVersion, "v") {
minVersion = "v" + minVersion
}

canonicalActual := fmt.Sprintf("%s-0", semver.Canonical(actual))
canonicalMin := fmt.Sprintf("%s-0", semver.Canonical(minVersion))

if semver.Canonical(actual) == "" || semver.Canonical(minVersion) == "" {
t.Fatalf("Unable to parse version (actual=%q, min=%q)", actual, minVersion)
return
}

if semver.Compare(canonicalActual, canonicalMin) < 0 {
t.Skipf("Skipping: cluster version %s is below minimum required %s", cv.Status.Desired.Version, minVersion)
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

// DumpNamespaceDebug logs deployments (with conditions), pods (with container
// statuses), and events for the given namespace. Useful as a t.Cleanup or
// on-failure diagnostic helper.
func (f *Framework) DumpNamespaceDebug(t *testing.T, namespace string) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like if we have a follow-up Jira ticket or PR to generalize the gathering of post-test data to help with troubleshooting.

t.Helper()
ctx := context.WithoutCancel(t.Context())

t.Log("=== BEGIN DEBUG DUMP ===")
defer t.Log("=== END DEBUG DUMP ===")

var deployments appsv1.DeploymentList
if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(namespace)); err != nil {
t.Logf("Failed to list deployments in %s: %v", namespace, err)
} else {
t.Logf("Deployments in namespace %s: %d", namespace, len(deployments.Items))
for _, d := range deployments.Items {
t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d",
d.Name, ptr.Deref(d.Spec.Replicas, 0), d.Status.ReadyReplicas, d.Status.AvailableReplicas)
for _, c := range d.Status.Conditions {
t.Logf(" condition: type=%s status=%s reason=%s message=%s",
c.Type, c.Status, c.Reason, c.Message)
}
}
}

var pods corev1.PodList
if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace)); err != nil {
t.Logf("Failed to list pods in %s: %v", namespace, err)
} else {
t.Logf("Pods in namespace %s: %d", namespace, len(pods.Items))
for _, p := range pods.Items {
t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase)
for _, cs := range p.Status.ContainerStatuses {
switch {
case cs.State.Running != nil:
t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount)
case cs.State.Waiting != nil:
t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s",
cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message)
case cs.State.Terminated != nil:
t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d",
cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
}
}
}
}

var events corev1.EventList
if err := f.K8sClient.List(ctx, &events, client.InNamespace(namespace)); err != nil {
t.Logf("Failed to list events in %s: %v", namespace, err)
} else {
t.Logf("Events in namespace %s: %d", namespace, len(events.Items))
for _, e := range events.Items {
t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d",
e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count)
}
}
}
221 changes: 221 additions & 0 deletions test/e2e/uiplugin_cluster_health_analyzer_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
package e2e

import (
"context"
"fmt"
"strconv"
"testing"
"time"

monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/prometheus/common/model"
"gotest.tools/v3/assert"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/controller-runtime/pkg/client"

uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
"github.com/rhobs/observability-operator/test/e2e/framework"
)

const (
healthAnalyzerDeploymentName = "health-analyzer"
prometheusRuleNamespace = "e2e-health-analyzer-rules"
)
Comment thread
DavidRajnoha marked this conversation as resolved.

func clusterHealthAnalyzer(t *testing.T) {
f.SkipIfClusterVersionBelow(t, "4.19")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(nit) a follow-up change could collect the cluster version before in the setup phase so that we don't have to query ClusterVersion multiple times (and avoid failures if the API is flaky).


err := monv1.AddToScheme(f.K8sClient.Scheme())
assert.NilError(t, err, "failed to add monv1 to scheme")

plugin := resetMonitoringUIPlugin(t)
err = f.K8sClient.Create(t.Context(), plugin)
assert.NilError(t, err, "failed to create monitoring UIPlugin")

t.Cleanup(func() {
if t.Failed() {
dumpClusterHealthAnalyzerDebug(t, plugin.Name)
}
})
Comment thread
coderabbitai[bot] marked this conversation as resolved.

t.Log("Waiting for health-analyzer deployment to become ready...")
haDeployment := appsv1.Deployment{}
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
Comment thread
DavidRajnoha marked this conversation as resolved.
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)

suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this to ensure that we can run the test multiple times locally and not hit conflicts?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, to ensure that potential leftovers from prior tests would not cause any potential conflicts.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment about this?

ruleName := "e2e-health-analyzer-" + suffix
alertName := "E2EHealthAnalyzer" + suffix
Comment thread
DavidRajnoha marked this conversation as resolved.

createRuleNamespace(t, prometheusRuleNamespace)

rule := newAlwaysFiringRule(t, ruleName, alertName)
err = f.K8sClient.Create(t.Context(), rule)
assert.NilError(t, err, "failed to create PrometheusRule")

t.Log("Waiting for alert to fire in Prometheus...")
alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
err = f.AssertPromQLResultWithOptions(t, alertQuery,
func(v model.Value) error {
vec, ok := v.(model.Vector)
if !ok || len(vec) == 0 {
return fmt.Errorf("expected firing alert, got: %v", v)
}
return nil
},
framework.WithPollInterval(30*time.Second),
framework.WithTimeout(10*time.Minute),
)
assert.NilError(t, err, "alert %s never fired", alertName)

t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s",src_severity="warning"}`, alertName)
err = f.AssertPromQLResultWithOptions(t, incidentQuery,
func(v model.Value) error {
vec, ok := v.(model.Vector)
if !ok || len(vec) == 0 {
return fmt.Errorf("expected incident metric, got: %v", v)
}
return nil
},
framework.WithPollInterval(30*time.Second),
framework.WithTimeout(15*time.Minute),
)
assert.NilError(t, err, "incident metric for %s never appeared", alertName)
}

func resetMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
plugin := &uiv1.UIPlugin{
ObjectMeta: metav1.ObjectMeta{
Name: "monitoring",
},
Spec: uiv1.UIPluginSpec{
Type: uiv1.TypeMonitoring,
Monitoring: &uiv1.MonitoringConfig{
ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
Enabled: true,
},
},
},
}

deleteUIPluginIfExists(t, plugin.Name)

f.CleanUp(t, func() {
ctx := context.WithoutCancel(t.Context())
if err := f.K8sClient.Delete(ctx, plugin); err != nil && !errors.IsNotFound(err) {
t.Logf("warning: failed to delete UIPlugin during cleanup: %v", err)
}
waitForUIPluginDeletion(plugin)
})
return plugin
}

func deleteUIPluginIfExists(t *testing.T, name string) {
t.Helper()
plugin := &uiv1.UIPlugin{
ObjectMeta: metav1.ObjectMeta{Name: name},
}
err := f.K8sClient.Delete(t.Context(), plugin)
if err != nil {
if errors.IsNotFound(err) {
return
}
t.Fatalf("failed to delete existing UIPlugin: %v", err)
}
t.Log("UIPlugin already existed, waiting for deletion...")
waitForUIPluginDeletion(plugin)
}

func createRuleNamespace(t *testing.T, name string) {
t.Helper()
ns := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
"openshift.io/cluster-monitoring": "true",
},
},
}
if err := f.K8sClient.Create(t.Context(), ns); err != nil && !errors.IsAlreadyExists(err) {
t.Fatalf("failed to create rule namespace %s: %v", name, err)
}
f.CleanUp(t, func() {
ctx := context.WithoutCancel(t.Context())
f.K8sClient.Delete(ctx, ns)
})
}

func newAlwaysFiringRule(t *testing.T, ruleName, alertName string) *monv1.PrometheusRule {
rule := &monv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Name: ruleName,
Namespace: prometheusRuleNamespace,
},
Spec: monv1.PrometheusRuleSpec{
Groups: []monv1.RuleGroup{{
Name: "health-analyzer-test-" + ruleName,
Rules: []monv1.Rule{{
Alert: alertName,
Expr: intstr.FromString(`vector(1)`),
Labels: map[string]string{"severity": "warning"},
Annotations: map[string]string{
"summary": "E2E static test alert for cluster health analyzer.",
},
}},
}},
},
}
f.CleanUp(t, func() {
ctx := context.WithoutCancel(t.Context())
if err := f.K8sClient.Delete(ctx, rule); err != nil && !errors.IsNotFound(err) {
t.Logf("warning: failed to delete PrometheusRule during cleanup: %v", err)
}
})
return rule
}

func dumpClusterHealthAnalyzerDebug(t *testing.T, pluginName string) {
Comment thread
DavidRajnoha marked this conversation as resolved.
t.Helper()
ctx := context.WithoutCancel(t.Context())

// UIPlugin-specific diagnostics
var plugin uiv1.UIPlugin
if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil {
t.Logf("Failed to get UIPlugin %q: %v", pluginName, err)
} else {
t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion)
t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type)
if plugin.Spec.Monitoring != nil {
if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil {
t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled)
}
if plugin.Spec.Monitoring.Incidents != nil {
t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled)
}
}
if len(plugin.Status.Conditions) == 0 {
t.Log("UIPlugin has no status conditions")
}
for _, c := range plugin.Status.Conditions {
t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message)
}
}

var plugins uiv1.UIPluginList
if err := f.K8sClient.List(ctx, &plugins); err != nil {
t.Logf("Failed to list UIPlugins: %v", err)
} else {
t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items))
for _, p := range plugins.Items {
t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions))
}
}

// Generic namespace diagnostics (deployments, pods, events)
f.DumpNamespaceDebug(t, uiPluginInstallNS)
}
8 changes: 6 additions & 2 deletions test/e2e/uiplugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ func TestUIPlugin(t *testing.T) {
name: "Create dashboards UIPlugin",
scenario: dashboardsUIPlugin,
},
{
name: "Cluster health analyzer",
scenario: clusterHealthAnalyzer,
},
}

for _, tc := range ts {
Expand Down Expand Up @@ -63,13 +67,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin {
}
f.CleanUp(t, func() {
f.K8sClient.Delete(context.Background(), db)
waitForDBUIPluginDeletion(db)
waitForUIPluginDeletion(db)
})

return db
}

func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error {
func waitForUIPluginDeletion(db *uiv1.UIPlugin) error {
return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
err = f.K8sClient.Get(context.Background(),
client.ObjectKey{Name: db.Name},
Expand Down
Loading