diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index e7270a20a7..35f6029ef8 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -51,8 +51,114 @@ const ( // The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM // killer, manual kill) can also produce this code. exitCodeSIGKILL = 137 + + // Container waiting reason strings. These are set by the container runtime and + // kubelet; Kubernetes does not define constants for them. + waitingCrashLoopBackOff = "CrashLoopBackOff" + waitingImagePullBackOff = "ImagePullBackOff" + waitingErrImagePull = "ErrImagePull" +) + +type podIssueSeverity int + +const ( + severityProgressing podIssueSeverity = iota + severityFailing +) + +// podIssueType defines the category of a pod issue. The enum order determines display +// priority when issues compete for the 3-slot cap in summarizeIssues. +type podIssueType int + +const ( + issueCrashLoopBackOff podIssueType = iota + issueImagePull + issueTerminated + issuePodFailed + issueNotReady + issuePending ) +// podIssue represents a single diagnosed problem with a pod. +type podIssue struct { + severity podIssueSeverity + issueType podIssueType + message string + + // isOldRevision is true if the pod belongs to an old revision during a rollout. + isOldRevision bool + + // terminationReason and exitCode provide dedup granularity within an issue type. + terminationReason string + exitCode int32 +} + +// key returns a deduplication key for this issue. Issues with the same key are grouped +// together in summarizeIssues, so that e.g. 10 pods all OOMKilled produce one message +// with "(10 pods affected)" rather than 10 separate messages. +func (p podIssue) key() string { + return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) +} + +// maxIssuesPerWorkload is the maximum number of distinct issue types reported per workload. +// Beyond this cap, additional issues are dropped to keep messages concise. +const maxIssuesPerWorkload = 3 + +// summarizeIssues deduplicates, prioritizes, and caps a list of pod issues into +// human-readable failing and progressing message slices. +func summarizeIssues(issues []podIssue) (failing []string, progressing []string) { + if len(issues) == 0 { + return nil, nil + } + + // Sort: new-revision first, then by issue type priority (enum order). + sort.SliceStable(issues, func(i, j int) bool { + if issues[i].isOldRevision != issues[j].isOldRevision { + return !issues[i].isOldRevision + } + return issues[i].issueType < issues[j].issueType + }) + + // Group by key. First occurrence provides the message; count tracks duplicates. + type issueGroup struct { + issue podIssue + count int + } + seen := map[string]int{} + var groups []issueGroup + for _, iss := range issues { + k := iss.key() + if idx, ok := seen[k]; ok { + groups[idx].count++ + continue + } + seen[k] = len(groups) + groups = append(groups, issueGroup{issue: iss, count: 1}) + } + + // Cap at maxIssuesPerWorkload unique reasons. + if len(groups) > maxIssuesPerWorkload { + groups = groups[:maxIssuesPerWorkload] + } + + // Format messages. 
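+	// Each surviving group becomes a single message: duplicates collapse into one line with a
+	// pod count, old-revision issues get a suffix so rollout noise is distinguishable, and the
+	// issue's severity decides whether the line lands in the failing or progressing slice.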
+ for _, g := range groups { + msg := g.issue.message + if g.count > 1 { + msg += fmt.Sprintf(" (%d pods affected)", g.count) + } + if g.issue.isOldRevision { + msg += " (old revision)" + } + if g.issue.severity == severityFailing { + failing = append(failing, msg) + } else { + progressing = append(progressing, msg) + } + } + return failing, progressing +} + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: @@ -498,7 +604,11 @@ func (m *statusManager) syncState() { ds := &appsv1.DaemonSet{} err := m.client.Get(context.TODO(), dsnn, ds) if err != nil { - log.WithValues("reason", err).Info("Failed to query daemonset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("DaemonSet %q not found", dsnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query daemonset") + } continue } if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled { @@ -521,22 +631,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the daemonset are failing. - if f, err := m.podsFailing(ds.Spec.Selector, ds.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "daemonset", dsnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDaemonSetRevision(ds) + issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.deployments { dep := &appsv1.Deployment{} err := m.client.Get(context.TODO(), depnn, dep) if err != nil { - log.WithValues("reason", err).Info("Failed to query deployment") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("Deployment %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query deployment") + } continue } if dep.Status.UnavailableReplicas > 0 { @@ -561,22 +671,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(dep.Spec.Selector, dep.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "deployment", depnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDeploymentRevision(dep) + issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.statefulsets { ss := &appsv1.StatefulSet{} err := m.client.Get(context.TODO(), depnn, ss) if err != nil { - log.WithValues("reason", err).Info("Failed to query statefulset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("StatefulSet %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query statefulset") + } continue } if *ss.Spec.Replicas != ss.Status.CurrentReplicas { @@ -599,21 +709,20 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. 
- if f, err := m.podsFailing(ss.Spec.Selector, ss.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "statefuleset", depnn).Info("Failed to check for failing pods") - continue - } + issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.cronjobs { cj := &batchv1.CronJob{} if err := m.client.Get(context.TODO(), depnn, cj); err != nil { - log.WithValues("reason", err).Info("Failed to query cronjobs") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("CronJob %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query cronjobs") + } continue } @@ -676,9 +785,10 @@ func (m *statusManager) removeTigeraStatus() { } } -// podsFailing takes a selector and returns if any of the pods that match it are failing. Failing pods are defined -// to be in CrashLoopBackOff state. -func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace string) (string, error) { +// diagnosePods lists pods matching the selector and returns a podIssue for each +// unhealthy pod found. If currentRevision is non-empty, pods not matching that +// revision are marked as old-revision. +func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string) []podIssue { l := corev1.PodList{} s, err := metav1.LabelSelectorAsMap(selector) if err != nil { @@ -686,62 +796,190 @@ func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace st } err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace)) if err != nil { - return "", err + log.WithValues("reason", err).Info("Failed to list pods for diagnosis") + return nil } + + var issues []podIssue for _, p := range l.Items { + oldRevision := currentRevision != "" && !podMatchesRevision(p, currentRevision) + if p.Status.Phase == corev1.PodFailed { - return fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), nil + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issuePodFailed, + message: fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + continue } - for _, c := range p.Status.InitContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } + + // Check init and regular container statuses for errors. + if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision); len(iss) > 0 { + issues = append(issues, iss...) + continue } - for _, c := range p.Status.ContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } + if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision); len(iss) > 0 { + issues = append(issues, iss...) + continue } - // If none of the container-level checks matched, check if the pod is running but - // not passing readiness checks. + // Running but not passing readiness checks. 
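+		// A pod can report Running while one or more containers never pass their readiness
+		// probes; report that as a failing condition rather than a progressing one.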
if p.Status.Phase == corev1.PodRunning { for _, cond := range p.Status.Conditions { if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse { - return fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name), nil + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueNotReady, + message: fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + break + } + } + continue + } + + // Pending pod - check for scheduling failures. + if p.Status.Phase == corev1.PodPending { + msg := fmt.Sprintf("Pod %s/%s is pending", p.Namespace, p.Name) + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { + if cond.Message != "" { + msg += fmt.Sprintf(": %s", cond.Message) + } + break } } + issues = append(issues, podIssue{ + severity: severityProgressing, + issueType: issuePending, + message: msg, + isOldRevision: oldRevision, + }) } } - return "", nil + return issues } -func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerStatus) string { - if c.State.Waiting != nil { - // Check well-known error states here and report an appropriate mesage to the end user. - switch c.State.Waiting.Reason { - case "CrashLoopBackOff": - msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) - if lt := c.LastTerminationState.Terminated; lt != nil { - if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { - msg += " (exit code 137, possible liveness probe failure)" - } else { - msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) +// diagnoseContainers checks a list of container statuses for known error states and +// returns a podIssue for each one found. 
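+// CrashLoopBackOff issues also carry the last termination reason and exit code, giving
+// summarizeIssues enough granularity to group crashes by root cause (e.g. OOMKilled vs. exit code 1).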
+func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool) []podIssue { + var issues []podIssue + for _, c := range statuses { + if c.State.Waiting != nil { + switch c.State.Waiting.Reason { + case waitingCrashLoopBackOff: + msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + var termReason string + var exitCode int32 + if lt := c.LastTerminationState.Terminated; lt != nil { + termReason = lt.Reason + exitCode = lt.ExitCode + if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { + msg += " (exit code 137, possible liveness probe failure)" + } else { + msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + } } + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueCrashLoopBackOff, + message: msg, + isOldRevision: oldRevision, + terminationReason: termReason, + exitCode: exitCode, + }) + case waitingImagePullBackOff, waitingErrImagePull: + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueImagePull, + message: fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) + } + } + if c.State.Terminated != nil { + if c.State.Terminated.Reason == terminationReasonError { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueTerminated, + message: fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) } - return msg - case "ImagePullBackOff", "ErrImagePull": - return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name) } } - if c.State.Terminated != nil { - if c.State.Terminated.Reason == terminationReasonError { - return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name) + return issues +} + +// podMatchesRevision checks whether a pod belongs to the given revision. +// Works for DaemonSets and StatefulSets (controller-revision-hash label) +// and Deployments (pod-template-hash label). +func podMatchesRevision(p corev1.Pod, currentRevision string) bool { + if hash, ok := p.Labels[appsv1.ControllerRevisionHashLabelKey]; ok { + return hash == currentRevision + } + if hash, ok := p.Labels[appsv1.DefaultDeploymentUniqueLabelKey]; ok { + return hash == currentRevision + } + return true +} + +// currentDeploymentRevision returns the pod-template-hash of the active ReplicaSet +// for the given Deployment. Returns empty string if it cannot be determined. +// Only called when the Deployment is unhealthy. The List call goes through +// the controller-runtime informer cache, not directly to the API server. 
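+// The active ReplicaSet is identified by its controller owner reference back to the Deployment
+// and a non-zero observed replica count.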
+func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string { + rsList := &appsv1.ReplicaSetList{} + s, err := metav1.LabelSelectorAsMap(dep.Spec.Selector) + if err != nil { + log.WithValues("reason", err).Info("Failed to parse deployment selector for revision lookup") + return "" + } + err = m.client.List(context.TODO(), rsList, client.MatchingLabels(s), client.InNamespace(dep.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ReplicaSets for revision lookup") + return "" + } + + for _, rs := range rsList.Items { + if ref := metav1.GetControllerOf(&rs); ref == nil || ref.UID != dep.UID { + continue + } + if rs.Status.Replicas > 0 { + return rs.Labels[appsv1.DefaultDeploymentUniqueLabelKey] } } return "" } +// currentDaemonSetRevision returns the controller-revision-hash of the most recent +// ControllerRevision for the given DaemonSet. Returns empty string if it cannot be determined. +// Only called when the DaemonSet is unhealthy. The List call goes through +// the controller-runtime informer cache, not directly to the API server. +func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { + revList := &appsv1.ControllerRevisionList{} + err := m.client.List(context.TODO(), revList, client.InNamespace(ds.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ControllerRevisions for revision lookup") + return "" + } + + var maxRevision int64 + var currentHash string + for _, rev := range revList.Items { + if ref := metav1.GetControllerOf(&rev); ref == nil || ref.UID != ds.UID { + continue + } + if rev.Revision > maxRevision { + maxRevision = rev.Revision + currentHash = rev.Labels[appsv1.ControllerRevisionHashLabelKey] + } + } + return currentHash +} + func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { // Never set any conditions unless the status manager is enabled. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index d19dda0ee5..5aaa8d5f45 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -16,11 +16,14 @@ package status import ( "context" + "fmt" + "strings" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" certV1 "k8s.io/api/certificates/v1" certV1beta1 "k8s.io/api/certificates/v1beta1" corev1 "k8s.io/api/core/v1" @@ -29,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" controllerRuntimeClient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -55,6 +59,7 @@ var _ = Describe("Status reporting tests", func() { err := apis.AddToScheme(scheme, false) Expect(err).NotTo(HaveOccurred()) Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred()) Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) client = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() @@ -539,6 +544,152 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("when a monitored workload is not found", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should report degraded when a DaemonSet is not found", func() { + sm.AddDaemonsets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ds"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`DaemonSet "NS1/missing-ds" not found`))) + }) + + It("should report degraded when a Deployment is not found", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "missing-dep"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`Deployment "NS1/missing-dep" not found`))) + }) + + It("should report degraded when a StatefulSet is not found", func() { + sm.AddStatefulSets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ss"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`StatefulSet "NS1/missing-ss" not found`))) + }) + + It("should report degraded when a CronJob is not found", func() { + sm.AddCronJobs([]types.NamespacedName{{Namespace: "NS1", Name: "missing-cj"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`CronJob "NS1/missing-cj" not found`))) + }) + }) + + Context("during a rollout with mixed old and new revision pods", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should prioritize new-revision pod failures over old-revision failures", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + replicas := int32(2) + gen := int64(5) + + // Create the Deployment. + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1", + UID: "dp1-uid", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 2, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Current ReplicaSet. 
+ Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-new", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + + // Old ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-old", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + // New-revision pod: crash looping. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-new-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + // Old-revision pod: also crash looping but with different reason. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-old-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(HaveLen(2)) + // New-revision issue should appear first. + Expect(sm.failing[0]).NotTo(ContainSubstring("old revision")) + Expect(sm.failing[1]).To(ContainSubstring("old revision")) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. 
Expect(sm.IsAvailable()).To(BeFalse()) @@ -880,4 +1031,782 @@ var _ = Describe("Status reporting tests", func() { }, false, true), ) }) + + Describe("podIssue", func() { + It("should produce distinct keys for different crash reasons", func() { + oom := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: "OOMKilled", + exitCode: 137, + } + errExit1 := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: terminationReasonError, + exitCode: 1, + } + Expect(oom.key()).NotTo(Equal(errExit1.key())) + }) + + It("should produce the same key for same issue type with no termination context", func() { + a := podIssue{issueType: issueNotReady} + b := podIssue{issueType: issueNotReady} + Expect(a.key()).To(Equal(b.key())) + }) + + It("should produce distinct keys for different issue types", func() { + a := podIssue{issueType: issueCrashLoopBackOff} + b := podIssue{issueType: issueImagePull} + Expect(a.key()).NotTo(Equal(b.key())) + }) + }) + + Describe("summarizeIssues", func() { + It("should deduplicate issues with the same key", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod1 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod2 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod3 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("3 pods affected")) + Expect(progressing).To(BeEmpty()) + }) + + It("should cap at 3 unique reasons", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "msg1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueImagePull, message: "msg2"}, + {severity: severityFailing, issueType: issueTerminated, message: "msg3", terminationReason: terminationReasonError, exitCode: 1}, + {severity: severityFailing, issueType: issuePodFailed, message: "msg4"}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(3)) + }) + + It("should prioritize new-revision pods over old-revision pods", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "old pod crash", isOldRevision: true, terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "new pod crash", isOldRevision: false, terminationReason: terminationReasonError, exitCode: 1}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(2)) + Expect(failing[0]).To(ContainSubstring("new pod crash")) + Expect(failing[1]).To(ContainSubstring("old pod crash")) + Expect(failing[1]).To(ContainSubstring("old revision")) + }) + + It("should split failing and progressing", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready"}, + {severity: severityProgressing, issueType: issuePending, message: "Pod ns/p2 is pending: Unschedulable"}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(progressing).To(HaveLen(1)) + Expect(progressing[0]).To(ContainSubstring("pending")) + }) + + It("should annotate old 
revision issues", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready", isOldRevision: true}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("old revision")) + }) + + It("should return empty slices for no issues", func() { + failing, progressing := summarizeIssues(nil) + Expect(failing).To(BeEmpty()) + Expect(progressing).To(BeEmpty()) + }) + }) + + Describe("diagnosePods", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + selector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}} + podMeta := metav1.ObjectMeta{Namespace: "ns", Name: "pod1", Labels: map[string]string{"app": "test"}} + + It("should detect CrashLoopBackOff with OOMKilled context", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff)) + Expect(issues[0].severity).To(Equal(severityFailing)) + Expect(issues[0].terminationReason).To(Equal("OOMKilled")) + Expect(issues[0].exitCode).To(BeEquivalentTo(137)) + Expect(issues[0].message).To(ContainSubstring("OOMKilled")) + }) + + It("should detect CrashLoopBackOff with possible liveness probe failure", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure")) + }) + + It("should detect ImagePullBackOff", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueImagePull)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect terminated 
container with Error", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueTerminated)) + }) + + It("should detect pod in Failed phase", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePodFailed)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect running but not ready", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueNotReady)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect pending unschedulable pod with scheduler reason", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + Expect(issues[0].message).To(ContainSubstring("Insufficient memory")) + }) + + It("should detect pending pod without scheduler condition", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + }) + + It("should report multiple issues from different pods", func() { + pod1 := podMeta.DeepCopy() + pod1.Name = "pod-crash" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod1, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + pod2 := podMeta.DeepCopy() + pod2.Name = "pod-pending" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod2, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(2)) + }) + + It("should return no issues for healthy pods", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, 
+ Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionTrue}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(BeEmpty()) + }) + }) + + Describe("currentRevision helpers", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + Describe("currentDeploymentRevision", func() { + It("should return the pod-template-hash of the newest ReplicaSet", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + // Current ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-abc123", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + // Old ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-old999", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(Equal("abc123")) + }) + + It("should return empty string when no ReplicaSets exist", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(BeEmpty()) + }) + }) + + Describe("currentDaemonSetRevision", func() { + It("should return the hash of the highest-revision ControllerRevision", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev1", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-old"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: ptr.To(true)}, + }, + }, + Revision: 1, + })).NotTo(HaveOccurred()) + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev2", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-new"}, + OwnerReferences: []metav1.OwnerReference{ + 
{UID: "ds-uid", Controller: ptr.To(true)}, + }, + }, + Revision: 2, + })).NotTo(HaveOccurred()) + + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(Equal("hash-new")) + }) + + It("should return empty string when no ControllerRevisions exist", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(BeEmpty()) + }) + }) + }) +}) + +// getTigeraStatusCondition returns the condition of the given type from the TigeraStatus, or nil if not found. +func getTigeraStatusCondition(ts *operator.TigeraStatus, condType operator.StatusConditionType) *operator.TigeraStatusCondition { + for _, c := range ts.Status.Conditions { + if c.Type == condType { + return &c + } + } + return nil +} + +var _ = Describe("Status manager integration tests", Ordered, func() { + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeAll(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + }) + + Describe("degraded message content", func() { + It("should include CrashLoopBackOff with termination context in the TigeraStatus", func() { + sm := New(cl, "crash-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep1"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "crash"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "crash-pod", Labels: map[string]string{"app": "crash"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "crash-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring("crash looping container")) + Expect(degraded.Message).To(ContainSubstring("OOMKilled, exit code 137")) + }) + + It("should include 'running but not ready' in the TigeraStatus", func() { + sm := New(cl, "notready-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep2"}}) + + 
Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep2", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "notready"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "notready-pod", Labels: map[string]string{"app": "notready"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "notready-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring("running but not ready")) + }) + + It("should include pending pod scheduler reason in the TigeraStatus progressing message", func() { + sm := New(cl, "pending-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep3"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep3", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "pending"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "pending-pod", Labels: map[string]string{"app": "pending"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "pending-test"}, ts)).NotTo(HaveOccurred()) + + progressing := getTigeraStatusCondition(ts, operator.ComponentProgressing) + Expect(progressing).NotTo(BeNil()) + Expect(progressing.Status).To(Equal(operator.ConditionTrue)) + Expect(progressing.Message).To(ContainSubstring("Insufficient memory")) + }) + + It("should report not-found workloads as degraded in TigeraStatus", func() { + sm := New(cl, "notfound-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "missing-dep"}}) + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "notfound-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring(`Deployment "ns/missing-dep" not found`)) + }) 
+ }) + + Describe("deduplication and capping in TigeraStatus", func() { + It("should deduplicate identical pod failures and show count", func() { + sm := New(cl, "dedup-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(3) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep-dedup"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep-dedup", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dedup"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 3, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Create 3 pods all OOMKilled - should be deduplicated into one message. + for i := 0; i < 3; i++ { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: fmt.Sprintf("dedup-pod-%d", i), + Labels: map[string]string{"app": "dedup"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + } + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "dedup-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Message).To(ContainSubstring("3 pods affected")) + }) + }) + + Describe("rollout revision awareness in TigeraStatus", func() { + It("should annotate old-revision failures and prioritize new-revision failures", func() { + sm := New(cl, "rollout-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(2) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep-rollout"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep-rollout", UID: "rollout-uid", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 2, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Current ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep-rollout-new", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "rollout-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}}, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + + // Old ReplicaSet. 
+ Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep-rollout-old", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "rollout-uid", Controller: ptr.To(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}}, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + // New-revision pod crash looping. + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "rollout-new-pod", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: 1}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + // Old-revision pod crash looping with different reason. + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "rollout-old-pod", + Labels: map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "main", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "rollout-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + + // The message should contain both failures, with the new-revision first. + // Messages are newline-joined by degradedMessage(). + lines := strings.Split(degraded.Message, "\n") + Expect(lines).To(HaveLen(2)) + Expect(lines[0]).NotTo(ContainSubstring("old revision")) + Expect(lines[1]).To(ContainSubstring("old revision")) + }) + }) })