From 4e4f0e5cb61dbe74f0dd95ef83116d961d233534 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:17:40 -0700 Subject: [PATCH 01/11] Cherry-pick status manager readiness and crash context improvements --- pkg/controller/status/status.go | 23 ++++- pkg/controller/status/status_test.go | 149 +++++++++++++++++++++++++++ 2 files changed, 170 insertions(+), 2 deletions(-) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 1a9e00650d..481d1789d1 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -42,6 +42,17 @@ import ( var log = logf.Log.WithName("status_manager") +const ( + // terminationReasonError is the reason string the container runtime sets on + // ContainerStateTerminated when the container exits with a non-zero exit code. + terminationReasonError = "Error" + + // exitCodeSIGKILL is the exit code for a container killed by SIGKILL (128 + 9). + // The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM + // killer, manual kill) can also produce this code. + exitCodeSIGKILL = 137 +) + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: @@ -700,13 +711,21 @@ func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerSt // Check well-known error states here and report an appropriate mesage to the end user. switch c.State.Waiting.Reason { case "CrashLoopBackOff": - return fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + if lt := c.LastTerminationState.Terminated; lt != nil { + if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { + msg += " (exit code 137, possible liveness probe failure)" + } else { + msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + } + } + return msg case "ImagePullBackOff", "ErrImagePull": return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name) } } if c.State.Terminated != nil { - if c.State.Terminated.Reason == "Error" { + if c.State.Terminated.Reason == terminationReasonError { return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name) } } diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index 5484e32e0c..d19dda0ee5 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -390,6 +390,155 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("when pod is crash looping", func() { + var gen int64 + + createCrashLoopPodAndDeployment := func(lastTermination *corev1.ContainerStateTerminated) { + sm.ReadyToMonitor() + podStatus := corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container", + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "CrashLoopBackOff", + }, + }, + LastTerminationState: corev1.ContainerState{ + Terminated: lastTermination, + }, + }, + }, + } + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1pod-crash", + Labels: map[string]string{"dp1Key": "dp1Value"}, + }, + Spec: corev1.PodSpec{}, + Status: podStatus, + })).NotTo(HaveOccurred()) + + gen = 5 + replicas := 
int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"dp1Key": "dp1Value"}, + }, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + } + + It("should include OOMKilled reason in degraded message", func() { + createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{ + Reason: "OOMKilled", + ExitCode: 137, + }) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("(OOMKilled, exit code 137)"))) + }) + + It("should suggest liveness probe failure for exit code 137 with Error reason", func() { + createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{ + Reason: "Error", + ExitCode: 137, + }) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("possible liveness probe failure"))) + }) + + It("should include exit code for other termination reasons", func() { + createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{ + Reason: "Error", + ExitCode: 1, + }) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("(Error, exit code 1)"))) + }) + + It("should report crash loop without detail when no last termination state", func() { + createCrashLoopPodAndDeployment(nil) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(And( + ContainSubstring("crash looping container"), + Not(ContainSubstring("exit code")), + ))) + }) + }) + + Context("when pod is running but not ready", func() { + var gen int64 + BeforeEach(func() { + sm.ReadyToMonitor() + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1pod-notready", + Labels: map[string]string{ + "dp1Key": "dp1Value", + }, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + }, + }, + }, + })).NotTo(HaveOccurred()) + gen = 5 + }) + It("should degrade when the deployment does not have the correct pod counts", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + replicas := int32(1) + + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", Name: "DP1", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"dp1Key": "dp1Value"}, + }, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("running but not ready"))) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. 
Expect(sm.IsAvailable()).To(BeFalse()) From c5b360ad2d3776c74ee476a79816559be7bb8206 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:20:50 -0700 Subject: [PATCH 02/11] Add podIssue types for structured pod diagnostics --- pkg/controller/status/status.go | 38 ++++++++++++++++++++++++++++ pkg/controller/status/status_test.go | 28 ++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 481d1789d1..3cc62d3ab3 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -53,6 +53,44 @@ const ( exitCodeSIGKILL = 137 ) +type podIssueSeverity int + +const ( + severityProgressing podIssueSeverity = iota + severityFailing +) + +// podIssueType defines the category of a pod issue. The enum order determines display +// priority when issues compete for the 3-slot cap in summarizeIssues. +type podIssueType int + +const ( + issueCrashLoopBackOff podIssueType = iota + issueImagePull + issueTerminated + issuePodFailed + issueNotReady + issuePending +) + +// podIssue represents a single diagnosed problem with a pod. +type podIssue struct { + severity podIssueSeverity + issueType podIssueType + message string + + // isOldRevision is true if the pod belongs to an old revision during a rollout. + isOldRevision bool + + // terminationReason and exitCode provide dedup granularity within an issue type. + terminationReason string + exitCode int32 +} + +func (p podIssue) key() string { + return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) +} + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index d19dda0ee5..dec7cadd95 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -880,4 +880,32 @@ var _ = Describe("Status reporting tests", func() { }, false, true), ) }) + + Describe("podIssue", func() { + It("should produce distinct keys for different crash reasons", func() { + oom := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: "OOMKilled", + exitCode: 137, + } + errExit1 := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: terminationReasonError, + exitCode: 1, + } + Expect(oom.key()).NotTo(Equal(errExit1.key())) + }) + + It("should produce the same key for same issue type with no termination context", func() { + a := podIssue{issueType: issueNotReady} + b := podIssue{issueType: issueNotReady} + Expect(a.key()).To(Equal(b.key())) + }) + + It("should produce distinct keys for different issue types", func() { + a := podIssue{issueType: issueCrashLoopBackOff} + b := podIssue{issueType: issueImagePull} + Expect(a.key()).NotTo(Equal(b.key())) + }) + }) }) From 900420ea7414af41fb6411b604b7e25000d020e4 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:22:44 -0700 Subject: [PATCH 03/11] Add summarizeIssues for dedup, prioritization, and capping --- pkg/controller/status/status.go | 57 +++++++++++++++++++++++++ pkg/controller/status/status_test.go | 63 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 3cc62d3ab3..0fa65c41de 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go 
@@ -91,6 +91,63 @@ func (p podIssue) key() string { return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) } +const maxIssuesPerWorkload = 3 + +// summarizeIssues deduplicates, prioritizes, and caps a list of pod issues into +// human-readable failing and progressing message slices. +func summarizeIssues(issues []podIssue) (failing []string, progressing []string) { + if len(issues) == 0 { + return nil, nil + } + + // Sort: new-revision first, then by issue type priority (enum order). + sort.SliceStable(issues, func(i, j int) bool { + if issues[i].isOldRevision != issues[j].isOldRevision { + return !issues[i].isOldRevision + } + return issues[i].issueType < issues[j].issueType + }) + + // Group by key. First occurrence provides the message; count tracks duplicates. + type issueGroup struct { + issue podIssue + count int + } + seen := map[string]int{} + var groups []issueGroup + for _, iss := range issues { + k := iss.key() + if idx, ok := seen[k]; ok { + groups[idx].count++ + continue + } + seen[k] = len(groups) + groups = append(groups, issueGroup{issue: iss, count: 1}) + } + + // Cap at maxIssuesPerWorkload unique reasons. + if len(groups) > maxIssuesPerWorkload { + groups = groups[:maxIssuesPerWorkload] + } + + // Format messages. + for _, g := range groups { + msg := g.issue.message + if g.count > 1 { + msg += fmt.Sprintf(" (%d pods affected)", g.count) + } + if g.issue.isOldRevision { + msg += " (old revision)" + } + if g.issue.severity == severityFailing { + failing = append(failing, msg) + } else { + progressing = append(progressing, msg) + } + } + return failing, progressing +} + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index dec7cadd95..0c05ef4013 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -908,4 +908,67 @@ var _ = Describe("Status reporting tests", func() { Expect(a.key()).NotTo(Equal(b.key())) }) }) + + Describe("summarizeIssues", func() { + It("should deduplicate issues with the same key", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod1 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod2 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod3 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("3 pods affected")) + Expect(progressing).To(BeEmpty()) + }) + + It("should cap at 3 unique reasons", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "msg1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueImagePull, message: "msg2"}, + {severity: severityFailing, issueType: issueTerminated, message: "msg3", terminationReason: terminationReasonError, exitCode: 1}, + {severity: severityFailing, issueType: issuePodFailed, message: "msg4"}, + } + failing, _ := summarizeIssues(issues) + 
Expect(failing).To(HaveLen(3)) + }) + + It("should prioritize new-revision pods over old-revision pods", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "old pod crash", isOldRevision: true, terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "new pod crash", isOldRevision: false, terminationReason: terminationReasonError, exitCode: 1}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(2)) + Expect(failing[0]).To(ContainSubstring("new pod crash")) + Expect(failing[1]).To(ContainSubstring("old pod crash")) + Expect(failing[1]).To(ContainSubstring("old revision")) + }) + + It("should split failing and progressing", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready"}, + {severity: severityProgressing, issueType: issuePending, message: "Pod ns/p2 is pending: Unschedulable"}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(progressing).To(HaveLen(1)) + Expect(progressing[0]).To(ContainSubstring("pending")) + }) + + It("should annotate old revision issues", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready", isOldRevision: true}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("old revision")) + }) + + It("should return empty slices for no issues", func() { + failing, progressing := summarizeIssues(nil) + Expect(failing).To(BeEmpty()) + Expect(progressing).To(BeEmpty()) + }) + }) }) From 7ed265687558c165d8e9dc831b214955bb9ca3f6 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:25:41 -0700 Subject: [PATCH 04/11] Add diagnosePods to replace podsFailing with structured diagnosis --- pkg/controller/status/status.go | 141 ++++++++++++++++++ pkg/controller/status/status_test.go | 213 +++++++++++++++++++++++++++ 2 files changed, 354 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 0fa65c41de..04b95b1e6a 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -827,6 +827,147 @@ func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerSt return "" } +// diagnosePods lists pods matching the selector and returns a podIssue for each +// unhealthy pod found. If currentRevision is non-empty, pods not matching that +// revision are marked as old-revision. +func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string) []podIssue { + l := corev1.PodList{} + s, err := metav1.LabelSelectorAsMap(selector) + if err != nil { + panic(err) + } + err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list pods for diagnosis") + return nil + } + + var issues []podIssue + for _, p := range l.Items { + oldRevision := currentRevision != "" && !podMatchesRevision(p, currentRevision) + + if p.Status.Phase == corev1.PodFailed { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issuePodFailed, + message: fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + continue + } + + // Check init and regular container statuses for errors. 
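+		// Init container issues take precedence: when any are found, the pod's regular containers are not checked.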
+ if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision); len(iss) > 0 { + issues = append(issues, iss...) + continue + } + if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision); len(iss) > 0 { + issues = append(issues, iss...) + continue + } + + // Running but not passing readiness checks. + if p.Status.Phase == corev1.PodRunning { + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueNotReady, + message: fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + break + } + } + continue + } + + // Pending pod - check for scheduling failures. + if p.Status.Phase == corev1.PodPending { + msg := fmt.Sprintf("Pod %s/%s is pending", p.Namespace, p.Name) + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { + if cond.Message != "" { + msg += fmt.Sprintf(": %s", cond.Message) + } + break + } + } + issues = append(issues, podIssue{ + severity: severityProgressing, + issueType: issuePending, + message: msg, + isOldRevision: oldRevision, + }) + } + } + return issues +} + +// diagnoseContainers checks a list of container statuses for known error states and +// returns a podIssue for each one found. +func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool) []podIssue { + var issues []podIssue + for _, c := range statuses { + if c.State.Waiting != nil { + switch c.State.Waiting.Reason { + case "CrashLoopBackOff": + msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + var termReason string + var exitCode int32 + if lt := c.LastTerminationState.Terminated; lt != nil { + termReason = lt.Reason + exitCode = lt.ExitCode + if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { + msg += " (exit code 137, possible liveness probe failure)" + } else { + msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + } + } + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueCrashLoopBackOff, + message: msg, + isOldRevision: oldRevision, + terminationReason: termReason, + exitCode: exitCode, + }) + case "ImagePullBackOff", "ErrImagePull": + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueImagePull, + message: fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) + } + } + if c.State.Terminated != nil { + if c.State.Terminated.Reason == terminationReasonError { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueTerminated, + message: fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) + } + } + } + return issues +} + +// podMatchesRevision checks whether a pod belongs to the given revision. +// Works for DaemonSets and StatefulSets (controller-revision-hash label) +// and Deployments (pod-template-hash label). 
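+// Pods carrying neither label are treated as belonging to the current revision.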
+func podMatchesRevision(p corev1.Pod, currentRevision string) bool { + if hash, ok := p.Labels[appsv1.ControllerRevisionHashLabelKey]; ok { + return hash == currentRevision + } + if hash, ok := p.Labels[appsv1.DefaultDeploymentUniqueLabelKey]; ok { + return hash == currentRevision + } + return true +} + func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { // Never set any conditions unless the status manager is enabled. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index 0c05ef4013..c125b16574 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -971,4 +971,217 @@ var _ = Describe("Status reporting tests", func() { Expect(progressing).To(BeEmpty()) }) }) + + Describe("diagnosePods", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + selector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}} + podMeta := metav1.ObjectMeta{Namespace: "ns", Name: "pod1", Labels: map[string]string{"app": "test"}} + + It("should detect CrashLoopBackOff with OOMKilled context", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff)) + Expect(issues[0].severity).To(Equal(severityFailing)) + Expect(issues[0].terminationReason).To(Equal("OOMKilled")) + Expect(issues[0].exitCode).To(BeEquivalentTo(137)) + Expect(issues[0].message).To(ContainSubstring("OOMKilled")) + }) + + It("should detect CrashLoopBackOff with possible liveness probe failure", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure")) + }) + + It("should detect ImagePullBackOff", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: 
&corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueImagePull)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect terminated container with Error", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueTerminated)) + }) + + It("should detect pod in Failed phase", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePodFailed)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect running but not ready", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueNotReady)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect pending unschedulable pod with scheduler reason", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + Expect(issues[0].message).To(ContainSubstring("Insufficient memory")) + }) + + It("should detect pending pod without scheduler condition", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + }) + + It("should report multiple issues from different pods", func() { + pod1 := podMeta.DeepCopy() + pod1.Name = "pod-crash" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod1, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + pod2 := podMeta.DeepCopy() + pod2.Name = "pod-pending" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod2, + Spec: 
corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(2)) + }) + + It("should return no issues for healthy pods", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionTrue}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(BeEmpty()) + }) + }) }) From 2b122623956e4323684f32c34755293b990b985c Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:28:05 -0700 Subject: [PATCH 05/11] Add revision detection helpers for Deployments and DaemonSets --- pkg/controller/status/status.go | 60 ++++++++++++++ pkg/controller/status/status_test.go | 114 +++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 04b95b1e6a..764458e6fc 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -968,6 +968,66 @@ func podMatchesRevision(p corev1.Pod, currentRevision string) bool { return true } +// currentDeploymentRevision returns the pod-template-hash of the active ReplicaSet +// for the given Deployment. Returns empty string if it cannot be determined. +func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string { + rsList := &appsv1.ReplicaSetList{} + s, err := metav1.LabelSelectorAsMap(dep.Spec.Selector) + if err != nil { + log.WithValues("reason", err).Info("Failed to parse deployment selector for revision lookup") + return "" + } + err = m.client.List(context.TODO(), rsList, client.MatchingLabels(s), client.InNamespace(dep.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ReplicaSets for revision lookup") + return "" + } + + // Find the ReplicaSet that is owned by this Deployment and has active replicas. + for _, rs := range rsList.Items { + if !isOwnedBy(rs.OwnerReferences, dep.UID) { + continue + } + if rs.Status.Replicas > 0 { + return rs.Labels[appsv1.DefaultDeploymentUniqueLabelKey] + } + } + return "" +} + +// currentDaemonSetRevision returns the controller-revision-hash of the most recent +// ControllerRevision for the given DaemonSet. Returns empty string if it cannot be determined. +func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { + revList := &appsv1.ControllerRevisionList{} + err := m.client.List(context.TODO(), revList, client.InNamespace(ds.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ControllerRevisions for revision lookup") + return "" + } + + var maxRevision int64 + var currentHash string + for _, rev := range revList.Items { + if !isOwnedBy(rev.OwnerReferences, ds.UID) { + continue + } + if rev.Revision > maxRevision { + maxRevision = rev.Revision + currentHash = rev.Labels[appsv1.ControllerRevisionHashLabelKey] + } + } + return currentHash +} + +func isOwnedBy(refs []metav1.OwnerReference, uid types.UID) bool { + for _, ref := range refs { + if ref.UID == uid && ref.Controller != nil && *ref.Controller { + return true + } + } + return false +} + func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { // Never set any conditions unless the status manager is enabled. 
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index c125b16574..ada11b430c 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -71,6 +71,8 @@ var _ = Describe("Status reporting tests", func() { Expect(oldVersionSm.IsAvailable()).To(BeFalse()) }) + boolPtr := func(b bool) *bool { return &b } + Context("without CR found", func() { It("status is not created", func() { sm.updateStatus() @@ -1184,4 +1186,116 @@ var _ = Describe("Status reporting tests", func() { Expect(issues).To(BeEmpty()) }) }) + + Describe("currentRevision helpers", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + Describe("currentDeploymentRevision", func() { + It("should return the pod-template-hash of the newest ReplicaSet", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + // Current ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-abc123", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + // Old ReplicaSet. 
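+				// Zero replicas, so the revision lookup should skip it.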
+ Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-old999", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(Equal("abc123")) + }) + + It("should return empty string when no ReplicaSets exist", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(BeEmpty()) + }) + }) + + Describe("currentDaemonSetRevision", func() { + It("should return the hash of the highest-revision ControllerRevision", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev1", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-old"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: boolPtr(true)}, + }, + }, + Revision: 1, + })).NotTo(HaveOccurred()) + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev2", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-new"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: boolPtr(true)}, + }, + }, + Revision: 2, + })).NotTo(HaveOccurred()) + + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(Equal("hash-new")) + }) + + It("should return empty string when no ControllerRevisions exist", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(BeEmpty()) + }) + }) + }) }) From ad24032d3357b6a2bc5e2626083c8e1a3805db8c Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:31:22 -0700 Subject: [PATCH 06/11] Update syncState with not-found handling, diagnosePods, and summarizeIssues Wire diagnosePods and summarizeIssues into syncState, replacing the old podsFailing/containerErrorMessage functions. Each workload type now reports not-found as a degraded condition instead of silently continuing. DaemonSets and Deployments pass revision info so diagnosePods can distinguish old-revision pods from current ones. 
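
Each workload loop now follows the same shape (DaemonSet case shown;
Deployments resolve the current revision from the active ReplicaSet,
and StatefulSets read it directly from Status.UpdateRevision):

    revision := m.currentDaemonSetRevision(ds)
    issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision)
    f, p := summarizeIssues(issues)
    failing = append(failing, f...)
    progressing = append(progressing, p...)

As a result, e.g. ten replicas all OOM-killed now collapse into a single
degraded message of the form (names illustrative):

    Pod ns/pod-1 has crash looping container: c1 (OOMKilled, exit code 137) (10 pods affected)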
--- pkg/controller/status/status.go | 121 ++++++++------------------- pkg/controller/status/status_test.go | 36 ++++++++ 2 files changed, 70 insertions(+), 87 deletions(-) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 764458e6fc..ce0af479f4 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -593,7 +593,11 @@ func (m *statusManager) syncState() { ds := &appsv1.DaemonSet{} err := m.client.Get(context.TODO(), dsnn, ds) if err != nil { - log.WithValues("reason", err).Info("Failed to query daemonset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("DaemonSet %q not found", dsnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query daemonset") + } continue } if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled { @@ -616,22 +620,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the daemonset are failing. - if f, err := m.podsFailing(ds.Spec.Selector, ds.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "daemonset", dsnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDaemonSetRevision(ds) + issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.deployments { dep := &appsv1.Deployment{} err := m.client.Get(context.TODO(), depnn, dep) if err != nil { - log.WithValues("reason", err).Info("Failed to query deployment") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("Deployment %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query deployment") + } continue } if dep.Status.UnavailableReplicas > 0 { @@ -656,22 +660,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(dep.Spec.Selector, dep.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "deployment", depnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDeploymentRevision(dep) + issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.statefulsets { ss := &appsv1.StatefulSet{} err := m.client.Get(context.TODO(), depnn, ss) if err != nil { - log.WithValues("reason", err).Info("Failed to query statefulset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("StatefulSet %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query statefulset") + } continue } if *ss.Spec.Replicas != ss.Status.CurrentReplicas { @@ -694,21 +698,20 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(ss.Spec.Selector, ss.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "statefuleset", depnn).Info("Failed to check for failing pods") - continue - } + issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) 
} for _, depnn := range m.cronjobs { cj := &batchv1.CronJob{} if err := m.client.Get(context.TODO(), depnn, cj); err != nil { - log.WithValues("reason", err).Info("Failed to query cronjobs") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("CronJob %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query cronjobs") + } continue } @@ -771,62 +774,6 @@ func (m *statusManager) removeTigeraStatus() { } } -// podsFailing takes a selector and returns if any of the pods that match it are failing. Failing pods are defined -// to be in CrashLoopBackOff state. -func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace string) (string, error) { - l := corev1.PodList{} - s, err := metav1.LabelSelectorAsMap(selector) - if err != nil { - panic(err) - } - err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace)) - if err != nil { - return "", err - } - for _, p := range l.Items { - if p.Status.Phase == corev1.PodFailed { - return fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), nil - } - for _, c := range p.Status.InitContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } - } - for _, c := range p.Status.ContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } - } - } - return "", nil -} - -func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerStatus) string { - if c.State.Waiting != nil { - // Check well-known error states here and report an appropriate mesage to the end user. - switch c.State.Waiting.Reason { - case "CrashLoopBackOff": - msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) - if lt := c.LastTerminationState.Terminated; lt != nil { - if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { - msg += " (exit code 137, possible liveness probe failure)" - } else { - msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) - } - } - return msg - case "ImagePullBackOff", "ErrImagePull": - return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name) - } - } - if c.State.Terminated != nil { - if c.State.Terminated.Reason == terminationReasonError { - return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name) - } - } - return "" -} - // diagnosePods lists pods matching the selector and returns a podIssue for each // unhealthy pod found. If currentRevision is non-empty, pods not matching that // revision are marked as old-revision. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index ada11b430c..b77d8ac2c6 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -21,6 +21,7 @@ import ( . 
"github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" certV1 "k8s.io/api/certificates/v1" certV1beta1 "k8s.io/api/certificates/v1beta1" corev1 "k8s.io/api/core/v1" @@ -55,6 +56,7 @@ var _ = Describe("Status reporting tests", func() { err := apis.AddToScheme(scheme, false) Expect(err).NotTo(HaveOccurred()) Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred()) Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) client = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() @@ -541,6 +543,40 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("when a monitored workload is not found", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should report degraded when a DaemonSet is not found", func() { + sm.AddDaemonsets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ds"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`DaemonSet "NS1/missing-ds" not found`))) + }) + + It("should report degraded when a Deployment is not found", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "missing-dep"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`Deployment "NS1/missing-dep" not found`))) + }) + + It("should report degraded when a StatefulSet is not found", func() { + sm.AddStatefulSets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ss"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`StatefulSet "NS1/missing-ss" not found`))) + }) + + It("should report degraded when a CronJob is not found", func() { + sm.AddCronJobs([]types.NamespacedName{{Namespace: "NS1", Name: "missing-cj"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`CronJob "NS1/missing-cj" not found`))) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. Expect(sm.IsAvailable()).To(BeFalse()) From 528915da7cb2fe4aadb96b2b4c40c779268db89e Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:33:11 -0700 Subject: [PATCH 07/11] Add integration test for rollout-aware status reporting --- pkg/controller/status/status_test.go | 112 +++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index b77d8ac2c6..55da840170 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -577,6 +577,118 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("during a rollout with mixed old and new revision pods", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should prioritize new-revision pod failures over old-revision failures", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + replicas := int32(2) + gen := int64(5) + + // Create the Deployment. 
+ Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1", + UID: "dp1-uid", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 2, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Current ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-new", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + + // Old ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-old", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + // New-revision pod: crash looping. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-new-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + // Old-revision pod: also crash looping but with different reason. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-old-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(HaveLen(2)) + // New-revision issue should appear first. + Expect(sm.failing[0]).NotTo(ContainSubstring("old revision")) + Expect(sm.failing[1]).To(ContainSubstring("old revision")) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. 
Expect(sm.IsAvailable()).To(BeFalse()) From 90f17e2b9882357ea7aabfc073eaa9452568877b Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:34:10 -0700 Subject: [PATCH 08/11] Fix formatting --- pkg/controller/status/status_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index 55da840170..d832c4f026 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -670,7 +670,7 @@ var _ = Describe("Status reporting tests", func() { Phase: corev1.PodRunning, ContainerStatuses: []corev1.ContainerStatus{ { - Name: "c1", + Name: "c1", State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, LastTerminationState: corev1.ContainerState{ Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, From ba775a1a106055dc2e4dac7b980ebf0682b8d03b Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 15:51:08 -0700 Subject: [PATCH 09/11] Address review comments: add doc comments, define waiting reason constants, use metav1.GetControllerOf --- pkg/controller/status/status.go | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index ce0af479f4..8e971f6834 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -51,6 +51,12 @@ const ( // The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM // killer, manual kill) can also produce this code. exitCodeSIGKILL = 137 + + // Container waiting reason strings. These are set by the container runtime and + // kubelet; Kubernetes does not define constants for them. + waitingCrashLoopBackOff = "CrashLoopBackOff" + waitingImagePullBackOff = "ImagePullBackOff" + waitingErrImagePull = "ErrImagePull" ) type podIssueSeverity int @@ -87,10 +93,15 @@ type podIssue struct { exitCode int32 } +// key returns a deduplication key for this issue. Issues with the same key are grouped +// together in summarizeIssues, so that e.g. 10 pods all OOMKilled produce one message +// with "(10 pods affected)" rather than 10 separate messages. func (p podIssue) key() string { return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) } +// maxIssuesPerWorkload is the maximum number of distinct issue types reported per workload. +// Beyond this cap, additional issues are dropped to keep messages concise. 
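+// Distinctness is determined by podIssue.key(), so one issue type with different
+// termination reasons can occupy multiple slots.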
const maxIssuesPerWorkload = 3 // summarizeIssues deduplicates, prioritizes, and caps a list of pod issues into @@ -858,7 +869,7 @@ func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevi for _, c := range statuses { if c.State.Waiting != nil { switch c.State.Waiting.Reason { - case "CrashLoopBackOff": + case waitingCrashLoopBackOff: msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) var termReason string var exitCode int32 @@ -879,7 +890,7 @@ func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevi terminationReason: termReason, exitCode: exitCode, }) - case "ImagePullBackOff", "ErrImagePull": + case waitingImagePullBackOff, waitingErrImagePull: issues = append(issues, podIssue{ severity: severityFailing, issueType: issueImagePull, @@ -917,6 +928,8 @@ func podMatchesRevision(p corev1.Pod, currentRevision string) bool { // currentDeploymentRevision returns the pod-template-hash of the active ReplicaSet // for the given Deployment. Returns empty string if it cannot be determined. +// Only called when the Deployment is unhealthy. The List call goes through +// the controller-runtime informer cache, not directly to the API server. func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string { rsList := &appsv1.ReplicaSetList{} s, err := metav1.LabelSelectorAsMap(dep.Spec.Selector) @@ -930,9 +943,8 @@ func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string return "" } - // Find the ReplicaSet that is owned by this Deployment and has active replicas. for _, rs := range rsList.Items { - if !isOwnedBy(rs.OwnerReferences, dep.UID) { + if ref := metav1.GetControllerOf(&rs); ref == nil || ref.UID != dep.UID { continue } if rs.Status.Replicas > 0 { @@ -944,6 +956,8 @@ func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string // currentDaemonSetRevision returns the controller-revision-hash of the most recent // ControllerRevision for the given DaemonSet. Returns empty string if it cannot be determined. +// Only called when the DaemonSet is unhealthy. The List call goes through +// the controller-runtime informer cache, not directly to the API server. 
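+// ControllerRevisions are listed namespace-wide, so ownership is checked per item.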
func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { revList := &appsv1.ControllerRevisionList{} err := m.client.List(context.TODO(), revList, client.InNamespace(ds.Namespace)) @@ -955,7 +969,7 @@ func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { var maxRevision int64 var currentHash string for _, rev := range revList.Items { - if !isOwnedBy(rev.OwnerReferences, ds.UID) { + if ref := metav1.GetControllerOf(&rev); ref == nil || ref.UID != ds.UID { continue } if rev.Revision > maxRevision { @@ -966,14 +980,6 @@ func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { return currentHash } -func isOwnedBy(refs []metav1.OwnerReference, uid types.UID) bool { - for _, ref := range refs { - if ref.UID == uid && ref.Controller != nil && *ref.Controller { - return true - } - } - return false -} func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { From be87fcdb4a713ffa629d0afe0541e03f481d04a1 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 16:14:07 -0700 Subject: [PATCH 10/11] Add integration tests for TigeraStatus message content --- pkg/controller/status/status_test.go | 379 ++++++++++++++++++++++++++- 1 file changed, 372 insertions(+), 7 deletions(-) diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index d832c4f026..f37534b38c 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -16,6 +16,8 @@ package status import ( "context" + "fmt" + "strings" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -30,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" controllerRuntimeClient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -73,7 +76,7 @@ var _ = Describe("Status reporting tests", func() { Expect(oldVersionSm.IsAvailable()).To(BeFalse()) }) - boolPtr := func(b bool) *bool { return &b } + Context("without CR found", func() { It("status is not created", func() { @@ -614,7 +617,7 @@ var _ = Describe("Status reporting tests", func() { Name: "DP1-new", Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, OwnerReferences: []metav1.OwnerReference{ - {UID: "dp1-uid", Controller: boolPtr(true)}, + {UID: "dp1-uid", Controller: ptr.To(true)}, }, }, Spec: appsv1.ReplicaSetSpec{ @@ -630,7 +633,7 @@ var _ = Describe("Status reporting tests", func() { Name: "DP1-old", Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, OwnerReferences: []metav1.OwnerReference{ - {UID: "dp1-uid", Controller: boolPtr(true)}, + {UID: "dp1-uid", Controller: ptr.To(true)}, }, }, Spec: appsv1.ReplicaSetSpec{ @@ -1365,7 +1368,7 @@ var _ = Describe("Status reporting tests", func() { Name: "dep1-abc123", Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"}, OwnerReferences: []metav1.OwnerReference{ - {UID: "dep-uid", Controller: boolPtr(true)}, + {UID: "dep-uid", Controller: ptr.To(true)}, }, }, Spec: appsv1.ReplicaSetSpec{ @@ -1380,7 +1383,7 @@ var _ = Describe("Status reporting tests", func() { Name: "dep1-old999", Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"}, OwnerReferences: []metav1.OwnerReference{ - {UID: "dep-uid", Controller: boolPtr(true)}, + {UID: "dep-uid", Controller: ptr.To(true)}, }, }, Spec: 
@@ -1380,7 +1383,7 @@ var _ = Describe("Status reporting tests", func() {
 					Name:   "dep1-old999",
 					Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"},
 					OwnerReferences: []metav1.OwnerReference{
-						{UID: "dep-uid", Controller: boolPtr(true)},
+						{UID: "dep-uid", Controller: ptr.To(true)},
 					},
 				},
 				Spec: appsv1.ReplicaSetSpec{
@@ -1416,7 +1419,7 @@ var _ = Describe("Status reporting tests", func() {
 					Name:   "ds1-rev1",
 					Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-old"},
 					OwnerReferences: []metav1.OwnerReference{
-						{UID: "ds-uid", Controller: boolPtr(true)},
+						{UID: "ds-uid", Controller: ptr.To(true)},
 					},
 				},
 				Revision: 1,
@@ -1427,7 +1430,7 @@ var _ = Describe("Status reporting tests", func() {
 					Name:   "ds1-rev2",
 					Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-new"},
 					OwnerReferences: []metav1.OwnerReference{
-						{UID: "ds-uid", Controller: boolPtr(true)},
+						{UID: "ds-uid", Controller: ptr.To(true)},
 					},
 				},
 				Revision: 2,
@@ -1447,3 +1450,365 @@
 		})
 	})
 })
+
+// getTigeraStatusCondition returns the condition of the given type from the TigeraStatus, or nil if not found.
+func getTigeraStatusCondition(ts *operator.TigeraStatus, condType operator.StatusConditionType) *operator.TigeraStatusCondition {
+	for _, c := range ts.Status.Conditions {
+		if c.Type == condType {
+			return &c
+		}
+	}
+	return nil
+}
+
+var _ = Describe("Status manager integration tests", Ordered, func() {
+	var cl controllerRuntimeClient.Client
+	var ctx = context.Background()
+
+	BeforeAll(func() {
+		scheme := runtime.NewScheme()
+		Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred())
+		Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred())
+		Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred())
+		err := apis.AddToScheme(scheme, false)
+		Expect(err).NotTo(HaveOccurred())
+		cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build()
+	})
+
+	Describe("degraded message content", func() {
+		It("should include CrashLoopBackOff with termination context in the TigeraStatus", func() {
+			sm := New(cl, "crash-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager)
+			sm.OnCRFound()
+			sm.ReadyToMonitor()
+
+			gen := int64(1)
+			replicas := int32(1)
+			sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep1"}})
+
+			Expect(cl.Create(ctx, &appsv1.Deployment{
+				ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", Generation: gen},
+				Spec: appsv1.DeploymentSpec{
+					Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "crash"}},
+					Replicas: &replicas,
+				},
+				Status: appsv1.DeploymentStatus{
+					ObservedGeneration:  gen,
+					UnavailableReplicas: 1,
+					AvailableReplicas:   0,
+					ReadyReplicas:       0,
+				},
+			})).NotTo(HaveOccurred())
+
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "crash-pod", Labels: map[string]string{"app": "crash"}},
+				Spec:       corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "main",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+
+			sm.updateStatus()
+
+			ts := &operator.TigeraStatus{}
+			Expect(cl.Get(ctx, types.NamespacedName{Name: "crash-test"}, ts)).NotTo(HaveOccurred())
+
+			degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded)
+			Expect(degraded).NotTo(BeNil())
+			Expect(degraded.Status).To(Equal(operator.ConditionTrue))
+			Expect(degraded.Message).To(ContainSubstring("crash looping container"))
+			Expect(degraded.Message).To(ContainSubstring("OOMKilled, exit code 137"))
+		})
+
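The substring asserted above comes from the crash-context formatting added in
the earlier status.go hunks: the base crash-loop message plus a parenthesized
termination reason and exit code. Assuming that format, the full message for
this test pod would read:

    // Sketch of the expected message shape, using values from this test.
    msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s (%s, exit code %d)",
        "ns", "crash-pod", "main", "OOMKilled", 137)
    // => Pod ns/crash-pod has crash looping container: main (OOMKilled, exit code 137)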
TigeraStatus", func() { + sm := New(cl, "notready-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep2"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep2", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "notready"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "notready-pod", Labels: map[string]string{"app": "notready"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "notready-test"}, ts)).NotTo(HaveOccurred()) + + degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded) + Expect(degraded).NotTo(BeNil()) + Expect(degraded.Status).To(Equal(operator.ConditionTrue)) + Expect(degraded.Message).To(ContainSubstring("running but not ready")) + }) + + It("should include pending pod scheduler reason in the TigeraStatus progressing message", func() { + sm := New(cl, "pending-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + gen := int64(1) + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep3"}}) + + Expect(cl.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep3", Generation: gen}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "pending"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "pending-pod", Labels: map[string]string{"app": "pending"}}, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "pending-test"}, ts)).NotTo(HaveOccurred()) + + progressing := getTigeraStatusCondition(ts, operator.ComponentProgressing) + Expect(progressing).NotTo(BeNil()) + Expect(progressing.Status).To(Equal(operator.ConditionTrue)) + Expect(progressing.Message).To(ContainSubstring("Insufficient memory")) + }) + + It("should report not-found workloads as degraded in TigeraStatus", func() { + sm := New(cl, "notfound-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + sm.OnCRFound() + sm.ReadyToMonitor() + + sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "missing-dep"}}) + sm.updateStatus() + + ts := &operator.TigeraStatus{} + Expect(cl.Get(ctx, types.NamespacedName{Name: "notfound-test"}, 
+		It("should report not-found workloads as degraded in TigeraStatus", func() {
+			sm := New(cl, "notfound-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager)
+			sm.OnCRFound()
+			sm.ReadyToMonitor()
+
+			sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "missing-dep"}})
+			sm.updateStatus()
+
+			ts := &operator.TigeraStatus{}
+			Expect(cl.Get(ctx, types.NamespacedName{Name: "notfound-test"}, ts)).NotTo(HaveOccurred())
+
+			degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded)
+			Expect(degraded).NotTo(BeNil())
+			Expect(degraded.Status).To(Equal(operator.ConditionTrue))
+			Expect(degraded.Message).To(ContainSubstring(`Deployment "ns/missing-dep" not found`))
+		})
+	})
+
+	Describe("deduplication and capping in TigeraStatus", func() {
+		It("should deduplicate identical pod failures and show count", func() {
+			sm := New(cl, "dedup-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager)
+			sm.OnCRFound()
+			sm.ReadyToMonitor()
+
+			gen := int64(1)
+			replicas := int32(3)
+			sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep-dedup"}})
+
+			Expect(cl.Create(ctx, &appsv1.Deployment{
+				ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep-dedup", Generation: gen},
+				Spec: appsv1.DeploymentSpec{
+					Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dedup"}},
+					Replicas: &replicas,
+				},
+				Status: appsv1.DeploymentStatus{
+					ObservedGeneration:  gen,
+					UnavailableReplicas: 3,
+					AvailableReplicas:   0,
+					ReadyReplicas:       0,
+				},
+			})).NotTo(HaveOccurred())
+
+			// Create 3 pods all OOMKilled - should be deduplicated into one message.
+			for i := 0; i < 3; i++ {
+				Expect(cl.Create(ctx, &corev1.Pod{
+					ObjectMeta: metav1.ObjectMeta{
+						Namespace: "ns",
+						Name:      fmt.Sprintf("dedup-pod-%d", i),
+						Labels:    map[string]string{"app": "dedup"},
+					},
+					Spec: corev1.PodSpec{},
+					Status: corev1.PodStatus{
+						Phase: corev1.PodRunning,
+						ContainerStatuses: []corev1.ContainerStatus{
+							{
+								Name:  "main",
+								State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+								LastTerminationState: corev1.ContainerState{
+									Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},
+								},
+							},
+						},
+					},
+				})).NotTo(HaveOccurred())
+			}
+
+			sm.updateStatus()
+
+			ts := &operator.TigeraStatus{}
+			Expect(cl.Get(ctx, types.NamespacedName{Name: "dedup-test"}, ts)).NotTo(HaveOccurred())
+
+			degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded)
+			Expect(degraded).NotTo(BeNil())
+			Expect(degraded.Message).To(ContainSubstring("3 pods affected"))
+		})
+	})
+
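The "3 pods affected" assertion exercises the summarizeIssues dedup path from
the earlier status.go patch. A standalone sketch of the idea, grouping
identical messages and appending a count; the grouping key and wording here
are illustrative, and only the "N pods affected" suffix is asserted by the
test:

    // Collapse identical issue messages and annotate with the pod count.
    counts := map[string]int{}
    for _, msg := range messages {
        counts[msg]++
    }
    var out []string
    for msg, n := range counts {
        if n > 1 {
            msg = fmt.Sprintf("%s (%d pods affected)", msg, n)
        }
        out = append(out, msg)
    }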
+	Describe("rollout revision awareness in TigeraStatus", func() {
+		It("should annotate old-revision failures and prioritize new-revision failures", func() {
+			sm := New(cl, "rollout-test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager)
+			sm.OnCRFound()
+			sm.ReadyToMonitor()
+
+			gen := int64(1)
+			replicas := int32(2)
+			sm.AddDeployments([]types.NamespacedName{{Namespace: "ns", Name: "dep-rollout"}})
+
+			Expect(cl.Create(ctx, &appsv1.Deployment{
+				ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep-rollout", UID: "rollout-uid", Generation: gen},
+				Spec: appsv1.DeploymentSpec{
+					Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}},
+					Replicas: &replicas,
+				},
+				Status: appsv1.DeploymentStatus{
+					ObservedGeneration:  gen,
+					UnavailableReplicas: 2,
+					AvailableReplicas:   0,
+					ReadyReplicas:       0,
+				},
+			})).NotTo(HaveOccurred())
+
+			// Current ReplicaSet.
+			Expect(cl.Create(ctx, &appsv1.ReplicaSet{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: "ns",
+					Name:      "dep-rollout-new",
+					Labels:    map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"},
+					OwnerReferences: []metav1.OwnerReference{
+						{UID: "rollout-uid", Controller: ptr.To(true)},
+					},
+				},
+				Spec:   appsv1.ReplicaSetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}},
+				Status: appsv1.ReplicaSetStatus{Replicas: 1},
+			})).NotTo(HaveOccurred())
+
+			// Old ReplicaSet.
+			Expect(cl.Create(ctx, &appsv1.ReplicaSet{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: "ns",
+					Name:      "dep-rollout-old",
+					Labels:    map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"},
+					OwnerReferences: []metav1.OwnerReference{
+						{UID: "rollout-uid", Controller: ptr.To(true)},
+					},
+				},
+				Spec:   appsv1.ReplicaSetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "rollout"}}},
+				Status: appsv1.ReplicaSetStatus{Replicas: 0},
+			})).NotTo(HaveOccurred())
+
+			// New-revision pod crash looping.
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: "ns",
+					Name:      "rollout-new-pod",
+					Labels:    map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"},
+				},
+				Spec: corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "main",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: 1},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+
+			// Old-revision pod crash looping with different reason.
+			Expect(cl.Create(ctx, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Namespace: "ns",
+					Name:      "rollout-old-pod",
+					Labels:    map[string]string{"app": "rollout", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"},
+				},
+				Spec: corev1.PodSpec{},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "main",
+							State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}},
+							LastTerminationState: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137},
+							},
+						},
+					},
+				},
+			})).NotTo(HaveOccurred())
+
+			sm.updateStatus()
+
+			ts := &operator.TigeraStatus{}
+			Expect(cl.Get(ctx, types.NamespacedName{Name: "rollout-test"}, ts)).NotTo(HaveOccurred())
+
+			degraded := getTigeraStatusCondition(ts, operator.ComponentDegraded)
+			Expect(degraded).NotTo(BeNil())
+
+			// The message should contain both failures, with the new-revision first.
+			// Messages are newline-joined by degradedMessage().
+			lines := strings.Split(degraded.Message, "\n")
+			Expect(lines).To(HaveLen(2))
+			Expect(lines[0]).NotTo(ContainSubstring("old revision"))
+			Expect(lines[1]).To(ContainSubstring("old revision"))
+		})
+	})
+})
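For context on the revision-awareness test above: podMatchesRevision in
status.go compares a pod's pod-template-hash label against the hash of the
active ReplicaSet returned by currentDeploymentRevision, so old-revision
failures can be annotated and ranked below new-revision ones. A minimal sketch
of that comparison, assuming label-based matching and treating the
"(old revision)" annotation as the assumed suffix the test asserts:

    // A pod belongs to the current Deployment revision when its
    // pod-template-hash label equals the active ReplicaSet's hash.
    isCurrent := pod.Labels[appsv1.DefaultDeploymentUniqueLabelKey] == currentHash
    if !isCurrent {
        msg += " (old revision)"
    }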
From 6c4e576cc501517de01dd37b1a3b9757f983ee72 Mon Sep 17 00:00:00 2001
From: Casey Davenport
Date: Tue, 7 Apr 2026 16:24:14 -0700
Subject: [PATCH 11/11] Merge master, run gen-versions and gen-files

---
 pkg/controller/status/status.go      | 1 -
 pkg/controller/status/status_test.go | 2 --
 2 files changed, 3 deletions(-)

diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go
index 8e971f6834..35f6029ef8 100644
--- a/pkg/controller/status/status.go
+++ b/pkg/controller/status/status.go
@@ -980,7 +980,6 @@ func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string {
 	return currentHash
 }

-
 func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) {
 	if m.enabled == nil || !*m.enabled {
 		// Never set any conditions unless the status manager is enabled.
diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go
index f37534b38c..5aaa8d5f45 100644
--- a/pkg/controller/status/status_test.go
+++ b/pkg/controller/status/status_test.go
@@ -76,8 +76,6 @@ var _ = Describe("Status reporting tests", func() {
 		Expect(oldVersionSm.IsAvailable()).To(BeFalse())
 	})

-
-
 	Context("without CR found", func() {
 		It("status is not created", func() {
 			sm.updateStatus()