From 4e4f0e5cb61dbe74f0dd95ef83116d961d233534 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:17:40 -0700 Subject: [PATCH 01/12] Cherry-pick status manager readiness and crash context improvements --- pkg/controller/status/status.go | 23 ++++- pkg/controller/status/status_test.go | 149 +++++++++++++++++++++++++++ 2 files changed, 170 insertions(+), 2 deletions(-) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 1a9e00650d..481d1789d1 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -42,6 +42,17 @@ import ( var log = logf.Log.WithName("status_manager") +const ( + // terminationReasonError is the reason string the container runtime sets on + // ContainerStateTerminated when the container exits with a non-zero exit code. + terminationReasonError = "Error" + + // exitCodeSIGKILL is the exit code for a container killed by SIGKILL (128 + 9). + // The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM + // killer, manual kill) can also produce this code. + exitCodeSIGKILL = 137 +) + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: @@ -700,13 +711,21 @@ func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerSt // Check well-known error states here and report an appropriate mesage to the end user. switch c.State.Waiting.Reason { case "CrashLoopBackOff": - return fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + if lt := c.LastTerminationState.Terminated; lt != nil { + if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { + msg += " (exit code 137, possible liveness probe failure)" + } else { + msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + } + } + return msg case "ImagePullBackOff", "ErrImagePull": return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name) } } if c.State.Terminated != nil { - if c.State.Terminated.Reason == "Error" { + if c.State.Terminated.Reason == terminationReasonError { return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name) } } diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index 5484e32e0c..d19dda0ee5 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -390,6 +390,155 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("when pod is crash looping", func() { + var gen int64 + + createCrashLoopPodAndDeployment := func(lastTermination *corev1.ContainerStateTerminated) { + sm.ReadyToMonitor() + podStatus := corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container", + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "CrashLoopBackOff", + }, + }, + LastTerminationState: corev1.ContainerState{ + Terminated: lastTermination, + }, + }, + }, + } + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1pod-crash", + Labels: map[string]string{"dp1Key": "dp1Value"}, + }, + Spec: corev1.PodSpec{}, + Status: podStatus, + })).NotTo(HaveOccurred()) + + gen = 5 + replicas := int32(1) + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"dp1Key": "dp1Value"}, + }, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + } + + It("should include OOMKilled reason in degraded message", func() { + createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{ + Reason: "OOMKilled", + ExitCode: 137, + }) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("(OOMKilled, exit code 137)"))) + }) + + It("should suggest liveness probe failure for exit code 137 with Error reason", func() { + createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{ + Reason: "Error", + ExitCode: 137, + }) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("possible liveness probe failure"))) + }) + + It("should include exit code for other termination reasons", func() { + createCrashLoopPodAndDeployment(&corev1.ContainerStateTerminated{ + Reason: "Error", + ExitCode: 1, + }) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("(Error, exit code 1)"))) + }) + + It("should report crash loop without detail when no last termination state", func() { + createCrashLoopPodAndDeployment(nil) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(And( + ContainSubstring("crash looping container"), + Not(ContainSubstring("exit code")), + ))) + }) + }) + + Context("when pod is running but not ready", func() { + var gen int64 + BeforeEach(func() { + sm.ReadyToMonitor() + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1pod-notready", + Labels: map[string]string{ + "dp1Key": "dp1Value", + }, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.ContainersReady, + Status: corev1.ConditionFalse, + }, + }, + }, + })).NotTo(HaveOccurred()) + gen = 5 + }) + It("should degrade when the deployment does not have the correct pod counts", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + replicas := int32(1) + + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", Name: "DP1", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"dp1Key": "dp1Value"}, + }, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 1, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring("running but not ready"))) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. Expect(sm.IsAvailable()).To(BeFalse()) From c5b360ad2d3776c74ee476a79816559be7bb8206 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:20:50 -0700 Subject: [PATCH 02/12] Add podIssue types for structured pod diagnostics --- pkg/controller/status/status.go | 38 ++++++++++++++++++++++++++++ pkg/controller/status/status_test.go | 28 ++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 481d1789d1..3cc62d3ab3 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -53,6 +53,44 @@ const ( exitCodeSIGKILL = 137 ) +type podIssueSeverity int + +const ( + severityProgressing podIssueSeverity = iota + severityFailing +) + +// podIssueType defines the category of a pod issue. The enum order determines display +// priority when issues compete for the 3-slot cap in summarizeIssues. +type podIssueType int + +const ( + issueCrashLoopBackOff podIssueType = iota + issueImagePull + issueTerminated + issuePodFailed + issueNotReady + issuePending +) + +// podIssue represents a single diagnosed problem with a pod. +type podIssue struct { + severity podIssueSeverity + issueType podIssueType + message string + + // isOldRevision is true if the pod belongs to an old revision during a rollout. + isOldRevision bool + + // terminationReason and exitCode provide dedup granularity within an issue type. + terminationReason string + exitCode int32 +} + +func (p podIssue) key() string { + return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) +} + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index d19dda0ee5..dec7cadd95 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -880,4 +880,32 @@ var _ = Describe("Status reporting tests", func() { }, false, true), ) }) + + Describe("podIssue", func() { + It("should produce distinct keys for different crash reasons", func() { + oom := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: "OOMKilled", + exitCode: 137, + } + errExit1 := podIssue{ + issueType: issueCrashLoopBackOff, + terminationReason: terminationReasonError, + exitCode: 1, + } + Expect(oom.key()).NotTo(Equal(errExit1.key())) + }) + + It("should produce the same key for same issue type with no termination context", func() { + a := podIssue{issueType: issueNotReady} + b := podIssue{issueType: issueNotReady} + Expect(a.key()).To(Equal(b.key())) + }) + + It("should produce distinct keys for different issue types", func() { + a := podIssue{issueType: issueCrashLoopBackOff} + b := podIssue{issueType: issueImagePull} + Expect(a.key()).NotTo(Equal(b.key())) + }) + }) }) From 900420ea7414af41fb6411b604b7e25000d020e4 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:22:44 -0700 Subject: [PATCH 03/12] Add summarizeIssues for dedup, prioritization, and capping --- pkg/controller/status/status.go | 57 +++++++++++++++++++++++++ pkg/controller/status/status_test.go | 63 ++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 3cc62d3ab3..0fa65c41de 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -91,6 +91,63 @@ func (p podIssue) key() string { return fmt.Sprintf("%d:%s:%d", p.issueType, p.terminationReason, p.exitCode) } +const maxIssuesPerWorkload = 3 + +// summarizeIssues deduplicates, prioritizes, and caps a list of pod issues into +// human-readable failing and progressing message slices. +func summarizeIssues(issues []podIssue) (failing []string, progressing []string) { + if len(issues) == 0 { + return nil, nil + } + + // Sort: new-revision first, then by issue type priority (enum order). + sort.SliceStable(issues, func(i, j int) bool { + if issues[i].isOldRevision != issues[j].isOldRevision { + return !issues[i].isOldRevision + } + return issues[i].issueType < issues[j].issueType + }) + + // Group by key. First occurrence provides the message; count tracks duplicates. + type issueGroup struct { + issue podIssue + count int + } + seen := map[string]int{} + var groups []issueGroup + for _, iss := range issues { + k := iss.key() + if idx, ok := seen[k]; ok { + groups[idx].count++ + continue + } + seen[k] = len(groups) + groups = append(groups, issueGroup{issue: iss, count: 1}) + } + + // Cap at maxIssuesPerWorkload unique reasons. + if len(groups) > maxIssuesPerWorkload { + groups = groups[:maxIssuesPerWorkload] + } + + // Format messages. + for _, g := range groups { + msg := g.issue.message + if g.count > 1 { + msg += fmt.Sprintf(" (%d pods affected)", g.count) + } + if g.issue.isOldRevision { + msg += " (old revision)" + } + if g.issue.severity == severityFailing { + failing = append(failing, msg) + } else { + progressing = append(progressing, msg) + } + } + return failing, progressing +} + // StatusManager manages the status for a single controller and component, and reports the status via // a TigeraStatus API object. The status manager uses the following conditions/states to represent the // component's current status: diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index dec7cadd95..0c05ef4013 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -908,4 +908,67 @@ var _ = Describe("Status reporting tests", func() { Expect(a.key()).NotTo(Equal(b.key())) }) }) + + Describe("summarizeIssues", func() { + It("should deduplicate issues with the same key", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod1 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod2 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "Pod ns/pod3 has crash looping container: c1", terminationReason: "OOMKilled", exitCode: 137}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("3 pods affected")) + Expect(progressing).To(BeEmpty()) + }) + + It("should cap at 3 unique reasons", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "msg1", terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueImagePull, message: "msg2"}, + {severity: severityFailing, issueType: issueTerminated, message: "msg3", terminationReason: terminationReasonError, exitCode: 1}, + {severity: severityFailing, issueType: issuePodFailed, message: "msg4"}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(3)) + }) + + It("should prioritize new-revision pods over old-revision pods", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "old pod crash", isOldRevision: true, terminationReason: "OOMKilled", exitCode: 137}, + {severity: severityFailing, issueType: issueCrashLoopBackOff, message: "new pod crash", isOldRevision: false, terminationReason: terminationReasonError, exitCode: 1}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(2)) + Expect(failing[0]).To(ContainSubstring("new pod crash")) + Expect(failing[1]).To(ContainSubstring("old pod crash")) + Expect(failing[1]).To(ContainSubstring("old revision")) + }) + + It("should split failing and progressing", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready"}, + {severity: severityProgressing, issueType: issuePending, message: "Pod ns/p2 is pending: Unschedulable"}, + } + failing, progressing := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(progressing).To(HaveLen(1)) + Expect(progressing[0]).To(ContainSubstring("pending")) + }) + + It("should annotate old revision issues", func() { + issues := []podIssue{ + {severity: severityFailing, issueType: issueNotReady, message: "Pod ns/p1 is running but not ready", isOldRevision: true}, + } + failing, _ := summarizeIssues(issues) + Expect(failing).To(HaveLen(1)) + Expect(failing[0]).To(ContainSubstring("old revision")) + }) + + It("should return empty slices for no issues", func() { + failing, progressing := summarizeIssues(nil) + Expect(failing).To(BeEmpty()) + Expect(progressing).To(BeEmpty()) + }) + }) }) From 7ed265687558c165d8e9dc831b214955bb9ca3f6 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:25:41 -0700 Subject: [PATCH 04/12] Add diagnosePods to replace podsFailing with structured diagnosis --- pkg/controller/status/status.go | 141 ++++++++++++++++++ pkg/controller/status/status_test.go | 213 +++++++++++++++++++++++++++ 2 files changed, 354 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 0fa65c41de..04b95b1e6a 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -827,6 +827,147 @@ func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerSt return "" } +// diagnosePods lists pods matching the selector and returns a podIssue for each +// unhealthy pod found. If currentRevision is non-empty, pods not matching that +// revision are marked as old-revision. +func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string) []podIssue { + l := corev1.PodList{} + s, err := metav1.LabelSelectorAsMap(selector) + if err != nil { + panic(err) + } + err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list pods for diagnosis") + return nil + } + + var issues []podIssue + for _, p := range l.Items { + oldRevision := currentRevision != "" && !podMatchesRevision(p, currentRevision) + + if p.Status.Phase == corev1.PodFailed { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issuePodFailed, + message: fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + continue + } + + // Check init and regular container statuses for errors. + if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision); len(iss) > 0 { + issues = append(issues, iss...) + continue + } + if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision); len(iss) > 0 { + issues = append(issues, iss...) + continue + } + + // Running but not passing readiness checks. + if p.Status.Phase == corev1.PodRunning { + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueNotReady, + message: fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name), + isOldRevision: oldRevision, + }) + break + } + } + continue + } + + // Pending pod - check for scheduling failures. + if p.Status.Phase == corev1.PodPending { + msg := fmt.Sprintf("Pod %s/%s is pending", p.Namespace, p.Name) + for _, cond := range p.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { + if cond.Message != "" { + msg += fmt.Sprintf(": %s", cond.Message) + } + break + } + } + issues = append(issues, podIssue{ + severity: severityProgressing, + issueType: issuePending, + message: msg, + isOldRevision: oldRevision, + }) + } + } + return issues +} + +// diagnoseContainers checks a list of container statuses for known error states and +// returns a podIssue for each one found. +func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool) []podIssue { + var issues []podIssue + for _, c := range statuses { + if c.State.Waiting != nil { + switch c.State.Waiting.Reason { + case "CrashLoopBackOff": + msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) + var termReason string + var exitCode int32 + if lt := c.LastTerminationState.Terminated; lt != nil { + termReason = lt.Reason + exitCode = lt.ExitCode + if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { + msg += " (exit code 137, possible liveness probe failure)" + } else { + msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + } + } + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueCrashLoopBackOff, + message: msg, + isOldRevision: oldRevision, + terminationReason: termReason, + exitCode: exitCode, + }) + case "ImagePullBackOff", "ErrImagePull": + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueImagePull, + message: fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) + } + } + if c.State.Terminated != nil { + if c.State.Terminated.Reason == terminationReasonError { + issues = append(issues, podIssue{ + severity: severityFailing, + issueType: issueTerminated, + message: fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name), + isOldRevision: oldRevision, + }) + } + } + } + return issues +} + +// podMatchesRevision checks whether a pod belongs to the given revision. +// Works for DaemonSets and StatefulSets (controller-revision-hash label) +// and Deployments (pod-template-hash label). +func podMatchesRevision(p corev1.Pod, currentRevision string) bool { + if hash, ok := p.Labels[appsv1.ControllerRevisionHashLabelKey]; ok { + return hash == currentRevision + } + if hash, ok := p.Labels[appsv1.DefaultDeploymentUniqueLabelKey]; ok { + return hash == currentRevision + } + return true +} + func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { // Never set any conditions unless the status manager is enabled. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index 0c05ef4013..c125b16574 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -971,4 +971,217 @@ var _ = Describe("Status reporting tests", func() { Expect(progressing).To(BeEmpty()) }) }) + + Describe("diagnosePods", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + selector := &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}} + podMeta := metav1.ObjectMeta{Namespace: "ns", Name: "pod1", Labels: map[string]string{"app": "test"}} + + It("should detect CrashLoopBackOff with OOMKilled context", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff)) + Expect(issues[0].severity).To(Equal(severityFailing)) + Expect(issues[0].terminationReason).To(Equal("OOMKilled")) + Expect(issues[0].exitCode).To(BeEquivalentTo(137)) + Expect(issues[0].message).To(ContainSubstring("OOMKilled")) + }) + + It("should detect CrashLoopBackOff with possible liveness probe failure", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure")) + }) + + It("should detect ImagePullBackOff", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueImagePull)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect terminated container with Error", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueTerminated)) + }) + + It("should detect pod in Failed phase", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodFailed}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePodFailed)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect running but not ready", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issueNotReady)) + Expect(issues[0].severity).To(Equal(severityFailing)) + }) + + It("should detect pending unschedulable pod with scheduler reason", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Message: "0/3 nodes are available: 3 Insufficient memory", + }, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + Expect(issues[0].message).To(ContainSubstring("Insufficient memory")) + }) + + It("should detect pending pod without scheduler condition", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(1)) + Expect(issues[0].issueType).To(Equal(issuePending)) + Expect(issues[0].severity).To(Equal(severityProgressing)) + }) + + It("should report multiple issues from different pods", func() { + pod1 := podMeta.DeepCopy() + pod1.Name = "pod-crash" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod1, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + pod2 := podMeta.DeepCopy() + pod2.Name = "pod-pending" + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: *pod2, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + })).NotTo(HaveOccurred()) + + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(HaveLen(2)) + }) + + It("should return no issues for healthy pods", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionTrue}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "") + Expect(issues).To(BeEmpty()) + }) + }) }) From 2b122623956e4323684f32c34755293b990b985c Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:28:05 -0700 Subject: [PATCH 05/12] Add revision detection helpers for Deployments and DaemonSets --- pkg/controller/status/status.go | 60 ++++++++++++++ pkg/controller/status/status_test.go | 114 +++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 04b95b1e6a..764458e6fc 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -968,6 +968,66 @@ func podMatchesRevision(p corev1.Pod, currentRevision string) bool { return true } +// currentDeploymentRevision returns the pod-template-hash of the active ReplicaSet +// for the given Deployment. Returns empty string if it cannot be determined. +func (m *statusManager) currentDeploymentRevision(dep *appsv1.Deployment) string { + rsList := &appsv1.ReplicaSetList{} + s, err := metav1.LabelSelectorAsMap(dep.Spec.Selector) + if err != nil { + log.WithValues("reason", err).Info("Failed to parse deployment selector for revision lookup") + return "" + } + err = m.client.List(context.TODO(), rsList, client.MatchingLabels(s), client.InNamespace(dep.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ReplicaSets for revision lookup") + return "" + } + + // Find the ReplicaSet that is owned by this Deployment and has active replicas. + for _, rs := range rsList.Items { + if !isOwnedBy(rs.OwnerReferences, dep.UID) { + continue + } + if rs.Status.Replicas > 0 { + return rs.Labels[appsv1.DefaultDeploymentUniqueLabelKey] + } + } + return "" +} + +// currentDaemonSetRevision returns the controller-revision-hash of the most recent +// ControllerRevision for the given DaemonSet. Returns empty string if it cannot be determined. +func (m *statusManager) currentDaemonSetRevision(ds *appsv1.DaemonSet) string { + revList := &appsv1.ControllerRevisionList{} + err := m.client.List(context.TODO(), revList, client.InNamespace(ds.Namespace)) + if err != nil { + log.WithValues("reason", err).Info("Failed to list ControllerRevisions for revision lookup") + return "" + } + + var maxRevision int64 + var currentHash string + for _, rev := range revList.Items { + if !isOwnedBy(rev.OwnerReferences, ds.UID) { + continue + } + if rev.Revision > maxRevision { + maxRevision = rev.Revision + currentHash = rev.Labels[appsv1.ControllerRevisionHashLabelKey] + } + } + return currentHash +} + +func isOwnedBy(refs []metav1.OwnerReference, uid types.UID) bool { + for _, ref := range refs { + if ref.UID == uid && ref.Controller != nil && *ref.Controller { + return true + } + } + return false +} + func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondition) { if m.enabled == nil || !*m.enabled { // Never set any conditions unless the status manager is enabled. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index c125b16574..ada11b430c 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -71,6 +71,8 @@ var _ = Describe("Status reporting tests", func() { Expect(oldVersionSm.IsAvailable()).To(BeFalse()) }) + boolPtr := func(b bool) *bool { return &b } + Context("without CR found", func() { It("status is not created", func() { sm.updateStatus() @@ -1184,4 +1186,116 @@ var _ = Describe("Status reporting tests", func() { Expect(issues).To(BeEmpty()) }) }) + + Describe("currentRevision helpers", func() { + var sm *statusManager + var cl controllerRuntimeClient.Client + var ctx = context.Background() + + BeforeEach(func() { + scheme := runtime.NewScheme() + Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) + err := apis.AddToScheme(scheme, false) + Expect(err).NotTo(HaveOccurred()) + cl = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() + sm = New(cl, "test", &common.VersionInfo{Major: 1, Minor: 19}).(*statusManager) + }) + + Describe("currentDeploymentRevision", func() { + It("should return the pod-template-hash of the newest ReplicaSet", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + // Current ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-abc123", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "abc123"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + // Old ReplicaSet. + Expect(cl.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "dep1-old999", + Labels: map[string]string{"app": "test", appsv1.DefaultDeploymentUniqueLabelKey: "old999"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dep-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(Equal("abc123")) + }) + + It("should return empty string when no ReplicaSets exist", func() { + dep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "dep1", UID: "dep-uid"}, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + }, + } + rev := sm.currentDeploymentRevision(dep) + Expect(rev).To(BeEmpty()) + }) + }) + + Describe("currentDaemonSetRevision", func() { + It("should return the hash of the highest-revision ControllerRevision", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev1", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-old"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: boolPtr(true)}, + }, + }, + Revision: 1, + })).NotTo(HaveOccurred()) + Expect(cl.Create(ctx, &appsv1.ControllerRevision{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns", + Name: "ds1-rev2", + Labels: map[string]string{appsv1.ControllerRevisionHashLabelKey: "hash-new"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "ds-uid", Controller: boolPtr(true)}, + }, + }, + Revision: 2, + })).NotTo(HaveOccurred()) + + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(Equal("hash-new")) + }) + + It("should return empty string when no ControllerRevisions exist", func() { + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "ds1", UID: "ds-uid"}, + } + rev := sm.currentDaemonSetRevision(ds) + Expect(rev).To(BeEmpty()) + }) + }) + }) }) From ad24032d3357b6a2bc5e2626083c8e1a3805db8c Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:31:22 -0700 Subject: [PATCH 06/12] Update syncState with not-found handling, diagnosePods, and summarizeIssues Wire diagnosePods and summarizeIssues into syncState, replacing the old podsFailing/containerErrorMessage functions. Each workload type now reports not-found as a degraded condition instead of silently continuing. DaemonSets and Deployments pass revision info so diagnosePods can distinguish old-revision pods from current ones. --- pkg/controller/status/status.go | 121 ++++++++------------------- pkg/controller/status/status_test.go | 36 ++++++++ 2 files changed, 70 insertions(+), 87 deletions(-) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 764458e6fc..ce0af479f4 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -593,7 +593,11 @@ func (m *statusManager) syncState() { ds := &appsv1.DaemonSet{} err := m.client.Get(context.TODO(), dsnn, ds) if err != nil { - log.WithValues("reason", err).Info("Failed to query daemonset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("DaemonSet %q not found", dsnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query daemonset") + } continue } if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled { @@ -616,22 +620,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the daemonset are failing. - if f, err := m.podsFailing(ds.Spec.Selector, ds.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "daemonset", dsnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDaemonSetRevision(ds) + issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.deployments { dep := &appsv1.Deployment{} err := m.client.Get(context.TODO(), depnn, dep) if err != nil { - log.WithValues("reason", err).Info("Failed to query deployment") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("Deployment %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query deployment") + } continue } if dep.Status.UnavailableReplicas > 0 { @@ -656,22 +660,22 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(dep.Spec.Selector, dep.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "deployment", depnn).Info("Failed to check for failing pods") - continue - } + revision := m.currentDeploymentRevision(dep) + issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.statefulsets { ss := &appsv1.StatefulSet{} err := m.client.Get(context.TODO(), depnn, ss) if err != nil { - log.WithValues("reason", err).Info("Failed to query statefulset") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("StatefulSet %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query statefulset") + } continue } if *ss.Spec.Replicas != ss.Status.CurrentReplicas { @@ -694,21 +698,20 @@ func (m *statusManager) syncState() { continue } - // Check if any pods within the deployment are failing. - if f, err := m.podsFailing(ss.Spec.Selector, ss.Namespace); err == nil { - if f != "" { - failing = append(failing, f) - } - } else { - log.WithValues("reason", err, "statefuleset", depnn).Info("Failed to check for failing pods") - continue - } + issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision) + f, p := summarizeIssues(issues) + failing = append(failing, f...) + progressing = append(progressing, p...) } for _, depnn := range m.cronjobs { cj := &batchv1.CronJob{} if err := m.client.Get(context.TODO(), depnn, cj); err != nil { - log.WithValues("reason", err).Info("Failed to query cronjobs") + if errors.IsNotFound(err) { + failing = append(failing, fmt.Sprintf("CronJob %q not found", depnn.String())) + } else { + log.WithValues("reason", err).Info("Failed to query cronjobs") + } continue } @@ -771,62 +774,6 @@ func (m *statusManager) removeTigeraStatus() { } } -// podsFailing takes a selector and returns if any of the pods that match it are failing. Failing pods are defined -// to be in CrashLoopBackOff state. -func (m *statusManager) podsFailing(selector *metav1.LabelSelector, namespace string) (string, error) { - l := corev1.PodList{} - s, err := metav1.LabelSelectorAsMap(selector) - if err != nil { - panic(err) - } - err = m.client.List(context.TODO(), &l, client.MatchingLabels(s), client.InNamespace(namespace)) - if err != nil { - return "", err - } - for _, p := range l.Items { - if p.Status.Phase == corev1.PodFailed { - return fmt.Sprintf("Pod %s/%s has failed", p.Namespace, p.Name), nil - } - for _, c := range p.Status.InitContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } - } - for _, c := range p.Status.ContainerStatuses { - if msg := m.containerErrorMessage(p, c); msg != "" { - return msg, nil - } - } - } - return "", nil -} - -func (m *statusManager) containerErrorMessage(p corev1.Pod, c corev1.ContainerStatus) string { - if c.State.Waiting != nil { - // Check well-known error states here and report an appropriate mesage to the end user. - switch c.State.Waiting.Reason { - case "CrashLoopBackOff": - msg := fmt.Sprintf("Pod %s/%s has crash looping container: %s", p.Namespace, p.Name, c.Name) - if lt := c.LastTerminationState.Terminated; lt != nil { - if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { - msg += " (exit code 137, possible liveness probe failure)" - } else { - msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) - } - } - return msg - case "ImagePullBackOff", "ErrImagePull": - return fmt.Sprintf("Pod %s/%s failed to pull container image for: %s", p.Namespace, p.Name, c.Name) - } - } - if c.State.Terminated != nil { - if c.State.Terminated.Reason == terminationReasonError { - return fmt.Sprintf("Pod %s/%s has terminated container: %s", p.Namespace, p.Name, c.Name) - } - } - return "" -} - // diagnosePods lists pods matching the selector and returns a podIssue for each // unhealthy pod found. If currentRevision is non-empty, pods not matching that // revision are marked as old-revision. diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index ada11b430c..b77d8ac2c6 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -21,6 +21,7 @@ import ( . "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" certV1 "k8s.io/api/certificates/v1" certV1beta1 "k8s.io/api/certificates/v1beta1" corev1 "k8s.io/api/core/v1" @@ -55,6 +56,7 @@ var _ = Describe("Status reporting tests", func() { err := apis.AddToScheme(scheme, false) Expect(err).NotTo(HaveOccurred()) Expect(appsv1.AddToScheme(scheme)).NotTo(HaveOccurred()) + Expect(batchv1.AddToScheme(scheme)).NotTo(HaveOccurred()) Expect(corev1.AddToScheme(scheme)).NotTo(HaveOccurred()) client = ctrlrfake.DefaultFakeClientBuilder(scheme).Build() @@ -541,6 +543,40 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("when a monitored workload is not found", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should report degraded when a DaemonSet is not found", func() { + sm.AddDaemonsets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ds"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`DaemonSet "NS1/missing-ds" not found`))) + }) + + It("should report degraded when a Deployment is not found", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "missing-dep"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`Deployment "NS1/missing-dep" not found`))) + }) + + It("should report degraded when a StatefulSet is not found", func() { + sm.AddStatefulSets([]types.NamespacedName{{Namespace: "NS1", Name: "missing-ss"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`StatefulSet "NS1/missing-ss" not found`))) + }) + + It("should report degraded when a CronJob is not found", func() { + sm.AddCronJobs([]types.NamespacedName{{Namespace: "NS1", Name: "missing-cj"}}) + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(ContainElement(ContainSubstring(`CronJob "NS1/missing-cj" not found`))) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. Expect(sm.IsAvailable()).To(BeFalse()) From 528915da7cb2fe4aadb96b2b4c40c779268db89e Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:33:11 -0700 Subject: [PATCH 07/12] Add integration test for rollout-aware status reporting --- pkg/controller/status/status_test.go | 112 +++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index b77d8ac2c6..55da840170 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -577,6 +577,118 @@ var _ = Describe("Status reporting tests", func() { }) }) + Context("during a rollout with mixed old and new revision pods", func() { + BeforeEach(func() { + sm.ReadyToMonitor() + }) + + It("should prioritize new-revision pod failures over old-revision failures", func() { + sm.AddDeployments([]types.NamespacedName{{Namespace: "NS1", Name: "DP1"}}) + replicas := int32(2) + gen := int64(5) + + // Create the Deployment. + Expect(client.Create(ctx, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1", + UID: "dp1-uid", + Generation: gen, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ObservedGeneration: gen, + UnavailableReplicas: 2, + AvailableReplicas: 0, + ReadyReplicas: 0, + }, + })).NotTo(HaveOccurred()) + + // Current ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-new", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 1}, + })).NotTo(HaveOccurred()) + + // Old ReplicaSet. + Expect(client.Create(ctx, &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "DP1-old", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + OwnerReferences: []metav1.OwnerReference{ + {UID: "dp1-uid", Controller: boolPtr(true)}, + }, + }, + Spec: appsv1.ReplicaSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "dp1"}}, + }, + Status: appsv1.ReplicaSetStatus{Replicas: 0}, + })).NotTo(HaveOccurred()) + + // New-revision pod: crash looping. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-new-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "new-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + }, + }, + }, + })).NotTo(HaveOccurred()) + + // Old-revision pod: also crash looping but with different reason. + Expect(client.Create(ctx, &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "NS1", + Name: "dp1-old-pod", + Labels: map[string]string{"app": "dp1", appsv1.DefaultDeploymentUniqueLabelKey: "old-hash"}, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + + sm.updateStatus() + Expect(sm.IsDegraded()).To(BeTrue()) + Expect(sm.failing).To(HaveLen(2)) + // New-revision issue should appear first. + Expect(sm.failing[0]).NotTo(ContainSubstring("old revision")) + Expect(sm.failing[1]).To(ContainSubstring("old revision")) + }) + }) + It("Should handle basic state changes", func() { // We expect no state to be "True" at boot. Expect(sm.IsAvailable()).To(BeFalse()) From 90f17e2b9882357ea7aabfc073eaa9452568877b Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 13:34:10 -0700 Subject: [PATCH 08/12] Fix formatting --- pkg/controller/status/status_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index 55da840170..d832c4f026 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -670,7 +670,7 @@ var _ = Describe("Status reporting tests", func() { Phase: corev1.PodRunning, ContainerStatuses: []corev1.ContainerStatus{ { - Name: "c1", + Name: "c1", State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, LastTerminationState: corev1.ContainerState{ Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, From 9243429d2073fc373c2b927118f3df2d0378036b Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 14:01:48 -0700 Subject: [PATCH 09/12] Add TigeraStatus warning for unsupported ignore annotation When an object has the unsupported.operator.tigera.io/ignore annotation, surface a warning through TigeraStatus so users know the operator is not managing the resource. Clear the warning if the annotation is later removed. --- pkg/controller/utils/component.go | 31 +++++++++++++- pkg/controller/utils/component_test.go | 56 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/pkg/controller/utils/component.go b/pkg/controller/utils/component.go index 10ffc062c8..36dd9e5ff9 100644 --- a/pkg/controller/utils/component.go +++ b/pkg/controller/utils/component.go @@ -16,6 +16,7 @@ package utils import ( "context" + stderrors "errors" "fmt" "maps" "reflect" @@ -49,6 +50,10 @@ import ( rmeta "github.com/tigera/operator/pkg/render/common/meta" ) +// errObjectIgnored is returned by createOrUpdateObject when an existing object has the +// unsupported.operator.tigera.io/ignore annotation. +var errObjectIgnored = fmt.Errorf("object has unsupported ignore annotation") + const TLS_CIPHERS_ENV_VAR_NAME = "TLS_CIPHER_SUITES" // dCache is a global deduplication cache that is used to avoid unnecessary updates to objects. It is shared @@ -313,7 +318,7 @@ func (c *componentHandler) createOrUpdateObject(ctx context.Context, obj client. // The object exists. Update it, unless the user has marked it as "ignored". if IgnoreObject(cur) { logCtx.Info("Ignoring annotated object") - return nil + return errObjectIgnored } logCtx.V(2).Info("Resource already exists, update it") @@ -464,7 +469,19 @@ func (c *componentHandler) CreateOrUpdateOrDelete(ctx context.Context, component conflictRetry: err := c.createOrUpdateObject(ctx, obj.DeepCopyObject().(client.Object), osType) if err != nil { - if errors.IsAlreadyExists(err) { + if stderrors.Is(err, errObjectIgnored) { + if status != nil { + kind := obj.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + kind = reflect.TypeOf(obj).Elem().Name() + } + warningKey := fmt.Sprintf("ignore-%s-%s", kind, key) + status.SetWarning(warningKey, fmt.Sprintf( + "%s %q has the unsupported ignore annotation; the operator is not managing this resource", + kind, key, + )) + } + } else if errors.IsAlreadyExists(err) { // Remember that we've had an "already exists" error, but otherwise // carry on. alreadyExistsErr = err @@ -491,6 +508,16 @@ func (c *componentHandler) CreateOrUpdateOrDelete(ctx context.Context, component cronJobs = append(cronJobs, key) } + // Object updated normally - clear any stale ignore warning. + if err == nil && status != nil { + kind := obj.GetObjectKind().GroupVersionKind().Kind + if kind == "" { + kind = reflect.TypeOf(obj).Elem().Name() + } + warningKey := fmt.Sprintf("ignore-%s-%s", kind, key) + status.ClearWarning(warningKey) + } + continue } diff --git a/pkg/controller/utils/component_test.go b/pkg/controller/utils/component_test.go index 6716fa9221..b555049f14 100644 --- a/pkg/controller/utils/component_test.go +++ b/pkg/controller/utils/component_test.go @@ -16,6 +16,7 @@ package utils import ( "context" + stderrors "errors" "fmt" rbacv1 "k8s.io/api/rbac/v1" @@ -2017,6 +2018,61 @@ var _ = Describe("Component handler tests", func() { )) }) }) + + Context("unsupported ignore annotation", func() { + It("should return errObjectIgnored from createOrUpdateObject when object has ignore annotation", func() { + ds := &apps.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ignored-ds", + Namespace: "default", + Annotations: map[string]string{ + "unsupported.operator.tigera.io/ignore": "true", + }, + }, + Spec: apps.DaemonSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test"}}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "test"}}, + Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "c", Image: "img"}}}, + }, + }, + } + Expect(c.Create(ctx, ds)).NotTo(HaveOccurred()) + + rendered := ds.DeepCopy() + rendered.Annotations = nil + rendered.SetGroupVersionKind(apps.SchemeGroupVersion.WithKind("DaemonSet")) + err := handler.(*componentHandler).createOrUpdateObject(ctx, rendered, rmeta.OSTypeLinux) + Expect(stderrors.Is(err, errObjectIgnored)).To(BeTrue()) + }) + + It("should not return an error from CreateOrUpdateOrDelete when object has ignore annotation", func() { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ignored-cm", + Namespace: "default", + Annotations: map[string]string{ + "unsupported.operator.tigera.io/ignore": "true", + }, + }, + } + Expect(c.Create(ctx, cm)).NotTo(HaveOccurred()) + + rendered := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ignored-cm", + Namespace: "default", + }, + Data: map[string]string{"key": "value"}, + } + fc := &fakeComponent{ + objs: []client.Object{rendered}, + supportedOSType: rmeta.OSTypeLinux, + } + err := handler.CreateOrUpdateOrDelete(ctx, fc, sm) + Expect(err).NotTo(HaveOccurred()) + }) + }) }) var _ = Describe("Mocked client Component handler tests", func() { From 3013cc4f646fe9b62723594f9587d717df184a83 Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 14:09:58 -0700 Subject: [PATCH 10/12] Set custom-overrides annotation when probe or resource overrides are applied When the render package applies probe timing or resource overrides to a workload, set an operator.tigera.io/custom-overrides annotation with a comma-separated list of which override types were applied. This will be used by diagnosePods to correlate pod failures with user overrides. --- pkg/render/common/components/components.go | 26 ++++++ .../common/components/components_test.go | 91 +++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/pkg/render/common/components/components.go b/pkg/render/common/components/components.go index e45b4c53fd..a64072f4b4 100644 --- a/pkg/render/common/components/components.go +++ b/pkg/render/common/components/components.go @@ -36,6 +36,10 @@ import ( var log = logf.Log.WithName("components") +// CustomOverridesAnnotation is set on workloads when the render package applies user-specified +// probe timing or resource overrides. The value is a comma-separated list of override types. +const CustomOverridesAnnotation = "operator.tigera.io/custom-overrides" + // containerNameAliases maps deprecated container names to their current names. // When a user provides an override using a deprecated name, it is transparently // resolved to the current name before matching against rendered containers. @@ -396,6 +400,28 @@ func applyReplicatedPodResourceOverrides(r *replicatedPodResource, overrides any // probe timing overrides are applied to the corresponding container. if cos := GetContainerOverrides(overrides); cos != nil { mergeContainerOverrides(r.podTemplateSpec.Spec.Containers, cos) + + // Track which override types were applied for status correlation. + seen := map[string]bool{} + var overrideTypes []string + for _, co := range cos { + if co.ReadinessProbe != nil && !seen["readinessProbe"] { + seen["readinessProbe"] = true + overrideTypes = append(overrideTypes, "readinessProbe") + } + if co.LivenessProbe != nil && !seen["livenessProbe"] { + seen["livenessProbe"] = true + overrideTypes = append(overrideTypes, "livenessProbe") + } + if co.Resources != nil && !seen["resources"] { + seen["resources"] = true + overrideTypes = append(overrideTypes, "resources") + } + } + if len(overrideTypes) > 0 { + r.annotations = common.MapExistsOrInitialize(r.annotations) + r.annotations[CustomOverridesAnnotation] = strings.Join(overrideTypes, ",") + } } // If `overrides` has a Spec.Template.Spec.Affinity field, and it's non-nil, it sets diff --git a/pkg/render/common/components/components_test.go b/pkg/render/common/components/components_test.go index 08adc81231..0d5939ee7b 100644 --- a/pkg/render/common/components/components_test.go +++ b/pkg/render/common/components/components_test.go @@ -602,6 +602,7 @@ var _ = Describe("Common components render tests", func() { expected := defaultedDaemonSet() Expect(expected.Spec.Template.Spec.Containers).To(HaveLen(2)) expected.Spec.Template.Spec.Containers[0].Resources = resources1 + expected.Annotations[CustomOverridesAnnotation] = "resources" Expect(result.Spec.Template.Spec.Containers).To(ContainElements(expected.Spec.Template.Spec.Containers)) Expect(result).To(Equal(expected)) }), @@ -1069,6 +1070,7 @@ var _ = Describe("Common components render tests", func() { expected := defaultedDeployment() Expect(expected.Spec.Template.Spec.Containers).To(HaveLen(2)) expected.Spec.Template.Spec.Containers[0].Resources = resources1 + expected.Annotations[CustomOverridesAnnotation] = "resources" Expect(result.Spec.Template.Spec.Containers).To(ContainElements(expected.Spec.Template.Spec.Containers)) Expect(result).To(Equal(expected)) }), @@ -1260,6 +1262,95 @@ var _ = Describe("Common components render tests", func() { Expect(c.Resources).To(Equal(overrideResources), "container %q should have overridden resources", c.Name) } }) + + Describe("custom-overrides annotation", func() { + It("should set annotation when readiness probe override is applied", func() { + d := appsv1.Deployment{} + d.Spec.Template.Spec.Containers = []corev1.Container{ + { + Name: "compliance-server", + Image: "test-image", + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}}, + }, + }, + } + + period := int32(30) + overrides := &v1.ComplianceServerDeployment{ + Spec: &v1.ComplianceServerDeploymentSpec{ + Template: &v1.ComplianceServerDeploymentPodTemplateSpec{ + Spec: &v1.ComplianceServerDeploymentPodSpec{ + Containers: []v1.ComplianceServerDeploymentContainer{ + { + Name: "compliance-server", + ReadinessProbe: &v1.ProbeOverride{PeriodSeconds: &period}, + }, + }, + }, + }, + }, + } + + ApplyDeploymentOverrides(&d, overrides) + Expect(d.Annotations).To(HaveKeyWithValue(CustomOverridesAnnotation, "readinessProbe")) + }) + + It("should set annotation with multiple override types", func() { + d := appsv1.Deployment{} + d.Spec.Template.Spec.Containers = []corev1.Container{ + { + Name: "compliance-server", + Image: "test-image", + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}}, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{HTTPGet: &corev1.HTTPGetAction{Path: "/health"}}, + }, + }, + } + + period := int32(30) + overrideResources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("500m")}, + } + overrides := &v1.ComplianceServerDeployment{ + Spec: &v1.ComplianceServerDeploymentSpec{ + Template: &v1.ComplianceServerDeploymentPodTemplateSpec{ + Spec: &v1.ComplianceServerDeploymentPodSpec{ + Containers: []v1.ComplianceServerDeploymentContainer{ + { + Name: "compliance-server", + ReadinessProbe: &v1.ProbeOverride{PeriodSeconds: &period}, + LivenessProbe: &v1.ProbeOverride{PeriodSeconds: &period}, + Resources: &overrideResources, + }, + }, + }, + }, + }, + } + + ApplyDeploymentOverrides(&d, overrides) + ann := d.Annotations[CustomOverridesAnnotation] + Expect(ann).To(ContainSubstring("readinessProbe")) + Expect(ann).To(ContainSubstring("livenessProbe")) + Expect(ann).To(ContainSubstring("resources")) + }) + + It("should not set annotation when no probe or resource overrides", func() { + d := appsv1.Deployment{} + d.Spec.Template.Spec.Containers = []corev1.Container{ + {Name: "compliance-server", Image: "test-image"}, + } + overrides := &v1.ComplianceServerDeployment{ + Spec: &v1.ComplianceServerDeploymentSpec{}, + } + ApplyDeploymentOverrides(&d, overrides) + Expect(d.Annotations).NotTo(HaveKey(CustomOverridesAnnotation)) + }) + }) }) func addContainer(cs []corev1.Container) []corev1.Container { From 2e1de6ed81a6a010d600eca2461a8680cb8d239f Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 14:13:19 -0700 Subject: [PATCH 11/12] Add override correlation hints to pod diagnostic messages --- pkg/controller/status/status.go | 42 +++++++++--- pkg/controller/status/status_test.go | 99 +++++++++++++++++++++++++--- 2 files changed, 123 insertions(+), 18 deletions(-) diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index ce0af479f4..94d55bbe04 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -51,8 +51,24 @@ const ( // The kubelet sends SIGKILL when a liveness probe fails, but other actors (OOM // killer, manual kill) can also produce this code. exitCodeSIGKILL = 137 + + customOverridesAnnotation = "operator.tigera.io/custom-overrides" ) +// hasOverride checks if the workload has a specific override type configured. +func hasOverride(annotations map[string]string, overrideType string) bool { + val, ok := annotations[customOverridesAnnotation] + if !ok { + return false + } + for _, t := range strings.Split(val, ",") { + if t == overrideType { + return true + } + } + return false +} + type podIssueSeverity int const ( @@ -621,7 +637,7 @@ func (m *statusManager) syncState() { } revision := m.currentDaemonSetRevision(ds) - issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision) + issues := m.diagnosePods(ds.Spec.Selector, ds.Namespace, revision, ds.Annotations) f, p := summarizeIssues(issues) failing = append(failing, f...) progressing = append(progressing, p...) @@ -661,7 +677,7 @@ func (m *statusManager) syncState() { } revision := m.currentDeploymentRevision(dep) - issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision) + issues := m.diagnosePods(dep.Spec.Selector, dep.Namespace, revision, dep.Annotations) f, p := summarizeIssues(issues) failing = append(failing, f...) progressing = append(progressing, p...) @@ -698,7 +714,7 @@ func (m *statusManager) syncState() { continue } - issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision) + issues := m.diagnosePods(ss.Spec.Selector, ss.Namespace, ss.Status.UpdateRevision, ss.Annotations) f, p := summarizeIssues(issues) failing = append(failing, f...) progressing = append(progressing, p...) @@ -777,7 +793,7 @@ func (m *statusManager) removeTigeraStatus() { // diagnosePods lists pods matching the selector and returns a podIssue for each // unhealthy pod found. If currentRevision is non-empty, pods not matching that // revision are marked as old-revision. -func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string) []podIssue { +func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace string, currentRevision string, workloadAnnotations map[string]string) []podIssue { l := corev1.PodList{} s, err := metav1.LabelSelectorAsMap(selector) if err != nil { @@ -804,11 +820,11 @@ func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace s } // Check init and regular container statuses for errors. - if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision); len(iss) > 0 { + if iss := diagnoseContainers(p, p.Status.InitContainerStatuses, oldRevision, workloadAnnotations); len(iss) > 0 { issues = append(issues, iss...) continue } - if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision); len(iss) > 0 { + if iss := diagnoseContainers(p, p.Status.ContainerStatuses, oldRevision, workloadAnnotations); len(iss) > 0 { issues = append(issues, iss...) continue } @@ -817,10 +833,14 @@ func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace s if p.Status.Phase == corev1.PodRunning { for _, cond := range p.Status.Conditions { if cond.Type == corev1.ContainersReady && cond.Status == corev1.ConditionFalse { + msg := fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name) + if hasOverride(workloadAnnotations, "readinessProbe") { + msg += "; custom readiness probe configuration is in effect" + } issues = append(issues, podIssue{ severity: severityFailing, issueType: issueNotReady, - message: fmt.Sprintf("Pod %s/%s is running but not ready", p.Namespace, p.Name), + message: msg, isOldRevision: oldRevision, }) break @@ -853,7 +873,7 @@ func (m *statusManager) diagnosePods(selector *metav1.LabelSelector, namespace s // diagnoseContainers checks a list of container statuses for known error states and // returns a podIssue for each one found. -func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool) []podIssue { +func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevision bool, workloadAnnotations map[string]string) []podIssue { var issues []podIssue for _, c := range statuses { if c.State.Waiting != nil { @@ -867,8 +887,14 @@ func diagnoseContainers(p corev1.Pod, statuses []corev1.ContainerStatus, oldRevi exitCode = lt.ExitCode if lt.Reason == terminationReasonError && lt.ExitCode == exitCodeSIGKILL { msg += " (exit code 137, possible liveness probe failure)" + if hasOverride(workloadAnnotations, "livenessProbe") { + msg += "; custom liveness probe configuration is in effect" + } } else { msg += fmt.Sprintf(" (%s, exit code %d)", lt.Reason, lt.ExitCode) + if lt.Reason == "OOMKilled" && hasOverride(workloadAnnotations, "resources") { + msg += "; custom resource limits are in effect" + } } } issues = append(issues, podIssue{ diff --git a/pkg/controller/status/status_test.go b/pkg/controller/status/status_test.go index d832c4f026..d8702bf906 100644 --- a/pkg/controller/status/status_test.go +++ b/pkg/controller/status/status_test.go @@ -1157,7 +1157,7 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issueCrashLoopBackOff)) Expect(issues[0].severity).To(Equal(severityFailing)) @@ -1183,7 +1183,7 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].message).To(ContainSubstring("possible liveness probe failure")) }) @@ -1202,7 +1202,7 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issueImagePull)) Expect(issues[0].severity).To(Equal(severityFailing)) @@ -1222,7 +1222,7 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issueTerminated)) }) @@ -1233,7 +1233,7 @@ var _ = Describe("Status reporting tests", func() { Spec: corev1.PodSpec{}, Status: corev1.PodStatus{Phase: corev1.PodFailed}, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issuePodFailed)) Expect(issues[0].severity).To(Equal(severityFailing)) @@ -1250,7 +1250,7 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issueNotReady)) Expect(issues[0].severity).To(Equal(severityFailing)) @@ -1271,7 +1271,7 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issuePending)) Expect(issues[0].severity).To(Equal(severityProgressing)) @@ -1284,7 +1284,7 @@ var _ = Describe("Status reporting tests", func() { Spec: corev1.PodSpec{}, Status: corev1.PodStatus{Phase: corev1.PodPending}, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(1)) Expect(issues[0].issueType).To(Equal(issuePending)) Expect(issues[0].severity).To(Equal(severityProgressing)) @@ -1315,7 +1315,7 @@ var _ = Describe("Status reporting tests", func() { Status: corev1.PodStatus{Phase: corev1.PodPending}, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(HaveLen(2)) }) @@ -1330,9 +1330,88 @@ var _ = Describe("Status reporting tests", func() { }, }, })).NotTo(HaveOccurred()) - issues := sm.diagnosePods(selector, "ns", "") + issues := sm.diagnosePods(selector, "ns", "", nil) Expect(issues).To(BeEmpty()) }) + + It("should add readiness probe hint when pod is not ready and readinessProbe override is configured", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + overrides := map[string]string{"operator.tigera.io/custom-overrides": "readinessProbe,resources"} + issues := sm.diagnosePods(selector, "ns", "", overrides) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("custom readiness probe configuration is in effect")) + }) + + It("should add liveness probe hint when pod has possible liveness failure and livenessProbe override is configured", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: terminationReasonError, ExitCode: exitCodeSIGKILL}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + overrides := map[string]string{"operator.tigera.io/custom-overrides": "livenessProbe"} + issues := sm.diagnosePods(selector, "ns", "", overrides) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("custom liveness probe configuration is in effect")) + }) + + It("should add resources hint when pod is OOMKilled and resources override is configured", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "c1", + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}, + LastTerminationState: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{Reason: "OOMKilled", ExitCode: 137}, + }, + }, + }, + }, + })).NotTo(HaveOccurred()) + overrides := map[string]string{"operator.tigera.io/custom-overrides": "resources"} + issues := sm.diagnosePods(selector, "ns", "", overrides) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).To(ContainSubstring("custom resource limits are in effect")) + }) + + It("should not add hints when no override annotation is present", func() { + Expect(cl.Create(ctx, &corev1.Pod{ + ObjectMeta: podMeta, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + {Type: corev1.ContainersReady, Status: corev1.ConditionFalse}, + }, + }, + })).NotTo(HaveOccurred()) + issues := sm.diagnosePods(selector, "ns", "", nil) + Expect(issues).To(HaveLen(1)) + Expect(issues[0].message).NotTo(ContainSubstring("custom")) + }) }) Describe("currentRevision helpers", func() { From 54f87dc2fc24510b4368495ba4d56b272e1513cb Mon Sep 17 00:00:00 2001 From: Casey Davenport Date: Tue, 7 Apr 2026 15:15:33 -0700 Subject: [PATCH 12/12] Add top-level Message field to TigeraStatus for better kubectl output --- api/v1/tigerastatus_types.go | 8 ++++- pkg/controller/status/status.go | 30 ++++++++++++++++++- .../operator.tigera.io_tigerastatuses.yaml | 10 +++++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/api/v1/tigerastatus_types.go b/api/v1/tigerastatus_types.go index 9206a9bac9..46e47e047d 100644 --- a/api/v1/tigerastatus_types.go +++ b/api/v1/tigerastatus_types.go @@ -33,6 +33,12 @@ type TigeraStatusStatus struct { // Conditions represents the latest observed set of conditions for this component. A component may be one or more of // Available, Progressing, or Degraded. Conditions []TigeraStatusCondition `json:"conditions"` + + // Message is a human-readable summary of the component's current state. It shows the most + // actionable information: the degraded reason if degraded, the progressing reason if progressing, + // or warnings if available with caveats. Empty when fully healthy with no warnings. + // +optional + Message string `json:"message,omitempty"` } // +kubebuilder:object:root=true @@ -45,7 +51,7 @@ type TigeraStatusStatus struct { // +kubebuilder:printcolumn:name="Progressing",type="string",JSONPath=".status.conditions[?(@.type=='Progressing')].status",description="Whether the component is processing changes." // +kubebuilder:printcolumn:name="Degraded",type="string",JSONPath=".status.conditions[?(@.type=='Degraded')].status",description="Whether the component is degraded." // +kubebuilder:printcolumn:name="Since",type="date",JSONPath=".status.conditions[?(@.type=='Available')].lastTransitionTime",description="The time the component's Available status last changed." -// +kubebuilder:printcolumn:name="Message",type="string",JSONPath=".status.conditions[?(@.type=='Degraded')].message",description="Error message when the component is degraded.",priority=0 +// +kubebuilder:printcolumn:name="Message",type="string",JSONPath=".status.message",description="Human-readable summary of the component's current state." type TigeraStatus struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` diff --git a/pkg/controller/status/status.go b/pkg/controller/status/status.go index 94d55bbe04..982d264b47 100644 --- a/pkg/controller/status/status.go +++ b/pkg/controller/status/status.go @@ -527,6 +527,12 @@ func (m *statusManager) ClearWarning(key string) { func (m *statusManager) warningMessage() string { m.lock.Lock() defer m.lock.Unlock() + return m.warningMessageLocked() +} + +// warningMessageLocked returns the warning message without acquiring the lock. +// The caller must hold m.lock. +func (m *statusManager) warningMessageLocked() string { if len(m.warnings) == 0 { return "" } @@ -1049,8 +1055,11 @@ func (m *statusManager) set(retry bool, conditions ...operator.TigeraStatusCondi } } + // Set the top-level Message field based on the most actionable condition. + ts.Status.Message = m.statusMessage(ts.Status.Conditions) + // If nothing has changed, we don't need to update in the API. - if reflect.DeepEqual(ts.Status.Conditions, old.Status.Conditions) { + if reflect.DeepEqual(ts.Status.Conditions, old.Status.Conditions) && ts.Status.Message == old.Status.Message { return } @@ -1158,6 +1167,25 @@ func (m *statusManager) degradedMessage() string { return strings.Join(msgs, "\n") } +// statusMessage returns the most actionable message for the top-level Message field, +// based on the current conditions. Priority: degraded > progressing > warnings > empty. +func (m *statusManager) statusMessage(conditions []operator.TigeraStatusCondition) string { + for _, c := range conditions { + if c.Type == operator.ComponentDegraded && c.Status == operator.ConditionTrue { + return c.Message + } + } + for _, c := range conditions { + if c.Type == operator.ComponentProgressing && c.Status == operator.ConditionTrue { + return c.Message + } + } + if w := m.warningMessageLocked(); w != "" { + return w + } + return "" +} + // This function should only be called if we are in a degraded state. // Every path should return a non-empty string that can be used in // the Condition Reason. diff --git a/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml b/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml index 838626ba16..b00cb4760c 100644 --- a/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml +++ b/pkg/imports/crds/operator/operator.tigera.io_tigerastatuses.yaml @@ -30,8 +30,8 @@ spec: jsonPath: .status.conditions[?(@.type=='Available')].lastTransitionTime name: Since type: date - - description: Error message when the component is degraded. - jsonPath: .status.conditions[?(@.type=='Degraded')].message + - description: Human-readable summary of the component's current state. + jsonPath: .status.message name: Message type: string name: v1 @@ -110,6 +110,12 @@ spec: - type type: object type: array + message: + description: |- + Message is a human-readable summary of the component's current state. It shows the most + actionable information: the degraded reason if degraded, the progressing reason if progressing, + or warnings if available with caveats. Empty when fully healthy with no warnings. + type: string required: - conditions type: object